1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-bio-prison-v2.h" 10 #include "dm-bio-record.h" 11 #include "dm-cache-metadata.h" 12 #include "dm-io-tracker.h" 13 #include "dm-cache-background-tracker.h" 14 15 #include <linux/dm-io.h> 16 #include <linux/dm-kcopyd.h> 17 #include <linux/jiffies.h> 18 #include <linux/init.h> 19 #include <linux/mempool.h> 20 #include <linux/module.h> 21 #include <linux/rwsem.h> 22 #include <linux/slab.h> 23 #include <linux/vmalloc.h> 24 25 #define DM_MSG_PREFIX "cache" 26 27 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 28 "A percentage of time allocated for copying to and/or from cache"); 29 30 /*----------------------------------------------------------------*/ 31 32 /* 33 * Glossary: 34 * 35 * oblock: index of an origin block 36 * cblock: index of a cache block 37 * promotion: movement of a block from origin to cache 38 * demotion: movement of a block from cache to origin 39 * migration: movement of a block between the origin and cache device, 40 * either direction 41 */ 42 43 /*----------------------------------------------------------------*/ 44 45 /* 46 * Represents a chunk of future work. 'input' allows continuations to pass 47 * values between themselves, typically error values. 48 */ 49 struct continuation { 50 struct work_struct ws; 51 blk_status_t input; 52 }; 53 54 static inline void init_continuation(struct continuation *k, 55 void (*fn)(struct work_struct *)) 56 { 57 INIT_WORK(&k->ws, fn); 58 k->input = 0; 59 } 60 61 static inline void queue_continuation(struct workqueue_struct *wq, 62 struct continuation *k) 63 { 64 queue_work(wq, &k->ws); 65 } 66 67 /*----------------------------------------------------------------*/ 68 69 /* 70 * The batcher collects together pieces of work that need a particular 71 * operation to occur before they can proceed (typically a commit). 
72 */ 73 struct batcher { 74 /* 75 * The operation that everyone is waiting for. 76 */ 77 blk_status_t (*commit_op)(void *context); 78 void *commit_context; 79 80 /* 81 * This is how bios should be issued once the commit op is complete 82 * (accounted_request). 83 */ 84 void (*issue_op)(struct bio *bio, void *context); 85 void *issue_context; 86 87 /* 88 * Queued work gets put on here after commit. 89 */ 90 struct workqueue_struct *wq; 91 92 spinlock_t lock; 93 struct list_head work_items; 94 struct bio_list bios; 95 struct work_struct commit_work; 96 97 bool commit_scheduled; 98 }; 99 100 static void __commit(struct work_struct *_ws) 101 { 102 struct batcher *b = container_of(_ws, struct batcher, commit_work); 103 blk_status_t r; 104 struct list_head work_items; 105 struct work_struct *ws, *tmp; 106 struct continuation *k; 107 struct bio *bio; 108 struct bio_list bios; 109 110 INIT_LIST_HEAD(&work_items); 111 bio_list_init(&bios); 112 113 /* 114 * We have to grab these before the commit_op to avoid a race 115 * condition. 
116 */ 117 spin_lock_irq(&b->lock); 118 list_splice_init(&b->work_items, &work_items); 119 bio_list_merge_init(&bios, &b->bios); 120 b->commit_scheduled = false; 121 spin_unlock_irq(&b->lock); 122 123 r = b->commit_op(b->commit_context); 124 125 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 126 k = container_of(ws, struct continuation, ws); 127 k->input = r; 128 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 129 queue_work(b->wq, ws); 130 } 131 132 while ((bio = bio_list_pop(&bios))) { 133 if (r) { 134 bio->bi_status = r; 135 bio_endio(bio); 136 } else 137 b->issue_op(bio, b->issue_context); 138 } 139 } 140 141 static void batcher_init(struct batcher *b, 142 blk_status_t (*commit_op)(void *), 143 void *commit_context, 144 void (*issue_op)(struct bio *bio, void *), 145 void *issue_context, 146 struct workqueue_struct *wq) 147 { 148 b->commit_op = commit_op; 149 b->commit_context = commit_context; 150 b->issue_op = issue_op; 151 b->issue_context = issue_context; 152 b->wq = wq; 153 154 spin_lock_init(&b->lock); 155 INIT_LIST_HEAD(&b->work_items); 156 bio_list_init(&b->bios); 157 INIT_WORK(&b->commit_work, __commit); 158 b->commit_scheduled = false; 159 } 160 161 static void async_commit(struct batcher *b) 162 { 163 queue_work(b->wq, &b->commit_work); 164 } 165 166 static void continue_after_commit(struct batcher *b, struct continuation *k) 167 { 168 bool commit_scheduled; 169 170 spin_lock_irq(&b->lock); 171 commit_scheduled = b->commit_scheduled; 172 list_add_tail(&k->ws.entry, &b->work_items); 173 spin_unlock_irq(&b->lock); 174 175 if (commit_scheduled) 176 async_commit(b); 177 } 178 179 /* 180 * Bios are errored if commit failed. 
181 */ 182 static void issue_after_commit(struct batcher *b, struct bio *bio) 183 { 184 bool commit_scheduled; 185 186 spin_lock_irq(&b->lock); 187 commit_scheduled = b->commit_scheduled; 188 bio_list_add(&b->bios, bio); 189 spin_unlock_irq(&b->lock); 190 191 if (commit_scheduled) 192 async_commit(b); 193 } 194 195 /* 196 * Call this if some urgent work is waiting for the commit to complete. 197 */ 198 static void schedule_commit(struct batcher *b) 199 { 200 bool immediate; 201 202 spin_lock_irq(&b->lock); 203 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 204 b->commit_scheduled = true; 205 spin_unlock_irq(&b->lock); 206 207 if (immediate) 208 async_commit(b); 209 } 210 211 /* 212 * There are a couple of places where we let a bio run, but want to do some 213 * work before calling its endio function. We do this by temporarily 214 * changing the endio fn. 215 */ 216 struct dm_hook_info { 217 bio_end_io_t *bi_end_io; 218 }; 219 220 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 221 bio_end_io_t *bi_end_io, void *bi_private) 222 { 223 h->bi_end_io = bio->bi_end_io; 224 225 bio->bi_end_io = bi_end_io; 226 bio->bi_private = bi_private; 227 } 228 229 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 230 { 231 bio->bi_end_io = h->bi_end_io; 232 } 233 234 /*----------------------------------------------------------------*/ 235 236 #define MIGRATION_POOL_SIZE 128 237 #define COMMIT_PERIOD HZ 238 #define MIGRATION_COUNT_WINDOW 10 239 240 /* 241 * The block size of the device holding cache data must be 242 * between 32KB and 1GB. 243 */ 244 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 245 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 246 247 enum cache_metadata_mode { 248 CM_WRITE, /* metadata may be changed */ 249 CM_READ_ONLY, /* metadata may not be changed */ 250 CM_FAIL 251 }; 252 253 enum cache_io_mode { 254 /* 255 * Data is written to cached blocks only. 
These blocks are marked 256 * dirty. If you lose the cache device you will lose data. 257 * Potential performance increase for both reads and writes. 258 */ 259 CM_IO_WRITEBACK, 260 261 /* 262 * Data is written to both cache and origin. Blocks are never 263 * dirty. Potential performance benfit for reads only. 264 */ 265 CM_IO_WRITETHROUGH, 266 267 /* 268 * A degraded mode useful for various cache coherency situations 269 * (eg, rolling back snapshots). Reads and writes always go to the 270 * origin. If a write goes to a cached oblock, then the cache 271 * block is invalidated. 272 */ 273 CM_IO_PASSTHROUGH 274 }; 275 276 struct cache_features { 277 enum cache_metadata_mode mode; 278 enum cache_io_mode io_mode; 279 unsigned int metadata_version; 280 bool discard_passdown:1; 281 }; 282 283 struct cache_stats { 284 atomic_t read_hit; 285 atomic_t read_miss; 286 atomic_t write_hit; 287 atomic_t write_miss; 288 atomic_t demotion; 289 atomic_t promotion; 290 atomic_t writeback; 291 atomic_t copies_avoided; 292 atomic_t cache_cell_clash; 293 atomic_t commit_count; 294 atomic_t discard_count; 295 }; 296 297 struct cache { 298 struct dm_target *ti; 299 spinlock_t lock; 300 301 /* 302 * Fields for converting from sectors to blocks. 303 */ 304 int sectors_per_block_shift; 305 sector_t sectors_per_block; 306 307 struct dm_cache_metadata *cmd; 308 309 /* 310 * Metadata is written to this device. 311 */ 312 struct dm_dev *metadata_dev; 313 314 /* 315 * The slower of the two data devices. Typically a spindle. 316 */ 317 struct dm_dev *origin_dev; 318 319 /* 320 * The faster of the two data devices. Typically an SSD. 321 */ 322 struct dm_dev *cache_dev; 323 324 /* 325 * Size of the origin device in _complete_ blocks and native sectors. 326 */ 327 dm_oblock_t origin_blocks; 328 sector_t origin_sectors; 329 330 /* 331 * Size of the cache device in blocks. 332 */ 333 dm_cblock_t cache_size; 334 335 /* 336 * Invalidation fields. 
337 */ 338 spinlock_t invalidation_lock; 339 struct list_head invalidation_requests; 340 341 sector_t migration_threshold; 342 wait_queue_head_t migration_wait; 343 atomic_t nr_allocated_migrations; 344 345 /* 346 * The number of in flight migrations that are performing 347 * background io. eg, promotion, writeback. 348 */ 349 atomic_t nr_io_migrations; 350 351 struct bio_list deferred_bios; 352 353 struct rw_semaphore quiesce_lock; 354 355 /* 356 * origin_blocks entries, discarded if set. 357 */ 358 dm_dblock_t discard_nr_blocks; 359 unsigned long *discard_bitset; 360 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 361 362 /* 363 * Rather than reconstructing the table line for the status we just 364 * save it and regurgitate. 365 */ 366 unsigned int nr_ctr_args; 367 const char **ctr_args; 368 369 struct dm_kcopyd_client *copier; 370 struct work_struct deferred_bio_worker; 371 struct work_struct migration_worker; 372 struct workqueue_struct *wq; 373 struct delayed_work waker; 374 struct dm_bio_prison_v2 *prison; 375 376 /* 377 * cache_size entries, dirty if set 378 */ 379 unsigned long *dirty_bitset; 380 atomic_t nr_dirty; 381 382 unsigned int policy_nr_args; 383 struct dm_cache_policy *policy; 384 385 /* 386 * Cache features such as write-through. 387 */ 388 struct cache_features features; 389 390 struct cache_stats stats; 391 392 bool need_tick_bio:1; 393 bool sized:1; 394 bool invalidate:1; 395 bool commit_requested:1; 396 bool loaded_mappings:1; 397 bool loaded_discards:1; 398 399 struct rw_semaphore background_work_lock; 400 401 struct batcher committer; 402 struct work_struct commit_ws; 403 404 struct dm_io_tracker tracker; 405 406 mempool_t migration_pool; 407 408 struct bio_set bs; 409 410 /* 411 * Cache_size entries. Set bits indicate blocks mapped beyond the 412 * target length, which are marked for invalidation. 
413 */ 414 unsigned long *invalid_bitset; 415 }; 416 417 struct per_bio_data { 418 bool tick:1; 419 unsigned int req_nr:2; 420 struct dm_bio_prison_cell_v2 *cell; 421 struct dm_hook_info hook_info; 422 sector_t len; 423 }; 424 425 struct dm_cache_migration { 426 struct continuation k; 427 struct cache *cache; 428 429 struct policy_work *op; 430 struct bio *overwrite_bio; 431 struct dm_bio_prison_cell_v2 *cell; 432 433 dm_cblock_t invalidate_cblock; 434 dm_oblock_t invalidate_oblock; 435 }; 436 437 /*----------------------------------------------------------------*/ 438 439 static bool writethrough_mode(struct cache *cache) 440 { 441 return cache->features.io_mode == CM_IO_WRITETHROUGH; 442 } 443 444 static bool writeback_mode(struct cache *cache) 445 { 446 return cache->features.io_mode == CM_IO_WRITEBACK; 447 } 448 449 static inline bool passthrough_mode(struct cache *cache) 450 { 451 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 452 } 453 454 /*----------------------------------------------------------------*/ 455 456 static void wake_deferred_bio_worker(struct cache *cache) 457 { 458 queue_work(cache->wq, &cache->deferred_bio_worker); 459 } 460 461 static void wake_migration_worker(struct cache *cache) 462 { 463 if (passthrough_mode(cache)) 464 return; 465 466 queue_work(cache->wq, &cache->migration_worker); 467 } 468 469 /*----------------------------------------------------------------*/ 470 471 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 472 { 473 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 474 } 475 476 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 477 { 478 dm_bio_prison_free_cell_v2(cache->prison, cell); 479 } 480 481 static struct dm_cache_migration *alloc_migration(struct cache *cache) 482 { 483 struct dm_cache_migration *mg; 484 485 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 486 487 memset(mg, 0, sizeof(*mg)); 488 489 mg->cache = cache; 
490 atomic_inc(&cache->nr_allocated_migrations); 491 492 return mg; 493 } 494 495 static void free_migration(struct dm_cache_migration *mg) 496 { 497 struct cache *cache = mg->cache; 498 499 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 500 wake_up(&cache->migration_wait); 501 502 mempool_free(mg, &cache->migration_pool); 503 } 504 505 /*----------------------------------------------------------------*/ 506 507 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 508 { 509 return to_oblock(from_oblock(b) + 1ull); 510 } 511 512 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 513 { 514 key->virtual = 0; 515 key->dev = 0; 516 key->block_begin = from_oblock(begin); 517 key->block_end = from_oblock(end); 518 } 519 520 /* 521 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 522 * level 1 which prevents *both* READs and WRITEs. 523 */ 524 #define WRITE_LOCK_LEVEL 0 525 #define READ_WRITE_LOCK_LEVEL 1 526 527 static unsigned int lock_level(struct bio *bio) 528 { 529 return bio_data_dir(bio) == WRITE ? 
530 WRITE_LOCK_LEVEL : 531 READ_WRITE_LOCK_LEVEL; 532 } 533 534 /* 535 *-------------------------------------------------------------- 536 * Per bio data 537 *-------------------------------------------------------------- 538 */ 539 540 static struct per_bio_data *get_per_bio_data(struct bio *bio) 541 { 542 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 543 544 BUG_ON(!pb); 545 return pb; 546 } 547 548 static struct per_bio_data *init_per_bio_data(struct bio *bio) 549 { 550 struct per_bio_data *pb = get_per_bio_data(bio); 551 552 pb->tick = false; 553 pb->req_nr = dm_bio_get_target_bio_nr(bio); 554 pb->cell = NULL; 555 pb->len = 0; 556 557 return pb; 558 } 559 560 /*----------------------------------------------------------------*/ 561 562 static void defer_bio(struct cache *cache, struct bio *bio) 563 { 564 spin_lock_irq(&cache->lock); 565 bio_list_add(&cache->deferred_bios, bio); 566 spin_unlock_irq(&cache->lock); 567 568 wake_deferred_bio_worker(cache); 569 } 570 571 static void defer_bios(struct cache *cache, struct bio_list *bios) 572 { 573 spin_lock_irq(&cache->lock); 574 bio_list_merge_init(&cache->deferred_bios, bios); 575 spin_unlock_irq(&cache->lock); 576 577 wake_deferred_bio_worker(cache); 578 } 579 580 /*----------------------------------------------------------------*/ 581 582 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 583 { 584 bool r; 585 struct per_bio_data *pb; 586 struct dm_cell_key_v2 key; 587 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 588 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 589 590 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 591 592 build_key(oblock, end, &key); 593 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 594 if (!r) { 595 /* 596 * Failed to get the lock. 
597 */ 598 free_prison_cell(cache, cell_prealloc); 599 return r; 600 } 601 602 if (cell != cell_prealloc) 603 free_prison_cell(cache, cell_prealloc); 604 605 pb = get_per_bio_data(bio); 606 pb->cell = cell; 607 608 return r; 609 } 610 611 /*----------------------------------------------------------------*/ 612 613 static bool is_dirty(struct cache *cache, dm_cblock_t b) 614 { 615 return test_bit(from_cblock(b), cache->dirty_bitset); 616 } 617 618 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 619 { 620 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 621 atomic_inc(&cache->nr_dirty); 622 policy_set_dirty(cache->policy, cblock); 623 } 624 } 625 626 /* 627 * These two are called when setting after migrations to force the policy 628 * and dirty bitset to be in sync. 629 */ 630 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 631 { 632 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 633 atomic_inc(&cache->nr_dirty); 634 policy_set_dirty(cache->policy, cblock); 635 } 636 637 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 638 { 639 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 640 if (atomic_dec_return(&cache->nr_dirty) == 0) 641 dm_table_event(cache->ti->table); 642 } 643 644 policy_clear_dirty(cache->policy, cblock); 645 } 646 647 /*----------------------------------------------------------------*/ 648 649 static bool block_size_is_power_of_two(struct cache *cache) 650 { 651 return cache->sectors_per_block_shift >= 0; 652 } 653 654 static dm_block_t block_div(dm_block_t b, uint32_t n) 655 { 656 do_div(b, n); 657 658 return b; 659 } 660 661 static dm_block_t oblocks_per_dblock(struct cache *cache) 662 { 663 dm_block_t oblocks = cache->discard_block_size; 664 665 if (block_size_is_power_of_two(cache)) 666 oblocks >>= cache->sectors_per_block_shift; 667 else 668 oblocks = block_div(oblocks, cache->sectors_per_block); 669 670 return oblocks; 671 } 672 673 
static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 674 { 675 return to_dblock(block_div(from_oblock(oblock), 676 oblocks_per_dblock(cache))); 677 } 678 679 static void set_discard(struct cache *cache, dm_dblock_t b) 680 { 681 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 682 atomic_inc(&cache->stats.discard_count); 683 684 spin_lock_irq(&cache->lock); 685 set_bit(from_dblock(b), cache->discard_bitset); 686 spin_unlock_irq(&cache->lock); 687 } 688 689 static void clear_discard(struct cache *cache, dm_dblock_t b) 690 { 691 spin_lock_irq(&cache->lock); 692 clear_bit(from_dblock(b), cache->discard_bitset); 693 spin_unlock_irq(&cache->lock); 694 } 695 696 static bool is_discarded(struct cache *cache, dm_dblock_t b) 697 { 698 int r; 699 700 spin_lock_irq(&cache->lock); 701 r = test_bit(from_dblock(b), cache->discard_bitset); 702 spin_unlock_irq(&cache->lock); 703 704 return r; 705 } 706 707 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 708 { 709 int r; 710 711 spin_lock_irq(&cache->lock); 712 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 713 cache->discard_bitset); 714 spin_unlock_irq(&cache->lock); 715 716 return r; 717 } 718 719 /* 720 * ------------------------------------------------------------- 721 * Remapping 722 *-------------------------------------------------------------- 723 */ 724 static void remap_to_origin(struct cache *cache, struct bio *bio) 725 { 726 bio_set_dev(bio, cache->origin_dev->bdev); 727 } 728 729 static void remap_to_cache(struct cache *cache, struct bio *bio, 730 dm_cblock_t cblock) 731 { 732 sector_t bi_sector = bio->bi_iter.bi_sector; 733 sector_t block = from_cblock(cblock); 734 735 bio_set_dev(bio, cache->cache_dev->bdev); 736 if (!block_size_is_power_of_two(cache)) 737 bio->bi_iter.bi_sector = 738 (block * cache->sectors_per_block) + 739 sector_div(bi_sector, cache->sectors_per_block); 740 else 741 bio->bi_iter.bi_sector = 742 (block << 
cache->sectors_per_block_shift) | 743 (bi_sector & (cache->sectors_per_block - 1)); 744 } 745 746 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 747 { 748 struct per_bio_data *pb; 749 750 spin_lock_irq(&cache->lock); 751 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 752 bio_op(bio) != REQ_OP_DISCARD) { 753 pb = get_per_bio_data(bio); 754 pb->tick = true; 755 cache->need_tick_bio = false; 756 } 757 spin_unlock_irq(&cache->lock); 758 } 759 760 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 761 dm_oblock_t oblock) 762 { 763 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 764 check_if_tick_bio_needed(cache, bio); 765 remap_to_origin(cache, bio); 766 if (bio_data_dir(bio) == WRITE) 767 clear_discard(cache, oblock_to_dblock(cache, oblock)); 768 } 769 770 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 771 dm_oblock_t oblock, dm_cblock_t cblock) 772 { 773 check_if_tick_bio_needed(cache, bio); 774 remap_to_cache(cache, bio, cblock); 775 if (bio_data_dir(bio) == WRITE) { 776 set_dirty(cache, cblock); 777 clear_discard(cache, oblock_to_dblock(cache, oblock)); 778 } 779 } 780 781 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 782 { 783 sector_t block_nr = bio->bi_iter.bi_sector; 784 785 if (!block_size_is_power_of_two(cache)) 786 (void) sector_div(block_nr, cache->sectors_per_block); 787 else 788 block_nr >>= cache->sectors_per_block_shift; 789 790 return to_oblock(block_nr); 791 } 792 793 static bool accountable_bio(struct cache *cache, struct bio *bio) 794 { 795 return bio_op(bio) != REQ_OP_DISCARD; 796 } 797 798 static void accounted_begin(struct cache *cache, struct bio *bio) 799 { 800 struct per_bio_data *pb; 801 802 if (accountable_bio(cache, bio)) { 803 pb = get_per_bio_data(bio); 804 pb->len = bio_sectors(bio); 805 dm_iot_io_begin(&cache->tracker, pb->len); 806 } 807 } 808 809 static void accounted_complete(struct 
cache *cache, struct bio *bio) 810 { 811 struct per_bio_data *pb = get_per_bio_data(bio); 812 813 dm_iot_io_end(&cache->tracker, pb->len); 814 } 815 816 static void accounted_request(struct cache *cache, struct bio *bio) 817 { 818 accounted_begin(cache, bio); 819 dm_submit_bio_remap(bio, NULL); 820 } 821 822 static void issue_op(struct bio *bio, void *context) 823 { 824 struct cache *cache = context; 825 826 accounted_request(cache, bio); 827 } 828 829 /* 830 * When running in writethrough mode we need to send writes to clean blocks 831 * to both the cache and origin devices. Clone the bio and send them in parallel. 832 */ 833 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 834 dm_oblock_t oblock, dm_cblock_t cblock) 835 { 836 struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio, 837 GFP_NOIO, &cache->bs); 838 839 BUG_ON(!origin_bio); 840 841 bio_chain(origin_bio, bio); 842 843 if (bio_data_dir(origin_bio) == WRITE) 844 clear_discard(cache, oblock_to_dblock(cache, oblock)); 845 submit_bio(origin_bio); 846 847 remap_to_cache(cache, bio, cblock); 848 } 849 850 /* 851 *-------------------------------------------------------------- 852 * Failure modes 853 *-------------------------------------------------------------- 854 */ 855 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 856 { 857 return cache->features.mode; 858 } 859 860 static const char *cache_device_name(struct cache *cache) 861 { 862 return dm_table_device_name(cache->ti->table); 863 } 864 865 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 866 { 867 static const char *descs[] = { 868 "write", 869 "read-only", 870 "fail" 871 }; 872 873 dm_table_event(cache->ti->table); 874 DMINFO("%s: switching cache to %s mode", 875 cache_device_name(cache), descs[(int)mode]); 876 } 877 878 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 879 { 880 bool needs_check; 881 enum 
cache_metadata_mode old_mode = get_cache_mode(cache); 882 883 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 884 DMERR("%s: unable to read needs_check flag, setting failure mode.", 885 cache_device_name(cache)); 886 new_mode = CM_FAIL; 887 } 888 889 if (new_mode == CM_WRITE && needs_check) { 890 DMERR("%s: unable to switch cache to write mode until repaired.", 891 cache_device_name(cache)); 892 if (old_mode != new_mode) 893 new_mode = old_mode; 894 else 895 new_mode = CM_READ_ONLY; 896 } 897 898 /* Never move out of fail mode */ 899 if (old_mode == CM_FAIL) 900 new_mode = CM_FAIL; 901 902 switch (new_mode) { 903 case CM_FAIL: 904 case CM_READ_ONLY: 905 dm_cache_metadata_set_read_only(cache->cmd); 906 break; 907 908 case CM_WRITE: 909 dm_cache_metadata_set_read_write(cache->cmd); 910 break; 911 } 912 913 cache->features.mode = new_mode; 914 915 if (new_mode != old_mode) 916 notify_mode_switch(cache, new_mode); 917 } 918 919 static void abort_transaction(struct cache *cache) 920 { 921 const char *dev_name = cache_device_name(cache); 922 923 if (get_cache_mode(cache) >= CM_READ_ONLY) 924 return; 925 926 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 927 if (dm_cache_metadata_abort(cache->cmd)) { 928 DMERR("%s: failed to abort metadata transaction", dev_name); 929 set_cache_mode(cache, CM_FAIL); 930 } 931 932 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 933 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 934 set_cache_mode(cache, CM_FAIL); 935 } 936 } 937 938 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 939 { 940 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 941 cache_device_name(cache), op, r); 942 abort_transaction(cache); 943 set_cache_mode(cache, CM_READ_ONLY); 944 } 945 946 /*----------------------------------------------------------------*/ 947 948 static void load_stats(struct cache *cache) 949 { 950 struct dm_cache_statistics stats; 951 952 
dm_cache_metadata_get_stats(cache->cmd, &stats); 953 atomic_set(&cache->stats.read_hit, stats.read_hits); 954 atomic_set(&cache->stats.read_miss, stats.read_misses); 955 atomic_set(&cache->stats.write_hit, stats.write_hits); 956 atomic_set(&cache->stats.write_miss, stats.write_misses); 957 } 958 959 static void save_stats(struct cache *cache) 960 { 961 struct dm_cache_statistics stats; 962 963 if (get_cache_mode(cache) >= CM_READ_ONLY) 964 return; 965 966 stats.read_hits = atomic_read(&cache->stats.read_hit); 967 stats.read_misses = atomic_read(&cache->stats.read_miss); 968 stats.write_hits = atomic_read(&cache->stats.write_hit); 969 stats.write_misses = atomic_read(&cache->stats.write_miss); 970 971 dm_cache_metadata_set_stats(cache->cmd, &stats); 972 } 973 974 static void update_stats(struct cache_stats *stats, enum policy_operation op) 975 { 976 switch (op) { 977 case POLICY_PROMOTE: 978 atomic_inc(&stats->promotion); 979 break; 980 981 case POLICY_DEMOTE: 982 atomic_inc(&stats->demotion); 983 break; 984 985 case POLICY_WRITEBACK: 986 atomic_inc(&stats->writeback); 987 break; 988 } 989 } 990 991 /* 992 *--------------------------------------------------------------------- 993 * Migration processing 994 * 995 * Migration covers moving data from the origin device to the cache, or 996 * vice versa. 
997 *--------------------------------------------------------------------- 998 */ 999 static void inc_io_migrations(struct cache *cache) 1000 { 1001 atomic_inc(&cache->nr_io_migrations); 1002 } 1003 1004 static void dec_io_migrations(struct cache *cache) 1005 { 1006 atomic_dec(&cache->nr_io_migrations); 1007 } 1008 1009 static bool discard_or_flush(struct bio *bio) 1010 { 1011 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1012 } 1013 1014 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1015 dm_dblock_t *b, dm_dblock_t *e) 1016 { 1017 sector_t sb = bio->bi_iter.bi_sector; 1018 sector_t se = bio_end_sector(bio); 1019 1020 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1021 1022 if (se - sb < cache->discard_block_size) 1023 *e = *b; 1024 else 1025 *e = to_dblock(block_div(se, cache->discard_block_size)); 1026 } 1027 1028 /*----------------------------------------------------------------*/ 1029 1030 static void prevent_background_work(struct cache *cache) 1031 { 1032 lockdep_off(); 1033 down_write(&cache->background_work_lock); 1034 lockdep_on(); 1035 } 1036 1037 static void allow_background_work(struct cache *cache) 1038 { 1039 lockdep_off(); 1040 up_write(&cache->background_work_lock); 1041 lockdep_on(); 1042 } 1043 1044 static bool background_work_begin(struct cache *cache) 1045 { 1046 bool r; 1047 1048 lockdep_off(); 1049 r = down_read_trylock(&cache->background_work_lock); 1050 lockdep_on(); 1051 1052 return r; 1053 } 1054 1055 static void background_work_end(struct cache *cache) 1056 { 1057 lockdep_off(); 1058 up_read(&cache->background_work_lock); 1059 lockdep_on(); 1060 } 1061 1062 /*----------------------------------------------------------------*/ 1063 1064 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1065 { 1066 return (bio_data_dir(bio) == WRITE) && 1067 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1068 } 1069 1070 static bool 
optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1071 { 1072 return writeback_mode(cache) && 1073 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1074 } 1075 1076 static void quiesce(struct dm_cache_migration *mg, 1077 void (*continuation)(struct work_struct *)) 1078 { 1079 init_continuation(&mg->k, continuation); 1080 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1081 } 1082 1083 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1084 { 1085 struct continuation *k = container_of(ws, struct continuation, ws); 1086 1087 return container_of(k, struct dm_cache_migration, k); 1088 } 1089 1090 static void copy_complete(int read_err, unsigned long write_err, void *context) 1091 { 1092 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1093 1094 if (read_err || write_err) 1095 mg->k.input = BLK_STS_IOERR; 1096 1097 queue_continuation(mg->cache->wq, &mg->k); 1098 } 1099 1100 static void copy(struct dm_cache_migration *mg, bool promote) 1101 { 1102 struct dm_io_region o_region, c_region; 1103 struct cache *cache = mg->cache; 1104 1105 o_region.bdev = cache->origin_dev->bdev; 1106 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1107 o_region.count = cache->sectors_per_block; 1108 1109 c_region.bdev = cache->cache_dev->bdev; 1110 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1111 c_region.count = cache->sectors_per_block; 1112 1113 if (promote) 1114 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1115 else 1116 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1117 } 1118 1119 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1120 { 1121 struct per_bio_data *pb = get_per_bio_data(bio); 1122 1123 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1124 free_prison_cell(cache, pb->cell); 1125 pb->cell = NULL; 1126 } 

/*
 * Endio installed on an overwrite bio by overwrite().  Restores the bio's
 * original endio function, copies any bio error into the migration's
 * continuation input and queues the continuation.
 */
static void overwrite_endio(struct bio *bio)
{
	struct dm_cache_migration *mg = bio->bi_private;
	struct cache *cache = mg->cache;
	struct per_bio_data *pb = get_per_bio_data(bio);

	dm_unhook_bio(&pb->hook_info, bio);

	if (bio->bi_status)
		mg->k.input = bio->bi_status;

	queue_continuation(cache->wq, &mg->k);
}

/*
 * Issue the migration's overwrite bio in place of a copy: hook its endio,
 * remap it to the cache (promotion) or the origin (any other op), and
 * submit it.  'continuation' is queued from overwrite_endio().
 */
static void overwrite(struct dm_cache_migration *mg,
		      void (*continuation)(struct work_struct *))
{
	struct bio *bio = mg->overwrite_bio;
	struct per_bio_data *pb = get_per_bio_data(bio);

	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);

	/*
	 * The overwrite bio is part of the copy operation, as such it does
	 * not set/clear discard or dirty flags.
	 */
	if (mg->op->op == POLICY_PROMOTE)
		remap_to_cache(mg->cache, bio, mg->op->cblock);
	else
		remap_to_origin(mg->cache, bio);

	init_continuation(&mg->k, continuation);
	accounted_request(mg->cache, bio);
}

/*
 * Migration steps:
 *
 * 1) exclusive lock preventing WRITEs
 * 2) quiesce
 * 3) copy or issue overwrite bio
 * 4) upgrade to exclusive lock preventing READs and WRITEs
 * 5) quiesce
 * 6) update metadata and commit
 * 7) unlock
 */

/*
 * Finish migration 'mg' with outcome 'success'.
 *
 * Updates the statistics (on success only), tells the policy the
 * background work is complete, fixes up the dirty state, and ends the
 * overwrite bio if there is one.  Then unlocks the migration's cell,
 * frees the migration, defers any bios released by the unlock, wakes the
 * migration worker and drops the background work hold.
 */
static void mg_complete(struct dm_cache_migration *mg, bool success)
{
	struct bio_list bios;
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	dm_cblock_t cblock = op->cblock;

	if (success)
		update_stats(&cache->stats, op->op);

	switch (op->op) {
	case POLICY_PROMOTE:
		clear_discard(cache, oblock_to_dblock(cache, op->oblock));
		policy_complete_background_work(cache->policy, op, success);

		if (mg->overwrite_bio) {
			/*
			 * On failure the bio carries the recorded error, or
			 * BLK_STS_IOERR when none was recorded.
			 */
			if (success)
				force_set_dirty(cache, cblock);
			else if (mg->k.input)
				mg->overwrite_bio->bi_status = mg->k.input;
			else
				mg->overwrite_bio->bi_status = BLK_STS_IOERR;
			bio_endio(mg->overwrite_bio);
		} else {
			if (success)
				force_clear_dirty(cache, cblock);
			dec_io_migrations(cache);
		}
		break;

	case POLICY_DEMOTE:
		/*
		 * We clear dirty here to update the nr_dirty counter.
		 */
		if (success)
			force_clear_dirty(cache, cblock);
		policy_complete_background_work(cache->policy, op, success);
		dec_io_migrations(cache);
		break;

	case POLICY_WRITEBACK:
		if (success)
			force_clear_dirty(cache, cblock);
		policy_complete_background_work(cache->policy, op, success);
		dec_io_migrations(cache);
		break;
	}

	bio_list_init(&bios);
	if (mg->cell) {
		if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
			free_prison_cell(cache, mg->cell);
	}

	/* 'mg' must not be touched after this point. */
	free_migration(mg);
	defer_bios(cache, &bios);
	wake_migration_worker(cache);

	background_work_end(cache);
}

/*
 * Continuation run after the commit requested by a demotion; the
 * migration succeeded iff the continuation input is 0.
 */
static void mg_success(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	mg_complete(mg, mg->k.input == 0);
}

/*
 * Apply the metadata change for the migration's policy operation:
 *  - promote: insert the cblock/oblock mapping and complete at once;
 *  - demote: remove the mapping, then complete only after a commit;
 *  - writeback: nothing to update, complete successfully.
 * A metadata failure is reported through metadata_operation_failed() and
 * completes the migration as failed.
 */
static void mg_update_metadata(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;

	switch (op->op) {
	case POLICY_PROMOTE:
		r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
		if (r) {
			DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);

			mg_complete(mg, false);
			return;
		}
		mg_complete(mg, true);
		break;

	case POLICY_DEMOTE:
		r = dm_cache_remove_mapping(cache->cmd, op->cblock);
		if (r) {
			DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);

			mg_complete(mg, false);
			return;
		}

		/*
		 * It would be nice if we only had to commit when a REQ_FLUSH
		 * comes through.  But there's one scenario that we have to
		 * look out for:
		 *
		 * - vblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and over written
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * rollback to having the data for vblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

/*
 * Continuation run once the copy (or overwrite) has finished: a recorded
 * error fails the migration, otherwise the metadata is updated.
 */
static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
1331 */ 1332 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, 1333 READ_WRITE_LOCK_LEVEL); 1334 if (r < 0) 1335 mg_complete(mg, false); 1336 1337 else if (r) 1338 quiesce(mg, mg_update_metadata); 1339 1340 else 1341 mg_update_metadata(ws); 1342 } 1343 } 1344 1345 static void mg_full_copy(struct work_struct *ws) 1346 { 1347 struct dm_cache_migration *mg = ws_to_mg(ws); 1348 struct cache *cache = mg->cache; 1349 struct policy_work *op = mg->op; 1350 bool is_policy_promote = (op->op == POLICY_PROMOTE); 1351 1352 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || 1353 is_discarded_oblock(cache, op->oblock)) { 1354 mg_upgrade_lock(ws); 1355 return; 1356 } 1357 1358 init_continuation(&mg->k, mg_upgrade_lock); 1359 copy(mg, is_policy_promote); 1360 } 1361 1362 static void mg_copy(struct work_struct *ws) 1363 { 1364 struct dm_cache_migration *mg = ws_to_mg(ws); 1365 1366 if (mg->overwrite_bio) { 1367 /* 1368 * No exclusive lock was held when we last checked if the bio 1369 * was optimisable. So we have to check again in case things 1370 * have changed (eg, the block may no longer be discarded). 1371 */ 1372 if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) { 1373 /* 1374 * Fallback to a real full copy after doing some tidying up. 1375 */ 1376 bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); 1377 1378 BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */ 1379 mg->overwrite_bio = NULL; 1380 inc_io_migrations(mg->cache); 1381 mg_full_copy(ws); 1382 return; 1383 } 1384 1385 /* 1386 * It's safe to do this here, even though it's new data 1387 * because all IO has been locked out of the block. 1388 * 1389 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL 1390 * so _not_ using mg_upgrade_lock() as continutation. 
1391 */ 1392 overwrite(mg, mg_update_metadata_after_copy); 1393 1394 } else 1395 mg_full_copy(ws); 1396 } 1397 1398 static int mg_lock_writes(struct dm_cache_migration *mg) 1399 { 1400 int r; 1401 struct dm_cell_key_v2 key; 1402 struct cache *cache = mg->cache; 1403 struct dm_bio_prison_cell_v2 *prealloc; 1404 1405 prealloc = alloc_prison_cell(cache); 1406 1407 /* 1408 * Prevent writes to the block, but allow reads to continue. 1409 * Unless we're using an overwrite bio, in which case we lock 1410 * everything. 1411 */ 1412 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1413 r = dm_cell_lock_v2(cache->prison, &key, 1414 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1415 prealloc, &mg->cell); 1416 if (r < 0) { 1417 free_prison_cell(cache, prealloc); 1418 mg_complete(mg, false); 1419 return r; 1420 } 1421 1422 if (mg->cell != prealloc) 1423 free_prison_cell(cache, prealloc); 1424 1425 if (r == 0) 1426 mg_copy(&mg->k.ws); 1427 else 1428 quiesce(mg, mg_copy); 1429 1430 return 0; 1431 } 1432 1433 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1434 { 1435 struct dm_cache_migration *mg; 1436 1437 if (!background_work_begin(cache)) { 1438 policy_complete_background_work(cache->policy, op, false); 1439 return -EPERM; 1440 } 1441 1442 mg = alloc_migration(cache); 1443 1444 mg->op = op; 1445 mg->overwrite_bio = bio; 1446 1447 if (!bio) 1448 inc_io_migrations(cache); 1449 1450 return mg_lock_writes(mg); 1451 } 1452 1453 /* 1454 *-------------------------------------------------------------- 1455 * invalidation processing 1456 *-------------------------------------------------------------- 1457 */ 1458 1459 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1460 { 1461 struct bio_list bios; 1462 struct cache *cache = mg->cache; 1463 1464 bio_list_init(&bios); 1465 if (mg->cell) { 1466 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1467 free_prison_cell(cache, mg->cell); 1468 } 
1469 1470 if (mg->overwrite_bio) { 1471 // Set generic error if the bio hasn't been issued yet, 1472 // e.g., invalidation or metadata commit failed before bio 1473 // submission. Otherwise preserve the bio's own error status. 1474 if (!success && !mg->overwrite_bio->bi_status) 1475 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1476 bio_endio(mg->overwrite_bio); 1477 } 1478 1479 free_migration(mg); 1480 defer_bios(cache, &bios); 1481 1482 background_work_end(cache); 1483 } 1484 1485 static void invalidate_completed(struct work_struct *ws) 1486 { 1487 struct dm_cache_migration *mg = ws_to_mg(ws); 1488 1489 invalidate_complete(mg, !mg->k.input); 1490 } 1491 1492 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1493 { 1494 int r; 1495 1496 r = policy_invalidate_mapping(cache->policy, cblock); 1497 if (!r) { 1498 r = dm_cache_remove_mapping(cache->cmd, cblock); 1499 if (r) { 1500 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1501 cache_device_name(cache)); 1502 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1503 } 1504 1505 } else if (r == -ENODATA) { 1506 /* 1507 * Harmless, already unmapped. 
1508 */ 1509 r = 0; 1510 1511 } else 1512 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1513 1514 return r; 1515 } 1516 1517 static void invalidate_committed(struct work_struct *ws) 1518 { 1519 struct dm_cache_migration *mg = ws_to_mg(ws); 1520 struct cache *cache = mg->cache; 1521 struct bio *bio = mg->overwrite_bio; 1522 struct per_bio_data *pb = get_per_bio_data(bio); 1523 1524 if (mg->k.input) { 1525 invalidate_complete(mg, false); 1526 return; 1527 } 1528 1529 init_continuation(&mg->k, invalidate_completed); 1530 remap_to_origin_clear_discard(cache, bio, mg->invalidate_oblock); 1531 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1532 dm_submit_bio_remap(bio, NULL); 1533 } 1534 1535 static void invalidate_remove(struct work_struct *ws) 1536 { 1537 int r; 1538 struct dm_cache_migration *mg = ws_to_mg(ws); 1539 struct cache *cache = mg->cache; 1540 1541 r = invalidate_cblock(cache, mg->invalidate_cblock); 1542 if (r) { 1543 invalidate_complete(mg, false); 1544 return; 1545 } 1546 1547 init_continuation(&mg->k, invalidate_committed); 1548 continue_after_commit(&cache->committer, &mg->k); 1549 schedule_commit(&cache->committer); 1550 } 1551 1552 static int invalidate_lock(struct dm_cache_migration *mg) 1553 { 1554 int r; 1555 struct dm_cell_key_v2 key; 1556 struct cache *cache = mg->cache; 1557 struct dm_bio_prison_cell_v2 *prealloc; 1558 1559 prealloc = alloc_prison_cell(cache); 1560 1561 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1562 r = dm_cell_lock_v2(cache->prison, &key, 1563 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1564 if (r < 0) { 1565 free_prison_cell(cache, prealloc); 1566 1567 /* Defer the bio for retrying the cell lock */ 1568 if (mg->overwrite_bio) { 1569 struct bio *bio = mg->overwrite_bio; 1570 1571 mg->overwrite_bio = NULL; 1572 defer_bio(cache, bio); 1573 } 1574 1575 invalidate_complete(mg, false); 1576 return r; 1577 } 1578 1579 if (mg->cell != prealloc) 1580 
free_prison_cell(cache, prealloc); 1581 1582 if (r) 1583 quiesce(mg, invalidate_remove); 1584 1585 else { 1586 /* 1587 * We can't call invalidate_remove() directly here because we 1588 * might still be in request context. 1589 */ 1590 init_continuation(&mg->k, invalidate_remove); 1591 queue_work(cache->wq, &mg->k.ws); 1592 } 1593 1594 return 0; 1595 } 1596 1597 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1598 dm_oblock_t oblock, struct bio *bio) 1599 { 1600 struct dm_cache_migration *mg; 1601 1602 if (!background_work_begin(cache)) 1603 return -EPERM; 1604 1605 mg = alloc_migration(cache); 1606 1607 mg->overwrite_bio = bio; 1608 mg->invalidate_cblock = cblock; 1609 mg->invalidate_oblock = oblock; 1610 1611 return invalidate_lock(mg); 1612 } 1613 1614 /* 1615 *-------------------------------------------------------------- 1616 * bio processing 1617 *-------------------------------------------------------------- 1618 */ 1619 1620 enum busy { 1621 IDLE, 1622 BUSY 1623 }; 1624 1625 static enum busy spare_migration_bandwidth(struct cache *cache) 1626 { 1627 bool idle = dm_iot_idle_for(&cache->tracker, HZ); 1628 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1629 cache->sectors_per_block; 1630 1631 if (idle && current_volume <= cache->migration_threshold) 1632 return IDLE; 1633 else 1634 return BUSY; 1635 } 1636 1637 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1638 { 1639 atomic_inc(bio_data_dir(bio) == READ ? 1640 &cache->stats.read_hit : &cache->stats.write_hit); 1641 } 1642 1643 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1644 { 1645 atomic_inc(bio_data_dir(bio) == READ ? 
1646 &cache->stats.read_miss : &cache->stats.write_miss); 1647 } 1648 1649 /*----------------------------------------------------------------*/ 1650 1651 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1652 bool *commit_needed) 1653 { 1654 int r, data_dir; 1655 bool rb, background_queued; 1656 dm_cblock_t cblock; 1657 1658 *commit_needed = false; 1659 1660 rb = bio_detain_shared(cache, block, bio); 1661 if (!rb) { 1662 /* 1663 * An exclusive lock is held for this block, so we have to 1664 * wait. We set the commit_needed flag so the current 1665 * transaction will be committed asap, allowing this lock 1666 * to be dropped. 1667 */ 1668 *commit_needed = true; 1669 return DM_MAPIO_SUBMITTED; 1670 } 1671 1672 data_dir = bio_data_dir(bio); 1673 1674 if (optimisable_bio(cache, bio, block)) { 1675 struct policy_work *op = NULL; 1676 1677 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1678 if (unlikely(r && r != -ENOENT)) { 1679 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1680 cache_device_name(cache), r); 1681 bio_io_error(bio); 1682 return DM_MAPIO_SUBMITTED; 1683 } 1684 1685 if (r == -ENOENT && op) { 1686 bio_drop_shared_lock(cache, bio); 1687 BUG_ON(op->op != POLICY_PROMOTE); 1688 mg_start(cache, op, bio); 1689 return DM_MAPIO_SUBMITTED; 1690 } 1691 } else { 1692 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1693 if (unlikely(r && r != -ENOENT)) { 1694 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1695 cache_device_name(cache), r); 1696 bio_io_error(bio); 1697 return DM_MAPIO_SUBMITTED; 1698 } 1699 1700 if (background_queued) 1701 wake_migration_worker(cache); 1702 } 1703 1704 if (r == -ENOENT) { 1705 struct per_bio_data *pb = get_per_bio_data(bio); 1706 1707 /* 1708 * Miss. 
1709 */ 1710 inc_miss_counter(cache, bio); 1711 if (pb->req_nr == 0) { 1712 accounted_begin(cache, bio); 1713 remap_to_origin_clear_discard(cache, bio, block); 1714 } else { 1715 /* 1716 * This is a duplicate writethrough io that is no 1717 * longer needed because the block has been demoted. 1718 */ 1719 bio_endio(bio); 1720 return DM_MAPIO_SUBMITTED; 1721 } 1722 } else { 1723 /* 1724 * Hit. 1725 */ 1726 inc_hit_counter(cache, bio); 1727 1728 /* 1729 * Passthrough always maps to the origin, invalidating any 1730 * cache blocks that are written to. 1731 */ 1732 if (passthrough_mode(cache)) { 1733 if (bio_data_dir(bio) == WRITE) { 1734 bio_drop_shared_lock(cache, bio); 1735 atomic_inc(&cache->stats.demotion); 1736 invalidate_start(cache, cblock, block, bio); 1737 return DM_MAPIO_SUBMITTED; 1738 } else 1739 remap_to_origin_clear_discard(cache, bio, block); 1740 } else { 1741 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1742 !is_dirty(cache, cblock)) { 1743 remap_to_origin_and_cache(cache, bio, block, cblock); 1744 accounted_begin(cache, bio); 1745 } else 1746 remap_to_cache_dirty(cache, bio, block, cblock); 1747 } 1748 } 1749 1750 /* 1751 * dm core turns FUA requests into a separate payload and FLUSH req. 1752 */ 1753 if (bio->bi_opf & REQ_FUA) { 1754 /* 1755 * issue_after_commit will call accounted_begin a second time. So 1756 * we call accounted_complete() to avoid double accounting. 1757 */ 1758 accounted_complete(cache, bio); 1759 issue_after_commit(&cache->committer, bio); 1760 *commit_needed = true; 1761 return DM_MAPIO_SUBMITTED; 1762 } 1763 1764 return DM_MAPIO_REMAPPED; 1765 } 1766 1767 static bool process_bio(struct cache *cache, struct bio *bio) 1768 { 1769 bool commit_needed; 1770 1771 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1772 dm_submit_bio_remap(bio, NULL); 1773 1774 return commit_needed; 1775 } 1776 1777 /* 1778 * A non-zero return indicates read_only or fail_io mode. 
1779 */ 1780 static int commit(struct cache *cache, bool clean_shutdown) 1781 { 1782 int r; 1783 1784 if (get_cache_mode(cache) >= CM_READ_ONLY) 1785 return -EINVAL; 1786 1787 atomic_inc(&cache->stats.commit_count); 1788 r = dm_cache_commit(cache->cmd, clean_shutdown); 1789 if (r) 1790 metadata_operation_failed(cache, "dm_cache_commit", r); 1791 1792 return r; 1793 } 1794 1795 /* 1796 * Used by the batcher. 1797 */ 1798 static blk_status_t commit_op(void *context) 1799 { 1800 struct cache *cache = context; 1801 1802 if (dm_cache_changed_this_transaction(cache->cmd)) 1803 return errno_to_blk_status(commit(cache, false)); 1804 1805 return 0; 1806 } 1807 1808 /*----------------------------------------------------------------*/ 1809 1810 static bool process_flush_bio(struct cache *cache, struct bio *bio) 1811 { 1812 struct per_bio_data *pb = get_per_bio_data(bio); 1813 1814 if (!pb->req_nr) 1815 remap_to_origin(cache, bio); 1816 else 1817 remap_to_cache(cache, bio, 0); 1818 1819 issue_after_commit(&cache->committer, bio); 1820 return true; 1821 } 1822 1823 static bool process_discard_bio(struct cache *cache, struct bio *bio) 1824 { 1825 dm_dblock_t b, e; 1826 1827 /* 1828 * FIXME: do we need to lock the region? Or can we just assume the 1829 * user wont be so foolish as to issue discard concurrently with 1830 * other IO? 
1831 */ 1832 calc_discard_block_range(cache, bio, &b, &e); 1833 while (b != e) { 1834 set_discard(cache, b); 1835 b = to_dblock(from_dblock(b) + 1); 1836 } 1837 1838 if (cache->features.discard_passdown) { 1839 remap_to_origin(cache, bio); 1840 dm_submit_bio_remap(bio, NULL); 1841 } else 1842 bio_endio(bio); 1843 1844 return false; 1845 } 1846 1847 static void process_deferred_bios(struct work_struct *ws) 1848 { 1849 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1850 1851 bool commit_needed = false; 1852 struct bio_list bios; 1853 struct bio *bio; 1854 1855 bio_list_init(&bios); 1856 1857 spin_lock_irq(&cache->lock); 1858 bio_list_merge_init(&bios, &cache->deferred_bios); 1859 spin_unlock_irq(&cache->lock); 1860 1861 while ((bio = bio_list_pop(&bios))) { 1862 if (bio->bi_opf & REQ_PREFLUSH) 1863 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1864 1865 else if (bio_op(bio) == REQ_OP_DISCARD) 1866 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1867 1868 else 1869 commit_needed = process_bio(cache, bio) || commit_needed; 1870 cond_resched(); 1871 } 1872 1873 if (commit_needed) 1874 schedule_commit(&cache->committer); 1875 } 1876 1877 /* 1878 *-------------------------------------------------------------- 1879 * Main worker loop 1880 *-------------------------------------------------------------- 1881 */ 1882 static void requeue_deferred_bios(struct cache *cache) 1883 { 1884 struct bio *bio; 1885 struct bio_list bios; 1886 1887 bio_list_init(&bios); 1888 bio_list_merge_init(&bios, &cache->deferred_bios); 1889 1890 while ((bio = bio_list_pop(&bios))) { 1891 bio->bi_status = BLK_STS_DM_REQUEUE; 1892 bio_endio(bio); 1893 cond_resched(); 1894 } 1895 } 1896 1897 /* 1898 * We want to commit periodically so that not too much 1899 * unwritten metadata builds up. 
1900 */ 1901 static void do_waker(struct work_struct *ws) 1902 { 1903 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1904 1905 policy_tick(cache->policy, true); 1906 wake_migration_worker(cache); 1907 schedule_commit(&cache->committer); 1908 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1909 } 1910 1911 static void check_migrations(struct work_struct *ws) 1912 { 1913 int r; 1914 struct policy_work *op; 1915 struct cache *cache = container_of(ws, struct cache, migration_worker); 1916 enum busy b; 1917 1918 for (;;) { 1919 b = spare_migration_bandwidth(cache); 1920 1921 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1922 if (r == -ENODATA) 1923 break; 1924 1925 if (r) { 1926 DMERR_LIMIT("%s: policy_background_work failed", 1927 cache_device_name(cache)); 1928 break; 1929 } 1930 1931 r = mg_start(cache, op, NULL); 1932 if (r) 1933 break; 1934 1935 cond_resched(); 1936 } 1937 } 1938 1939 /* 1940 *-------------------------------------------------------------- 1941 * Target methods 1942 *-------------------------------------------------------------- 1943 */ 1944 1945 /* 1946 * This function gets called on the error paths of the constructor, so we 1947 * have to cope with a partially initialised struct. 
 *
 * Every optional resource is therefore checked for NULL before being
 * released; the cache struct itself is freed last.
 */
static void __destroy(struct cache *cache)
{
	mempool_exit(&cache->migration_pool);

	if (cache->prison)
		dm_bio_prison_destroy_v2(cache->prison);

	if (cache->wq)
		destroy_workqueue(cache->wq);

	if (cache->dirty_bitset)
		free_bitset(cache->dirty_bitset);

	if (cache->discard_bitset)
		free_bitset(cache->discard_bitset);

	if (cache->invalid_bitset)
		free_bitset(cache->invalid_bitset);

	if (cache->copier)
		dm_kcopyd_client_destroy(cache->copier);

	if (cache->cmd)
		dm_cache_metadata_close(cache->cmd);

	if (cache->metadata_dev)
		dm_put_device(cache->ti, cache->metadata_dev);

	if (cache->origin_dev)
		dm_put_device(cache->ti, cache->origin_dev);

	if (cache->cache_dev)
		dm_put_device(cache->ti, cache->cache_dev);

	if (cache->policy)
		dm_cache_policy_destroy(cache->policy);

	bioset_exit(&cache->bs);

	kfree(cache);
}

/*
 * Tear down a fully constructed cache: stop the periodic waker, free the
 * saved constructor arguments, then release everything else via
 * __destroy().
 */
static void destroy(struct cache *cache)
{
	unsigned int i;

	/* The waker re-queues itself, so it must be cancelled synchronously. */
	cancel_delayed_work_sync(&cache->waker);

	for (i = 0; i < cache->nr_ctr_args ; i++)
		kfree(cache->ctr_args[i]);
	kfree(cache->ctr_args);

	__destroy(cache);
}

/* Target destructor: the cache is stored in ti->private. */
static void cache_dtr(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	destroy(cache);
}

/* Size of the underlying block device, in sectors. */
static sector_t get_dev_size(struct dm_dev *dev)
{
	return bdev_nr_sectors(dev->bdev);
}

/*----------------------------------------------------------------*/

/*
 * Construct a cache device mapping.
 *
 * cache <metadata dev> <cache dev> <origin dev> <block size>
 *       <#feature args> [<feature arg>]*
 *       <policy> <#policy args> [<policy arg>]*
 *
 * metadata dev    : fast device holding the persistent metadata
 * cache dev	   : fast device holding cached data blocks
 * origin dev	   : slow device holding original data blocks
 * block size	   : cache unit size in sectors
 *
 * #feature args   : number of feature arguments passed (0 to 3)
 * feature args    : any of the optional feature arguments listed below.
 *		     (The default io mode is writeback.)
 *
 * policy	   : the replacement policy to use
 * #policy args    : an even number of policy arguments corresponding
 *		     to key/value pairs passed to the policy
 * policy args	   : key/value pairs passed to the policy
 *		     E.g. 'sequential_threshold 1024'
 *		     See cache-policies.txt for details.
 *		     The key 'migration_threshold' is consumed by the cache
 *		     target itself; every other key is handed to the policy.
 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *		     content from being different from origin block content.
 *		     Without this argument, the default behaviour is to write
 *		     back cache block contents later for performance reasons,
 *		     so they may differ from the corresponding origin blocks.
 *   writeback	   : explicitly select the default write back behaviour.
 *   passthrough   : always map io to the origin, invalidating any cache
 *		     blocks that are written to.
 *   metadata2	   : use version 2 of the metadata format (default is 1).
 *   no_discard_passdown : do not pass discards down to the origin device;
 *		     they are only recorded by the cache.
 *
 * At most one of writeback, writethrough and passthrough may be given.
2046 */ 2047 struct cache_args { 2048 struct dm_target *ti; 2049 2050 struct dm_dev *metadata_dev; 2051 2052 struct dm_dev *cache_dev; 2053 sector_t cache_sectors; 2054 2055 struct dm_dev *origin_dev; 2056 2057 uint32_t block_size; 2058 2059 const char *policy_name; 2060 int policy_argc; 2061 const char **policy_argv; 2062 2063 struct cache_features features; 2064 }; 2065 2066 static void destroy_cache_args(struct cache_args *ca) 2067 { 2068 if (ca->metadata_dev) 2069 dm_put_device(ca->ti, ca->metadata_dev); 2070 2071 if (ca->cache_dev) 2072 dm_put_device(ca->ti, ca->cache_dev); 2073 2074 if (ca->origin_dev) 2075 dm_put_device(ca->ti, ca->origin_dev); 2076 2077 kfree(ca); 2078 } 2079 2080 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2081 { 2082 if (!as->argc) { 2083 *error = "Insufficient args"; 2084 return false; 2085 } 2086 2087 return true; 2088 } 2089 2090 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2091 char **error) 2092 { 2093 int r; 2094 sector_t metadata_dev_size; 2095 2096 if (!at_least_one_arg(as, error)) 2097 return -EINVAL; 2098 2099 r = dm_get_device(ca->ti, dm_shift_arg(as), 2100 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev); 2101 if (r) { 2102 *error = "Error opening metadata device"; 2103 return r; 2104 } 2105 2106 metadata_dev_size = get_dev_size(ca->metadata_dev); 2107 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2108 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 2109 ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS); 2110 2111 return 0; 2112 } 2113 2114 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2115 char **error) 2116 { 2117 int r; 2118 2119 if (!at_least_one_arg(as, error)) 2120 return -EINVAL; 2121 2122 r = dm_get_device(ca->ti, dm_shift_arg(as), 2123 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev); 2124 if (r) { 2125 *error = "Error opening cache device"; 2126 return r; 2127 } 2128 
ca->cache_sectors = get_dev_size(ca->cache_dev); 2129 2130 return 0; 2131 } 2132 2133 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2134 char **error) 2135 { 2136 int r; 2137 2138 if (!at_least_one_arg(as, error)) 2139 return -EINVAL; 2140 2141 r = dm_get_device(ca->ti, dm_shift_arg(as), 2142 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev); 2143 if (r) { 2144 *error = "Error opening origin device"; 2145 return r; 2146 } 2147 2148 return 0; 2149 } 2150 2151 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2152 char **error) 2153 { 2154 unsigned long block_size; 2155 2156 if (!at_least_one_arg(as, error)) 2157 return -EINVAL; 2158 2159 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2160 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2161 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2162 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2163 *error = "Invalid data block size"; 2164 return -EINVAL; 2165 } 2166 2167 if (block_size > ca->cache_sectors) { 2168 *error = "Data block size is larger than the cache device"; 2169 return -EINVAL; 2170 } 2171 2172 ca->block_size = block_size; 2173 2174 return 0; 2175 } 2176 2177 static void init_features(struct cache_features *cf) 2178 { 2179 cf->mode = CM_WRITE; 2180 cf->io_mode = CM_IO_WRITEBACK; 2181 cf->metadata_version = 1; 2182 cf->discard_passdown = true; 2183 } 2184 2185 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2186 char **error) 2187 { 2188 static const struct dm_arg _args[] = { 2189 {0, 3, "Invalid number of cache feature arguments"}, 2190 }; 2191 2192 int r, mode_ctr = 0; 2193 unsigned int argc; 2194 const char *arg; 2195 struct cache_features *cf = &ca->features; 2196 2197 init_features(cf); 2198 2199 r = dm_read_arg_group(_args, as, &argc, error); 2200 if (r) 2201 return -EINVAL; 2202 2203 while (argc--) { 2204 arg = dm_shift_arg(as); 2205 2206 if (!strcasecmp(arg, "writeback")) { 2207 cf->io_mode = 
CM_IO_WRITEBACK; 2208 mode_ctr++; 2209 } 2210 2211 else if (!strcasecmp(arg, "writethrough")) { 2212 cf->io_mode = CM_IO_WRITETHROUGH; 2213 mode_ctr++; 2214 } 2215 2216 else if (!strcasecmp(arg, "passthrough")) { 2217 cf->io_mode = CM_IO_PASSTHROUGH; 2218 mode_ctr++; 2219 } 2220 2221 else if (!strcasecmp(arg, "metadata2")) 2222 cf->metadata_version = 2; 2223 2224 else if (!strcasecmp(arg, "no_discard_passdown")) 2225 cf->discard_passdown = false; 2226 2227 else { 2228 *error = "Unrecognised cache feature requested"; 2229 return -EINVAL; 2230 } 2231 } 2232 2233 if (mode_ctr > 1) { 2234 *error = "Duplicate cache io_mode features requested"; 2235 return -EINVAL; 2236 } 2237 2238 return 0; 2239 } 2240 2241 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2242 char **error) 2243 { 2244 static const struct dm_arg _args[] = { 2245 {0, 1024, "Invalid number of policy arguments"}, 2246 }; 2247 2248 int r; 2249 2250 if (!at_least_one_arg(as, error)) 2251 return -EINVAL; 2252 2253 ca->policy_name = dm_shift_arg(as); 2254 2255 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2256 if (r) 2257 return -EINVAL; 2258 2259 ca->policy_argv = (const char **)as->argv; 2260 dm_consume_args(as, ca->policy_argc); 2261 2262 return 0; 2263 } 2264 2265 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2266 char **error) 2267 { 2268 int r; 2269 struct dm_arg_set as; 2270 2271 as.argc = argc; 2272 as.argv = argv; 2273 2274 r = parse_metadata_dev(ca, &as, error); 2275 if (r) 2276 return r; 2277 2278 r = parse_cache_dev(ca, &as, error); 2279 if (r) 2280 return r; 2281 2282 r = parse_origin_dev(ca, &as, error); 2283 if (r) 2284 return r; 2285 2286 r = parse_block_size(ca, &as, error); 2287 if (r) 2288 return r; 2289 2290 r = parse_features(ca, &as, error); 2291 if (r) 2292 return r; 2293 2294 r = parse_policy(ca, &as, error); 2295 if (r) 2296 return r; 2297 2298 return 0; 2299 } 2300 2301 
/*----------------------------------------------------------------*/ 2302 2303 static struct kmem_cache *migration_cache = NULL; 2304 2305 #define NOT_CORE_OPTION 1 2306 2307 static int process_config_option(struct cache *cache, const char *key, const char *value) 2308 { 2309 unsigned long tmp; 2310 2311 if (!strcasecmp(key, "migration_threshold")) { 2312 if (kstrtoul(value, 10, &tmp)) 2313 return -EINVAL; 2314 2315 cache->migration_threshold = tmp; 2316 return 0; 2317 } 2318 2319 return NOT_CORE_OPTION; 2320 } 2321 2322 static int set_config_value(struct cache *cache, const char *key, const char *value) 2323 { 2324 int r = process_config_option(cache, key, value); 2325 2326 if (r == NOT_CORE_OPTION) 2327 r = policy_set_config_value(cache->policy, key, value); 2328 2329 if (r) 2330 DMWARN("bad config value for %s: %s", key, value); 2331 2332 return r; 2333 } 2334 2335 static int set_config_values(struct cache *cache, int argc, const char **argv) 2336 { 2337 int r = 0; 2338 2339 if (argc & 1) { 2340 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2341 return -EINVAL; 2342 } 2343 2344 while (argc) { 2345 r = set_config_value(cache, argv[0], argv[1]); 2346 if (r) 2347 break; 2348 2349 argc -= 2; 2350 argv += 2; 2351 } 2352 2353 return r; 2354 } 2355 2356 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2357 char **error) 2358 { 2359 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2360 cache->cache_size, 2361 cache->origin_sectors, 2362 cache->sectors_per_block); 2363 if (IS_ERR(p)) { 2364 *error = "Error creating cache's policy"; 2365 return PTR_ERR(p); 2366 } 2367 cache->policy = p; 2368 BUG_ON(!cache->policy); 2369 2370 return 0; 2371 } 2372 2373 /* 2374 * We want the discard block size to be at least the size of the cache 2375 * block size and have no more than 2^14 discard blocks across the origin. 
 */
#define MAX_DISCARD_BLOCKS (1 << 14)

/*
 * Returns true when an origin of 'origin_size' sectors would need more
 * than MAX_DISCARD_BLOCKS discard blocks of 'discard_block_size' sectors.
 */
static bool too_many_discard_blocks(sector_t discard_block_size,
				    sector_t origin_size)
{
	/*
	 * sector_div() is called for its effect on the local origin_size
	 * (presumably leaving the quotient there); its return value is
	 * deliberately discarded.
	 */
	(void) sector_div(origin_size, discard_block_size);

	return origin_size > MAX_DISCARD_BLOCKS;
}

/*
 * Start from the cache block size and keep doubling until the origin is
 * covered by no more than MAX_DISCARD_BLOCKS discard blocks.  A zero-sized
 * origin leaves the discard block size equal to the cache block size.
 */
static sector_t calculate_discard_block_size(sector_t cache_block_size,
					     sector_t origin_size)
{
	sector_t discard_block_size = cache_block_size;

	if (origin_size)
		while (too_many_discard_blocks(discard_block_size, origin_size))
			discard_block_size *= 2;

	return discard_block_size;
}

/*
 * Record the cache size (in cache blocks).  A rate-limited warning is
 * emitted when the size changes to more than 2^20 blocks.
 */
static void set_cache_size(struct cache *cache, dm_cblock_t size)
{
	dm_block_t nr_blocks = from_cblock(size);

	if (nr_blocks > (1 << 20) && cache->cache_size != size)
		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
			     "Please consider increasing the cache block size to reduce the overall cache block count.",
			     (unsigned long long) nr_blocks);

	cache->cache_size = size;
}

#define DEFAULT_MIGRATION_THRESHOLD 2048

static int cache_create(struct cache_args *ca, struct cache **result)
{
	int r = 0;
	char **error = &ca->ti->error;
	struct cache *cache;
	struct dm_target *ti = ca->ti;
	dm_block_t origin_blocks;
	struct dm_cache_metadata *cmd;
	bool may_format = ca->features.mode == CM_WRITE;

	cache = kzalloc_obj(*cache);
	if (!cache)
		return -ENOMEM;

	cache->ti = ca->ti;
	ti->private = cache;
	ti->accounts_remapped_io = true;
	ti->num_flush_bios = 2;
	ti->flush_supported = true;

	ti->num_discard_bios = 1;
	ti->discards_supported = true;

	ti->per_io_data_size = sizeof(struct per_bio_data);

	cache->features = ca->features;
	if
(writethrough_mode(cache)) { 2441 /* Create bioset for writethrough bios issued to origin */ 2442 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2443 if (r) 2444 goto bad; 2445 } 2446 2447 cache->metadata_dev = ca->metadata_dev; 2448 cache->origin_dev = ca->origin_dev; 2449 cache->cache_dev = ca->cache_dev; 2450 2451 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2452 2453 origin_blocks = cache->origin_sectors = ti->len; 2454 origin_blocks = block_div(origin_blocks, ca->block_size); 2455 cache->origin_blocks = to_oblock(origin_blocks); 2456 2457 cache->sectors_per_block = ca->block_size; 2458 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2459 r = -EINVAL; 2460 goto bad; 2461 } 2462 2463 if (ca->block_size & (ca->block_size - 1)) { 2464 dm_block_t cache_size = ca->cache_sectors; 2465 2466 cache->sectors_per_block_shift = -1; 2467 cache_size = block_div(cache_size, ca->block_size); 2468 set_cache_size(cache, to_cblock(cache_size)); 2469 } else { 2470 cache->sectors_per_block_shift = __ffs(ca->block_size); 2471 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2472 } 2473 2474 r = create_cache_policy(cache, ca, error); 2475 if (r) 2476 goto bad; 2477 2478 cache->policy_nr_args = ca->policy_argc; 2479 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2480 2481 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2482 if (r) { 2483 *error = "Error setting cache policy's config values"; 2484 goto bad; 2485 } 2486 2487 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2488 ca->block_size, may_format, 2489 dm_cache_policy_get_hint_size(cache->policy), 2490 ca->features.metadata_version); 2491 if (IS_ERR(cmd)) { 2492 *error = "Error creating metadata object"; 2493 r = PTR_ERR(cmd); 2494 goto bad; 2495 } 2496 cache->cmd = cmd; 2497 set_cache_mode(cache, CM_WRITE); 2498 if (get_cache_mode(cache) != CM_WRITE) { 2499 *error = "Unable to get write access to metadata, please check/repair 
metadata."; 2500 r = -EINVAL; 2501 goto bad; 2502 } 2503 2504 if (passthrough_mode(cache)) 2505 policy_allow_migrations(cache->policy, false); 2506 2507 spin_lock_init(&cache->lock); 2508 bio_list_init(&cache->deferred_bios); 2509 atomic_set(&cache->nr_allocated_migrations, 0); 2510 atomic_set(&cache->nr_io_migrations, 0); 2511 init_waitqueue_head(&cache->migration_wait); 2512 2513 r = -ENOMEM; 2514 atomic_set(&cache->nr_dirty, 0); 2515 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2516 if (!cache->dirty_bitset) { 2517 *error = "could not allocate dirty bitset"; 2518 goto bad; 2519 } 2520 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2521 2522 cache->discard_block_size = 2523 calculate_discard_block_size(cache->sectors_per_block, 2524 cache->origin_sectors); 2525 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2526 cache->discard_block_size)); 2527 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2528 if (!cache->discard_bitset) { 2529 *error = "could not allocate discard bitset"; 2530 goto bad; 2531 } 2532 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2533 2534 cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2535 if (!cache->invalid_bitset) { 2536 *error = "could not allocate bitset for invalid blocks"; 2537 goto bad; 2538 } 2539 clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); 2540 2541 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2542 if (IS_ERR(cache->copier)) { 2543 *error = "could not create kcopyd client"; 2544 r = PTR_ERR(cache->copier); 2545 goto bad; 2546 } 2547 2548 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, 2549 WQ_MEM_RECLAIM | WQ_PERCPU, 0); 2550 if (!cache->wq) { 2551 *error = "could not create workqueue for metadata object"; 2552 goto bad; 2553 } 2554 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2555 INIT_WORK(&cache->migration_worker, 
check_migrations); 2556 INIT_DELAYED_WORK(&cache->waker, do_waker); 2557 2558 cache->prison = dm_bio_prison_create_v2(cache->wq); 2559 if (!cache->prison) { 2560 *error = "could not create bio prison"; 2561 goto bad; 2562 } 2563 2564 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2565 migration_cache); 2566 if (r) { 2567 *error = "Error creating cache's migration mempool"; 2568 goto bad; 2569 } 2570 2571 cache->need_tick_bio = true; 2572 cache->sized = false; 2573 cache->invalidate = false; 2574 cache->commit_requested = false; 2575 cache->loaded_mappings = false; 2576 cache->loaded_discards = false; 2577 2578 load_stats(cache); 2579 2580 atomic_set(&cache->stats.demotion, 0); 2581 atomic_set(&cache->stats.promotion, 0); 2582 atomic_set(&cache->stats.copies_avoided, 0); 2583 atomic_set(&cache->stats.cache_cell_clash, 0); 2584 atomic_set(&cache->stats.commit_count, 0); 2585 atomic_set(&cache->stats.discard_count, 0); 2586 2587 spin_lock_init(&cache->invalidation_lock); 2588 INIT_LIST_HEAD(&cache->invalidation_requests); 2589 2590 batcher_init(&cache->committer, commit_op, cache, 2591 issue_op, cache, cache->wq); 2592 dm_iot_init(&cache->tracker); 2593 2594 init_rwsem(&cache->background_work_lock); 2595 prevent_background_work(cache); 2596 2597 *result = cache; 2598 return 0; 2599 bad: 2600 __destroy(cache); 2601 return r; 2602 } 2603 2604 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2605 { 2606 unsigned int i; 2607 const char **copy; 2608 2609 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2610 if (!copy) 2611 return -ENOMEM; 2612 for (i = 0; i < argc; i++) { 2613 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2614 if (!copy[i]) { 2615 while (i--) 2616 kfree(copy[i]); 2617 kfree(copy); 2618 return -ENOMEM; 2619 } 2620 } 2621 2622 cache->nr_ctr_args = argc; 2623 cache->ctr_args = copy; 2624 2625 return 0; 2626 } 2627 2628 static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2629 { 2630 int r = 
-EINVAL; 2631 struct cache_args *ca; 2632 struct cache *cache = NULL; 2633 2634 ca = kzalloc_obj(*ca); 2635 if (!ca) { 2636 ti->error = "Error allocating memory for cache"; 2637 return -ENOMEM; 2638 } 2639 ca->ti = ti; 2640 2641 r = parse_cache_args(ca, argc, argv, &ti->error); 2642 if (r) 2643 goto out; 2644 2645 r = cache_create(ca, &cache); 2646 if (r) 2647 goto out; 2648 2649 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2650 if (r) { 2651 __destroy(cache); 2652 goto out; 2653 } 2654 2655 ti->private = cache; 2656 out: 2657 destroy_cache_args(ca); 2658 return r; 2659 } 2660 2661 /*----------------------------------------------------------------*/ 2662 2663 static int cache_map(struct dm_target *ti, struct bio *bio) 2664 { 2665 struct cache *cache = ti->private; 2666 2667 int r; 2668 bool commit_needed; 2669 dm_oblock_t block = get_bio_block(cache, bio); 2670 2671 init_per_bio_data(bio); 2672 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2673 /* 2674 * This can only occur if the io goes to a partial block at 2675 * the end of the origin device. We don't cache these. 2676 * Just remap to the origin and carry on. 
2677 */ 2678 remap_to_origin(cache, bio); 2679 accounted_begin(cache, bio); 2680 return DM_MAPIO_REMAPPED; 2681 } 2682 2683 if (discard_or_flush(bio)) { 2684 defer_bio(cache, bio); 2685 return DM_MAPIO_SUBMITTED; 2686 } 2687 2688 r = map_bio(cache, bio, block, &commit_needed); 2689 if (commit_needed) 2690 schedule_commit(&cache->committer); 2691 2692 return r; 2693 } 2694 2695 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2696 { 2697 struct cache *cache = ti->private; 2698 unsigned long flags; 2699 struct per_bio_data *pb = get_per_bio_data(bio); 2700 2701 if (pb->tick) { 2702 policy_tick(cache->policy, false); 2703 2704 spin_lock_irqsave(&cache->lock, flags); 2705 cache->need_tick_bio = true; 2706 spin_unlock_irqrestore(&cache->lock, flags); 2707 } 2708 2709 bio_drop_shared_lock(cache, bio); 2710 accounted_complete(cache, bio); 2711 2712 return DM_ENDIO_DONE; 2713 } 2714 2715 static int write_dirty_bitset(struct cache *cache) 2716 { 2717 int r; 2718 2719 if (get_cache_mode(cache) >= CM_READ_ONLY) 2720 return -EINVAL; 2721 2722 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2723 if (r) 2724 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2725 2726 return r; 2727 } 2728 2729 static int write_discard_bitset(struct cache *cache) 2730 { 2731 unsigned int i, r; 2732 2733 if (get_cache_mode(cache) >= CM_READ_ONLY) 2734 return -EINVAL; 2735 2736 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2737 cache->discard_nr_blocks); 2738 if (r) { 2739 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2740 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2741 return r; 2742 } 2743 2744 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2745 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2746 is_discarded(cache, to_dblock(i))); 2747 if (r) { 2748 metadata_operation_failed(cache, 
"dm_cache_set_discard", r); 2749 return r; 2750 } 2751 } 2752 2753 return 0; 2754 } 2755 2756 static int write_hints(struct cache *cache) 2757 { 2758 int r; 2759 2760 if (get_cache_mode(cache) >= CM_READ_ONLY) 2761 return -EINVAL; 2762 2763 r = dm_cache_write_hints(cache->cmd, cache->policy); 2764 if (r) { 2765 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2766 return r; 2767 } 2768 2769 return 0; 2770 } 2771 2772 /* 2773 * returns true on success 2774 */ 2775 static bool sync_metadata(struct cache *cache) 2776 { 2777 int r1, r2, r3, r4; 2778 2779 r1 = write_dirty_bitset(cache); 2780 if (r1) 2781 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2782 2783 r2 = write_discard_bitset(cache); 2784 if (r2) 2785 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2786 2787 save_stats(cache); 2788 2789 r3 = write_hints(cache); 2790 if (r3) 2791 DMERR("%s: could not write hints", cache_device_name(cache)); 2792 2793 /* 2794 * If writing the above metadata failed, we still commit, but don't 2795 * set the clean shutdown flag. This will effectively force every 2796 * dirty bit to be set on reload. 2797 */ 2798 r4 = commit(cache, !r1 && !r2 && !r3); 2799 if (r4) 2800 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2801 2802 return !r1 && !r2 && !r3 && !r4; 2803 } 2804 2805 static void cache_postsuspend(struct dm_target *ti) 2806 { 2807 struct cache *cache = ti->private; 2808 2809 prevent_background_work(cache); 2810 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2811 2812 cancel_delayed_work_sync(&cache->waker); 2813 drain_workqueue(cache->wq); 2814 WARN_ON(cache->tracker.in_flight); 2815 2816 /* 2817 * If it's a flush suspend there won't be any deferred bios, so this 2818 * call is harmless. 
2819 */ 2820 requeue_deferred_bios(cache); 2821 2822 if (get_cache_mode(cache) == CM_WRITE) 2823 (void) sync_metadata(cache); 2824 } 2825 2826 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2827 bool dirty, uint32_t hint, bool hint_valid) 2828 { 2829 struct cache *cache = context; 2830 2831 if (dirty) { 2832 if (passthrough_mode(cache)) { 2833 DMERR("%s: cannot enter passthrough mode unless all blocks are clean", 2834 cache_device_name(cache)); 2835 return -EBUSY; 2836 } 2837 2838 set_bit(from_cblock(cblock), cache->dirty_bitset); 2839 atomic_inc(&cache->nr_dirty); 2840 } else 2841 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2842 2843 return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2844 } 2845 2846 static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2847 bool dirty, uint32_t hint, bool hint_valid) 2848 { 2849 struct cache *cache = context; 2850 2851 if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { 2852 if (dirty) { 2853 DMERR("%s: unable to shrink origin; cache block %u is dirty", 2854 cache_device_name(cache), from_cblock(cblock)); 2855 return -EFBIG; 2856 } 2857 set_bit(from_cblock(cblock), cache->invalid_bitset); 2858 return 0; 2859 } 2860 2861 return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); 2862 } 2863 2864 /* 2865 * The discard block size in the on disk metadata is not 2866 * necessarily the same as we're currently using. So we have to 2867 * be careful to only set the discarded attribute if we know it 2868 * covers a complete block of the new size. 2869 */ 2870 struct discard_load_info { 2871 struct cache *cache; 2872 2873 /* 2874 * These blocks are sized using the on disk dblock size, rather 2875 * than the current one. 
2876 */ 2877 dm_block_t block_size; 2878 dm_block_t discard_begin, discard_end; 2879 }; 2880 2881 static void discard_load_info_init(struct cache *cache, 2882 struct discard_load_info *li) 2883 { 2884 li->cache = cache; 2885 li->discard_begin = li->discard_end = 0; 2886 } 2887 2888 static void set_discard_range(struct discard_load_info *li) 2889 { 2890 sector_t b, e; 2891 2892 if (li->discard_begin == li->discard_end) 2893 return; 2894 2895 /* 2896 * Convert to sectors. 2897 */ 2898 b = li->discard_begin * li->block_size; 2899 e = li->discard_end * li->block_size; 2900 2901 /* 2902 * Then convert back to the current dblock size. 2903 */ 2904 b = dm_sector_div_up(b, li->cache->discard_block_size); 2905 sector_div(e, li->cache->discard_block_size); 2906 2907 /* 2908 * The origin may have shrunk, so we need to check we're still in 2909 * bounds. 2910 */ 2911 if (e > from_dblock(li->cache->discard_nr_blocks)) 2912 e = from_dblock(li->cache->discard_nr_blocks); 2913 2914 for (; b < e; b++) 2915 set_discard(li->cache, to_dblock(b)); 2916 } 2917 2918 static int load_discard(void *context, sector_t discard_block_size, 2919 dm_dblock_t dblock, bool discard) 2920 { 2921 struct discard_load_info *li = context; 2922 2923 li->block_size = discard_block_size; 2924 2925 if (discard) { 2926 if (from_dblock(dblock) == li->discard_end) 2927 /* 2928 * We're already in a discard range, just extend it. 2929 */ 2930 li->discard_end = li->discard_end + 1ULL; 2931 2932 else { 2933 /* 2934 * Emit the old range and start a new one. 
2935 */ 2936 set_discard_range(li); 2937 li->discard_begin = from_dblock(dblock); 2938 li->discard_end = li->discard_begin + 1ULL; 2939 } 2940 } else { 2941 set_discard_range(li); 2942 li->discard_begin = li->discard_end = 0; 2943 } 2944 2945 return 0; 2946 } 2947 2948 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2949 { 2950 sector_t size = get_dev_size(cache->cache_dev); 2951 (void) sector_div(size, cache->sectors_per_block); 2952 return to_cblock(size); 2953 } 2954 2955 static bool can_resume(struct cache *cache) 2956 { 2957 bool clean_when_opened; 2958 int r; 2959 2960 /* 2961 * Disallow retrying the resume operation for devices that failed the 2962 * first resume attempt, as the failure leaves the policy object partially 2963 * initialized. Retrying could trigger BUG_ON when loading cache mappings 2964 * into the incomplete policy object. 2965 */ 2966 if (cache->sized && !cache->loaded_mappings) { 2967 if (get_cache_mode(cache) != CM_WRITE) 2968 DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", 2969 cache_device_name(cache)); 2970 else 2971 DMERR("%s: unable to resume cache due to missing proper cache table reload", 2972 cache_device_name(cache)); 2973 return false; 2974 } 2975 2976 if (passthrough_mode(cache)) { 2977 r = dm_cache_metadata_clean_when_opened(cache->cmd, &clean_when_opened); 2978 if (r) { 2979 DMERR("%s: failed to query metadata flags", cache_device_name(cache)); 2980 return false; 2981 } 2982 2983 if (!clean_when_opened) { 2984 DMERR("%s: unable to resume into passthrough mode after unclean shutdown", 2985 cache_device_name(cache)); 2986 return false; 2987 } 2988 } 2989 2990 return true; 2991 } 2992 2993 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2994 { 2995 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2996 DMERR("%s: unable to extend cache due to missing cache table reload", 2997 cache_device_name(cache)); 2998 return false; 2999 } 3000 3001 /* 3002 * We can't drop 
a dirty block when shrinking the cache. 3003 */ 3004 if (cache->loaded_mappings) { 3005 new_size = to_cblock(find_next_bit(cache->dirty_bitset, 3006 from_cblock(cache->cache_size), 3007 from_cblock(new_size))); 3008 if (new_size != cache->cache_size) { 3009 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3010 cache_device_name(cache), 3011 (unsigned long long) from_cblock(new_size)); 3012 return false; 3013 } 3014 } 3015 3016 return true; 3017 } 3018 3019 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3020 { 3021 int r; 3022 3023 r = dm_cache_resize(cache->cmd, new_size); 3024 if (r) { 3025 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3026 metadata_operation_failed(cache, "dm_cache_resize", r); 3027 return r; 3028 } 3029 3030 set_cache_size(cache, new_size); 3031 3032 return 0; 3033 } 3034 3035 static int truncate_oblocks(struct cache *cache) 3036 { 3037 uint32_t nr_blocks = from_cblock(cache->cache_size); 3038 uint32_t i; 3039 int r; 3040 3041 for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { 3042 r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); 3043 if (r) { 3044 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 3045 cache_device_name(cache)); 3046 return r; 3047 } 3048 } 3049 3050 return 0; 3051 } 3052 3053 static int cache_preresume(struct dm_target *ti) 3054 { 3055 int r = 0; 3056 struct cache *cache = ti->private; 3057 dm_cblock_t csize = get_cache_dev_size(cache); 3058 3059 if (!can_resume(cache)) 3060 return -EINVAL; 3061 3062 /* 3063 * Check to see if the cache has resized. 3064 */ 3065 if (!cache->sized || csize != cache->cache_size) { 3066 if (!can_resize(cache, csize)) 3067 return -EINVAL; 3068 3069 r = resize_cache_dev(cache, csize); 3070 if (r) 3071 return r; 3072 3073 cache->sized = true; 3074 } 3075 3076 if (!cache->loaded_mappings) { 3077 /* 3078 * The fast device could have been resized since the last 3079 * failed preresume attempt. 
To be safe we start by a blank 3080 * bitset for cache blocks. 3081 */ 3082 clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); 3083 3084 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3085 load_filtered_mapping, cache); 3086 if (r) { 3087 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3088 if (r != -EFBIG && r != -EBUSY) 3089 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3090 return r; 3091 } 3092 3093 r = truncate_oblocks(cache); 3094 if (r) { 3095 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 3096 return r; 3097 } 3098 3099 cache->loaded_mappings = true; 3100 } 3101 3102 if (!cache->loaded_discards) { 3103 struct discard_load_info li; 3104 3105 /* 3106 * The discard bitset could have been resized, or the 3107 * discard block size changed. To be safe we start by 3108 * setting every dblock to not discarded. 3109 */ 3110 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3111 3112 discard_load_info_init(cache, &li); 3113 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3114 if (r) { 3115 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3116 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3117 return r; 3118 } 3119 set_discard_range(&li); 3120 3121 cache->loaded_discards = true; 3122 } 3123 3124 return r; 3125 } 3126 3127 static void cache_resume(struct dm_target *ti) 3128 { 3129 struct cache *cache = ti->private; 3130 3131 cache->need_tick_bio = true; 3132 allow_background_work(cache); 3133 do_waker(&cache->waker.work); 3134 } 3135 3136 static void emit_flags(struct cache *cache, char *result, 3137 unsigned int maxlen, ssize_t *sz_ptr) 3138 { 3139 ssize_t sz = *sz_ptr; 3140 struct cache_features *cf = &cache->features; 3141 unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3142 3143 DMEMIT("%u ", count); 3144 3145 if (cf->metadata_version == 2) 3146 DMEMIT("metadata2 "); 3147 3148 if 
(writethrough_mode(cache)) 3149 DMEMIT("writethrough "); 3150 3151 else if (passthrough_mode(cache)) 3152 DMEMIT("passthrough "); 3153 3154 else if (writeback_mode(cache)) 3155 DMEMIT("writeback "); 3156 3157 else { 3158 DMEMIT("unknown "); 3159 DMERR("%s: internal error: unknown io mode: %d", 3160 cache_device_name(cache), (int) cf->io_mode); 3161 } 3162 3163 if (!cf->discard_passdown) 3164 DMEMIT("no_discard_passdown "); 3165 3166 *sz_ptr = sz; 3167 } 3168 3169 /* 3170 * Status format: 3171 * 3172 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3173 * <cache block size> <#used cache blocks>/<#total cache blocks> 3174 * <#read hits> <#read misses> <#write hits> <#write misses> 3175 * <#demotions> <#promotions> <#dirty> 3176 * <#features> <features>* 3177 * <#core args> <core args> 3178 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3179 */ 3180 static void cache_status(struct dm_target *ti, status_type_t type, 3181 unsigned int status_flags, char *result, unsigned int maxlen) 3182 { 3183 int r = 0; 3184 unsigned int i; 3185 ssize_t sz = 0; 3186 dm_block_t nr_free_blocks_metadata = 0; 3187 dm_block_t nr_blocks_metadata = 0; 3188 char buf[BDEVNAME_SIZE]; 3189 struct cache *cache = ti->private; 3190 dm_cblock_t residency; 3191 bool needs_check; 3192 3193 switch (type) { 3194 case STATUSTYPE_INFO: 3195 if (get_cache_mode(cache) == CM_FAIL) { 3196 DMEMIT("Fail"); 3197 break; 3198 } 3199 3200 /* Commit to ensure statistics aren't out-of-date */ 3201 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3202 (void) commit(cache, false); 3203 3204 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3205 if (r) { 3206 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3207 cache_device_name(cache), r); 3208 goto err; 3209 } 3210 3211 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3212 if (r) { 3213 DMERR("%s: 
dm_cache_get_metadata_dev_size returned %d", 3214 cache_device_name(cache), r); 3215 goto err; 3216 } 3217 3218 residency = policy_residency(cache->policy); 3219 3220 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3221 (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE, 3222 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3223 (unsigned long long)nr_blocks_metadata, 3224 (unsigned long long)cache->sectors_per_block, 3225 (unsigned long long) from_cblock(residency), 3226 (unsigned long long) from_cblock(cache->cache_size), 3227 (unsigned int) atomic_read(&cache->stats.read_hit), 3228 (unsigned int) atomic_read(&cache->stats.read_miss), 3229 (unsigned int) atomic_read(&cache->stats.write_hit), 3230 (unsigned int) atomic_read(&cache->stats.write_miss), 3231 (unsigned int) atomic_read(&cache->stats.demotion), 3232 (unsigned int) atomic_read(&cache->stats.promotion), 3233 (unsigned long) atomic_read(&cache->nr_dirty)); 3234 3235 emit_flags(cache, result, maxlen, &sz); 3236 3237 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3238 3239 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3240 if (sz < maxlen) { 3241 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3242 if (r) 3243 DMERR("%s: policy_emit_config_values returned %d", 3244 cache_device_name(cache), r); 3245 } 3246 3247 if (get_cache_mode(cache) == CM_READ_ONLY) 3248 DMEMIT("ro "); 3249 else 3250 DMEMIT("rw "); 3251 3252 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3253 3254 if (r || needs_check) 3255 DMEMIT("needs_check "); 3256 else 3257 DMEMIT("- "); 3258 3259 break; 3260 3261 case STATUSTYPE_TABLE: 3262 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3263 DMEMIT("%s ", buf); 3264 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3265 DMEMIT("%s ", buf); 3266 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3267 DMEMIT("%s", buf); 3268 3269 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3270 
DMEMIT(" %s", cache->ctr_args[i]); 3271 if (cache->nr_ctr_args) 3272 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3273 break; 3274 3275 case STATUSTYPE_IMA: 3276 DMEMIT_TARGET_NAME_VERSION(ti->type); 3277 if (get_cache_mode(cache) == CM_FAIL) 3278 DMEMIT(",metadata_mode=fail"); 3279 else if (get_cache_mode(cache) == CM_READ_ONLY) 3280 DMEMIT(",metadata_mode=ro"); 3281 else 3282 DMEMIT(",metadata_mode=rw"); 3283 3284 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3285 DMEMIT(",cache_metadata_device=%s", buf); 3286 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3287 DMEMIT(",cache_device=%s", buf); 3288 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3289 DMEMIT(",cache_origin_device=%s", buf); 3290 DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n'); 3291 DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n'); 3292 DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n'); 3293 DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n'); 3294 DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y'); 3295 DMEMIT(";"); 3296 break; 3297 } 3298 3299 return; 3300 3301 err: 3302 DMEMIT("Error"); 3303 } 3304 3305 /* 3306 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3307 * the one-past-the-end value. 3308 */ 3309 struct cblock_range { 3310 dm_cblock_t begin; 3311 dm_cblock_t end; 3312 }; 3313 3314 /* 3315 * A cache block range can take two forms: 3316 * 3317 * i) A single cblock, eg. '3456' 3318 * ii) A begin and end cblock with a dash between, eg. 123-234 3319 */ 3320 static int parse_cblock_range(struct cache *cache, const char *str, 3321 struct cblock_range *result) 3322 { 3323 char dummy; 3324 uint64_t b, e; 3325 int r; 3326 3327 /* 3328 * Try and parse form (ii) first. 
3329 */ 3330 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3331 3332 if (r == 2) { 3333 result->begin = to_cblock(b); 3334 result->end = to_cblock(e); 3335 return 0; 3336 } 3337 3338 /* 3339 * That didn't work, try form (i). 3340 */ 3341 r = sscanf(str, "%llu%c", &b, &dummy); 3342 3343 if (r == 1) { 3344 result->begin = to_cblock(b); 3345 result->end = to_cblock(from_cblock(result->begin) + 1u); 3346 return 0; 3347 } 3348 3349 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3350 return -EINVAL; 3351 } 3352 3353 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3354 { 3355 uint64_t b = from_cblock(range->begin); 3356 uint64_t e = from_cblock(range->end); 3357 uint64_t n = from_cblock(cache->cache_size); 3358 3359 if (b >= n) { 3360 DMERR("%s: begin cblock out of range: %llu >= %llu", 3361 cache_device_name(cache), b, n); 3362 return -EINVAL; 3363 } 3364 3365 if (e > n) { 3366 DMERR("%s: end cblock out of range: %llu > %llu", 3367 cache_device_name(cache), e, n); 3368 return -EINVAL; 3369 } 3370 3371 if (b >= e) { 3372 DMERR("%s: invalid cblock range: %llu >= %llu", 3373 cache_device_name(cache), b, e); 3374 return -EINVAL; 3375 } 3376 3377 return 0; 3378 } 3379 3380 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3381 { 3382 return to_cblock(from_cblock(b) + 1); 3383 } 3384 3385 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3386 { 3387 int r = 0; 3388 3389 /* 3390 * We don't need to do any locking here because we know we're in 3391 * passthrough mode. There's is potential for a race between an 3392 * invalidation triggered by an io and an invalidation message. This 3393 * is harmless, we must not worry if the policy call fails. 
3394 */ 3395 while (range->begin != range->end) { 3396 r = invalidate_cblock(cache, range->begin); 3397 if (r) 3398 return r; 3399 3400 range->begin = cblock_succ(range->begin); 3401 } 3402 3403 cache->commit_requested = true; 3404 return r; 3405 } 3406 3407 static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count, 3408 const char **cblock_ranges) 3409 { 3410 int r = 0; 3411 unsigned int i; 3412 struct cblock_range range; 3413 3414 if (!passthrough_mode(cache)) { 3415 DMERR("%s: cache has to be in passthrough mode for invalidation", 3416 cache_device_name(cache)); 3417 return -EPERM; 3418 } 3419 3420 for (i = 0; i < count; i++) { 3421 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3422 if (r) 3423 break; 3424 3425 r = validate_cblock_range(cache, &range); 3426 if (r) 3427 break; 3428 3429 /* 3430 * Pass begin and end origin blocks to the worker and wake it. 3431 */ 3432 r = request_invalidation(cache, &range); 3433 if (r) 3434 break; 3435 } 3436 3437 return r; 3438 } 3439 3440 /* 3441 * Supports 3442 * "<key> <value>" 3443 * and 3444 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3445 * 3446 * The key migration_threshold is supported by the cache target core. 
3447 */ 3448 static int cache_message(struct dm_target *ti, unsigned int argc, char **argv, 3449 char *result, unsigned int maxlen) 3450 { 3451 struct cache *cache = ti->private; 3452 3453 if (!argc) 3454 return -EINVAL; 3455 3456 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3457 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3458 cache_device_name(cache)); 3459 return -EOPNOTSUPP; 3460 } 3461 3462 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3463 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3464 3465 if (argc != 2) 3466 return -EINVAL; 3467 3468 return set_config_value(cache, argv[0], argv[1]); 3469 } 3470 3471 static int cache_iterate_devices(struct dm_target *ti, 3472 iterate_devices_callout_fn fn, void *data) 3473 { 3474 int r = 0; 3475 struct cache *cache = ti->private; 3476 3477 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3478 if (!r) 3479 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3480 3481 return r; 3482 } 3483 3484 /* 3485 * If discard_passdown was enabled verify that the origin device 3486 * supports discards. Disable discard_passdown if not. 
3487 */ 3488 static void disable_passdown_if_not_supported(struct cache *cache) 3489 { 3490 struct block_device *origin_bdev = cache->origin_dev->bdev; 3491 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3492 const char *reason = NULL; 3493 3494 if (!cache->features.discard_passdown) 3495 return; 3496 3497 if (!bdev_max_discard_sectors(origin_bdev)) 3498 reason = "discard unsupported"; 3499 3500 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3501 reason = "max discard sectors smaller than a block"; 3502 3503 if (reason) { 3504 DMWARN("Origin device (%pg) %s: Disabling discard passdown.", 3505 origin_bdev, reason); 3506 cache->features.discard_passdown = false; 3507 } 3508 } 3509 3510 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3511 { 3512 struct block_device *origin_bdev = cache->origin_dev->bdev; 3513 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3514 3515 if (!cache->features.discard_passdown) { 3516 /* No passdown is done so setting own virtual limits */ 3517 limits->max_hw_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3518 cache->origin_sectors); 3519 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3520 return; 3521 } 3522 3523 /* 3524 * cache_iterate_devices() is stacking both origin and fast device limits 3525 * but discards aren't passed to fast device, so inherit origin's limits. 
3526 */ 3527 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3528 limits->discard_granularity = origin_limits->discard_granularity; 3529 limits->discard_alignment = origin_limits->discard_alignment; 3530 } 3531 3532 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3533 { 3534 struct cache *cache = ti->private; 3535 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3536 3537 /* 3538 * If the system-determined stacked limits are compatible with the 3539 * cache's blocksize (io_opt is a factor) do not override them. 3540 */ 3541 if (io_opt_sectors < cache->sectors_per_block || 3542 do_div(io_opt_sectors, cache->sectors_per_block)) { 3543 limits->io_min = cache->sectors_per_block << SECTOR_SHIFT; 3544 limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT; 3545 } 3546 3547 disable_passdown_if_not_supported(cache); 3548 set_discard_limits(cache, limits); 3549 } 3550 3551 /*----------------------------------------------------------------*/ 3552 3553 static struct target_type cache_target = { 3554 .name = "cache", 3555 .version = {2, 4, 0}, 3556 .module = THIS_MODULE, 3557 .ctr = cache_ctr, 3558 .dtr = cache_dtr, 3559 .map = cache_map, 3560 .end_io = cache_end_io, 3561 .postsuspend = cache_postsuspend, 3562 .preresume = cache_preresume, 3563 .resume = cache_resume, 3564 .status = cache_status, 3565 .message = cache_message, 3566 .iterate_devices = cache_iterate_devices, 3567 .io_hints = cache_io_hints, 3568 }; 3569 3570 static int __init dm_cache_init(void) 3571 { 3572 int r; 3573 3574 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3575 if (!migration_cache) { 3576 r = -ENOMEM; 3577 goto err; 3578 } 3579 3580 btracker_work_cache = kmem_cache_create("dm_cache_bt_work", 3581 sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL); 3582 if (!btracker_work_cache) { 3583 r = -ENOMEM; 3584 goto err; 3585 } 3586 3587 r = dm_register_target(&cache_target); 3588 if (r) { 3589 goto err; 3590 } 3591 3592 
return 0; 3593 3594 err: 3595 kmem_cache_destroy(migration_cache); 3596 kmem_cache_destroy(btracker_work_cache); 3597 return r; 3598 } 3599 3600 static void __exit dm_cache_exit(void) 3601 { 3602 dm_unregister_target(&cache_target); 3603 kmem_cache_destroy(migration_cache); 3604 kmem_cache_destroy(btracker_work_cache); 3605 } 3606 3607 module_init(dm_cache_init); 3608 module_exit(dm_cache_exit); 3609 3610 MODULE_DESCRIPTION(DM_NAME " cache target"); 3611 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3612 MODULE_LICENSE("GPL"); 3613