/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

#define IOT_RESOLUTION 4

struct io_tracker {
	spinlock_t lock;

	/*
	 * Sectors of in-flight IO.
	 */
	sector_t in_flight;

	/*
	 * The time, in jiffies, when this device became idle (if it is
	 * indeed idle).
	 */
	unsigned long idle_time;
	unsigned long last_update_time;
};

static void iot_init(struct io_tracker *iot)
{
	spin_lock_init(&iot->lock);
	iot->in_flight = 0ul;
	iot->idle_time = 0ul;
	iot->last_update_time = jiffies;
}

static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	if (iot->in_flight)
		return false;

	return time_after(jiffies, iot->idle_time + jifs);
}

static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	bool r;
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	r = __iot_idle_for(iot, jifs);
	spin_unlock_irqrestore(&iot->lock, flags);

	return r;
}

static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	iot->in_flight += len;
	spin_unlock_irqrestore(&iot->lock, flags);
}

static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
	iot->in_flight -= len;
	if (!iot->in_flight)
		iot->idle_time = jiffies;
}

static void iot_io_end(struct io_tracker *iot, sector_t len)
{
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	__iot_io_end(iot, len);
	spin_unlock_irqrestore(&iot->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function. We do this by temporarily
 * changing the endio fn.
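 * The saved endio function is restored later via dm_unhook_bio().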
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only. These blocks are marked
	 * dirty. If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin. Blocks are never
	 * dirty. Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots). Reads and writes always go to the
	 * origin. If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

/*
 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
 * the one-past-the-end value.
 */
struct cblock_range {
	dm_cblock_t begin;
	dm_cblock_t end;
};

struct invalidation_request {
	struct list_head list;
	struct cblock_range *cblocks;

	atomic_t complete;
	int err;

	wait_queue_head_t result_wait;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices. Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices. Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_writethrough_bios;
	struct list_head quiesced_migrations;
	struct list_head completed_migrations;
	struct list_head need_commit_migrations;
	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	wait_queue_head_t quiescing_wait;
	atomic_t quiescing;
	atomic_t quiescing_ack;

	/*
	 * cache_size entries, dirty if set
	 */
	atomic_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct worker;

	struct delayed_work waker;
	unsigned long last_commit_jiffies;

	struct dm_bio_prison *prison;
	struct dm_deferred_set *all_io_ds;

	mempool_t *migration_pool;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool invalidate:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	struct cache_stats stats;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	struct io_tracker origin_tracker;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_deferred_entry *all_io_entry;
	struct dm_hook_info hook_info;
	sector_t len;

	/*
	 * writethrough fields. These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offset of the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	struct dm_bio_details bio_details;
};

struct dm_cache_migration {
	struct list_head list;
	struct cache *cache;

	unsigned long start_jiffies;
	dm_oblock_t old_oblock;
	dm_oblock_t new_oblock;
	dm_cblock_t cblock;

	bool err:1;
	bool discard:1;
	bool writeback:1;
	bool demote:1;
	bool promote:1;
	bool requeue_holder:1;
	bool invalidate:1;

	struct dm_bio_prison_cell *old_ocell;
	struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations. We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 */
struct prealloc {
	struct dm_cache_migration *mg;
	struct dm_bio_prison_cell *cell1;
	struct dm_bio_prison_cell *cell2;
};

static enum cache_metadata_mode get_cache_mode(struct cache *cache);

static void wake_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
{
	/* FIXME: change to use a local slab. */
	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	dm_bio_prison_free_cell(cache->prison, cell);
}

static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
	struct dm_cache_migration *mg;

	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
	if (mg) {
		mg->cache = cache;
		atomic_inc(&mg->cache->nr_allocated_migrations);
	}

	return mg;
}

static void free_migration(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
		wake_up(&cache->migration_wait);

	mempool_free(mg, cache->migration_pool);
}

static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
	if (!p->mg) {
		p->mg = alloc_migration(cache);
		if (!p->mg)
			return -ENOMEM;
	}

	if (!p->cell1) {
		p->cell1 = alloc_prison_cell(cache);
		if (!p->cell1)
			return -ENOMEM;
	}

	if (!p->cell2) {
		p->cell2 = alloc_prison_cell(cache);
		if (!p->cell2)
			return -ENOMEM;
	}

	return 0;
}

static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
{
	if (p->cell2)
		free_prison_cell(cache, p->cell2);

	if (p->cell1)
		free_prison_cell(cache, p->cell1);

	if (p->mg)
		free_migration(p->mg);
}

static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
{
	struct dm_cache_migration *mg = p->mg;

	BUG_ON(!mg);
	p->mg = NULL;

	return mg;
}

/*
 * You must have a cell within the prealloc struct to return. If not this
 * function will BUG() rather than returning NULL.
 */
static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
{
	struct dm_bio_prison_cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;

	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else
		BUG();

	return r;
}

/*
 * You can't have more than two cells in a prealloc struct. BUG() will be
 * called if you try and overfill.
 */
static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
{
	if (!p->cell2)
		p->cell2 = cell;

	else if (!p->cell1)
		p->cell1 = cell;

	else
		BUG();
}

/*----------------------------------------------------------------*/

static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block_begin = from_oblock(begin);
	key->block_end = from_oblock(end);
}

/*
 * The caller hands in a preallocated cell, and a free function for it.
 * The cell will be freed if there's an error, or if it wasn't used because
 * a cell with that key already exists.
 */
typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);

static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
			    cell_free_fn free_fn, void *free_context,
			    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;

	build_key(oblock_begin, oblock_end, &key);
	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
	if (r)
		free_fn(free_context, cell_prealloc);

	return r;
}

static int bio_detain(struct cache *cache, dm_oblock_t oblock,
		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
		      cell_free_fn free_fn, void *free_context,
		      struct dm_bio_prison_cell **cell_result)
{
	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
	return bio_detain_range(cache, oblock, end, bio,
				cell_prealloc, free_fn, free_context, cell_result);
}

static int get_cell(struct cache *cache,
		    dm_oblock_t oblock,
		    struct prealloc *structs,
		    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;
	struct dm_bio_prison_cell *cell_prealloc;

	cell_prealloc = prealloc_get_cell(structs);

	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
	if (r)
		prealloc_put_cell(structs, cell_prealloc);

	return r;
}

/*----------------------------------------------------------------*/

static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
	return test_bit(from_cblock(b), cache->dirty_bitset);
}

static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
		atomic_inc(&cache->nr_dirty);
		policy_set_dirty(cache->policy, oblock);
	}
}

static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
		policy_clear_dirty(cache->policy, oblock);
		if (atomic_dec_return(&cache->nr_dirty) == 0)
			dm_table_event(cache->ti->table);
	}
}

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct cache *cache)
{
	return cache->sectors_per_block_shift >= 0;
}

/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
__always_inline
#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
	do_div(b, n);

	return b;
}

static dm_block_t oblocks_per_dblock(struct cache *cache)
{
	dm_block_t oblocks = cache->discard_block_size;

	if (block_size_is_power_of_two(cache))
		oblocks >>= cache->sectors_per_block_shift;
	else
		oblocks = block_div(oblocks, cache->sectors_per_block);

	return oblocks;
}

static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
	return to_dblock(block_div(from_oblock(oblock),
				   oblocks_per_dblock(cache)));
}

static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
{
	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
}

static void set_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
	atomic_inc(&cache->stats.discard_count);

	spin_lock_irqsave(&cache->lock, flags);
	set_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void clear_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	clear_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static bool writethrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITETHROUGH;
}

static bool writeback_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITEBACK;
}

static bool passthrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_PASSTHROUGH;
}

static size_t get_per_bio_data_size(struct cache *cache)
{
	return writethrough_mode(&cache->features) ?
		PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}

static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = get_per_bio_data(bio, data_size);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->all_io_entry = NULL;
	pb->len = 0;

	return pb;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio->bi_bdev = cache->origin_dev->bdev;
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_iter.bi_sector;
	sector_t block = from_cblock(cblock);

	bio->bi_bdev = cache->cache_dev->bdev;
	if (!block_size_is_power_of_two(cache))
		bio->bi_iter.bi_sector =
			(block * cache->sectors_per_block) +
			sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_iter.bi_sector =
			(block << cache->sectors_per_block_shift) |
			(bi_sector & (cache->sectors_per_block - 1));
}

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	spin_lock_irqsave(&cache->lock, flags);
	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
	    bio_op(bio) != REQ_OP_DISCARD) {
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}

/*
 * You must increment the deferred set whilst the prison cell is held. To
 * encourage this, we ask for 'cell' to be passed in.
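 * inc_ds() will BUG() if it is called twice for the same bio.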
 */
static void inc_ds(struct cache *cache, struct bio *bio,
		   struct dm_bio_prison_cell *cell)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(!cell);
	BUG_ON(pb->all_io_entry);

	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}

static bool accountable_bio(struct cache *cache, struct bio *bio)
{
	return ((bio->bi_bdev == cache->origin_dev->bdev) &&
		bio_op(bio) != REQ_OP_DISCARD);
}

static void accounted_begin(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (accountable_bio(cache, bio)) {
		pb->len = bio_sectors(bio);
		iot_io_begin(&cache->origin_tracker, pb->len);
	}
}

static void accounted_complete(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	iot_io_end(&cache->origin_tracker, pb->len);
}

static void accounted_request(struct cache *cache, struct bio *bio)
{
	accounted_begin(cache, bio);
	generic_make_request(bio);
}

static void issue(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	if (!op_is_flush(bio->bi_opf)) {
		accounted_request(cache, bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in do_worker().
	 */
	spin_lock_irqsave(&cache->lock, flags);
	cache->commit_requested = true;
	bio_list_add(&cache->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
{
	inc_ds(cache, bio, cell);
	issue(cache, bio);
}

static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_writethrough_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void writethrough_endio(struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	dm_unhook_bio(&pb->hook_info, bio);

	if (bio->bi_error) {
		bio_endio(bio);
		return;
	}

	dm_bio_restore(&pb->bio_details, bio);
	remap_to_cache(pb->cache, bio, pb->cblock);

	/*
	 * We can't issue this bio directly, since we're in interrupt
	 * context. So it gets put on a bio list for processing by the
	 * worker thread.
	 */
	defer_writethrough_bio(pb->cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices. In future we'd like to clone the
 * bio and send them in parallel, but for now we're doing them in
 * series as this is easier.
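 * The write goes to the origin first; writethrough_endio() then remaps it
 * to the cache and hands it to the worker thread to be reissued.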
 */
static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
				       dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	pb->cache = cache;
	pb->cblock = cblock;
	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
	dm_bio_record(&pb->bio_details, bio);

	remap_to_origin_clear_discard(pb->cache, bio, oblock);
}

/*----------------------------------------------------------------
 * Failure modes
 *--------------------------------------------------------------*/
static enum cache_metadata_mode get_cache_mode(struct cache *cache)
{
	return cache->features.mode;
}

static const char *cache_device_name(struct cache *cache)
{
	return dm_device_name(dm_table_get_md(cache->ti->table));
}

static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
{
	const char *descs[] = {
		"write",
		"read-only",
		"fail"
	};

	dm_table_event(cache->ti->table);
	DMINFO("%s: switching cache to %s mode",
	       cache_device_name(cache), descs[(int)mode]);
}

static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
{
	bool needs_check;
	enum cache_metadata_mode old_mode = get_cache_mode(cache);

	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
		DMERR("%s: unable to read needs_check flag, setting failure mode.",
		      cache_device_name(cache));
		new_mode = CM_FAIL;
	}

	if (new_mode == CM_WRITE && needs_check) {
		DMERR("%s: unable to switch cache to write mode until repaired.",
		      cache_device_name(cache));
		if (old_mode != new_mode)
			new_mode = old_mode;
		else
			new_mode = CM_READ_ONLY;
	}

	/* Never move out of fail mode */
	if (old_mode == CM_FAIL)
		new_mode = CM_FAIL;

	switch (new_mode) {
	case CM_FAIL:
	case CM_READ_ONLY:
		dm_cache_metadata_set_read_only(cache->cmd);
		break;

	case CM_WRITE:
		dm_cache_metadata_set_read_write(cache->cmd);
		break;
	}

	cache->features.mode = new_mode;

	if (new_mode != old_mode)
		notify_mode_switch(cache, new_mode);
}

static void abort_transaction(struct cache *cache)
{
	const char *dev_name = cache_device_name(cache);

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}

	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
	if (dm_cache_metadata_abort(cache->cmd)) {
		DMERR("%s: failed to abort metadata transaction", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}
}

static void metadata_operation_failed(struct cache *cache, const char *op, int r)
{
	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
		    cache_device_name(cache), op, r);
	abort_transaction(cache);
	set_cache_mode(cache, CM_READ_ONLY);
}

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
static void inc_io_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_io_migrations);
}

static void dec_io_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_io_migrations);
}

static bool discard_or_flush(struct bio *bio)
{
	return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
}

static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	if (discard_or_flush(cell->holder)) {
		/*
		 * We have to handle these bios individually.
		 */
		dm_cell_release(cache->prison, cell, &cache->deferred_bios);
		free_prison_cell(cache, cell);
	} else
		list_add_tail(&cell->user_list, &cache->deferred_cells);
}

static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
{
	unsigned long flags;

	if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
		/*
		 * There was no prisoner to promote to holder, the
		 * cell has been released.
		 */
		free_prison_cell(cache, cell);
		return;
	}

	spin_lock_irqsave(&cache->lock, flags);
	__cell_defer(cache, cell);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
{
	dm_cell_error(cache->prison, cell, err);
	free_prison_cell(cache, cell);
}

static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
}

static void free_io_migration(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	dec_io_migrations(cache);
	free_migration(mg);
	wake_worker(cache);
}

static void migration_failure(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;
	const char *dev_name = cache_device_name(cache);

	if (mg->writeback) {
		DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
		set_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);

	} else if (mg->demote) {
		DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);

		cell_defer(cache, mg->old_ocell, mg->promote ?
			   false : true);
		if (mg->promote)
			cell_defer(cache, mg->new_ocell, true);
	} else {
		DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
		policy_remove_mapping(cache->policy, mg->new_oblock);
		cell_defer(cache, mg->new_ocell, true);
	}

	free_io_migration(mg);
}

static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
	int r;
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		clear_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);
		free_io_migration(mg);
		return;

	} else if (mg->demote) {
		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
		if (r) {
			DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
			policy_force_mapping(cache->policy, mg->new_oblock,
					     mg->old_oblock);
			if (mg->promote)
				cell_defer(cache, mg->new_ocell, true);
			free_io_migration(mg);
			return;
		}
	} else {
		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
		if (r) {
			DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
			policy_remove_mapping(cache->policy, mg->new_oblock);
			free_io_migration(mg);
			return;
		}
	}

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->need_commit_migrations);
	cache->commit_requested = true;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void migration_success_post_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
			     cache_device_name(cache));
		return;

	} else if (mg->demote) {
		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);

		if (mg->promote) {
			mg->demote = false;

			spin_lock_irqsave(&cache->lock, flags);
			list_add_tail(&mg->list, &cache->quiesced_migrations);
			spin_unlock_irqrestore(&cache->lock, flags);

		} else {
			if (mg->invalidate)
				policy_remove_mapping(cache->policy, mg->old_oblock);
			free_io_migration(mg);
		}

	} else {
		if (mg->requeue_holder) {
			clear_dirty(cache, mg->new_oblock, mg->cblock);
			cell_defer(cache, mg->new_ocell, true);
		} else {
			/*
			 * The block was promoted via an overwrite, so it's dirty.
			 */
			set_dirty(cache, mg->new_oblock, mg->cblock);
			bio_endio(mg->new_ocell->holder);
			cell_defer(cache, mg->new_ocell, false);
		}
		free_io_migration(mg);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
	struct cache *cache = mg->cache;

	if (read_err || write_err)
		mg->err = true;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_copy(struct dm_cache_migration *mg)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;
	sector_t cblock = from_cblock(mg->cblock);

	o_region.bdev = cache->origin_dev->bdev;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = cblock * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (mg->writeback || mg->demote) {
		/* demote */
		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
	} else {
		/* promote */
		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
	}

	if (r < 0) {
		DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
		migration_failure(mg);
	}
}

static void overwrite_endio(struct bio *bio)
{
	struct dm_cache_migration *mg = bio->bi_private;
	struct cache *cache = mg->cache;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
	unsigned long flags;

	dm_unhook_bio(&pb->hook_info, bio);

	if (bio->bi_error)
		mg->err = true;

	mg->requeue_holder = false;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(mg->cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);

	/*
	 * No need to inc_ds() here, since the cell will be held for the
	 * duration of the io.
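	 * overwrite_endio() moves the migration onto the completed list
	 * once the write finishes.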
	 */
	accounted_request(mg->cache, bio);
}

static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}

static void avoid_copy(struct dm_cache_migration *mg)
{
	atomic_inc(&mg->cache->stats.copies_avoided);
	migration_success_pre_commit(mg);
}

static void calc_discard_block_range(struct cache *cache, struct bio *bio,
				     dm_dblock_t *b, dm_dblock_t *e)
{
	sector_t sb = bio->bi_iter.bi_sector;
	sector_t se = bio_end_sector(bio);

	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));

	if (se - sb < cache->discard_block_size)
		*e = *b;
	else
		*e = to_dblock(block_div(se, cache->discard_block_size));
}

static void issue_discard(struct dm_cache_migration *mg)
{
	dm_dblock_t b, e;
	struct bio *bio = mg->new_ocell->holder;
	struct cache *cache = mg->cache;

	calc_discard_block_range(cache, bio, &b, &e);
	while (b != e) {
		set_discard(cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	bio_endio(bio);
	cell_defer(cache, mg->new_ocell, false);
	free_migration(mg);
	wake_worker(cache);
}

static void issue_copy_or_discard(struct dm_cache_migration *mg)
{
	bool avoid;
	struct cache *cache = mg->cache;

	if (mg->discard) {
		issue_discard(mg);
		return;
	}

	if (mg->writeback || mg->demote)
		avoid = !is_dirty(cache, mg->cblock) ||
			is_discarded_oblock(cache, mg->old_oblock);
	else {
		struct bio *bio = mg->new_ocell->holder;

		avoid = is_discarded_oblock(cache, mg->new_oblock);

		if (writeback_mode(&cache->features) &&
		    !avoid && bio_writes_complete_block(cache, bio)) {
			issue_overwrite(mg, bio);
			return;
		}
	}

	avoid ?
		avoid_copy(mg) : issue_copy(mg);
}

static void complete_migration(struct dm_cache_migration *mg)
{
	if (mg->err)
		migration_failure(mg);
	else
		migration_success_pre_commit(mg);
}

static void process_migrations(struct cache *cache, struct list_head *head,
			       void (*fn)(struct dm_cache_migration *))
{
	unsigned long flags;
	struct list_head list;
	struct dm_cache_migration *mg, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(head, &list);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(mg, tmp, &list, list)
		fn(mg);
}

static void __queue_quiesced_migration(struct dm_cache_migration *mg)
{
	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
}

static void queue_quiesced_migration(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	spin_lock_irqsave(&cache->lock, flags);
	__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
{
	unsigned long flags;
	struct dm_cache_migration *mg, *tmp;

	spin_lock_irqsave(&cache->lock, flags);
	list_for_each_entry_safe(mg, tmp, work, list)
		__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void check_for_quiesced_migrations(struct cache *cache,
					  struct per_bio_data *pb)
{
	struct list_head work;

	if (!pb->all_io_entry)
		return;

	INIT_LIST_HEAD(&work);
	dm_deferred_entry_dec(pb->all_io_entry, &work);

	if (!list_empty(&work))
		queue_quiesced_migrations(cache, &work);
}

static void quiesce_migration(struct dm_cache_migration *mg)
{
	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
		queue_quiesced_migration(mg);
}

static void promote(struct cache *cache, struct prealloc *structs,
		    dm_oblock_t oblock, dm_cblock_t cblock,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->new_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void writeback(struct cache *cache, struct prealloc *structs,
		      dm_oblock_t oblock, dm_cblock_t cblock,
		      struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = true;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				dm_cblock_t cblock,
				struct dm_bio_prison_cell *old_ocell,
				struct dm_bio_prison_cell *new_ocell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

/*
 * Invalidate a cache entry. No writeback occurs; any changes in the cache
 * block are thrown away.
 */
static void invalidate(struct cache *cache, struct prealloc *structs,
		       dm_oblock_t oblock, dm_cblock_t cblock,
		       struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = true;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void discard(struct cache *cache, struct prealloc *structs,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = true;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = false;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(bio->bi_iter.bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	/*
	 * REQ_PREFLUSH is not directed at any particular block so we don't
	 * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH
	 * by dm-core.
	 */
	issue(cache, bio);
}

static void process_discard_bio(struct cache *cache, struct prealloc *structs,
				struct bio *bio)
{
	int r;
	dm_dblock_t b, e;
	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;

	calc_discard_block_range(cache, bio, &b, &e);
	if (b == e) {
		bio_endio(bio);
		return;
	}

	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
			     (cell_free_fn) prealloc_put_cell,
			     structs, &new_ocell);
	if (r > 0)
		return;

	discard(cache, structs, new_ocell);
}

static bool spare_migration_bandwidth(struct cache *cache)
{
	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
		cache->sectors_per_block;
	return current_volume < cache->migration_threshold;
}

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_miss : &cache->stats.write_miss);
}

/*----------------------------------------------------------------*/

struct inc_detail {
	struct cache *cache;
	struct bio_list bios_for_issue;
	struct bio_list unhandled_bios;
	bool any_writes;
};

static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct inc_detail *detail = context;
	struct cache *cache = detail->cache;

	inc_ds(cache, cell->holder, cell);
	if (bio_data_dir(cell->holder) == WRITE)
		detail->any_writes = true;

	while ((bio = bio_list_pop(&cell->bios))) {
		if (discard_or_flush(bio)) {
			bio_list_add(&detail->unhandled_bios, bio);
			continue;
		}

		if (bio_data_dir(bio) == WRITE)
			detail->any_writes = true;

		bio_list_add(&detail->bios_for_issue, bio);
		inc_ds(cache, bio, cell);
	}
}

// FIXME: refactor these two
static void remap_cell_to_origin_clear_discard(struct cache *cache,
					       struct dm_bio_prison_cell *cell,
					       dm_oblock_t oblock, bool issue_holder)
{
	struct bio *bio;
	unsigned long flags;
	struct inc_detail detail;

	detail.cache = cache;
	bio_list_init(&detail.bios_for_issue);
	bio_list_init(&detail.unhandled_bios);
	detail.any_writes = false;

	spin_lock_irqsave(&cache->lock, flags);
	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	remap_to_origin(cache, cell->holder);
	if (issue_holder)
		issue(cache, cell->holder);
	else
		accounted_begin(cache, cell->holder);

	if (detail.any_writes)
		clear_discard(cache, oblock_to_dblock(cache, oblock));

	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
		remap_to_origin(cache, bio);
		issue(cache, bio);
	}

	free_prison_cell(cache, cell);
}

static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
				      dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
{
	struct bio *bio;
	unsigned long flags;
	struct inc_detail detail;

	detail.cache = cache;
	bio_list_init(&detail.bios_for_issue);
	bio_list_init(&detail.unhandled_bios);
	detail.any_writes = false;

	spin_lock_irqsave(&cache->lock, flags);
	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	remap_to_cache(cache, cell->holder, cblock);
	if (issue_holder)
		issue(cache, cell->holder);
	else
		accounted_begin(cache, cell->holder);

	if (detail.any_writes) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}

	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
		remap_to_cache(cache, bio, cblock);
		issue(cache, bio);
	}

	free_prison_cell(cache, cell);
}

/*----------------------------------------------------------------*/

struct old_oblock_lock {
	struct policy_locker locker;
	struct cache *cache;
	struct prealloc *structs;
	struct dm_bio_prison_cell *cell;
};

static int null_locker(struct policy_locker *locker, dm_oblock_t b)
{
	/* This should never be called */
	BUG();
	return 0;
}

static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
{
	struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
	struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);

	return bio_detain(l->cache, b, NULL, cell_prealloc,
			  (cell_free_fn) prealloc_put_cell,
			  l->structs, &l->cell);
}

static void process_cell(struct cache *cache, struct prealloc *structs,
			 struct dm_bio_prison_cell *new_ocell)
{
	int r;
	bool release_cell = true;
	struct bio *bio = new_ocell->holder;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct policy_result lookup_result;
	bool passthrough = passthrough_mode(&cache->features);
	bool fast_promotion, can_migrate;
	struct old_oblock_lock ool;

	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
	can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));

	ool.locker.fn = cell_locker;
	ool.cache = cache;
	ool.structs = structs;
	ool.cell = NULL;
	r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
		       bio, &ool.locker, &lookup_result);

	if (r == -EWOULDBLOCK)
		/* migration has been denied */
		lookup_result.op = POLICY_MISS;

	switch (lookup_result.op) {
	case POLICY_HIT:
		if (passthrough) {
			inc_miss_counter(cache, bio);

			/*
			 * Passthrough always maps to the origin,
			 * invalidating any cache blocks that are written
			 * to.
			 */

			if (bio_data_dir(bio) == WRITE) {
				atomic_inc(&cache->stats.demotion);
				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
				release_cell = false;

			} else {
				/* FIXME: factor out issue_origin() */
				remap_to_origin_clear_discard(cache, bio, block);
				inc_and_issue(cache, bio, new_ocell);
			}
		} else {
			inc_hit_counter(cache, bio);

			if (bio_data_dir(bio) == WRITE &&
			    writethrough_mode(&cache->features) &&
			    !is_dirty(cache, lookup_result.cblock)) {
				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
				inc_and_issue(cache, bio, new_ocell);

			} else {
				remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
				release_cell = false;
			}
		}

		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
		release_cell = false;
		break;

	case POLICY_NEW:
		atomic_inc(&cache->stats.promotion);
		promote(cache, structs, block, lookup_result.cblock, new_ocell);
		release_cell = false;
		break;

	case POLICY_REPLACE:
		atomic_inc(&cache->stats.demotion);
		atomic_inc(&cache->stats.promotion);
		demote_then_promote(cache, structs, lookup_result.old_oblock,
				    block, lookup_result.cblock,
				    ool.cell, new_ocell);
		release_cell = false;
		break;

	default:
		DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
			    cache_device_name(cache), __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
	}

	if (release_cell)
		cell_defer(cache, new_ocell, false);
}

static void process_bio(struct cache *cache, struct prealloc *structs,
			struct bio *bio)
{
	int r;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain(cache, block, bio, cell_prealloc,
		       (cell_free_fn) prealloc_put_cell,
		       structs, &new_ocell);
	if (r > 0)
		return;

	process_cell(cache, structs, new_ocell);
}

static int need_commit_due_to_time(struct cache *cache)
{
	return jiffies < cache->last_commit_jiffies ||
	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 */
static int commit(struct cache *cache, bool clean_shutdown)
{
	int r;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return -EINVAL;

	atomic_inc(&cache->stats.commit_count);
	r = dm_cache_commit(cache->cmd, clean_shutdown);
	if (r)
		metadata_operation_failed(cache, "dm_cache_commit", r);

	return r;
}

static int commit_if_needed(struct cache *cache)
{
	int r = 0;

	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
	    dm_cache_changed_this_transaction(cache->cmd)) {
		r = commit(cache, false);
		cache->commit_requested = false;
		cache->last_commit_jiffies = jiffies;
	}

	return r;
}

static void process_deferred_bios(struct cache *cache)
{
	bool prealloc_used = false;
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;
	struct prealloc structs;

	memset(&structs, 0, sizeof(structs));
	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while (!bio_list_empty(&bios)) {
		/*
		 * If we've got no free migration structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		prealloc_used = true;
		if (prealloc_data_structs(cache, &structs)) {
			spin_lock_irqsave(&cache->lock, flags);
			bio_list_merge(&cache->deferred_bios, &bios);
			spin_unlock_irqrestore(&cache->lock, flags);
			break;
		}

		bio = bio_list_pop(&bios);

		if (bio->bi_opf & REQ_PREFLUSH)
			process_flush_bio(cache, bio);
		else if (bio_op(bio) == REQ_OP_DISCARD)
			process_discard_bio(cache, &structs, bio);
		else
			process_bio(cache, &structs, bio);
	}

	if (prealloc_used)
		prealloc_free_structs(cache, &structs);
}

static void process_deferred_cells(struct cache *cache)
{
	bool prealloc_used = false;
	unsigned long flags;
	struct dm_bio_prison_cell *cell, *tmp;
	struct list_head cells;
	struct prealloc structs;

	memset(&structs, 0, sizeof(structs));

	INIT_LIST_HEAD(&cells);

	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(&cache->deferred_cells, &cells);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(cell, tmp, &cells, user_list) {
		/*
		 * If we've got no free migration structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
2010 */ 2011 prealloc_used = true; 2012 if (prealloc_data_structs(cache, &structs)) { 2013 spin_lock_irqsave(&cache->lock, flags); 2014 list_splice(&cells, &cache->deferred_cells); 2015 spin_unlock_irqrestore(&cache->lock, flags); 2016 break; 2017 } 2018 2019 process_cell(cache, &structs, cell); 2020 } 2021 2022 if (prealloc_used) 2023 prealloc_free_structs(cache, &structs); 2024 } 2025 2026 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2027 { 2028 unsigned long flags; 2029 struct bio_list bios; 2030 struct bio *bio; 2031 2032 bio_list_init(&bios); 2033 2034 spin_lock_irqsave(&cache->lock, flags); 2035 bio_list_merge(&bios, &cache->deferred_flush_bios); 2036 bio_list_init(&cache->deferred_flush_bios); 2037 spin_unlock_irqrestore(&cache->lock, flags); 2038 2039 /* 2040 * These bios have already been through inc_ds() 2041 */ 2042 while ((bio = bio_list_pop(&bios))) 2043 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2044 } 2045 2046 static void process_deferred_writethrough_bios(struct cache *cache) 2047 { 2048 unsigned long flags; 2049 struct bio_list bios; 2050 struct bio *bio; 2051 2052 bio_list_init(&bios); 2053 2054 spin_lock_irqsave(&cache->lock, flags); 2055 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 2056 bio_list_init(&cache->deferred_writethrough_bios); 2057 spin_unlock_irqrestore(&cache->lock, flags); 2058 2059 /* 2060 * These bios have already been through inc_ds() 2061 */ 2062 while ((bio = bio_list_pop(&bios))) 2063 accounted_request(cache, bio); 2064 } 2065 2066 static void writeback_some_dirty_blocks(struct cache *cache) 2067 { 2068 bool prealloc_used = false; 2069 dm_oblock_t oblock; 2070 dm_cblock_t cblock; 2071 struct prealloc structs; 2072 struct dm_bio_prison_cell *old_ocell; 2073 bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2074 2075 memset(&structs, 0, sizeof(structs)); 2076 2077 while (spare_migration_bandwidth(cache)) { 2078 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) 2079 break; /* no work to do */ 2080 2081 prealloc_used = true; 2082 if (prealloc_data_structs(cache, &structs) || 2083 get_cell(cache, oblock, &structs, &old_ocell)) { 2084 policy_set_dirty(cache->policy, oblock); 2085 break; 2086 } 2087 2088 writeback(cache, &structs, oblock, cblock, old_ocell); 2089 } 2090 2091 if (prealloc_used) 2092 prealloc_free_structs(cache, &structs); 2093 } 2094 2095 /*---------------------------------------------------------------- 2096 * Invalidations. 2097 * Dropping something from the cache *without* writing back. 
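 * Requested via the 'invalidate_cblocks' target message, which is only accepted while in passthrough mode.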
2098 *--------------------------------------------------------------*/ 2099 2100 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2101 { 2102 int r = 0; 2103 uint64_t begin = from_cblock(req->cblocks->begin); 2104 uint64_t end = from_cblock(req->cblocks->end); 2105 2106 while (begin != end) { 2107 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2108 if (!r) { 2109 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2110 if (r) { 2111 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2112 break; 2113 } 2114 2115 } else if (r == -ENODATA) { 2116 /* harmless, already unmapped */ 2117 r = 0; 2118 2119 } else { 2120 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2121 break; 2122 } 2123 2124 begin++; 2125 } 2126 2127 cache->commit_requested = true; 2128 2129 req->err = r; 2130 atomic_set(&req->complete, 1); 2131 2132 wake_up(&req->result_wait); 2133 } 2134 2135 static void process_invalidation_requests(struct cache *cache) 2136 { 2137 struct list_head list; 2138 struct invalidation_request *req, *tmp; 2139 2140 INIT_LIST_HEAD(&list); 2141 spin_lock(&cache->invalidation_lock); 2142 list_splice_init(&cache->invalidation_requests, &list); 2143 spin_unlock(&cache->invalidation_lock); 2144 2145 list_for_each_entry_safe (req, tmp, &list, list) 2146 process_invalidation_request(cache, req); 2147 } 2148 2149 /*---------------------------------------------------------------- 2150 * Main worker loop 2151 *--------------------------------------------------------------*/ 2152 static bool is_quiescing(struct cache *cache) 2153 { 2154 return atomic_read(&cache->quiescing); 2155 } 2156 2157 static void ack_quiescing(struct cache *cache) 2158 { 2159 if (is_quiescing(cache)) { 2160 atomic_inc(&cache->quiescing_ack); 2161 wake_up(&cache->quiescing_wait); 2162 } 2163 } 2164 2165 static void wait_for_quiescing_ack(struct cache *cache) 2166 { 2167 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2168 } 2169 2170 static void start_quiescing(struct cache *cache) 2171 { 2172 atomic_inc(&cache->quiescing); 2173 wait_for_quiescing_ack(cache); 2174 } 2175 2176 static void stop_quiescing(struct cache *cache) 2177 { 2178 atomic_set(&cache->quiescing, 0); 2179 atomic_set(&cache->quiescing_ack, 0); 2180 } 2181 2182 static void wait_for_migrations(struct cache *cache) 2183 { 2184 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2185 } 2186 2187 static void stop_worker(struct cache *cache) 2188 { 2189 cancel_delayed_work(&cache->waker); 2190 flush_workqueue(cache->wq); 2191 } 2192 2193 static void requeue_deferred_cells(struct cache *cache) 2194 { 2195 unsigned long flags; 2196 struct list_head cells; 2197 struct dm_bio_prison_cell *cell, *tmp; 2198 2199 INIT_LIST_HEAD(&cells); 2200 spin_lock_irqsave(&cache->lock, flags); 2201 list_splice_init(&cache->deferred_cells, &cells); 2202 spin_unlock_irqrestore(&cache->lock, flags); 2203 2204 list_for_each_entry_safe(cell, tmp, &cells, user_list) 2205 cell_requeue(cache, cell); 2206 } 2207 2208 static void requeue_deferred_bios(struct cache *cache) 2209 { 2210 struct bio *bio; 2211 struct bio_list bios; 2212 2213 bio_list_init(&bios); 2214 bio_list_merge(&bios, &cache->deferred_bios); 2215 bio_list_init(&cache->deferred_bios); 2216 2217 while ((bio = bio_list_pop(&bios))) { 2218 bio->bi_error = DM_ENDIO_REQUEUE; 2219 bio_endio(bio); 2220 } 2221 } 2222 2223 static int more_work(struct cache *cache) 2224 { 2225 if (is_quiescing(cache)) 
2226 return !list_empty(&cache->quiesced_migrations) || 2227 !list_empty(&cache->completed_migrations) || 2228 !list_empty(&cache->need_commit_migrations); 2229 else 2230 return !bio_list_empty(&cache->deferred_bios) || 2231 !list_empty(&cache->deferred_cells) || 2232 !bio_list_empty(&cache->deferred_flush_bios) || 2233 !bio_list_empty(&cache->deferred_writethrough_bios) || 2234 !list_empty(&cache->quiesced_migrations) || 2235 !list_empty(&cache->completed_migrations) || 2236 !list_empty(&cache->need_commit_migrations) || 2237 cache->invalidate; 2238 } 2239 2240 static void do_worker(struct work_struct *ws) 2241 { 2242 struct cache *cache = container_of(ws, struct cache, worker); 2243 2244 do { 2245 if (!is_quiescing(cache)) { 2246 writeback_some_dirty_blocks(cache); 2247 process_deferred_writethrough_bios(cache); 2248 process_deferred_bios(cache); 2249 process_deferred_cells(cache); 2250 process_invalidation_requests(cache); 2251 } 2252 2253 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2254 process_migrations(cache, &cache->completed_migrations, complete_migration); 2255 2256 if (commit_if_needed(cache)) { 2257 process_deferred_flush_bios(cache, false); 2258 process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2259 } else { 2260 process_deferred_flush_bios(cache, true); 2261 process_migrations(cache, &cache->need_commit_migrations, 2262 migration_success_post_commit); 2263 } 2264 2265 ack_quiescing(cache); 2266 2267 } while (more_work(cache)); 2268 } 2269 2270 /* 2271 * We want to commit periodically so that not too much 2272 * unwritten metadata builds up. 2273 */ 2274 static void do_waker(struct work_struct *ws) 2275 { 2276 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2277 policy_tick(cache->policy, true); 2278 wake_worker(cache); 2279 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2280 } 2281 2282 /*----------------------------------------------------------------*/ 2283 2284 static int is_congested(struct dm_dev *dev, int bdi_bits) 2285 { 2286 struct request_queue *q = bdev_get_queue(dev->bdev); 2287 return bdi_congested(&q->backing_dev_info, bdi_bits); 2288 } 2289 2290 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2291 { 2292 struct cache *cache = container_of(cb, struct cache, callbacks); 2293 2294 return is_congested(cache->origin_dev, bdi_bits) || 2295 is_congested(cache->cache_dev, bdi_bits); 2296 } 2297 2298 /*---------------------------------------------------------------- 2299 * Target methods 2300 *--------------------------------------------------------------*/ 2301 2302 /* 2303 * This function gets called on the error paths of the constructor, so we 2304 * have to cope with a partially initialised struct. 
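 * Hence the NULL checks before each teardown call below.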
2305 */ 2306 static void destroy(struct cache *cache) 2307 { 2308 unsigned i; 2309 2310 mempool_destroy(cache->migration_pool); 2311 2312 if (cache->all_io_ds) 2313 dm_deferred_set_destroy(cache->all_io_ds); 2314 2315 if (cache->prison) 2316 dm_bio_prison_destroy(cache->prison); 2317 2318 if (cache->wq) 2319 destroy_workqueue(cache->wq); 2320 2321 if (cache->dirty_bitset) 2322 free_bitset(cache->dirty_bitset); 2323 2324 if (cache->discard_bitset) 2325 free_bitset(cache->discard_bitset); 2326 2327 if (cache->copier) 2328 dm_kcopyd_client_destroy(cache->copier); 2329 2330 if (cache->cmd) 2331 dm_cache_metadata_close(cache->cmd); 2332 2333 if (cache->metadata_dev) 2334 dm_put_device(cache->ti, cache->metadata_dev); 2335 2336 if (cache->origin_dev) 2337 dm_put_device(cache->ti, cache->origin_dev); 2338 2339 if (cache->cache_dev) 2340 dm_put_device(cache->ti, cache->cache_dev); 2341 2342 if (cache->policy) 2343 dm_cache_policy_destroy(cache->policy); 2344 2345 for (i = 0; i < cache->nr_ctr_args ; i++) 2346 kfree(cache->ctr_args[i]); 2347 kfree(cache->ctr_args); 2348 2349 kfree(cache); 2350 } 2351 2352 static void cache_dtr(struct dm_target *ti) 2353 { 2354 struct cache *cache = ti->private; 2355 2356 destroy(cache); 2357 } 2358 2359 static sector_t get_dev_size(struct dm_dev *dev) 2360 { 2361 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2362 } 2363 2364 /*----------------------------------------------------------------*/ 2365 2366 /* 2367 * Construct a cache device mapping. 2368 * 2369 * cache <metadata dev> <cache dev> <origin dev> <block size> 2370 * <#feature args> [<feature arg>]* 2371 * <policy> <#policy args> [<policy arg>]* 2372 * 2373 * metadata dev : fast device holding the persistent metadata 2374 * cache dev : fast device holding cached data blocks 2375 * origin dev : slow device holding original data blocks 2376 * block size : cache unit size in sectors 2377 * 2378 * #feature args : number of feature arguments passed 2379 * feature args : writethrough. (The default is writeback.) 2380 * 2381 * policy : the replacement policy to use 2382 * #policy args : an even number of policy arguments corresponding 2383 * to key/value pairs passed to the policy 2384 * policy args : key/value pairs passed to the policy 2385 * E.g. 'sequential_threshold 1024' 2386 * See cache-policies.txt for details. 2387 * 2388 * Optional feature arguments are: 2389 * writethrough : write through caching that prohibits cache block 2390 * content from being different from origin block content. 2391 * Without this argument, the default behaviour is to write 2392 * back cache block contents later for performance reasons, 2393 * so they may differ from the corresponding origin blocks. 
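 *
 * Illustrative example only (device names and sizes below are hypothetical):
 * a 20GiB origin cached in writethrough mode with 256KiB (512 sector) blocks
 * and the smq policy might be set up with a table line such as:
 *
 *   0 41943040 cache /dev/mapper/fast-meta /dev/mapper/fast-data
 *       /dev/mapper/slow-origin 512 1 writethrough smq 0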
2394 */ 2395 struct cache_args { 2396 struct dm_target *ti; 2397 2398 struct dm_dev *metadata_dev; 2399 2400 struct dm_dev *cache_dev; 2401 sector_t cache_sectors; 2402 2403 struct dm_dev *origin_dev; 2404 sector_t origin_sectors; 2405 2406 uint32_t block_size; 2407 2408 const char *policy_name; 2409 int policy_argc; 2410 const char **policy_argv; 2411 2412 struct cache_features features; 2413 }; 2414 2415 static void destroy_cache_args(struct cache_args *ca) 2416 { 2417 if (ca->metadata_dev) 2418 dm_put_device(ca->ti, ca->metadata_dev); 2419 2420 if (ca->cache_dev) 2421 dm_put_device(ca->ti, ca->cache_dev); 2422 2423 if (ca->origin_dev) 2424 dm_put_device(ca->ti, ca->origin_dev); 2425 2426 kfree(ca); 2427 } 2428 2429 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2430 { 2431 if (!as->argc) { 2432 *error = "Insufficient args"; 2433 return false; 2434 } 2435 2436 return true; 2437 } 2438 2439 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2440 char **error) 2441 { 2442 int r; 2443 sector_t metadata_dev_size; 2444 char b[BDEVNAME_SIZE]; 2445 2446 if (!at_least_one_arg(as, error)) 2447 return -EINVAL; 2448 2449 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2450 &ca->metadata_dev); 2451 if (r) { 2452 *error = "Error opening metadata device"; 2453 return r; 2454 } 2455 2456 metadata_dev_size = get_dev_size(ca->metadata_dev); 2457 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2458 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2459 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2460 2461 return 0; 2462 } 2463 2464 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2465 char **error) 2466 { 2467 int r; 2468 2469 if (!at_least_one_arg(as, error)) 2470 return -EINVAL; 2471 2472 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2473 &ca->cache_dev); 2474 if (r) { 2475 *error = "Error opening cache device"; 2476 return r; 2477 } 2478 ca->cache_sectors = get_dev_size(ca->cache_dev); 2479 2480 return 0; 2481 } 2482 2483 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2484 char **error) 2485 { 2486 int r; 2487 2488 if (!at_least_one_arg(as, error)) 2489 return -EINVAL; 2490 2491 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2492 &ca->origin_dev); 2493 if (r) { 2494 *error = "Error opening origin device"; 2495 return r; 2496 } 2497 2498 ca->origin_sectors = get_dev_size(ca->origin_dev); 2499 if (ca->ti->len > ca->origin_sectors) { 2500 *error = "Device size larger than cached device"; 2501 return -EINVAL; 2502 } 2503 2504 return 0; 2505 } 2506 2507 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2508 char **error) 2509 { 2510 unsigned long block_size; 2511 2512 if (!at_least_one_arg(as, error)) 2513 return -EINVAL; 2514 2515 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2516 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2517 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2518 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2519 *error = "Invalid data block size"; 2520 return -EINVAL; 2521 } 2522 2523 if (block_size > ca->cache_sectors) { 2524 *error = "Data block size is larger than the cache device"; 2525 return -EINVAL; 2526 } 2527 2528 ca->block_size = block_size; 2529 2530 return 0; 2531 } 2532 2533 static void init_features(struct cache_features *cf) 2534 { 2535 cf->mode = CM_WRITE; 2536 cf->io_mode = 
CM_IO_WRITEBACK; 2537 } 2538 2539 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2540 char **error) 2541 { 2542 static struct dm_arg _args[] = { 2543 {0, 1, "Invalid number of cache feature arguments"}, 2544 }; 2545 2546 int r; 2547 unsigned argc; 2548 const char *arg; 2549 struct cache_features *cf = &ca->features; 2550 2551 init_features(cf); 2552 2553 r = dm_read_arg_group(_args, as, &argc, error); 2554 if (r) 2555 return -EINVAL; 2556 2557 while (argc--) { 2558 arg = dm_shift_arg(as); 2559 2560 if (!strcasecmp(arg, "writeback")) 2561 cf->io_mode = CM_IO_WRITEBACK; 2562 2563 else if (!strcasecmp(arg, "writethrough")) 2564 cf->io_mode = CM_IO_WRITETHROUGH; 2565 2566 else if (!strcasecmp(arg, "passthrough")) 2567 cf->io_mode = CM_IO_PASSTHROUGH; 2568 2569 else { 2570 *error = "Unrecognised cache feature requested"; 2571 return -EINVAL; 2572 } 2573 } 2574 2575 return 0; 2576 } 2577 2578 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2579 char **error) 2580 { 2581 static struct dm_arg _args[] = { 2582 {0, 1024, "Invalid number of policy arguments"}, 2583 }; 2584 2585 int r; 2586 2587 if (!at_least_one_arg(as, error)) 2588 return -EINVAL; 2589 2590 ca->policy_name = dm_shift_arg(as); 2591 2592 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2593 if (r) 2594 return -EINVAL; 2595 2596 ca->policy_argv = (const char **)as->argv; 2597 dm_consume_args(as, ca->policy_argc); 2598 2599 return 0; 2600 } 2601 2602 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2603 char **error) 2604 { 2605 int r; 2606 struct dm_arg_set as; 2607 2608 as.argc = argc; 2609 as.argv = argv; 2610 2611 r = parse_metadata_dev(ca, &as, error); 2612 if (r) 2613 return r; 2614 2615 r = parse_cache_dev(ca, &as, error); 2616 if (r) 2617 return r; 2618 2619 r = parse_origin_dev(ca, &as, error); 2620 if (r) 2621 return r; 2622 2623 r = parse_block_size(ca, &as, error); 2624 if (r) 2625 return r; 2626 2627 r = parse_features(ca, &as, error); 2628 if (r) 2629 return r; 2630 2631 r = parse_policy(ca, &as, error); 2632 if (r) 2633 return r; 2634 2635 return 0; 2636 } 2637 2638 /*----------------------------------------------------------------*/ 2639 2640 static struct kmem_cache *migration_cache; 2641 2642 #define NOT_CORE_OPTION 1 2643 2644 static int process_config_option(struct cache *cache, const char *key, const char *value) 2645 { 2646 unsigned long tmp; 2647 2648 if (!strcasecmp(key, "migration_threshold")) { 2649 if (kstrtoul(value, 10, &tmp)) 2650 return -EINVAL; 2651 2652 cache->migration_threshold = tmp; 2653 return 0; 2654 } 2655 2656 return NOT_CORE_OPTION; 2657 } 2658 2659 static int set_config_value(struct cache *cache, const char *key, const char *value) 2660 { 2661 int r = process_config_option(cache, key, value); 2662 2663 if (r == NOT_CORE_OPTION) 2664 r = policy_set_config_value(cache->policy, key, value); 2665 2666 if (r) 2667 DMWARN("bad config value for %s: %s", key, value); 2668 2669 return r; 2670 } 2671 2672 static int set_config_values(struct cache *cache, int argc, const char **argv) 2673 { 2674 int r = 0; 2675 2676 if (argc & 1) { 2677 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2678 return -EINVAL; 2679 } 2680 2681 while (argc) { 2682 r = set_config_value(cache, argv[0], argv[1]); 2683 if (r) 2684 break; 2685 2686 argc -= 2; 2687 argv += 2; 2688 } 2689 2690 return r; 2691 } 2692 2693 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2694 char **error) 
2695 { 2696 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2697 cache->cache_size, 2698 cache->origin_sectors, 2699 cache->sectors_per_block); 2700 if (IS_ERR(p)) { 2701 *error = "Error creating cache's policy"; 2702 return PTR_ERR(p); 2703 } 2704 cache->policy = p; 2705 2706 return 0; 2707 } 2708 2709 /* 2710 * We want the discard block size to be at least the size of the cache 2711 * block size and have no more than 2^14 discard blocks across the origin. 2712 */ 2713 #define MAX_DISCARD_BLOCKS (1 << 14) 2714 2715 static bool too_many_discard_blocks(sector_t discard_block_size, 2716 sector_t origin_size) 2717 { 2718 (void) sector_div(origin_size, discard_block_size); 2719 2720 return origin_size > MAX_DISCARD_BLOCKS; 2721 } 2722 2723 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2724 sector_t origin_size) 2725 { 2726 sector_t discard_block_size = cache_block_size; 2727 2728 if (origin_size) 2729 while (too_many_discard_blocks(discard_block_size, origin_size)) 2730 discard_block_size *= 2; 2731 2732 return discard_block_size; 2733 } 2734 2735 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2736 { 2737 dm_block_t nr_blocks = from_cblock(size); 2738 2739 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2740 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2741 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2742 "Please consider increasing the cache block size to reduce the overall cache block count.", 2743 (unsigned long long) nr_blocks); 2744 2745 cache->cache_size = size; 2746 } 2747 2748 #define DEFAULT_MIGRATION_THRESHOLD 2048 2749 2750 static int cache_create(struct cache_args *ca, struct cache **result) 2751 { 2752 int r = 0; 2753 char **error = &ca->ti->error; 2754 struct cache *cache; 2755 struct dm_target *ti = ca->ti; 2756 dm_block_t origin_blocks; 2757 struct dm_cache_metadata *cmd; 2758 bool may_format = ca->features.mode == CM_WRITE; 2759 2760 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2761 if (!cache) 2762 return -ENOMEM; 2763 2764 cache->ti = ca->ti; 2765 ti->private = cache; 2766 ti->num_flush_bios = 2; 2767 ti->flush_supported = true; 2768 2769 ti->num_discard_bios = 1; 2770 ti->discards_supported = true; 2771 ti->discard_zeroes_data_unsupported = true; 2772 ti->split_discard_bios = false; 2773 2774 cache->features = ca->features; 2775 ti->per_io_data_size = get_per_bio_data_size(cache); 2776 2777 cache->callbacks.congested_fn = cache_is_congested; 2778 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2779 2780 cache->metadata_dev = ca->metadata_dev; 2781 cache->origin_dev = ca->origin_dev; 2782 cache->cache_dev = ca->cache_dev; 2783 2784 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2785 2786 /* FIXME: factor out this whole section */ 2787 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2788 origin_blocks = block_div(origin_blocks, ca->block_size); 2789 cache->origin_blocks = to_oblock(origin_blocks); 2790 2791 cache->sectors_per_block = ca->block_size; 2792 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2793 r = -EINVAL; 2794 goto bad; 2795 } 2796 2797 if (ca->block_size & (ca->block_size - 1)) { 2798 dm_block_t cache_size = ca->cache_sectors; 2799 2800 cache->sectors_per_block_shift = -1; 2801 cache_size = block_div(cache_size, ca->block_size); 2802 set_cache_size(cache, to_cblock(cache_size)); 2803 } else { 2804 cache->sectors_per_block_shift = 
__ffs(ca->block_size); 2805 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2806 } 2807 2808 r = create_cache_policy(cache, ca, error); 2809 if (r) 2810 goto bad; 2811 2812 cache->policy_nr_args = ca->policy_argc; 2813 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2814 2815 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2816 if (r) { 2817 *error = "Error setting cache policy's config values"; 2818 goto bad; 2819 } 2820 2821 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2822 ca->block_size, may_format, 2823 dm_cache_policy_get_hint_size(cache->policy)); 2824 if (IS_ERR(cmd)) { 2825 *error = "Error creating metadata object"; 2826 r = PTR_ERR(cmd); 2827 goto bad; 2828 } 2829 cache->cmd = cmd; 2830 set_cache_mode(cache, CM_WRITE); 2831 if (get_cache_mode(cache) != CM_WRITE) { 2832 *error = "Unable to get write access to metadata, please check/repair metadata."; 2833 r = -EINVAL; 2834 goto bad; 2835 } 2836 2837 if (passthrough_mode(&cache->features)) { 2838 bool all_clean; 2839 2840 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2841 if (r) { 2842 *error = "dm_cache_metadata_all_clean() failed"; 2843 goto bad; 2844 } 2845 2846 if (!all_clean) { 2847 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2848 r = -EINVAL; 2849 goto bad; 2850 } 2851 } 2852 2853 spin_lock_init(&cache->lock); 2854 INIT_LIST_HEAD(&cache->deferred_cells); 2855 bio_list_init(&cache->deferred_bios); 2856 bio_list_init(&cache->deferred_flush_bios); 2857 bio_list_init(&cache->deferred_writethrough_bios); 2858 INIT_LIST_HEAD(&cache->quiesced_migrations); 2859 INIT_LIST_HEAD(&cache->completed_migrations); 2860 INIT_LIST_HEAD(&cache->need_commit_migrations); 2861 atomic_set(&cache->nr_allocated_migrations, 0); 2862 atomic_set(&cache->nr_io_migrations, 0); 2863 init_waitqueue_head(&cache->migration_wait); 2864 2865 init_waitqueue_head(&cache->quiescing_wait); 2866 atomic_set(&cache->quiescing, 0); 2867 atomic_set(&cache->quiescing_ack, 0); 2868 2869 r = -ENOMEM; 2870 atomic_set(&cache->nr_dirty, 0); 2871 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2872 if (!cache->dirty_bitset) { 2873 *error = "could not allocate dirty bitset"; 2874 goto bad; 2875 } 2876 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2877 2878 cache->discard_block_size = 2879 calculate_discard_block_size(cache->sectors_per_block, 2880 cache->origin_sectors); 2881 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2882 cache->discard_block_size)); 2883 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2884 if (!cache->discard_bitset) { 2885 *error = "could not allocate discard bitset"; 2886 goto bad; 2887 } 2888 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2889 2890 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2891 if (IS_ERR(cache->copier)) { 2892 *error = "could not create kcopyd client"; 2893 r = PTR_ERR(cache->copier); 2894 goto bad; 2895 } 2896 2897 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2898 if (!cache->wq) { 2899 *error = "could not create workqueue for metadata object"; 2900 goto bad; 2901 } 2902 INIT_WORK(&cache->worker, do_worker); 2903 INIT_DELAYED_WORK(&cache->waker, do_waker); 2904 cache->last_commit_jiffies = jiffies; 2905 2906 cache->prison = dm_bio_prison_create(); 2907 if (!cache->prison) { 2908 *error = "could not create bio prison"; 2909 goto bad; 2910 } 2911 
2912 cache->all_io_ds = dm_deferred_set_create(); 2913 if (!cache->all_io_ds) { 2914 *error = "could not create all_io deferred set"; 2915 goto bad; 2916 } 2917 2918 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2919 migration_cache); 2920 if (!cache->migration_pool) { 2921 *error = "Error creating cache's migration mempool"; 2922 goto bad; 2923 } 2924 2925 cache->need_tick_bio = true; 2926 cache->sized = false; 2927 cache->invalidate = false; 2928 cache->commit_requested = false; 2929 cache->loaded_mappings = false; 2930 cache->loaded_discards = false; 2931 2932 load_stats(cache); 2933 2934 atomic_set(&cache->stats.demotion, 0); 2935 atomic_set(&cache->stats.promotion, 0); 2936 atomic_set(&cache->stats.copies_avoided, 0); 2937 atomic_set(&cache->stats.cache_cell_clash, 0); 2938 atomic_set(&cache->stats.commit_count, 0); 2939 atomic_set(&cache->stats.discard_count, 0); 2940 2941 spin_lock_init(&cache->invalidation_lock); 2942 INIT_LIST_HEAD(&cache->invalidation_requests); 2943 2944 iot_init(&cache->origin_tracker); 2945 2946 *result = cache; 2947 return 0; 2948 2949 bad: 2950 destroy(cache); 2951 return r; 2952 } 2953 2954 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2955 { 2956 unsigned i; 2957 const char **copy; 2958 2959 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2960 if (!copy) 2961 return -ENOMEM; 2962 for (i = 0; i < argc; i++) { 2963 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2964 if (!copy[i]) { 2965 while (i--) 2966 kfree(copy[i]); 2967 kfree(copy); 2968 return -ENOMEM; 2969 } 2970 } 2971 2972 cache->nr_ctr_args = argc; 2973 cache->ctr_args = copy; 2974 2975 return 0; 2976 } 2977 2978 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2979 { 2980 int r = -EINVAL; 2981 struct cache_args *ca; 2982 struct cache *cache = NULL; 2983 2984 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2985 if (!ca) { 2986 ti->error = "Error allocating memory for cache"; 2987 return -ENOMEM; 2988 } 2989 ca->ti = ti; 2990 2991 r = parse_cache_args(ca, argc, argv, &ti->error); 2992 if (r) 2993 goto out; 2994 2995 r = cache_create(ca, &cache); 2996 if (r) 2997 goto out; 2998 2999 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 3000 if (r) { 3001 destroy(cache); 3002 goto out; 3003 } 3004 3005 ti->private = cache; 3006 3007 out: 3008 destroy_cache_args(ca); 3009 return r; 3010 } 3011 3012 /*----------------------------------------------------------------*/ 3013 3014 static int cache_map(struct dm_target *ti, struct bio *bio) 3015 { 3016 struct cache *cache = ti->private; 3017 3018 int r; 3019 struct dm_bio_prison_cell *cell = NULL; 3020 dm_oblock_t block = get_bio_block(cache, bio); 3021 size_t pb_data_size = get_per_bio_data_size(cache); 3022 bool can_migrate = false; 3023 bool fast_promotion; 3024 struct policy_result lookup_result; 3025 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 3026 struct old_oblock_lock ool; 3027 3028 ool.locker.fn = null_locker; 3029 3030 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 3031 /* 3032 * This can only occur if the io goes to a partial block at 3033 * the end of the origin device. We don't cache these. 3034 * Just remap to the origin and carry on. 3035 */ 3036 remap_to_origin(cache, bio); 3037 accounted_begin(cache, bio); 3038 return DM_MAPIO_REMAPPED; 3039 } 3040 3041 if (discard_or_flush(bio)) { 3042 defer_bio(cache, bio); 3043 return DM_MAPIO_SUBMITTED; 3044 } 3045 3046 /* 3047 * Check to see if that block is currently migrating. 
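 * If a prison cell cannot be allocated, fall back to deferring the bio to the worker thread.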
3048 */ 3049 cell = alloc_prison_cell(cache); 3050 if (!cell) { 3051 defer_bio(cache, bio); 3052 return DM_MAPIO_SUBMITTED; 3053 } 3054 3055 r = bio_detain(cache, block, bio, cell, 3056 (cell_free_fn) free_prison_cell, 3057 cache, &cell); 3058 if (r) { 3059 if (r < 0) 3060 defer_bio(cache, bio); 3061 3062 return DM_MAPIO_SUBMITTED; 3063 } 3064 3065 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 3066 3067 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 3068 bio, &ool.locker, &lookup_result); 3069 if (r == -EWOULDBLOCK) { 3070 cell_defer(cache, cell, true); 3071 return DM_MAPIO_SUBMITTED; 3072 3073 } else if (r) { 3074 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 3075 cache_device_name(cache), r); 3076 cell_defer(cache, cell, false); 3077 bio_io_error(bio); 3078 return DM_MAPIO_SUBMITTED; 3079 } 3080 3081 r = DM_MAPIO_REMAPPED; 3082 switch (lookup_result.op) { 3083 case POLICY_HIT: 3084 if (passthrough_mode(&cache->features)) { 3085 if (bio_data_dir(bio) == WRITE) { 3086 /* 3087 * We need to invalidate this block, so 3088 * defer for the worker thread. 3089 */ 3090 cell_defer(cache, cell, true); 3091 r = DM_MAPIO_SUBMITTED; 3092 3093 } else { 3094 inc_miss_counter(cache, bio); 3095 remap_to_origin_clear_discard(cache, bio, block); 3096 accounted_begin(cache, bio); 3097 inc_ds(cache, bio, cell); 3098 // FIXME: we want to remap hits or misses straight 3099 // away rather than passing over to the worker. 3100 cell_defer(cache, cell, false); 3101 } 3102 3103 } else { 3104 inc_hit_counter(cache, bio); 3105 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 3106 !is_dirty(cache, lookup_result.cblock)) { 3107 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 3108 accounted_begin(cache, bio); 3109 inc_ds(cache, bio, cell); 3110 cell_defer(cache, cell, false); 3111 3112 } else 3113 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 3114 } 3115 break; 3116 3117 case POLICY_MISS: 3118 inc_miss_counter(cache, bio); 3119 if (pb->req_nr != 0) { 3120 /* 3121 * This is a duplicate writethrough io that is no 3122 * longer needed because the block has been demoted. 
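 * Completing it here is sufficient; the primary bio (req_nr 0) is remapped to the origin below.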
3123 */ 3124 bio_endio(bio); 3125 // FIXME: remap everything as a miss 3126 cell_defer(cache, cell, false); 3127 r = DM_MAPIO_SUBMITTED; 3128 3129 } else 3130 remap_cell_to_origin_clear_discard(cache, cell, block, false); 3131 break; 3132 3133 default: 3134 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 3135 cache_device_name(cache), __func__, 3136 (unsigned) lookup_result.op); 3137 cell_defer(cache, cell, false); 3138 bio_io_error(bio); 3139 r = DM_MAPIO_SUBMITTED; 3140 } 3141 3142 return r; 3143 } 3144 3145 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 3146 { 3147 struct cache *cache = ti->private; 3148 unsigned long flags; 3149 size_t pb_data_size = get_per_bio_data_size(cache); 3150 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 3151 3152 if (pb->tick) { 3153 policy_tick(cache->policy, false); 3154 3155 spin_lock_irqsave(&cache->lock, flags); 3156 cache->need_tick_bio = true; 3157 spin_unlock_irqrestore(&cache->lock, flags); 3158 } 3159 3160 check_for_quiesced_migrations(cache, pb); 3161 accounted_complete(cache, bio); 3162 3163 return 0; 3164 } 3165 3166 static int write_dirty_bitset(struct cache *cache) 3167 { 3168 unsigned i, r; 3169 3170 if (get_cache_mode(cache) >= CM_READ_ONLY) 3171 return -EINVAL; 3172 3173 for (i = 0; i < from_cblock(cache->cache_size); i++) { 3174 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 3175 is_dirty(cache, to_cblock(i))); 3176 if (r) { 3177 metadata_operation_failed(cache, "dm_cache_set_dirty", r); 3178 return r; 3179 } 3180 } 3181 3182 return 0; 3183 } 3184 3185 static int write_discard_bitset(struct cache *cache) 3186 { 3187 unsigned i, r; 3188 3189 if (get_cache_mode(cache) >= CM_READ_ONLY) 3190 return -EINVAL; 3191 3192 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 3193 cache->discard_nr_blocks); 3194 if (r) { 3195 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 3196 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 3197 return r; 3198 } 3199 3200 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 3201 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 3202 is_discarded(cache, to_dblock(i))); 3203 if (r) { 3204 metadata_operation_failed(cache, "dm_cache_set_discard", r); 3205 return r; 3206 } 3207 } 3208 3209 return 0; 3210 } 3211 3212 static int write_hints(struct cache *cache) 3213 { 3214 int r; 3215 3216 if (get_cache_mode(cache) >= CM_READ_ONLY) 3217 return -EINVAL; 3218 3219 r = dm_cache_write_hints(cache->cmd, cache->policy); 3220 if (r) { 3221 metadata_operation_failed(cache, "dm_cache_write_hints", r); 3222 return r; 3223 } 3224 3225 return 0; 3226 } 3227 3228 /* 3229 * returns true on success 3230 */ 3231 static bool sync_metadata(struct cache *cache) 3232 { 3233 int r1, r2, r3, r4; 3234 3235 r1 = write_dirty_bitset(cache); 3236 if (r1) 3237 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 3238 3239 r2 = write_discard_bitset(cache); 3240 if (r2) 3241 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 3242 3243 save_stats(cache); 3244 3245 r3 = write_hints(cache); 3246 if (r3) 3247 DMERR("%s: could not write hints", cache_device_name(cache)); 3248 3249 /* 3250 * If writing the above metadata failed, we still commit, but don't 3251 * set the clean shutdown flag. This will effectively force every 3252 * dirty bit to be set on reload. 
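 * (Pessimistic but safe: blocks marked dirty are simply written back again.)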
3253 */ 3254 r4 = commit(cache, !r1 && !r2 && !r3); 3255 if (r4) 3256 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 3257 3258 return !r1 && !r2 && !r3 && !r4; 3259 } 3260 3261 static void cache_postsuspend(struct dm_target *ti) 3262 { 3263 struct cache *cache = ti->private; 3264 3265 start_quiescing(cache); 3266 wait_for_migrations(cache); 3267 stop_worker(cache); 3268 requeue_deferred_bios(cache); 3269 requeue_deferred_cells(cache); 3270 stop_quiescing(cache); 3271 3272 if (get_cache_mode(cache) == CM_WRITE) 3273 (void) sync_metadata(cache); 3274 } 3275 3276 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 3277 bool dirty, uint32_t hint, bool hint_valid) 3278 { 3279 int r; 3280 struct cache *cache = context; 3281 3282 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3283 if (r) 3284 return r; 3285 3286 if (dirty) 3287 set_dirty(cache, oblock, cblock); 3288 else 3289 clear_dirty(cache, oblock, cblock); 3290 3291 return 0; 3292 } 3293 3294 /* 3295 * The discard block size in the on disk metadata is not 3296 * necessarily the same as we're currently using. So we have to 3297 * be careful to only set the discarded attribute if we know it 3298 * covers a complete block of the new size. 3299 */ 3300 struct discard_load_info { 3301 struct cache *cache; 3302 3303 /* 3304 * These blocks are sized using the on disk dblock size, rather 3305 * than the current one. 3306 */ 3307 dm_block_t block_size; 3308 dm_block_t discard_begin, discard_end; 3309 }; 3310 3311 static void discard_load_info_init(struct cache *cache, 3312 struct discard_load_info *li) 3313 { 3314 li->cache = cache; 3315 li->discard_begin = li->discard_end = 0; 3316 } 3317 3318 static void set_discard_range(struct discard_load_info *li) 3319 { 3320 sector_t b, e; 3321 3322 if (li->discard_begin == li->discard_end) 3323 return; 3324 3325 /* 3326 * Convert to sectors. 3327 */ 3328 b = li->discard_begin * li->block_size; 3329 e = li->discard_end * li->block_size; 3330 3331 /* 3332 * Then convert back to the current dblock size. 3333 */ 3334 b = dm_sector_div_up(b, li->cache->discard_block_size); 3335 sector_div(e, li->cache->discard_block_size); 3336 3337 /* 3338 * The origin may have shrunk, so we need to check we're still in 3339 * bounds. 3340 */ 3341 if (e > from_dblock(li->cache->discard_nr_blocks)) 3342 e = from_dblock(li->cache->discard_nr_blocks); 3343 3344 for (; b < e; b++) 3345 set_discard(li->cache, to_dblock(b)); 3346 } 3347 3348 static int load_discard(void *context, sector_t discard_block_size, 3349 dm_dblock_t dblock, bool discard) 3350 { 3351 struct discard_load_info *li = context; 3352 3353 li->block_size = discard_block_size; 3354 3355 if (discard) { 3356 if (from_dblock(dblock) == li->discard_end) 3357 /* 3358 * We're already in a discard range, just extend it. 3359 */ 3360 li->discard_end = li->discard_end + 1ULL; 3361 3362 else { 3363 /* 3364 * Emit the old range and start a new one.
3365 */ 3366 set_discard_range(li); 3367 li->discard_begin = from_dblock(dblock); 3368 li->discard_end = li->discard_begin + 1ULL; 3369 } 3370 } else { 3371 set_discard_range(li); 3372 li->discard_begin = li->discard_end = 0; 3373 } 3374 3375 return 0; 3376 } 3377 3378 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3379 { 3380 sector_t size = get_dev_size(cache->cache_dev); 3381 (void) sector_div(size, cache->sectors_per_block); 3382 return to_cblock(size); 3383 } 3384 3385 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3386 { 3387 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3388 return true; 3389 3390 /* 3391 * We can't drop a dirty block when shrinking the cache. 3392 */ 3393 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3394 new_size = to_cblock(from_cblock(new_size) + 1); 3395 if (is_dirty(cache, new_size)) { 3396 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3397 cache_device_name(cache), 3398 (unsigned long long) from_cblock(new_size)); 3399 return false; 3400 } 3401 } 3402 3403 return true; 3404 } 3405 3406 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3407 { 3408 int r; 3409 3410 r = dm_cache_resize(cache->cmd, new_size); 3411 if (r) { 3412 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3413 metadata_operation_failed(cache, "dm_cache_resize", r); 3414 return r; 3415 } 3416 3417 set_cache_size(cache, new_size); 3418 3419 return 0; 3420 } 3421 3422 static int cache_preresume(struct dm_target *ti) 3423 { 3424 int r = 0; 3425 struct cache *cache = ti->private; 3426 dm_cblock_t csize = get_cache_dev_size(cache); 3427 3428 /* 3429 * Check to see if the cache has resized. 3430 */ 3431 if (!cache->sized) { 3432 r = resize_cache_dev(cache, csize); 3433 if (r) 3434 return r; 3435 3436 cache->sized = true; 3437 3438 } else if (csize != cache->cache_size) { 3439 if (!can_resize(cache, csize)) 3440 return -EINVAL; 3441 3442 r = resize_cache_dev(cache, csize); 3443 if (r) 3444 return r; 3445 } 3446 3447 if (!cache->loaded_mappings) { 3448 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3449 load_mapping, cache); 3450 if (r) { 3451 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3452 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3453 return r; 3454 } 3455 3456 cache->loaded_mappings = true; 3457 } 3458 3459 if (!cache->loaded_discards) { 3460 struct discard_load_info li; 3461 3462 /* 3463 * The discard bitset could have been resized, or the 3464 * discard block size changed. To be safe we start by 3465 * setting every dblock to not discarded. 
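 * load_discard() below then re-marks only those ranges that still cover complete blocks of the current discard block size.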
3466 */ 3467 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3468 3469 discard_load_info_init(cache, &li); 3470 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3471 if (r) { 3472 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3473 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3474 return r; 3475 } 3476 set_discard_range(&li); 3477 3478 cache->loaded_discards = true; 3479 } 3480 3481 return r; 3482 } 3483 3484 static void cache_resume(struct dm_target *ti) 3485 { 3486 struct cache *cache = ti->private; 3487 3488 cache->need_tick_bio = true; 3489 do_waker(&cache->waker.work); 3490 } 3491 3492 /* 3493 * Status format: 3494 * 3495 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3496 * <cache block size> <#used cache blocks>/<#total cache blocks> 3497 * <#read hits> <#read misses> <#write hits> <#write misses> 3498 * <#demotions> <#promotions> <#dirty> 3499 * <#features> <features>* 3500 * <#core args> <core args> 3501 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3502 */ 3503 static void cache_status(struct dm_target *ti, status_type_t type, 3504 unsigned status_flags, char *result, unsigned maxlen) 3505 { 3506 int r = 0; 3507 unsigned i; 3508 ssize_t sz = 0; 3509 dm_block_t nr_free_blocks_metadata = 0; 3510 dm_block_t nr_blocks_metadata = 0; 3511 char buf[BDEVNAME_SIZE]; 3512 struct cache *cache = ti->private; 3513 dm_cblock_t residency; 3514 bool needs_check; 3515 3516 switch (type) { 3517 case STATUSTYPE_INFO: 3518 if (get_cache_mode(cache) == CM_FAIL) { 3519 DMEMIT("Fail"); 3520 break; 3521 } 3522 3523 /* Commit to ensure statistics aren't out-of-date */ 3524 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3525 (void) commit(cache, false); 3526 3527 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3528 if (r) { 3529 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3530 cache_device_name(cache), r); 3531 goto err; 3532 } 3533 3534 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3535 if (r) { 3536 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3537 cache_device_name(cache), r); 3538 goto err; 3539 } 3540 3541 residency = policy_residency(cache->policy); 3542 3543 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3544 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3545 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3546 (unsigned long long)nr_blocks_metadata, 3547 cache->sectors_per_block, 3548 (unsigned long long) from_cblock(residency), 3549 (unsigned long long) from_cblock(cache->cache_size), 3550 (unsigned) atomic_read(&cache->stats.read_hit), 3551 (unsigned) atomic_read(&cache->stats.read_miss), 3552 (unsigned) atomic_read(&cache->stats.write_hit), 3553 (unsigned) atomic_read(&cache->stats.write_miss), 3554 (unsigned) atomic_read(&cache->stats.demotion), 3555 (unsigned) atomic_read(&cache->stats.promotion), 3556 (unsigned long) atomic_read(&cache->nr_dirty)); 3557 3558 if (writethrough_mode(&cache->features)) 3559 DMEMIT("1 writethrough "); 3560 3561 else if (passthrough_mode(&cache->features)) 3562 DMEMIT("1 passthrough "); 3563 3564 else if (writeback_mode(&cache->features)) 3565 DMEMIT("1 writeback "); 3566 3567 else { 3568 DMERR("%s: internal error: unknown io mode: %d", 3569 cache_device_name(cache), (int) cache->features.io_mode); 3570 goto err; 3571 } 3572 3573 DMEMIT("2 migration_threshold %llu ", (unsigned long long) 
cache->migration_threshold); 3574 3575 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3576 if (sz < maxlen) { 3577 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3578 if (r) 3579 DMERR("%s: policy_emit_config_values returned %d", 3580 cache_device_name(cache), r); 3581 } 3582 3583 if (get_cache_mode(cache) == CM_READ_ONLY) 3584 DMEMIT("ro "); 3585 else 3586 DMEMIT("rw "); 3587 3588 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3589 3590 if (r || needs_check) 3591 DMEMIT("needs_check "); 3592 else 3593 DMEMIT("- "); 3594 3595 break; 3596 3597 case STATUSTYPE_TABLE: 3598 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3599 DMEMIT("%s ", buf); 3600 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3601 DMEMIT("%s ", buf); 3602 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3603 DMEMIT("%s", buf); 3604 3605 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3606 DMEMIT(" %s", cache->ctr_args[i]); 3607 if (cache->nr_ctr_args) 3608 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3609 } 3610 3611 return; 3612 3613 err: 3614 DMEMIT("Error"); 3615 } 3616 3617 /* 3618 * A cache block range can take two forms: 3619 * 3620 * i) A single cblock, eg. '3456' 3621 * ii) A begin and end cblock with a dash between, eg. 123-234 3622 */ 3623 static int parse_cblock_range(struct cache *cache, const char *str, 3624 struct cblock_range *result) 3625 { 3626 char dummy; 3627 uint64_t b, e; 3628 int r; 3629 3630 /* 3631 * Try and parse form (ii) first. 3632 */ 3633 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3634 if (r < 0) 3635 return r; 3636 3637 if (r == 2) { 3638 result->begin = to_cblock(b); 3639 result->end = to_cblock(e); 3640 return 0; 3641 } 3642 3643 /* 3644 * That didn't work, try form (i). 3645 */ 3646 r = sscanf(str, "%llu%c", &b, &dummy); 3647 if (r < 0) 3648 return r; 3649 3650 if (r == 1) { 3651 result->begin = to_cblock(b); 3652 result->end = to_cblock(from_cblock(result->begin) + 1u); 3653 return 0; 3654 } 3655 3656 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3657 return -EINVAL; 3658 } 3659 3660 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3661 { 3662 uint64_t b = from_cblock(range->begin); 3663 uint64_t e = from_cblock(range->end); 3664 uint64_t n = from_cblock(cache->cache_size); 3665 3666 if (b >= n) { 3667 DMERR("%s: begin cblock out of range: %llu >= %llu", 3668 cache_device_name(cache), b, n); 3669 return -EINVAL; 3670 } 3671 3672 if (e > n) { 3673 DMERR("%s: end cblock out of range: %llu > %llu", 3674 cache_device_name(cache), e, n); 3675 return -EINVAL; 3676 } 3677 3678 if (b >= e) { 3679 DMERR("%s: invalid cblock range: %llu >= %llu", 3680 cache_device_name(cache), b, e); 3681 return -EINVAL; 3682 } 3683 3684 return 0; 3685 } 3686 3687 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3688 { 3689 struct invalidation_request req; 3690 3691 INIT_LIST_HEAD(&req.list); 3692 req.cblocks = range; 3693 atomic_set(&req.complete, 0); 3694 req.err = 0; 3695 init_waitqueue_head(&req.result_wait); 3696 3697 spin_lock(&cache->invalidation_lock); 3698 list_add(&req.list, &cache->invalidation_requests); 3699 spin_unlock(&cache->invalidation_lock); 3700 wake_worker(cache); 3701 3702 wait_event(req.result_wait, atomic_read(&req.complete)); 3703 return req.err; 3704 } 3705 3706 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3707 const char **cblock_ranges) 3708 { 3709 int r = 0; 3710 unsigned i; 3711 struct
cblock_range range; 3712 3713 if (!passthrough_mode(&cache->features)) { 3714 DMERR("%s: cache has to be in passthrough mode for invalidation", 3715 cache_device_name(cache)); 3716 return -EPERM; 3717 } 3718 3719 for (i = 0; i < count; i++) { 3720 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3721 if (r) 3722 break; 3723 3724 r = validate_cblock_range(cache, &range); 3725 if (r) 3726 break; 3727 3728 /* 3729 * Pass begin and end origin blocks to the worker and wake it. 3730 */ 3731 r = request_invalidation(cache, &range); 3732 if (r) 3733 break; 3734 } 3735 3736 return r; 3737 } 3738 3739 /* 3740 * Supports 3741 * "<key> <value>" 3742 * and 3743 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3744 * 3745 * The key migration_threshold is supported by the cache target core. 3746 */ 3747 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3748 { 3749 struct cache *cache = ti->private; 3750 3751 if (!argc) 3752 return -EINVAL; 3753 3754 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3755 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3756 cache_device_name(cache)); 3757 return -EOPNOTSUPP; 3758 } 3759 3760 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3761 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3762 3763 if (argc != 2) 3764 return -EINVAL; 3765 3766 return set_config_value(cache, argv[0], argv[1]); 3767 } 3768 3769 static int cache_iterate_devices(struct dm_target *ti, 3770 iterate_devices_callout_fn fn, void *data) 3771 { 3772 int r = 0; 3773 struct cache *cache = ti->private; 3774 3775 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3776 if (!r) 3777 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3778 3779 return r; 3780 } 3781 3782 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3783 { 3784 /* 3785 * FIXME: these limits may be incompatible with the cache device 3786 */ 3787 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3788 cache->origin_sectors); 3789 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3790 } 3791 3792 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3793 { 3794 struct cache *cache = ti->private; 3795 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3796 3797 /* 3798 * If the system-determined stacked limits are compatible with the 3799 * cache's blocksize (io_opt is a factor) do not override them. 
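 * Otherwise expose the cache block size as both io_min and io_opt.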
3800 */ 3801 if (io_opt_sectors < cache->sectors_per_block || 3802 do_div(io_opt_sectors, cache->sectors_per_block)) { 3803 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3804 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3805 } 3806 set_discard_limits(cache, limits); 3807 } 3808 3809 /*----------------------------------------------------------------*/ 3810 3811 static struct target_type cache_target = { 3812 .name = "cache", 3813 .version = {1, 9, 0}, 3814 .module = THIS_MODULE, 3815 .ctr = cache_ctr, 3816 .dtr = cache_dtr, 3817 .map = cache_map, 3818 .end_io = cache_end_io, 3819 .postsuspend = cache_postsuspend, 3820 .preresume = cache_preresume, 3821 .resume = cache_resume, 3822 .status = cache_status, 3823 .message = cache_message, 3824 .iterate_devices = cache_iterate_devices, 3825 .io_hints = cache_io_hints, 3826 }; 3827 3828 static int __init dm_cache_init(void) 3829 { 3830 int r; 3831 3832 r = dm_register_target(&cache_target); 3833 if (r) { 3834 DMERR("cache target registration failed: %d", r); 3835 return r; 3836 } 3837 3838 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3839 if (!migration_cache) { 3840 dm_unregister_target(&cache_target); 3841 return -ENOMEM; 3842 } 3843 3844 return 0; 3845 } 3846 3847 static void __exit dm_cache_exit(void) 3848 { 3849 dm_unregister_target(&cache_target); 3850 kmem_cache_destroy(migration_cache); 3851 } 3852 3853 module_init(dm_cache_init); 3854 module_exit(dm_cache_exit); 3855 3856 MODULE_DESCRIPTION(DM_NAME " cache target"); 3857 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3858 MODULE_LICENSE("GPL"); 3859