/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

#define IOT_RESOLUTION 4

struct io_tracker {
	spinlock_t lock;

	/*
	 * Sectors of in-flight IO.
	 */
	sector_t in_flight;

	/*
	 * The time, in jiffies, when this device became idle (if it is
	 * indeed idle).
	 */
	unsigned long idle_time;
	unsigned long last_update_time;
};

static void iot_init(struct io_tracker *iot)
{
	spin_lock_init(&iot->lock);
	iot->in_flight = 0ul;
	iot->idle_time = 0ul;
	iot->last_update_time = jiffies;
}

static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	if (iot->in_flight)
		return false;

	return time_after(jiffies, iot->idle_time + jifs);
}

static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	bool r;
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	r = __iot_idle_for(iot, jifs);
	spin_unlock_irqrestore(&iot->lock, flags);

	return r;
}

static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	iot->in_flight += len;
	spin_unlock_irqrestore(&iot->lock, flags);
}

static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
	iot->in_flight -= len;
	if (!iot->in_flight)
		iot->idle_time = jiffies;
}

static void iot_io_end(struct io_tracker *iot, sector_t len)
{
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	__iot_io_end(iot, len);
	spin_unlock_irqrestore(&iot->lock, flags);
}
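
/*
 * Typical use (see accounted_begin()/accounted_complete() below): the
 * tracker is fed the sector count of each accountable origin bio,
 *
 *	iot_io_begin(&cache->origin_tracker, bio_sectors(bio));
 *	...
 *	iot_io_end(&cache->origin_tracker, pb->len);
 *
 * and the writeback path then asks iot_idle_for(&cache->origin_tracker, HZ)
 * to decide whether the origin has been idle for at least a second.
 */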

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function. We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only. These blocks are marked
	 * dirty. If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin. Blocks are never
	 * dirty. Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots). Reads and writes always go to the
	 * origin. If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

/*
 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
 * the one-past-the-end value.
 */
struct cblock_range {
	dm_cblock_t begin;
	dm_cblock_t end;
};

struct invalidation_request {
	struct list_head list;
	struct cblock_range *cblocks;

	atomic_t complete;
	int err;

	wait_queue_head_t result_wait;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices. Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices. Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_writethrough_bios;
	struct list_head quiesced_migrations;
	struct list_head completed_migrations;
	struct list_head need_commit_migrations;
	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	wait_queue_head_t quiescing_wait;
	atomic_t quiescing;
	atomic_t quiescing_ack;

	/*
	 * cache_size entries, dirty if set
	 */
	atomic_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct worker;

	struct delayed_work waker;
	unsigned long last_commit_jiffies;

	struct dm_bio_prison *prison;
	struct dm_deferred_set *all_io_ds;

	mempool_t *migration_pool;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool invalidate:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	struct cache_stats stats;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	struct io_tracker origin_tracker;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_deferred_entry *all_io_entry;
	struct dm_hook_info hook_info;
	sector_t len;

	/*
	 * writethrough fields. These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offset of the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	struct dm_bio_details bio_details;
};

struct dm_cache_migration {
	struct list_head list;
	struct cache *cache;

	unsigned long start_jiffies;
	dm_oblock_t old_oblock;
	dm_oblock_t new_oblock;
	dm_cblock_t cblock;

	bool err:1;
	bool discard:1;
	bool writeback:1;
	bool demote:1;
	bool promote:1;
	bool requeue_holder:1;
	bool invalidate:1;

	struct dm_bio_prison_cell *old_ocell;
	struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations. We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 */
struct prealloc {
	struct dm_cache_migration *mg;
	struct dm_bio_prison_cell *cell1;
	struct dm_bio_prison_cell *cell2;
};

static enum cache_metadata_mode get_cache_mode(struct cache *cache);

static void wake_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
{
	/* FIXME: change to use a local slab. */
	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	dm_bio_prison_free_cell(cache->prison, cell);
}

static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
	struct dm_cache_migration *mg;

	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
	if (mg) {
		mg->cache = cache;
		atomic_inc(&mg->cache->nr_allocated_migrations);
	}

	return mg;
}

static void free_migration(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
		wake_up(&cache->migration_wait);

	mempool_free(mg, cache->migration_pool);
}

static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
	if (!p->mg) {
		p->mg = alloc_migration(cache);
		if (!p->mg)
			return -ENOMEM;
	}

	if (!p->cell1) {
		p->cell1 = alloc_prison_cell(cache);
		if (!p->cell1)
			return -ENOMEM;
	}

	if (!p->cell2) {
		p->cell2 = alloc_prison_cell(cache);
		if (!p->cell2)
			return -ENOMEM;
	}

	return 0;
}

static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
{
	if (p->cell2)
		free_prison_cell(cache, p->cell2);

	if (p->cell1)
		free_prison_cell(cache, p->cell1);

	if (p->mg)
		free_migration(p->mg);
}

static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
{
	struct dm_cache_migration *mg = p->mg;

	BUG_ON(!mg);
	p->mg = NULL;

	return mg;
}

/*
 * You must have a cell within the prealloc struct to return. If not this
 * function will BUG() rather than returning NULL.
 */
static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
{
	struct dm_bio_prison_cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;

	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else
		BUG();

	return r;
}

/*
 * You can't have more than two cells in a prealloc struct. BUG() will be
 * called if you try and overfill.
 */
static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
{
	if (!p->cell2)
		p->cell2 = cell;

	else if (!p->cell1)
		p->cell1 = cell;

	else
		BUG();
}

/*----------------------------------------------------------------*/

static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block_begin = from_oblock(begin);
	key->block_end = from_oblock(end);
}

/*
 * The caller hands in a preallocated cell, and a free function for it.
 * The cell will be freed if there's an error, or if it wasn't used because
 * a cell with that key already exists.
 */
typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);

static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
			    cell_free_fn free_fn, void *free_context,
			    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;

	build_key(oblock_begin, oblock_end, &key);
	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
	if (r)
		free_fn(free_context, cell_prealloc);

	return r;
}

static int bio_detain(struct cache *cache, dm_oblock_t oblock,
		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
		      cell_free_fn free_fn, void *free_context,
		      struct dm_bio_prison_cell **cell_result)
{
	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
	return bio_detain_range(cache, oblock, end, bio,
				cell_prealloc, free_fn, free_context, cell_result);
}

static int get_cell(struct cache *cache,
		    dm_oblock_t oblock,
		    struct prealloc *structs,
		    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;
	struct dm_bio_prison_cell *cell_prealloc;

	cell_prealloc = prealloc_get_cell(structs);

	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
	if (r)
		prealloc_put_cell(structs, cell_prealloc);

	return r;
}

/*----------------------------------------------------------------*/

static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
	return test_bit(from_cblock(b), cache->dirty_bitset);
}

static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
		atomic_inc(&cache->nr_dirty);
		policy_set_dirty(cache->policy, oblock);
	}
}

static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
		policy_clear_dirty(cache->policy, oblock);
		if (atomic_dec_return(&cache->nr_dirty) == 0)
			dm_table_event(cache->ti->table);
	}
}

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct cache *cache)
{
	return cache->sectors_per_block_shift >= 0;
}

/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
__always_inline
#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
	do_div(b, n);

	return b;
}

static dm_block_t oblocks_per_dblock(struct cache *cache)
{
	dm_block_t oblocks = cache->discard_block_size;

	if (block_size_is_power_of_two(cache))
		oblocks >>= cache->sectors_per_block_shift;
	else
		oblocks = block_div(oblocks, cache->sectors_per_block);

	return oblocks;
}

static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
	return to_dblock(block_div(from_oblock(oblock),
				   oblocks_per_dblock(cache)));
}

static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
{
	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
}

static void set_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
	atomic_inc(&cache->stats.discard_count);

	spin_lock_irqsave(&cache->lock, flags);
	set_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void clear_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	clear_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static bool writethrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITETHROUGH;
}

static bool writeback_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITEBACK;
}

static bool passthrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_PASSTHROUGH;
}

static size_t get_per_bio_data_size(struct cache *cache)
{
	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}
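
/*
 * In writeback and passthrough mode only the fields before 'cache' are
 * allocated per bio (PB_DATA_SIZE_WB is offsetof(struct per_bio_data, cache)),
 * so pb->cache, pb->cblock and pb->bio_details may only be touched on the
 * writethrough path, which allocates the full structure (PB_DATA_SIZE_WT).
 */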

static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = get_per_bio_data(bio, data_size);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->all_io_entry = NULL;
	pb->len = 0;

	return pb;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio->bi_bdev = cache->origin_dev->bdev;
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_iter.bi_sector;
	sector_t block = from_cblock(cblock);

	bio->bi_bdev = cache->cache_dev->bdev;
	if (!block_size_is_power_of_two(cache))
		bio->bi_iter.bi_sector =
			(block * cache->sectors_per_block) +
			sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_iter.bi_sector =
			(block << cache->sectors_per_block_shift) |
			(bi_sector & (cache->sectors_per_block - 1));
}

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	spin_lock_irqsave(&cache->lock, flags);
	if (cache->need_tick_bio &&
	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}
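
/*
 * For example, with sectors_per_block = 128 (sectors_per_block_shift = 7) a
 * bio starting at sector 1000 is in oblock 1000 >> 7 = 7, at offset
 * 1000 & 127 = 104 within the block; remap_to_cache() would send it to
 * sector (cblock << 7) | 104 on the cache device. Non-power-of-two block
 * sizes take the sector_div() path instead.
 */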

static int bio_triggers_commit(struct cache *cache, struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}

/*
 * You must increment the deferred set whilst the prison cell is held. To
 * encourage this, we ask for 'cell' to be passed in.
 */
static void inc_ds(struct cache *cache, struct bio *bio,
		   struct dm_bio_prison_cell *cell)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(!cell);
	BUG_ON(pb->all_io_entry);

	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}

static bool accountable_bio(struct cache *cache, struct bio *bio)
{
	return ((bio->bi_bdev == cache->origin_dev->bdev) &&
		!(bio->bi_rw & REQ_DISCARD));
}

static void accounted_begin(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (accountable_bio(cache, bio)) {
		pb->len = bio_sectors(bio);
		iot_io_begin(&cache->origin_tracker, pb->len);
	}
}

static void accounted_complete(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	iot_io_end(&cache->origin_tracker, pb->len);
}

static void accounted_request(struct cache *cache, struct bio *bio)
{
	accounted_begin(cache, bio);
	generic_make_request(bio);
}

static void issue(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	if (!bio_triggers_commit(cache, bio)) {
		accounted_request(cache, bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in do_worker().
	 */
	spin_lock_irqsave(&cache->lock, flags);
	cache->commit_requested = true;
	bio_list_add(&cache->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
{
	inc_ds(cache, bio, cell);
	issue(cache, bio);
}

static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_writethrough_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void writethrough_endio(struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	dm_unhook_bio(&pb->hook_info, bio);

	if (bio->bi_error) {
		bio_endio(bio);
		return;
	}

	dm_bio_restore(&pb->bio_details, bio);
	remap_to_cache(pb->cache, bio, pb->cblock);

	/*
	 * We can't issue this bio directly, since we're in interrupt
	 * context. So it gets put on a bio list for processing by the
	 * worker thread.
	 */
	defer_writethrough_bio(pb->cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices. In future we'd like to clone the
 * bio and send them in parallel, but for now we're doing them in
 * series as this is easier.
 */
static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
				       dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	pb->cache = cache;
	pb->cblock = cblock;
	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
	dm_bio_record(&pb->bio_details, bio);

	remap_to_origin_clear_discard(pb->cache, bio, oblock);
}
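
/*
 * The mechanics: the bio is hooked with writethrough_endio() and sent to
 * the origin first; when that completes, the endio handler restores the
 * bio, remaps it to the cache device and defers it to the worker thread,
 * which re-submits it, so the write reaches origin then cache in series.
 * Writethrough bios always carry the full per_bio_data (PB_DATA_SIZE_WT)
 * so the endio handler can reach pb->cache and pb->cblock.
 */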

/*----------------------------------------------------------------
 * Failure modes
 *--------------------------------------------------------------*/
static enum cache_metadata_mode get_cache_mode(struct cache *cache)
{
	return cache->features.mode;
}

static const char *cache_device_name(struct cache *cache)
{
	return dm_device_name(dm_table_get_md(cache->ti->table));
}

static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
{
	const char *descs[] = {
		"write",
		"read-only",
		"fail"
	};

	dm_table_event(cache->ti->table);
	DMINFO("%s: switching cache to %s mode",
	       cache_device_name(cache), descs[(int)mode]);
}

static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
{
	bool needs_check;
	enum cache_metadata_mode old_mode = get_cache_mode(cache);

	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
		DMERR("unable to read needs_check flag, setting failure mode");
		new_mode = CM_FAIL;
	}

	if (new_mode == CM_WRITE && needs_check) {
		DMERR("%s: unable to switch cache to write mode until repaired.",
		      cache_device_name(cache));
		if (old_mode != new_mode)
			new_mode = old_mode;
		else
			new_mode = CM_READ_ONLY;
	}

	/* Never move out of fail mode */
	if (old_mode == CM_FAIL)
		new_mode = CM_FAIL;

	switch (new_mode) {
	case CM_FAIL:
	case CM_READ_ONLY:
		dm_cache_metadata_set_read_only(cache->cmd);
		break;

	case CM_WRITE:
		dm_cache_metadata_set_read_write(cache->cmd);
		break;
	}

	cache->features.mode = new_mode;

	if (new_mode != old_mode)
		notify_mode_switch(cache, new_mode);
}

static void abort_transaction(struct cache *cache)
{
	const char *dev_name = cache_device_name(cache);

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}

	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
	if (dm_cache_metadata_abort(cache->cmd)) {
		DMERR("%s: failed to abort metadata transaction", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}
}

static void metadata_operation_failed(struct cache *cache, const char *op, int r)
{
	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
		    cache_device_name(cache), op, r);
	abort_transaction(cache);
	set_cache_mode(cache, CM_READ_ONLY);
}
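
/*
 * Mode changes only ever become more restrictive: a failed metadata
 * operation aborts the transaction and drops the cache from write to
 * read-only, a failure to abort (or to record 'needs_check') drops it to
 * fail mode, and set_cache_mode() never leaves fail mode once entered.
 */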

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
static void inc_io_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_io_migrations);
}

static void dec_io_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_io_migrations);
}

static bool discard_or_flush(struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
}

static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	if (discard_or_flush(cell->holder)) {
		/*
		 * We have to handle these bios individually.
		 */
		dm_cell_release(cache->prison, cell, &cache->deferred_bios);
		free_prison_cell(cache, cell);
	} else
		list_add_tail(&cell->user_list, &cache->deferred_cells);
}

static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
{
	unsigned long flags;

	if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
		/*
		 * There was no prisoner to promote to holder, the
		 * cell has been released.
		 */
		free_prison_cell(cache, cell);
		return;
	}

	spin_lock_irqsave(&cache->lock, flags);
	__cell_defer(cache, cell);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
{
	dm_cell_error(cache->prison, cell, err);
	free_prison_cell(cache, cell);
}

static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
}

static void free_io_migration(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	dec_io_migrations(cache);
	free_migration(mg);
	wake_worker(cache);
}
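
/*
 * A migration normally moves through three lists: it sits on
 * quiesced_migrations until I/O already in flight against its blocks has
 * drained (quiesce_migration()), the copy is then issued
 * (issue_copy_or_discard()), copy_complete() moves it to
 * completed_migrations, and a successful metadata update in
 * migration_success_pre_commit() parks it on need_commit_migrations until
 * the next commit, after which migration_success_post_commit() finishes it.
 */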

static void migration_failure(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;
	const char *dev_name = cache_device_name(cache);

	if (mg->writeback) {
		DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
		set_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);

	} else if (mg->demote) {
		DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);

		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
		if (mg->promote)
			cell_defer(cache, mg->new_ocell, true);
	} else {
		DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
		policy_remove_mapping(cache->policy, mg->new_oblock);
		cell_defer(cache, mg->new_ocell, true);
	}

	free_io_migration(mg);
}

static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
	int r;
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		clear_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);
		free_io_migration(mg);
		return;

	} else if (mg->demote) {
		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
		if (r) {
			DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
			policy_force_mapping(cache->policy, mg->new_oblock,
					     mg->old_oblock);
			if (mg->promote)
				cell_defer(cache, mg->new_ocell, true);
			free_io_migration(mg);
			return;
		}
	} else {
		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
		if (r) {
			DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
			policy_remove_mapping(cache->policy, mg->new_oblock);
			free_io_migration(mg);
			return;
		}
	}

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->need_commit_migrations);
	cache->commit_requested = true;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void migration_success_post_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
			     cache_device_name(cache));
		return;

	} else if (mg->demote) {
		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);

		if (mg->promote) {
			mg->demote = false;

			spin_lock_irqsave(&cache->lock, flags);
			list_add_tail(&mg->list, &cache->quiesced_migrations);
			spin_unlock_irqrestore(&cache->lock, flags);

		} else {
			if (mg->invalidate)
				policy_remove_mapping(cache->policy, mg->old_oblock);
			free_io_migration(mg);
		}

	} else {
		if (mg->requeue_holder) {
			clear_dirty(cache, mg->new_oblock, mg->cblock);
			cell_defer(cache, mg->new_ocell, true);
		} else {
			/*
			 * The block was promoted via an overwrite, so it's dirty.
			 */
			set_dirty(cache, mg->new_oblock, mg->cblock);
			bio_endio(mg->new_ocell->holder);
			cell_defer(cache, mg->new_ocell, false);
		}
		free_io_migration(mg);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
	struct cache *cache = mg->cache;

	if (read_err || write_err)
		mg->err = true;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_copy(struct dm_cache_migration *mg)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;
	sector_t cblock = from_cblock(mg->cblock);

	o_region.bdev = cache->origin_dev->bdev;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = cblock * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (mg->writeback || mg->demote) {
		/* demote */
		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
	} else {
		/* promote */
		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
	}

	if (r < 0) {
		DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
		migration_failure(mg);
	}
}

static void overwrite_endio(struct bio *bio)
{
	struct dm_cache_migration *mg = bio->bi_private;
	struct cache *cache = mg->cache;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
	unsigned long flags;

	dm_unhook_bio(&pb->hook_info, bio);

	if (bio->bi_error)
		mg->err = true;

	mg->requeue_holder = false;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}
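
/*
 * If a write covers a complete cache block and we're in writeback mode,
 * issue_copy_or_discard() below skips the kcopyd copy and lets the bio
 * itself populate the new cache block: the bio is hooked with
 * overwrite_endio(), which clears requeue_holder, and
 * migration_success_post_commit() then marks the block dirty and completes
 * the bio directly.
 */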

static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(mg->cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);

	/*
	 * No need to inc_ds() here, since the cell will be held for the
	 * duration of the io.
	 */
	accounted_request(mg->cache, bio);
}

static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}

static void avoid_copy(struct dm_cache_migration *mg)
{
	atomic_inc(&mg->cache->stats.copies_avoided);
	migration_success_pre_commit(mg);
}

static void calc_discard_block_range(struct cache *cache, struct bio *bio,
				     dm_dblock_t *b, dm_dblock_t *e)
{
	sector_t sb = bio->bi_iter.bi_sector;
	sector_t se = bio_end_sector(bio);

	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));

	if (se - sb < cache->discard_block_size)
		*e = *b;
	else
		*e = to_dblock(block_div(se, cache->discard_block_size));
}
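
/*
 * The begin block is rounded up and the end block rounded down, so only
 * discard blocks completely covered by the bio get marked. For example,
 * with discard_block_size = 1024 sectors a discard of sectors [1536, 4096)
 * yields b = 2, e = 4: dblocks 2 and 3 (sectors 2048-4095) are set and the
 * partial head [1536, 2048) is left alone.
 */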

static void issue_discard(struct dm_cache_migration *mg)
{
	dm_dblock_t b, e;
	struct bio *bio = mg->new_ocell->holder;
	struct cache *cache = mg->cache;

	calc_discard_block_range(cache, bio, &b, &e);
	while (b != e) {
		set_discard(cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	bio_endio(bio);
	cell_defer(cache, mg->new_ocell, false);
	free_migration(mg);
	wake_worker(cache);
}

static void issue_copy_or_discard(struct dm_cache_migration *mg)
{
	bool avoid;
	struct cache *cache = mg->cache;

	if (mg->discard) {
		issue_discard(mg);
		return;
	}

	if (mg->writeback || mg->demote)
		avoid = !is_dirty(cache, mg->cblock) ||
			is_discarded_oblock(cache, mg->old_oblock);
	else {
		struct bio *bio = mg->new_ocell->holder;

		avoid = is_discarded_oblock(cache, mg->new_oblock);

		if (writeback_mode(&cache->features) &&
		    !avoid && bio_writes_complete_block(cache, bio)) {
			issue_overwrite(mg, bio);
			return;
		}
	}

	avoid ? avoid_copy(mg) : issue_copy(mg);
}

static void complete_migration(struct dm_cache_migration *mg)
{
	if (mg->err)
		migration_failure(mg);
	else
		migration_success_pre_commit(mg);
}

static void process_migrations(struct cache *cache, struct list_head *head,
			       void (*fn)(struct dm_cache_migration *))
{
	unsigned long flags;
	struct list_head list;
	struct dm_cache_migration *mg, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(head, &list);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(mg, tmp, &list, list)
		fn(mg);
}

static void __queue_quiesced_migration(struct dm_cache_migration *mg)
{
	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
}

static void queue_quiesced_migration(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	spin_lock_irqsave(&cache->lock, flags);
	__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
{
	unsigned long flags;
	struct dm_cache_migration *mg, *tmp;

	spin_lock_irqsave(&cache->lock, flags);
	list_for_each_entry_safe(mg, tmp, work, list)
		__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void check_for_quiesced_migrations(struct cache *cache,
					  struct per_bio_data *pb)
{
	struct list_head work;

	if (!pb->all_io_entry)
		return;

	INIT_LIST_HEAD(&work);
	dm_deferred_entry_dec(pb->all_io_entry, &work);

	if (!list_empty(&work))
		queue_quiesced_migrations(cache, &work);
}

static void quiesce_migration(struct dm_cache_migration *mg)
{
	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
		queue_quiesced_migration(mg);
}

static void promote(struct cache *cache, struct prealloc *structs,
		    dm_oblock_t oblock, dm_cblock_t cblock,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->new_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void writeback(struct cache *cache, struct prealloc *structs,
		      dm_oblock_t oblock, dm_cblock_t cblock,
		      struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = true;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				dm_cblock_t cblock,
				struct dm_bio_prison_cell *old_ocell,
				struct dm_bio_prison_cell *new_ocell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

/*
 * Invalidate a cache entry. No writeback occurs; any changes in the cache
 * block are thrown away.
 */
static void invalidate(struct cache *cache, struct prealloc *structs,
		       dm_oblock_t oblock, dm_cblock_t cblock,
		       struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = true;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void discard(struct cache *cache, struct prealloc *structs,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = true;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = false;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(bio->bi_iter.bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	/*
	 * REQ_FLUSH is not directed at any particular block so we don't
	 * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH
	 * by dm-core.
	 */
	issue(cache, bio);
}
1618 */ 1619 issue(cache, bio); 1620 } 1621 1622 static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1623 struct bio *bio) 1624 { 1625 int r; 1626 dm_dblock_t b, e; 1627 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1628 1629 calc_discard_block_range(cache, bio, &b, &e); 1630 if (b == e) { 1631 bio_endio(bio); 1632 return; 1633 } 1634 1635 cell_prealloc = prealloc_get_cell(structs); 1636 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1637 (cell_free_fn) prealloc_put_cell, 1638 structs, &new_ocell); 1639 if (r > 0) 1640 return; 1641 1642 discard(cache, structs, new_ocell); 1643 } 1644 1645 static bool spare_migration_bandwidth(struct cache *cache) 1646 { 1647 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1648 cache->sectors_per_block; 1649 return current_volume < cache->migration_threshold; 1650 } 1651 1652 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1653 { 1654 atomic_inc(bio_data_dir(bio) == READ ? 1655 &cache->stats.read_hit : &cache->stats.write_hit); 1656 } 1657 1658 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1659 { 1660 atomic_inc(bio_data_dir(bio) == READ ? 1661 &cache->stats.read_miss : &cache->stats.write_miss); 1662 } 1663 1664 /*----------------------------------------------------------------*/ 1665 1666 struct inc_detail { 1667 struct cache *cache; 1668 struct bio_list bios_for_issue; 1669 struct bio_list unhandled_bios; 1670 bool any_writes; 1671 }; 1672 1673 static void inc_fn(void *context, struct dm_bio_prison_cell *cell) 1674 { 1675 struct bio *bio; 1676 struct inc_detail *detail = context; 1677 struct cache *cache = detail->cache; 1678 1679 inc_ds(cache, cell->holder, cell); 1680 if (bio_data_dir(cell->holder) == WRITE) 1681 detail->any_writes = true; 1682 1683 while ((bio = bio_list_pop(&cell->bios))) { 1684 if (discard_or_flush(bio)) { 1685 bio_list_add(&detail->unhandled_bios, bio); 1686 continue; 1687 } 1688 1689 if (bio_data_dir(bio) == WRITE) 1690 detail->any_writes = true; 1691 1692 bio_list_add(&detail->bios_for_issue, bio); 1693 inc_ds(cache, bio, cell); 1694 } 1695 } 1696 1697 // FIXME: refactor these two 1698 static void remap_cell_to_origin_clear_discard(struct cache *cache, 1699 struct dm_bio_prison_cell *cell, 1700 dm_oblock_t oblock, bool issue_holder) 1701 { 1702 struct bio *bio; 1703 unsigned long flags; 1704 struct inc_detail detail; 1705 1706 detail.cache = cache; 1707 bio_list_init(&detail.bios_for_issue); 1708 bio_list_init(&detail.unhandled_bios); 1709 detail.any_writes = false; 1710 1711 spin_lock_irqsave(&cache->lock, flags); 1712 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1713 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1714 spin_unlock_irqrestore(&cache->lock, flags); 1715 1716 remap_to_origin(cache, cell->holder); 1717 if (issue_holder) 1718 issue(cache, cell->holder); 1719 else 1720 accounted_begin(cache, cell->holder); 1721 1722 if (detail.any_writes) 1723 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1724 1725 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1726 remap_to_origin(cache, bio); 1727 issue(cache, bio); 1728 } 1729 1730 free_prison_cell(cache, cell); 1731 } 1732 1733 static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1734 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1735 { 1736 struct bio *bio; 1737 unsigned long flags; 1738 struct inc_detail detail; 1739 1740 

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_miss : &cache->stats.write_miss);
}

/*----------------------------------------------------------------*/

struct inc_detail {
	struct cache *cache;
	struct bio_list bios_for_issue;
	struct bio_list unhandled_bios;
	bool any_writes;
};

static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct inc_detail *detail = context;
	struct cache *cache = detail->cache;

	inc_ds(cache, cell->holder, cell);
	if (bio_data_dir(cell->holder) == WRITE)
		detail->any_writes = true;

	while ((bio = bio_list_pop(&cell->bios))) {
		if (discard_or_flush(bio)) {
			bio_list_add(&detail->unhandled_bios, bio);
			continue;
		}

		if (bio_data_dir(bio) == WRITE)
			detail->any_writes = true;

		bio_list_add(&detail->bios_for_issue, bio);
		inc_ds(cache, bio, cell);
	}
}

// FIXME: refactor these two
static void remap_cell_to_origin_clear_discard(struct cache *cache,
					       struct dm_bio_prison_cell *cell,
					       dm_oblock_t oblock, bool issue_holder)
{
	struct bio *bio;
	unsigned long flags;
	struct inc_detail detail;

	detail.cache = cache;
	bio_list_init(&detail.bios_for_issue);
	bio_list_init(&detail.unhandled_bios);
	detail.any_writes = false;

	spin_lock_irqsave(&cache->lock, flags);
	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	remap_to_origin(cache, cell->holder);
	if (issue_holder)
		issue(cache, cell->holder);
	else
		accounted_begin(cache, cell->holder);

	if (detail.any_writes)
		clear_discard(cache, oblock_to_dblock(cache, oblock));

	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
		remap_to_origin(cache, bio);
		issue(cache, bio);
	}

	free_prison_cell(cache, cell);
}

static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
				      dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
{
	struct bio *bio;
	unsigned long flags;
	struct inc_detail detail;

	detail.cache = cache;
	bio_list_init(&detail.bios_for_issue);
	bio_list_init(&detail.unhandled_bios);
	detail.any_writes = false;

	spin_lock_irqsave(&cache->lock, flags);
	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	remap_to_cache(cache, cell->holder, cblock);
	if (issue_holder)
		issue(cache, cell->holder);
	else
		accounted_begin(cache, cell->holder);

	if (detail.any_writes) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}

	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
		remap_to_cache(cache, bio, cblock);
		issue(cache, bio);
	}

	free_prison_cell(cache, cell);
}

/*----------------------------------------------------------------*/

struct old_oblock_lock {
	struct policy_locker locker;
	struct cache *cache;
	struct prealloc *structs;
	struct dm_bio_prison_cell *cell;
};

static int null_locker(struct policy_locker *locker, dm_oblock_t b)
{
	/* This should never be called */
	BUG();
	return 0;
}

static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
{
	struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
	struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);

	return bio_detain(l->cache, b, NULL, cell_prealloc,
			  (cell_free_fn) prealloc_put_cell,
			  l->structs, &l->cell);
}
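
/*
 * The policy calls back through this locker when it wants to replace an
 * existing mapping: cell_locker() detains the old oblock and process_cell()
 * then hands ool.cell to demote_then_promote() for the POLICY_REPLACE case.
 */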
1830 */ 1831 1832 if (bio_data_dir(bio) == WRITE) { 1833 atomic_inc(&cache->stats.demotion); 1834 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1835 release_cell = false; 1836 1837 } else { 1838 /* FIXME: factor out issue_origin() */ 1839 remap_to_origin_clear_discard(cache, bio, block); 1840 inc_and_issue(cache, bio, new_ocell); 1841 } 1842 } else { 1843 inc_hit_counter(cache, bio); 1844 1845 if (bio_data_dir(bio) == WRITE && 1846 writethrough_mode(&cache->features) && 1847 !is_dirty(cache, lookup_result.cblock)) { 1848 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1849 inc_and_issue(cache, bio, new_ocell); 1850 1851 } else { 1852 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); 1853 release_cell = false; 1854 } 1855 } 1856 1857 break; 1858 1859 case POLICY_MISS: 1860 inc_miss_counter(cache, bio); 1861 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); 1862 release_cell = false; 1863 break; 1864 1865 case POLICY_NEW: 1866 atomic_inc(&cache->stats.promotion); 1867 promote(cache, structs, block, lookup_result.cblock, new_ocell); 1868 release_cell = false; 1869 break; 1870 1871 case POLICY_REPLACE: 1872 atomic_inc(&cache->stats.demotion); 1873 atomic_inc(&cache->stats.promotion); 1874 demote_then_promote(cache, structs, lookup_result.old_oblock, 1875 block, lookup_result.cblock, 1876 ool.cell, new_ocell); 1877 release_cell = false; 1878 break; 1879 1880 default: 1881 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", 1882 cache_device_name(cache), __func__, 1883 (unsigned) lookup_result.op); 1884 bio_io_error(bio); 1885 } 1886 1887 if (release_cell) 1888 cell_defer(cache, new_ocell, false); 1889 } 1890 1891 static void process_bio(struct cache *cache, struct prealloc *structs, 1892 struct bio *bio) 1893 { 1894 int r; 1895 dm_oblock_t block = get_bio_block(cache, bio); 1896 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1897 1898 /* 1899 * Check to see if that block is currently migrating. 1900 */ 1901 cell_prealloc = prealloc_get_cell(structs); 1902 r = bio_detain(cache, block, bio, cell_prealloc, 1903 (cell_free_fn) prealloc_put_cell, 1904 structs, &new_ocell); 1905 if (r > 0) 1906 return; 1907 1908 process_cell(cache, structs, new_ocell); 1909 } 1910 1911 static int need_commit_due_to_time(struct cache *cache) 1912 { 1913 return jiffies < cache->last_commit_jiffies || 1914 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1915 } 1916 1917 /* 1918 * A non-zero return indicates read_only or fail_io mode. 
1919 */ 1920 static int commit(struct cache *cache, bool clean_shutdown) 1921 { 1922 int r; 1923 1924 if (get_cache_mode(cache) >= CM_READ_ONLY) 1925 return -EINVAL; 1926 1927 atomic_inc(&cache->stats.commit_count); 1928 r = dm_cache_commit(cache->cmd, clean_shutdown); 1929 if (r) 1930 metadata_operation_failed(cache, "dm_cache_commit", r); 1931 1932 return r; 1933 } 1934 1935 static int commit_if_needed(struct cache *cache) 1936 { 1937 int r = 0; 1938 1939 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1940 dm_cache_changed_this_transaction(cache->cmd)) { 1941 r = commit(cache, false); 1942 cache->commit_requested = false; 1943 cache->last_commit_jiffies = jiffies; 1944 } 1945 1946 return r; 1947 } 1948 1949 static void process_deferred_bios(struct cache *cache) 1950 { 1951 bool prealloc_used = false; 1952 unsigned long flags; 1953 struct bio_list bios; 1954 struct bio *bio; 1955 struct prealloc structs; 1956 1957 memset(&structs, 0, sizeof(structs)); 1958 bio_list_init(&bios); 1959 1960 spin_lock_irqsave(&cache->lock, flags); 1961 bio_list_merge(&bios, &cache->deferred_bios); 1962 bio_list_init(&cache->deferred_bios); 1963 spin_unlock_irqrestore(&cache->lock, flags); 1964 1965 while (!bio_list_empty(&bios)) { 1966 /* 1967 * If we've got no free migration structs, and processing 1968 * this bio might require one, we pause until there are some 1969 * prepared mappings to process. 1970 */ 1971 prealloc_used = true; 1972 if (prealloc_data_structs(cache, &structs)) { 1973 spin_lock_irqsave(&cache->lock, flags); 1974 bio_list_merge(&cache->deferred_bios, &bios); 1975 spin_unlock_irqrestore(&cache->lock, flags); 1976 break; 1977 } 1978 1979 bio = bio_list_pop(&bios); 1980 1981 if (bio->bi_rw & REQ_FLUSH) 1982 process_flush_bio(cache, bio); 1983 else if (bio->bi_rw & REQ_DISCARD) 1984 process_discard_bio(cache, &structs, bio); 1985 else 1986 process_bio(cache, &structs, bio); 1987 } 1988 1989 if (prealloc_used) 1990 prealloc_free_structs(cache, &structs); 1991 } 1992 1993 static void process_deferred_cells(struct cache *cache) 1994 { 1995 bool prealloc_used = false; 1996 unsigned long flags; 1997 struct dm_bio_prison_cell *cell, *tmp; 1998 struct list_head cells; 1999 struct prealloc structs; 2000 2001 memset(&structs, 0, sizeof(structs)); 2002 2003 INIT_LIST_HEAD(&cells); 2004 2005 spin_lock_irqsave(&cache->lock, flags); 2006 list_splice_init(&cache->deferred_cells, &cells); 2007 spin_unlock_irqrestore(&cache->lock, flags); 2008 2009 list_for_each_entry_safe(cell, tmp, &cells, user_list) { 2010 /* 2011 * If we've got no free migration structs, and processing 2012 * this bio might require one, we pause until there are some 2013 * prepared mappings to process. 
2014 */ 2015 prealloc_used = true; 2016 if (prealloc_data_structs(cache, &structs)) { 2017 spin_lock_irqsave(&cache->lock, flags); 2018 list_splice(&cells, &cache->deferred_cells); 2019 spin_unlock_irqrestore(&cache->lock, flags); 2020 break; 2021 } 2022 2023 process_cell(cache, &structs, cell); 2024 } 2025 2026 if (prealloc_used) 2027 prealloc_free_structs(cache, &structs); 2028 } 2029 2030 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2031 { 2032 unsigned long flags; 2033 struct bio_list bios; 2034 struct bio *bio; 2035 2036 bio_list_init(&bios); 2037 2038 spin_lock_irqsave(&cache->lock, flags); 2039 bio_list_merge(&bios, &cache->deferred_flush_bios); 2040 bio_list_init(&cache->deferred_flush_bios); 2041 spin_unlock_irqrestore(&cache->lock, flags); 2042 2043 /* 2044 * These bios have already been through inc_ds() 2045 */ 2046 while ((bio = bio_list_pop(&bios))) 2047 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2048 } 2049 2050 static void process_deferred_writethrough_bios(struct cache *cache) 2051 { 2052 unsigned long flags; 2053 struct bio_list bios; 2054 struct bio *bio; 2055 2056 bio_list_init(&bios); 2057 2058 spin_lock_irqsave(&cache->lock, flags); 2059 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 2060 bio_list_init(&cache->deferred_writethrough_bios); 2061 spin_unlock_irqrestore(&cache->lock, flags); 2062 2063 /* 2064 * These bios have already been through inc_ds() 2065 */ 2066 while ((bio = bio_list_pop(&bios))) 2067 accounted_request(cache, bio); 2068 } 2069 2070 static void writeback_some_dirty_blocks(struct cache *cache) 2071 { 2072 bool prealloc_used = false; 2073 dm_oblock_t oblock; 2074 dm_cblock_t cblock; 2075 struct prealloc structs; 2076 struct dm_bio_prison_cell *old_ocell; 2077 bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2078 2079 memset(&structs, 0, sizeof(structs)); 2080 2081 while (spare_migration_bandwidth(cache)) { 2082 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) 2083 break; /* no work to do */ 2084 2085 prealloc_used = true; 2086 if (prealloc_data_structs(cache, &structs) || 2087 get_cell(cache, oblock, &structs, &old_ocell)) { 2088 policy_set_dirty(cache->policy, oblock); 2089 break; 2090 } 2091 2092 writeback(cache, &structs, oblock, cblock, old_ocell); 2093 } 2094 2095 if (prealloc_used) 2096 prealloc_free_structs(cache, &structs); 2097 } 2098 2099 /*---------------------------------------------------------------- 2100 * Invalidations. 2101 * Dropping something from the cache *without* writing back. 
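 *
 * Invalidation is requested from userspace through the target's
 * message interface (see process_invalidate_cblocks_message() below)
 * and is only honoured while the cache is in passthrough mode.  An
 * illustrative invocation, with a made-up device name:
 *
 *   dmsetup message my-cache 0 invalidate_cblocks 2345 3456-4567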
2102 *--------------------------------------------------------------*/ 2103 2104 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2105 { 2106 int r = 0; 2107 uint64_t begin = from_cblock(req->cblocks->begin); 2108 uint64_t end = from_cblock(req->cblocks->end); 2109 2110 while (begin != end) { 2111 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2112 if (!r) { 2113 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2114 if (r) { 2115 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2116 break; 2117 } 2118 2119 } else if (r == -ENODATA) { 2120 /* harmless, already unmapped */ 2121 r = 0; 2122 2123 } else { 2124 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2125 break; 2126 } 2127 2128 begin++; 2129 } 2130 2131 cache->commit_requested = true; 2132 2133 req->err = r; 2134 atomic_set(&req->complete, 1); 2135 2136 wake_up(&req->result_wait); 2137 } 2138 2139 static void process_invalidation_requests(struct cache *cache) 2140 { 2141 struct list_head list; 2142 struct invalidation_request *req, *tmp; 2143 2144 INIT_LIST_HEAD(&list); 2145 spin_lock(&cache->invalidation_lock); 2146 list_splice_init(&cache->invalidation_requests, &list); 2147 spin_unlock(&cache->invalidation_lock); 2148 2149 list_for_each_entry_safe (req, tmp, &list, list) 2150 process_invalidation_request(cache, req); 2151 } 2152 2153 /*---------------------------------------------------------------- 2154 * Main worker loop 2155 *--------------------------------------------------------------*/ 2156 static bool is_quiescing(struct cache *cache) 2157 { 2158 return atomic_read(&cache->quiescing); 2159 } 2160 2161 static void ack_quiescing(struct cache *cache) 2162 { 2163 if (is_quiescing(cache)) { 2164 atomic_inc(&cache->quiescing_ack); 2165 wake_up(&cache->quiescing_wait); 2166 } 2167 } 2168 2169 static void wait_for_quiescing_ack(struct cache *cache) 2170 { 2171 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2172 } 2173 2174 static void start_quiescing(struct cache *cache) 2175 { 2176 atomic_inc(&cache->quiescing); 2177 wait_for_quiescing_ack(cache); 2178 } 2179 2180 static void stop_quiescing(struct cache *cache) 2181 { 2182 atomic_set(&cache->quiescing, 0); 2183 atomic_set(&cache->quiescing_ack, 0); 2184 } 2185 2186 static void wait_for_migrations(struct cache *cache) 2187 { 2188 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2189 } 2190 2191 static void stop_worker(struct cache *cache) 2192 { 2193 cancel_delayed_work(&cache->waker); 2194 flush_workqueue(cache->wq); 2195 } 2196 2197 static void requeue_deferred_cells(struct cache *cache) 2198 { 2199 unsigned long flags; 2200 struct list_head cells; 2201 struct dm_bio_prison_cell *cell, *tmp; 2202 2203 INIT_LIST_HEAD(&cells); 2204 spin_lock_irqsave(&cache->lock, flags); 2205 list_splice_init(&cache->deferred_cells, &cells); 2206 spin_unlock_irqrestore(&cache->lock, flags); 2207 2208 list_for_each_entry_safe(cell, tmp, &cells, user_list) 2209 cell_requeue(cache, cell); 2210 } 2211 2212 static void requeue_deferred_bios(struct cache *cache) 2213 { 2214 struct bio *bio; 2215 struct bio_list bios; 2216 2217 bio_list_init(&bios); 2218 bio_list_merge(&bios, &cache->deferred_bios); 2219 bio_list_init(&cache->deferred_bios); 2220 2221 while ((bio = bio_list_pop(&bios))) { 2222 bio->bi_error = DM_ENDIO_REQUEUE; 2223 bio_endio(bio); 2224 } 2225 } 2226 2227 static int more_work(struct cache *cache) 2228 { 2229 if (is_quiescing(cache)) 
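		/*
		 * While quiescing we only drain migrations that are already
		 * in flight; no new bios, cells or invalidation requests are
		 * picked up (see do_worker() below).
		 */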
2230 return !list_empty(&cache->quiesced_migrations) || 2231 !list_empty(&cache->completed_migrations) || 2232 !list_empty(&cache->need_commit_migrations); 2233 else 2234 return !bio_list_empty(&cache->deferred_bios) || 2235 !list_empty(&cache->deferred_cells) || 2236 !bio_list_empty(&cache->deferred_flush_bios) || 2237 !bio_list_empty(&cache->deferred_writethrough_bios) || 2238 !list_empty(&cache->quiesced_migrations) || 2239 !list_empty(&cache->completed_migrations) || 2240 !list_empty(&cache->need_commit_migrations) || 2241 cache->invalidate; 2242 } 2243 2244 static void do_worker(struct work_struct *ws) 2245 { 2246 struct cache *cache = container_of(ws, struct cache, worker); 2247 2248 do { 2249 if (!is_quiescing(cache)) { 2250 writeback_some_dirty_blocks(cache); 2251 process_deferred_writethrough_bios(cache); 2252 process_deferred_bios(cache); 2253 process_deferred_cells(cache); 2254 process_invalidation_requests(cache); 2255 } 2256 2257 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2258 process_migrations(cache, &cache->completed_migrations, complete_migration); 2259 2260 if (commit_if_needed(cache)) { 2261 process_deferred_flush_bios(cache, false); 2262 process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2263 } else { 2264 process_deferred_flush_bios(cache, true); 2265 process_migrations(cache, &cache->need_commit_migrations, 2266 migration_success_post_commit); 2267 } 2268 2269 ack_quiescing(cache); 2270 2271 } while (more_work(cache)); 2272 } 2273 2274 /* 2275 * We want to commit periodically so that not too much 2276 * unwritten metadata builds up. 2277 */ 2278 static void do_waker(struct work_struct *ws) 2279 { 2280 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2281 policy_tick(cache->policy, true); 2282 wake_worker(cache); 2283 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2284 } 2285 2286 /*----------------------------------------------------------------*/ 2287 2288 static int is_congested(struct dm_dev *dev, int bdi_bits) 2289 { 2290 struct request_queue *q = bdev_get_queue(dev->bdev); 2291 return bdi_congested(&q->backing_dev_info, bdi_bits); 2292 } 2293 2294 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2295 { 2296 struct cache *cache = container_of(cb, struct cache, callbacks); 2297 2298 return is_congested(cache->origin_dev, bdi_bits) || 2299 is_congested(cache->cache_dev, bdi_bits); 2300 } 2301 2302 /*---------------------------------------------------------------- 2303 * Target methods 2304 *--------------------------------------------------------------*/ 2305 2306 /* 2307 * This function gets called on the error paths of the constructor, so we 2308 * have to cope with a partially initialised struct. 
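 * (cache_create() jumps to its bad: label on any failure and ends up
 * here, so every field checked below may legitimately still be NULL
 * or zero.)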
2309 */ 2310 static void destroy(struct cache *cache) 2311 { 2312 unsigned i; 2313 2314 mempool_destroy(cache->migration_pool); 2315 2316 if (cache->all_io_ds) 2317 dm_deferred_set_destroy(cache->all_io_ds); 2318 2319 if (cache->prison) 2320 dm_bio_prison_destroy(cache->prison); 2321 2322 if (cache->wq) 2323 destroy_workqueue(cache->wq); 2324 2325 if (cache->dirty_bitset) 2326 free_bitset(cache->dirty_bitset); 2327 2328 if (cache->discard_bitset) 2329 free_bitset(cache->discard_bitset); 2330 2331 if (cache->copier) 2332 dm_kcopyd_client_destroy(cache->copier); 2333 2334 if (cache->cmd) 2335 dm_cache_metadata_close(cache->cmd); 2336 2337 if (cache->metadata_dev) 2338 dm_put_device(cache->ti, cache->metadata_dev); 2339 2340 if (cache->origin_dev) 2341 dm_put_device(cache->ti, cache->origin_dev); 2342 2343 if (cache->cache_dev) 2344 dm_put_device(cache->ti, cache->cache_dev); 2345 2346 if (cache->policy) 2347 dm_cache_policy_destroy(cache->policy); 2348 2349 for (i = 0; i < cache->nr_ctr_args ; i++) 2350 kfree(cache->ctr_args[i]); 2351 kfree(cache->ctr_args); 2352 2353 kfree(cache); 2354 } 2355 2356 static void cache_dtr(struct dm_target *ti) 2357 { 2358 struct cache *cache = ti->private; 2359 2360 destroy(cache); 2361 } 2362 2363 static sector_t get_dev_size(struct dm_dev *dev) 2364 { 2365 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2366 } 2367 2368 /*----------------------------------------------------------------*/ 2369 2370 /* 2371 * Construct a cache device mapping. 2372 * 2373 * cache <metadata dev> <cache dev> <origin dev> <block size> 2374 * <#feature args> [<feature arg>]* 2375 * <policy> <#policy args> [<policy arg>]* 2376 * 2377 * metadata dev : fast device holding the persistent metadata 2378 * cache dev : fast device holding cached data blocks 2379 * origin dev : slow device holding original data blocks 2380 * block size : cache unit size in sectors 2381 * 2382 * #feature args : number of feature arguments passed 2383 * feature args : writethrough. (The default is writeback.) 2384 * 2385 * policy : the replacement policy to use 2386 * #policy args : an even number of policy arguments corresponding 2387 * to key/value pairs passed to the policy 2388 * policy args : key/value pairs passed to the policy 2389 * E.g. 'sequential_threshold 1024' 2390 * See cache-policies.txt for details. 2391 * 2392 * Optional feature arguments are: 2393 * writethrough : write through caching that prohibits cache block 2394 * content from being different from origin block content. 2395 * Without this argument, the default behaviour is to write 2396 * back cache block contents later for performance reasons, 2397 * so they may differ from the corresponding origin blocks. 
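 *
 * An illustrative table line (the device names, origin length and
 * policy name below are only examples):
 *
 *   0 41943040 cache /dev/mapper/fast-meta /dev/mapper/fast-blocks
 *              /dev/mapper/slow 512 1 writethrough default 0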
2398 */ 2399 struct cache_args { 2400 struct dm_target *ti; 2401 2402 struct dm_dev *metadata_dev; 2403 2404 struct dm_dev *cache_dev; 2405 sector_t cache_sectors; 2406 2407 struct dm_dev *origin_dev; 2408 sector_t origin_sectors; 2409 2410 uint32_t block_size; 2411 2412 const char *policy_name; 2413 int policy_argc; 2414 const char **policy_argv; 2415 2416 struct cache_features features; 2417 }; 2418 2419 static void destroy_cache_args(struct cache_args *ca) 2420 { 2421 if (ca->metadata_dev) 2422 dm_put_device(ca->ti, ca->metadata_dev); 2423 2424 if (ca->cache_dev) 2425 dm_put_device(ca->ti, ca->cache_dev); 2426 2427 if (ca->origin_dev) 2428 dm_put_device(ca->ti, ca->origin_dev); 2429 2430 kfree(ca); 2431 } 2432 2433 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2434 { 2435 if (!as->argc) { 2436 *error = "Insufficient args"; 2437 return false; 2438 } 2439 2440 return true; 2441 } 2442 2443 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2444 char **error) 2445 { 2446 int r; 2447 sector_t metadata_dev_size; 2448 char b[BDEVNAME_SIZE]; 2449 2450 if (!at_least_one_arg(as, error)) 2451 return -EINVAL; 2452 2453 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2454 &ca->metadata_dev); 2455 if (r) { 2456 *error = "Error opening metadata device"; 2457 return r; 2458 } 2459 2460 metadata_dev_size = get_dev_size(ca->metadata_dev); 2461 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2462 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2463 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2464 2465 return 0; 2466 } 2467 2468 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2469 char **error) 2470 { 2471 int r; 2472 2473 if (!at_least_one_arg(as, error)) 2474 return -EINVAL; 2475 2476 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2477 &ca->cache_dev); 2478 if (r) { 2479 *error = "Error opening cache device"; 2480 return r; 2481 } 2482 ca->cache_sectors = get_dev_size(ca->cache_dev); 2483 2484 return 0; 2485 } 2486 2487 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2488 char **error) 2489 { 2490 int r; 2491 2492 if (!at_least_one_arg(as, error)) 2493 return -EINVAL; 2494 2495 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2496 &ca->origin_dev); 2497 if (r) { 2498 *error = "Error opening origin device"; 2499 return r; 2500 } 2501 2502 ca->origin_sectors = get_dev_size(ca->origin_dev); 2503 if (ca->ti->len > ca->origin_sectors) { 2504 *error = "Device size larger than cached device"; 2505 return -EINVAL; 2506 } 2507 2508 return 0; 2509 } 2510 2511 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2512 char **error) 2513 { 2514 unsigned long block_size; 2515 2516 if (!at_least_one_arg(as, error)) 2517 return -EINVAL; 2518 2519 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2520 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2521 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2522 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2523 *error = "Invalid data block size"; 2524 return -EINVAL; 2525 } 2526 2527 if (block_size > ca->cache_sectors) { 2528 *error = "Data block size is larger than the cache device"; 2529 return -EINVAL; 2530 } 2531 2532 ca->block_size = block_size; 2533 2534 return 0; 2535 } 2536 2537 static void init_features(struct cache_features *cf) 2538 { 2539 cf->mode = CM_WRITE; 2540 cf->io_mode = 
CM_IO_WRITEBACK; 2541 } 2542 2543 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2544 char **error) 2545 { 2546 static struct dm_arg _args[] = { 2547 {0, 1, "Invalid number of cache feature arguments"}, 2548 }; 2549 2550 int r; 2551 unsigned argc; 2552 const char *arg; 2553 struct cache_features *cf = &ca->features; 2554 2555 init_features(cf); 2556 2557 r = dm_read_arg_group(_args, as, &argc, error); 2558 if (r) 2559 return -EINVAL; 2560 2561 while (argc--) { 2562 arg = dm_shift_arg(as); 2563 2564 if (!strcasecmp(arg, "writeback")) 2565 cf->io_mode = CM_IO_WRITEBACK; 2566 2567 else if (!strcasecmp(arg, "writethrough")) 2568 cf->io_mode = CM_IO_WRITETHROUGH; 2569 2570 else if (!strcasecmp(arg, "passthrough")) 2571 cf->io_mode = CM_IO_PASSTHROUGH; 2572 2573 else { 2574 *error = "Unrecognised cache feature requested"; 2575 return -EINVAL; 2576 } 2577 } 2578 2579 return 0; 2580 } 2581 2582 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2583 char **error) 2584 { 2585 static struct dm_arg _args[] = { 2586 {0, 1024, "Invalid number of policy arguments"}, 2587 }; 2588 2589 int r; 2590 2591 if (!at_least_one_arg(as, error)) 2592 return -EINVAL; 2593 2594 ca->policy_name = dm_shift_arg(as); 2595 2596 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2597 if (r) 2598 return -EINVAL; 2599 2600 ca->policy_argv = (const char **)as->argv; 2601 dm_consume_args(as, ca->policy_argc); 2602 2603 return 0; 2604 } 2605 2606 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2607 char **error) 2608 { 2609 int r; 2610 struct dm_arg_set as; 2611 2612 as.argc = argc; 2613 as.argv = argv; 2614 2615 r = parse_metadata_dev(ca, &as, error); 2616 if (r) 2617 return r; 2618 2619 r = parse_cache_dev(ca, &as, error); 2620 if (r) 2621 return r; 2622 2623 r = parse_origin_dev(ca, &as, error); 2624 if (r) 2625 return r; 2626 2627 r = parse_block_size(ca, &as, error); 2628 if (r) 2629 return r; 2630 2631 r = parse_features(ca, &as, error); 2632 if (r) 2633 return r; 2634 2635 r = parse_policy(ca, &as, error); 2636 if (r) 2637 return r; 2638 2639 return 0; 2640 } 2641 2642 /*----------------------------------------------------------------*/ 2643 2644 static struct kmem_cache *migration_cache; 2645 2646 #define NOT_CORE_OPTION 1 2647 2648 static int process_config_option(struct cache *cache, const char *key, const char *value) 2649 { 2650 unsigned long tmp; 2651 2652 if (!strcasecmp(key, "migration_threshold")) { 2653 if (kstrtoul(value, 10, &tmp)) 2654 return -EINVAL; 2655 2656 cache->migration_threshold = tmp; 2657 return 0; 2658 } 2659 2660 return NOT_CORE_OPTION; 2661 } 2662 2663 static int set_config_value(struct cache *cache, const char *key, const char *value) 2664 { 2665 int r = process_config_option(cache, key, value); 2666 2667 if (r == NOT_CORE_OPTION) 2668 r = policy_set_config_value(cache->policy, key, value); 2669 2670 if (r) 2671 DMWARN("bad config value for %s: %s", key, value); 2672 2673 return r; 2674 } 2675 2676 static int set_config_values(struct cache *cache, int argc, const char **argv) 2677 { 2678 int r = 0; 2679 2680 if (argc & 1) { 2681 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2682 return -EINVAL; 2683 } 2684 2685 while (argc) { 2686 r = set_config_value(cache, argv[0], argv[1]); 2687 if (r) 2688 break; 2689 2690 argc -= 2; 2691 argv += 2; 2692 } 2693 2694 return r; 2695 } 2696 2697 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2698 char **error) 
2699 { 2700 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2701 cache->cache_size, 2702 cache->origin_sectors, 2703 cache->sectors_per_block); 2704 if (IS_ERR(p)) { 2705 *error = "Error creating cache's policy"; 2706 return PTR_ERR(p); 2707 } 2708 cache->policy = p; 2709 2710 return 0; 2711 } 2712 2713 /* 2714 * We want the discard block size to be at least the size of the cache 2715 * block size and have no more than 2^14 discard blocks across the origin. 2716 */ 2717 #define MAX_DISCARD_BLOCKS (1 << 14) 2718 2719 static bool too_many_discard_blocks(sector_t discard_block_size, 2720 sector_t origin_size) 2721 { 2722 (void) sector_div(origin_size, discard_block_size); 2723 2724 return origin_size > MAX_DISCARD_BLOCKS; 2725 } 2726 2727 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2728 sector_t origin_size) 2729 { 2730 sector_t discard_block_size = cache_block_size; 2731 2732 if (origin_size) 2733 while (too_many_discard_blocks(discard_block_size, origin_size)) 2734 discard_block_size *= 2; 2735 2736 return discard_block_size; 2737 } 2738 2739 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2740 { 2741 dm_block_t nr_blocks = from_cblock(size); 2742 2743 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2744 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2745 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2746 "Please consider increasing the cache block size to reduce the overall cache block count.", 2747 (unsigned long long) nr_blocks); 2748 2749 cache->cache_size = size; 2750 } 2751 2752 #define DEFAULT_MIGRATION_THRESHOLD 2048 2753 2754 static int cache_create(struct cache_args *ca, struct cache **result) 2755 { 2756 int r = 0; 2757 char **error = &ca->ti->error; 2758 struct cache *cache; 2759 struct dm_target *ti = ca->ti; 2760 dm_block_t origin_blocks; 2761 struct dm_cache_metadata *cmd; 2762 bool may_format = ca->features.mode == CM_WRITE; 2763 2764 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2765 if (!cache) 2766 return -ENOMEM; 2767 2768 cache->ti = ca->ti; 2769 ti->private = cache; 2770 ti->num_flush_bios = 2; 2771 ti->flush_supported = true; 2772 2773 ti->num_discard_bios = 1; 2774 ti->discards_supported = true; 2775 ti->discard_zeroes_data_unsupported = true; 2776 ti->split_discard_bios = false; 2777 2778 cache->features = ca->features; 2779 ti->per_io_data_size = get_per_bio_data_size(cache); 2780 2781 cache->callbacks.congested_fn = cache_is_congested; 2782 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2783 2784 cache->metadata_dev = ca->metadata_dev; 2785 cache->origin_dev = ca->origin_dev; 2786 cache->cache_dev = ca->cache_dev; 2787 2788 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2789 2790 /* FIXME: factor out this whole section */ 2791 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2792 origin_blocks = block_div(origin_blocks, ca->block_size); 2793 cache->origin_blocks = to_oblock(origin_blocks); 2794 2795 cache->sectors_per_block = ca->block_size; 2796 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2797 r = -EINVAL; 2798 goto bad; 2799 } 2800 2801 if (ca->block_size & (ca->block_size - 1)) { 2802 dm_block_t cache_size = ca->cache_sectors; 2803 2804 cache->sectors_per_block_shift = -1; 2805 cache_size = block_div(cache_size, ca->block_size); 2806 set_cache_size(cache, to_cblock(cache_size)); 2807 } else { 2808 cache->sectors_per_block_shift = 
__ffs(ca->block_size); 2809 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2810 } 2811 2812 r = create_cache_policy(cache, ca, error); 2813 if (r) 2814 goto bad; 2815 2816 cache->policy_nr_args = ca->policy_argc; 2817 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2818 2819 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2820 if (r) { 2821 *error = "Error setting cache policy's config values"; 2822 goto bad; 2823 } 2824 2825 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2826 ca->block_size, may_format, 2827 dm_cache_policy_get_hint_size(cache->policy)); 2828 if (IS_ERR(cmd)) { 2829 *error = "Error creating metadata object"; 2830 r = PTR_ERR(cmd); 2831 goto bad; 2832 } 2833 cache->cmd = cmd; 2834 set_cache_mode(cache, CM_WRITE); 2835 if (get_cache_mode(cache) != CM_WRITE) { 2836 *error = "Unable to get write access to metadata, please check/repair metadata."; 2837 r = -EINVAL; 2838 goto bad; 2839 } 2840 2841 if (passthrough_mode(&cache->features)) { 2842 bool all_clean; 2843 2844 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2845 if (r) { 2846 *error = "dm_cache_metadata_all_clean() failed"; 2847 goto bad; 2848 } 2849 2850 if (!all_clean) { 2851 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2852 r = -EINVAL; 2853 goto bad; 2854 } 2855 } 2856 2857 spin_lock_init(&cache->lock); 2858 INIT_LIST_HEAD(&cache->deferred_cells); 2859 bio_list_init(&cache->deferred_bios); 2860 bio_list_init(&cache->deferred_flush_bios); 2861 bio_list_init(&cache->deferred_writethrough_bios); 2862 INIT_LIST_HEAD(&cache->quiesced_migrations); 2863 INIT_LIST_HEAD(&cache->completed_migrations); 2864 INIT_LIST_HEAD(&cache->need_commit_migrations); 2865 atomic_set(&cache->nr_allocated_migrations, 0); 2866 atomic_set(&cache->nr_io_migrations, 0); 2867 init_waitqueue_head(&cache->migration_wait); 2868 2869 init_waitqueue_head(&cache->quiescing_wait); 2870 atomic_set(&cache->quiescing, 0); 2871 atomic_set(&cache->quiescing_ack, 0); 2872 2873 r = -ENOMEM; 2874 atomic_set(&cache->nr_dirty, 0); 2875 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2876 if (!cache->dirty_bitset) { 2877 *error = "could not allocate dirty bitset"; 2878 goto bad; 2879 } 2880 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2881 2882 cache->discard_block_size = 2883 calculate_discard_block_size(cache->sectors_per_block, 2884 cache->origin_sectors); 2885 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2886 cache->discard_block_size)); 2887 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2888 if (!cache->discard_bitset) { 2889 *error = "could not allocate discard bitset"; 2890 goto bad; 2891 } 2892 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2893 2894 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2895 if (IS_ERR(cache->copier)) { 2896 *error = "could not create kcopyd client"; 2897 r = PTR_ERR(cache->copier); 2898 goto bad; 2899 } 2900 2901 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2902 if (!cache->wq) { 2903 *error = "could not create workqueue for metadata object"; 2904 goto bad; 2905 } 2906 INIT_WORK(&cache->worker, do_worker); 2907 INIT_DELAYED_WORK(&cache->waker, do_waker); 2908 cache->last_commit_jiffies = jiffies; 2909 2910 cache->prison = dm_bio_prison_create(); 2911 if (!cache->prison) { 2912 *error = "could not create bio prison"; 2913 goto bad; 2914 } 2915 
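	/*
	 * all_io_ds tracks bios that are in flight (see inc_ds() and
	 * inc_and_issue()); migrations wait on it before their copy is
	 * issued (see check_for_quiesced_migrations()).
	 */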
2916 cache->all_io_ds = dm_deferred_set_create(); 2917 if (!cache->all_io_ds) { 2918 *error = "could not create all_io deferred set"; 2919 goto bad; 2920 } 2921 2922 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2923 migration_cache); 2924 if (!cache->migration_pool) { 2925 *error = "Error creating cache's migration mempool"; 2926 goto bad; 2927 } 2928 2929 cache->need_tick_bio = true; 2930 cache->sized = false; 2931 cache->invalidate = false; 2932 cache->commit_requested = false; 2933 cache->loaded_mappings = false; 2934 cache->loaded_discards = false; 2935 2936 load_stats(cache); 2937 2938 atomic_set(&cache->stats.demotion, 0); 2939 atomic_set(&cache->stats.promotion, 0); 2940 atomic_set(&cache->stats.copies_avoided, 0); 2941 atomic_set(&cache->stats.cache_cell_clash, 0); 2942 atomic_set(&cache->stats.commit_count, 0); 2943 atomic_set(&cache->stats.discard_count, 0); 2944 2945 spin_lock_init(&cache->invalidation_lock); 2946 INIT_LIST_HEAD(&cache->invalidation_requests); 2947 2948 iot_init(&cache->origin_tracker); 2949 2950 *result = cache; 2951 return 0; 2952 2953 bad: 2954 destroy(cache); 2955 return r; 2956 } 2957 2958 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2959 { 2960 unsigned i; 2961 const char **copy; 2962 2963 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2964 if (!copy) 2965 return -ENOMEM; 2966 for (i = 0; i < argc; i++) { 2967 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2968 if (!copy[i]) { 2969 while (i--) 2970 kfree(copy[i]); 2971 kfree(copy); 2972 return -ENOMEM; 2973 } 2974 } 2975 2976 cache->nr_ctr_args = argc; 2977 cache->ctr_args = copy; 2978 2979 return 0; 2980 } 2981 2982 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2983 { 2984 int r = -EINVAL; 2985 struct cache_args *ca; 2986 struct cache *cache = NULL; 2987 2988 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2989 if (!ca) { 2990 ti->error = "Error allocating memory for cache"; 2991 return -ENOMEM; 2992 } 2993 ca->ti = ti; 2994 2995 r = parse_cache_args(ca, argc, argv, &ti->error); 2996 if (r) 2997 goto out; 2998 2999 r = cache_create(ca, &cache); 3000 if (r) 3001 goto out; 3002 3003 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 3004 if (r) { 3005 destroy(cache); 3006 goto out; 3007 } 3008 3009 ti->private = cache; 3010 3011 out: 3012 destroy_cache_args(ca); 3013 return r; 3014 } 3015 3016 /*----------------------------------------------------------------*/ 3017 3018 static int cache_map(struct dm_target *ti, struct bio *bio) 3019 { 3020 struct cache *cache = ti->private; 3021 3022 int r; 3023 struct dm_bio_prison_cell *cell = NULL; 3024 dm_oblock_t block = get_bio_block(cache, bio); 3025 size_t pb_data_size = get_per_bio_data_size(cache); 3026 bool can_migrate = false; 3027 bool fast_promotion; 3028 struct policy_result lookup_result; 3029 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 3030 struct old_oblock_lock ool; 3031 3032 ool.locker.fn = null_locker; 3033 3034 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 3035 /* 3036 * This can only occur if the io goes to a partial block at 3037 * the end of the origin device. We don't cache these. 3038 * Just remap to the origin and carry on. 3039 */ 3040 remap_to_origin(cache, bio); 3041 accounted_begin(cache, bio); 3042 return DM_MAPIO_REMAPPED; 3043 } 3044 3045 if (discard_or_flush(bio)) { 3046 defer_bio(cache, bio); 3047 return DM_MAPIO_SUBMITTED; 3048 } 3049 3050 /* 3051 * Check to see if that block is currently migrating. 
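 *
 * bio_detain() returns > 0 if the block is already locked in a cell,
 * in which case the bio has been queued on that cell and will be
 * replayed when the cell is released; a negative return means no cell
 * could be obtained, so the bio is deferred to the worker instead.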
3052 */ 3053 cell = alloc_prison_cell(cache); 3054 if (!cell) { 3055 defer_bio(cache, bio); 3056 return DM_MAPIO_SUBMITTED; 3057 } 3058 3059 r = bio_detain(cache, block, bio, cell, 3060 (cell_free_fn) free_prison_cell, 3061 cache, &cell); 3062 if (r) { 3063 if (r < 0) 3064 defer_bio(cache, bio); 3065 3066 return DM_MAPIO_SUBMITTED; 3067 } 3068 3069 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 3070 3071 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 3072 bio, &ool.locker, &lookup_result); 3073 if (r == -EWOULDBLOCK) { 3074 cell_defer(cache, cell, true); 3075 return DM_MAPIO_SUBMITTED; 3076 3077 } else if (r) { 3078 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 3079 cache_device_name(cache), r); 3080 cell_defer(cache, cell, false); 3081 bio_io_error(bio); 3082 return DM_MAPIO_SUBMITTED; 3083 } 3084 3085 r = DM_MAPIO_REMAPPED; 3086 switch (lookup_result.op) { 3087 case POLICY_HIT: 3088 if (passthrough_mode(&cache->features)) { 3089 if (bio_data_dir(bio) == WRITE) { 3090 /* 3091 * We need to invalidate this block, so 3092 * defer for the worker thread. 3093 */ 3094 cell_defer(cache, cell, true); 3095 r = DM_MAPIO_SUBMITTED; 3096 3097 } else { 3098 inc_miss_counter(cache, bio); 3099 remap_to_origin_clear_discard(cache, bio, block); 3100 accounted_begin(cache, bio); 3101 inc_ds(cache, bio, cell); 3102 // FIXME: we want to remap hits or misses straight 3103 // away rather than passing over to the worker. 3104 cell_defer(cache, cell, false); 3105 } 3106 3107 } else { 3108 inc_hit_counter(cache, bio); 3109 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 3110 !is_dirty(cache, lookup_result.cblock)) { 3111 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 3112 accounted_begin(cache, bio); 3113 inc_ds(cache, bio, cell); 3114 cell_defer(cache, cell, false); 3115 3116 } else 3117 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 3118 } 3119 break; 3120 3121 case POLICY_MISS: 3122 inc_miss_counter(cache, bio); 3123 if (pb->req_nr != 0) { 3124 /* 3125 * This is a duplicate writethrough io that is no 3126 * longer needed because the block has been demoted. 
3127 */ 3128 bio_endio(bio); 3129 // FIXME: remap everything as a miss 3130 cell_defer(cache, cell, false); 3131 r = DM_MAPIO_SUBMITTED; 3132 3133 } else 3134 remap_cell_to_origin_clear_discard(cache, cell, block, false); 3135 break; 3136 3137 default: 3138 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 3139 cache_device_name(cache), __func__, 3140 (unsigned) lookup_result.op); 3141 cell_defer(cache, cell, false); 3142 bio_io_error(bio); 3143 r = DM_MAPIO_SUBMITTED; 3144 } 3145 3146 return r; 3147 } 3148 3149 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 3150 { 3151 struct cache *cache = ti->private; 3152 unsigned long flags; 3153 size_t pb_data_size = get_per_bio_data_size(cache); 3154 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 3155 3156 if (pb->tick) { 3157 policy_tick(cache->policy, false); 3158 3159 spin_lock_irqsave(&cache->lock, flags); 3160 cache->need_tick_bio = true; 3161 spin_unlock_irqrestore(&cache->lock, flags); 3162 } 3163 3164 check_for_quiesced_migrations(cache, pb); 3165 accounted_complete(cache, bio); 3166 3167 return 0; 3168 } 3169 3170 static int write_dirty_bitset(struct cache *cache) 3171 { 3172 unsigned i, r; 3173 3174 if (get_cache_mode(cache) >= CM_READ_ONLY) 3175 return -EINVAL; 3176 3177 for (i = 0; i < from_cblock(cache->cache_size); i++) { 3178 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 3179 is_dirty(cache, to_cblock(i))); 3180 if (r) { 3181 metadata_operation_failed(cache, "dm_cache_set_dirty", r); 3182 return r; 3183 } 3184 } 3185 3186 return 0; 3187 } 3188 3189 static int write_discard_bitset(struct cache *cache) 3190 { 3191 unsigned i, r; 3192 3193 if (get_cache_mode(cache) >= CM_READ_ONLY) 3194 return -EINVAL; 3195 3196 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 3197 cache->discard_nr_blocks); 3198 if (r) { 3199 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 3200 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 3201 return r; 3202 } 3203 3204 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 3205 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 3206 is_discarded(cache, to_dblock(i))); 3207 if (r) { 3208 metadata_operation_failed(cache, "dm_cache_set_discard", r); 3209 return r; 3210 } 3211 } 3212 3213 return 0; 3214 } 3215 3216 static int write_hints(struct cache *cache) 3217 { 3218 int r; 3219 3220 if (get_cache_mode(cache) >= CM_READ_ONLY) 3221 return -EINVAL; 3222 3223 r = dm_cache_write_hints(cache->cmd, cache->policy); 3224 if (r) { 3225 metadata_operation_failed(cache, "dm_cache_write_hints", r); 3226 return r; 3227 } 3228 3229 return 0; 3230 } 3231 3232 /* 3233 * returns true on success 3234 */ 3235 static bool sync_metadata(struct cache *cache) 3236 { 3237 int r1, r2, r3, r4; 3238 3239 r1 = write_dirty_bitset(cache); 3240 if (r1) 3241 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 3242 3243 r2 = write_discard_bitset(cache); 3244 if (r2) 3245 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 3246 3247 save_stats(cache); 3248 3249 r3 = write_hints(cache); 3250 if (r3) 3251 DMERR("%s: could not write hints", cache_device_name(cache)); 3252 3253 /* 3254 * If writing the above metadata failed, we still commit, but don't 3255 * set the clean shutdown flag. This will effectively force every 3256 * dirty bit to be set on reload. 
3257 */ 3258 r4 = commit(cache, !r1 && !r2 && !r3); 3259 if (r4) 3260 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 3261 3262 return !r1 && !r2 && !r3 && !r4; 3263 } 3264 3265 static void cache_postsuspend(struct dm_target *ti) 3266 { 3267 struct cache *cache = ti->private; 3268 3269 start_quiescing(cache); 3270 wait_for_migrations(cache); 3271 stop_worker(cache); 3272 requeue_deferred_bios(cache); 3273 requeue_deferred_cells(cache); 3274 stop_quiescing(cache); 3275 3276 if (get_cache_mode(cache) == CM_WRITE) 3277 (void) sync_metadata(cache); 3278 } 3279 3280 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 3281 bool dirty, uint32_t hint, bool hint_valid) 3282 { 3283 int r; 3284 struct cache *cache = context; 3285 3286 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3287 if (r) 3288 return r; 3289 3290 if (dirty) 3291 set_dirty(cache, oblock, cblock); 3292 else 3293 clear_dirty(cache, oblock, cblock); 3294 3295 return 0; 3296 } 3297 3298 /* 3299 * The discard block size in the on disk metadata is not 3300 * neccessarily the same as we're currently using. So we have to 3301 * be careful to only set the discarded attribute if we know it 3302 * covers a complete block of the new size. 3303 */ 3304 struct discard_load_info { 3305 struct cache *cache; 3306 3307 /* 3308 * These blocks are sized using the on disk dblock size, rather 3309 * than the current one. 3310 */ 3311 dm_block_t block_size; 3312 dm_block_t discard_begin, discard_end; 3313 }; 3314 3315 static void discard_load_info_init(struct cache *cache, 3316 struct discard_load_info *li) 3317 { 3318 li->cache = cache; 3319 li->discard_begin = li->discard_end = 0; 3320 } 3321 3322 static void set_discard_range(struct discard_load_info *li) 3323 { 3324 sector_t b, e; 3325 3326 if (li->discard_begin == li->discard_end) 3327 return; 3328 3329 /* 3330 * Convert to sectors. 3331 */ 3332 b = li->discard_begin * li->block_size; 3333 e = li->discard_end * li->block_size; 3334 3335 /* 3336 * Then convert back to the current dblock size. 3337 */ 3338 b = dm_sector_div_up(b, li->cache->discard_block_size); 3339 sector_div(e, li->cache->discard_block_size); 3340 3341 /* 3342 * The origin may have shrunk, so we need to check we're still in 3343 * bounds. 3344 */ 3345 if (e > from_dblock(li->cache->discard_nr_blocks)) 3346 e = from_dblock(li->cache->discard_nr_blocks); 3347 3348 for (; b < e; b++) 3349 set_discard(li->cache, to_dblock(b)); 3350 } 3351 3352 static int load_discard(void *context, sector_t discard_block_size, 3353 dm_dblock_t dblock, bool discard) 3354 { 3355 struct discard_load_info *li = context; 3356 3357 li->block_size = discard_block_size; 3358 3359 if (discard) { 3360 if (from_dblock(dblock) == li->discard_end) 3361 /* 3362 * We're already in a discard range, just extend it. 3363 */ 3364 li->discard_end = li->discard_end + 1ULL; 3365 3366 else { 3367 /* 3368 * Emit the old range and start a new one. 
3369 */ 3370 set_discard_range(li); 3371 li->discard_begin = from_dblock(dblock); 3372 li->discard_end = li->discard_begin + 1ULL; 3373 } 3374 } else { 3375 set_discard_range(li); 3376 li->discard_begin = li->discard_end = 0; 3377 } 3378 3379 return 0; 3380 } 3381 3382 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3383 { 3384 sector_t size = get_dev_size(cache->cache_dev); 3385 (void) sector_div(size, cache->sectors_per_block); 3386 return to_cblock(size); 3387 } 3388 3389 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3390 { 3391 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3392 return true; 3393 3394 /* 3395 * We can't drop a dirty block when shrinking the cache. 3396 */ 3397 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3398 new_size = to_cblock(from_cblock(new_size) + 1); 3399 if (is_dirty(cache, new_size)) { 3400 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3401 cache_device_name(cache), 3402 (unsigned long long) from_cblock(new_size)); 3403 return false; 3404 } 3405 } 3406 3407 return true; 3408 } 3409 3410 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3411 { 3412 int r; 3413 3414 r = dm_cache_resize(cache->cmd, new_size); 3415 if (r) { 3416 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3417 metadata_operation_failed(cache, "dm_cache_resize", r); 3418 return r; 3419 } 3420 3421 set_cache_size(cache, new_size); 3422 3423 return 0; 3424 } 3425 3426 static int cache_preresume(struct dm_target *ti) 3427 { 3428 int r = 0; 3429 struct cache *cache = ti->private; 3430 dm_cblock_t csize = get_cache_dev_size(cache); 3431 3432 /* 3433 * Check to see if the cache has resized. 3434 */ 3435 if (!cache->sized) { 3436 r = resize_cache_dev(cache, csize); 3437 if (r) 3438 return r; 3439 3440 cache->sized = true; 3441 3442 } else if (csize != cache->cache_size) { 3443 if (!can_resize(cache, csize)) 3444 return -EINVAL; 3445 3446 r = resize_cache_dev(cache, csize); 3447 if (r) 3448 return r; 3449 } 3450 3451 if (!cache->loaded_mappings) { 3452 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3453 load_mapping, cache); 3454 if (r) { 3455 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3456 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3457 return r; 3458 } 3459 3460 cache->loaded_mappings = true; 3461 } 3462 3463 if (!cache->loaded_discards) { 3464 struct discard_load_info li; 3465 3466 /* 3467 * The discard bitset could have been resized, or the 3468 * discard block size changed. To be safe we start by 3469 * setting every dblock to not discarded. 
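 *
 * dm_cache_load_discards() below then replays the ranges recorded in
 * the metadata through load_discard()/set_discard_range(), which
 * round them to whole blocks of the current discard block size.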
3470 */ 3471 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3472 3473 discard_load_info_init(cache, &li); 3474 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3475 if (r) { 3476 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3477 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3478 return r; 3479 } 3480 set_discard_range(&li); 3481 3482 cache->loaded_discards = true; 3483 } 3484 3485 return r; 3486 } 3487 3488 static void cache_resume(struct dm_target *ti) 3489 { 3490 struct cache *cache = ti->private; 3491 3492 cache->need_tick_bio = true; 3493 do_waker(&cache->waker.work); 3494 } 3495 3496 /* 3497 * Status format: 3498 * 3499 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3500 * <cache block size> <#used cache blocks>/<#total cache blocks> 3501 * <#read hits> <#read misses> <#write hits> <#write misses> 3502 * <#demotions> <#promotions> <#dirty> 3503 * <#features> <features>* 3504 * <#core args> <core args> 3505 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3506 */ 3507 static void cache_status(struct dm_target *ti, status_type_t type, 3508 unsigned status_flags, char *result, unsigned maxlen) 3509 { 3510 int r = 0; 3511 unsigned i; 3512 ssize_t sz = 0; 3513 dm_block_t nr_free_blocks_metadata = 0; 3514 dm_block_t nr_blocks_metadata = 0; 3515 char buf[BDEVNAME_SIZE]; 3516 struct cache *cache = ti->private; 3517 dm_cblock_t residency; 3518 bool needs_check; 3519 3520 switch (type) { 3521 case STATUSTYPE_INFO: 3522 if (get_cache_mode(cache) == CM_FAIL) { 3523 DMEMIT("Fail"); 3524 break; 3525 } 3526 3527 /* Commit to ensure statistics aren't out-of-date */ 3528 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3529 (void) commit(cache, false); 3530 3531 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3532 if (r) { 3533 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3534 cache_device_name(cache), r); 3535 goto err; 3536 } 3537 3538 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3539 if (r) { 3540 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3541 cache_device_name(cache), r); 3542 goto err; 3543 } 3544 3545 residency = policy_residency(cache->policy); 3546 3547 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3548 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3549 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3550 (unsigned long long)nr_blocks_metadata, 3551 cache->sectors_per_block, 3552 (unsigned long long) from_cblock(residency), 3553 (unsigned long long) from_cblock(cache->cache_size), 3554 (unsigned) atomic_read(&cache->stats.read_hit), 3555 (unsigned) atomic_read(&cache->stats.read_miss), 3556 (unsigned) atomic_read(&cache->stats.write_hit), 3557 (unsigned) atomic_read(&cache->stats.write_miss), 3558 (unsigned) atomic_read(&cache->stats.demotion), 3559 (unsigned) atomic_read(&cache->stats.promotion), 3560 (unsigned long) atomic_read(&cache->nr_dirty)); 3561 3562 if (writethrough_mode(&cache->features)) 3563 DMEMIT("1 writethrough "); 3564 3565 else if (passthrough_mode(&cache->features)) 3566 DMEMIT("1 passthrough "); 3567 3568 else if (writeback_mode(&cache->features)) 3569 DMEMIT("1 writeback "); 3570 3571 else { 3572 DMERR("%s: internal error: unknown io mode: %d", 3573 cache_device_name(cache), (int) cache->features.io_mode); 3574 goto err; 3575 } 3576 3577 DMEMIT("2 migration_threshold %llu ", (unsigned long long) 
cache->migration_threshold); 3578 3579 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3580 if (sz < maxlen) { 3581 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3582 if (r) 3583 DMERR("%s: policy_emit_config_values returned %d", 3584 cache_device_name(cache), r); 3585 } 3586 3587 if (get_cache_mode(cache) == CM_READ_ONLY) 3588 DMEMIT("ro "); 3589 else 3590 DMEMIT("rw "); 3591 3592 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3593 3594 if (r || needs_check) 3595 DMEMIT("needs_check "); 3596 else 3597 DMEMIT("- "); 3598 3599 break; 3600 3601 case STATUSTYPE_TABLE: 3602 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3603 DMEMIT("%s ", buf); 3604 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3605 DMEMIT("%s ", buf); 3606 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3607 DMEMIT("%s", buf); 3608 3609 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3610 DMEMIT(" %s", cache->ctr_args[i]); 3611 if (cache->nr_ctr_args) 3612 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3613 } 3614 3615 return; 3616 3617 err: 3618 DMEMIT("Error"); 3619 } 3620 3621 /* 3622 * A cache block range can take two forms: 3623 * 3624 * i) A single cblock, eg. '3456' 3625 * ii) A begin and end cblock with dots between, eg. 123-234 3626 */ 3627 static int parse_cblock_range(struct cache *cache, const char *str, 3628 struct cblock_range *result) 3629 { 3630 char dummy; 3631 uint64_t b, e; 3632 int r; 3633 3634 /* 3635 * Try and parse form (ii) first. 3636 */ 3637 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3638 if (r < 0) 3639 return r; 3640 3641 if (r == 2) { 3642 result->begin = to_cblock(b); 3643 result->end = to_cblock(e); 3644 return 0; 3645 } 3646 3647 /* 3648 * That didn't work, try form (i). 3649 */ 3650 r = sscanf(str, "%llu%c", &b, &dummy); 3651 if (r < 0) 3652 return r; 3653 3654 if (r == 1) { 3655 result->begin = to_cblock(b); 3656 result->end = to_cblock(from_cblock(result->begin) + 1u); 3657 return 0; 3658 } 3659 3660 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3661 return -EINVAL; 3662 } 3663 3664 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3665 { 3666 uint64_t b = from_cblock(range->begin); 3667 uint64_t e = from_cblock(range->end); 3668 uint64_t n = from_cblock(cache->cache_size); 3669 3670 if (b >= n) { 3671 DMERR("%s: begin cblock out of range: %llu >= %llu", 3672 cache_device_name(cache), b, n); 3673 return -EINVAL; 3674 } 3675 3676 if (e > n) { 3677 DMERR("%s: end cblock out of range: %llu > %llu", 3678 cache_device_name(cache), e, n); 3679 return -EINVAL; 3680 } 3681 3682 if (b >= e) { 3683 DMERR("%s: invalid cblock range: %llu >= %llu", 3684 cache_device_name(cache), b, e); 3685 return -EINVAL; 3686 } 3687 3688 return 0; 3689 } 3690 3691 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3692 { 3693 struct invalidation_request req; 3694 3695 INIT_LIST_HEAD(&req.list); 3696 req.cblocks = range; 3697 atomic_set(&req.complete, 0); 3698 req.err = 0; 3699 init_waitqueue_head(&req.result_wait); 3700 3701 spin_lock(&cache->invalidation_lock); 3702 list_add(&req.list, &cache->invalidation_requests); 3703 spin_unlock(&cache->invalidation_lock); 3704 wake_worker(cache); 3705 3706 wait_event(req.result_wait, atomic_read(&req.complete)); 3707 return req.err; 3708 } 3709 3710 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3711 const char **cblock_ranges) 3712 { 3713 int r = 0; 3714 unsigned i; 3715 struct 
cblock_range range; 3716 3717 if (!passthrough_mode(&cache->features)) { 3718 DMERR("%s: cache has to be in passthrough mode for invalidation", 3719 cache_device_name(cache)); 3720 return -EPERM; 3721 } 3722 3723 for (i = 0; i < count; i++) { 3724 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3725 if (r) 3726 break; 3727 3728 r = validate_cblock_range(cache, &range); 3729 if (r) 3730 break; 3731 3732 /* 3733 * Pass begin and end origin blocks to the worker and wake it. 3734 */ 3735 r = request_invalidation(cache, &range); 3736 if (r) 3737 break; 3738 } 3739 3740 return r; 3741 } 3742 3743 /* 3744 * Supports 3745 * "<key> <value>" 3746 * and 3747 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3748 * 3749 * The key migration_threshold is supported by the cache target core. 3750 */ 3751 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3752 { 3753 struct cache *cache = ti->private; 3754 3755 if (!argc) 3756 return -EINVAL; 3757 3758 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3759 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3760 cache_device_name(cache)); 3761 return -EOPNOTSUPP; 3762 } 3763 3764 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3765 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3766 3767 if (argc != 2) 3768 return -EINVAL; 3769 3770 return set_config_value(cache, argv[0], argv[1]); 3771 } 3772 3773 static int cache_iterate_devices(struct dm_target *ti, 3774 iterate_devices_callout_fn fn, void *data) 3775 { 3776 int r = 0; 3777 struct cache *cache = ti->private; 3778 3779 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3780 if (!r) 3781 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3782 3783 return r; 3784 } 3785 3786 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3787 { 3788 /* 3789 * FIXME: these limits may be incompatible with the cache device 3790 */ 3791 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3792 cache->origin_sectors); 3793 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3794 } 3795 3796 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3797 { 3798 struct cache *cache = ti->private; 3799 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3800 3801 /* 3802 * If the system-determined stacked limits are compatible with the 3803 * cache's blocksize (io_opt is a factor) do not override them. 
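 *
 * For example, with 512 sector (256KB) cache blocks an io_opt of 2048
 * sectors (1MB) is a whole multiple and is left alone, whereas 768
 * sectors (384KB) is not, so io_min and io_opt are both reset to the
 * cache block size.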
3804 */ 3805 if (io_opt_sectors < cache->sectors_per_block || 3806 do_div(io_opt_sectors, cache->sectors_per_block)) { 3807 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3808 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3809 } 3810 set_discard_limits(cache, limits); 3811 } 3812 3813 /*----------------------------------------------------------------*/ 3814 3815 static struct target_type cache_target = { 3816 .name = "cache", 3817 .version = {1, 9, 0}, 3818 .module = THIS_MODULE, 3819 .ctr = cache_ctr, 3820 .dtr = cache_dtr, 3821 .map = cache_map, 3822 .end_io = cache_end_io, 3823 .postsuspend = cache_postsuspend, 3824 .preresume = cache_preresume, 3825 .resume = cache_resume, 3826 .status = cache_status, 3827 .message = cache_message, 3828 .iterate_devices = cache_iterate_devices, 3829 .io_hints = cache_io_hints, 3830 }; 3831 3832 static int __init dm_cache_init(void) 3833 { 3834 int r; 3835 3836 r = dm_register_target(&cache_target); 3837 if (r) { 3838 DMERR("cache target registration failed: %d", r); 3839 return r; 3840 } 3841 3842 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3843 if (!migration_cache) { 3844 dm_unregister_target(&cache_target); 3845 return -ENOMEM; 3846 } 3847 3848 return 0; 3849 } 3850 3851 static void __exit dm_cache_exit(void) 3852 { 3853 dm_unregister_target(&cache_target); 3854 kmem_cache_destroy(migration_cache); 3855 } 3856 3857 module_init(dm_cache_init); 3858 module_exit(dm_cache_exit); 3859 3860 MODULE_DESCRIPTION(DM_NAME " cache target"); 3861 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3862 MODULE_LICENSE("GPL"); 3863