/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

static size_t bitset_size_in_bytes(unsigned nr_entries)
{
	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
}

static unsigned long *alloc_bitset(unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	return vzalloc(s);
}

static void clear_bitset(void *bitset, unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	memset(bitset, 0, s);
}

static void free_bitset(unsigned long *bits)
{
	vfree(bits);
}

/*----------------------------------------------------------------*/

#define PRISON_CELLS 1024
#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be >= 32KB
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)

/*
 * FIXME: the cache is read/write for the time being.
 */
enum cache_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
};

struct cache_features {
	enum cache_mode mode;
	bool write_through:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
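	 *
	 * sectors_per_block_shift is negative when the block size is not a
	 * power of two, in which case block_size_is_power_of_two() fails and
	 * the slower division path is used instead of shifting.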
	 */
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct dm_cache_metadata *cmd;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_writethrough_bios;
	struct list_head quiesced_migrations;
	struct list_head completed_migrations;
	struct list_head need_commit_migrations;
	sector_t migration_threshold;
	atomic_t nr_migrations;
	wait_queue_head_t migration_wait;

	/*
	 * cache_size entries, dirty if set
	 */
	dm_cblock_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	uint32_t discard_block_size; /* a power of 2 times sectors per block */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct worker;

	struct delayed_work waker;
	unsigned long last_commit_jiffies;

	struct dm_bio_prison *prison;
	struct dm_deferred_set *all_io_ds;

	mempool_t *migration_pool;
	struct dm_cache_migration *next_migration;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool quiescing:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	struct cache_stats stats;

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_deferred_entry *all_io_entry;

	/*
	 * writethrough fields.  These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offsetof the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	bio_end_io_t *saved_bi_end_io;
	struct dm_bio_details bio_details;
};

struct dm_cache_migration {
	struct list_head list;
	struct cache *cache;

	unsigned long start_jiffies;
	dm_oblock_t old_oblock;
	dm_oblock_t new_oblock;
	dm_cblock_t cblock;

	bool err:1;
	bool writeback:1;
	bool demote:1;
	bool promote:1;

	struct dm_bio_prison_cell *old_ocell;
	struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations.  We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 */
struct prealloc {
	struct dm_cache_migration *mg;
	struct dm_bio_prison_cell *cell1;
	struct dm_bio_prison_cell *cell2;
};

static void wake_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
{
	/* FIXME: change to use a local slab. */
	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	dm_bio_prison_free_cell(cache->prison, cell);
}

static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
	if (!p->mg) {
		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
		if (!p->mg)
			return -ENOMEM;
	}

	if (!p->cell1) {
		p->cell1 = alloc_prison_cell(cache);
		if (!p->cell1)
			return -ENOMEM;
	}

	if (!p->cell2) {
		p->cell2 = alloc_prison_cell(cache);
		if (!p->cell2)
			return -ENOMEM;
	}

	return 0;
}

static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
{
	if (p->cell2)
		free_prison_cell(cache, p->cell2);

	if (p->cell1)
		free_prison_cell(cache, p->cell1);

	if (p->mg)
		mempool_free(p->mg, cache->migration_pool);
}

static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
{
	struct dm_cache_migration *mg = p->mg;

	BUG_ON(!mg);
	p->mg = NULL;

	return mg;
}

/*
 * You must have a cell within the prealloc struct to return.  If not this
 * function will BUG() rather than returning NULL.
 */
static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
{
	struct dm_bio_prison_cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;

	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else
		BUG();

	return r;
}

/*
 * You can't have more than two cells in a prealloc struct.  BUG() will be
 * called if you try and overfill.
 */
static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
{
	if (!p->cell2)
		p->cell2 = cell;

	else if (!p->cell1)
		p->cell1 = cell;

	else
		BUG();
}

/*----------------------------------------------------------------*/

static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block = from_oblock(oblock);
}

/*
 * The caller hands in a preallocated cell, and a free function for it.
 * The cell will be freed if there's an error, or if it wasn't used because
 * a cell with that key already exists.
358 */ 359 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 360 361 static int bio_detain(struct cache *cache, dm_oblock_t oblock, 362 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 363 cell_free_fn free_fn, void *free_context, 364 struct dm_bio_prison_cell **cell_result) 365 { 366 int r; 367 struct dm_cell_key key; 368 369 build_key(oblock, &key); 370 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 371 if (r) 372 free_fn(free_context, cell_prealloc); 373 374 return r; 375 } 376 377 static int get_cell(struct cache *cache, 378 dm_oblock_t oblock, 379 struct prealloc *structs, 380 struct dm_bio_prison_cell **cell_result) 381 { 382 int r; 383 struct dm_cell_key key; 384 struct dm_bio_prison_cell *cell_prealloc; 385 386 cell_prealloc = prealloc_get_cell(structs); 387 388 build_key(oblock, &key); 389 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 390 if (r) 391 prealloc_put_cell(structs, cell_prealloc); 392 393 return r; 394 } 395 396 /*----------------------------------------------------------------*/ 397 398 static bool is_dirty(struct cache *cache, dm_cblock_t b) 399 { 400 return test_bit(from_cblock(b), cache->dirty_bitset); 401 } 402 403 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 404 { 405 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 406 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1); 407 policy_set_dirty(cache->policy, oblock); 408 } 409 } 410 411 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 412 { 413 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 414 policy_clear_dirty(cache->policy, oblock); 415 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1); 416 if (!from_cblock(cache->nr_dirty)) 417 dm_table_event(cache->ti->table); 418 } 419 } 420 421 /*----------------------------------------------------------------*/ 422 static bool block_size_is_power_of_two(struct cache *cache) 423 { 424 return cache->sectors_per_block_shift >= 0; 425 } 426 427 static dm_block_t block_div(dm_block_t b, uint32_t n) 428 { 429 do_div(b, n); 430 431 return b; 432 } 433 434 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 435 { 436 uint32_t discard_blocks = cache->discard_block_size; 437 dm_block_t b = from_oblock(oblock); 438 439 if (!block_size_is_power_of_two(cache)) 440 discard_blocks = discard_blocks / cache->sectors_per_block; 441 else 442 discard_blocks >>= cache->sectors_per_block_shift; 443 444 b = block_div(b, discard_blocks); 445 446 return to_dblock(b); 447 } 448 449 static void set_discard(struct cache *cache, dm_dblock_t b) 450 { 451 unsigned long flags; 452 453 atomic_inc(&cache->stats.discard_count); 454 455 spin_lock_irqsave(&cache->lock, flags); 456 set_bit(from_dblock(b), cache->discard_bitset); 457 spin_unlock_irqrestore(&cache->lock, flags); 458 } 459 460 static void clear_discard(struct cache *cache, dm_dblock_t b) 461 { 462 unsigned long flags; 463 464 spin_lock_irqsave(&cache->lock, flags); 465 clear_bit(from_dblock(b), cache->discard_bitset); 466 spin_unlock_irqrestore(&cache->lock, flags); 467 } 468 469 static bool is_discarded(struct cache *cache, dm_dblock_t b) 470 { 471 int r; 472 unsigned long flags; 473 474 spin_lock_irqsave(&cache->lock, flags); 475 r = test_bit(from_dblock(b), cache->discard_bitset); 476 spin_unlock_irqrestore(&cache->lock, flags); 477 478 return r; 479 } 480 481 static bool is_discarded_oblock(struct cache 
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static size_t get_per_bio_data_size(struct cache *cache)
{
	return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}

static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = get_per_bio_data(bio, data_size);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->all_io_entry = NULL;

	return pb;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio->bi_bdev = cache->origin_dev->bdev;
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_sector;

	bio->bi_bdev = cache->cache_dev->bdev;
	if (!block_size_is_power_of_two(cache))
		bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
				sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
				(bi_sector & (cache->sectors_per_block - 1));
}

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	spin_lock_irqsave(&cache->lock, flags);
	if (cache->need_tick_bio &&
	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}

static int bio_triggers_commit(struct cache *cache, struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}

static void issue(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	if (!bio_triggers_commit(cache, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in do_worker().
	 */
	spin_lock_irqsave(&cache->lock, flags);
	cache->commit_requested = true;
	bio_list_add(&cache->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_writethrough_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void writethrough_endio(struct bio *bio, int err)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
	bio->bi_end_io = pb->saved_bi_end_io;

	if (err) {
		bio_endio(bio, err);
		return;
	}

	dm_bio_restore(&pb->bio_details, bio);
	remap_to_cache(pb->cache, bio, pb->cblock);

	/*
	 * We can't issue this bio directly, since we're in interrupt
	 * context.  So it gets put on a bio list for processing by the
	 * worker thread.
	 */
	defer_writethrough_bio(pb->cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices.  In future we'd like to clone the
 * bio and send them in parallel, but for now we're doing them in
 * series as this is easier.
 */
static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
				       dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	pb->cache = cache;
	pb->cblock = cblock;
	pb->saved_bi_end_io = bio->bi_end_io;
	dm_bio_record(&pb->bio_details, bio);
	bio->bi_end_io = writethrough_endio;

	remap_to_origin_clear_discard(pb->cache, bio, oblock);
}

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
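 *
 * The writeback/demote/promote flags of struct dm_cache_migration describe
 * the operation:
 *
 *   writeback          - copy a dirty cache block back to the origin,
 *			  leaving the mapping in place
 *   demote             - remove a mapping, copying the cache block back to
 *			  the origin
 *   demote + promote   - replace an existing mapping with a new origin block
 *   promote            - copy an origin block into a newly allocated cache
 *			  block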
 *--------------------------------------------------------------*/
static void free_migration(struct dm_cache_migration *mg)
{
	mempool_free(mg, mg->cache->migration_pool);
}

static void inc_nr_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_migrations);
}

static void dec_nr_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_migrations);

	/*
	 * Wake the worker in case we're suspending the target.
	 */
	wake_up(&cache->migration_wait);
}

static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
			 bool holder)
{
	(holder ? dm_cell_release : dm_cell_release_no_holder)
		(cache->prison, cell, &cache->deferred_bios);
	free_prison_cell(cache, cell);
}

static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
		       bool holder)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	__cell_defer(cache, cell, holder);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void cleanup_migration(struct dm_cache_migration *mg)
{
	dec_nr_migrations(mg->cache);
	free_migration(mg);
}

static void migration_failure(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN_LIMIT("writeback failed; couldn't copy block");
		set_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);

	} else if (mg->demote) {
		DMWARN_LIMIT("demotion failed; couldn't copy block");
		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);

		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
		if (mg->promote)
			cell_defer(cache, mg->new_ocell, 1);
	} else {
		DMWARN_LIMIT("promotion failed; couldn't copy block");
		policy_remove_mapping(cache->policy, mg->new_oblock);
		cell_defer(cache, mg->new_ocell, 1);
	}

	cleanup_migration(mg);
}

static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		cell_defer(cache, mg->old_ocell, false);
		clear_dirty(cache, mg->old_oblock, mg->cblock);
		cleanup_migration(mg);
		return;

	} else if (mg->demote) {
		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
			policy_force_mapping(cache->policy, mg->new_oblock,
					     mg->old_oblock);
			if (mg->promote)
				cell_defer(cache, mg->new_ocell, true);
			cleanup_migration(mg);
			return;
		}
	} else {
		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
			policy_remove_mapping(cache->policy, mg->new_oblock);
			cleanup_migration(mg);
			return;
		}
	}

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->need_commit_migrations);
	cache->commit_requested = true;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void migration_success_post_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN("writeback unexpectedly triggered commit");
		return;

	} else if (mg->demote) {
		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);

		if (mg->promote) {
			mg->demote = false;

			spin_lock_irqsave(&cache->lock, flags);
			list_add_tail(&mg->list, &cache->quiesced_migrations);
			spin_unlock_irqrestore(&cache->lock, flags);

		} else
			cleanup_migration(mg);

	} else {
		cell_defer(cache, mg->new_ocell, true);
		clear_dirty(cache, mg->new_oblock, mg->cblock);
		cleanup_migration(mg);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
	struct cache *cache = mg->cache;

	if (read_err || write_err)
		mg->err = true;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_copy_real(struct dm_cache_migration *mg)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;

	o_region.bdev = cache->origin_dev->bdev;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (mg->writeback || mg->demote) {
		/* demote */
		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
	} else {
		/* promote */
		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
	}

	if (r < 0)
		migration_failure(mg);
}

static void avoid_copy(struct dm_cache_migration *mg)
{
	atomic_inc(&mg->cache->stats.copies_avoided);
	migration_success_pre_commit(mg);
}

static void issue_copy(struct dm_cache_migration *mg)
{
	bool avoid;
	struct cache *cache = mg->cache;

	if (mg->writeback || mg->demote)
		avoid = !is_dirty(cache, mg->cblock) ||
			is_discarded_oblock(cache, mg->old_oblock);
	else
		avoid = is_discarded_oblock(cache, mg->new_oblock);

	avoid ? avoid_copy(mg) : issue_copy_real(mg);
}

static void complete_migration(struct dm_cache_migration *mg)
{
	if (mg->err)
		migration_failure(mg);
	else
		migration_success_pre_commit(mg);
}

static void process_migrations(struct cache *cache, struct list_head *head,
			       void (*fn)(struct dm_cache_migration *))
{
	unsigned long flags;
	struct list_head list;
	struct dm_cache_migration *mg, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(head, &list);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(mg, tmp, &list, list)
		fn(mg);
}

static void __queue_quiesced_migration(struct dm_cache_migration *mg)
{
	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
}

static void queue_quiesced_migration(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	spin_lock_irqsave(&cache->lock, flags);
	__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
{
	unsigned long flags;
	struct dm_cache_migration *mg, *tmp;

	spin_lock_irqsave(&cache->lock, flags);
	list_for_each_entry_safe(mg, tmp, work, list)
		__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void check_for_quiesced_migrations(struct cache *cache,
					  struct per_bio_data *pb)
{
	struct list_head work;

	if (!pb->all_io_entry)
		return;

	INIT_LIST_HEAD(&work);
	if (pb->all_io_entry)
		dm_deferred_entry_dec(pb->all_io_entry, &work);

	if (!list_empty(&work))
		queue_quiesced_migrations(cache, &work);
}

static void quiesce_migration(struct dm_cache_migration *mg)
{
	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
		queue_quiesced_migration(mg);
}

static void promote(struct cache *cache, struct prealloc *structs,
		    dm_oblock_t oblock, dm_cblock_t cblock,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = true;
	mg->cache = cache;
	mg->new_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

static void writeback(struct cache *cache, struct prealloc *structs,
		      dm_oblock_t oblock, dm_cblock_t cblock,
		      struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->writeback = true;
	mg->demote = false;
	mg->promote = false;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				dm_cblock_t cblock,
				struct dm_bio_prison_cell *old_ocell,
				struct dm_bio_prison_cell *new_ocell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(bio->bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue(cache, bio);
}

/*
 * People generally discard large parts of a device, e.g. the whole device
 * when formatting.  Splitting these large discards up into cache block
 * sized ios and then quiescing (always necessary for discard) takes too
 * long.
 *
 * We keep it simple, and allow any size of discard to come in, and just
 * mark off blocks on the discard bitset.  No passdown occurs!
 *
 * To implement passdown we need to change the bio_prison such that a cell
 * can have a key that spans many blocks.
 */
static void process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
						  cache->discard_block_size);
	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
	dm_block_t b;

	end_block = block_div(end_block, cache->discard_block_size);

	for (b = start_block; b < end_block; b++)
		set_discard(cache, to_dblock(b));

	bio_endio(bio, 0);
}

static bool spare_migration_bandwidth(struct cache *cache)
{
	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
		cache->sectors_per_block;
	return current_volume < cache->migration_threshold;
}

static bool is_writethrough_io(struct cache *cache, struct bio *bio,
			       dm_cblock_t cblock)
{
	return bio_data_dir(bio) == WRITE &&
		cache->features.write_through && !is_dirty(cache, cblock);
}

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_miss : &cache->stats.write_miss);
}

static void process_bio(struct cache *cache, struct prealloc *structs,
			struct bio *bio)
{
	int r;
	bool release_cell = true;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
	struct policy_result lookup_result;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
	bool discarded_block = is_discarded_oblock(cache, block);
	bool can_migrate = discarded_block || spare_migration_bandwidth(cache);

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain(cache, block, bio, cell_prealloc,
		       (cell_free_fn) prealloc_put_cell,
		       structs, &new_ocell);
	if (r > 0)
		return;

	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
		       bio, &lookup_result);

	if (r == -EWOULDBLOCK)
		/* migration has been denied */
		lookup_result.op = POLICY_MISS;

	switch (lookup_result.op) {
	case POLICY_HIT:
		inc_hit_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (is_writethrough_io(cache, bio, lookup_result.cblock))
			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
		else
			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);

		issue(cache, bio);
		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
		remap_to_origin_clear_discard(cache, bio, block);
		issue(cache, bio);
		break;

	case POLICY_NEW:
		atomic_inc(&cache->stats.promotion);
		promote(cache, structs, block, lookup_result.cblock, new_ocell);
		release_cell = false;
		break;

	case POLICY_REPLACE:
		cell_prealloc = prealloc_get_cell(structs);
		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
			       (cell_free_fn) prealloc_put_cell,
			       structs, &old_ocell);
		if (r > 0) {
			/*
			 * We have to be careful to avoid lock inversion of
			 * the cells.  So we back off, and wait for the
			 * old_ocell to become free.
			 */
			policy_force_mapping(cache->policy, block,
					     lookup_result.old_oblock);
			atomic_inc(&cache->stats.cache_cell_clash);
			break;
		}
		atomic_inc(&cache->stats.demotion);
		atomic_inc(&cache->stats.promotion);

		demote_then_promote(cache, structs, lookup_result.old_oblock,
				    block, lookup_result.cblock,
				    old_ocell, new_ocell);
		release_cell = false;
		break;

	default:
		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
	}

	if (release_cell)
		cell_defer(cache, new_ocell, false);
}

static int need_commit_due_to_time(struct cache *cache)
{
	return jiffies < cache->last_commit_jiffies ||
	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
}

static int commit_if_needed(struct cache *cache)
{
	if (dm_cache_changed_this_transaction(cache->cmd) &&
	    (cache->commit_requested || need_commit_due_to_time(cache))) {
		atomic_inc(&cache->stats.commit_count);
		cache->last_commit_jiffies = jiffies;
		cache->commit_requested = false;
		return dm_cache_commit(cache->cmd, false);
	}

	return 0;
}

static void process_deferred_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;
	struct prealloc structs;

	memset(&structs, 0, sizeof(structs));
	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while (!bio_list_empty(&bios)) {
		/*
		 * If we've got no free migration structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (prealloc_data_structs(cache, &structs)) {
			spin_lock_irqsave(&cache->lock, flags);
			bio_list_merge(&cache->deferred_bios, &bios);
			spin_unlock_irqrestore(&cache->lock, flags);
			break;
		}

		bio = bio_list_pop(&bios);

		if (bio->bi_rw & REQ_FLUSH)
			process_flush_bio(cache, bio);
		else if (bio->bi_rw & REQ_DISCARD)
			process_discard_bio(cache, bio);
		else
			process_bio(cache, &structs, bio);
	}

	prealloc_free_structs(cache, &structs);
}

static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_flush_bios);
	bio_list_init(&cache->deferred_flush_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}

static void process_deferred_writethrough_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
	bio_list_init(&cache->deferred_writethrough_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void writeback_some_dirty_blocks(struct cache *cache)
{
	int r = 0;
	dm_oblock_t oblock;
	dm_cblock_t cblock;
	struct prealloc structs;
	struct dm_bio_prison_cell *old_ocell;

	memset(&structs, 0, sizeof(structs));

	while (spare_migration_bandwidth(cache)) {
		if (prealloc_data_structs(cache, &structs))
			break;

		r = policy_writeback_work(cache->policy, &oblock, &cblock);
		if (r)
			break;

		r = get_cell(cache, oblock, &structs, &old_ocell);
		if (r) {
			policy_set_dirty(cache->policy, oblock);
			break;
		}

		writeback(cache, &structs, oblock, cblock, old_ocell);
	}

	prealloc_free_structs(cache, &structs);
}

/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/
static void start_quiescing(struct cache *cache)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	cache->quiescing = 1;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void stop_quiescing(struct cache *cache)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	cache->quiescing = 0;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_quiescing(struct cache *cache)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = cache->quiescing;
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static void wait_for_migrations(struct cache *cache)
{
	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
}

static void stop_worker(struct cache *cache)
{
	cancel_delayed_work(&cache->waker);
	flush_workqueue(cache->wq);
}

static void requeue_deferred_io(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);

	while ((bio = bio_list_pop(&bios)))
		bio_endio(bio, DM_ENDIO_REQUEUE);
}

static int more_work(struct cache *cache)
{
	if (is_quiescing(cache))
		return !list_empty(&cache->quiesced_migrations) ||
			!list_empty(&cache->completed_migrations) ||
			!list_empty(&cache->need_commit_migrations);
	else
		return !bio_list_empty(&cache->deferred_bios) ||
			!bio_list_empty(&cache->deferred_flush_bios) ||
			!bio_list_empty(&cache->deferred_writethrough_bios) ||
			!list_empty(&cache->quiesced_migrations) ||
			!list_empty(&cache->completed_migrations) ||
			!list_empty(&cache->need_commit_migrations);
}

static void do_worker(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, worker);

	do {
		if (!is_quiescing(cache))
			process_deferred_bios(cache);

		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
		process_migrations(cache, &cache->completed_migrations, complete_migration);

		writeback_some_dirty_blocks(cache);

		process_deferred_writethrough_bios(cache);

		if (commit_if_needed(cache)) {
			process_deferred_flush_bios(cache, false);

			/*
			 * FIXME: rollback metadata or just go into a
			 * failure mode and error everything
			 */
		} else {
			process_deferred_flush_bios(cache, true);
			process_migrations(cache, &cache->need_commit_migrations,
					   migration_success_post_commit);
		}
	} while (more_work(cache));
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
	wake_worker(cache);
	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static int is_congested(struct dm_dev *dev, int bdi_bits)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);
	return bdi_congested(&q->backing_dev_info, bdi_bits);
}

static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct cache *cache = container_of(cb, struct cache, callbacks);

	return is_congested(cache->origin_dev, bdi_bits) ||
		is_congested(cache->cache_dev, bdi_bits);
}

/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/

/*
 * This function gets called on the error paths of the constructor, so we
 * have to cope with a partially initialised struct.
1475 */ 1476 static void destroy(struct cache *cache) 1477 { 1478 unsigned i; 1479 1480 if (cache->next_migration) 1481 mempool_free(cache->next_migration, cache->migration_pool); 1482 1483 if (cache->migration_pool) 1484 mempool_destroy(cache->migration_pool); 1485 1486 if (cache->all_io_ds) 1487 dm_deferred_set_destroy(cache->all_io_ds); 1488 1489 if (cache->prison) 1490 dm_bio_prison_destroy(cache->prison); 1491 1492 if (cache->wq) 1493 destroy_workqueue(cache->wq); 1494 1495 if (cache->dirty_bitset) 1496 free_bitset(cache->dirty_bitset); 1497 1498 if (cache->discard_bitset) 1499 free_bitset(cache->discard_bitset); 1500 1501 if (cache->copier) 1502 dm_kcopyd_client_destroy(cache->copier); 1503 1504 if (cache->cmd) 1505 dm_cache_metadata_close(cache->cmd); 1506 1507 if (cache->metadata_dev) 1508 dm_put_device(cache->ti, cache->metadata_dev); 1509 1510 if (cache->origin_dev) 1511 dm_put_device(cache->ti, cache->origin_dev); 1512 1513 if (cache->cache_dev) 1514 dm_put_device(cache->ti, cache->cache_dev); 1515 1516 if (cache->policy) 1517 dm_cache_policy_destroy(cache->policy); 1518 1519 for (i = 0; i < cache->nr_ctr_args ; i++) 1520 kfree(cache->ctr_args[i]); 1521 kfree(cache->ctr_args); 1522 1523 kfree(cache); 1524 } 1525 1526 static void cache_dtr(struct dm_target *ti) 1527 { 1528 struct cache *cache = ti->private; 1529 1530 destroy(cache); 1531 } 1532 1533 static sector_t get_dev_size(struct dm_dev *dev) 1534 { 1535 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 1536 } 1537 1538 /*----------------------------------------------------------------*/ 1539 1540 /* 1541 * Construct a cache device mapping. 1542 * 1543 * cache <metadata dev> <cache dev> <origin dev> <block size> 1544 * <#feature args> [<feature arg>]* 1545 * <policy> <#policy args> [<policy arg>]* 1546 * 1547 * metadata dev : fast device holding the persistent metadata 1548 * cache dev : fast device holding cached data blocks 1549 * origin dev : slow device holding original data blocks 1550 * block size : cache unit size in sectors 1551 * 1552 * #feature args : number of feature arguments passed 1553 * feature args : writethrough. (The default is writeback.) 1554 * 1555 * policy : the replacement policy to use 1556 * #policy args : an even number of policy arguments corresponding 1557 * to key/value pairs passed to the policy 1558 * policy args : key/value pairs passed to the policy 1559 * E.g. 'sequential_threshold 1024' 1560 * See cache-policies.txt for details. 1561 * 1562 * Optional feature arguments are: 1563 * writethrough : write through caching that prohibits cache block 1564 * content from being different from origin block content. 1565 * Without this argument, the default behaviour is to write 1566 * back cache block contents later for performance reasons, 1567 * so they may differ from the corresponding origin blocks. 
1568 */ 1569 struct cache_args { 1570 struct dm_target *ti; 1571 1572 struct dm_dev *metadata_dev; 1573 1574 struct dm_dev *cache_dev; 1575 sector_t cache_sectors; 1576 1577 struct dm_dev *origin_dev; 1578 sector_t origin_sectors; 1579 1580 uint32_t block_size; 1581 1582 const char *policy_name; 1583 int policy_argc; 1584 const char **policy_argv; 1585 1586 struct cache_features features; 1587 }; 1588 1589 static void destroy_cache_args(struct cache_args *ca) 1590 { 1591 if (ca->metadata_dev) 1592 dm_put_device(ca->ti, ca->metadata_dev); 1593 1594 if (ca->cache_dev) 1595 dm_put_device(ca->ti, ca->cache_dev); 1596 1597 if (ca->origin_dev) 1598 dm_put_device(ca->ti, ca->origin_dev); 1599 1600 kfree(ca); 1601 } 1602 1603 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 1604 { 1605 if (!as->argc) { 1606 *error = "Insufficient args"; 1607 return false; 1608 } 1609 1610 return true; 1611 } 1612 1613 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 1614 char **error) 1615 { 1616 int r; 1617 sector_t metadata_dev_size; 1618 char b[BDEVNAME_SIZE]; 1619 1620 if (!at_least_one_arg(as, error)) 1621 return -EINVAL; 1622 1623 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1624 &ca->metadata_dev); 1625 if (r) { 1626 *error = "Error opening metadata device"; 1627 return r; 1628 } 1629 1630 metadata_dev_size = get_dev_size(ca->metadata_dev); 1631 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 1632 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1633 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1634 1635 return 0; 1636 } 1637 1638 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 1639 char **error) 1640 { 1641 int r; 1642 1643 if (!at_least_one_arg(as, error)) 1644 return -EINVAL; 1645 1646 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1647 &ca->cache_dev); 1648 if (r) { 1649 *error = "Error opening cache device"; 1650 return r; 1651 } 1652 ca->cache_sectors = get_dev_size(ca->cache_dev); 1653 1654 return 0; 1655 } 1656 1657 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 1658 char **error) 1659 { 1660 int r; 1661 1662 if (!at_least_one_arg(as, error)) 1663 return -EINVAL; 1664 1665 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1666 &ca->origin_dev); 1667 if (r) { 1668 *error = "Error opening origin device"; 1669 return r; 1670 } 1671 1672 ca->origin_sectors = get_dev_size(ca->origin_dev); 1673 if (ca->ti->len > ca->origin_sectors) { 1674 *error = "Device size larger than cached device"; 1675 return -EINVAL; 1676 } 1677 1678 return 0; 1679 } 1680 1681 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 1682 char **error) 1683 { 1684 unsigned long tmp; 1685 1686 if (!at_least_one_arg(as, error)) 1687 return -EINVAL; 1688 1689 if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp || 1690 tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1691 tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 1692 *error = "Invalid data block size"; 1693 return -EINVAL; 1694 } 1695 1696 if (tmp > ca->cache_sectors) { 1697 *error = "Data block size is larger than the cache device"; 1698 return -EINVAL; 1699 } 1700 1701 ca->block_size = tmp; 1702 1703 return 0; 1704 } 1705 1706 static void init_features(struct cache_features *cf) 1707 { 1708 cf->mode = CM_WRITE; 1709 cf->write_through = false; 1710 } 1711 1712 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 1713 char 
			  char **error)
{
	static struct dm_arg _args[] = {
		{0, 1, "Invalid number of cache feature arguments"},
	};

	int r;
	unsigned argc;
	const char *arg;
	struct cache_features *cf = &ca->features;

	init_features(cf);

	r = dm_read_arg_group(_args, as, &argc, error);
	if (r)
		return -EINVAL;

	while (argc--) {
		arg = dm_shift_arg(as);

		if (!strcasecmp(arg, "writeback"))
			cf->write_through = false;

		else if (!strcasecmp(arg, "writethrough"))
			cf->write_through = true;

		else {
			*error = "Unrecognised cache feature requested";
			return -EINVAL;
		}
	}

	return 0;
}

static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
			char **error)
{
	static struct dm_arg _args[] = {
		{0, 1024, "Invalid number of policy arguments"},
	};

	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	ca->policy_name = dm_shift_arg(as);

	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
	if (r)
		return -EINVAL;

	ca->policy_argv = (const char **)as->argv;
	dm_consume_args(as, ca->policy_argc);

	return 0;
}

static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
			    char **error)
{
	int r;
	struct dm_arg_set as;

	as.argc = argc;
	as.argv = argv;

	r = parse_metadata_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_cache_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_origin_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_block_size(ca, &as, error);
	if (r)
		return r;

	r = parse_features(ca, &as, error);
	if (r)
		return r;

	r = parse_policy(ca, &as, error);
	if (r)
		return r;

	return 0;
}

/*----------------------------------------------------------------*/

static struct kmem_cache *migration_cache;

static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
{
	int r = 0;

	if (argc & 1) {
		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
		return -EINVAL;
	}

	while (argc) {
		r = policy_set_config_value(p, argv[0], argv[1]);
		if (r) {
			DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
			       argv[0], argv[1]);
			return r;
		}

		argc -= 2;
		argv += 2;
	}

	return r;
}

static int create_cache_policy(struct cache *cache, struct cache_args *ca,
			       char **error)
{
	int r;

	cache->policy = dm_cache_policy_create(ca->policy_name,
					       cache->cache_size,
					       cache->origin_sectors,
					       cache->sectors_per_block);
	if (!cache->policy) {
		*error = "Error creating cache's policy";
		return -ENOMEM;
	}

	r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
	if (r) {
		*error = "Error setting cache policy's config values";
		dm_cache_policy_destroy(cache->policy);
		cache->policy = NULL;
	}

	return r;
}

/*
 * We want the discard block size to be a power of two, at least the size
 * of the cache block size, and have no more than 2^14 discard blocks
 * across the origin.
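 *
 * For example, 512 sector (256 KiB) cache blocks on a 2 TiB (2^32 sector)
 * origin start with a 512 sector discard block size, which is then doubled
 * up to 2^18 sectors (128 MiB), giving exactly 2^14 discard blocks.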
1864 */ 1865 #define MAX_DISCARD_BLOCKS (1 << 14) 1866 1867 static bool too_many_discard_blocks(sector_t discard_block_size, 1868 sector_t origin_size) 1869 { 1870 (void) sector_div(origin_size, discard_block_size); 1871 1872 return origin_size > MAX_DISCARD_BLOCKS; 1873 } 1874 1875 static sector_t calculate_discard_block_size(sector_t cache_block_size, 1876 sector_t origin_size) 1877 { 1878 sector_t discard_block_size; 1879 1880 discard_block_size = roundup_pow_of_two(cache_block_size); 1881 1882 if (origin_size) 1883 while (too_many_discard_blocks(discard_block_size, origin_size)) 1884 discard_block_size *= 2; 1885 1886 return discard_block_size; 1887 } 1888 1889 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) 1890 1891 static int cache_create(struct cache_args *ca, struct cache **result) 1892 { 1893 int r = 0; 1894 char **error = &ca->ti->error; 1895 struct cache *cache; 1896 struct dm_target *ti = ca->ti; 1897 dm_block_t origin_blocks; 1898 struct dm_cache_metadata *cmd; 1899 bool may_format = ca->features.mode == CM_WRITE; 1900 1901 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 1902 if (!cache) 1903 return -ENOMEM; 1904 1905 cache->ti = ca->ti; 1906 ti->private = cache; 1907 ti->num_flush_bios = 2; 1908 ti->flush_supported = true; 1909 1910 ti->num_discard_bios = 1; 1911 ti->discards_supported = true; 1912 ti->discard_zeroes_data_unsupported = true; 1913 1914 memcpy(&cache->features, &ca->features, sizeof(cache->features)); 1915 ti->per_bio_data_size = get_per_bio_data_size(cache); 1916 1917 cache->callbacks.congested_fn = cache_is_congested; 1918 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 1919 1920 cache->metadata_dev = ca->metadata_dev; 1921 cache->origin_dev = ca->origin_dev; 1922 cache->cache_dev = ca->cache_dev; 1923 1924 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 1925 1926 /* FIXME: factor out this whole section */ 1927 origin_blocks = cache->origin_sectors = ca->origin_sectors; 1928 origin_blocks = block_div(origin_blocks, ca->block_size); 1929 cache->origin_blocks = to_oblock(origin_blocks); 1930 1931 cache->sectors_per_block = ca->block_size; 1932 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 1933 r = -EINVAL; 1934 goto bad; 1935 } 1936 1937 if (ca->block_size & (ca->block_size - 1)) { 1938 dm_block_t cache_size = ca->cache_sectors; 1939 1940 cache->sectors_per_block_shift = -1; 1941 cache_size = block_div(cache_size, ca->block_size); 1942 cache->cache_size = to_cblock(cache_size); 1943 } else { 1944 cache->sectors_per_block_shift = __ffs(ca->block_size); 1945 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); 1946 } 1947 1948 r = create_cache_policy(cache, ca, error); 1949 if (r) 1950 goto bad; 1951 cache->policy_nr_args = ca->policy_argc; 1952 1953 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 1954 ca->block_size, may_format, 1955 dm_cache_policy_get_hint_size(cache->policy)); 1956 if (IS_ERR(cmd)) { 1957 *error = "Error creating metadata object"; 1958 r = PTR_ERR(cmd); 1959 goto bad; 1960 } 1961 cache->cmd = cmd; 1962 1963 spin_lock_init(&cache->lock); 1964 bio_list_init(&cache->deferred_bios); 1965 bio_list_init(&cache->deferred_flush_bios); 1966 bio_list_init(&cache->deferred_writethrough_bios); 1967 INIT_LIST_HEAD(&cache->quiesced_migrations); 1968 INIT_LIST_HEAD(&cache->completed_migrations); 1969 INIT_LIST_HEAD(&cache->need_commit_migrations); 1970 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 1971 atomic_set(&cache->nr_migrations, 0); 1972 
	init_waitqueue_head(&cache->migration_wait);

	cache->nr_dirty = 0;
	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
	if (!cache->dirty_bitset) {
		*error = "could not allocate dirty bitset";
		r = -ENOMEM;
		goto bad;
	}
	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));

	cache->discard_block_size =
		calculate_discard_block_size(cache->sectors_per_block,
					     cache->origin_sectors);
	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
	if (!cache->discard_bitset) {
		*error = "could not allocate discard bitset";
		r = -ENOMEM;
		goto bad;
	}
	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));

	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(cache->copier)) {
		*error = "could not create kcopyd client";
		r = PTR_ERR(cache->copier);
		goto bad;
	}

	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!cache->wq) {
		*error = "could not create workqueue for metadata object";
		r = -ENOMEM;
		goto bad;
	}
	INIT_WORK(&cache->worker, do_worker);
	INIT_DELAYED_WORK(&cache->waker, do_waker);
	cache->last_commit_jiffies = jiffies;

	cache->prison = dm_bio_prison_create(PRISON_CELLS);
	if (!cache->prison) {
		*error = "could not create bio prison";
		r = -ENOMEM;
		goto bad;
	}

	cache->all_io_ds = dm_deferred_set_create();
	if (!cache->all_io_ds) {
		*error = "could not create all_io deferred set";
		r = -ENOMEM;
		goto bad;
	}

	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
							 migration_cache);
	if (!cache->migration_pool) {
		*error = "Error creating cache's migration mempool";
		r = -ENOMEM;
		goto bad;
	}

	cache->next_migration = NULL;

	cache->need_tick_bio = true;
	cache->sized = false;
	cache->quiescing = false;
	cache->commit_requested = false;
	cache->loaded_mappings = false;
	cache->loaded_discards = false;

	load_stats(cache);

	atomic_set(&cache->stats.demotion, 0);
	atomic_set(&cache->stats.promotion, 0);
	atomic_set(&cache->stats.copies_avoided, 0);
	atomic_set(&cache->stats.cache_cell_clash, 0);
	atomic_set(&cache->stats.commit_count, 0);
	atomic_set(&cache->stats.discard_count, 0);

	*result = cache;
	return 0;

bad:
	destroy(cache);
	return r;
}

static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
{
	unsigned i;
	const char **copy;

	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
	if (!copy)
		return -ENOMEM;
	for (i = 0; i < argc; i++) {
		copy[i] = kstrdup(argv[i], GFP_KERNEL);
		if (!copy[i]) {
			while (i--)
				kfree(copy[i]);
			kfree(copy);
			return -ENOMEM;
		}
	}

	cache->nr_ctr_args = argc;
	cache->ctr_args = copy;

	return 0;
}

static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct cache_args *ca;
	struct cache *cache = NULL;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca) {
		ti->error = "Error allocating memory for cache";
		return -ENOMEM;
	}
	ca->ti = ti;

	r = parse_cache_args(ca, argc, argv, &ti->error);
	if (r)
		goto out;

	r = cache_create(ca, &cache);
	if (r)
		goto out;

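	/*
	 * Save the ctr args that follow the three device path arguments so
	 * the original table line can be regurgitated for status (see the
	 * comment above nr_ctr_args in struct cache).
	 */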
	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
	if (r) {
		destroy(cache);
		goto out;
	}

	ti->private = cache;

out:
	destroy_cache_args(ca);
	return r;
}

static int cache_map(struct dm_target *ti, struct bio *bio)
{
	struct cache *cache = ti->private;

	int r;
	dm_oblock_t block = get_bio_block(cache, bio);
	size_t pb_data_size = get_per_bio_data_size(cache);
	bool can_migrate = false;
	bool discarded_block;
	struct dm_bio_prison_cell *cell;
	struct policy_result lookup_result;
	struct per_bio_data *pb;

	if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
		/*
		 * This can only occur if the io goes to a partial block at
		 * the end of the origin device.  We don't cache these.
		 * Just remap to the origin and carry on.
		 */
		remap_to_origin_clear_discard(cache, bio, block);
		return DM_MAPIO_REMAPPED;
	}

	pb = init_per_bio_data(bio, pb_data_size);

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell = alloc_prison_cell(cache);
	if (!cell) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = bio_detain(cache, block, bio, cell,
		       (cell_free_fn) free_prison_cell,
		       cache, &cell);
	if (r) {
		if (r < 0)
			defer_bio(cache, bio);

		return DM_MAPIO_SUBMITTED;
	}

	discarded_block = is_discarded_oblock(cache, block);

	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
		       bio, &lookup_result);
	if (r == -EWOULDBLOCK) {
		cell_defer(cache, cell, true);
		return DM_MAPIO_SUBMITTED;

	} else if (r) {
		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	switch (lookup_result.op) {
	case POLICY_HIT:
		inc_hit_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (is_writethrough_io(cache, bio, lookup_result.cblock))
			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
		else
			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);

		cell_defer(cache, cell, false);
		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (pb->req_nr != 0) {
			/*
			 * This is a duplicate writethrough io that is no
			 * longer needed because the block has been demoted.
			 */
			bio_endio(bio, 0);
			cell_defer(cache, cell, false);
			return DM_MAPIO_SUBMITTED;
		} else {
			remap_to_origin_clear_discard(cache, bio, block);
			cell_defer(cache, cell, false);
		}
		break;

	default:
		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	return DM_MAPIO_REMAPPED;
}

static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
	struct cache *cache = ti->private;
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (pb->tick) {
		policy_tick(cache->policy);

		spin_lock_irqsave(&cache->lock, flags);
		cache->need_tick_bio = true;
		spin_unlock_irqrestore(&cache->lock, flags);
	}

	check_for_quiesced_migrations(cache, pb);

	return 0;
}

static int write_dirty_bitset(struct cache *cache)
{
	int r;
	unsigned i;

	for (i = 0; i < from_cblock(cache->cache_size); i++) {
		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				       is_dirty(cache, to_cblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int write_discard_bitset(struct cache *cache)
{
	int r;
	unsigned i;

	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
					   cache->discard_nr_blocks);
	if (r) {
		DMERR("could not resize on-disk discard bitset");
		return r;
	}

	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
					 is_discarded(cache, to_dblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
		     uint32_t hint)
{
	struct cache *cache = context;
	return dm_cache_save_hint(cache->cmd, cblock, hint);
}

static int write_hints(struct cache *cache)
{
	int r;

	r = dm_cache_begin_hints(cache->cmd, cache->policy);
	if (r) {
		DMERR("dm_cache_begin_hints failed");
		return r;
	}

	r = policy_walk_mappings(cache->policy, save_hint, cache);
	if (r)
		DMERR("policy_walk_mappings failed");

	return r;
}

/*
 * returns true on success
 */
static bool sync_metadata(struct cache *cache)
{
	int r1, r2, r3, r4;

	r1 = write_dirty_bitset(cache);
	if (r1)
		DMERR("could not write dirty bitset");

	r2 = write_discard_bitset(cache);
	if (r2)
		DMERR("could not write discard bitset");

	save_stats(cache);

	r3 = write_hints(cache);
	if (r3)
		DMERR("could not write hints");

	/*
	 * If writing the above metadata failed, we still commit, but don't
	 * set the clean shutdown flag.  This will effectively force every
	 * dirty bit to be set on reload.
	 */
	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
	if (r4)
		DMERR("could not write cache metadata.  Data loss may occur.");

	return !r1 && !r2 && !r3 && !r4;
}

static void cache_postsuspend(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	start_quiescing(cache);
	wait_for_migrations(cache);
	stop_worker(cache);
	requeue_deferred_io(cache);
	stop_quiescing(cache);

	(void) sync_metadata(cache);
}

static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
			bool dirty, uint32_t hint, bool hint_valid)
{
	int r;
	struct cache *cache = context;

	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
	if (r)
		return r;

	if (dirty)
		set_dirty(cache, oblock, cblock);
	else
		clear_dirty(cache, oblock, cblock);

	return 0;
}

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct cache *cache = context;

	/* FIXME: handle mis-matched block size */

	if (discard)
		set_discard(cache, dblock);
	else
		clear_discard(cache, dblock);

	return 0;
}

static int cache_preresume(struct dm_target *ti)
{
	int r = 0;
	struct cache *cache = ti->private;
	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
	(void) sector_div(actual_cache_size, cache->sectors_per_block);

	/*
	 * Check to see if the cache has resized.
	 */
	if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
		cache->cache_size = to_cblock(actual_cache_size);

		r = dm_cache_resize(cache->cmd, cache->cache_size);
		if (r) {
			DMERR("could not resize cache metadata");
			return r;
		}

		cache->sized = true;
	}

	if (!cache->loaded_mappings) {
		r = dm_cache_load_mappings(cache->cmd, cache->policy,
					   load_mapping, cache);
		if (r) {
			DMERR("could not load cache mappings");
			return r;
		}

		cache->loaded_mappings = true;
	}

	if (!cache->loaded_discards) {
		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
		if (r) {
			DMERR("could not load origin discards");
			return r;
		}

		cache->loaded_discards = true;
	}

	return r;
}

static void cache_resume(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	cache->need_tick_bio = true;
	do_waker(&cache->waker.work);
}

/*
 * Status format:
 *
 * <#used metadata blocks>/<#total metadata blocks>
 * <#read hits> <#read misses> <#write hits> <#write misses>
 * <#demotions> <#promotions> <#blocks in cache> <#dirty>
 * <#features> <features>*
 * <#core args> <core args>
 * <#policy args> <policy args>*
 */
static void cache_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	int r = 0;
	unsigned i;
	ssize_t sz = 0;
	dm_block_t nr_free_blocks_metadata = 0;
	dm_block_t nr_blocks_metadata = 0;
	char buf[BDEVNAME_SIZE];
	struct cache *cache = ti->private;
	dm_cblock_t residency;

	switch (type) {
	case STATUSTYPE_INFO:
		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
			r = dm_cache_commit(cache->cmd, false);
			if (r)
				DMERR("could not commit metadata for accurate status");
		}

		r = dm_cache_get_free_metadata_block_count(cache->cmd,
							   &nr_free_blocks_metadata);
		if (r) {
			DMERR("could not get metadata free block count");
			goto err;
		}

		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
		if (r) {
			DMERR("could not get metadata device size");
			goto err;
		}

		residency = policy_residency(cache->policy);

		DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned) atomic_read(&cache->stats.read_hit),
		       (unsigned) atomic_read(&cache->stats.read_miss),
		       (unsigned) atomic_read(&cache->stats.write_hit),
		       (unsigned) atomic_read(&cache->stats.write_miss),
		       (unsigned) atomic_read(&cache->stats.demotion),
		       (unsigned) atomic_read(&cache->stats.promotion),
		       (unsigned long long) from_cblock(residency),
		       (unsigned) from_cblock(cache->nr_dirty));

		if (cache->features.write_through)
			DMEMIT("1 writethrough ");
		else
			DMEMIT("0 ");

		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
		if (sz < maxlen) {
			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
			if (r)
				DMERR("policy_emit_config_values returned %d", r);
		}

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < cache->nr_ctr_args - 1; i++)
			DMEMIT(" %s", cache->ctr_args[i]);
		if (cache->nr_ctr_args)
			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
	}

	return;

err:
	DMEMIT("Error");
}

#define NOT_CORE_OPTION 1

static int process_config_option(struct cache *cache, char **argv)
{
	unsigned long tmp;

	if (!strcasecmp(argv[0], "migration_threshold")) {
		if (kstrtoul(argv[1], 10, &tmp))
			return -EINVAL;

		cache->migration_threshold = tmp;
		return 0;
	}

	return NOT_CORE_OPTION;
}

/*
 * Supports <key> <value>.
 *
 * The key migration_threshold is supported by the cache target core.
 */
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct cache *cache = ti->private;

	if (argc != 2)
		return -EINVAL;

	r = process_config_option(cache, argv);
	if (r == NOT_CORE_OPTION)
		return policy_set_config_value(cache->policy, argv[0], argv[1]);

	return r;
}

static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

/*
 * We assume I/O is going to the origin (which is the volume
 * more likely to have restrictions e.g. by being striped).
 * (Looking up the exact location of the data would be expensive
 * and could always be out of date by the time the bio is submitted.)
 */
static int cache_bvec_merge(struct dm_target *ti,
			    struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct cache *cache = ti->private;
	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = cache->origin_dev->bdev;
	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = cache->discard_block_size * 1024;
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
	set_discard_limits(cache, limits);
}

/*----------------------------------------------------------------*/

static struct target_type cache_target = {
	.name = "cache",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = cache_ctr,
	.dtr = cache_dtr,
	.map = cache_map,
	.end_io = cache_end_io,
	.postsuspend = cache_postsuspend,
	.preresume = cache_preresume,
	.resume = cache_resume,
	.status = cache_status,
	.message = cache_message,
	.iterate_devices = cache_iterate_devices,
	.merge = cache_bvec_merge,
	.io_hints = cache_io_hints,
};

static int __init dm_cache_init(void)
{
	int r;

	r = dm_register_target(&cache_target);
	if (r) {
		DMERR("cache target registration failed: %d", r);
		return r;
	}

	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
	if (!migration_cache) {
		dm_unregister_target(&cache_target);
		return -ENOMEM;
	}

	return 0;
}

static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}

module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");
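
/*
 * Illustrative example only (hypothetical device names and sizes): a table
 * line for this target follows the ctr argument order mirrored by
 * STATUSTYPE_TABLE above, i.e. metadata dev, cache dev, origin dev, block
 * size in sectors, the feature args, then the policy and its args.  See
 * Documentation/device-mapper/cache.txt for the authoritative format.
 *
 *   dmsetup create cached --table \
 *     "0 41943040 cache /dev/mapper/cmeta /dev/mapper/cdata /dev/sdb 512 1 writethrough default 0"
 */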