/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.
 * I suspect data blocks will typically be shared by many different
 * devices, so we're breaking sharing n + 1 times, rather than n, where n
 * is the number of devices that reference this data block.  At the moment
 * I think the benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away.  We put them in prison
 * where they can't cause any mischief.  Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell.  When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
        int virtual;
        dm_thin_id dev;
        dm_block_t block;
};

struct dm_bio_prison_cell {
        struct hlist_node list;
        struct bio_prison *prison;
        struct cell_key key;
        struct bio *holder;
        struct bio_list bios;
};

struct bio_prison {
        spinlock_t lock;
        mempool_t *cell_pool;

        unsigned nr_buckets;
        unsigned hash_mask;
        struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
        uint32_t n = 128;

        nr_cells /= 4;
        nr_cells = min(nr_cells, 8192u);

        while (n < nr_cells)
                n <<= 1;

        return n;
}

static struct kmem_cache *_cell_cache;

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
        unsigned i;
        uint32_t nr_buckets = calc_nr_buckets(nr_cells);
        size_t len = sizeof(struct bio_prison) +
                (sizeof(struct hlist_head) * nr_buckets);
        struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

        if (!prison)
                return NULL;

        spin_lock_init(&prison->lock);
        prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
        if (!prison->cell_pool) {
                kfree(prison);
                return NULL;
        }

        prison->nr_buckets = nr_buckets;
        prison->hash_mask = nr_buckets - 1;
        prison->cells = (struct hlist_head *) (prison + 1);
        for (i = 0; i < nr_buckets; i++)
                INIT_HLIST_HEAD(prison->cells + i);

        return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
        mempool_destroy(prison->cell_pool);
        kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
        const unsigned long BIG_PRIME = 4294967291UL;
        uint64_t hash = key->block * BIG_PRIME;

        return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
        return (lhs->virtual == rhs->virtual) &&
                (lhs->dev == rhs->dev) &&
                (lhs->block == rhs->block);
}

static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
                                                  struct cell_key *key)
{
        struct dm_bio_prison_cell *cell;
        struct hlist_node *tmp;

        hlist_for_each_entry(cell, tmp, bucket, list)
                if (keys_equal(&cell->key, key))
                        return cell;

        return NULL;
}

/*
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
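 *
 * A minimal usage sketch (illustrative only; it mirrors the pattern used
 * by process_bio() further down this file, and assumes the caller has a
 * struct thin_c *tc, a dm_block_t block and a struct bio *bio in scope):
 *
 *	struct dm_bio_prison_cell *cell;
 *	struct cell_key key;
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(tc->pool->prison, &key, bio, &cell))
 *		return;		(someone else holds the cell; bio is queued)
 *
 *	... we are the holder: do the work, then release the cell ...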
 */
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
                      struct bio *inmate, struct dm_bio_prison_cell **ref)
{
        int r = 1;
        unsigned long flags;
        uint32_t hash = hash_key(prison, key);
        struct dm_bio_prison_cell *cell, *cell2;

        BUG_ON(hash > prison->nr_buckets);

        spin_lock_irqsave(&prison->lock, flags);

        cell = __search_bucket(prison->cells + hash, key);
        if (cell) {
                bio_list_add(&cell->bios, inmate);
                goto out;
        }

        /*
         * Allocate a new cell
         */
        spin_unlock_irqrestore(&prison->lock, flags);
        cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
        spin_lock_irqsave(&prison->lock, flags);

        /*
         * We've been unlocked, so we have to double check that
         * nobody else has inserted this cell in the meantime.
         */
        cell = __search_bucket(prison->cells + hash, key);
        if (cell) {
                mempool_free(cell2, prison->cell_pool);
                bio_list_add(&cell->bios, inmate);
                goto out;
        }

        /*
         * Use new cell.
         */
        cell = cell2;

        cell->prison = prison;
        memcpy(&cell->key, key, sizeof(cell->key));
        cell->holder = inmate;
        bio_list_init(&cell->bios);
        hlist_add_head(&cell->list, prison->cells + hash);

        r = 0;

out:
        spin_unlock_irqrestore(&prison->lock, flags);

        *ref = cell;

        return r;
}

/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
{
        struct bio_prison *prison = cell->prison;

        hlist_del(&cell->list);

        if (inmates) {
                bio_list_add(inmates, cell->holder);
                bio_list_merge(inmates, &cell->bios);
        }

        mempool_free(cell, prison->cell_pool);
}

static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
{
        unsigned long flags;
        struct bio_prison *prison = cell->prison;

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release(cell, bios);
        spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * There are a couple of places where we put a bio into a cell briefly
 * before taking it out again.  In these situations we know that no other
 * bio may be in the cell.  This function releases the cell, and also does
 * a sanity check.
 */
static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
        BUG_ON(cell->holder != bio);
        BUG_ON(!bio_list_empty(&cell->bios));

        __cell_release(cell, NULL);
}

static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
        unsigned long flags;
        struct bio_prison *prison = cell->prison;

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release_singleton(cell, bio);
        spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * Sometimes we don't want the holder, just the additional bios.
 */
static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
                                     struct bio_list *inmates)
{
        struct bio_prison *prison = cell->prison;

        hlist_del(&cell->list);
        bio_list_merge(inmates, &cell->bios);

        mempool_free(cell, prison->cell_pool);
}

static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
                                   struct bio_list *inmates)
{
        unsigned long flags;
        struct bio_prison *prison = cell->prison;

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release_no_holder(cell, inmates);
        spin_unlock_irqrestore(&prison->lock, flags);
}

static void cell_error(struct dm_bio_prison_cell *cell)
{
        struct bio_prison *prison = cell->prison;
        struct bio_list bios;
        struct bio *bio;
        unsigned long flags;

        bio_list_init(&bios);

        spin_lock_irqsave(&prison->lock, flags);
        __cell_release(cell, &bios);
        spin_unlock_irqrestore(&prison->lock, flags);

        while ((bio = bio_list_pop(&bios)))
                bio_io_error(bio);
}

/*----------------------------------------------------------------*/

/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed.  Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */

struct deferred_set;
struct deferred_entry {
        struct deferred_set *ds;
        unsigned count;
        struct list_head work_items;
};

struct deferred_set {
        spinlock_t lock;
        unsigned current_entry;
        unsigned sweeper;
        struct deferred_entry entries[DEFERRED_SET_SIZE];
};

static void ds_init(struct deferred_set *ds)
{
        int i;

        spin_lock_init(&ds->lock);
        ds->current_entry = 0;
        ds->sweeper = 0;
        for (i = 0; i < DEFERRED_SET_SIZE; i++) {
                ds->entries[i].ds = ds;
                ds->entries[i].count = 0;
                INIT_LIST_HEAD(&ds->entries[i].work_items);
        }
}

static struct deferred_entry *ds_inc(struct deferred_set *ds)
{
        unsigned long flags;
        struct deferred_entry *entry;

        spin_lock_irqsave(&ds->lock, flags);
        entry = ds->entries + ds->current_entry;
        entry->count++;
        spin_unlock_irqrestore(&ds->lock, flags);

        return entry;
}

static unsigned ds_next(unsigned index)
{
        return (index + 1) % DEFERRED_SET_SIZE;
}

static void __sweep(struct deferred_set *ds, struct list_head *head)
{
        while ((ds->sweeper != ds->current_entry) &&
               !ds->entries[ds->sweeper].count) {
                list_splice_init(&ds->entries[ds->sweeper].work_items, head);
                ds->sweeper = ds_next(ds->sweeper);
        }

        if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
                list_splice_init(&ds->entries[ds->sweeper].work_items, head);
}

static void ds_dec(struct deferred_entry *entry, struct list_head *head)
{
        unsigned long flags;

        spin_lock_irqsave(&entry->ds->lock, flags);
        BUG_ON(!entry->count);
        --entry->count;
        __sweep(entry->ds, head);
        spin_unlock_irqrestore(&entry->ds->lock, flags);
}

/*
 * Returns 1 if deferred or 0 if no pending items to delay job.
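 *
 * In other words: a return of 0 means the current entry has no
 * outstanding ds_inc() references, so the caller can proceed with the
 * job immediately; a return of 1 means @work has been queued on the
 * current entry and will be handed back via the list passed to a later
 * ds_dec() once all earlier references have been dropped.
 *
 * A sketch of the typical caller pattern (see schedule_copy() below):
 *
 *	if (!ds_add_work(&pool->shared_read_ds, &m->list))
 *		m->quiesced = 1;	(nothing in flight to wait for)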
 */
static int ds_add_work(struct deferred_set *ds, struct list_head *work)
{
        int r = 1;
        unsigned long flags;
        unsigned next_entry;

        spin_lock_irqsave(&ds->lock, flags);
        if ((ds->sweeper == ds->current_entry) &&
            !ds->entries[ds->current_entry].count)
                r = 0;
        else {
                list_add(work, &ds->entries[ds->current_entry].work_items);
                next_entry = ds_next(ds->current_entry);
                if (!ds->entries[next_entry].count)
                        ds->current_entry = next_entry;
        }
        spin_unlock_irqrestore(&ds->lock, flags);

        return r;
}

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
                           dm_block_t b, struct cell_key *key)
{
        key->virtual = 0;
        key->dev = dm_thin_dev_id(td);
        key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
                              struct cell_key *key)
{
        key->virtual = 1;
        key->dev = dm_thin_dev_id(td);
        key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 3 modes.  They are ordered by increasing degradation
 * so that modes can be compared directly.
 */
enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
        PM_READ_ONLY,           /* metadata may not be changed */
        PM_FAIL,                /* all I/O fails */
};

struct pool_features {
        enum pool_mode mode;

        unsigned zero_new_blocks:1;
        unsigned discard_enabled:1;
        unsigned discard_passdown:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
        struct list_head list;
        struct dm_target *ti;   /* Only set if a pool target is bound */

        struct mapped_device *pool_md;
        struct block_device *md_dev;
        struct dm_pool_metadata *pmd;

        dm_block_t low_water_blocks;
        uint32_t sectors_per_block;
        int sectors_per_block_shift;

        struct pool_features pf;
        unsigned low_water_triggered:1; /* A dm event has been sent */
        unsigned no_free_space:1;       /* A -ENOSPC warning has been issued */

        struct bio_prison *prison;
        struct dm_kcopyd_client *copier;

        struct workqueue_struct *wq;
        struct work_struct worker;
        struct delayed_work waker;

        unsigned long last_commit_jiffies;
        unsigned ref_count;

        spinlock_t lock;
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct list_head prepared_mappings;
        struct list_head prepared_discards;

        struct bio_list retry_on_resume_list;

        struct deferred_set shared_read_ds;
        struct deferred_set all_io_ds;

        struct dm_thin_new_mapping *next_mapping;
        mempool_t *mapping_pool;
        mempool_t *endio_hook_pool;

        process_bio_fn process_bio;
        process_bio_fn process_discard;

        process_mapping_fn process_prepared_mapping;
        process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void set_pool_mode(struct pool *pool, enum pool_mode mode);

/*
 * Target context for a pool.
574 */ 575 struct pool_c { 576 struct dm_target *ti; 577 struct pool *pool; 578 struct dm_dev *data_dev; 579 struct dm_dev *metadata_dev; 580 struct dm_target_callbacks callbacks; 581 582 dm_block_t low_water_blocks; 583 struct pool_features pf; 584 }; 585 586 /* 587 * Target context for a thin. 588 */ 589 struct thin_c { 590 struct dm_dev *pool_dev; 591 struct dm_dev *origin_dev; 592 dm_thin_id dev_id; 593 594 struct pool *pool; 595 struct dm_thin_device *td; 596 }; 597 598 /*----------------------------------------------------------------*/ 599 600 /* 601 * A global list of pools that uses a struct mapped_device as a key. 602 */ 603 static struct dm_thin_pool_table { 604 struct mutex mutex; 605 struct list_head pools; 606 } dm_thin_pool_table; 607 608 static void pool_table_init(void) 609 { 610 mutex_init(&dm_thin_pool_table.mutex); 611 INIT_LIST_HEAD(&dm_thin_pool_table.pools); 612 } 613 614 static void __pool_table_insert(struct pool *pool) 615 { 616 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 617 list_add(&pool->list, &dm_thin_pool_table.pools); 618 } 619 620 static void __pool_table_remove(struct pool *pool) 621 { 622 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 623 list_del(&pool->list); 624 } 625 626 static struct pool *__pool_table_lookup(struct mapped_device *md) 627 { 628 struct pool *pool = NULL, *tmp; 629 630 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 631 632 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 633 if (tmp->pool_md == md) { 634 pool = tmp; 635 break; 636 } 637 } 638 639 return pool; 640 } 641 642 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) 643 { 644 struct pool *pool = NULL, *tmp; 645 646 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 647 648 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 649 if (tmp->md_dev == md_dev) { 650 pool = tmp; 651 break; 652 } 653 } 654 655 return pool; 656 } 657 658 /*----------------------------------------------------------------*/ 659 660 struct dm_thin_endio_hook { 661 struct thin_c *tc; 662 struct deferred_entry *shared_read_entry; 663 struct deferred_entry *all_io_entry; 664 struct dm_thin_new_mapping *overwrite_mapping; 665 }; 666 667 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 668 { 669 struct bio *bio; 670 struct bio_list bios; 671 672 bio_list_init(&bios); 673 bio_list_merge(&bios, master); 674 bio_list_init(master); 675 676 while ((bio = bio_list_pop(&bios))) { 677 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 678 679 if (h->tc == tc) 680 bio_endio(bio, DM_ENDIO_REQUEUE); 681 else 682 bio_list_add(master, bio); 683 } 684 } 685 686 static void requeue_io(struct thin_c *tc) 687 { 688 struct pool *pool = tc->pool; 689 unsigned long flags; 690 691 spin_lock_irqsave(&pool->lock, flags); 692 __requeue_bio_list(tc, &pool->deferred_bios); 693 __requeue_bio_list(tc, &pool->retry_on_resume_list); 694 spin_unlock_irqrestore(&pool->lock, flags); 695 } 696 697 /* 698 * This section of code contains the logic for processing a thin device's IO. 699 * Much of the code depends on pool object resources (lists, workqueues, etc) 700 * but most is exclusively called from the thin target rather than the thin-pool 701 * target. 
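 *
 * A worked example of the virtual-block arithmetic used by get_bio_block()
 * and remap() below (illustrative numbers only): with 64KiB data blocks,
 * sectors_per_block is 128 and sectors_per_block_shift is 7.  A bio at
 * bi_sector 1000 therefore targets virtual block 1000 >> 7 = 7, at offset
 * 1000 & 127 = 104 sectors into that block.  If that virtual block is
 * mapped to data block 42, remap() rewrites bi_sector to
 * (42 << 7) | 104 = 5480 on the pool's data device.  Non-power-of-two
 * block sizes take the slower sector_div() path instead.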
702 */ 703 704 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 705 { 706 sector_t block_nr = bio->bi_sector; 707 708 if (tc->pool->sectors_per_block_shift < 0) 709 (void) sector_div(block_nr, tc->pool->sectors_per_block); 710 else 711 block_nr >>= tc->pool->sectors_per_block_shift; 712 713 return block_nr; 714 } 715 716 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 717 { 718 struct pool *pool = tc->pool; 719 sector_t bi_sector = bio->bi_sector; 720 721 bio->bi_bdev = tc->pool_dev->bdev; 722 if (tc->pool->sectors_per_block_shift < 0) 723 bio->bi_sector = (block * pool->sectors_per_block) + 724 sector_div(bi_sector, pool->sectors_per_block); 725 else 726 bio->bi_sector = (block << pool->sectors_per_block_shift) | 727 (bi_sector & (pool->sectors_per_block - 1)); 728 } 729 730 static void remap_to_origin(struct thin_c *tc, struct bio *bio) 731 { 732 bio->bi_bdev = tc->origin_dev->bdev; 733 } 734 735 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) 736 { 737 return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && 738 dm_thin_changed_this_transaction(tc->td); 739 } 740 741 static void issue(struct thin_c *tc, struct bio *bio) 742 { 743 struct pool *pool = tc->pool; 744 unsigned long flags; 745 746 if (!bio_triggers_commit(tc, bio)) { 747 generic_make_request(bio); 748 return; 749 } 750 751 /* 752 * Complete bio with an error if earlier I/O caused changes to 753 * the metadata that can't be committed e.g, due to I/O errors 754 * on the metadata device. 755 */ 756 if (dm_thin_aborted_changes(tc->td)) { 757 bio_io_error(bio); 758 return; 759 } 760 761 /* 762 * Batch together any bios that trigger commits and then issue a 763 * single commit for them in process_deferred_bios(). 764 */ 765 spin_lock_irqsave(&pool->lock, flags); 766 bio_list_add(&pool->deferred_flush_bios, bio); 767 spin_unlock_irqrestore(&pool->lock, flags); 768 } 769 770 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 771 { 772 remap_to_origin(tc, bio); 773 issue(tc, bio); 774 } 775 776 static void remap_and_issue(struct thin_c *tc, struct bio *bio, 777 dm_block_t block) 778 { 779 remap(tc, bio, block); 780 issue(tc, bio); 781 } 782 783 /* 784 * wake_worker() is used when new work is queued and when pool_resume is 785 * ready to continue deferred IO processing. 786 */ 787 static void wake_worker(struct pool *pool) 788 { 789 queue_work(pool->wq, &pool->worker); 790 } 791 792 /*----------------------------------------------------------------*/ 793 794 /* 795 * Bio endio functions. 796 */ 797 struct dm_thin_new_mapping { 798 struct list_head list; 799 800 unsigned quiesced:1; 801 unsigned prepared:1; 802 unsigned pass_discard:1; 803 804 struct thin_c *tc; 805 dm_block_t virt_block; 806 dm_block_t data_block; 807 struct dm_bio_prison_cell *cell, *cell2; 808 int err; 809 810 /* 811 * If the bio covers the whole area of a block then we can avoid 812 * zeroing or copying. Instead this bio is hooked. The bio will 813 * still be in the cell, so care has to be taken to avoid issuing 814 * the bio twice. 
815 */ 816 struct bio *bio; 817 bio_end_io_t *saved_bi_end_io; 818 }; 819 820 static void __maybe_add_mapping(struct dm_thin_new_mapping *m) 821 { 822 struct pool *pool = m->tc->pool; 823 824 if (m->quiesced && m->prepared) { 825 list_add(&m->list, &pool->prepared_mappings); 826 wake_worker(pool); 827 } 828 } 829 830 static void copy_complete(int read_err, unsigned long write_err, void *context) 831 { 832 unsigned long flags; 833 struct dm_thin_new_mapping *m = context; 834 struct pool *pool = m->tc->pool; 835 836 m->err = read_err || write_err ? -EIO : 0; 837 838 spin_lock_irqsave(&pool->lock, flags); 839 m->prepared = 1; 840 __maybe_add_mapping(m); 841 spin_unlock_irqrestore(&pool->lock, flags); 842 } 843 844 static void overwrite_endio(struct bio *bio, int err) 845 { 846 unsigned long flags; 847 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 848 struct dm_thin_new_mapping *m = h->overwrite_mapping; 849 struct pool *pool = m->tc->pool; 850 851 m->err = err; 852 853 spin_lock_irqsave(&pool->lock, flags); 854 m->prepared = 1; 855 __maybe_add_mapping(m); 856 spin_unlock_irqrestore(&pool->lock, flags); 857 } 858 859 /*----------------------------------------------------------------*/ 860 861 /* 862 * Workqueue. 863 */ 864 865 /* 866 * Prepared mapping jobs. 867 */ 868 869 /* 870 * This sends the bios in the cell back to the deferred_bios list. 871 */ 872 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell, 873 dm_block_t data_block) 874 { 875 struct pool *pool = tc->pool; 876 unsigned long flags; 877 878 spin_lock_irqsave(&pool->lock, flags); 879 cell_release(cell, &pool->deferred_bios); 880 spin_unlock_irqrestore(&tc->pool->lock, flags); 881 882 wake_worker(pool); 883 } 884 885 /* 886 * Same as cell_defer above, except it omits one particular detainee, 887 * a write bio that covers the block and has already been processed. 888 */ 889 static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell) 890 { 891 struct bio_list bios; 892 struct pool *pool = tc->pool; 893 unsigned long flags; 894 895 bio_list_init(&bios); 896 897 spin_lock_irqsave(&pool->lock, flags); 898 cell_release_no_holder(cell, &pool->deferred_bios); 899 spin_unlock_irqrestore(&pool->lock, flags); 900 901 wake_worker(pool); 902 } 903 904 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) 905 { 906 if (m->bio) 907 m->bio->bi_end_io = m->saved_bi_end_io; 908 cell_error(m->cell); 909 list_del(&m->list); 910 mempool_free(m, m->tc->pool->mapping_pool); 911 } 912 static void process_prepared_mapping(struct dm_thin_new_mapping *m) 913 { 914 struct thin_c *tc = m->tc; 915 struct bio *bio; 916 int r; 917 918 bio = m->bio; 919 if (bio) 920 bio->bi_end_io = m->saved_bi_end_io; 921 922 if (m->err) { 923 cell_error(m->cell); 924 goto out; 925 } 926 927 /* 928 * Commit the prepared block into the mapping btree. 929 * Any I/O for this block arriving after this point will get 930 * remapped to it directly. 931 */ 932 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 933 if (r) { 934 DMERR("dm_thin_insert_block() failed"); 935 cell_error(m->cell); 936 goto out; 937 } 938 939 /* 940 * Release any bios held while the block was being provisioned. 941 * If we are processing a write bio that completely covers the block, 942 * we already processed it so can ignore it now when processing 943 * the bios in the cell. 
944 */ 945 if (bio) { 946 cell_defer_except(tc, m->cell); 947 bio_endio(bio, 0); 948 } else 949 cell_defer(tc, m->cell, m->data_block); 950 951 out: 952 list_del(&m->list); 953 mempool_free(m, tc->pool->mapping_pool); 954 } 955 956 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) 957 { 958 struct thin_c *tc = m->tc; 959 960 bio_io_error(m->bio); 961 cell_defer_except(tc, m->cell); 962 cell_defer_except(tc, m->cell2); 963 mempool_free(m, tc->pool->mapping_pool); 964 } 965 966 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) 967 { 968 struct thin_c *tc = m->tc; 969 970 if (m->pass_discard) 971 remap_and_issue(tc, m->bio, m->data_block); 972 else 973 bio_endio(m->bio, 0); 974 975 cell_defer_except(tc, m->cell); 976 cell_defer_except(tc, m->cell2); 977 mempool_free(m, tc->pool->mapping_pool); 978 } 979 980 static void process_prepared_discard(struct dm_thin_new_mapping *m) 981 { 982 int r; 983 struct thin_c *tc = m->tc; 984 985 r = dm_thin_remove_block(tc->td, m->virt_block); 986 if (r) 987 DMERR("dm_thin_remove_block() failed"); 988 989 process_prepared_discard_passdown(m); 990 } 991 992 static void process_prepared(struct pool *pool, struct list_head *head, 993 process_mapping_fn *fn) 994 { 995 unsigned long flags; 996 struct list_head maps; 997 struct dm_thin_new_mapping *m, *tmp; 998 999 INIT_LIST_HEAD(&maps); 1000 spin_lock_irqsave(&pool->lock, flags); 1001 list_splice_init(head, &maps); 1002 spin_unlock_irqrestore(&pool->lock, flags); 1003 1004 list_for_each_entry_safe(m, tmp, &maps, list) 1005 (*fn)(m); 1006 } 1007 1008 /* 1009 * Deferred bio jobs. 1010 */ 1011 static int io_overlaps_block(struct pool *pool, struct bio *bio) 1012 { 1013 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); 1014 } 1015 1016 static int io_overwrites_block(struct pool *pool, struct bio *bio) 1017 { 1018 return (bio_data_dir(bio) == WRITE) && 1019 io_overlaps_block(pool, bio); 1020 } 1021 1022 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 1023 bio_end_io_t *fn) 1024 { 1025 *save = bio->bi_end_io; 1026 bio->bi_end_io = fn; 1027 } 1028 1029 static int ensure_next_mapping(struct pool *pool) 1030 { 1031 if (pool->next_mapping) 1032 return 0; 1033 1034 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); 1035 1036 return pool->next_mapping ? 0 : -ENOMEM; 1037 } 1038 1039 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) 1040 { 1041 struct dm_thin_new_mapping *r = pool->next_mapping; 1042 1043 BUG_ON(!pool->next_mapping); 1044 1045 pool->next_mapping = NULL; 1046 1047 return r; 1048 } 1049 1050 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 1051 struct dm_dev *origin, dm_block_t data_origin, 1052 dm_block_t data_dest, 1053 struct dm_bio_prison_cell *cell, struct bio *bio) 1054 { 1055 int r; 1056 struct pool *pool = tc->pool; 1057 struct dm_thin_new_mapping *m = get_next_mapping(pool); 1058 1059 INIT_LIST_HEAD(&m->list); 1060 m->quiesced = 0; 1061 m->prepared = 0; 1062 m->tc = tc; 1063 m->virt_block = virt_block; 1064 m->data_block = data_dest; 1065 m->cell = cell; 1066 m->err = 0; 1067 m->bio = NULL; 1068 1069 if (!ds_add_work(&pool->shared_read_ds, &m->list)) 1070 m->quiesced = 1; 1071 1072 /* 1073 * IO to pool_dev remaps to the pool target's data_dev. 1074 * 1075 * If the whole block of data is being overwritten, we can issue the 1076 * bio immediately. Otherwise we use kcopyd to clone the data first. 
1077 */ 1078 if (io_overwrites_block(pool, bio)) { 1079 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1080 1081 h->overwrite_mapping = m; 1082 m->bio = bio; 1083 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1084 remap_and_issue(tc, bio, data_dest); 1085 } else { 1086 struct dm_io_region from, to; 1087 1088 from.bdev = origin->bdev; 1089 from.sector = data_origin * pool->sectors_per_block; 1090 from.count = pool->sectors_per_block; 1091 1092 to.bdev = tc->pool_dev->bdev; 1093 to.sector = data_dest * pool->sectors_per_block; 1094 to.count = pool->sectors_per_block; 1095 1096 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 1097 0, copy_complete, m); 1098 if (r < 0) { 1099 mempool_free(m, pool->mapping_pool); 1100 DMERR("dm_kcopyd_copy() failed"); 1101 cell_error(cell); 1102 } 1103 } 1104 } 1105 1106 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1107 dm_block_t data_origin, dm_block_t data_dest, 1108 struct dm_bio_prison_cell *cell, struct bio *bio) 1109 { 1110 schedule_copy(tc, virt_block, tc->pool_dev, 1111 data_origin, data_dest, cell, bio); 1112 } 1113 1114 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1115 dm_block_t data_dest, 1116 struct dm_bio_prison_cell *cell, struct bio *bio) 1117 { 1118 schedule_copy(tc, virt_block, tc->origin_dev, 1119 virt_block, data_dest, cell, bio); 1120 } 1121 1122 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1123 dm_block_t data_block, struct dm_bio_prison_cell *cell, 1124 struct bio *bio) 1125 { 1126 struct pool *pool = tc->pool; 1127 struct dm_thin_new_mapping *m = get_next_mapping(pool); 1128 1129 INIT_LIST_HEAD(&m->list); 1130 m->quiesced = 1; 1131 m->prepared = 0; 1132 m->tc = tc; 1133 m->virt_block = virt_block; 1134 m->data_block = data_block; 1135 m->cell = cell; 1136 m->err = 0; 1137 m->bio = NULL; 1138 1139 /* 1140 * If the whole block of data is being overwritten or we are not 1141 * zeroing pre-existing data, we can issue the bio immediately. 1142 * Otherwise we use kcopyd to zero the data first. 1143 */ 1144 if (!pool->pf.zero_new_blocks) 1145 process_prepared_mapping(m); 1146 1147 else if (io_overwrites_block(pool, bio)) { 1148 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1149 1150 h->overwrite_mapping = m; 1151 m->bio = bio; 1152 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1153 remap_and_issue(tc, bio, data_block); 1154 } else { 1155 int r; 1156 struct dm_io_region to; 1157 1158 to.bdev = tc->pool_dev->bdev; 1159 to.sector = data_block * pool->sectors_per_block; 1160 to.count = pool->sectors_per_block; 1161 1162 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 1163 if (r < 0) { 1164 mempool_free(m, pool->mapping_pool); 1165 DMERR("dm_kcopyd_zero() failed"); 1166 cell_error(cell); 1167 } 1168 } 1169 } 1170 1171 static int commit(struct pool *pool) 1172 { 1173 int r; 1174 1175 r = dm_pool_commit_metadata(pool->pmd); 1176 if (r) 1177 DMERR("commit failed, error = %d", r); 1178 1179 return r; 1180 } 1181 1182 /* 1183 * A non-zero return indicates read_only or fail_io mode. 1184 * Many callers don't care about the return value. 
1185 */ 1186 static int commit_or_fallback(struct pool *pool) 1187 { 1188 int r; 1189 1190 if (get_pool_mode(pool) != PM_WRITE) 1191 return -EINVAL; 1192 1193 r = commit(pool); 1194 if (r) 1195 set_pool_mode(pool, PM_READ_ONLY); 1196 1197 return r; 1198 } 1199 1200 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1201 { 1202 int r; 1203 dm_block_t free_blocks; 1204 unsigned long flags; 1205 struct pool *pool = tc->pool; 1206 1207 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1208 if (r) 1209 return r; 1210 1211 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1212 DMWARN("%s: reached low water mark, sending event.", 1213 dm_device_name(pool->pool_md)); 1214 spin_lock_irqsave(&pool->lock, flags); 1215 pool->low_water_triggered = 1; 1216 spin_unlock_irqrestore(&pool->lock, flags); 1217 dm_table_event(pool->ti->table); 1218 } 1219 1220 if (!free_blocks) { 1221 if (pool->no_free_space) 1222 return -ENOSPC; 1223 else { 1224 /* 1225 * Try to commit to see if that will free up some 1226 * more space. 1227 */ 1228 (void) commit_or_fallback(pool); 1229 1230 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1231 if (r) 1232 return r; 1233 1234 /* 1235 * If we still have no space we set a flag to avoid 1236 * doing all this checking and return -ENOSPC. 1237 */ 1238 if (!free_blocks) { 1239 DMWARN("%s: no free space available.", 1240 dm_device_name(pool->pool_md)); 1241 spin_lock_irqsave(&pool->lock, flags); 1242 pool->no_free_space = 1; 1243 spin_unlock_irqrestore(&pool->lock, flags); 1244 return -ENOSPC; 1245 } 1246 } 1247 } 1248 1249 r = dm_pool_alloc_data_block(pool->pmd, result); 1250 if (r) 1251 return r; 1252 1253 return 0; 1254 } 1255 1256 /* 1257 * If we have run out of space, queue bios until the device is 1258 * resumed, presumably after having been reloaded with more space. 1259 */ 1260 static void retry_on_resume(struct bio *bio) 1261 { 1262 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1263 struct thin_c *tc = h->tc; 1264 struct pool *pool = tc->pool; 1265 unsigned long flags; 1266 1267 spin_lock_irqsave(&pool->lock, flags); 1268 bio_list_add(&pool->retry_on_resume_list, bio); 1269 spin_unlock_irqrestore(&pool->lock, flags); 1270 } 1271 1272 static void no_space(struct dm_bio_prison_cell *cell) 1273 { 1274 struct bio *bio; 1275 struct bio_list bios; 1276 1277 bio_list_init(&bios); 1278 cell_release(cell, &bios); 1279 1280 while ((bio = bio_list_pop(&bios))) 1281 retry_on_resume(bio); 1282 } 1283 1284 static void process_discard(struct thin_c *tc, struct bio *bio) 1285 { 1286 int r; 1287 unsigned long flags; 1288 struct pool *pool = tc->pool; 1289 struct dm_bio_prison_cell *cell, *cell2; 1290 struct cell_key key, key2; 1291 dm_block_t block = get_bio_block(tc, bio); 1292 struct dm_thin_lookup_result lookup_result; 1293 struct dm_thin_new_mapping *m; 1294 1295 build_virtual_key(tc->td, block, &key); 1296 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1297 return; 1298 1299 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1300 switch (r) { 1301 case 0: 1302 /* 1303 * Check nobody is fiddling with this pool block. This can 1304 * happen if someone's in the process of breaking sharing 1305 * on this block. 1306 */ 1307 build_data_key(tc->td, lookup_result.block, &key2); 1308 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1309 cell_release_singleton(cell, bio); 1310 break; 1311 } 1312 1313 if (io_overlaps_block(pool, bio)) { 1314 /* 1315 * IO may still be going to the destination block. 
We must 1316 * quiesce before we can do the removal. 1317 */ 1318 m = get_next_mapping(pool); 1319 m->tc = tc; 1320 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; 1321 m->virt_block = block; 1322 m->data_block = lookup_result.block; 1323 m->cell = cell; 1324 m->cell2 = cell2; 1325 m->err = 0; 1326 m->bio = bio; 1327 1328 if (!ds_add_work(&pool->all_io_ds, &m->list)) { 1329 spin_lock_irqsave(&pool->lock, flags); 1330 list_add(&m->list, &pool->prepared_discards); 1331 spin_unlock_irqrestore(&pool->lock, flags); 1332 wake_worker(pool); 1333 } 1334 } else { 1335 /* 1336 * The DM core makes sure that the discard doesn't span 1337 * a block boundary. So we submit the discard of a 1338 * partial block appropriately. 1339 */ 1340 cell_release_singleton(cell, bio); 1341 cell_release_singleton(cell2, bio); 1342 if ((!lookup_result.shared) && pool->pf.discard_passdown) 1343 remap_and_issue(tc, bio, lookup_result.block); 1344 else 1345 bio_endio(bio, 0); 1346 } 1347 break; 1348 1349 case -ENODATA: 1350 /* 1351 * It isn't provisioned, just forget it. 1352 */ 1353 cell_release_singleton(cell, bio); 1354 bio_endio(bio, 0); 1355 break; 1356 1357 default: 1358 DMERR("discard: find block unexpectedly returned %d", r); 1359 cell_release_singleton(cell, bio); 1360 bio_io_error(bio); 1361 break; 1362 } 1363 } 1364 1365 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1366 struct cell_key *key, 1367 struct dm_thin_lookup_result *lookup_result, 1368 struct dm_bio_prison_cell *cell) 1369 { 1370 int r; 1371 dm_block_t data_block; 1372 1373 r = alloc_data_block(tc, &data_block); 1374 switch (r) { 1375 case 0: 1376 schedule_internal_copy(tc, block, lookup_result->block, 1377 data_block, cell, bio); 1378 break; 1379 1380 case -ENOSPC: 1381 no_space(cell); 1382 break; 1383 1384 default: 1385 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1386 cell_error(cell); 1387 break; 1388 } 1389 } 1390 1391 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1392 dm_block_t block, 1393 struct dm_thin_lookup_result *lookup_result) 1394 { 1395 struct dm_bio_prison_cell *cell; 1396 struct pool *pool = tc->pool; 1397 struct cell_key key; 1398 1399 /* 1400 * If cell is already occupied, then sharing is already in the process 1401 * of being broken so we have nothing further to do here. 1402 */ 1403 build_data_key(tc->td, lookup_result->block, &key); 1404 if (bio_detain(pool->prison, &key, bio, &cell)) 1405 return; 1406 1407 if (bio_data_dir(bio) == WRITE && bio->bi_size) 1408 break_sharing(tc, bio, block, &key, lookup_result, cell); 1409 else { 1410 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1411 1412 h->shared_read_entry = ds_inc(&pool->shared_read_ds); 1413 1414 cell_release_singleton(cell, bio); 1415 remap_and_issue(tc, bio, lookup_result->block); 1416 } 1417 } 1418 1419 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1420 struct dm_bio_prison_cell *cell) 1421 { 1422 int r; 1423 dm_block_t data_block; 1424 1425 /* 1426 * Remap empty bios (flushes) immediately, without provisioning. 1427 */ 1428 if (!bio->bi_size) { 1429 cell_release_singleton(cell, bio); 1430 remap_and_issue(tc, bio, 0); 1431 return; 1432 } 1433 1434 /* 1435 * Fill read bios with zeroes and complete them immediately. 
1436 */ 1437 if (bio_data_dir(bio) == READ) { 1438 zero_fill_bio(bio); 1439 cell_release_singleton(cell, bio); 1440 bio_endio(bio, 0); 1441 return; 1442 } 1443 1444 r = alloc_data_block(tc, &data_block); 1445 switch (r) { 1446 case 0: 1447 if (tc->origin_dev) 1448 schedule_external_copy(tc, block, data_block, cell, bio); 1449 else 1450 schedule_zero(tc, block, data_block, cell, bio); 1451 break; 1452 1453 case -ENOSPC: 1454 no_space(cell); 1455 break; 1456 1457 default: 1458 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1459 set_pool_mode(tc->pool, PM_READ_ONLY); 1460 cell_error(cell); 1461 break; 1462 } 1463 } 1464 1465 static void process_bio(struct thin_c *tc, struct bio *bio) 1466 { 1467 int r; 1468 dm_block_t block = get_bio_block(tc, bio); 1469 struct dm_bio_prison_cell *cell; 1470 struct cell_key key; 1471 struct dm_thin_lookup_result lookup_result; 1472 1473 /* 1474 * If cell is already occupied, then the block is already 1475 * being provisioned so we have nothing further to do here. 1476 */ 1477 build_virtual_key(tc->td, block, &key); 1478 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1479 return; 1480 1481 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1482 switch (r) { 1483 case 0: 1484 /* 1485 * We can release this cell now. This thread is the only 1486 * one that puts bios into a cell, and we know there were 1487 * no preceding bios. 1488 */ 1489 /* 1490 * TODO: this will probably have to change when discard goes 1491 * back in. 1492 */ 1493 cell_release_singleton(cell, bio); 1494 1495 if (lookup_result.shared) 1496 process_shared_bio(tc, bio, block, &lookup_result); 1497 else 1498 remap_and_issue(tc, bio, lookup_result.block); 1499 break; 1500 1501 case -ENODATA: 1502 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1503 cell_release_singleton(cell, bio); 1504 remap_to_origin_and_issue(tc, bio); 1505 } else 1506 provision_block(tc, bio, block, cell); 1507 break; 1508 1509 default: 1510 DMERR("dm_thin_find_block() failed, error = %d", r); 1511 cell_release_singleton(cell, bio); 1512 bio_io_error(bio); 1513 break; 1514 } 1515 } 1516 1517 static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1518 { 1519 int r; 1520 int rw = bio_data_dir(bio); 1521 dm_block_t block = get_bio_block(tc, bio); 1522 struct dm_thin_lookup_result lookup_result; 1523 1524 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1525 switch (r) { 1526 case 0: 1527 if (lookup_result.shared && (rw == WRITE) && bio->bi_size) 1528 bio_io_error(bio); 1529 else 1530 remap_and_issue(tc, bio, lookup_result.block); 1531 break; 1532 1533 case -ENODATA: 1534 if (rw != READ) { 1535 bio_io_error(bio); 1536 break; 1537 } 1538 1539 if (tc->origin_dev) { 1540 remap_to_origin_and_issue(tc, bio); 1541 break; 1542 } 1543 1544 zero_fill_bio(bio); 1545 bio_endio(bio, 0); 1546 break; 1547 1548 default: 1549 DMERR("dm_thin_find_block() failed, error = %d", r); 1550 bio_io_error(bio); 1551 break; 1552 } 1553 } 1554 1555 static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1556 { 1557 bio_io_error(bio); 1558 } 1559 1560 static int need_commit_due_to_time(struct pool *pool) 1561 { 1562 return jiffies < pool->last_commit_jiffies || 1563 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1564 } 1565 1566 static void process_deferred_bios(struct pool *pool) 1567 { 1568 unsigned long flags; 1569 struct bio *bio; 1570 struct bio_list bios; 1571 1572 bio_list_init(&bios); 1573 1574 spin_lock_irqsave(&pool->lock, flags); 1575 bio_list_merge(&bios, 
&pool->deferred_bios); 1576 bio_list_init(&pool->deferred_bios); 1577 spin_unlock_irqrestore(&pool->lock, flags); 1578 1579 while ((bio = bio_list_pop(&bios))) { 1580 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1581 struct thin_c *tc = h->tc; 1582 1583 /* 1584 * If we've got no free new_mapping structs, and processing 1585 * this bio might require one, we pause until there are some 1586 * prepared mappings to process. 1587 */ 1588 if (ensure_next_mapping(pool)) { 1589 spin_lock_irqsave(&pool->lock, flags); 1590 bio_list_merge(&pool->deferred_bios, &bios); 1591 spin_unlock_irqrestore(&pool->lock, flags); 1592 1593 break; 1594 } 1595 1596 if (bio->bi_rw & REQ_DISCARD) 1597 pool->process_discard(tc, bio); 1598 else 1599 pool->process_bio(tc, bio); 1600 } 1601 1602 /* 1603 * If there are any deferred flush bios, we must commit 1604 * the metadata before issuing them. 1605 */ 1606 bio_list_init(&bios); 1607 spin_lock_irqsave(&pool->lock, flags); 1608 bio_list_merge(&bios, &pool->deferred_flush_bios); 1609 bio_list_init(&pool->deferred_flush_bios); 1610 spin_unlock_irqrestore(&pool->lock, flags); 1611 1612 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1613 return; 1614 1615 if (commit_or_fallback(pool)) { 1616 while ((bio = bio_list_pop(&bios))) 1617 bio_io_error(bio); 1618 return; 1619 } 1620 pool->last_commit_jiffies = jiffies; 1621 1622 while ((bio = bio_list_pop(&bios))) 1623 generic_make_request(bio); 1624 } 1625 1626 static void do_worker(struct work_struct *ws) 1627 { 1628 struct pool *pool = container_of(ws, struct pool, worker); 1629 1630 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); 1631 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); 1632 process_deferred_bios(pool); 1633 } 1634 1635 /* 1636 * We want to commit periodically so that not too much 1637 * unwritten data builds up. 
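 *
 * COMMIT_PERIOD is HZ jiffies, i.e. roughly one second, so do_waker()
 * below kicks the worker about once a second, and need_commit_due_to_time()
 * above forces a commit whenever that much time has passed since the last
 * one (the jiffies < last_commit_jiffies test copes with jiffies
 * wrap-around).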
1638 */ 1639 static void do_waker(struct work_struct *ws) 1640 { 1641 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 1642 wake_worker(pool); 1643 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 1644 } 1645 1646 /*----------------------------------------------------------------*/ 1647 1648 static enum pool_mode get_pool_mode(struct pool *pool) 1649 { 1650 return pool->pf.mode; 1651 } 1652 1653 static void set_pool_mode(struct pool *pool, enum pool_mode mode) 1654 { 1655 int r; 1656 1657 pool->pf.mode = mode; 1658 1659 switch (mode) { 1660 case PM_FAIL: 1661 DMERR("switching pool to failure mode"); 1662 pool->process_bio = process_bio_fail; 1663 pool->process_discard = process_bio_fail; 1664 pool->process_prepared_mapping = process_prepared_mapping_fail; 1665 pool->process_prepared_discard = process_prepared_discard_fail; 1666 break; 1667 1668 case PM_READ_ONLY: 1669 DMERR("switching pool to read-only mode"); 1670 r = dm_pool_abort_metadata(pool->pmd); 1671 if (r) { 1672 DMERR("aborting transaction failed"); 1673 set_pool_mode(pool, PM_FAIL); 1674 } else { 1675 dm_pool_metadata_read_only(pool->pmd); 1676 pool->process_bio = process_bio_read_only; 1677 pool->process_discard = process_discard; 1678 pool->process_prepared_mapping = process_prepared_mapping_fail; 1679 pool->process_prepared_discard = process_prepared_discard_passdown; 1680 } 1681 break; 1682 1683 case PM_WRITE: 1684 pool->process_bio = process_bio; 1685 pool->process_discard = process_discard; 1686 pool->process_prepared_mapping = process_prepared_mapping; 1687 pool->process_prepared_discard = process_prepared_discard; 1688 break; 1689 } 1690 } 1691 1692 /*----------------------------------------------------------------*/ 1693 1694 /* 1695 * Mapping functions. 1696 */ 1697 1698 /* 1699 * Called only while mapping a thin bio to hand it over to the workqueue. 1700 */ 1701 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 1702 { 1703 unsigned long flags; 1704 struct pool *pool = tc->pool; 1705 1706 spin_lock_irqsave(&pool->lock, flags); 1707 bio_list_add(&pool->deferred_bios, bio); 1708 spin_unlock_irqrestore(&pool->lock, flags); 1709 1710 wake_worker(pool); 1711 } 1712 1713 static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) 1714 { 1715 struct pool *pool = tc->pool; 1716 struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); 1717 1718 h->tc = tc; 1719 h->shared_read_entry = NULL; 1720 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); 1721 h->overwrite_mapping = NULL; 1722 1723 return h; 1724 } 1725 1726 /* 1727 * Non-blocking function called from the thin target's map function. 1728 */ 1729 static int thin_bio_map(struct dm_target *ti, struct bio *bio, 1730 union map_info *map_context) 1731 { 1732 int r; 1733 struct thin_c *tc = ti->private; 1734 dm_block_t block = get_bio_block(tc, bio); 1735 struct dm_thin_device *td = tc->td; 1736 struct dm_thin_lookup_result result; 1737 1738 map_context->ptr = thin_hook_bio(tc, bio); 1739 1740 if (get_pool_mode(tc->pool) == PM_FAIL) { 1741 bio_io_error(bio); 1742 return DM_MAPIO_SUBMITTED; 1743 } 1744 1745 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1746 thin_defer_bio(tc, bio); 1747 return DM_MAPIO_SUBMITTED; 1748 } 1749 1750 r = dm_thin_find_block(td, block, 0, &result); 1751 1752 /* 1753 * Note that we defer readahead too. 
1754 */ 1755 switch (r) { 1756 case 0: 1757 if (unlikely(result.shared)) { 1758 /* 1759 * We have a race condition here between the 1760 * result.shared value returned by the lookup and 1761 * snapshot creation, which may cause new 1762 * sharing. 1763 * 1764 * To avoid this always quiesce the origin before 1765 * taking the snap. You want to do this anyway to 1766 * ensure a consistent application view 1767 * (i.e. lockfs). 1768 * 1769 * More distant ancestors are irrelevant. The 1770 * shared flag will be set in their case. 1771 */ 1772 thin_defer_bio(tc, bio); 1773 r = DM_MAPIO_SUBMITTED; 1774 } else { 1775 remap(tc, bio, result.block); 1776 r = DM_MAPIO_REMAPPED; 1777 } 1778 break; 1779 1780 case -ENODATA: 1781 if (get_pool_mode(tc->pool) == PM_READ_ONLY) { 1782 /* 1783 * This block isn't provisioned, and we have no way 1784 * of doing so. Just error it. 1785 */ 1786 bio_io_error(bio); 1787 r = DM_MAPIO_SUBMITTED; 1788 break; 1789 } 1790 /* fall through */ 1791 1792 case -EWOULDBLOCK: 1793 /* 1794 * In future, the failed dm_thin_find_block above could 1795 * provide the hint to load the metadata into cache. 1796 */ 1797 thin_defer_bio(tc, bio); 1798 r = DM_MAPIO_SUBMITTED; 1799 break; 1800 1801 default: 1802 /* 1803 * Must always call bio_io_error on failure. 1804 * dm_thin_find_block can fail with -EINVAL if the 1805 * pool is switched to fail-io mode. 1806 */ 1807 bio_io_error(bio); 1808 r = DM_MAPIO_SUBMITTED; 1809 break; 1810 } 1811 1812 return r; 1813 } 1814 1815 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1816 { 1817 int r; 1818 unsigned long flags; 1819 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1820 1821 spin_lock_irqsave(&pt->pool->lock, flags); 1822 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1823 spin_unlock_irqrestore(&pt->pool->lock, flags); 1824 1825 if (!r) { 1826 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1827 r = bdi_congested(&q->backing_dev_info, bdi_bits); 1828 } 1829 1830 return r; 1831 } 1832 1833 static void __requeue_bios(struct pool *pool) 1834 { 1835 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1836 bio_list_init(&pool->retry_on_resume_list); 1837 } 1838 1839 /*---------------------------------------------------------------- 1840 * Binding of control targets to a pool object 1841 *--------------------------------------------------------------*/ 1842 static int bind_control_target(struct pool *pool, struct dm_target *ti) 1843 { 1844 struct pool_c *pt = ti->private; 1845 1846 /* 1847 * We want to make sure that degraded pools are never upgraded. 1848 */ 1849 enum pool_mode old_mode = pool->pf.mode; 1850 enum pool_mode new_mode = pt->pf.mode; 1851 1852 if (old_mode > new_mode) 1853 new_mode = old_mode; 1854 1855 pool->ti = ti; 1856 pool->low_water_blocks = pt->low_water_blocks; 1857 pool->pf = pt->pf; 1858 set_pool_mode(pool, new_mode); 1859 1860 /* 1861 * If discard_passdown was enabled verify that the data device 1862 * supports discards. Disable discard_passdown if not; otherwise 1863 * -EOPNOTSUPP will be returned. 1864 */ 1865 /* FIXME: pull this out into a sep fn. 
*/ 1866 if (pt->pf.discard_passdown) { 1867 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1868 if (!q || !blk_queue_discard(q)) { 1869 char buf[BDEVNAME_SIZE]; 1870 DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", 1871 bdevname(pt->data_dev->bdev, buf)); 1872 pool->pf.discard_passdown = 0; 1873 } 1874 } 1875 1876 return 0; 1877 } 1878 1879 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 1880 { 1881 if (pool->ti == ti) 1882 pool->ti = NULL; 1883 } 1884 1885 /*---------------------------------------------------------------- 1886 * Pool creation 1887 *--------------------------------------------------------------*/ 1888 /* Initialize pool features. */ 1889 static void pool_features_init(struct pool_features *pf) 1890 { 1891 pf->mode = PM_WRITE; 1892 pf->zero_new_blocks = 1; 1893 pf->discard_enabled = 1; 1894 pf->discard_passdown = 1; 1895 } 1896 1897 static void __pool_destroy(struct pool *pool) 1898 { 1899 __pool_table_remove(pool); 1900 1901 if (dm_pool_metadata_close(pool->pmd) < 0) 1902 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1903 1904 prison_destroy(pool->prison); 1905 dm_kcopyd_client_destroy(pool->copier); 1906 1907 if (pool->wq) 1908 destroy_workqueue(pool->wq); 1909 1910 if (pool->next_mapping) 1911 mempool_free(pool->next_mapping, pool->mapping_pool); 1912 mempool_destroy(pool->mapping_pool); 1913 mempool_destroy(pool->endio_hook_pool); 1914 kfree(pool); 1915 } 1916 1917 static struct kmem_cache *_new_mapping_cache; 1918 static struct kmem_cache *_endio_hook_cache; 1919 1920 static struct pool *pool_create(struct mapped_device *pool_md, 1921 struct block_device *metadata_dev, 1922 unsigned long block_size, 1923 int read_only, char **error) 1924 { 1925 int r; 1926 void *err_p; 1927 struct pool *pool; 1928 struct dm_pool_metadata *pmd; 1929 bool format_device = read_only ? false : true; 1930 1931 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); 1932 if (IS_ERR(pmd)) { 1933 *error = "Error creating metadata object"; 1934 return (struct pool *)pmd; 1935 } 1936 1937 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 1938 if (!pool) { 1939 *error = "Error allocating memory for pool"; 1940 err_p = ERR_PTR(-ENOMEM); 1941 goto bad_pool; 1942 } 1943 1944 pool->pmd = pmd; 1945 pool->sectors_per_block = block_size; 1946 if (block_size & (block_size - 1)) 1947 pool->sectors_per_block_shift = -1; 1948 else 1949 pool->sectors_per_block_shift = __ffs(block_size); 1950 pool->low_water_blocks = 0; 1951 pool_features_init(&pool->pf); 1952 pool->prison = prison_create(PRISON_CELLS); 1953 if (!pool->prison) { 1954 *error = "Error creating pool's bio prison"; 1955 err_p = ERR_PTR(-ENOMEM); 1956 goto bad_prison; 1957 } 1958 1959 pool->copier = dm_kcopyd_client_create(); 1960 if (IS_ERR(pool->copier)) { 1961 r = PTR_ERR(pool->copier); 1962 *error = "Error creating pool's kcopyd client"; 1963 err_p = ERR_PTR(r); 1964 goto bad_kcopyd_client; 1965 } 1966 1967 /* 1968 * Create singlethreaded workqueue that will service all devices 1969 * that use this metadata. 
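         *
         * Note: alloc_ordered_workqueue() gives us the single-threaded,
         * strictly ordered execution we rely on here, and WQ_MEM_RECLAIM
         * provides a rescuer thread so the worker can keep making forward
         * progress under memory pressure (we sit in the block I/O path).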
1970 */ 1971 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 1972 if (!pool->wq) { 1973 *error = "Error creating pool's workqueue"; 1974 err_p = ERR_PTR(-ENOMEM); 1975 goto bad_wq; 1976 } 1977 1978 INIT_WORK(&pool->worker, do_worker); 1979 INIT_DELAYED_WORK(&pool->waker, do_waker); 1980 spin_lock_init(&pool->lock); 1981 bio_list_init(&pool->deferred_bios); 1982 bio_list_init(&pool->deferred_flush_bios); 1983 INIT_LIST_HEAD(&pool->prepared_mappings); 1984 INIT_LIST_HEAD(&pool->prepared_discards); 1985 pool->low_water_triggered = 0; 1986 pool->no_free_space = 0; 1987 bio_list_init(&pool->retry_on_resume_list); 1988 ds_init(&pool->shared_read_ds); 1989 ds_init(&pool->all_io_ds); 1990 1991 pool->next_mapping = NULL; 1992 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, 1993 _new_mapping_cache); 1994 if (!pool->mapping_pool) { 1995 *error = "Error creating pool's mapping mempool"; 1996 err_p = ERR_PTR(-ENOMEM); 1997 goto bad_mapping_pool; 1998 } 1999 2000 pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE, 2001 _endio_hook_cache); 2002 if (!pool->endio_hook_pool) { 2003 *error = "Error creating pool's endio_hook mempool"; 2004 err_p = ERR_PTR(-ENOMEM); 2005 goto bad_endio_hook_pool; 2006 } 2007 pool->ref_count = 1; 2008 pool->last_commit_jiffies = jiffies; 2009 pool->pool_md = pool_md; 2010 pool->md_dev = metadata_dev; 2011 __pool_table_insert(pool); 2012 2013 return pool; 2014 2015 bad_endio_hook_pool: 2016 mempool_destroy(pool->mapping_pool); 2017 bad_mapping_pool: 2018 destroy_workqueue(pool->wq); 2019 bad_wq: 2020 dm_kcopyd_client_destroy(pool->copier); 2021 bad_kcopyd_client: 2022 prison_destroy(pool->prison); 2023 bad_prison: 2024 kfree(pool); 2025 bad_pool: 2026 if (dm_pool_metadata_close(pmd)) 2027 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 2028 2029 return err_p; 2030 } 2031 2032 static void __pool_inc(struct pool *pool) 2033 { 2034 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2035 pool->ref_count++; 2036 } 2037 2038 static void __pool_dec(struct pool *pool) 2039 { 2040 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2041 BUG_ON(!pool->ref_count); 2042 if (!--pool->ref_count) 2043 __pool_destroy(pool); 2044 } 2045 2046 static struct pool *__pool_find(struct mapped_device *pool_md, 2047 struct block_device *metadata_dev, 2048 unsigned long block_size, int read_only, 2049 char **error, int *created) 2050 { 2051 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 2052 2053 if (pool) { 2054 if (pool->pool_md != pool_md) { 2055 *error = "metadata device already in use by a pool"; 2056 return ERR_PTR(-EBUSY); 2057 } 2058 __pool_inc(pool); 2059 2060 } else { 2061 pool = __pool_table_lookup(pool_md); 2062 if (pool) { 2063 if (pool->md_dev != metadata_dev) { 2064 *error = "different pool cannot replace a pool"; 2065 return ERR_PTR(-EINVAL); 2066 } 2067 __pool_inc(pool); 2068 2069 } else { 2070 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); 2071 *created = 1; 2072 } 2073 } 2074 2075 return pool; 2076 } 2077 2078 /*---------------------------------------------------------------- 2079 * Pool target methods 2080 *--------------------------------------------------------------*/ 2081 static void pool_dtr(struct dm_target *ti) 2082 { 2083 struct pool_c *pt = ti->private; 2084 2085 mutex_lock(&dm_thin_pool_table.mutex); 2086 2087 unbind_control_target(pt->pool, ti); 2088 __pool_dec(pt->pool); 2089 dm_put_device(ti, pt->metadata_dev); 2090 dm_put_device(ti, pt->data_dev); 
2091 kfree(pt); 2092 2093 mutex_unlock(&dm_thin_pool_table.mutex); 2094 } 2095 2096 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 2097 struct dm_target *ti) 2098 { 2099 int r; 2100 unsigned argc; 2101 const char *arg_name; 2102 2103 static struct dm_arg _args[] = { 2104 {0, 4, "Invalid number of pool feature arguments"}, 2105 }; 2106 2107 /* 2108 * No feature arguments supplied. 2109 */ 2110 if (!as->argc) 2111 return 0; 2112 2113 r = dm_read_arg_group(_args, as, &argc, &ti->error); 2114 if (r) 2115 return -EINVAL; 2116 2117 while (argc && !r) { 2118 arg_name = dm_shift_arg(as); 2119 argc--; 2120 2121 if (!strcasecmp(arg_name, "skip_block_zeroing")) 2122 pf->zero_new_blocks = 0; 2123 2124 else if (!strcasecmp(arg_name, "ignore_discard")) 2125 pf->discard_enabled = 0; 2126 2127 else if (!strcasecmp(arg_name, "no_discard_passdown")) 2128 pf->discard_passdown = 0; 2129 2130 else if (!strcasecmp(arg_name, "read_only")) 2131 pf->mode = PM_READ_ONLY; 2132 2133 else { 2134 ti->error = "Unrecognised pool feature requested"; 2135 r = -EINVAL; 2136 break; 2137 } 2138 } 2139 2140 return r; 2141 } 2142 2143 /* 2144 * thin-pool <metadata dev> <data dev> 2145 * <data block size (sectors)> 2146 * <low water mark (blocks)> 2147 * [<#feature args> [<arg>]*] 2148 * 2149 * Optional feature arguments are: 2150 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 2151 * ignore_discard: disable discard 2152 * no_discard_passdown: don't pass discards down to the data device * read_only: don't allow any changes to be made to the pool metadata 2153 */ 2154 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 2155 { 2156 int r, pool_created = 0; 2157 struct pool_c *pt; 2158 struct pool *pool; 2159 struct pool_features pf; 2160 struct dm_arg_set as; 2161 struct dm_dev *data_dev; 2162 unsigned long block_size; 2163 dm_block_t low_water_blocks; 2164 struct dm_dev *metadata_dev; 2165 sector_t metadata_dev_size; 2166 char b[BDEVNAME_SIZE]; 2167 2168 /* 2169 * FIXME Remove validation from scope of lock. 2170 */ 2171 mutex_lock(&dm_thin_pool_table.mutex); 2172 2173 if (argc < 4) { 2174 ti->error = "Invalid argument count"; 2175 r = -EINVAL; 2176 goto out_unlock; 2177 } 2178 as.argc = argc; 2179 as.argv = argv; 2180 2181 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); 2182 if (r) { 2183 ti->error = "Error opening metadata block device"; 2184 goto out_unlock; 2185 } 2186 2187 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 2188 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 2189 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2190 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2191 2192 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 2193 if (r) { 2194 ti->error = "Error getting data device"; 2195 goto out_metadata; 2196 } 2197 2198 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 2199 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2200 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2201 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2202 ti->error = "Invalid block size"; 2203 r = -EINVAL; 2204 goto out; 2205 } 2206 2207 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 2208 ti->error = "Invalid low water mark"; 2209 r = -EINVAL; 2210 goto out; 2211 } 2212 2213 /* 2214 * Set default pool features.
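 *
 * For example (device names and sizes are illustrative only), a pool using
 * 64KiB blocks (128 sectors), a low water mark of 32768 blocks and block
 * zeroing disabled might be loaded with a table line such as:
 *
 *   0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing
 *
 * The defaults initialised here are then overridden by whatever feature
 * arguments parse_pool_features() finds below.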
2215 */ 2216 pool_features_init(&pf); 2217 2218 dm_consume_args(&as, 4); 2219 r = parse_pool_features(&as, &pf, ti); 2220 if (r) 2221 goto out; 2222 2223 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 2224 if (!pt) { 2225 r = -ENOMEM; 2226 goto out; 2227 } 2228 2229 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 2230 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); 2231 if (IS_ERR(pool)) { 2232 r = PTR_ERR(pool); 2233 goto out_free_pt; 2234 } 2235 2236 /* 2237 * 'pool_created' reflects whether this is the first table load. 2238 * Top level discard support is not allowed to be changed after 2239 * initial load. This would require a pool reload to trigger thin 2240 * device changes. 2241 */ 2242 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 2243 ti->error = "Discard support cannot be disabled once enabled"; 2244 r = -EINVAL; 2245 goto out_flags_changed; 2246 } 2247 2248 /* 2249 * The block layer requires discard_granularity to be a power of 2. 2250 */ 2251 if (pf.discard_enabled && !is_power_of_2(block_size)) { 2252 ti->error = "Discard support must be disabled when the block size is not a power of 2"; 2253 r = -EINVAL; 2254 goto out_flags_changed; 2255 } 2256 2257 pt->pool = pool; 2258 pt->ti = ti; 2259 pt->metadata_dev = metadata_dev; 2260 pt->data_dev = data_dev; 2261 pt->low_water_blocks = low_water_blocks; 2262 pt->pf = pf; 2263 ti->num_flush_requests = 1; 2264 /* 2265 * Only need to enable discards if the pool should pass 2266 * them down to the data device. The thin device's discard 2267 * processing will cause mappings to be removed from the btree. 2268 */ 2269 if (pf.discard_enabled && pf.discard_passdown) { 2270 ti->num_discard_requests = 1; 2271 /* 2272 * Setting 'discards_supported' circumvents the normal 2273 * stacking of discard limits (this keeps the pool and 2274 * thin devices' discard limits consistent). 2275 */ 2276 ti->discards_supported = true; 2277 } 2278 ti->private = pt; 2279 2280 pt->callbacks.congested_fn = pool_is_congested; 2281 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2282 2283 mutex_unlock(&dm_thin_pool_table.mutex); 2284 2285 return 0; 2286 2287 out_flags_changed: 2288 __pool_dec(pool); 2289 out_free_pt: 2290 kfree(pt); 2291 out: 2292 dm_put_device(ti, data_dev); 2293 out_metadata: 2294 dm_put_device(ti, metadata_dev); 2295 out_unlock: 2296 mutex_unlock(&dm_thin_pool_table.mutex); 2297 2298 return r; 2299 } 2300 2301 static int pool_map(struct dm_target *ti, struct bio *bio, 2302 union map_info *map_context) 2303 { 2304 int r; 2305 struct pool_c *pt = ti->private; 2306 struct pool *pool = pt->pool; 2307 unsigned long flags; 2308 2309 /* 2310 * As this is a singleton target, ti->begin is always zero. 2311 */ 2312 spin_lock_irqsave(&pool->lock, flags); 2313 bio->bi_bdev = pt->data_dev->bdev; 2314 r = DM_MAPIO_REMAPPED; 2315 spin_unlock_irqrestore(&pool->lock, flags); 2316 2317 return r; 2318 } 2319 2320 /* 2321 * Retrieves the number of blocks of the data device from 2322 * the superblock and compares it to the actual device size, 2323 * thus resizing the data device in case it has grown. 2324 * 2325 * This both copes with opening preallocated data devices in the ctr 2326 * being followed by a resume 2327 * -and- 2328 * calling the resume method individually after userspace has 2329 * grown the data device in reaction to a table event. 
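 *
 * As a rough sketch (device names illustrative), userspace reacting to a
 * low water mark event typically extends the data device and then does:
 *
 *   dmsetup suspend pool
 *   dmsetup reload pool --table "0 <new_length> thin-pool <meta> <data> 128 32768"
 *   dmsetup resume pool
 *
 * The resume ends up here, where the growth is detected and the metadata's
 * record of the data device size is updated to match.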
2330 */ 2331 static int pool_preresume(struct dm_target *ti) 2332 { 2333 int r; 2334 struct pool_c *pt = ti->private; 2335 struct pool *pool = pt->pool; 2336 sector_t data_size = ti->len; 2337 dm_block_t sb_data_size; 2338 2339 /* 2340 * Take control of the pool object. 2341 */ 2342 r = bind_control_target(pool, ti); 2343 if (r) 2344 return r; 2345 2346 (void) sector_div(data_size, pool->sectors_per_block); 2347 2348 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2349 if (r) { 2350 DMERR("failed to retrieve data device size"); 2351 return r; 2352 } 2353 2354 if (data_size < sb_data_size) { 2355 DMERR("pool target too small, is %llu blocks (expected %llu)", 2356 (unsigned long long)data_size, sb_data_size); 2357 return -EINVAL; 2358 2359 } else if (data_size > sb_data_size) { 2360 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2361 if (r) { 2362 DMERR("failed to resize data device"); 2363 /* FIXME Stricter than necessary: Rollback transaction instead here */ 2364 set_pool_mode(pool, PM_READ_ONLY); 2365 return r; 2366 } 2367 2368 (void) commit_or_fallback(pool); 2369 } 2370 2371 return 0; 2372 } 2373 2374 static void pool_resume(struct dm_target *ti) 2375 { 2376 struct pool_c *pt = ti->private; 2377 struct pool *pool = pt->pool; 2378 unsigned long flags; 2379 2380 spin_lock_irqsave(&pool->lock, flags); 2381 pool->low_water_triggered = 0; 2382 pool->no_free_space = 0; 2383 __requeue_bios(pool); 2384 spin_unlock_irqrestore(&pool->lock, flags); 2385 2386 do_waker(&pool->waker.work); 2387 } 2388 2389 static void pool_postsuspend(struct dm_target *ti) 2390 { 2391 struct pool_c *pt = ti->private; 2392 struct pool *pool = pt->pool; 2393 2394 cancel_delayed_work(&pool->waker); 2395 flush_workqueue(pool->wq); 2396 (void) commit_or_fallback(pool); 2397 } 2398 2399 static int check_arg_count(unsigned argc, unsigned args_required) 2400 { 2401 if (argc != args_required) { 2402 DMWARN("Message received with %u arguments instead of %u.", 2403 argc, args_required); 2404 return -EINVAL; 2405 } 2406 2407 return 0; 2408 } 2409 2410 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) 2411 { 2412 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && 2413 *dev_id <= MAX_DEV_ID) 2414 return 0; 2415 2416 if (warning) 2417 DMWARN("Message received with invalid device id: %s", arg); 2418 2419 return -EINVAL; 2420 } 2421 2422 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) 2423 { 2424 dm_thin_id dev_id; 2425 int r; 2426 2427 r = check_arg_count(argc, 2); 2428 if (r) 2429 return r; 2430 2431 r = read_dev_id(argv[1], &dev_id, 1); 2432 if (r) 2433 return r; 2434 2435 r = dm_pool_create_thin(pool->pmd, dev_id); 2436 if (r) { 2437 DMWARN("Creation of new thinly-provisioned device with id %s failed.", 2438 argv[1]); 2439 return r; 2440 } 2441 2442 return 0; 2443 } 2444 2445 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2446 { 2447 dm_thin_id dev_id; 2448 dm_thin_id origin_dev_id; 2449 int r; 2450 2451 r = check_arg_count(argc, 3); 2452 if (r) 2453 return r; 2454 2455 r = read_dev_id(argv[1], &dev_id, 1); 2456 if (r) 2457 return r; 2458 2459 r = read_dev_id(argv[2], &origin_dev_id, 1); 2460 if (r) 2461 return r; 2462 2463 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); 2464 if (r) { 2465 DMWARN("Creation of new snapshot %s of device %s failed.", 2466 argv[1], argv[2]); 2467 return r; 2468 } 2469 2470 return 0; 2471 } 2472 2473 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) 2474 { 
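	/*
	 * Invoked for e.g. "dmsetup message <pool> 0 delete <dev_id>" (names
	 * illustrative).  Deleting a device that is still open is expected
	 * to fail, so any thin target using this dev_id should be
	 * deactivated first.
	 */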
2475 dm_thin_id dev_id; 2476 int r; 2477 2478 r = check_arg_count(argc, 2); 2479 if (r) 2480 return r; 2481 2482 r = read_dev_id(argv[1], &dev_id, 1); 2483 if (r) 2484 return r; 2485 2486 r = dm_pool_delete_thin_device(pool->pmd, dev_id); 2487 if (r) 2488 DMWARN("Deletion of thin device %s failed.", argv[1]); 2489 2490 return r; 2491 } 2492 2493 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) 2494 { 2495 dm_thin_id old_id, new_id; 2496 int r; 2497 2498 r = check_arg_count(argc, 3); 2499 if (r) 2500 return r; 2501 2502 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { 2503 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); 2504 return -EINVAL; 2505 } 2506 2507 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { 2508 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); 2509 return -EINVAL; 2510 } 2511 2512 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); 2513 if (r) { 2514 DMWARN("Failed to change transaction id from %s to %s.", 2515 argv[1], argv[2]); 2516 return r; 2517 } 2518 2519 return 0; 2520 } 2521 2522 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2523 { 2524 int r; 2525 2526 r = check_arg_count(argc, 1); 2527 if (r) 2528 return r; 2529 2530 (void) commit_or_fallback(pool); 2531 2532 r = dm_pool_reserve_metadata_snap(pool->pmd); 2533 if (r) 2534 DMWARN("reserve_metadata_snap message failed."); 2535 2536 return r; 2537 } 2538 2539 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2540 { 2541 int r; 2542 2543 r = check_arg_count(argc, 1); 2544 if (r) 2545 return r; 2546 2547 r = dm_pool_release_metadata_snap(pool->pmd); 2548 if (r) 2549 DMWARN("release_metadata_snap message failed."); 2550 2551 return r; 2552 } 2553 2554 /* 2555 * Messages supported: 2556 * create_thin <dev_id> 2557 * create_snap <dev_id> <origin_id> 2558 * delete <dev_id> 2559 * trim <dev_id> <new_size_in_sectors> 2560 * set_transaction_id <current_trans_id> <new_trans_id> 2561 * reserve_metadata_snap 2562 * release_metadata_snap 2563 */ 2564 static int pool_message(struct dm_target *ti, unsigned argc, char **argv) 2565 { 2566 int r = -EINVAL; 2567 struct pool_c *pt = ti->private; 2568 struct pool *pool = pt->pool; 2569 2570 if (!strcasecmp(argv[0], "create_thin")) 2571 r = process_create_thin_mesg(argc, argv, pool); 2572 2573 else if (!strcasecmp(argv[0], "create_snap")) 2574 r = process_create_snap_mesg(argc, argv, pool); 2575 2576 else if (!strcasecmp(argv[0], "delete")) 2577 r = process_delete_mesg(argc, argv, pool); 2578 2579 else if (!strcasecmp(argv[0], "set_transaction_id")) 2580 r = process_set_transaction_id_mesg(argc, argv, pool); 2581 2582 else if (!strcasecmp(argv[0], "reserve_metadata_snap")) 2583 r = process_reserve_metadata_snap_mesg(argc, argv, pool); 2584 2585 else if (!strcasecmp(argv[0], "release_metadata_snap")) 2586 r = process_release_metadata_snap_mesg(argc, argv, pool); 2587 2588 else 2589 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2590 2591 if (!r) 2592 (void) commit_or_fallback(pool); 2593 2594 return r; 2595 } 2596 2597 static void emit_flags(struct pool_features *pf, char *result, 2598 unsigned sz, unsigned maxlen) 2599 { 2600 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + 2601 !pf->discard_passdown + (pf->mode == PM_READ_ONLY); 2602 DMEMIT("%u ", count); 2603 2604 if (!pf->zero_new_blocks) 2605 DMEMIT("skip_block_zeroing "); 2606 2607 if 
(!pf->discard_enabled) 2608 DMEMIT("ignore_discard "); 2609 2610 if (!pf->discard_passdown) 2611 DMEMIT("no_discard_passdown "); 2612 2613 if (pf->mode == PM_READ_ONLY) 2614 DMEMIT("read_only "); 2615 } 2616 2617 /* 2618 * Status line is: 2619 * <transaction id> <used metadata sectors>/<total metadata sectors> 2620 * <used data sectors>/<total data sectors> <held metadata root> 2621 */ 2622 static int pool_status(struct dm_target *ti, status_type_t type, 2623 unsigned status_flags, char *result, unsigned maxlen) 2624 { 2625 int r; 2626 unsigned sz = 0; 2627 uint64_t transaction_id; 2628 dm_block_t nr_free_blocks_data; 2629 dm_block_t nr_free_blocks_metadata; 2630 dm_block_t nr_blocks_data; 2631 dm_block_t nr_blocks_metadata; 2632 dm_block_t held_root; 2633 char buf[BDEVNAME_SIZE]; 2634 char buf2[BDEVNAME_SIZE]; 2635 struct pool_c *pt = ti->private; 2636 struct pool *pool = pt->pool; 2637 2638 switch (type) { 2639 case STATUSTYPE_INFO: 2640 if (get_pool_mode(pool) == PM_FAIL) { 2641 DMEMIT("Fail"); 2642 break; 2643 } 2644 2645 /* Commit to ensure statistics aren't out-of-date */ 2646 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 2647 (void) commit_or_fallback(pool); 2648 2649 r = dm_pool_get_metadata_transaction_id(pool->pmd, 2650 &transaction_id); 2651 if (r) 2652 return r; 2653 2654 r = dm_pool_get_free_metadata_block_count(pool->pmd, 2655 &nr_free_blocks_metadata); 2656 if (r) 2657 return r; 2658 2659 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2660 if (r) 2661 return r; 2662 2663 r = dm_pool_get_free_block_count(pool->pmd, 2664 &nr_free_blocks_data); 2665 if (r) 2666 return r; 2667 2668 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2669 if (r) 2670 return r; 2671 2672 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2673 if (r) 2674 return r; 2675 2676 DMEMIT("%llu %llu/%llu %llu/%llu ", 2677 (unsigned long long)transaction_id, 2678 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2679 (unsigned long long)nr_blocks_metadata, 2680 (unsigned long long)(nr_blocks_data - nr_free_blocks_data), 2681 (unsigned long long)nr_blocks_data); 2682 2683 if (held_root) 2684 DMEMIT("%llu ", held_root); 2685 else 2686 DMEMIT("- "); 2687 2688 if (pool->pf.mode == PM_READ_ONLY) 2689 DMEMIT("ro "); 2690 else 2691 DMEMIT("rw "); 2692 2693 if (pool->pf.discard_enabled && pool->pf.discard_passdown) 2694 DMEMIT("discard_passdown"); 2695 else 2696 DMEMIT("no_discard_passdown"); 2697 2698 break; 2699 2700 case STATUSTYPE_TABLE: 2701 DMEMIT("%s %s %lu %llu ", 2702 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), 2703 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 2704 (unsigned long)pool->sectors_per_block, 2705 (unsigned long long)pt->low_water_blocks); 2706 emit_flags(&pt->pf, result, sz, maxlen); 2707 break; 2708 } 2709 2710 return 0; 2711 } 2712 2713 static int pool_iterate_devices(struct dm_target *ti, 2714 iterate_devices_callout_fn fn, void *data) 2715 { 2716 struct pool_c *pt = ti->private; 2717 2718 return fn(ti, pt->data_dev, 0, ti->len, data); 2719 } 2720 2721 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 2722 struct bio_vec *biovec, int max_size) 2723 { 2724 struct pool_c *pt = ti->private; 2725 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 2726 2727 if (!q->merge_bvec_fn) 2728 return max_size; 2729 2730 bvm->bi_bdev = pt->data_dev->bdev; 2731 2732 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2733 } 2734 2735 static void set_discard_limits(struct pool *pool, struct 
queue_limits *limits) 2736 { 2737 /* 2738 * FIXME: these limits may be incompatible with the pool's data device 2739 */ 2740 limits->max_discard_sectors = pool->sectors_per_block; 2741 2742 /* 2743 * This is just a hint, and not enforced. We have to cope with 2744 * bios that cover a block partially. A discard that spans a block 2745 * boundary is not sent to this target. 2746 */ 2747 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2748 limits->discard_zeroes_data = pool->pf.zero_new_blocks; 2749 } 2750 2751 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2752 { 2753 struct pool_c *pt = ti->private; 2754 struct pool *pool = pt->pool; 2755 2756 blk_limits_io_min(limits, 0); 2757 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2758 if (pool->pf.discard_enabled) 2759 set_discard_limits(pool, limits); 2760 } 2761 2762 static struct target_type pool_target = { 2763 .name = "thin-pool", 2764 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2765 DM_TARGET_IMMUTABLE, 2766 .version = {1, 3, 0}, 2767 .module = THIS_MODULE, 2768 .ctr = pool_ctr, 2769 .dtr = pool_dtr, 2770 .map = pool_map, 2771 .postsuspend = pool_postsuspend, 2772 .preresume = pool_preresume, 2773 .resume = pool_resume, 2774 .message = pool_message, 2775 .status = pool_status, 2776 .merge = pool_merge, 2777 .iterate_devices = pool_iterate_devices, 2778 .io_hints = pool_io_hints, 2779 }; 2780 2781 /*---------------------------------------------------------------- 2782 * Thin target methods 2783 *--------------------------------------------------------------*/ 2784 static void thin_dtr(struct dm_target *ti) 2785 { 2786 struct thin_c *tc = ti->private; 2787 2788 mutex_lock(&dm_thin_pool_table.mutex); 2789 2790 __pool_dec(tc->pool); 2791 dm_pool_close_thin_device(tc->td); 2792 dm_put_device(ti, tc->pool_dev); 2793 if (tc->origin_dev) 2794 dm_put_device(ti, tc->origin_dev); 2795 kfree(tc); 2796 2797 mutex_unlock(&dm_thin_pool_table.mutex); 2798 } 2799 2800 /* 2801 * Thin target parameters: 2802 * 2803 * <pool_dev> <dev_id> [origin_dev] 2804 * 2805 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2806 * dev_id: the internal device identifier 2807 * origin_dev: a device external to the pool that should act as the origin 2808 * 2809 * If the pool device has discards disabled, they get disabled for the thin 2810 * device as well. 
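 *
 * For example (pool name, sizes and device ids are illustrative), a new
 * thin volume and a snapshot of it might be set up roughly like:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *   dmsetup suspend /dev/mapper/thin
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup resume /dev/mapper/thin
 *   dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
 *
 * The three-argument form is for an origin external to the pool, e.g.
 * "0 2097152 thin /dev/mapper/pool 2 /dev/mapper/image".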
2811 */ 2812 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2813 { 2814 int r; 2815 struct thin_c *tc; 2816 struct dm_dev *pool_dev, *origin_dev; 2817 struct mapped_device *pool_md; 2818 2819 mutex_lock(&dm_thin_pool_table.mutex); 2820 2821 if (argc != 2 && argc != 3) { 2822 ti->error = "Invalid argument count"; 2823 r = -EINVAL; 2824 goto out_unlock; 2825 } 2826 2827 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); 2828 if (!tc) { 2829 ti->error = "Out of memory"; 2830 r = -ENOMEM; 2831 goto out_unlock; 2832 } 2833 2834 if (argc == 3) { 2835 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 2836 if (r) { 2837 ti->error = "Error opening origin device"; 2838 goto bad_origin_dev; 2839 } 2840 tc->origin_dev = origin_dev; 2841 } 2842 2843 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2844 if (r) { 2845 ti->error = "Error opening pool device"; 2846 goto bad_pool_dev; 2847 } 2848 tc->pool_dev = pool_dev; 2849 2850 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { 2851 ti->error = "Invalid device id"; 2852 r = -EINVAL; 2853 goto bad_common; 2854 } 2855 2856 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); 2857 if (!pool_md) { 2858 ti->error = "Couldn't get pool mapped device"; 2859 r = -EINVAL; 2860 goto bad_common; 2861 } 2862 2863 tc->pool = __pool_table_lookup(pool_md); 2864 if (!tc->pool) { 2865 ti->error = "Couldn't find pool object"; 2866 r = -EINVAL; 2867 goto bad_pool_lookup; 2868 } 2869 __pool_inc(tc->pool); 2870 2871 if (get_pool_mode(tc->pool) == PM_FAIL) { 2872 ti->error = "Couldn't open thin device, Pool is in fail mode"; r = -EINVAL; 2873 goto bad_thin_open; 2874 } 2875 2876 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 2877 if (r) { 2878 ti->error = "Couldn't open thin internal device"; 2879 goto bad_thin_open; 2880 } 2881 2882 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 2883 if (r) { 2884 dm_pool_close_thin_device(tc->td); goto bad_thin_open; } 2885 2886 ti->num_flush_requests = 1; 2887 ti->flush_supported = true; 2888 2889 /* In case the pool supports discards, pass them on.
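 * The flags below tell core dm to hand discards to this target and, via
 * split_discard_requests, to split them on the max_io_len set above, so
 * thin_bio_map() should never see a discard spanning two pool blocks.
 * discard_zeroes_data_unsupported is set because a discard that only
 * partially covers a block leaves the mapping (and the old data) intact,
 * so zeroed data cannot be promised for every discarded range.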
*/ 2890 if (tc->pool->pf.discard_enabled) { 2891 ti->discards_supported = true; 2892 ti->num_discard_requests = 1; 2893 ti->discard_zeroes_data_unsupported = true; 2894 /* Discard requests must be split on a block boundary */ 2895 ti->split_discard_requests = true; 2896 } 2897 2898 dm_put(pool_md); 2899 2900 mutex_unlock(&dm_thin_pool_table.mutex); 2901 2902 return 0; 2903 2904 bad_thin_open: 2905 __pool_dec(tc->pool); 2906 bad_pool_lookup: 2907 dm_put(pool_md); 2908 bad_common: 2909 dm_put_device(ti, tc->pool_dev); 2910 bad_pool_dev: 2911 if (tc->origin_dev) 2912 dm_put_device(ti, tc->origin_dev); 2913 bad_origin_dev: 2914 kfree(tc); 2915 out_unlock: 2916 mutex_unlock(&dm_thin_pool_table.mutex); 2917 2918 return r; 2919 } 2920 2921 static int thin_map(struct dm_target *ti, struct bio *bio, 2922 union map_info *map_context) 2923 { 2924 bio->bi_sector = dm_target_offset(ti, bio->bi_sector); 2925 2926 return thin_bio_map(ti, bio, map_context); 2927 } 2928 2929 static int thin_endio(struct dm_target *ti, 2930 struct bio *bio, int err, 2931 union map_info *map_context) 2932 { 2933 unsigned long flags; 2934 struct dm_thin_endio_hook *h = map_context->ptr; 2935 struct list_head work; 2936 struct dm_thin_new_mapping *m, *tmp; 2937 struct pool *pool = h->tc->pool; 2938 2939 if (h->shared_read_entry) { 2940 INIT_LIST_HEAD(&work); 2941 ds_dec(h->shared_read_entry, &work); 2942 2943 spin_lock_irqsave(&pool->lock, flags); 2944 list_for_each_entry_safe(m, tmp, &work, list) { 2945 list_del(&m->list); 2946 m->quiesced = 1; 2947 __maybe_add_mapping(m); 2948 } 2949 spin_unlock_irqrestore(&pool->lock, flags); 2950 } 2951 2952 if (h->all_io_entry) { 2953 INIT_LIST_HEAD(&work); 2954 ds_dec(h->all_io_entry, &work); 2955 spin_lock_irqsave(&pool->lock, flags); 2956 list_for_each_entry_safe(m, tmp, &work, list) 2957 list_add(&m->list, &pool->prepared_discards); 2958 spin_unlock_irqrestore(&pool->lock, flags); 2959 } 2960 2961 mempool_free(h, pool->endio_hook_pool); 2962 2963 return 0; 2964 } 2965 2966 static void thin_postsuspend(struct dm_target *ti) 2967 { 2968 if (dm_noflush_suspending(ti)) 2969 requeue_io((struct thin_c *)ti->private); 2970 } 2971 2972 /* 2973 * <nr mapped sectors> <highest mapped sector> 2974 */ 2975 static int thin_status(struct dm_target *ti, status_type_t type, 2976 unsigned status_flags, char *result, unsigned maxlen) 2977 { 2978 int r; 2979 ssize_t sz = 0; 2980 dm_block_t mapped, highest; 2981 char buf[BDEVNAME_SIZE]; 2982 struct thin_c *tc = ti->private; 2983 2984 if (get_pool_mode(tc->pool) == PM_FAIL) { 2985 DMEMIT("Fail"); 2986 return 0; 2987 } 2988 2989 if (!tc->td) 2990 DMEMIT("-"); 2991 else { 2992 switch (type) { 2993 case STATUSTYPE_INFO: 2994 r = dm_thin_get_mapped_count(tc->td, &mapped); 2995 if (r) 2996 return r; 2997 2998 r = dm_thin_get_highest_mapped_block(tc->td, &highest); 2999 if (r < 0) 3000 return r; 3001 3002 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); 3003 if (r) 3004 DMEMIT("%llu", ((highest + 1) * 3005 tc->pool->sectors_per_block) - 1); 3006 else 3007 DMEMIT("-"); 3008 break; 3009 3010 case STATUSTYPE_TABLE: 3011 DMEMIT("%s %lu", 3012 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 3013 (unsigned long) tc->dev_id); 3014 if (tc->origin_dev) 3015 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); 3016 break; 3017 } 3018 } 3019 3020 return 0; 3021 } 3022 3023 static int thin_iterate_devices(struct dm_target *ti, 3024 iterate_devices_callout_fn fn, void *data) 3025 { 3026 sector_t blocks; 3027 struct thin_c *tc = ti->private; 3028 struct pool 
*pool = tc->pool; 3029 3030 /* 3031 * We can't call dm_pool_get_data_dev_size() since that blocks. So 3032 * we follow a more convoluted path through to the pool's target. 3033 */ 3034 if (!pool->ti) 3035 return 0; /* nothing is bound */ 3036 3037 blocks = pool->ti->len; 3038 (void) sector_div(blocks, pool->sectors_per_block); 3039 if (blocks) 3040 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); 3041 3042 return 0; 3043 } 3044 3045 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 3046 { 3047 struct thin_c *tc = ti->private; 3048 struct pool *pool = tc->pool; 3049 3050 blk_limits_io_min(limits, 0); 3051 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 3052 set_discard_limits(pool, limits); 3053 } 3054 3055 static struct target_type thin_target = { 3056 .name = "thin", 3057 .version = {1, 3, 0}, 3058 .module = THIS_MODULE, 3059 .ctr = thin_ctr, 3060 .dtr = thin_dtr, 3061 .map = thin_map, 3062 .end_io = thin_endio, 3063 .postsuspend = thin_postsuspend, 3064 .status = thin_status, 3065 .iterate_devices = thin_iterate_devices, 3066 .io_hints = thin_io_hints, 3067 }; 3068 3069 /*----------------------------------------------------------------*/ 3070 3071 static int __init dm_thin_init(void) 3072 { 3073 int r; 3074 3075 pool_table_init(); 3076 3077 r = dm_register_target(&thin_target); 3078 if (r) 3079 return r; 3080 3081 r = dm_register_target(&pool_target); 3082 if (r) 3083 goto bad_pool_target; 3084 3085 r = -ENOMEM; 3086 3087 _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); 3088 if (!_cell_cache) 3089 goto bad_cell_cache; 3090 3091 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); 3092 if (!_new_mapping_cache) 3093 goto bad_new_mapping_cache; 3094 3095 _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0); 3096 if (!_endio_hook_cache) 3097 goto bad_endio_hook_cache; 3098 3099 return 0; 3100 3101 bad_endio_hook_cache: 3102 kmem_cache_destroy(_new_mapping_cache); 3103 bad_new_mapping_cache: 3104 kmem_cache_destroy(_cell_cache); 3105 bad_cell_cache: 3106 dm_unregister_target(&pool_target); 3107 bad_pool_target: 3108 dm_unregister_target(&thin_target); 3109 3110 return r; 3111 } 3112 3113 static void dm_thin_exit(void) 3114 { 3115 dm_unregister_target(&thin_target); 3116 dm_unregister_target(&pool_target); 3117 3118 kmem_cache_destroy(_cell_cache); 3119 kmem_cache_destroy(_new_mapping_cache); 3120 kmem_cache_destroy(_endio_hook_cache); 3121 } 3122 3123 module_init(dm_thin_init); 3124 module_exit(dm_thin_exit); 3125 3126 MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); 3127 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3128 MODULE_LICENSE("GPL"); 3129