/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug further io to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, i.e. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is that the timestamp magic isn't perfect,
 * and will continue to think that the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.
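 * For example, with an origin and a single snapshot sharing a block: once
 * a write through the origin has been remapped to a fresh block, only the
 * snapshot still references the old block, yet a later write through the
 * snapshot will still see the block as shared and go through the copy
 * machinery one more time than strictly necessary.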
 * I suspect data blocks will typically be shared by many different
 * devices, so we're breaking sharing n + 1 times, rather than n, where n
 * is the number of devices that reference this data block.  At the moment
 * I think the benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away.  We put it in prison
 * where it can't cause any mischief.  Bios are put in a cell identified
 * by a key; multiple bios can be in the same cell.  When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct dm_bio_prison_cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	struct bio *holder;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

static struct kmem_cache *_cell_cache;

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
						  struct cell_key *key)
{
	struct dm_bio_prison_cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}

/*
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
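 *
 * For illustration only, the calling pattern used elsewhere in this file
 * is roughly:
 *
 *	struct dm_bio_prison_cell *cell;
 *	struct cell_key key;
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(pool->prison, &key, bio, &cell))
 *		return;		(someone else holds the block; our bio is
 *				 now parked in the cell)
 *
 *	...we are the holder, do the work...
 *
 *	cell_release_singleton(cell, bio);   (or cell_release()/cell_error())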
217 */ 218 static int bio_detain(struct bio_prison *prison, struct cell_key *key, 219 struct bio *inmate, struct dm_bio_prison_cell **ref) 220 { 221 int r = 1; 222 unsigned long flags; 223 uint32_t hash = hash_key(prison, key); 224 struct dm_bio_prison_cell *cell, *cell2; 225 226 BUG_ON(hash > prison->nr_buckets); 227 228 spin_lock_irqsave(&prison->lock, flags); 229 230 cell = __search_bucket(prison->cells + hash, key); 231 if (cell) { 232 bio_list_add(&cell->bios, inmate); 233 goto out; 234 } 235 236 /* 237 * Allocate a new cell 238 */ 239 spin_unlock_irqrestore(&prison->lock, flags); 240 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 241 spin_lock_irqsave(&prison->lock, flags); 242 243 /* 244 * We've been unlocked, so we have to double check that 245 * nobody else has inserted this cell in the meantime. 246 */ 247 cell = __search_bucket(prison->cells + hash, key); 248 if (cell) { 249 mempool_free(cell2, prison->cell_pool); 250 bio_list_add(&cell->bios, inmate); 251 goto out; 252 } 253 254 /* 255 * Use new cell. 256 */ 257 cell = cell2; 258 259 cell->prison = prison; 260 memcpy(&cell->key, key, sizeof(cell->key)); 261 cell->holder = inmate; 262 bio_list_init(&cell->bios); 263 hlist_add_head(&cell->list, prison->cells + hash); 264 265 r = 0; 266 267 out: 268 spin_unlock_irqrestore(&prison->lock, flags); 269 270 *ref = cell; 271 272 return r; 273 } 274 275 /* 276 * @inmates must have been initialised prior to this call 277 */ 278 static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) 279 { 280 struct bio_prison *prison = cell->prison; 281 282 hlist_del(&cell->list); 283 284 if (inmates) { 285 bio_list_add(inmates, cell->holder); 286 bio_list_merge(inmates, &cell->bios); 287 } 288 289 mempool_free(cell, prison->cell_pool); 290 } 291 292 static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) 293 { 294 unsigned long flags; 295 struct bio_prison *prison = cell->prison; 296 297 spin_lock_irqsave(&prison->lock, flags); 298 __cell_release(cell, bios); 299 spin_unlock_irqrestore(&prison->lock, flags); 300 } 301 302 /* 303 * There are a couple of places where we put a bio into a cell briefly 304 * before taking it out again. In these situations we know that no other 305 * bio may be in the cell. This function releases the cell, and also does 306 * a sanity check. 307 */ 308 static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) 309 { 310 BUG_ON(cell->holder != bio); 311 BUG_ON(!bio_list_empty(&cell->bios)); 312 313 __cell_release(cell, NULL); 314 } 315 316 static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) 317 { 318 unsigned long flags; 319 struct bio_prison *prison = cell->prison; 320 321 spin_lock_irqsave(&prison->lock, flags); 322 __cell_release_singleton(cell, bio); 323 spin_unlock_irqrestore(&prison->lock, flags); 324 } 325 326 /* 327 * Sometimes we don't want the holder, just the additional bios. 
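 * This is used when the holder was a bio that covered the whole block and
 * has already been processed separately (see cell_defer_except() below);
 * only the bios that queued up behind it still need releasing.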
328 */ 329 static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, 330 struct bio_list *inmates) 331 { 332 struct bio_prison *prison = cell->prison; 333 334 hlist_del(&cell->list); 335 bio_list_merge(inmates, &cell->bios); 336 337 mempool_free(cell, prison->cell_pool); 338 } 339 340 static void cell_release_no_holder(struct dm_bio_prison_cell *cell, 341 struct bio_list *inmates) 342 { 343 unsigned long flags; 344 struct bio_prison *prison = cell->prison; 345 346 spin_lock_irqsave(&prison->lock, flags); 347 __cell_release_no_holder(cell, inmates); 348 spin_unlock_irqrestore(&prison->lock, flags); 349 } 350 351 static void cell_error(struct dm_bio_prison_cell *cell) 352 { 353 struct bio_prison *prison = cell->prison; 354 struct bio_list bios; 355 struct bio *bio; 356 unsigned long flags; 357 358 bio_list_init(&bios); 359 360 spin_lock_irqsave(&prison->lock, flags); 361 __cell_release(cell, &bios); 362 spin_unlock_irqrestore(&prison->lock, flags); 363 364 while ((bio = bio_list_pop(&bios))) 365 bio_io_error(bio); 366 } 367 368 /*----------------------------------------------------------------*/ 369 370 /* 371 * We use the deferred set to keep track of pending reads to shared blocks. 372 * We do this to ensure the new mapping caused by a write isn't performed 373 * until these prior reads have completed. Otherwise the insertion of the 374 * new mapping could free the old block that the read bios are mapped to. 375 */ 376 377 struct deferred_set; 378 struct deferred_entry { 379 struct deferred_set *ds; 380 unsigned count; 381 struct list_head work_items; 382 }; 383 384 struct deferred_set { 385 spinlock_t lock; 386 unsigned current_entry; 387 unsigned sweeper; 388 struct deferred_entry entries[DEFERRED_SET_SIZE]; 389 }; 390 391 static void ds_init(struct deferred_set *ds) 392 { 393 int i; 394 395 spin_lock_init(&ds->lock); 396 ds->current_entry = 0; 397 ds->sweeper = 0; 398 for (i = 0; i < DEFERRED_SET_SIZE; i++) { 399 ds->entries[i].ds = ds; 400 ds->entries[i].count = 0; 401 INIT_LIST_HEAD(&ds->entries[i].work_items); 402 } 403 } 404 405 static struct deferred_entry *ds_inc(struct deferred_set *ds) 406 { 407 unsigned long flags; 408 struct deferred_entry *entry; 409 410 spin_lock_irqsave(&ds->lock, flags); 411 entry = ds->entries + ds->current_entry; 412 entry->count++; 413 spin_unlock_irqrestore(&ds->lock, flags); 414 415 return entry; 416 } 417 418 static unsigned ds_next(unsigned index) 419 { 420 return (index + 1) % DEFERRED_SET_SIZE; 421 } 422 423 static void __sweep(struct deferred_set *ds, struct list_head *head) 424 { 425 while ((ds->sweeper != ds->current_entry) && 426 !ds->entries[ds->sweeper].count) { 427 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 428 ds->sweeper = ds_next(ds->sweeper); 429 } 430 431 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) 432 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 433 } 434 435 static void ds_dec(struct deferred_entry *entry, struct list_head *head) 436 { 437 unsigned long flags; 438 439 spin_lock_irqsave(&entry->ds->lock, flags); 440 BUG_ON(!entry->count); 441 --entry->count; 442 __sweep(entry->ds, head); 443 spin_unlock_irqrestore(&entry->ds->lock, flags); 444 } 445 446 /* 447 * Returns 1 if deferred or 0 if no pending items to delay job. 
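 *
 * In other words the deferred set is a small ring of reference-counted
 * entries: ds_inc() tags in-flight io with the current entry, ds_add_work()
 * parks a work item against it, and ds_dec() sweeps the ring in order,
 * handing parked work back to the caller only once every earlier entry has
 * drained to zero.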
448 */ 449 static int ds_add_work(struct deferred_set *ds, struct list_head *work) 450 { 451 int r = 1; 452 unsigned long flags; 453 unsigned next_entry; 454 455 spin_lock_irqsave(&ds->lock, flags); 456 if ((ds->sweeper == ds->current_entry) && 457 !ds->entries[ds->current_entry].count) 458 r = 0; 459 else { 460 list_add(work, &ds->entries[ds->current_entry].work_items); 461 next_entry = ds_next(ds->current_entry); 462 if (!ds->entries[next_entry].count) 463 ds->current_entry = next_entry; 464 } 465 spin_unlock_irqrestore(&ds->lock, flags); 466 467 return r; 468 } 469 470 /*----------------------------------------------------------------*/ 471 472 /* 473 * Key building. 474 */ 475 static void build_data_key(struct dm_thin_device *td, 476 dm_block_t b, struct cell_key *key) 477 { 478 key->virtual = 0; 479 key->dev = dm_thin_dev_id(td); 480 key->block = b; 481 } 482 483 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 484 struct cell_key *key) 485 { 486 key->virtual = 1; 487 key->dev = dm_thin_dev_id(td); 488 key->block = b; 489 } 490 491 /*----------------------------------------------------------------*/ 492 493 /* 494 * A pool device ties together a metadata device and a data device. It 495 * also provides the interface for creating and destroying internal 496 * devices. 497 */ 498 struct dm_thin_new_mapping; 499 500 /* 501 * The pool runs in 3 modes. Ordered in degraded order for comparisons. 502 */ 503 enum pool_mode { 504 PM_WRITE, /* metadata may be changed */ 505 PM_READ_ONLY, /* metadata may not be changed */ 506 PM_FAIL, /* all I/O fails */ 507 }; 508 509 struct pool_features { 510 enum pool_mode mode; 511 512 bool zero_new_blocks:1; 513 bool discard_enabled:1; 514 bool discard_passdown:1; 515 }; 516 517 struct thin_c; 518 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); 519 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); 520 521 struct pool { 522 struct list_head list; 523 struct dm_target *ti; /* Only set if a pool target is bound */ 524 525 struct mapped_device *pool_md; 526 struct block_device *md_dev; 527 struct dm_pool_metadata *pmd; 528 529 dm_block_t low_water_blocks; 530 uint32_t sectors_per_block; 531 int sectors_per_block_shift; 532 533 struct pool_features pf; 534 unsigned low_water_triggered:1; /* A dm event has been sent */ 535 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 536 537 struct bio_prison *prison; 538 struct dm_kcopyd_client *copier; 539 540 struct workqueue_struct *wq; 541 struct work_struct worker; 542 struct delayed_work waker; 543 544 unsigned long last_commit_jiffies; 545 unsigned ref_count; 546 547 spinlock_t lock; 548 struct bio_list deferred_bios; 549 struct bio_list deferred_flush_bios; 550 struct list_head prepared_mappings; 551 struct list_head prepared_discards; 552 553 struct bio_list retry_on_resume_list; 554 555 struct deferred_set shared_read_ds; 556 struct deferred_set all_io_ds; 557 558 struct dm_thin_new_mapping *next_mapping; 559 mempool_t *mapping_pool; 560 mempool_t *endio_hook_pool; 561 562 process_bio_fn process_bio; 563 process_bio_fn process_discard; 564 565 process_mapping_fn process_prepared_mapping; 566 process_mapping_fn process_prepared_discard; 567 }; 568 569 static enum pool_mode get_pool_mode(struct pool *pool); 570 static void set_pool_mode(struct pool *pool, enum pool_mode mode); 571 572 /* 573 * Target context for a pool. 
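 * One of these exists per pool target in a loaded table; it points at the
 * long-lived struct pool above, which is shared with any thin targets that
 * use the same pool device.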
574 */ 575 struct pool_c { 576 struct dm_target *ti; 577 struct pool *pool; 578 struct dm_dev *data_dev; 579 struct dm_dev *metadata_dev; 580 struct dm_target_callbacks callbacks; 581 582 dm_block_t low_water_blocks; 583 struct pool_features requested_pf; /* Features requested during table load */ 584 struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ 585 }; 586 587 /* 588 * Target context for a thin. 589 */ 590 struct thin_c { 591 struct dm_dev *pool_dev; 592 struct dm_dev *origin_dev; 593 dm_thin_id dev_id; 594 595 struct pool *pool; 596 struct dm_thin_device *td; 597 }; 598 599 /*----------------------------------------------------------------*/ 600 601 /* 602 * A global list of pools that uses a struct mapped_device as a key. 603 */ 604 static struct dm_thin_pool_table { 605 struct mutex mutex; 606 struct list_head pools; 607 } dm_thin_pool_table; 608 609 static void pool_table_init(void) 610 { 611 mutex_init(&dm_thin_pool_table.mutex); 612 INIT_LIST_HEAD(&dm_thin_pool_table.pools); 613 } 614 615 static void __pool_table_insert(struct pool *pool) 616 { 617 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 618 list_add(&pool->list, &dm_thin_pool_table.pools); 619 } 620 621 static void __pool_table_remove(struct pool *pool) 622 { 623 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 624 list_del(&pool->list); 625 } 626 627 static struct pool *__pool_table_lookup(struct mapped_device *md) 628 { 629 struct pool *pool = NULL, *tmp; 630 631 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 632 633 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 634 if (tmp->pool_md == md) { 635 pool = tmp; 636 break; 637 } 638 } 639 640 return pool; 641 } 642 643 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) 644 { 645 struct pool *pool = NULL, *tmp; 646 647 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 648 649 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 650 if (tmp->md_dev == md_dev) { 651 pool = tmp; 652 break; 653 } 654 } 655 656 return pool; 657 } 658 659 /*----------------------------------------------------------------*/ 660 661 struct dm_thin_endio_hook { 662 struct thin_c *tc; 663 struct deferred_entry *shared_read_entry; 664 struct deferred_entry *all_io_entry; 665 struct dm_thin_new_mapping *overwrite_mapping; 666 }; 667 668 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 669 { 670 struct bio *bio; 671 struct bio_list bios; 672 673 bio_list_init(&bios); 674 bio_list_merge(&bios, master); 675 bio_list_init(master); 676 677 while ((bio = bio_list_pop(&bios))) { 678 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 679 680 if (h->tc == tc) 681 bio_endio(bio, DM_ENDIO_REQUEUE); 682 else 683 bio_list_add(master, bio); 684 } 685 } 686 687 static void requeue_io(struct thin_c *tc) 688 { 689 struct pool *pool = tc->pool; 690 unsigned long flags; 691 692 spin_lock_irqsave(&pool->lock, flags); 693 __requeue_bio_list(tc, &pool->deferred_bios); 694 __requeue_bio_list(tc, &pool->retry_on_resume_list); 695 spin_unlock_irqrestore(&pool->lock, flags); 696 } 697 698 /* 699 * This section of code contains the logic for processing a thin device's IO. 700 * Much of the code depends on pool object resources (lists, workqueues, etc) 701 * but most is exclusively called from the thin target rather than the thin-pool 702 * target. 
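 *
 * A worked example of the block arithmetic used below (illustrative
 * numbers): with a 64KiB data block size, sectors_per_block is 128 and
 * sectors_per_block_shift is 7, so a bio at sector 1000 falls in virtual
 * block 1000 >> 7 = 7, at offset 1000 & 127 = 104 sectors into that block.
 * Non-power-of-two block sizes use sector_div() instead of the shift/mask.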
703 */ 704 705 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 706 { 707 sector_t block_nr = bio->bi_sector; 708 709 if (tc->pool->sectors_per_block_shift < 0) 710 (void) sector_div(block_nr, tc->pool->sectors_per_block); 711 else 712 block_nr >>= tc->pool->sectors_per_block_shift; 713 714 return block_nr; 715 } 716 717 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 718 { 719 struct pool *pool = tc->pool; 720 sector_t bi_sector = bio->bi_sector; 721 722 bio->bi_bdev = tc->pool_dev->bdev; 723 if (tc->pool->sectors_per_block_shift < 0) 724 bio->bi_sector = (block * pool->sectors_per_block) + 725 sector_div(bi_sector, pool->sectors_per_block); 726 else 727 bio->bi_sector = (block << pool->sectors_per_block_shift) | 728 (bi_sector & (pool->sectors_per_block - 1)); 729 } 730 731 static void remap_to_origin(struct thin_c *tc, struct bio *bio) 732 { 733 bio->bi_bdev = tc->origin_dev->bdev; 734 } 735 736 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) 737 { 738 return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && 739 dm_thin_changed_this_transaction(tc->td); 740 } 741 742 static void issue(struct thin_c *tc, struct bio *bio) 743 { 744 struct pool *pool = tc->pool; 745 unsigned long flags; 746 747 if (!bio_triggers_commit(tc, bio)) { 748 generic_make_request(bio); 749 return; 750 } 751 752 /* 753 * Complete bio with an error if earlier I/O caused changes to 754 * the metadata that can't be committed e.g, due to I/O errors 755 * on the metadata device. 756 */ 757 if (dm_thin_aborted_changes(tc->td)) { 758 bio_io_error(bio); 759 return; 760 } 761 762 /* 763 * Batch together any bios that trigger commits and then issue a 764 * single commit for them in process_deferred_bios(). 765 */ 766 spin_lock_irqsave(&pool->lock, flags); 767 bio_list_add(&pool->deferred_flush_bios, bio); 768 spin_unlock_irqrestore(&pool->lock, flags); 769 } 770 771 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 772 { 773 remap_to_origin(tc, bio); 774 issue(tc, bio); 775 } 776 777 static void remap_and_issue(struct thin_c *tc, struct bio *bio, 778 dm_block_t block) 779 { 780 remap(tc, bio, block); 781 issue(tc, bio); 782 } 783 784 /* 785 * wake_worker() is used when new work is queued and when pool_resume is 786 * ready to continue deferred IO processing. 787 */ 788 static void wake_worker(struct pool *pool) 789 { 790 queue_work(pool->wq, &pool->worker); 791 } 792 793 /*----------------------------------------------------------------*/ 794 795 /* 796 * Bio endio functions. 797 */ 798 struct dm_thin_new_mapping { 799 struct list_head list; 800 801 unsigned quiesced:1; 802 unsigned prepared:1; 803 unsigned pass_discard:1; 804 805 struct thin_c *tc; 806 dm_block_t virt_block; 807 dm_block_t data_block; 808 struct dm_bio_prison_cell *cell, *cell2; 809 int err; 810 811 /* 812 * If the bio covers the whole area of a block then we can avoid 813 * zeroing or copying. Instead this bio is hooked. The bio will 814 * still be in the cell, so care has to be taken to avoid issuing 815 * the bio twice. 
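	 * (The hook works by saving bi_end_io in saved_bi_end_io and
	 * pointing it at overwrite_endio(), which marks the mapping
	 * prepared; the original end_io is restored in
	 * process_prepared_mapping() before the bio is completed.  Both
	 * quiesced and prepared must be set before __maybe_add_mapping()
	 * will queue the mapping for the worker.)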
816 */ 817 struct bio *bio; 818 bio_end_io_t *saved_bi_end_io; 819 }; 820 821 static void __maybe_add_mapping(struct dm_thin_new_mapping *m) 822 { 823 struct pool *pool = m->tc->pool; 824 825 if (m->quiesced && m->prepared) { 826 list_add(&m->list, &pool->prepared_mappings); 827 wake_worker(pool); 828 } 829 } 830 831 static void copy_complete(int read_err, unsigned long write_err, void *context) 832 { 833 unsigned long flags; 834 struct dm_thin_new_mapping *m = context; 835 struct pool *pool = m->tc->pool; 836 837 m->err = read_err || write_err ? -EIO : 0; 838 839 spin_lock_irqsave(&pool->lock, flags); 840 m->prepared = 1; 841 __maybe_add_mapping(m); 842 spin_unlock_irqrestore(&pool->lock, flags); 843 } 844 845 static void overwrite_endio(struct bio *bio, int err) 846 { 847 unsigned long flags; 848 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 849 struct dm_thin_new_mapping *m = h->overwrite_mapping; 850 struct pool *pool = m->tc->pool; 851 852 m->err = err; 853 854 spin_lock_irqsave(&pool->lock, flags); 855 m->prepared = 1; 856 __maybe_add_mapping(m); 857 spin_unlock_irqrestore(&pool->lock, flags); 858 } 859 860 /*----------------------------------------------------------------*/ 861 862 /* 863 * Workqueue. 864 */ 865 866 /* 867 * Prepared mapping jobs. 868 */ 869 870 /* 871 * This sends the bios in the cell back to the deferred_bios list. 872 */ 873 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell, 874 dm_block_t data_block) 875 { 876 struct pool *pool = tc->pool; 877 unsigned long flags; 878 879 spin_lock_irqsave(&pool->lock, flags); 880 cell_release(cell, &pool->deferred_bios); 881 spin_unlock_irqrestore(&tc->pool->lock, flags); 882 883 wake_worker(pool); 884 } 885 886 /* 887 * Same as cell_defer above, except it omits one particular detainee, 888 * a write bio that covers the block and has already been processed. 889 */ 890 static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell) 891 { 892 struct bio_list bios; 893 struct pool *pool = tc->pool; 894 unsigned long flags; 895 896 bio_list_init(&bios); 897 898 spin_lock_irqsave(&pool->lock, flags); 899 cell_release_no_holder(cell, &pool->deferred_bios); 900 spin_unlock_irqrestore(&pool->lock, flags); 901 902 wake_worker(pool); 903 } 904 905 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) 906 { 907 if (m->bio) 908 m->bio->bi_end_io = m->saved_bi_end_io; 909 cell_error(m->cell); 910 list_del(&m->list); 911 mempool_free(m, m->tc->pool->mapping_pool); 912 } 913 static void process_prepared_mapping(struct dm_thin_new_mapping *m) 914 { 915 struct thin_c *tc = m->tc; 916 struct bio *bio; 917 int r; 918 919 bio = m->bio; 920 if (bio) 921 bio->bi_end_io = m->saved_bi_end_io; 922 923 if (m->err) { 924 cell_error(m->cell); 925 goto out; 926 } 927 928 /* 929 * Commit the prepared block into the mapping btree. 930 * Any I/O for this block arriving after this point will get 931 * remapped to it directly. 932 */ 933 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 934 if (r) { 935 DMERR("dm_thin_insert_block() failed"); 936 cell_error(m->cell); 937 goto out; 938 } 939 940 /* 941 * Release any bios held while the block was being provisioned. 942 * If we are processing a write bio that completely covers the block, 943 * we already processed it so can ignore it now when processing 944 * the bios in the cell. 
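	 * Note the new mapping was inserted above before the cell is
	 * released, so every bio released or deferred here will find the
	 * freshly provisioned block when it is looked up again.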
945 */ 946 if (bio) { 947 cell_defer_except(tc, m->cell); 948 bio_endio(bio, 0); 949 } else 950 cell_defer(tc, m->cell, m->data_block); 951 952 out: 953 list_del(&m->list); 954 mempool_free(m, tc->pool->mapping_pool); 955 } 956 957 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) 958 { 959 struct thin_c *tc = m->tc; 960 961 bio_io_error(m->bio); 962 cell_defer_except(tc, m->cell); 963 cell_defer_except(tc, m->cell2); 964 mempool_free(m, tc->pool->mapping_pool); 965 } 966 967 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) 968 { 969 struct thin_c *tc = m->tc; 970 971 if (m->pass_discard) 972 remap_and_issue(tc, m->bio, m->data_block); 973 else 974 bio_endio(m->bio, 0); 975 976 cell_defer_except(tc, m->cell); 977 cell_defer_except(tc, m->cell2); 978 mempool_free(m, tc->pool->mapping_pool); 979 } 980 981 static void process_prepared_discard(struct dm_thin_new_mapping *m) 982 { 983 int r; 984 struct thin_c *tc = m->tc; 985 986 r = dm_thin_remove_block(tc->td, m->virt_block); 987 if (r) 988 DMERR("dm_thin_remove_block() failed"); 989 990 process_prepared_discard_passdown(m); 991 } 992 993 static void process_prepared(struct pool *pool, struct list_head *head, 994 process_mapping_fn *fn) 995 { 996 unsigned long flags; 997 struct list_head maps; 998 struct dm_thin_new_mapping *m, *tmp; 999 1000 INIT_LIST_HEAD(&maps); 1001 spin_lock_irqsave(&pool->lock, flags); 1002 list_splice_init(head, &maps); 1003 spin_unlock_irqrestore(&pool->lock, flags); 1004 1005 list_for_each_entry_safe(m, tmp, &maps, list) 1006 (*fn)(m); 1007 } 1008 1009 /* 1010 * Deferred bio jobs. 1011 */ 1012 static int io_overlaps_block(struct pool *pool, struct bio *bio) 1013 { 1014 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); 1015 } 1016 1017 static int io_overwrites_block(struct pool *pool, struct bio *bio) 1018 { 1019 return (bio_data_dir(bio) == WRITE) && 1020 io_overlaps_block(pool, bio); 1021 } 1022 1023 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 1024 bio_end_io_t *fn) 1025 { 1026 *save = bio->bi_end_io; 1027 bio->bi_end_io = fn; 1028 } 1029 1030 static int ensure_next_mapping(struct pool *pool) 1031 { 1032 if (pool->next_mapping) 1033 return 0; 1034 1035 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); 1036 1037 return pool->next_mapping ? 0 : -ENOMEM; 1038 } 1039 1040 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) 1041 { 1042 struct dm_thin_new_mapping *r = pool->next_mapping; 1043 1044 BUG_ON(!pool->next_mapping); 1045 1046 pool->next_mapping = NULL; 1047 1048 return r; 1049 } 1050 1051 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 1052 struct dm_dev *origin, dm_block_t data_origin, 1053 dm_block_t data_dest, 1054 struct dm_bio_prison_cell *cell, struct bio *bio) 1055 { 1056 int r; 1057 struct pool *pool = tc->pool; 1058 struct dm_thin_new_mapping *m = get_next_mapping(pool); 1059 1060 INIT_LIST_HEAD(&m->list); 1061 m->quiesced = 0; 1062 m->prepared = 0; 1063 m->tc = tc; 1064 m->virt_block = virt_block; 1065 m->data_block = data_dest; 1066 m->cell = cell; 1067 m->err = 0; 1068 m->bio = NULL; 1069 1070 if (!ds_add_work(&pool->shared_read_ds, &m->list)) 1071 m->quiesced = 1; 1072 1073 /* 1074 * IO to pool_dev remaps to the pool target's data_dev. 1075 * 1076 * If the whole block of data is being overwritten, we can issue the 1077 * bio immediately. Otherwise we use kcopyd to clone the data first. 
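	 * Either way the mapping is not inserted into the btree here:
	 * completion (overwrite_endio() or copy_complete()) only marks it
	 * prepared, and the worker later commits it via
	 * process_prepared_mapping().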
1078 */ 1079 if (io_overwrites_block(pool, bio)) { 1080 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1081 1082 h->overwrite_mapping = m; 1083 m->bio = bio; 1084 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1085 remap_and_issue(tc, bio, data_dest); 1086 } else { 1087 struct dm_io_region from, to; 1088 1089 from.bdev = origin->bdev; 1090 from.sector = data_origin * pool->sectors_per_block; 1091 from.count = pool->sectors_per_block; 1092 1093 to.bdev = tc->pool_dev->bdev; 1094 to.sector = data_dest * pool->sectors_per_block; 1095 to.count = pool->sectors_per_block; 1096 1097 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 1098 0, copy_complete, m); 1099 if (r < 0) { 1100 mempool_free(m, pool->mapping_pool); 1101 DMERR("dm_kcopyd_copy() failed"); 1102 cell_error(cell); 1103 } 1104 } 1105 } 1106 1107 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1108 dm_block_t data_origin, dm_block_t data_dest, 1109 struct dm_bio_prison_cell *cell, struct bio *bio) 1110 { 1111 schedule_copy(tc, virt_block, tc->pool_dev, 1112 data_origin, data_dest, cell, bio); 1113 } 1114 1115 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1116 dm_block_t data_dest, 1117 struct dm_bio_prison_cell *cell, struct bio *bio) 1118 { 1119 schedule_copy(tc, virt_block, tc->origin_dev, 1120 virt_block, data_dest, cell, bio); 1121 } 1122 1123 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1124 dm_block_t data_block, struct dm_bio_prison_cell *cell, 1125 struct bio *bio) 1126 { 1127 struct pool *pool = tc->pool; 1128 struct dm_thin_new_mapping *m = get_next_mapping(pool); 1129 1130 INIT_LIST_HEAD(&m->list); 1131 m->quiesced = 1; 1132 m->prepared = 0; 1133 m->tc = tc; 1134 m->virt_block = virt_block; 1135 m->data_block = data_block; 1136 m->cell = cell; 1137 m->err = 0; 1138 m->bio = NULL; 1139 1140 /* 1141 * If the whole block of data is being overwritten or we are not 1142 * zeroing pre-existing data, we can issue the bio immediately. 1143 * Otherwise we use kcopyd to zero the data first. 1144 */ 1145 if (!pool->pf.zero_new_blocks) 1146 process_prepared_mapping(m); 1147 1148 else if (io_overwrites_block(pool, bio)) { 1149 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1150 1151 h->overwrite_mapping = m; 1152 m->bio = bio; 1153 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1154 remap_and_issue(tc, bio, data_block); 1155 } else { 1156 int r; 1157 struct dm_io_region to; 1158 1159 to.bdev = tc->pool_dev->bdev; 1160 to.sector = data_block * pool->sectors_per_block; 1161 to.count = pool->sectors_per_block; 1162 1163 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 1164 if (r < 0) { 1165 mempool_free(m, pool->mapping_pool); 1166 DMERR("dm_kcopyd_zero() failed"); 1167 cell_error(cell); 1168 } 1169 } 1170 } 1171 1172 static int commit(struct pool *pool) 1173 { 1174 int r; 1175 1176 r = dm_pool_commit_metadata(pool->pmd); 1177 if (r) 1178 DMERR("commit failed, error = %d", r); 1179 1180 return r; 1181 } 1182 1183 /* 1184 * A non-zero return indicates read_only or fail_io mode. 1185 * Many callers don't care about the return value. 
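 * If the commit itself fails the pool is demoted to read-only mode, so
 * subsequent calls return early without touching the metadata again.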
1186 */ 1187 static int commit_or_fallback(struct pool *pool) 1188 { 1189 int r; 1190 1191 if (get_pool_mode(pool) != PM_WRITE) 1192 return -EINVAL; 1193 1194 r = commit(pool); 1195 if (r) 1196 set_pool_mode(pool, PM_READ_ONLY); 1197 1198 return r; 1199 } 1200 1201 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1202 { 1203 int r; 1204 dm_block_t free_blocks; 1205 unsigned long flags; 1206 struct pool *pool = tc->pool; 1207 1208 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1209 if (r) 1210 return r; 1211 1212 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1213 DMWARN("%s: reached low water mark, sending event.", 1214 dm_device_name(pool->pool_md)); 1215 spin_lock_irqsave(&pool->lock, flags); 1216 pool->low_water_triggered = 1; 1217 spin_unlock_irqrestore(&pool->lock, flags); 1218 dm_table_event(pool->ti->table); 1219 } 1220 1221 if (!free_blocks) { 1222 if (pool->no_free_space) 1223 return -ENOSPC; 1224 else { 1225 /* 1226 * Try to commit to see if that will free up some 1227 * more space. 1228 */ 1229 (void) commit_or_fallback(pool); 1230 1231 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1232 if (r) 1233 return r; 1234 1235 /* 1236 * If we still have no space we set a flag to avoid 1237 * doing all this checking and return -ENOSPC. 1238 */ 1239 if (!free_blocks) { 1240 DMWARN("%s: no free space available.", 1241 dm_device_name(pool->pool_md)); 1242 spin_lock_irqsave(&pool->lock, flags); 1243 pool->no_free_space = 1; 1244 spin_unlock_irqrestore(&pool->lock, flags); 1245 return -ENOSPC; 1246 } 1247 } 1248 } 1249 1250 r = dm_pool_alloc_data_block(pool->pmd, result); 1251 if (r) 1252 return r; 1253 1254 return 0; 1255 } 1256 1257 /* 1258 * If we have run out of space, queue bios until the device is 1259 * resumed, presumably after having been reloaded with more space. 1260 */ 1261 static void retry_on_resume(struct bio *bio) 1262 { 1263 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1264 struct thin_c *tc = h->tc; 1265 struct pool *pool = tc->pool; 1266 unsigned long flags; 1267 1268 spin_lock_irqsave(&pool->lock, flags); 1269 bio_list_add(&pool->retry_on_resume_list, bio); 1270 spin_unlock_irqrestore(&pool->lock, flags); 1271 } 1272 1273 static void no_space(struct dm_bio_prison_cell *cell) 1274 { 1275 struct bio *bio; 1276 struct bio_list bios; 1277 1278 bio_list_init(&bios); 1279 cell_release(cell, &bios); 1280 1281 while ((bio = bio_list_pop(&bios))) 1282 retry_on_resume(bio); 1283 } 1284 1285 static void process_discard(struct thin_c *tc, struct bio *bio) 1286 { 1287 int r; 1288 unsigned long flags; 1289 struct pool *pool = tc->pool; 1290 struct dm_bio_prison_cell *cell, *cell2; 1291 struct cell_key key, key2; 1292 dm_block_t block = get_bio_block(tc, bio); 1293 struct dm_thin_lookup_result lookup_result; 1294 struct dm_thin_new_mapping *m; 1295 1296 build_virtual_key(tc->td, block, &key); 1297 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1298 return; 1299 1300 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1301 switch (r) { 1302 case 0: 1303 /* 1304 * Check nobody is fiddling with this pool block. This can 1305 * happen if someone's in the process of breaking sharing 1306 * on this block. 1307 */ 1308 build_data_key(tc->td, lookup_result.block, &key2); 1309 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1310 cell_release_singleton(cell, bio); 1311 break; 1312 } 1313 1314 if (io_overlaps_block(pool, bio)) { 1315 /* 1316 * IO may still be going to the destination block. 
We must 1317 * quiesce before we can do the removal. 1318 */ 1319 m = get_next_mapping(pool); 1320 m->tc = tc; 1321 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; 1322 m->virt_block = block; 1323 m->data_block = lookup_result.block; 1324 m->cell = cell; 1325 m->cell2 = cell2; 1326 m->err = 0; 1327 m->bio = bio; 1328 1329 if (!ds_add_work(&pool->all_io_ds, &m->list)) { 1330 spin_lock_irqsave(&pool->lock, flags); 1331 list_add(&m->list, &pool->prepared_discards); 1332 spin_unlock_irqrestore(&pool->lock, flags); 1333 wake_worker(pool); 1334 } 1335 } else { 1336 /* 1337 * The DM core makes sure that the discard doesn't span 1338 * a block boundary. So we submit the discard of a 1339 * partial block appropriately. 1340 */ 1341 cell_release_singleton(cell, bio); 1342 cell_release_singleton(cell2, bio); 1343 if ((!lookup_result.shared) && pool->pf.discard_passdown) 1344 remap_and_issue(tc, bio, lookup_result.block); 1345 else 1346 bio_endio(bio, 0); 1347 } 1348 break; 1349 1350 case -ENODATA: 1351 /* 1352 * It isn't provisioned, just forget it. 1353 */ 1354 cell_release_singleton(cell, bio); 1355 bio_endio(bio, 0); 1356 break; 1357 1358 default: 1359 DMERR("discard: find block unexpectedly returned %d", r); 1360 cell_release_singleton(cell, bio); 1361 bio_io_error(bio); 1362 break; 1363 } 1364 } 1365 1366 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1367 struct cell_key *key, 1368 struct dm_thin_lookup_result *lookup_result, 1369 struct dm_bio_prison_cell *cell) 1370 { 1371 int r; 1372 dm_block_t data_block; 1373 1374 r = alloc_data_block(tc, &data_block); 1375 switch (r) { 1376 case 0: 1377 schedule_internal_copy(tc, block, lookup_result->block, 1378 data_block, cell, bio); 1379 break; 1380 1381 case -ENOSPC: 1382 no_space(cell); 1383 break; 1384 1385 default: 1386 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1387 cell_error(cell); 1388 break; 1389 } 1390 } 1391 1392 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1393 dm_block_t block, 1394 struct dm_thin_lookup_result *lookup_result) 1395 { 1396 struct dm_bio_prison_cell *cell; 1397 struct pool *pool = tc->pool; 1398 struct cell_key key; 1399 1400 /* 1401 * If cell is already occupied, then sharing is already in the process 1402 * of being broken so we have nothing further to do here. 1403 */ 1404 build_data_key(tc->td, lookup_result->block, &key); 1405 if (bio_detain(pool->prison, &key, bio, &cell)) 1406 return; 1407 1408 if (bio_data_dir(bio) == WRITE && bio->bi_size) 1409 break_sharing(tc, bio, block, &key, lookup_result, cell); 1410 else { 1411 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1412 1413 h->shared_read_entry = ds_inc(&pool->shared_read_ds); 1414 1415 cell_release_singleton(cell, bio); 1416 remap_and_issue(tc, bio, lookup_result->block); 1417 } 1418 } 1419 1420 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1421 struct dm_bio_prison_cell *cell) 1422 { 1423 int r; 1424 dm_block_t data_block; 1425 1426 /* 1427 * Remap empty bios (flushes) immediately, without provisioning. 1428 */ 1429 if (!bio->bi_size) { 1430 cell_release_singleton(cell, bio); 1431 remap_and_issue(tc, bio, 0); 1432 return; 1433 } 1434 1435 /* 1436 * Fill read bios with zeroes and complete them immediately. 
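	 * (A read of an unprovisioned block must see zeroes, and we can
	 * provide that without allocating anything.)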
1437 */ 1438 if (bio_data_dir(bio) == READ) { 1439 zero_fill_bio(bio); 1440 cell_release_singleton(cell, bio); 1441 bio_endio(bio, 0); 1442 return; 1443 } 1444 1445 r = alloc_data_block(tc, &data_block); 1446 switch (r) { 1447 case 0: 1448 if (tc->origin_dev) 1449 schedule_external_copy(tc, block, data_block, cell, bio); 1450 else 1451 schedule_zero(tc, block, data_block, cell, bio); 1452 break; 1453 1454 case -ENOSPC: 1455 no_space(cell); 1456 break; 1457 1458 default: 1459 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1460 set_pool_mode(tc->pool, PM_READ_ONLY); 1461 cell_error(cell); 1462 break; 1463 } 1464 } 1465 1466 static void process_bio(struct thin_c *tc, struct bio *bio) 1467 { 1468 int r; 1469 dm_block_t block = get_bio_block(tc, bio); 1470 struct dm_bio_prison_cell *cell; 1471 struct cell_key key; 1472 struct dm_thin_lookup_result lookup_result; 1473 1474 /* 1475 * If cell is already occupied, then the block is already 1476 * being provisioned so we have nothing further to do here. 1477 */ 1478 build_virtual_key(tc->td, block, &key); 1479 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1480 return; 1481 1482 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1483 switch (r) { 1484 case 0: 1485 /* 1486 * We can release this cell now. This thread is the only 1487 * one that puts bios into a cell, and we know there were 1488 * no preceding bios. 1489 */ 1490 /* 1491 * TODO: this will probably have to change when discard goes 1492 * back in. 1493 */ 1494 cell_release_singleton(cell, bio); 1495 1496 if (lookup_result.shared) 1497 process_shared_bio(tc, bio, block, &lookup_result); 1498 else 1499 remap_and_issue(tc, bio, lookup_result.block); 1500 break; 1501 1502 case -ENODATA: 1503 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1504 cell_release_singleton(cell, bio); 1505 remap_to_origin_and_issue(tc, bio); 1506 } else 1507 provision_block(tc, bio, block, cell); 1508 break; 1509 1510 default: 1511 DMERR("dm_thin_find_block() failed, error = %d", r); 1512 cell_release_singleton(cell, bio); 1513 bio_io_error(bio); 1514 break; 1515 } 1516 } 1517 1518 static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1519 { 1520 int r; 1521 int rw = bio_data_dir(bio); 1522 dm_block_t block = get_bio_block(tc, bio); 1523 struct dm_thin_lookup_result lookup_result; 1524 1525 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1526 switch (r) { 1527 case 0: 1528 if (lookup_result.shared && (rw == WRITE) && bio->bi_size) 1529 bio_io_error(bio); 1530 else 1531 remap_and_issue(tc, bio, lookup_result.block); 1532 break; 1533 1534 case -ENODATA: 1535 if (rw != READ) { 1536 bio_io_error(bio); 1537 break; 1538 } 1539 1540 if (tc->origin_dev) { 1541 remap_to_origin_and_issue(tc, bio); 1542 break; 1543 } 1544 1545 zero_fill_bio(bio); 1546 bio_endio(bio, 0); 1547 break; 1548 1549 default: 1550 DMERR("dm_thin_find_block() failed, error = %d", r); 1551 bio_io_error(bio); 1552 break; 1553 } 1554 } 1555 1556 static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1557 { 1558 bio_io_error(bio); 1559 } 1560 1561 static int need_commit_due_to_time(struct pool *pool) 1562 { 1563 return jiffies < pool->last_commit_jiffies || 1564 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1565 } 1566 1567 static void process_deferred_bios(struct pool *pool) 1568 { 1569 unsigned long flags; 1570 struct bio *bio; 1571 struct bio_list bios; 1572 1573 bio_list_init(&bios); 1574 1575 spin_lock_irqsave(&pool->lock, flags); 1576 bio_list_merge(&bios, 
&pool->deferred_bios); 1577 bio_list_init(&pool->deferred_bios); 1578 spin_unlock_irqrestore(&pool->lock, flags); 1579 1580 while ((bio = bio_list_pop(&bios))) { 1581 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1582 struct thin_c *tc = h->tc; 1583 1584 /* 1585 * If we've got no free new_mapping structs, and processing 1586 * this bio might require one, we pause until there are some 1587 * prepared mappings to process. 1588 */ 1589 if (ensure_next_mapping(pool)) { 1590 spin_lock_irqsave(&pool->lock, flags); 1591 bio_list_merge(&pool->deferred_bios, &bios); 1592 spin_unlock_irqrestore(&pool->lock, flags); 1593 1594 break; 1595 } 1596 1597 if (bio->bi_rw & REQ_DISCARD) 1598 pool->process_discard(tc, bio); 1599 else 1600 pool->process_bio(tc, bio); 1601 } 1602 1603 /* 1604 * If there are any deferred flush bios, we must commit 1605 * the metadata before issuing them. 1606 */ 1607 bio_list_init(&bios); 1608 spin_lock_irqsave(&pool->lock, flags); 1609 bio_list_merge(&bios, &pool->deferred_flush_bios); 1610 bio_list_init(&pool->deferred_flush_bios); 1611 spin_unlock_irqrestore(&pool->lock, flags); 1612 1613 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1614 return; 1615 1616 if (commit_or_fallback(pool)) { 1617 while ((bio = bio_list_pop(&bios))) 1618 bio_io_error(bio); 1619 return; 1620 } 1621 pool->last_commit_jiffies = jiffies; 1622 1623 while ((bio = bio_list_pop(&bios))) 1624 generic_make_request(bio); 1625 } 1626 1627 static void do_worker(struct work_struct *ws) 1628 { 1629 struct pool *pool = container_of(ws, struct pool, worker); 1630 1631 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); 1632 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); 1633 process_deferred_bios(pool); 1634 } 1635 1636 /* 1637 * We want to commit periodically so that not too much 1638 * unwritten data builds up. 
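 * The waker fires every COMMIT_PERIOD jiffies and just kicks the worker;
 * process_deferred_bios() then commits if need_commit_due_to_time()
 * reports that the last commit is more than COMMIT_PERIOD old.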
1639 */ 1640 static void do_waker(struct work_struct *ws) 1641 { 1642 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 1643 wake_worker(pool); 1644 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 1645 } 1646 1647 /*----------------------------------------------------------------*/ 1648 1649 static enum pool_mode get_pool_mode(struct pool *pool) 1650 { 1651 return pool->pf.mode; 1652 } 1653 1654 static void set_pool_mode(struct pool *pool, enum pool_mode mode) 1655 { 1656 int r; 1657 1658 pool->pf.mode = mode; 1659 1660 switch (mode) { 1661 case PM_FAIL: 1662 DMERR("switching pool to failure mode"); 1663 pool->process_bio = process_bio_fail; 1664 pool->process_discard = process_bio_fail; 1665 pool->process_prepared_mapping = process_prepared_mapping_fail; 1666 pool->process_prepared_discard = process_prepared_discard_fail; 1667 break; 1668 1669 case PM_READ_ONLY: 1670 DMERR("switching pool to read-only mode"); 1671 r = dm_pool_abort_metadata(pool->pmd); 1672 if (r) { 1673 DMERR("aborting transaction failed"); 1674 set_pool_mode(pool, PM_FAIL); 1675 } else { 1676 dm_pool_metadata_read_only(pool->pmd); 1677 pool->process_bio = process_bio_read_only; 1678 pool->process_discard = process_discard; 1679 pool->process_prepared_mapping = process_prepared_mapping_fail; 1680 pool->process_prepared_discard = process_prepared_discard_passdown; 1681 } 1682 break; 1683 1684 case PM_WRITE: 1685 pool->process_bio = process_bio; 1686 pool->process_discard = process_discard; 1687 pool->process_prepared_mapping = process_prepared_mapping; 1688 pool->process_prepared_discard = process_prepared_discard; 1689 break; 1690 } 1691 } 1692 1693 /*----------------------------------------------------------------*/ 1694 1695 /* 1696 * Mapping functions. 1697 */ 1698 1699 /* 1700 * Called only while mapping a thin bio to hand it over to the workqueue. 1701 */ 1702 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 1703 { 1704 unsigned long flags; 1705 struct pool *pool = tc->pool; 1706 1707 spin_lock_irqsave(&pool->lock, flags); 1708 bio_list_add(&pool->deferred_bios, bio); 1709 spin_unlock_irqrestore(&pool->lock, flags); 1710 1711 wake_worker(pool); 1712 } 1713 1714 static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) 1715 { 1716 struct pool *pool = tc->pool; 1717 struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); 1718 1719 h->tc = tc; 1720 h->shared_read_entry = NULL; 1721 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); 1722 h->overwrite_mapping = NULL; 1723 1724 return h; 1725 } 1726 1727 /* 1728 * Non-blocking function called from the thin target's map function. 1729 */ 1730 static int thin_bio_map(struct dm_target *ti, struct bio *bio, 1731 union map_info *map_context) 1732 { 1733 int r; 1734 struct thin_c *tc = ti->private; 1735 dm_block_t block = get_bio_block(tc, bio); 1736 struct dm_thin_device *td = tc->td; 1737 struct dm_thin_lookup_result result; 1738 1739 map_context->ptr = thin_hook_bio(tc, bio); 1740 1741 if (get_pool_mode(tc->pool) == PM_FAIL) { 1742 bio_io_error(bio); 1743 return DM_MAPIO_SUBMITTED; 1744 } 1745 1746 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1747 thin_defer_bio(tc, bio); 1748 return DM_MAPIO_SUBMITTED; 1749 } 1750 1751 r = dm_thin_find_block(td, block, 0, &result); 1752 1753 /* 1754 * Note that we defer readahead too. 
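	 * The lookup above passes 0 as its 'can block' argument because
	 * this map function must not sleep; if the metadata isn't in core
	 * the lookup returns -EWOULDBLOCK and the bio is deferred to the
	 * worker, which repeats it with blocking allowed.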
1755 */ 1756 switch (r) { 1757 case 0: 1758 if (unlikely(result.shared)) { 1759 /* 1760 * We have a race condition here between the 1761 * result.shared value returned by the lookup and 1762 * snapshot creation, which may cause new 1763 * sharing. 1764 * 1765 * To avoid this always quiesce the origin before 1766 * taking the snap. You want to do this anyway to 1767 * ensure a consistent application view 1768 * (i.e. lockfs). 1769 * 1770 * More distant ancestors are irrelevant. The 1771 * shared flag will be set in their case. 1772 */ 1773 thin_defer_bio(tc, bio); 1774 r = DM_MAPIO_SUBMITTED; 1775 } else { 1776 remap(tc, bio, result.block); 1777 r = DM_MAPIO_REMAPPED; 1778 } 1779 break; 1780 1781 case -ENODATA: 1782 if (get_pool_mode(tc->pool) == PM_READ_ONLY) { 1783 /* 1784 * This block isn't provisioned, and we have no way 1785 * of doing so. Just error it. 1786 */ 1787 bio_io_error(bio); 1788 r = DM_MAPIO_SUBMITTED; 1789 break; 1790 } 1791 /* fall through */ 1792 1793 case -EWOULDBLOCK: 1794 /* 1795 * In future, the failed dm_thin_find_block above could 1796 * provide the hint to load the metadata into cache. 1797 */ 1798 thin_defer_bio(tc, bio); 1799 r = DM_MAPIO_SUBMITTED; 1800 break; 1801 1802 default: 1803 /* 1804 * Must always call bio_io_error on failure. 1805 * dm_thin_find_block can fail with -EINVAL if the 1806 * pool is switched to fail-io mode. 1807 */ 1808 bio_io_error(bio); 1809 r = DM_MAPIO_SUBMITTED; 1810 break; 1811 } 1812 1813 return r; 1814 } 1815 1816 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1817 { 1818 int r; 1819 unsigned long flags; 1820 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1821 1822 spin_lock_irqsave(&pt->pool->lock, flags); 1823 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1824 spin_unlock_irqrestore(&pt->pool->lock, flags); 1825 1826 if (!r) { 1827 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1828 r = bdi_congested(&q->backing_dev_info, bdi_bits); 1829 } 1830 1831 return r; 1832 } 1833 1834 static void __requeue_bios(struct pool *pool) 1835 { 1836 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1837 bio_list_init(&pool->retry_on_resume_list); 1838 } 1839 1840 /*---------------------------------------------------------------- 1841 * Binding of control targets to a pool object 1842 *--------------------------------------------------------------*/ 1843 static bool data_dev_supports_discard(struct pool_c *pt) 1844 { 1845 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1846 1847 return q && blk_queue_discard(q); 1848 } 1849 1850 /* 1851 * If discard_passdown was enabled verify that the data device 1852 * supports discards. Disable discard_passdown if not. 
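 * Only adjusted_pf is modified; requested_pf keeps the feature flags
 * exactly as they were given on the table line.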
1853 */ 1854 static void disable_passdown_if_not_supported(struct pool_c *pt) 1855 { 1856 struct pool *pool = pt->pool; 1857 struct block_device *data_bdev = pt->data_dev->bdev; 1858 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; 1859 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT; 1860 const char *reason = NULL; 1861 char buf[BDEVNAME_SIZE]; 1862 1863 if (!pt->adjusted_pf.discard_passdown) 1864 return; 1865 1866 if (!data_dev_supports_discard(pt)) 1867 reason = "discard unsupported"; 1868 1869 else if (data_limits->max_discard_sectors < pool->sectors_per_block) 1870 reason = "max discard sectors smaller than a block"; 1871 1872 else if (data_limits->discard_granularity > block_size) 1873 reason = "discard granularity larger than a block"; 1874 1875 else if (block_size & (data_limits->discard_granularity - 1)) 1876 reason = "discard granularity not a factor of block size"; 1877 1878 if (reason) { 1879 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason); 1880 pt->adjusted_pf.discard_passdown = false; 1881 } 1882 } 1883 1884 static int bind_control_target(struct pool *pool, struct dm_target *ti) 1885 { 1886 struct pool_c *pt = ti->private; 1887 1888 /* 1889 * We want to make sure that degraded pools are never upgraded. 1890 */ 1891 enum pool_mode old_mode = pool->pf.mode; 1892 enum pool_mode new_mode = pt->adjusted_pf.mode; 1893 1894 if (old_mode > new_mode) 1895 new_mode = old_mode; 1896 1897 pool->ti = ti; 1898 pool->low_water_blocks = pt->low_water_blocks; 1899 pool->pf = pt->adjusted_pf; 1900 1901 set_pool_mode(pool, new_mode); 1902 1903 return 0; 1904 } 1905 1906 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 1907 { 1908 if (pool->ti == ti) 1909 pool->ti = NULL; 1910 } 1911 1912 /*---------------------------------------------------------------- 1913 * Pool creation 1914 *--------------------------------------------------------------*/ 1915 /* Initialize pool features. */ 1916 static void pool_features_init(struct pool_features *pf) 1917 { 1918 pf->mode = PM_WRITE; 1919 pf->zero_new_blocks = true; 1920 pf->discard_enabled = true; 1921 pf->discard_passdown = true; 1922 } 1923 1924 static void __pool_destroy(struct pool *pool) 1925 { 1926 __pool_table_remove(pool); 1927 1928 if (dm_pool_metadata_close(pool->pmd) < 0) 1929 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1930 1931 prison_destroy(pool->prison); 1932 dm_kcopyd_client_destroy(pool->copier); 1933 1934 if (pool->wq) 1935 destroy_workqueue(pool->wq); 1936 1937 if (pool->next_mapping) 1938 mempool_free(pool->next_mapping, pool->mapping_pool); 1939 mempool_destroy(pool->mapping_pool); 1940 mempool_destroy(pool->endio_hook_pool); 1941 kfree(pool); 1942 } 1943 1944 static struct kmem_cache *_new_mapping_cache; 1945 static struct kmem_cache *_endio_hook_cache; 1946 1947 static struct pool *pool_create(struct mapped_device *pool_md, 1948 struct block_device *metadata_dev, 1949 unsigned long block_size, 1950 int read_only, char **error) 1951 { 1952 int r; 1953 void *err_p; 1954 struct pool *pool; 1955 struct dm_pool_metadata *pmd; 1956 bool format_device = read_only ? 
false : true; 1957 1958 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); 1959 if (IS_ERR(pmd)) { 1960 *error = "Error creating metadata object"; 1961 return (struct pool *)pmd; 1962 } 1963 1964 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 1965 if (!pool) { 1966 *error = "Error allocating memory for pool"; 1967 err_p = ERR_PTR(-ENOMEM); 1968 goto bad_pool; 1969 } 1970 1971 pool->pmd = pmd; 1972 pool->sectors_per_block = block_size; 1973 if (block_size & (block_size - 1)) 1974 pool->sectors_per_block_shift = -1; 1975 else 1976 pool->sectors_per_block_shift = __ffs(block_size); 1977 pool->low_water_blocks = 0; 1978 pool_features_init(&pool->pf); 1979 pool->prison = prison_create(PRISON_CELLS); 1980 if (!pool->prison) { 1981 *error = "Error creating pool's bio prison"; 1982 err_p = ERR_PTR(-ENOMEM); 1983 goto bad_prison; 1984 } 1985 1986 pool->copier = dm_kcopyd_client_create(); 1987 if (IS_ERR(pool->copier)) { 1988 r = PTR_ERR(pool->copier); 1989 *error = "Error creating pool's kcopyd client"; 1990 err_p = ERR_PTR(r); 1991 goto bad_kcopyd_client; 1992 } 1993 1994 /* 1995 * Create singlethreaded workqueue that will service all devices 1996 * that use this metadata. 1997 */ 1998 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 1999 if (!pool->wq) { 2000 *error = "Error creating pool's workqueue"; 2001 err_p = ERR_PTR(-ENOMEM); 2002 goto bad_wq; 2003 } 2004 2005 INIT_WORK(&pool->worker, do_worker); 2006 INIT_DELAYED_WORK(&pool->waker, do_waker); 2007 spin_lock_init(&pool->lock); 2008 bio_list_init(&pool->deferred_bios); 2009 bio_list_init(&pool->deferred_flush_bios); 2010 INIT_LIST_HEAD(&pool->prepared_mappings); 2011 INIT_LIST_HEAD(&pool->prepared_discards); 2012 pool->low_water_triggered = 0; 2013 pool->no_free_space = 0; 2014 bio_list_init(&pool->retry_on_resume_list); 2015 ds_init(&pool->shared_read_ds); 2016 ds_init(&pool->all_io_ds); 2017 2018 pool->next_mapping = NULL; 2019 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, 2020 _new_mapping_cache); 2021 if (!pool->mapping_pool) { 2022 *error = "Error creating pool's mapping mempool"; 2023 err_p = ERR_PTR(-ENOMEM); 2024 goto bad_mapping_pool; 2025 } 2026 2027 pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE, 2028 _endio_hook_cache); 2029 if (!pool->endio_hook_pool) { 2030 *error = "Error creating pool's endio_hook mempool"; 2031 err_p = ERR_PTR(-ENOMEM); 2032 goto bad_endio_hook_pool; 2033 } 2034 pool->ref_count = 1; 2035 pool->last_commit_jiffies = jiffies; 2036 pool->pool_md = pool_md; 2037 pool->md_dev = metadata_dev; 2038 __pool_table_insert(pool); 2039 2040 return pool; 2041 2042 bad_endio_hook_pool: 2043 mempool_destroy(pool->mapping_pool); 2044 bad_mapping_pool: 2045 destroy_workqueue(pool->wq); 2046 bad_wq: 2047 dm_kcopyd_client_destroy(pool->copier); 2048 bad_kcopyd_client: 2049 prison_destroy(pool->prison); 2050 bad_prison: 2051 kfree(pool); 2052 bad_pool: 2053 if (dm_pool_metadata_close(pmd)) 2054 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 2055 2056 return err_p; 2057 } 2058 2059 static void __pool_inc(struct pool *pool) 2060 { 2061 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2062 pool->ref_count++; 2063 } 2064 2065 static void __pool_dec(struct pool *pool) 2066 { 2067 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2068 BUG_ON(!pool->ref_count); 2069 if (!--pool->ref_count) 2070 __pool_destroy(pool); 2071 } 2072 2073 static struct pool *__pool_find(struct mapped_device *pool_md, 2074 struct block_device *metadata_dev, 
				unsigned long block_size, int read_only,
				char **error, int *created)
{
	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);

	if (pool) {
		if (pool->pool_md != pool_md) {
			*error = "metadata device already in use by a pool";
			return ERR_PTR(-EBUSY);
		}
		__pool_inc(pool);

	} else {
		pool = __pool_table_lookup(pool_md);
		if (pool) {
			if (pool->md_dev != metadata_dev) {
				*error = "different pool cannot replace a pool";
				return ERR_PTR(-EINVAL);
			}
			__pool_inc(pool);

		} else {
			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
			*created = 1;
		}
	}

	return pool;
}

/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
static void pool_dtr(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	unbind_control_target(pt->pool, ti);
	__pool_dec(pt->pool);
	dm_put_device(ti, pt->metadata_dev);
	dm_put_device(ti, pt->data_dev);
	kfree(pt);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
			       struct dm_target *ti)
{
	int r;
	unsigned argc;
	const char *arg_name;

	/* All four feature arguments may be supplied together. */
	static struct dm_arg _args[] = {
		{0, 4, "Invalid number of pool feature arguments"},
	};

	/*
	 * No feature arguments supplied.
	 */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	while (argc && !r) {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "skip_block_zeroing"))
			pf->zero_new_blocks = false;

		else if (!strcasecmp(arg_name, "ignore_discard"))
			pf->discard_enabled = false;

		else if (!strcasecmp(arg_name, "no_discard_passdown"))
			pf->discard_passdown = false;

		else if (!strcasecmp(arg_name, "read_only"))
			pf->mode = PM_READ_ONLY;

		else {
			ti->error = "Unrecognised pool feature requested";
			r = -EINVAL;
			break;
		}
	}

	return r;
}

/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (blocks)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 *	     ignore_discard: disable discard
 *	     no_discard_passdown: don't pass discards down to the data device
 *	     read_only: don't allow any changes to be made to the pool metadata
 */
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r, pool_created = 0;
	struct pool_c *pt;
	struct pool *pool;
	struct pool_features pf;
	struct dm_arg_set as;
	struct dm_dev *data_dev;
	unsigned long block_size;
	dm_block_t low_water_blocks;
	struct dm_dev *metadata_dev;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	/*
	 * FIXME Remove validation from scope of lock.
2197 */ 2198 mutex_lock(&dm_thin_pool_table.mutex); 2199 2200 if (argc < 4) { 2201 ti->error = "Invalid argument count"; 2202 r = -EINVAL; 2203 goto out_unlock; 2204 } 2205 as.argc = argc; 2206 as.argv = argv; 2207 2208 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); 2209 if (r) { 2210 ti->error = "Error opening metadata block device"; 2211 goto out_unlock; 2212 } 2213 2214 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 2215 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 2216 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2217 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2218 2219 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 2220 if (r) { 2221 ti->error = "Error getting data device"; 2222 goto out_metadata; 2223 } 2224 2225 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 2226 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2227 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2228 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2229 ti->error = "Invalid block size"; 2230 r = -EINVAL; 2231 goto out; 2232 } 2233 2234 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 2235 ti->error = "Invalid low water mark"; 2236 r = -EINVAL; 2237 goto out; 2238 } 2239 2240 /* 2241 * Set default pool features. 2242 */ 2243 pool_features_init(&pf); 2244 2245 dm_consume_args(&as, 4); 2246 r = parse_pool_features(&as, &pf, ti); 2247 if (r) 2248 goto out; 2249 2250 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 2251 if (!pt) { 2252 r = -ENOMEM; 2253 goto out; 2254 } 2255 2256 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 2257 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); 2258 if (IS_ERR(pool)) { 2259 r = PTR_ERR(pool); 2260 goto out_free_pt; 2261 } 2262 2263 /* 2264 * 'pool_created' reflects whether this is the first table load. 2265 * Top level discard support is not allowed to be changed after 2266 * initial load. This would require a pool reload to trigger thin 2267 * device changes. 2268 */ 2269 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 2270 ti->error = "Discard support cannot be disabled once enabled"; 2271 r = -EINVAL; 2272 goto out_flags_changed; 2273 } 2274 2275 /* 2276 * The block layer requires discard_granularity to be a power of 2. 2277 */ 2278 if (pf.discard_enabled && !is_power_of_2(block_size)) { 2279 ti->error = "Discard support must be disabled when the block size is not a power of 2"; 2280 r = -EINVAL; 2281 goto out_flags_changed; 2282 } 2283 2284 pt->pool = pool; 2285 pt->ti = ti; 2286 pt->metadata_dev = metadata_dev; 2287 pt->data_dev = data_dev; 2288 pt->low_water_blocks = low_water_blocks; 2289 pt->adjusted_pf = pt->requested_pf = pf; 2290 ti->num_flush_requests = 1; 2291 2292 /* 2293 * Only need to enable discards if the pool should pass 2294 * them down to the data device. The thin device's discard 2295 * processing will cause mappings to be removed from the btree. 2296 */ 2297 if (pf.discard_enabled && pf.discard_passdown) { 2298 ti->num_discard_requests = 1; 2299 2300 /* 2301 * Setting 'discards_supported' circumvents the normal 2302 * stacking of discard limits (this keeps the pool and 2303 * thin devices' discard limits consistent). 
2304 */ 2305 ti->discards_supported = true; 2306 ti->discard_zeroes_data_unsupported = true; 2307 } 2308 ti->private = pt; 2309 2310 pt->callbacks.congested_fn = pool_is_congested; 2311 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2312 2313 mutex_unlock(&dm_thin_pool_table.mutex); 2314 2315 return 0; 2316 2317 out_flags_changed: 2318 __pool_dec(pool); 2319 out_free_pt: 2320 kfree(pt); 2321 out: 2322 dm_put_device(ti, data_dev); 2323 out_metadata: 2324 dm_put_device(ti, metadata_dev); 2325 out_unlock: 2326 mutex_unlock(&dm_thin_pool_table.mutex); 2327 2328 return r; 2329 } 2330 2331 static int pool_map(struct dm_target *ti, struct bio *bio, 2332 union map_info *map_context) 2333 { 2334 int r; 2335 struct pool_c *pt = ti->private; 2336 struct pool *pool = pt->pool; 2337 unsigned long flags; 2338 2339 /* 2340 * As this is a singleton target, ti->begin is always zero. 2341 */ 2342 spin_lock_irqsave(&pool->lock, flags); 2343 bio->bi_bdev = pt->data_dev->bdev; 2344 r = DM_MAPIO_REMAPPED; 2345 spin_unlock_irqrestore(&pool->lock, flags); 2346 2347 return r; 2348 } 2349 2350 /* 2351 * Retrieves the number of blocks of the data device from 2352 * the superblock and compares it to the actual device size, 2353 * thus resizing the data device in case it has grown. 2354 * 2355 * This both copes with opening preallocated data devices in the ctr 2356 * being followed by a resume 2357 * -and- 2358 * calling the resume method individually after userspace has 2359 * grown the data device in reaction to a table event. 2360 */ 2361 static int pool_preresume(struct dm_target *ti) 2362 { 2363 int r; 2364 struct pool_c *pt = ti->private; 2365 struct pool *pool = pt->pool; 2366 sector_t data_size = ti->len; 2367 dm_block_t sb_data_size; 2368 2369 /* 2370 * Take control of the pool object. 
2371 */ 2372 r = bind_control_target(pool, ti); 2373 if (r) 2374 return r; 2375 2376 (void) sector_div(data_size, pool->sectors_per_block); 2377 2378 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2379 if (r) { 2380 DMERR("failed to retrieve data device size"); 2381 return r; 2382 } 2383 2384 if (data_size < sb_data_size) { 2385 DMERR("pool target too small, is %llu blocks (expected %llu)", 2386 (unsigned long long)data_size, sb_data_size); 2387 return -EINVAL; 2388 2389 } else if (data_size > sb_data_size) { 2390 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2391 if (r) { 2392 DMERR("failed to resize data device"); 2393 /* FIXME Stricter than necessary: Rollback transaction instead here */ 2394 set_pool_mode(pool, PM_READ_ONLY); 2395 return r; 2396 } 2397 2398 (void) commit_or_fallback(pool); 2399 } 2400 2401 return 0; 2402 } 2403 2404 static void pool_resume(struct dm_target *ti) 2405 { 2406 struct pool_c *pt = ti->private; 2407 struct pool *pool = pt->pool; 2408 unsigned long flags; 2409 2410 spin_lock_irqsave(&pool->lock, flags); 2411 pool->low_water_triggered = 0; 2412 pool->no_free_space = 0; 2413 __requeue_bios(pool); 2414 spin_unlock_irqrestore(&pool->lock, flags); 2415 2416 do_waker(&pool->waker.work); 2417 } 2418 2419 static void pool_postsuspend(struct dm_target *ti) 2420 { 2421 struct pool_c *pt = ti->private; 2422 struct pool *pool = pt->pool; 2423 2424 cancel_delayed_work(&pool->waker); 2425 flush_workqueue(pool->wq); 2426 (void) commit_or_fallback(pool); 2427 } 2428 2429 static int check_arg_count(unsigned argc, unsigned args_required) 2430 { 2431 if (argc != args_required) { 2432 DMWARN("Message received with %u arguments instead of %u.", 2433 argc, args_required); 2434 return -EINVAL; 2435 } 2436 2437 return 0; 2438 } 2439 2440 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) 2441 { 2442 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && 2443 *dev_id <= MAX_DEV_ID) 2444 return 0; 2445 2446 if (warning) 2447 DMWARN("Message received with invalid device id: %s", arg); 2448 2449 return -EINVAL; 2450 } 2451 2452 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) 2453 { 2454 dm_thin_id dev_id; 2455 int r; 2456 2457 r = check_arg_count(argc, 2); 2458 if (r) 2459 return r; 2460 2461 r = read_dev_id(argv[1], &dev_id, 1); 2462 if (r) 2463 return r; 2464 2465 r = dm_pool_create_thin(pool->pmd, dev_id); 2466 if (r) { 2467 DMWARN("Creation of new thinly-provisioned device with id %s failed.", 2468 argv[1]); 2469 return r; 2470 } 2471 2472 return 0; 2473 } 2474 2475 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2476 { 2477 dm_thin_id dev_id; 2478 dm_thin_id origin_dev_id; 2479 int r; 2480 2481 r = check_arg_count(argc, 3); 2482 if (r) 2483 return r; 2484 2485 r = read_dev_id(argv[1], &dev_id, 1); 2486 if (r) 2487 return r; 2488 2489 r = read_dev_id(argv[2], &origin_dev_id, 1); 2490 if (r) 2491 return r; 2492 2493 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); 2494 if (r) { 2495 DMWARN("Creation of new snapshot %s of device %s failed.", 2496 argv[1], argv[2]); 2497 return r; 2498 } 2499 2500 return 0; 2501 } 2502 2503 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) 2504 { 2505 dm_thin_id dev_id; 2506 int r; 2507 2508 r = check_arg_count(argc, 2); 2509 if (r) 2510 return r; 2511 2512 r = read_dev_id(argv[1], &dev_id, 1); 2513 if (r) 2514 return r; 2515 2516 r = dm_pool_delete_thin_device(pool->pmd, dev_id); 2517 if (r) 2518 DMWARN("Deletion 
of thin device %s failed.", argv[1]); 2519 2520 return r; 2521 } 2522 2523 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) 2524 { 2525 dm_thin_id old_id, new_id; 2526 int r; 2527 2528 r = check_arg_count(argc, 3); 2529 if (r) 2530 return r; 2531 2532 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { 2533 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); 2534 return -EINVAL; 2535 } 2536 2537 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { 2538 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); 2539 return -EINVAL; 2540 } 2541 2542 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); 2543 if (r) { 2544 DMWARN("Failed to change transaction id from %s to %s.", 2545 argv[1], argv[2]); 2546 return r; 2547 } 2548 2549 return 0; 2550 } 2551 2552 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2553 { 2554 int r; 2555 2556 r = check_arg_count(argc, 1); 2557 if (r) 2558 return r; 2559 2560 (void) commit_or_fallback(pool); 2561 2562 r = dm_pool_reserve_metadata_snap(pool->pmd); 2563 if (r) 2564 DMWARN("reserve_metadata_snap message failed."); 2565 2566 return r; 2567 } 2568 2569 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2570 { 2571 int r; 2572 2573 r = check_arg_count(argc, 1); 2574 if (r) 2575 return r; 2576 2577 r = dm_pool_release_metadata_snap(pool->pmd); 2578 if (r) 2579 DMWARN("release_metadata_snap message failed."); 2580 2581 return r; 2582 } 2583 2584 /* 2585 * Messages supported: 2586 * create_thin <dev_id> 2587 * create_snap <dev_id> <origin_id> 2588 * delete <dev_id> 2589 * trim <dev_id> <new_size_in_sectors> 2590 * set_transaction_id <current_trans_id> <new_trans_id> 2591 * reserve_metadata_snap 2592 * release_metadata_snap 2593 */ 2594 static int pool_message(struct dm_target *ti, unsigned argc, char **argv) 2595 { 2596 int r = -EINVAL; 2597 struct pool_c *pt = ti->private; 2598 struct pool *pool = pt->pool; 2599 2600 if (!strcasecmp(argv[0], "create_thin")) 2601 r = process_create_thin_mesg(argc, argv, pool); 2602 2603 else if (!strcasecmp(argv[0], "create_snap")) 2604 r = process_create_snap_mesg(argc, argv, pool); 2605 2606 else if (!strcasecmp(argv[0], "delete")) 2607 r = process_delete_mesg(argc, argv, pool); 2608 2609 else if (!strcasecmp(argv[0], "set_transaction_id")) 2610 r = process_set_transaction_id_mesg(argc, argv, pool); 2611 2612 else if (!strcasecmp(argv[0], "reserve_metadata_snap")) 2613 r = process_reserve_metadata_snap_mesg(argc, argv, pool); 2614 2615 else if (!strcasecmp(argv[0], "release_metadata_snap")) 2616 r = process_release_metadata_snap_mesg(argc, argv, pool); 2617 2618 else 2619 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2620 2621 if (!r) 2622 (void) commit_or_fallback(pool); 2623 2624 return r; 2625 } 2626 2627 static void emit_flags(struct pool_features *pf, char *result, 2628 unsigned sz, unsigned maxlen) 2629 { 2630 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + 2631 !pf->discard_passdown + (pf->mode == PM_READ_ONLY); 2632 DMEMIT("%u ", count); 2633 2634 if (!pf->zero_new_blocks) 2635 DMEMIT("skip_block_zeroing "); 2636 2637 if (!pf->discard_enabled) 2638 DMEMIT("ignore_discard "); 2639 2640 if (!pf->discard_passdown) 2641 DMEMIT("no_discard_passdown "); 2642 2643 if (pf->mode == PM_READ_ONLY) 2644 DMEMIT("read_only "); 2645 } 2646 2647 /* 2648 * Status line is: 2649 * <transaction id> <used metadata 
sectors>/<total metadata sectors> 2650 * <used data sectors>/<total data sectors> <held metadata root> 2651 */ 2652 static int pool_status(struct dm_target *ti, status_type_t type, 2653 unsigned status_flags, char *result, unsigned maxlen) 2654 { 2655 int r; 2656 unsigned sz = 0; 2657 uint64_t transaction_id; 2658 dm_block_t nr_free_blocks_data; 2659 dm_block_t nr_free_blocks_metadata; 2660 dm_block_t nr_blocks_data; 2661 dm_block_t nr_blocks_metadata; 2662 dm_block_t held_root; 2663 char buf[BDEVNAME_SIZE]; 2664 char buf2[BDEVNAME_SIZE]; 2665 struct pool_c *pt = ti->private; 2666 struct pool *pool = pt->pool; 2667 2668 switch (type) { 2669 case STATUSTYPE_INFO: 2670 if (get_pool_mode(pool) == PM_FAIL) { 2671 DMEMIT("Fail"); 2672 break; 2673 } 2674 2675 /* Commit to ensure statistics aren't out-of-date */ 2676 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 2677 (void) commit_or_fallback(pool); 2678 2679 r = dm_pool_get_metadata_transaction_id(pool->pmd, 2680 &transaction_id); 2681 if (r) 2682 return r; 2683 2684 r = dm_pool_get_free_metadata_block_count(pool->pmd, 2685 &nr_free_blocks_metadata); 2686 if (r) 2687 return r; 2688 2689 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2690 if (r) 2691 return r; 2692 2693 r = dm_pool_get_free_block_count(pool->pmd, 2694 &nr_free_blocks_data); 2695 if (r) 2696 return r; 2697 2698 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2699 if (r) 2700 return r; 2701 2702 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2703 if (r) 2704 return r; 2705 2706 DMEMIT("%llu %llu/%llu %llu/%llu ", 2707 (unsigned long long)transaction_id, 2708 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2709 (unsigned long long)nr_blocks_metadata, 2710 (unsigned long long)(nr_blocks_data - nr_free_blocks_data), 2711 (unsigned long long)nr_blocks_data); 2712 2713 if (held_root) 2714 DMEMIT("%llu ", held_root); 2715 else 2716 DMEMIT("- "); 2717 2718 if (pool->pf.mode == PM_READ_ONLY) 2719 DMEMIT("ro "); 2720 else 2721 DMEMIT("rw "); 2722 2723 if (pool->pf.discard_enabled && pool->pf.discard_passdown) 2724 DMEMIT("discard_passdown"); 2725 else 2726 DMEMIT("no_discard_passdown"); 2727 2728 break; 2729 2730 case STATUSTYPE_TABLE: 2731 DMEMIT("%s %s %lu %llu ", 2732 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), 2733 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 2734 (unsigned long)pool->sectors_per_block, 2735 (unsigned long long)pt->low_water_blocks); 2736 emit_flags(&pt->requested_pf, result, sz, maxlen); 2737 break; 2738 } 2739 2740 return 0; 2741 } 2742 2743 static int pool_iterate_devices(struct dm_target *ti, 2744 iterate_devices_callout_fn fn, void *data) 2745 { 2746 struct pool_c *pt = ti->private; 2747 2748 return fn(ti, pt->data_dev, 0, ti->len, data); 2749 } 2750 2751 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 2752 struct bio_vec *biovec, int max_size) 2753 { 2754 struct pool_c *pt = ti->private; 2755 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 2756 2757 if (!q->merge_bvec_fn) 2758 return max_size; 2759 2760 bvm->bi_bdev = pt->data_dev->bdev; 2761 2762 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2763 } 2764 2765 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) 2766 { 2767 struct pool *pool = pt->pool; 2768 struct queue_limits *data_limits; 2769 2770 limits->max_discard_sectors = pool->sectors_per_block; 2771 2772 /* 2773 * discard_granularity is just a hint, and not enforced. 
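	 *
	 * With passdown enabled we advertise the data device's own
	 * granularity; otherwise we advertise one pool block, i.e.
	 * sectors_per_block << SECTOR_SHIFT bytes (a 2048-sector block,
	 * for example, yields a 1MiB granularity).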
2774 */ 2775 if (pt->adjusted_pf.discard_passdown) { 2776 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; 2777 limits->discard_granularity = data_limits->discard_granularity; 2778 } else 2779 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2780 } 2781 2782 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2783 { 2784 struct pool_c *pt = ti->private; 2785 struct pool *pool = pt->pool; 2786 2787 blk_limits_io_min(limits, 0); 2788 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2789 2790 /* 2791 * pt->adjusted_pf is a staging area for the actual features to use. 2792 * They get transferred to the live pool in bind_control_target() 2793 * called from pool_preresume(). 2794 */ 2795 if (!pt->adjusted_pf.discard_enabled) 2796 return; 2797 2798 disable_passdown_if_not_supported(pt); 2799 2800 set_discard_limits(pt, limits); 2801 } 2802 2803 static struct target_type pool_target = { 2804 .name = "thin-pool", 2805 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2806 DM_TARGET_IMMUTABLE, 2807 .version = {1, 4, 0}, 2808 .module = THIS_MODULE, 2809 .ctr = pool_ctr, 2810 .dtr = pool_dtr, 2811 .map = pool_map, 2812 .postsuspend = pool_postsuspend, 2813 .preresume = pool_preresume, 2814 .resume = pool_resume, 2815 .message = pool_message, 2816 .status = pool_status, 2817 .merge = pool_merge, 2818 .iterate_devices = pool_iterate_devices, 2819 .io_hints = pool_io_hints, 2820 }; 2821 2822 /*---------------------------------------------------------------- 2823 * Thin target methods 2824 *--------------------------------------------------------------*/ 2825 static void thin_dtr(struct dm_target *ti) 2826 { 2827 struct thin_c *tc = ti->private; 2828 2829 mutex_lock(&dm_thin_pool_table.mutex); 2830 2831 __pool_dec(tc->pool); 2832 dm_pool_close_thin_device(tc->td); 2833 dm_put_device(ti, tc->pool_dev); 2834 if (tc->origin_dev) 2835 dm_put_device(ti, tc->origin_dev); 2836 kfree(tc); 2837 2838 mutex_unlock(&dm_thin_pool_table.mutex); 2839 } 2840 2841 /* 2842 * Thin target parameters: 2843 * 2844 * <pool_dev> <dev_id> [origin_dev] 2845 * 2846 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2847 * dev_id: the internal device identifier 2848 * origin_dev: a device external to the pool that should act as the origin 2849 * 2850 * If the pool device has discards disabled, they get disabled for the thin 2851 * device as well. 
2852 */ 2853 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2854 { 2855 int r; 2856 struct thin_c *tc; 2857 struct dm_dev *pool_dev, *origin_dev; 2858 struct mapped_device *pool_md; 2859 2860 mutex_lock(&dm_thin_pool_table.mutex); 2861 2862 if (argc != 2 && argc != 3) { 2863 ti->error = "Invalid argument count"; 2864 r = -EINVAL; 2865 goto out_unlock; 2866 } 2867 2868 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); 2869 if (!tc) { 2870 ti->error = "Out of memory"; 2871 r = -ENOMEM; 2872 goto out_unlock; 2873 } 2874 2875 if (argc == 3) { 2876 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 2877 if (r) { 2878 ti->error = "Error opening origin device"; 2879 goto bad_origin_dev; 2880 } 2881 tc->origin_dev = origin_dev; 2882 } 2883 2884 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2885 if (r) { 2886 ti->error = "Error opening pool device"; 2887 goto bad_pool_dev; 2888 } 2889 tc->pool_dev = pool_dev; 2890 2891 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { 2892 ti->error = "Invalid device id"; 2893 r = -EINVAL; 2894 goto bad_common; 2895 } 2896 2897 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); 2898 if (!pool_md) { 2899 ti->error = "Couldn't get pool mapped device"; 2900 r = -EINVAL; 2901 goto bad_common; 2902 } 2903 2904 tc->pool = __pool_table_lookup(pool_md); 2905 if (!tc->pool) { 2906 ti->error = "Couldn't find pool object"; 2907 r = -EINVAL; 2908 goto bad_pool_lookup; 2909 } 2910 __pool_inc(tc->pool); 2911 2912 if (get_pool_mode(tc->pool) == PM_FAIL) { 2913 ti->error = "Couldn't open thin device, Pool is in fail mode"; 2914 goto bad_thin_open; 2915 } 2916 2917 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 2918 if (r) { 2919 ti->error = "Couldn't open thin internal device"; 2920 goto bad_thin_open; 2921 } 2922 2923 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 2924 if (r) 2925 goto bad_thin_open; 2926 2927 ti->num_flush_requests = 1; 2928 ti->flush_supported = true; 2929 2930 /* In case the pool supports discards, pass them on. 
*/ 2931 if (tc->pool->pf.discard_enabled) { 2932 ti->discards_supported = true; 2933 ti->num_discard_requests = 1; 2934 ti->discard_zeroes_data_unsupported = true; 2935 /* Discard requests must be split on a block boundary */ 2936 ti->split_discard_requests = true; 2937 } 2938 2939 dm_put(pool_md); 2940 2941 mutex_unlock(&dm_thin_pool_table.mutex); 2942 2943 return 0; 2944 2945 bad_thin_open: 2946 __pool_dec(tc->pool); 2947 bad_pool_lookup: 2948 dm_put(pool_md); 2949 bad_common: 2950 dm_put_device(ti, tc->pool_dev); 2951 bad_pool_dev: 2952 if (tc->origin_dev) 2953 dm_put_device(ti, tc->origin_dev); 2954 bad_origin_dev: 2955 kfree(tc); 2956 out_unlock: 2957 mutex_unlock(&dm_thin_pool_table.mutex); 2958 2959 return r; 2960 } 2961 2962 static int thin_map(struct dm_target *ti, struct bio *bio, 2963 union map_info *map_context) 2964 { 2965 bio->bi_sector = dm_target_offset(ti, bio->bi_sector); 2966 2967 return thin_bio_map(ti, bio, map_context); 2968 } 2969 2970 static int thin_endio(struct dm_target *ti, 2971 struct bio *bio, int err, 2972 union map_info *map_context) 2973 { 2974 unsigned long flags; 2975 struct dm_thin_endio_hook *h = map_context->ptr; 2976 struct list_head work; 2977 struct dm_thin_new_mapping *m, *tmp; 2978 struct pool *pool = h->tc->pool; 2979 2980 if (h->shared_read_entry) { 2981 INIT_LIST_HEAD(&work); 2982 ds_dec(h->shared_read_entry, &work); 2983 2984 spin_lock_irqsave(&pool->lock, flags); 2985 list_for_each_entry_safe(m, tmp, &work, list) { 2986 list_del(&m->list); 2987 m->quiesced = 1; 2988 __maybe_add_mapping(m); 2989 } 2990 spin_unlock_irqrestore(&pool->lock, flags); 2991 } 2992 2993 if (h->all_io_entry) { 2994 INIT_LIST_HEAD(&work); 2995 ds_dec(h->all_io_entry, &work); 2996 spin_lock_irqsave(&pool->lock, flags); 2997 list_for_each_entry_safe(m, tmp, &work, list) 2998 list_add(&m->list, &pool->prepared_discards); 2999 spin_unlock_irqrestore(&pool->lock, flags); 3000 } 3001 3002 mempool_free(h, pool->endio_hook_pool); 3003 3004 return 0; 3005 } 3006 3007 static void thin_postsuspend(struct dm_target *ti) 3008 { 3009 if (dm_noflush_suspending(ti)) 3010 requeue_io((struct thin_c *)ti->private); 3011 } 3012 3013 /* 3014 * <nr mapped sectors> <highest mapped sector> 3015 */ 3016 static int thin_status(struct dm_target *ti, status_type_t type, 3017 unsigned status_flags, char *result, unsigned maxlen) 3018 { 3019 int r; 3020 ssize_t sz = 0; 3021 dm_block_t mapped, highest; 3022 char buf[BDEVNAME_SIZE]; 3023 struct thin_c *tc = ti->private; 3024 3025 if (get_pool_mode(tc->pool) == PM_FAIL) { 3026 DMEMIT("Fail"); 3027 return 0; 3028 } 3029 3030 if (!tc->td) 3031 DMEMIT("-"); 3032 else { 3033 switch (type) { 3034 case STATUSTYPE_INFO: 3035 r = dm_thin_get_mapped_count(tc->td, &mapped); 3036 if (r) 3037 return r; 3038 3039 r = dm_thin_get_highest_mapped_block(tc->td, &highest); 3040 if (r < 0) 3041 return r; 3042 3043 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); 3044 if (r) 3045 DMEMIT("%llu", ((highest + 1) * 3046 tc->pool->sectors_per_block) - 1); 3047 else 3048 DMEMIT("-"); 3049 break; 3050 3051 case STATUSTYPE_TABLE: 3052 DMEMIT("%s %lu", 3053 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 3054 (unsigned long) tc->dev_id); 3055 if (tc->origin_dev) 3056 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); 3057 break; 3058 } 3059 } 3060 3061 return 0; 3062 } 3063 3064 static int thin_iterate_devices(struct dm_target *ti, 3065 iterate_devices_callout_fn fn, void *data) 3066 { 3067 sector_t blocks; 3068 struct thin_c *tc = ti->private; 3069 struct pool 
*pool = tc->pool; 3070 3071 /* 3072 * We can't call dm_pool_get_data_dev_size() since that blocks. So 3073 * we follow a more convoluted path through to the pool's target. 3074 */ 3075 if (!pool->ti) 3076 return 0; /* nothing is bound */ 3077 3078 blocks = pool->ti->len; 3079 (void) sector_div(blocks, pool->sectors_per_block); 3080 if (blocks) 3081 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); 3082 3083 return 0; 3084 } 3085 3086 /* 3087 * A thin device always inherits its queue limits from its pool. 3088 */ 3089 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 3090 { 3091 struct thin_c *tc = ti->private; 3092 3093 *limits = bdev_get_queue(tc->pool_dev->bdev)->limits; 3094 } 3095 3096 static struct target_type thin_target = { 3097 .name = "thin", 3098 .version = {1, 4, 0}, 3099 .module = THIS_MODULE, 3100 .ctr = thin_ctr, 3101 .dtr = thin_dtr, 3102 .map = thin_map, 3103 .end_io = thin_endio, 3104 .postsuspend = thin_postsuspend, 3105 .status = thin_status, 3106 .iterate_devices = thin_iterate_devices, 3107 .io_hints = thin_io_hints, 3108 }; 3109 3110 /*----------------------------------------------------------------*/ 3111 3112 static int __init dm_thin_init(void) 3113 { 3114 int r; 3115 3116 pool_table_init(); 3117 3118 r = dm_register_target(&thin_target); 3119 if (r) 3120 return r; 3121 3122 r = dm_register_target(&pool_target); 3123 if (r) 3124 goto bad_pool_target; 3125 3126 r = -ENOMEM; 3127 3128 _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); 3129 if (!_cell_cache) 3130 goto bad_cell_cache; 3131 3132 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); 3133 if (!_new_mapping_cache) 3134 goto bad_new_mapping_cache; 3135 3136 _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0); 3137 if (!_endio_hook_cache) 3138 goto bad_endio_hook_cache; 3139 3140 return 0; 3141 3142 bad_endio_hook_cache: 3143 kmem_cache_destroy(_new_mapping_cache); 3144 bad_new_mapping_cache: 3145 kmem_cache_destroy(_cell_cache); 3146 bad_cell_cache: 3147 dm_unregister_target(&pool_target); 3148 bad_pool_target: 3149 dm_unregister_target(&thin_target); 3150 3151 return r; 3152 } 3153 3154 static void dm_thin_exit(void) 3155 { 3156 dm_unregister_target(&thin_target); 3157 dm_unregister_target(&pool_target); 3158 3159 kmem_cache_destroy(_cell_cache); 3160 kmem_cache_destroy(_new_mapping_cache); 3161 kmem_cache_destroy(_endio_hook_cache); 3162 } 3163 3164 module_init(dm_thin_init); 3165 module_exit(dm_thin_exit); 3166 3167 MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); 3168 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3169 MODULE_LICENSE("GPL"); 3170
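
/*
 * Example usage from userspace (a sketch only: device names, sizes and
 * device ids below are illustrative; see
 * Documentation/device-mapper/thin-provisioning.txt for details).
 *
 * Create a pool mapping 10GiB of data device, with 64KiB (128-sector)
 * blocks and a low water mark of 32768 blocks:
 *
 *   dmsetup create pool \
 *	--table "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data \
 *		 128 32768 1 skip_block_zeroing"
 *
 * Create and activate a 1GiB thin volume with device id 0:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *
 * Snapshot it (the origin thin device should be suspended while the
 * create_snap message is sent):
 *
 *   dmsetup suspend /dev/mapper/thin
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup resume /dev/mapper/thin
 */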