1 /* 2 * Copyright (C) 2011 Red Hat UK. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm-thin-metadata.h" 8 9 #include <linux/device-mapper.h> 10 #include <linux/dm-io.h> 11 #include <linux/dm-kcopyd.h> 12 #include <linux/list.h> 13 #include <linux/init.h> 14 #include <linux/module.h> 15 #include <linux/slab.h> 16 17 #define DM_MSG_PREFIX "thin" 18 19 /* 20 * Tunable constants 21 */ 22 #define ENDIO_HOOK_POOL_SIZE 10240 23 #define DEFERRED_SET_SIZE 64 24 #define MAPPING_POOL_SIZE 1024 25 #define PRISON_CELLS 1024 26 #define COMMIT_PERIOD HZ 27 28 /* 29 * The block size of the device holding pool data must be 30 * between 64KB and 1GB. 31 */ 32 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT) 33 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 34 35 /* 36 * Device id is restricted to 24 bits. 37 */ 38 #define MAX_DEV_ID ((1 << 24) - 1) 39 40 /* 41 * How do we handle breaking sharing of data blocks? 42 * ================================================= 43 * 44 * We use a standard copy-on-write btree to store the mappings for the 45 * devices (note I'm talking about copy-on-write of the metadata here, not 46 * the data). When you take an internal snapshot you clone the root node 47 * of the origin btree. After this there is no concept of an origin or a 48 * snapshot. They are just two device trees that happen to point to the 49 * same data blocks. 50 * 51 * When we get a write in we decide if it's to a shared data block using 52 * some timestamp magic. If it is, we have to break sharing. 53 * 54 * Let's say we write to a shared block in what was the origin. The 55 * steps are: 56 * 57 * i) plug io further to this physical block. (see bio_prison code). 58 * 59 * ii) quiesce any read io to that shared data block. Obviously 60 * including all devices that share this block. (see deferred_set code) 61 * 62 * iii) copy the data block to a newly allocate block. This step can be 63 * missed out if the io covers the block. (schedule_copy). 64 * 65 * iv) insert the new mapping into the origin's btree 66 * (process_prepared_mapping). This act of inserting breaks some 67 * sharing of btree nodes between the two devices. Breaking sharing only 68 * effects the btree of that specific device. Btrees for the other 69 * devices that share the block never change. The btree for the origin 70 * device as it was after the last commit is untouched, ie. we're using 71 * persistent data structures in the functional programming sense. 72 * 73 * v) unplug io to this physical block, including the io that triggered 74 * the breaking of sharing. 75 * 76 * Steps (ii) and (iii) occur in parallel. 77 * 78 * The metadata _doesn't_ need to be committed before the io continues. We 79 * get away with this because the io is always written to a _new_ block. 80 * If there's a crash, then: 81 * 82 * - The origin mapping will point to the old origin block (the shared 83 * one). This will contain the data as it was before the io that triggered 84 * the breaking of sharing came in. 85 * 86 * - The snap mapping still points to the old block. As it would after 87 * the commit. 88 * 89 * The downside of this scheme is the timestamp magic isn't perfect, and 90 * will continue to think that data block in the snapshot device is shared 91 * even after the write to the origin has broken sharing. I suspect data 92 * blocks will typically be shared by many different devices, so we're 93 * breaking sharing n + 1 times, rather than n, where n is the number of 94 * devices that reference this data block. At the moment I think the 95 * benefits far, far outweigh the disadvantages. 96 */ 97 98 /*----------------------------------------------------------------*/ 99 100 /* 101 * Sometimes we can't deal with a bio straight away. We put them in prison 102 * where they can't cause any mischief. Bios are put in a cell identified 103 * by a key, multiple bios can be in the same cell. When the cell is 104 * subsequently unlocked the bios become available. 105 */ 106 struct bio_prison; 107 108 struct cell_key { 109 int virtual; 110 dm_thin_id dev; 111 dm_block_t block; 112 }; 113 114 struct dm_bio_prison_cell { 115 struct hlist_node list; 116 struct bio_prison *prison; 117 struct cell_key key; 118 struct bio *holder; 119 struct bio_list bios; 120 }; 121 122 struct bio_prison { 123 spinlock_t lock; 124 mempool_t *cell_pool; 125 126 unsigned nr_buckets; 127 unsigned hash_mask; 128 struct hlist_head *cells; 129 }; 130 131 static uint32_t calc_nr_buckets(unsigned nr_cells) 132 { 133 uint32_t n = 128; 134 135 nr_cells /= 4; 136 nr_cells = min(nr_cells, 8192u); 137 138 while (n < nr_cells) 139 n <<= 1; 140 141 return n; 142 } 143 144 static struct kmem_cache *_cell_cache; 145 146 /* 147 * @nr_cells should be the number of cells you want in use _concurrently_. 148 * Don't confuse it with the number of distinct keys. 149 */ 150 static struct bio_prison *prison_create(unsigned nr_cells) 151 { 152 unsigned i; 153 uint32_t nr_buckets = calc_nr_buckets(nr_cells); 154 size_t len = sizeof(struct bio_prison) + 155 (sizeof(struct hlist_head) * nr_buckets); 156 struct bio_prison *prison = kmalloc(len, GFP_KERNEL); 157 158 if (!prison) 159 return NULL; 160 161 spin_lock_init(&prison->lock); 162 prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); 163 if (!prison->cell_pool) { 164 kfree(prison); 165 return NULL; 166 } 167 168 prison->nr_buckets = nr_buckets; 169 prison->hash_mask = nr_buckets - 1; 170 prison->cells = (struct hlist_head *) (prison + 1); 171 for (i = 0; i < nr_buckets; i++) 172 INIT_HLIST_HEAD(prison->cells + i); 173 174 return prison; 175 } 176 177 static void prison_destroy(struct bio_prison *prison) 178 { 179 mempool_destroy(prison->cell_pool); 180 kfree(prison); 181 } 182 183 static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key) 184 { 185 const unsigned long BIG_PRIME = 4294967291UL; 186 uint64_t hash = key->block * BIG_PRIME; 187 188 return (uint32_t) (hash & prison->hash_mask); 189 } 190 191 static int keys_equal(struct cell_key *lhs, struct cell_key *rhs) 192 { 193 return (lhs->virtual == rhs->virtual) && 194 (lhs->dev == rhs->dev) && 195 (lhs->block == rhs->block); 196 } 197 198 static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, 199 struct cell_key *key) 200 { 201 struct dm_bio_prison_cell *cell; 202 struct hlist_node *tmp; 203 204 hlist_for_each_entry(cell, tmp, bucket, list) 205 if (keys_equal(&cell->key, key)) 206 return cell; 207 208 return NULL; 209 } 210 211 /* 212 * This may block if a new cell needs allocating. You must ensure that 213 * cells will be unlocked even if the calling thread is blocked. 214 * 215 * Returns 1 if the cell was already held, 0 if @inmate is the new holder. 216 */ 217 static int bio_detain(struct bio_prison *prison, struct cell_key *key, 218 struct bio *inmate, struct dm_bio_prison_cell **ref) 219 { 220 int r = 1; 221 unsigned long flags; 222 uint32_t hash = hash_key(prison, key); 223 struct dm_bio_prison_cell *cell, *cell2; 224 225 BUG_ON(hash > prison->nr_buckets); 226 227 spin_lock_irqsave(&prison->lock, flags); 228 229 cell = __search_bucket(prison->cells + hash, key); 230 if (cell) { 231 bio_list_add(&cell->bios, inmate); 232 goto out; 233 } 234 235 /* 236 * Allocate a new cell 237 */ 238 spin_unlock_irqrestore(&prison->lock, flags); 239 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 240 spin_lock_irqsave(&prison->lock, flags); 241 242 /* 243 * We've been unlocked, so we have to double check that 244 * nobody else has inserted this cell in the meantime. 245 */ 246 cell = __search_bucket(prison->cells + hash, key); 247 if (cell) { 248 mempool_free(cell2, prison->cell_pool); 249 bio_list_add(&cell->bios, inmate); 250 goto out; 251 } 252 253 /* 254 * Use new cell. 255 */ 256 cell = cell2; 257 258 cell->prison = prison; 259 memcpy(&cell->key, key, sizeof(cell->key)); 260 cell->holder = inmate; 261 bio_list_init(&cell->bios); 262 hlist_add_head(&cell->list, prison->cells + hash); 263 264 r = 0; 265 266 out: 267 spin_unlock_irqrestore(&prison->lock, flags); 268 269 *ref = cell; 270 271 return r; 272 } 273 274 /* 275 * @inmates must have been initialised prior to this call 276 */ 277 static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) 278 { 279 struct bio_prison *prison = cell->prison; 280 281 hlist_del(&cell->list); 282 283 if (inmates) { 284 bio_list_add(inmates, cell->holder); 285 bio_list_merge(inmates, &cell->bios); 286 } 287 288 mempool_free(cell, prison->cell_pool); 289 } 290 291 static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) 292 { 293 unsigned long flags; 294 struct bio_prison *prison = cell->prison; 295 296 spin_lock_irqsave(&prison->lock, flags); 297 __cell_release(cell, bios); 298 spin_unlock_irqrestore(&prison->lock, flags); 299 } 300 301 /* 302 * There are a couple of places where we put a bio into a cell briefly 303 * before taking it out again. In these situations we know that no other 304 * bio may be in the cell. This function releases the cell, and also does 305 * a sanity check. 306 */ 307 static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) 308 { 309 BUG_ON(cell->holder != bio); 310 BUG_ON(!bio_list_empty(&cell->bios)); 311 312 __cell_release(cell, NULL); 313 } 314 315 static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) 316 { 317 unsigned long flags; 318 struct bio_prison *prison = cell->prison; 319 320 spin_lock_irqsave(&prison->lock, flags); 321 __cell_release_singleton(cell, bio); 322 spin_unlock_irqrestore(&prison->lock, flags); 323 } 324 325 /* 326 * Sometimes we don't want the holder, just the additional bios. 327 */ 328 static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, 329 struct bio_list *inmates) 330 { 331 struct bio_prison *prison = cell->prison; 332 333 hlist_del(&cell->list); 334 bio_list_merge(inmates, &cell->bios); 335 336 mempool_free(cell, prison->cell_pool); 337 } 338 339 static void cell_release_no_holder(struct dm_bio_prison_cell *cell, 340 struct bio_list *inmates) 341 { 342 unsigned long flags; 343 struct bio_prison *prison = cell->prison; 344 345 spin_lock_irqsave(&prison->lock, flags); 346 __cell_release_no_holder(cell, inmates); 347 spin_unlock_irqrestore(&prison->lock, flags); 348 } 349 350 static void cell_error(struct dm_bio_prison_cell *cell) 351 { 352 struct bio_prison *prison = cell->prison; 353 struct bio_list bios; 354 struct bio *bio; 355 unsigned long flags; 356 357 bio_list_init(&bios); 358 359 spin_lock_irqsave(&prison->lock, flags); 360 __cell_release(cell, &bios); 361 spin_unlock_irqrestore(&prison->lock, flags); 362 363 while ((bio = bio_list_pop(&bios))) 364 bio_io_error(bio); 365 } 366 367 /*----------------------------------------------------------------*/ 368 369 /* 370 * We use the deferred set to keep track of pending reads to shared blocks. 371 * We do this to ensure the new mapping caused by a write isn't performed 372 * until these prior reads have completed. Otherwise the insertion of the 373 * new mapping could free the old block that the read bios are mapped to. 374 */ 375 376 struct deferred_set; 377 struct deferred_entry { 378 struct deferred_set *ds; 379 unsigned count; 380 struct list_head work_items; 381 }; 382 383 struct deferred_set { 384 spinlock_t lock; 385 unsigned current_entry; 386 unsigned sweeper; 387 struct deferred_entry entries[DEFERRED_SET_SIZE]; 388 }; 389 390 static void ds_init(struct deferred_set *ds) 391 { 392 int i; 393 394 spin_lock_init(&ds->lock); 395 ds->current_entry = 0; 396 ds->sweeper = 0; 397 for (i = 0; i < DEFERRED_SET_SIZE; i++) { 398 ds->entries[i].ds = ds; 399 ds->entries[i].count = 0; 400 INIT_LIST_HEAD(&ds->entries[i].work_items); 401 } 402 } 403 404 static struct deferred_entry *ds_inc(struct deferred_set *ds) 405 { 406 unsigned long flags; 407 struct deferred_entry *entry; 408 409 spin_lock_irqsave(&ds->lock, flags); 410 entry = ds->entries + ds->current_entry; 411 entry->count++; 412 spin_unlock_irqrestore(&ds->lock, flags); 413 414 return entry; 415 } 416 417 static unsigned ds_next(unsigned index) 418 { 419 return (index + 1) % DEFERRED_SET_SIZE; 420 } 421 422 static void __sweep(struct deferred_set *ds, struct list_head *head) 423 { 424 while ((ds->sweeper != ds->current_entry) && 425 !ds->entries[ds->sweeper].count) { 426 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 427 ds->sweeper = ds_next(ds->sweeper); 428 } 429 430 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) 431 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 432 } 433 434 static void ds_dec(struct deferred_entry *entry, struct list_head *head) 435 { 436 unsigned long flags; 437 438 spin_lock_irqsave(&entry->ds->lock, flags); 439 BUG_ON(!entry->count); 440 --entry->count; 441 __sweep(entry->ds, head); 442 spin_unlock_irqrestore(&entry->ds->lock, flags); 443 } 444 445 /* 446 * Returns 1 if deferred or 0 if no pending items to delay job. 447 */ 448 static int ds_add_work(struct deferred_set *ds, struct list_head *work) 449 { 450 int r = 1; 451 unsigned long flags; 452 unsigned next_entry; 453 454 spin_lock_irqsave(&ds->lock, flags); 455 if ((ds->sweeper == ds->current_entry) && 456 !ds->entries[ds->current_entry].count) 457 r = 0; 458 else { 459 list_add(work, &ds->entries[ds->current_entry].work_items); 460 next_entry = ds_next(ds->current_entry); 461 if (!ds->entries[next_entry].count) 462 ds->current_entry = next_entry; 463 } 464 spin_unlock_irqrestore(&ds->lock, flags); 465 466 return r; 467 } 468 469 /*----------------------------------------------------------------*/ 470 471 /* 472 * Key building. 473 */ 474 static void build_data_key(struct dm_thin_device *td, 475 dm_block_t b, struct cell_key *key) 476 { 477 key->virtual = 0; 478 key->dev = dm_thin_dev_id(td); 479 key->block = b; 480 } 481 482 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 483 struct cell_key *key) 484 { 485 key->virtual = 1; 486 key->dev = dm_thin_dev_id(td); 487 key->block = b; 488 } 489 490 /*----------------------------------------------------------------*/ 491 492 /* 493 * A pool device ties together a metadata device and a data device. It 494 * also provides the interface for creating and destroying internal 495 * devices. 496 */ 497 struct dm_thin_new_mapping; 498 499 struct pool_features { 500 unsigned zero_new_blocks:1; 501 unsigned discard_enabled:1; 502 unsigned discard_passdown:1; 503 }; 504 505 struct pool { 506 struct list_head list; 507 struct dm_target *ti; /* Only set if a pool target is bound */ 508 509 struct mapped_device *pool_md; 510 struct block_device *md_dev; 511 struct dm_pool_metadata *pmd; 512 513 uint32_t sectors_per_block; 514 unsigned block_shift; 515 dm_block_t offset_mask; 516 dm_block_t low_water_blocks; 517 518 struct pool_features pf; 519 unsigned low_water_triggered:1; /* A dm event has been sent */ 520 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 521 522 struct bio_prison *prison; 523 struct dm_kcopyd_client *copier; 524 525 struct workqueue_struct *wq; 526 struct work_struct worker; 527 struct delayed_work waker; 528 529 unsigned ref_count; 530 unsigned long last_commit_jiffies; 531 532 spinlock_t lock; 533 struct bio_list deferred_bios; 534 struct bio_list deferred_flush_bios; 535 struct list_head prepared_mappings; 536 struct list_head prepared_discards; 537 538 struct bio_list retry_on_resume_list; 539 540 struct deferred_set shared_read_ds; 541 struct deferred_set all_io_ds; 542 543 struct dm_thin_new_mapping *next_mapping; 544 mempool_t *mapping_pool; 545 mempool_t *endio_hook_pool; 546 }; 547 548 /* 549 * Target context for a pool. 550 */ 551 struct pool_c { 552 struct dm_target *ti; 553 struct pool *pool; 554 struct dm_dev *data_dev; 555 struct dm_dev *metadata_dev; 556 struct dm_target_callbacks callbacks; 557 558 dm_block_t low_water_blocks; 559 struct pool_features pf; 560 }; 561 562 /* 563 * Target context for a thin. 564 */ 565 struct thin_c { 566 struct dm_dev *pool_dev; 567 struct dm_dev *origin_dev; 568 dm_thin_id dev_id; 569 570 struct pool *pool; 571 struct dm_thin_device *td; 572 }; 573 574 /*----------------------------------------------------------------*/ 575 576 /* 577 * A global list of pools that uses a struct mapped_device as a key. 578 */ 579 static struct dm_thin_pool_table { 580 struct mutex mutex; 581 struct list_head pools; 582 } dm_thin_pool_table; 583 584 static void pool_table_init(void) 585 { 586 mutex_init(&dm_thin_pool_table.mutex); 587 INIT_LIST_HEAD(&dm_thin_pool_table.pools); 588 } 589 590 static void __pool_table_insert(struct pool *pool) 591 { 592 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 593 list_add(&pool->list, &dm_thin_pool_table.pools); 594 } 595 596 static void __pool_table_remove(struct pool *pool) 597 { 598 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 599 list_del(&pool->list); 600 } 601 602 static struct pool *__pool_table_lookup(struct mapped_device *md) 603 { 604 struct pool *pool = NULL, *tmp; 605 606 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 607 608 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 609 if (tmp->pool_md == md) { 610 pool = tmp; 611 break; 612 } 613 } 614 615 return pool; 616 } 617 618 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) 619 { 620 struct pool *pool = NULL, *tmp; 621 622 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 623 624 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 625 if (tmp->md_dev == md_dev) { 626 pool = tmp; 627 break; 628 } 629 } 630 631 return pool; 632 } 633 634 /*----------------------------------------------------------------*/ 635 636 struct dm_thin_endio_hook { 637 struct thin_c *tc; 638 struct deferred_entry *shared_read_entry; 639 struct deferred_entry *all_io_entry; 640 struct dm_thin_new_mapping *overwrite_mapping; 641 }; 642 643 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 644 { 645 struct bio *bio; 646 struct bio_list bios; 647 648 bio_list_init(&bios); 649 bio_list_merge(&bios, master); 650 bio_list_init(master); 651 652 while ((bio = bio_list_pop(&bios))) { 653 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 654 655 if (h->tc == tc) 656 bio_endio(bio, DM_ENDIO_REQUEUE); 657 else 658 bio_list_add(master, bio); 659 } 660 } 661 662 static void requeue_io(struct thin_c *tc) 663 { 664 struct pool *pool = tc->pool; 665 unsigned long flags; 666 667 spin_lock_irqsave(&pool->lock, flags); 668 __requeue_bio_list(tc, &pool->deferred_bios); 669 __requeue_bio_list(tc, &pool->retry_on_resume_list); 670 spin_unlock_irqrestore(&pool->lock, flags); 671 } 672 673 /* 674 * This section of code contains the logic for processing a thin device's IO. 675 * Much of the code depends on pool object resources (lists, workqueues, etc) 676 * but most is exclusively called from the thin target rather than the thin-pool 677 * target. 678 */ 679 680 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 681 { 682 return bio->bi_sector >> tc->pool->block_shift; 683 } 684 685 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 686 { 687 struct pool *pool = tc->pool; 688 689 bio->bi_bdev = tc->pool_dev->bdev; 690 bio->bi_sector = (block << pool->block_shift) + 691 (bio->bi_sector & pool->offset_mask); 692 } 693 694 static void remap_to_origin(struct thin_c *tc, struct bio *bio) 695 { 696 bio->bi_bdev = tc->origin_dev->bdev; 697 } 698 699 static void issue(struct thin_c *tc, struct bio *bio) 700 { 701 struct pool *pool = tc->pool; 702 unsigned long flags; 703 704 /* 705 * Batch together any FUA/FLUSH bios we find and then issue 706 * a single commit for them in process_deferred_bios(). 707 */ 708 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 709 spin_lock_irqsave(&pool->lock, flags); 710 bio_list_add(&pool->deferred_flush_bios, bio); 711 spin_unlock_irqrestore(&pool->lock, flags); 712 } else 713 generic_make_request(bio); 714 } 715 716 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 717 { 718 remap_to_origin(tc, bio); 719 issue(tc, bio); 720 } 721 722 static void remap_and_issue(struct thin_c *tc, struct bio *bio, 723 dm_block_t block) 724 { 725 remap(tc, bio, block); 726 issue(tc, bio); 727 } 728 729 /* 730 * wake_worker() is used when new work is queued and when pool_resume is 731 * ready to continue deferred IO processing. 732 */ 733 static void wake_worker(struct pool *pool) 734 { 735 queue_work(pool->wq, &pool->worker); 736 } 737 738 /*----------------------------------------------------------------*/ 739 740 /* 741 * Bio endio functions. 742 */ 743 struct dm_thin_new_mapping { 744 struct list_head list; 745 746 unsigned quiesced:1; 747 unsigned prepared:1; 748 unsigned pass_discard:1; 749 750 struct thin_c *tc; 751 dm_block_t virt_block; 752 dm_block_t data_block; 753 struct dm_bio_prison_cell *cell, *cell2; 754 int err; 755 756 /* 757 * If the bio covers the whole area of a block then we can avoid 758 * zeroing or copying. Instead this bio is hooked. The bio will 759 * still be in the cell, so care has to be taken to avoid issuing 760 * the bio twice. 761 */ 762 struct bio *bio; 763 bio_end_io_t *saved_bi_end_io; 764 }; 765 766 static void __maybe_add_mapping(struct dm_thin_new_mapping *m) 767 { 768 struct pool *pool = m->tc->pool; 769 770 if (m->quiesced && m->prepared) { 771 list_add(&m->list, &pool->prepared_mappings); 772 wake_worker(pool); 773 } 774 } 775 776 static void copy_complete(int read_err, unsigned long write_err, void *context) 777 { 778 unsigned long flags; 779 struct dm_thin_new_mapping *m = context; 780 struct pool *pool = m->tc->pool; 781 782 m->err = read_err || write_err ? -EIO : 0; 783 784 spin_lock_irqsave(&pool->lock, flags); 785 m->prepared = 1; 786 __maybe_add_mapping(m); 787 spin_unlock_irqrestore(&pool->lock, flags); 788 } 789 790 static void overwrite_endio(struct bio *bio, int err) 791 { 792 unsigned long flags; 793 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 794 struct dm_thin_new_mapping *m = h->overwrite_mapping; 795 struct pool *pool = m->tc->pool; 796 797 m->err = err; 798 799 spin_lock_irqsave(&pool->lock, flags); 800 m->prepared = 1; 801 __maybe_add_mapping(m); 802 spin_unlock_irqrestore(&pool->lock, flags); 803 } 804 805 /*----------------------------------------------------------------*/ 806 807 /* 808 * Workqueue. 809 */ 810 811 /* 812 * Prepared mapping jobs. 813 */ 814 815 /* 816 * This sends the bios in the cell back to the deferred_bios list. 817 */ 818 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell, 819 dm_block_t data_block) 820 { 821 struct pool *pool = tc->pool; 822 unsigned long flags; 823 824 spin_lock_irqsave(&pool->lock, flags); 825 cell_release(cell, &pool->deferred_bios); 826 spin_unlock_irqrestore(&tc->pool->lock, flags); 827 828 wake_worker(pool); 829 } 830 831 /* 832 * Same as cell_defer above, except it omits one particular detainee, 833 * a write bio that covers the block and has already been processed. 834 */ 835 static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell) 836 { 837 struct bio_list bios; 838 struct pool *pool = tc->pool; 839 unsigned long flags; 840 841 bio_list_init(&bios); 842 843 spin_lock_irqsave(&pool->lock, flags); 844 cell_release_no_holder(cell, &pool->deferred_bios); 845 spin_unlock_irqrestore(&pool->lock, flags); 846 847 wake_worker(pool); 848 } 849 850 static void process_prepared_mapping(struct dm_thin_new_mapping *m) 851 { 852 struct thin_c *tc = m->tc; 853 struct bio *bio; 854 int r; 855 856 bio = m->bio; 857 if (bio) 858 bio->bi_end_io = m->saved_bi_end_io; 859 860 if (m->err) { 861 cell_error(m->cell); 862 return; 863 } 864 865 /* 866 * Commit the prepared block into the mapping btree. 867 * Any I/O for this block arriving after this point will get 868 * remapped to it directly. 869 */ 870 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 871 if (r) { 872 DMERR("dm_thin_insert_block() failed"); 873 cell_error(m->cell); 874 return; 875 } 876 877 /* 878 * Release any bios held while the block was being provisioned. 879 * If we are processing a write bio that completely covers the block, 880 * we already processed it so can ignore it now when processing 881 * the bios in the cell. 882 */ 883 if (bio) { 884 cell_defer_except(tc, m->cell); 885 bio_endio(bio, 0); 886 } else 887 cell_defer(tc, m->cell, m->data_block); 888 889 list_del(&m->list); 890 mempool_free(m, tc->pool->mapping_pool); 891 } 892 893 static void process_prepared_discard(struct dm_thin_new_mapping *m) 894 { 895 int r; 896 struct thin_c *tc = m->tc; 897 898 r = dm_thin_remove_block(tc->td, m->virt_block); 899 if (r) 900 DMERR("dm_thin_remove_block() failed"); 901 902 /* 903 * Pass the discard down to the underlying device? 904 */ 905 if (m->pass_discard) 906 remap_and_issue(tc, m->bio, m->data_block); 907 else 908 bio_endio(m->bio, 0); 909 910 cell_defer_except(tc, m->cell); 911 cell_defer_except(tc, m->cell2); 912 mempool_free(m, tc->pool->mapping_pool); 913 } 914 915 static void process_prepared(struct pool *pool, struct list_head *head, 916 void (*fn)(struct dm_thin_new_mapping *)) 917 { 918 unsigned long flags; 919 struct list_head maps; 920 struct dm_thin_new_mapping *m, *tmp; 921 922 INIT_LIST_HEAD(&maps); 923 spin_lock_irqsave(&pool->lock, flags); 924 list_splice_init(head, &maps); 925 spin_unlock_irqrestore(&pool->lock, flags); 926 927 list_for_each_entry_safe(m, tmp, &maps, list) 928 fn(m); 929 } 930 931 /* 932 * Deferred bio jobs. 933 */ 934 static int io_overlaps_block(struct pool *pool, struct bio *bio) 935 { 936 return !(bio->bi_sector & pool->offset_mask) && 937 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 938 939 } 940 941 static int io_overwrites_block(struct pool *pool, struct bio *bio) 942 { 943 return (bio_data_dir(bio) == WRITE) && 944 io_overlaps_block(pool, bio); 945 } 946 947 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 948 bio_end_io_t *fn) 949 { 950 *save = bio->bi_end_io; 951 bio->bi_end_io = fn; 952 } 953 954 static int ensure_next_mapping(struct pool *pool) 955 { 956 if (pool->next_mapping) 957 return 0; 958 959 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); 960 961 return pool->next_mapping ? 0 : -ENOMEM; 962 } 963 964 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) 965 { 966 struct dm_thin_new_mapping *r = pool->next_mapping; 967 968 BUG_ON(!pool->next_mapping); 969 970 pool->next_mapping = NULL; 971 972 return r; 973 } 974 975 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 976 struct dm_dev *origin, dm_block_t data_origin, 977 dm_block_t data_dest, 978 struct dm_bio_prison_cell *cell, struct bio *bio) 979 { 980 int r; 981 struct pool *pool = tc->pool; 982 struct dm_thin_new_mapping *m = get_next_mapping(pool); 983 984 INIT_LIST_HEAD(&m->list); 985 m->quiesced = 0; 986 m->prepared = 0; 987 m->tc = tc; 988 m->virt_block = virt_block; 989 m->data_block = data_dest; 990 m->cell = cell; 991 m->err = 0; 992 m->bio = NULL; 993 994 if (!ds_add_work(&pool->shared_read_ds, &m->list)) 995 m->quiesced = 1; 996 997 /* 998 * IO to pool_dev remaps to the pool target's data_dev. 999 * 1000 * If the whole block of data is being overwritten, we can issue the 1001 * bio immediately. Otherwise we use kcopyd to clone the data first. 1002 */ 1003 if (io_overwrites_block(pool, bio)) { 1004 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1005 1006 h->overwrite_mapping = m; 1007 m->bio = bio; 1008 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1009 remap_and_issue(tc, bio, data_dest); 1010 } else { 1011 struct dm_io_region from, to; 1012 1013 from.bdev = origin->bdev; 1014 from.sector = data_origin * pool->sectors_per_block; 1015 from.count = pool->sectors_per_block; 1016 1017 to.bdev = tc->pool_dev->bdev; 1018 to.sector = data_dest * pool->sectors_per_block; 1019 to.count = pool->sectors_per_block; 1020 1021 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 1022 0, copy_complete, m); 1023 if (r < 0) { 1024 mempool_free(m, pool->mapping_pool); 1025 DMERR("dm_kcopyd_copy() failed"); 1026 cell_error(cell); 1027 } 1028 } 1029 } 1030 1031 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1032 dm_block_t data_origin, dm_block_t data_dest, 1033 struct dm_bio_prison_cell *cell, struct bio *bio) 1034 { 1035 schedule_copy(tc, virt_block, tc->pool_dev, 1036 data_origin, data_dest, cell, bio); 1037 } 1038 1039 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1040 dm_block_t data_dest, 1041 struct dm_bio_prison_cell *cell, struct bio *bio) 1042 { 1043 schedule_copy(tc, virt_block, tc->origin_dev, 1044 virt_block, data_dest, cell, bio); 1045 } 1046 1047 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1048 dm_block_t data_block, struct dm_bio_prison_cell *cell, 1049 struct bio *bio) 1050 { 1051 struct pool *pool = tc->pool; 1052 struct dm_thin_new_mapping *m = get_next_mapping(pool); 1053 1054 INIT_LIST_HEAD(&m->list); 1055 m->quiesced = 1; 1056 m->prepared = 0; 1057 m->tc = tc; 1058 m->virt_block = virt_block; 1059 m->data_block = data_block; 1060 m->cell = cell; 1061 m->err = 0; 1062 m->bio = NULL; 1063 1064 /* 1065 * If the whole block of data is being overwritten or we are not 1066 * zeroing pre-existing data, we can issue the bio immediately. 1067 * Otherwise we use kcopyd to zero the data first. 1068 */ 1069 if (!pool->pf.zero_new_blocks) 1070 process_prepared_mapping(m); 1071 1072 else if (io_overwrites_block(pool, bio)) { 1073 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1074 1075 h->overwrite_mapping = m; 1076 m->bio = bio; 1077 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1078 remap_and_issue(tc, bio, data_block); 1079 } else { 1080 int r; 1081 struct dm_io_region to; 1082 1083 to.bdev = tc->pool_dev->bdev; 1084 to.sector = data_block * pool->sectors_per_block; 1085 to.count = pool->sectors_per_block; 1086 1087 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 1088 if (r < 0) { 1089 mempool_free(m, pool->mapping_pool); 1090 DMERR("dm_kcopyd_zero() failed"); 1091 cell_error(cell); 1092 } 1093 } 1094 } 1095 1096 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1097 { 1098 int r; 1099 dm_block_t free_blocks; 1100 unsigned long flags; 1101 struct pool *pool = tc->pool; 1102 1103 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1104 if (r) 1105 return r; 1106 1107 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1108 DMWARN("%s: reached low water mark, sending event.", 1109 dm_device_name(pool->pool_md)); 1110 spin_lock_irqsave(&pool->lock, flags); 1111 pool->low_water_triggered = 1; 1112 spin_unlock_irqrestore(&pool->lock, flags); 1113 dm_table_event(pool->ti->table); 1114 } 1115 1116 if (!free_blocks) { 1117 if (pool->no_free_space) 1118 return -ENOSPC; 1119 else { 1120 /* 1121 * Try to commit to see if that will free up some 1122 * more space. 1123 */ 1124 r = dm_pool_commit_metadata(pool->pmd); 1125 if (r) { 1126 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1127 __func__, r); 1128 return r; 1129 } 1130 1131 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1132 if (r) 1133 return r; 1134 1135 /* 1136 * If we still have no space we set a flag to avoid 1137 * doing all this checking and return -ENOSPC. 1138 */ 1139 if (!free_blocks) { 1140 DMWARN("%s: no free space available.", 1141 dm_device_name(pool->pool_md)); 1142 spin_lock_irqsave(&pool->lock, flags); 1143 pool->no_free_space = 1; 1144 spin_unlock_irqrestore(&pool->lock, flags); 1145 return -ENOSPC; 1146 } 1147 } 1148 } 1149 1150 r = dm_pool_alloc_data_block(pool->pmd, result); 1151 if (r) 1152 return r; 1153 1154 return 0; 1155 } 1156 1157 /* 1158 * If we have run out of space, queue bios until the device is 1159 * resumed, presumably after having been reloaded with more space. 1160 */ 1161 static void retry_on_resume(struct bio *bio) 1162 { 1163 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1164 struct thin_c *tc = h->tc; 1165 struct pool *pool = tc->pool; 1166 unsigned long flags; 1167 1168 spin_lock_irqsave(&pool->lock, flags); 1169 bio_list_add(&pool->retry_on_resume_list, bio); 1170 spin_unlock_irqrestore(&pool->lock, flags); 1171 } 1172 1173 static void no_space(struct dm_bio_prison_cell *cell) 1174 { 1175 struct bio *bio; 1176 struct bio_list bios; 1177 1178 bio_list_init(&bios); 1179 cell_release(cell, &bios); 1180 1181 while ((bio = bio_list_pop(&bios))) 1182 retry_on_resume(bio); 1183 } 1184 1185 static void process_discard(struct thin_c *tc, struct bio *bio) 1186 { 1187 int r; 1188 unsigned long flags; 1189 struct pool *pool = tc->pool; 1190 struct dm_bio_prison_cell *cell, *cell2; 1191 struct cell_key key, key2; 1192 dm_block_t block = get_bio_block(tc, bio); 1193 struct dm_thin_lookup_result lookup_result; 1194 struct dm_thin_new_mapping *m; 1195 1196 build_virtual_key(tc->td, block, &key); 1197 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1198 return; 1199 1200 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1201 switch (r) { 1202 case 0: 1203 /* 1204 * Check nobody is fiddling with this pool block. This can 1205 * happen if someone's in the process of breaking sharing 1206 * on this block. 1207 */ 1208 build_data_key(tc->td, lookup_result.block, &key2); 1209 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1210 cell_release_singleton(cell, bio); 1211 break; 1212 } 1213 1214 if (io_overlaps_block(pool, bio)) { 1215 /* 1216 * IO may still be going to the destination block. We must 1217 * quiesce before we can do the removal. 1218 */ 1219 m = get_next_mapping(pool); 1220 m->tc = tc; 1221 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; 1222 m->virt_block = block; 1223 m->data_block = lookup_result.block; 1224 m->cell = cell; 1225 m->cell2 = cell2; 1226 m->err = 0; 1227 m->bio = bio; 1228 1229 if (!ds_add_work(&pool->all_io_ds, &m->list)) { 1230 spin_lock_irqsave(&pool->lock, flags); 1231 list_add(&m->list, &pool->prepared_discards); 1232 spin_unlock_irqrestore(&pool->lock, flags); 1233 wake_worker(pool); 1234 } 1235 } else { 1236 /* 1237 * This path is hit if people are ignoring 1238 * limits->discard_granularity. It ignores any 1239 * part of the discard that is in a subsequent 1240 * block. 1241 */ 1242 sector_t offset = bio->bi_sector - (block << pool->block_shift); 1243 unsigned remaining = (pool->sectors_per_block - offset) << 9; 1244 bio->bi_size = min(bio->bi_size, remaining); 1245 1246 cell_release_singleton(cell, bio); 1247 cell_release_singleton(cell2, bio); 1248 if ((!lookup_result.shared) && pool->pf.discard_passdown) 1249 remap_and_issue(tc, bio, lookup_result.block); 1250 else 1251 bio_endio(bio, 0); 1252 } 1253 break; 1254 1255 case -ENODATA: 1256 /* 1257 * It isn't provisioned, just forget it. 1258 */ 1259 cell_release_singleton(cell, bio); 1260 bio_endio(bio, 0); 1261 break; 1262 1263 default: 1264 DMERR("discard: find block unexpectedly returned %d", r); 1265 cell_release_singleton(cell, bio); 1266 bio_io_error(bio); 1267 break; 1268 } 1269 } 1270 1271 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1272 struct cell_key *key, 1273 struct dm_thin_lookup_result *lookup_result, 1274 struct dm_bio_prison_cell *cell) 1275 { 1276 int r; 1277 dm_block_t data_block; 1278 1279 r = alloc_data_block(tc, &data_block); 1280 switch (r) { 1281 case 0: 1282 schedule_internal_copy(tc, block, lookup_result->block, 1283 data_block, cell, bio); 1284 break; 1285 1286 case -ENOSPC: 1287 no_space(cell); 1288 break; 1289 1290 default: 1291 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1292 cell_error(cell); 1293 break; 1294 } 1295 } 1296 1297 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1298 dm_block_t block, 1299 struct dm_thin_lookup_result *lookup_result) 1300 { 1301 struct dm_bio_prison_cell *cell; 1302 struct pool *pool = tc->pool; 1303 struct cell_key key; 1304 1305 /* 1306 * If cell is already occupied, then sharing is already in the process 1307 * of being broken so we have nothing further to do here. 1308 */ 1309 build_data_key(tc->td, lookup_result->block, &key); 1310 if (bio_detain(pool->prison, &key, bio, &cell)) 1311 return; 1312 1313 if (bio_data_dir(bio) == WRITE) 1314 break_sharing(tc, bio, block, &key, lookup_result, cell); 1315 else { 1316 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1317 1318 h->shared_read_entry = ds_inc(&pool->shared_read_ds); 1319 1320 cell_release_singleton(cell, bio); 1321 remap_and_issue(tc, bio, lookup_result->block); 1322 } 1323 } 1324 1325 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1326 struct dm_bio_prison_cell *cell) 1327 { 1328 int r; 1329 dm_block_t data_block; 1330 1331 /* 1332 * Remap empty bios (flushes) immediately, without provisioning. 1333 */ 1334 if (!bio->bi_size) { 1335 cell_release_singleton(cell, bio); 1336 remap_and_issue(tc, bio, 0); 1337 return; 1338 } 1339 1340 /* 1341 * Fill read bios with zeroes and complete them immediately. 1342 */ 1343 if (bio_data_dir(bio) == READ) { 1344 zero_fill_bio(bio); 1345 cell_release_singleton(cell, bio); 1346 bio_endio(bio, 0); 1347 return; 1348 } 1349 1350 r = alloc_data_block(tc, &data_block); 1351 switch (r) { 1352 case 0: 1353 if (tc->origin_dev) 1354 schedule_external_copy(tc, block, data_block, cell, bio); 1355 else 1356 schedule_zero(tc, block, data_block, cell, bio); 1357 break; 1358 1359 case -ENOSPC: 1360 no_space(cell); 1361 break; 1362 1363 default: 1364 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1365 cell_error(cell); 1366 break; 1367 } 1368 } 1369 1370 static void process_bio(struct thin_c *tc, struct bio *bio) 1371 { 1372 int r; 1373 dm_block_t block = get_bio_block(tc, bio); 1374 struct dm_bio_prison_cell *cell; 1375 struct cell_key key; 1376 struct dm_thin_lookup_result lookup_result; 1377 1378 /* 1379 * If cell is already occupied, then the block is already 1380 * being provisioned so we have nothing further to do here. 1381 */ 1382 build_virtual_key(tc->td, block, &key); 1383 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1384 return; 1385 1386 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1387 switch (r) { 1388 case 0: 1389 /* 1390 * We can release this cell now. This thread is the only 1391 * one that puts bios into a cell, and we know there were 1392 * no preceding bios. 1393 */ 1394 /* 1395 * TODO: this will probably have to change when discard goes 1396 * back in. 1397 */ 1398 cell_release_singleton(cell, bio); 1399 1400 if (lookup_result.shared) 1401 process_shared_bio(tc, bio, block, &lookup_result); 1402 else 1403 remap_and_issue(tc, bio, lookup_result.block); 1404 break; 1405 1406 case -ENODATA: 1407 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1408 cell_release_singleton(cell, bio); 1409 remap_to_origin_and_issue(tc, bio); 1410 } else 1411 provision_block(tc, bio, block, cell); 1412 break; 1413 1414 default: 1415 DMERR("dm_thin_find_block() failed, error = %d", r); 1416 cell_release_singleton(cell, bio); 1417 bio_io_error(bio); 1418 break; 1419 } 1420 } 1421 1422 static int need_commit_due_to_time(struct pool *pool) 1423 { 1424 return jiffies < pool->last_commit_jiffies || 1425 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1426 } 1427 1428 static void process_deferred_bios(struct pool *pool) 1429 { 1430 unsigned long flags; 1431 struct bio *bio; 1432 struct bio_list bios; 1433 int r; 1434 1435 bio_list_init(&bios); 1436 1437 spin_lock_irqsave(&pool->lock, flags); 1438 bio_list_merge(&bios, &pool->deferred_bios); 1439 bio_list_init(&pool->deferred_bios); 1440 spin_unlock_irqrestore(&pool->lock, flags); 1441 1442 while ((bio = bio_list_pop(&bios))) { 1443 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1444 struct thin_c *tc = h->tc; 1445 1446 /* 1447 * If we've got no free new_mapping structs, and processing 1448 * this bio might require one, we pause until there are some 1449 * prepared mappings to process. 1450 */ 1451 if (ensure_next_mapping(pool)) { 1452 spin_lock_irqsave(&pool->lock, flags); 1453 bio_list_merge(&pool->deferred_bios, &bios); 1454 spin_unlock_irqrestore(&pool->lock, flags); 1455 1456 break; 1457 } 1458 1459 if (bio->bi_rw & REQ_DISCARD) 1460 process_discard(tc, bio); 1461 else 1462 process_bio(tc, bio); 1463 } 1464 1465 /* 1466 * If there are any deferred flush bios, we must commit 1467 * the metadata before issuing them. 1468 */ 1469 bio_list_init(&bios); 1470 spin_lock_irqsave(&pool->lock, flags); 1471 bio_list_merge(&bios, &pool->deferred_flush_bios); 1472 bio_list_init(&pool->deferred_flush_bios); 1473 spin_unlock_irqrestore(&pool->lock, flags); 1474 1475 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1476 return; 1477 1478 r = dm_pool_commit_metadata(pool->pmd); 1479 if (r) { 1480 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1481 __func__, r); 1482 while ((bio = bio_list_pop(&bios))) 1483 bio_io_error(bio); 1484 return; 1485 } 1486 pool->last_commit_jiffies = jiffies; 1487 1488 while ((bio = bio_list_pop(&bios))) 1489 generic_make_request(bio); 1490 } 1491 1492 static void do_worker(struct work_struct *ws) 1493 { 1494 struct pool *pool = container_of(ws, struct pool, worker); 1495 1496 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); 1497 process_prepared(pool, &pool->prepared_discards, process_prepared_discard); 1498 process_deferred_bios(pool); 1499 } 1500 1501 /* 1502 * We want to commit periodically so that not too much 1503 * unwritten data builds up. 1504 */ 1505 static void do_waker(struct work_struct *ws) 1506 { 1507 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 1508 wake_worker(pool); 1509 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 1510 } 1511 1512 /*----------------------------------------------------------------*/ 1513 1514 /* 1515 * Mapping functions. 1516 */ 1517 1518 /* 1519 * Called only while mapping a thin bio to hand it over to the workqueue. 1520 */ 1521 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 1522 { 1523 unsigned long flags; 1524 struct pool *pool = tc->pool; 1525 1526 spin_lock_irqsave(&pool->lock, flags); 1527 bio_list_add(&pool->deferred_bios, bio); 1528 spin_unlock_irqrestore(&pool->lock, flags); 1529 1530 wake_worker(pool); 1531 } 1532 1533 static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) 1534 { 1535 struct pool *pool = tc->pool; 1536 struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); 1537 1538 h->tc = tc; 1539 h->shared_read_entry = NULL; 1540 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); 1541 h->overwrite_mapping = NULL; 1542 1543 return h; 1544 } 1545 1546 /* 1547 * Non-blocking function called from the thin target's map function. 1548 */ 1549 static int thin_bio_map(struct dm_target *ti, struct bio *bio, 1550 union map_info *map_context) 1551 { 1552 int r; 1553 struct thin_c *tc = ti->private; 1554 dm_block_t block = get_bio_block(tc, bio); 1555 struct dm_thin_device *td = tc->td; 1556 struct dm_thin_lookup_result result; 1557 1558 map_context->ptr = thin_hook_bio(tc, bio); 1559 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1560 thin_defer_bio(tc, bio); 1561 return DM_MAPIO_SUBMITTED; 1562 } 1563 1564 r = dm_thin_find_block(td, block, 0, &result); 1565 1566 /* 1567 * Note that we defer readahead too. 1568 */ 1569 switch (r) { 1570 case 0: 1571 if (unlikely(result.shared)) { 1572 /* 1573 * We have a race condition here between the 1574 * result.shared value returned by the lookup and 1575 * snapshot creation, which may cause new 1576 * sharing. 1577 * 1578 * To avoid this always quiesce the origin before 1579 * taking the snap. You want to do this anyway to 1580 * ensure a consistent application view 1581 * (i.e. lockfs). 1582 * 1583 * More distant ancestors are irrelevant. The 1584 * shared flag will be set in their case. 1585 */ 1586 thin_defer_bio(tc, bio); 1587 r = DM_MAPIO_SUBMITTED; 1588 } else { 1589 remap(tc, bio, result.block); 1590 r = DM_MAPIO_REMAPPED; 1591 } 1592 break; 1593 1594 case -ENODATA: 1595 /* 1596 * In future, the failed dm_thin_find_block above could 1597 * provide the hint to load the metadata into cache. 1598 */ 1599 case -EWOULDBLOCK: 1600 thin_defer_bio(tc, bio); 1601 r = DM_MAPIO_SUBMITTED; 1602 break; 1603 } 1604 1605 return r; 1606 } 1607 1608 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1609 { 1610 int r; 1611 unsigned long flags; 1612 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1613 1614 spin_lock_irqsave(&pt->pool->lock, flags); 1615 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1616 spin_unlock_irqrestore(&pt->pool->lock, flags); 1617 1618 if (!r) { 1619 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1620 r = bdi_congested(&q->backing_dev_info, bdi_bits); 1621 } 1622 1623 return r; 1624 } 1625 1626 static void __requeue_bios(struct pool *pool) 1627 { 1628 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1629 bio_list_init(&pool->retry_on_resume_list); 1630 } 1631 1632 /*---------------------------------------------------------------- 1633 * Binding of control targets to a pool object 1634 *--------------------------------------------------------------*/ 1635 static int bind_control_target(struct pool *pool, struct dm_target *ti) 1636 { 1637 struct pool_c *pt = ti->private; 1638 1639 pool->ti = ti; 1640 pool->low_water_blocks = pt->low_water_blocks; 1641 pool->pf = pt->pf; 1642 1643 /* 1644 * If discard_passdown was enabled verify that the data device 1645 * supports discards. Disable discard_passdown if not; otherwise 1646 * -EOPNOTSUPP will be returned. 1647 */ 1648 if (pt->pf.discard_passdown) { 1649 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1650 if (!q || !blk_queue_discard(q)) { 1651 char buf[BDEVNAME_SIZE]; 1652 DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", 1653 bdevname(pt->data_dev->bdev, buf)); 1654 pool->pf.discard_passdown = 0; 1655 } 1656 } 1657 1658 return 0; 1659 } 1660 1661 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 1662 { 1663 if (pool->ti == ti) 1664 pool->ti = NULL; 1665 } 1666 1667 /*---------------------------------------------------------------- 1668 * Pool creation 1669 *--------------------------------------------------------------*/ 1670 /* Initialize pool features. */ 1671 static void pool_features_init(struct pool_features *pf) 1672 { 1673 pf->zero_new_blocks = 1; 1674 pf->discard_enabled = 1; 1675 pf->discard_passdown = 1; 1676 } 1677 1678 static void __pool_destroy(struct pool *pool) 1679 { 1680 __pool_table_remove(pool); 1681 1682 if (dm_pool_metadata_close(pool->pmd) < 0) 1683 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1684 1685 prison_destroy(pool->prison); 1686 dm_kcopyd_client_destroy(pool->copier); 1687 1688 if (pool->wq) 1689 destroy_workqueue(pool->wq); 1690 1691 if (pool->next_mapping) 1692 mempool_free(pool->next_mapping, pool->mapping_pool); 1693 mempool_destroy(pool->mapping_pool); 1694 mempool_destroy(pool->endio_hook_pool); 1695 kfree(pool); 1696 } 1697 1698 static struct kmem_cache *_new_mapping_cache; 1699 static struct kmem_cache *_endio_hook_cache; 1700 1701 static struct pool *pool_create(struct mapped_device *pool_md, 1702 struct block_device *metadata_dev, 1703 unsigned long block_size, char **error) 1704 { 1705 int r; 1706 void *err_p; 1707 struct pool *pool; 1708 struct dm_pool_metadata *pmd; 1709 1710 pmd = dm_pool_metadata_open(metadata_dev, block_size); 1711 if (IS_ERR(pmd)) { 1712 *error = "Error creating metadata object"; 1713 return (struct pool *)pmd; 1714 } 1715 1716 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 1717 if (!pool) { 1718 *error = "Error allocating memory for pool"; 1719 err_p = ERR_PTR(-ENOMEM); 1720 goto bad_pool; 1721 } 1722 1723 pool->pmd = pmd; 1724 pool->sectors_per_block = block_size; 1725 pool->block_shift = ffs(block_size) - 1; 1726 pool->offset_mask = block_size - 1; 1727 pool->low_water_blocks = 0; 1728 pool_features_init(&pool->pf); 1729 pool->prison = prison_create(PRISON_CELLS); 1730 if (!pool->prison) { 1731 *error = "Error creating pool's bio prison"; 1732 err_p = ERR_PTR(-ENOMEM); 1733 goto bad_prison; 1734 } 1735 1736 pool->copier = dm_kcopyd_client_create(); 1737 if (IS_ERR(pool->copier)) { 1738 r = PTR_ERR(pool->copier); 1739 *error = "Error creating pool's kcopyd client"; 1740 err_p = ERR_PTR(r); 1741 goto bad_kcopyd_client; 1742 } 1743 1744 /* 1745 * Create singlethreaded workqueue that will service all devices 1746 * that use this metadata. 1747 */ 1748 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 1749 if (!pool->wq) { 1750 *error = "Error creating pool's workqueue"; 1751 err_p = ERR_PTR(-ENOMEM); 1752 goto bad_wq; 1753 } 1754 1755 INIT_WORK(&pool->worker, do_worker); 1756 INIT_DELAYED_WORK(&pool->waker, do_waker); 1757 spin_lock_init(&pool->lock); 1758 bio_list_init(&pool->deferred_bios); 1759 bio_list_init(&pool->deferred_flush_bios); 1760 INIT_LIST_HEAD(&pool->prepared_mappings); 1761 INIT_LIST_HEAD(&pool->prepared_discards); 1762 pool->low_water_triggered = 0; 1763 pool->no_free_space = 0; 1764 bio_list_init(&pool->retry_on_resume_list); 1765 ds_init(&pool->shared_read_ds); 1766 ds_init(&pool->all_io_ds); 1767 1768 pool->next_mapping = NULL; 1769 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, 1770 _new_mapping_cache); 1771 if (!pool->mapping_pool) { 1772 *error = "Error creating pool's mapping mempool"; 1773 err_p = ERR_PTR(-ENOMEM); 1774 goto bad_mapping_pool; 1775 } 1776 1777 pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE, 1778 _endio_hook_cache); 1779 if (!pool->endio_hook_pool) { 1780 *error = "Error creating pool's endio_hook mempool"; 1781 err_p = ERR_PTR(-ENOMEM); 1782 goto bad_endio_hook_pool; 1783 } 1784 pool->ref_count = 1; 1785 pool->last_commit_jiffies = jiffies; 1786 pool->pool_md = pool_md; 1787 pool->md_dev = metadata_dev; 1788 __pool_table_insert(pool); 1789 1790 return pool; 1791 1792 bad_endio_hook_pool: 1793 mempool_destroy(pool->mapping_pool); 1794 bad_mapping_pool: 1795 destroy_workqueue(pool->wq); 1796 bad_wq: 1797 dm_kcopyd_client_destroy(pool->copier); 1798 bad_kcopyd_client: 1799 prison_destroy(pool->prison); 1800 bad_prison: 1801 kfree(pool); 1802 bad_pool: 1803 if (dm_pool_metadata_close(pmd)) 1804 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1805 1806 return err_p; 1807 } 1808 1809 static void __pool_inc(struct pool *pool) 1810 { 1811 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1812 pool->ref_count++; 1813 } 1814 1815 static void __pool_dec(struct pool *pool) 1816 { 1817 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1818 BUG_ON(!pool->ref_count); 1819 if (!--pool->ref_count) 1820 __pool_destroy(pool); 1821 } 1822 1823 static struct pool *__pool_find(struct mapped_device *pool_md, 1824 struct block_device *metadata_dev, 1825 unsigned long block_size, char **error, 1826 int *created) 1827 { 1828 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1829 1830 if (pool) { 1831 if (pool->pool_md != pool_md) 1832 return ERR_PTR(-EBUSY); 1833 __pool_inc(pool); 1834 1835 } else { 1836 pool = __pool_table_lookup(pool_md); 1837 if (pool) { 1838 if (pool->md_dev != metadata_dev) 1839 return ERR_PTR(-EINVAL); 1840 __pool_inc(pool); 1841 1842 } else { 1843 pool = pool_create(pool_md, metadata_dev, block_size, error); 1844 *created = 1; 1845 } 1846 } 1847 1848 return pool; 1849 } 1850 1851 /*---------------------------------------------------------------- 1852 * Pool target methods 1853 *--------------------------------------------------------------*/ 1854 static void pool_dtr(struct dm_target *ti) 1855 { 1856 struct pool_c *pt = ti->private; 1857 1858 mutex_lock(&dm_thin_pool_table.mutex); 1859 1860 unbind_control_target(pt->pool, ti); 1861 __pool_dec(pt->pool); 1862 dm_put_device(ti, pt->metadata_dev); 1863 dm_put_device(ti, pt->data_dev); 1864 kfree(pt); 1865 1866 mutex_unlock(&dm_thin_pool_table.mutex); 1867 } 1868 1869 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1870 struct dm_target *ti) 1871 { 1872 int r; 1873 unsigned argc; 1874 const char *arg_name; 1875 1876 static struct dm_arg _args[] = { 1877 {0, 3, "Invalid number of pool feature arguments"}, 1878 }; 1879 1880 /* 1881 * No feature arguments supplied. 1882 */ 1883 if (!as->argc) 1884 return 0; 1885 1886 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1887 if (r) 1888 return -EINVAL; 1889 1890 while (argc && !r) { 1891 arg_name = dm_shift_arg(as); 1892 argc--; 1893 1894 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1895 pf->zero_new_blocks = 0; 1896 continue; 1897 } else if (!strcasecmp(arg_name, "ignore_discard")) { 1898 pf->discard_enabled = 0; 1899 continue; 1900 } else if (!strcasecmp(arg_name, "no_discard_passdown")) { 1901 pf->discard_passdown = 0; 1902 continue; 1903 } 1904 1905 ti->error = "Unrecognised pool feature requested"; 1906 r = -EINVAL; 1907 } 1908 1909 return r; 1910 } 1911 1912 /* 1913 * thin-pool <metadata dev> <data dev> 1914 * <data block size (sectors)> 1915 * <low water mark (blocks)> 1916 * [<#feature args> [<arg>]*] 1917 * 1918 * Optional feature arguments are: 1919 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1920 * ignore_discard: disable discard 1921 * no_discard_passdown: don't pass discards down to the data device 1922 */ 1923 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1924 { 1925 int r, pool_created = 0; 1926 struct pool_c *pt; 1927 struct pool *pool; 1928 struct pool_features pf; 1929 struct dm_arg_set as; 1930 struct dm_dev *data_dev; 1931 unsigned long block_size; 1932 dm_block_t low_water_blocks; 1933 struct dm_dev *metadata_dev; 1934 sector_t metadata_dev_size; 1935 char b[BDEVNAME_SIZE]; 1936 1937 /* 1938 * FIXME Remove validation from scope of lock. 1939 */ 1940 mutex_lock(&dm_thin_pool_table.mutex); 1941 1942 if (argc < 4) { 1943 ti->error = "Invalid argument count"; 1944 r = -EINVAL; 1945 goto out_unlock; 1946 } 1947 as.argc = argc; 1948 as.argv = argv; 1949 1950 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); 1951 if (r) { 1952 ti->error = "Error opening metadata block device"; 1953 goto out_unlock; 1954 } 1955 1956 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1957 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 1958 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1959 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1960 1961 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1962 if (r) { 1963 ti->error = "Error getting data device"; 1964 goto out_metadata; 1965 } 1966 1967 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 1968 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1969 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 1970 !is_power_of_2(block_size)) { 1971 ti->error = "Invalid block size"; 1972 r = -EINVAL; 1973 goto out; 1974 } 1975 1976 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 1977 ti->error = "Invalid low water mark"; 1978 r = -EINVAL; 1979 goto out; 1980 } 1981 1982 /* 1983 * Set default pool features. 1984 */ 1985 pool_features_init(&pf); 1986 1987 dm_consume_args(&as, 4); 1988 r = parse_pool_features(&as, &pf, ti); 1989 if (r) 1990 goto out; 1991 1992 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 1993 if (!pt) { 1994 r = -ENOMEM; 1995 goto out; 1996 } 1997 1998 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1999 block_size, &ti->error, &pool_created); 2000 if (IS_ERR(pool)) { 2001 r = PTR_ERR(pool); 2002 goto out_free_pt; 2003 } 2004 2005 /* 2006 * 'pool_created' reflects whether this is the first table load. 2007 * Top level discard support is not allowed to be changed after 2008 * initial load. This would require a pool reload to trigger thin 2009 * device changes. 2010 */ 2011 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 2012 ti->error = "Discard support cannot be disabled once enabled"; 2013 r = -EINVAL; 2014 goto out_flags_changed; 2015 } 2016 2017 pt->pool = pool; 2018 pt->ti = ti; 2019 pt->metadata_dev = metadata_dev; 2020 pt->data_dev = data_dev; 2021 pt->low_water_blocks = low_water_blocks; 2022 pt->pf = pf; 2023 ti->num_flush_requests = 1; 2024 /* 2025 * Only need to enable discards if the pool should pass 2026 * them down to the data device. The thin device's discard 2027 * processing will cause mappings to be removed from the btree. 2028 */ 2029 if (pf.discard_enabled && pf.discard_passdown) { 2030 ti->num_discard_requests = 1; 2031 /* 2032 * Setting 'discards_supported' circumvents the normal 2033 * stacking of discard limits (this keeps the pool and 2034 * thin devices' discard limits consistent). 2035 */ 2036 ti->discards_supported = 1; 2037 } 2038 ti->private = pt; 2039 2040 pt->callbacks.congested_fn = pool_is_congested; 2041 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2042 2043 mutex_unlock(&dm_thin_pool_table.mutex); 2044 2045 return 0; 2046 2047 out_flags_changed: 2048 __pool_dec(pool); 2049 out_free_pt: 2050 kfree(pt); 2051 out: 2052 dm_put_device(ti, data_dev); 2053 out_metadata: 2054 dm_put_device(ti, metadata_dev); 2055 out_unlock: 2056 mutex_unlock(&dm_thin_pool_table.mutex); 2057 2058 return r; 2059 } 2060 2061 static int pool_map(struct dm_target *ti, struct bio *bio, 2062 union map_info *map_context) 2063 { 2064 int r; 2065 struct pool_c *pt = ti->private; 2066 struct pool *pool = pt->pool; 2067 unsigned long flags; 2068 2069 /* 2070 * As this is a singleton target, ti->begin is always zero. 2071 */ 2072 spin_lock_irqsave(&pool->lock, flags); 2073 bio->bi_bdev = pt->data_dev->bdev; 2074 r = DM_MAPIO_REMAPPED; 2075 spin_unlock_irqrestore(&pool->lock, flags); 2076 2077 return r; 2078 } 2079 2080 /* 2081 * Retrieves the number of blocks of the data device from 2082 * the superblock and compares it to the actual device size, 2083 * thus resizing the data device in case it has grown. 2084 * 2085 * This both copes with opening preallocated data devices in the ctr 2086 * being followed by a resume 2087 * -and- 2088 * calling the resume method individually after userspace has 2089 * grown the data device in reaction to a table event. 2090 */ 2091 static int pool_preresume(struct dm_target *ti) 2092 { 2093 int r; 2094 struct pool_c *pt = ti->private; 2095 struct pool *pool = pt->pool; 2096 dm_block_t data_size, sb_data_size; 2097 2098 /* 2099 * Take control of the pool object. 2100 */ 2101 r = bind_control_target(pool, ti); 2102 if (r) 2103 return r; 2104 2105 data_size = ti->len >> pool->block_shift; 2106 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2107 if (r) { 2108 DMERR("failed to retrieve data device size"); 2109 return r; 2110 } 2111 2112 if (data_size < sb_data_size) { 2113 DMERR("pool target too small, is %llu blocks (expected %llu)", 2114 data_size, sb_data_size); 2115 return -EINVAL; 2116 2117 } else if (data_size > sb_data_size) { 2118 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2119 if (r) { 2120 DMERR("failed to resize data device"); 2121 return r; 2122 } 2123 2124 r = dm_pool_commit_metadata(pool->pmd); 2125 if (r) { 2126 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 2127 __func__, r); 2128 return r; 2129 } 2130 } 2131 2132 return 0; 2133 } 2134 2135 static void pool_resume(struct dm_target *ti) 2136 { 2137 struct pool_c *pt = ti->private; 2138 struct pool *pool = pt->pool; 2139 unsigned long flags; 2140 2141 spin_lock_irqsave(&pool->lock, flags); 2142 pool->low_water_triggered = 0; 2143 pool->no_free_space = 0; 2144 __requeue_bios(pool); 2145 spin_unlock_irqrestore(&pool->lock, flags); 2146 2147 do_waker(&pool->waker.work); 2148 } 2149 2150 static void pool_postsuspend(struct dm_target *ti) 2151 { 2152 int r; 2153 struct pool_c *pt = ti->private; 2154 struct pool *pool = pt->pool; 2155 2156 cancel_delayed_work(&pool->waker); 2157 flush_workqueue(pool->wq); 2158 2159 r = dm_pool_commit_metadata(pool->pmd); 2160 if (r < 0) { 2161 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 2162 __func__, r); 2163 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/ 2164 } 2165 } 2166 2167 static int check_arg_count(unsigned argc, unsigned args_required) 2168 { 2169 if (argc != args_required) { 2170 DMWARN("Message received with %u arguments instead of %u.", 2171 argc, args_required); 2172 return -EINVAL; 2173 } 2174 2175 return 0; 2176 } 2177 2178 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) 2179 { 2180 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && 2181 *dev_id <= MAX_DEV_ID) 2182 return 0; 2183 2184 if (warning) 2185 DMWARN("Message received with invalid device id: %s", arg); 2186 2187 return -EINVAL; 2188 } 2189 2190 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) 2191 { 2192 dm_thin_id dev_id; 2193 int r; 2194 2195 r = check_arg_count(argc, 2); 2196 if (r) 2197 return r; 2198 2199 r = read_dev_id(argv[1], &dev_id, 1); 2200 if (r) 2201 return r; 2202 2203 r = dm_pool_create_thin(pool->pmd, dev_id); 2204 if (r) { 2205 DMWARN("Creation of new thinly-provisioned device with id %s failed.", 2206 argv[1]); 2207 return r; 2208 } 2209 2210 return 0; 2211 } 2212 2213 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2214 { 2215 dm_thin_id dev_id; 2216 dm_thin_id origin_dev_id; 2217 int r; 2218 2219 r = check_arg_count(argc, 3); 2220 if (r) 2221 return r; 2222 2223 r = read_dev_id(argv[1], &dev_id, 1); 2224 if (r) 2225 return r; 2226 2227 r = read_dev_id(argv[2], &origin_dev_id, 1); 2228 if (r) 2229 return r; 2230 2231 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); 2232 if (r) { 2233 DMWARN("Creation of new snapshot %s of device %s failed.", 2234 argv[1], argv[2]); 2235 return r; 2236 } 2237 2238 return 0; 2239 } 2240 2241 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) 2242 { 2243 dm_thin_id dev_id; 2244 int r; 2245 2246 r = check_arg_count(argc, 2); 2247 if (r) 2248 return r; 2249 2250 r = read_dev_id(argv[1], &dev_id, 1); 2251 if (r) 2252 return r; 2253 2254 r = dm_pool_delete_thin_device(pool->pmd, dev_id); 2255 if (r) 2256 DMWARN("Deletion of thin device %s failed.", argv[1]); 2257 2258 return r; 2259 } 2260 2261 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) 2262 { 2263 dm_thin_id old_id, new_id; 2264 int r; 2265 2266 r = check_arg_count(argc, 3); 2267 if (r) 2268 return r; 2269 2270 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { 2271 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); 2272 return -EINVAL; 2273 } 2274 2275 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { 2276 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); 2277 return -EINVAL; 2278 } 2279 2280 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); 2281 if (r) { 2282 DMWARN("Failed to change transaction id from %s to %s.", 2283 argv[1], argv[2]); 2284 return r; 2285 } 2286 2287 return 0; 2288 } 2289 2290 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2291 { 2292 int r; 2293 2294 r = check_arg_count(argc, 1); 2295 if (r) 2296 return r; 2297 2298 r = dm_pool_commit_metadata(pool->pmd); 2299 if (r) { 2300 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 2301 __func__, r); 2302 return r; 2303 } 2304 2305 r = dm_pool_reserve_metadata_snap(pool->pmd); 2306 if (r) 2307 DMWARN("reserve_metadata_snap message failed."); 2308 2309 return r; 2310 } 2311 2312 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2313 { 2314 int r; 2315 2316 r = check_arg_count(argc, 1); 2317 if (r) 2318 return r; 2319 2320 r = dm_pool_release_metadata_snap(pool->pmd); 2321 if (r) 2322 DMWARN("release_metadata_snap message failed."); 2323 2324 return r; 2325 } 2326 2327 /* 2328 * Messages supported: 2329 * create_thin <dev_id> 2330 * create_snap <dev_id> <origin_id> 2331 * delete <dev_id> 2332 * trim <dev_id> <new_size_in_sectors> 2333 * set_transaction_id <current_trans_id> <new_trans_id> 2334 * reserve_metadata_snap 2335 * release_metadata_snap 2336 */ 2337 static int pool_message(struct dm_target *ti, unsigned argc, char **argv) 2338 { 2339 int r = -EINVAL; 2340 struct pool_c *pt = ti->private; 2341 struct pool *pool = pt->pool; 2342 2343 if (!strcasecmp(argv[0], "create_thin")) 2344 r = process_create_thin_mesg(argc, argv, pool); 2345 2346 else if (!strcasecmp(argv[0], "create_snap")) 2347 r = process_create_snap_mesg(argc, argv, pool); 2348 2349 else if (!strcasecmp(argv[0], "delete")) 2350 r = process_delete_mesg(argc, argv, pool); 2351 2352 else if (!strcasecmp(argv[0], "set_transaction_id")) 2353 r = process_set_transaction_id_mesg(argc, argv, pool); 2354 2355 else if (!strcasecmp(argv[0], "reserve_metadata_snap")) 2356 r = process_reserve_metadata_snap_mesg(argc, argv, pool); 2357 2358 else if (!strcasecmp(argv[0], "release_metadata_snap")) 2359 r = process_release_metadata_snap_mesg(argc, argv, pool); 2360 2361 else 2362 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2363 2364 if (!r) { 2365 r = dm_pool_commit_metadata(pool->pmd); 2366 if (r) 2367 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d", 2368 argv[0], r); 2369 } 2370 2371 return r; 2372 } 2373 2374 /* 2375 * Status line is: 2376 * <transaction id> <used metadata sectors>/<total metadata sectors> 2377 * <used data sectors>/<total data sectors> <held metadata root> 2378 */ 2379 static int pool_status(struct dm_target *ti, status_type_t type, 2380 char *result, unsigned maxlen) 2381 { 2382 int r, count; 2383 unsigned sz = 0; 2384 uint64_t transaction_id; 2385 dm_block_t nr_free_blocks_data; 2386 dm_block_t nr_free_blocks_metadata; 2387 dm_block_t nr_blocks_data; 2388 dm_block_t nr_blocks_metadata; 2389 dm_block_t held_root; 2390 char buf[BDEVNAME_SIZE]; 2391 char buf2[BDEVNAME_SIZE]; 2392 struct pool_c *pt = ti->private; 2393 struct pool *pool = pt->pool; 2394 2395 switch (type) { 2396 case STATUSTYPE_INFO: 2397 r = dm_pool_get_metadata_transaction_id(pool->pmd, 2398 &transaction_id); 2399 if (r) 2400 return r; 2401 2402 r = dm_pool_get_free_metadata_block_count(pool->pmd, 2403 &nr_free_blocks_metadata); 2404 if (r) 2405 return r; 2406 2407 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2408 if (r) 2409 return r; 2410 2411 r = dm_pool_get_free_block_count(pool->pmd, 2412 &nr_free_blocks_data); 2413 if (r) 2414 return r; 2415 2416 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2417 if (r) 2418 return r; 2419 2420 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2421 if (r) 2422 return r; 2423 2424 DMEMIT("%llu %llu/%llu %llu/%llu ", 2425 (unsigned long long)transaction_id, 2426 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2427 (unsigned long long)nr_blocks_metadata, 2428 (unsigned long long)(nr_blocks_data - nr_free_blocks_data), 2429 (unsigned long long)nr_blocks_data); 2430 2431 if (held_root) 2432 DMEMIT("%llu", held_root); 2433 else 2434 DMEMIT("-"); 2435 2436 break; 2437 2438 case STATUSTYPE_TABLE: 2439 DMEMIT("%s %s %lu %llu ", 2440 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), 2441 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 2442 (unsigned long)pool->sectors_per_block, 2443 (unsigned long long)pt->low_water_blocks); 2444 2445 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + 2446 !pt->pf.discard_passdown; 2447 DMEMIT("%u ", count); 2448 2449 if (!pool->pf.zero_new_blocks) 2450 DMEMIT("skip_block_zeroing "); 2451 2452 if (!pool->pf.discard_enabled) 2453 DMEMIT("ignore_discard "); 2454 2455 if (!pt->pf.discard_passdown) 2456 DMEMIT("no_discard_passdown "); 2457 2458 break; 2459 } 2460 2461 return 0; 2462 } 2463 2464 static int pool_iterate_devices(struct dm_target *ti, 2465 iterate_devices_callout_fn fn, void *data) 2466 { 2467 struct pool_c *pt = ti->private; 2468 2469 return fn(ti, pt->data_dev, 0, ti->len, data); 2470 } 2471 2472 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 2473 struct bio_vec *biovec, int max_size) 2474 { 2475 struct pool_c *pt = ti->private; 2476 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 2477 2478 if (!q->merge_bvec_fn) 2479 return max_size; 2480 2481 bvm->bi_bdev = pt->data_dev->bdev; 2482 2483 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2484 } 2485 2486 static void set_discard_limits(struct pool *pool, struct queue_limits *limits) 2487 { 2488 /* 2489 * FIXME: these limits may be incompatible with the pool's data device 2490 */ 2491 limits->max_discard_sectors = pool->sectors_per_block; 2492 2493 /* 2494 * This is just a hint, and not enforced. We have to cope with 2495 * bios that overlap 2 blocks. 2496 */ 2497 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2498 limits->discard_zeroes_data = pool->pf.zero_new_blocks; 2499 } 2500 2501 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2502 { 2503 struct pool_c *pt = ti->private; 2504 struct pool *pool = pt->pool; 2505 2506 blk_limits_io_min(limits, 0); 2507 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2508 if (pool->pf.discard_enabled) 2509 set_discard_limits(pool, limits); 2510 } 2511 2512 static struct target_type pool_target = { 2513 .name = "thin-pool", 2514 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2515 DM_TARGET_IMMUTABLE, 2516 .version = {1, 2, 0}, 2517 .module = THIS_MODULE, 2518 .ctr = pool_ctr, 2519 .dtr = pool_dtr, 2520 .map = pool_map, 2521 .postsuspend = pool_postsuspend, 2522 .preresume = pool_preresume, 2523 .resume = pool_resume, 2524 .message = pool_message, 2525 .status = pool_status, 2526 .merge = pool_merge, 2527 .iterate_devices = pool_iterate_devices, 2528 .io_hints = pool_io_hints, 2529 }; 2530 2531 /*---------------------------------------------------------------- 2532 * Thin target methods 2533 *--------------------------------------------------------------*/ 2534 static void thin_dtr(struct dm_target *ti) 2535 { 2536 struct thin_c *tc = ti->private; 2537 2538 mutex_lock(&dm_thin_pool_table.mutex); 2539 2540 __pool_dec(tc->pool); 2541 dm_pool_close_thin_device(tc->td); 2542 dm_put_device(ti, tc->pool_dev); 2543 if (tc->origin_dev) 2544 dm_put_device(ti, tc->origin_dev); 2545 kfree(tc); 2546 2547 mutex_unlock(&dm_thin_pool_table.mutex); 2548 } 2549 2550 /* 2551 * Thin target parameters: 2552 * 2553 * <pool_dev> <dev_id> [origin_dev] 2554 * 2555 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2556 * dev_id: the internal device identifier 2557 * origin_dev: a device external to the pool that should act as the origin 2558 * 2559 * If the pool device has discards disabled, they get disabled for the thin 2560 * device as well. 2561 */ 2562 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2563 { 2564 int r; 2565 struct thin_c *tc; 2566 struct dm_dev *pool_dev, *origin_dev; 2567 struct mapped_device *pool_md; 2568 2569 mutex_lock(&dm_thin_pool_table.mutex); 2570 2571 if (argc != 2 && argc != 3) { 2572 ti->error = "Invalid argument count"; 2573 r = -EINVAL; 2574 goto out_unlock; 2575 } 2576 2577 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); 2578 if (!tc) { 2579 ti->error = "Out of memory"; 2580 r = -ENOMEM; 2581 goto out_unlock; 2582 } 2583 2584 if (argc == 3) { 2585 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 2586 if (r) { 2587 ti->error = "Error opening origin device"; 2588 goto bad_origin_dev; 2589 } 2590 tc->origin_dev = origin_dev; 2591 } 2592 2593 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2594 if (r) { 2595 ti->error = "Error opening pool device"; 2596 goto bad_pool_dev; 2597 } 2598 tc->pool_dev = pool_dev; 2599 2600 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { 2601 ti->error = "Invalid device id"; 2602 r = -EINVAL; 2603 goto bad_common; 2604 } 2605 2606 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); 2607 if (!pool_md) { 2608 ti->error = "Couldn't get pool mapped device"; 2609 r = -EINVAL; 2610 goto bad_common; 2611 } 2612 2613 tc->pool = __pool_table_lookup(pool_md); 2614 if (!tc->pool) { 2615 ti->error = "Couldn't find pool object"; 2616 r = -EINVAL; 2617 goto bad_pool_lookup; 2618 } 2619 __pool_inc(tc->pool); 2620 2621 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 2622 if (r) { 2623 ti->error = "Couldn't open thin internal device"; 2624 goto bad_thin_open; 2625 } 2626 2627 ti->split_io = tc->pool->sectors_per_block; 2628 ti->num_flush_requests = 1; 2629 2630 /* In case the pool supports discards, pass them on. */ 2631 if (tc->pool->pf.discard_enabled) { 2632 ti->discards_supported = 1; 2633 ti->num_discard_requests = 1; 2634 ti->discard_zeroes_data_unsupported = 1; 2635 } 2636 2637 dm_put(pool_md); 2638 2639 mutex_unlock(&dm_thin_pool_table.mutex); 2640 2641 return 0; 2642 2643 bad_thin_open: 2644 __pool_dec(tc->pool); 2645 bad_pool_lookup: 2646 dm_put(pool_md); 2647 bad_common: 2648 dm_put_device(ti, tc->pool_dev); 2649 bad_pool_dev: 2650 if (tc->origin_dev) 2651 dm_put_device(ti, tc->origin_dev); 2652 bad_origin_dev: 2653 kfree(tc); 2654 out_unlock: 2655 mutex_unlock(&dm_thin_pool_table.mutex); 2656 2657 return r; 2658 } 2659 2660 static int thin_map(struct dm_target *ti, struct bio *bio, 2661 union map_info *map_context) 2662 { 2663 bio->bi_sector = dm_target_offset(ti, bio->bi_sector); 2664 2665 return thin_bio_map(ti, bio, map_context); 2666 } 2667 2668 static int thin_endio(struct dm_target *ti, 2669 struct bio *bio, int err, 2670 union map_info *map_context) 2671 { 2672 unsigned long flags; 2673 struct dm_thin_endio_hook *h = map_context->ptr; 2674 struct list_head work; 2675 struct dm_thin_new_mapping *m, *tmp; 2676 struct pool *pool = h->tc->pool; 2677 2678 if (h->shared_read_entry) { 2679 INIT_LIST_HEAD(&work); 2680 ds_dec(h->shared_read_entry, &work); 2681 2682 spin_lock_irqsave(&pool->lock, flags); 2683 list_for_each_entry_safe(m, tmp, &work, list) { 2684 list_del(&m->list); 2685 m->quiesced = 1; 2686 __maybe_add_mapping(m); 2687 } 2688 spin_unlock_irqrestore(&pool->lock, flags); 2689 } 2690 2691 if (h->all_io_entry) { 2692 INIT_LIST_HEAD(&work); 2693 ds_dec(h->all_io_entry, &work); 2694 spin_lock_irqsave(&pool->lock, flags); 2695 list_for_each_entry_safe(m, tmp, &work, list) 2696 list_add(&m->list, &pool->prepared_discards); 2697 spin_unlock_irqrestore(&pool->lock, flags); 2698 } 2699 2700 mempool_free(h, pool->endio_hook_pool); 2701 2702 return 0; 2703 } 2704 2705 static void thin_postsuspend(struct dm_target *ti) 2706 { 2707 if (dm_noflush_suspending(ti)) 2708 requeue_io((struct thin_c *)ti->private); 2709 } 2710 2711 /* 2712 * <nr mapped sectors> <highest mapped sector> 2713 */ 2714 static int thin_status(struct dm_target *ti, status_type_t type, 2715 char *result, unsigned maxlen) 2716 { 2717 int r; 2718 ssize_t sz = 0; 2719 dm_block_t mapped, highest; 2720 char buf[BDEVNAME_SIZE]; 2721 struct thin_c *tc = ti->private; 2722 2723 if (!tc->td) 2724 DMEMIT("-"); 2725 else { 2726 switch (type) { 2727 case STATUSTYPE_INFO: 2728 r = dm_thin_get_mapped_count(tc->td, &mapped); 2729 if (r) 2730 return r; 2731 2732 r = dm_thin_get_highest_mapped_block(tc->td, &highest); 2733 if (r < 0) 2734 return r; 2735 2736 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); 2737 if (r) 2738 DMEMIT("%llu", ((highest + 1) * 2739 tc->pool->sectors_per_block) - 1); 2740 else 2741 DMEMIT("-"); 2742 break; 2743 2744 case STATUSTYPE_TABLE: 2745 DMEMIT("%s %lu", 2746 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 2747 (unsigned long) tc->dev_id); 2748 if (tc->origin_dev) 2749 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); 2750 break; 2751 } 2752 } 2753 2754 return 0; 2755 } 2756 2757 static int thin_iterate_devices(struct dm_target *ti, 2758 iterate_devices_callout_fn fn, void *data) 2759 { 2760 dm_block_t blocks; 2761 struct thin_c *tc = ti->private; 2762 2763 /* 2764 * We can't call dm_pool_get_data_dev_size() since that blocks. So 2765 * we follow a more convoluted path through to the pool's target. 2766 */ 2767 if (!tc->pool->ti) 2768 return 0; /* nothing is bound */ 2769 2770 blocks = tc->pool->ti->len >> tc->pool->block_shift; 2771 if (blocks) 2772 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); 2773 2774 return 0; 2775 } 2776 2777 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2778 { 2779 struct thin_c *tc = ti->private; 2780 struct pool *pool = tc->pool; 2781 2782 blk_limits_io_min(limits, 0); 2783 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2784 set_discard_limits(pool, limits); 2785 } 2786 2787 static struct target_type thin_target = { 2788 .name = "thin", 2789 .version = {1, 1, 0}, 2790 .module = THIS_MODULE, 2791 .ctr = thin_ctr, 2792 .dtr = thin_dtr, 2793 .map = thin_map, 2794 .end_io = thin_endio, 2795 .postsuspend = thin_postsuspend, 2796 .status = thin_status, 2797 .iterate_devices = thin_iterate_devices, 2798 .io_hints = thin_io_hints, 2799 }; 2800 2801 /*----------------------------------------------------------------*/ 2802 2803 static int __init dm_thin_init(void) 2804 { 2805 int r; 2806 2807 pool_table_init(); 2808 2809 r = dm_register_target(&thin_target); 2810 if (r) 2811 return r; 2812 2813 r = dm_register_target(&pool_target); 2814 if (r) 2815 goto bad_pool_target; 2816 2817 r = -ENOMEM; 2818 2819 _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); 2820 if (!_cell_cache) 2821 goto bad_cell_cache; 2822 2823 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); 2824 if (!_new_mapping_cache) 2825 goto bad_new_mapping_cache; 2826 2827 _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0); 2828 if (!_endio_hook_cache) 2829 goto bad_endio_hook_cache; 2830 2831 return 0; 2832 2833 bad_endio_hook_cache: 2834 kmem_cache_destroy(_new_mapping_cache); 2835 bad_new_mapping_cache: 2836 kmem_cache_destroy(_cell_cache); 2837 bad_cell_cache: 2838 dm_unregister_target(&pool_target); 2839 bad_pool_target: 2840 dm_unregister_target(&thin_target); 2841 2842 return r; 2843 } 2844 2845 static void dm_thin_exit(void) 2846 { 2847 dm_unregister_target(&thin_target); 2848 dm_unregister_target(&pool_target); 2849 2850 kmem_cache_destroy(_cell_cache); 2851 kmem_cache_destroy(_new_mapping_cache); 2852 kmem_cache_destroy(_endio_hook_cache); 2853 } 2854 2855 module_init(dm_thin_init); 2856 module_exit(dm_thin_exit); 2857 2858 MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); 2859 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 2860 MODULE_LICENSE("GPL"); 2861