/*
 * Copyright (C) 2011 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, i.e. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away. We put them in prison
 * where they can't cause any mischief. Bios are put in a cell identified
 * by a key; multiple bios can be in the same cell. When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct dm_bio_prison_cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	struct bio *holder;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

static struct kmem_cache *_cell_cache;

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
						  struct cell_key *key)
{
	struct dm_bio_prison_cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}

/*
 * This may block if a new cell needs allocating. You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
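 *
 * For illustration, the typical calling pattern (as used by process_bio()
 * and process_discard() further down in this file) is:
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(tc->pool->prison, &key, bio, &cell))
 *		return;		(another bio already holds the cell;
 *				 ours has been queued inside it)
 *
 * and otherwise carry on as the holder, releasing the cell when done.
 * This is only a usage sketch of the function below, not a new interface.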
216 */ 217 static int bio_detain(struct bio_prison *prison, struct cell_key *key, 218 struct bio *inmate, struct dm_bio_prison_cell **ref) 219 { 220 int r = 1; 221 unsigned long flags; 222 uint32_t hash = hash_key(prison, key); 223 struct dm_bio_prison_cell *cell, *cell2; 224 225 BUG_ON(hash > prison->nr_buckets); 226 227 spin_lock_irqsave(&prison->lock, flags); 228 229 cell = __search_bucket(prison->cells + hash, key); 230 if (cell) { 231 bio_list_add(&cell->bios, inmate); 232 goto out; 233 } 234 235 /* 236 * Allocate a new cell 237 */ 238 spin_unlock_irqrestore(&prison->lock, flags); 239 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 240 spin_lock_irqsave(&prison->lock, flags); 241 242 /* 243 * We've been unlocked, so we have to double check that 244 * nobody else has inserted this cell in the meantime. 245 */ 246 cell = __search_bucket(prison->cells + hash, key); 247 if (cell) { 248 mempool_free(cell2, prison->cell_pool); 249 bio_list_add(&cell->bios, inmate); 250 goto out; 251 } 252 253 /* 254 * Use new cell. 255 */ 256 cell = cell2; 257 258 cell->prison = prison; 259 memcpy(&cell->key, key, sizeof(cell->key)); 260 cell->holder = inmate; 261 bio_list_init(&cell->bios); 262 hlist_add_head(&cell->list, prison->cells + hash); 263 264 r = 0; 265 266 out: 267 spin_unlock_irqrestore(&prison->lock, flags); 268 269 *ref = cell; 270 271 return r; 272 } 273 274 /* 275 * @inmates must have been initialised prior to this call 276 */ 277 static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) 278 { 279 struct bio_prison *prison = cell->prison; 280 281 hlist_del(&cell->list); 282 283 if (inmates) { 284 bio_list_add(inmates, cell->holder); 285 bio_list_merge(inmates, &cell->bios); 286 } 287 288 mempool_free(cell, prison->cell_pool); 289 } 290 291 static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) 292 { 293 unsigned long flags; 294 struct bio_prison *prison = cell->prison; 295 296 spin_lock_irqsave(&prison->lock, flags); 297 __cell_release(cell, bios); 298 spin_unlock_irqrestore(&prison->lock, flags); 299 } 300 301 /* 302 * There are a couple of places where we put a bio into a cell briefly 303 * before taking it out again. In these situations we know that no other 304 * bio may be in the cell. This function releases the cell, and also does 305 * a sanity check. 306 */ 307 static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) 308 { 309 BUG_ON(cell->holder != bio); 310 BUG_ON(!bio_list_empty(&cell->bios)); 311 312 __cell_release(cell, NULL); 313 } 314 315 static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) 316 { 317 unsigned long flags; 318 struct bio_prison *prison = cell->prison; 319 320 spin_lock_irqsave(&prison->lock, flags); 321 __cell_release_singleton(cell, bio); 322 spin_unlock_irqrestore(&prison->lock, flags); 323 } 324 325 /* 326 * Sometimes we don't want the holder, just the additional bios. 
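 *
 * This happens when the holder has already been dealt with by other
 * means: for example the fully-covering write that was remapped directly
 * in schedule_copy()/schedule_zero(), or the discard bio that
 * process_prepared_discard() completes itself. In those cases
 * cell_defer_except() uses this variant so the holder is not issued or
 * completed a second time.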
327 */ 328 static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, 329 struct bio_list *inmates) 330 { 331 struct bio_prison *prison = cell->prison; 332 333 hlist_del(&cell->list); 334 bio_list_merge(inmates, &cell->bios); 335 336 mempool_free(cell, prison->cell_pool); 337 } 338 339 static void cell_release_no_holder(struct dm_bio_prison_cell *cell, 340 struct bio_list *inmates) 341 { 342 unsigned long flags; 343 struct bio_prison *prison = cell->prison; 344 345 spin_lock_irqsave(&prison->lock, flags); 346 __cell_release_no_holder(cell, inmates); 347 spin_unlock_irqrestore(&prison->lock, flags); 348 } 349 350 static void cell_error(struct dm_bio_prison_cell *cell) 351 { 352 struct bio_prison *prison = cell->prison; 353 struct bio_list bios; 354 struct bio *bio; 355 unsigned long flags; 356 357 bio_list_init(&bios); 358 359 spin_lock_irqsave(&prison->lock, flags); 360 __cell_release(cell, &bios); 361 spin_unlock_irqrestore(&prison->lock, flags); 362 363 while ((bio = bio_list_pop(&bios))) 364 bio_io_error(bio); 365 } 366 367 /*----------------------------------------------------------------*/ 368 369 /* 370 * We use the deferred set to keep track of pending reads to shared blocks. 371 * We do this to ensure the new mapping caused by a write isn't performed 372 * until these prior reads have completed. Otherwise the insertion of the 373 * new mapping could free the old block that the read bios are mapped to. 374 */ 375 376 struct deferred_set; 377 struct deferred_entry { 378 struct deferred_set *ds; 379 unsigned count; 380 struct list_head work_items; 381 }; 382 383 struct deferred_set { 384 spinlock_t lock; 385 unsigned current_entry; 386 unsigned sweeper; 387 struct deferred_entry entries[DEFERRED_SET_SIZE]; 388 }; 389 390 static void ds_init(struct deferred_set *ds) 391 { 392 int i; 393 394 spin_lock_init(&ds->lock); 395 ds->current_entry = 0; 396 ds->sweeper = 0; 397 for (i = 0; i < DEFERRED_SET_SIZE; i++) { 398 ds->entries[i].ds = ds; 399 ds->entries[i].count = 0; 400 INIT_LIST_HEAD(&ds->entries[i].work_items); 401 } 402 } 403 404 static struct deferred_entry *ds_inc(struct deferred_set *ds) 405 { 406 unsigned long flags; 407 struct deferred_entry *entry; 408 409 spin_lock_irqsave(&ds->lock, flags); 410 entry = ds->entries + ds->current_entry; 411 entry->count++; 412 spin_unlock_irqrestore(&ds->lock, flags); 413 414 return entry; 415 } 416 417 static unsigned ds_next(unsigned index) 418 { 419 return (index + 1) % DEFERRED_SET_SIZE; 420 } 421 422 static void __sweep(struct deferred_set *ds, struct list_head *head) 423 { 424 while ((ds->sweeper != ds->current_entry) && 425 !ds->entries[ds->sweeper].count) { 426 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 427 ds->sweeper = ds_next(ds->sweeper); 428 } 429 430 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) 431 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 432 } 433 434 static void ds_dec(struct deferred_entry *entry, struct list_head *head) 435 { 436 unsigned long flags; 437 438 spin_lock_irqsave(&entry->ds->lock, flags); 439 BUG_ON(!entry->count); 440 --entry->count; 441 __sweep(entry->ds, head); 442 spin_unlock_irqrestore(&entry->ds->lock, flags); 443 } 444 445 /* 446 * Returns 1 if deferred or 0 if no pending items to delay job. 
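 *
 * A rough walkthrough of the mechanism, for illustration only:
 *
 * - ds_inc() pins the current entry and is paired with a later ds_dec();
 *   in-flight bios therefore hold a reference on whichever entry was
 *   current when they were hooked.
 *
 * - ds_add_work() queues a job on the current entry and tries to advance
 *   current_entry, so later bios pin a newer entry.
 *
 * - as the older entries' counts drop to zero, __sweep() walks from
 *   sweeper towards current_entry and splices the queued work back to the
 *   caller of ds_dec(), i.e. a job only runs once the bios that predate
 *   it have completed.
 *
 * Returning 0 here means nothing was outstanding and the caller may do
 * the work immediately.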
447 */ 448 static int ds_add_work(struct deferred_set *ds, struct list_head *work) 449 { 450 int r = 1; 451 unsigned long flags; 452 unsigned next_entry; 453 454 spin_lock_irqsave(&ds->lock, flags); 455 if ((ds->sweeper == ds->current_entry) && 456 !ds->entries[ds->current_entry].count) 457 r = 0; 458 else { 459 list_add(work, &ds->entries[ds->current_entry].work_items); 460 next_entry = ds_next(ds->current_entry); 461 if (!ds->entries[next_entry].count) 462 ds->current_entry = next_entry; 463 } 464 spin_unlock_irqrestore(&ds->lock, flags); 465 466 return r; 467 } 468 469 /*----------------------------------------------------------------*/ 470 471 /* 472 * Key building. 473 */ 474 static void build_data_key(struct dm_thin_device *td, 475 dm_block_t b, struct cell_key *key) 476 { 477 key->virtual = 0; 478 key->dev = dm_thin_dev_id(td); 479 key->block = b; 480 } 481 482 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 483 struct cell_key *key) 484 { 485 key->virtual = 1; 486 key->dev = dm_thin_dev_id(td); 487 key->block = b; 488 } 489 490 /*----------------------------------------------------------------*/ 491 492 /* 493 * A pool device ties together a metadata device and a data device. It 494 * also provides the interface for creating and destroying internal 495 * devices. 496 */ 497 struct dm_thin_new_mapping; 498 499 struct pool_features { 500 unsigned zero_new_blocks:1; 501 unsigned discard_enabled:1; 502 unsigned discard_passdown:1; 503 }; 504 505 struct pool { 506 struct list_head list; 507 struct dm_target *ti; /* Only set if a pool target is bound */ 508 509 struct mapped_device *pool_md; 510 struct block_device *md_dev; 511 struct dm_pool_metadata *pmd; 512 513 uint32_t sectors_per_block; 514 unsigned block_shift; 515 dm_block_t offset_mask; 516 dm_block_t low_water_blocks; 517 518 struct pool_features pf; 519 unsigned low_water_triggered:1; /* A dm event has been sent */ 520 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 521 522 struct bio_prison *prison; 523 struct dm_kcopyd_client *copier; 524 525 struct workqueue_struct *wq; 526 struct work_struct worker; 527 struct delayed_work waker; 528 529 unsigned ref_count; 530 unsigned long last_commit_jiffies; 531 532 spinlock_t lock; 533 struct bio_list deferred_bios; 534 struct bio_list deferred_flush_bios; 535 struct list_head prepared_mappings; 536 struct list_head prepared_discards; 537 538 struct bio_list retry_on_resume_list; 539 540 struct deferred_set shared_read_ds; 541 struct deferred_set all_io_ds; 542 543 struct dm_thin_new_mapping *next_mapping; 544 mempool_t *mapping_pool; 545 mempool_t *endio_hook_pool; 546 }; 547 548 /* 549 * Target context for a pool. 550 */ 551 struct pool_c { 552 struct dm_target *ti; 553 struct pool *pool; 554 struct dm_dev *data_dev; 555 struct dm_dev *metadata_dev; 556 struct dm_target_callbacks callbacks; 557 558 dm_block_t low_water_blocks; 559 struct pool_features pf; 560 }; 561 562 /* 563 * Target context for a thin. 564 */ 565 struct thin_c { 566 struct dm_dev *pool_dev; 567 struct dm_dev *origin_dev; 568 dm_thin_id dev_id; 569 570 struct pool *pool; 571 struct dm_thin_device *td; 572 }; 573 574 /*----------------------------------------------------------------*/ 575 576 /* 577 * A global list of pools that uses a struct mapped_device as a key. 
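 *
 * It is also searched by metadata device (__pool_table_lookup_metadata_dev
 * below); __pool_find() uses both lookups so that a table load pairing an
 * existing pool with the wrong metadata device, or an existing metadata
 * device with a different pool, is rejected rather than silently creating
 * a second pool object.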
578 */ 579 static struct dm_thin_pool_table { 580 struct mutex mutex; 581 struct list_head pools; 582 } dm_thin_pool_table; 583 584 static void pool_table_init(void) 585 { 586 mutex_init(&dm_thin_pool_table.mutex); 587 INIT_LIST_HEAD(&dm_thin_pool_table.pools); 588 } 589 590 static void __pool_table_insert(struct pool *pool) 591 { 592 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 593 list_add(&pool->list, &dm_thin_pool_table.pools); 594 } 595 596 static void __pool_table_remove(struct pool *pool) 597 { 598 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 599 list_del(&pool->list); 600 } 601 602 static struct pool *__pool_table_lookup(struct mapped_device *md) 603 { 604 struct pool *pool = NULL, *tmp; 605 606 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 607 608 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 609 if (tmp->pool_md == md) { 610 pool = tmp; 611 break; 612 } 613 } 614 615 return pool; 616 } 617 618 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) 619 { 620 struct pool *pool = NULL, *tmp; 621 622 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 623 624 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 625 if (tmp->md_dev == md_dev) { 626 pool = tmp; 627 break; 628 } 629 } 630 631 return pool; 632 } 633 634 /*----------------------------------------------------------------*/ 635 636 struct dm_thin_endio_hook { 637 struct thin_c *tc; 638 struct deferred_entry *shared_read_entry; 639 struct deferred_entry *all_io_entry; 640 struct dm_thin_new_mapping *overwrite_mapping; 641 }; 642 643 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 644 { 645 struct bio *bio; 646 struct bio_list bios; 647 648 bio_list_init(&bios); 649 bio_list_merge(&bios, master); 650 bio_list_init(master); 651 652 while ((bio = bio_list_pop(&bios))) { 653 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 654 655 if (h->tc == tc) 656 bio_endio(bio, DM_ENDIO_REQUEUE); 657 else 658 bio_list_add(master, bio); 659 } 660 } 661 662 static void requeue_io(struct thin_c *tc) 663 { 664 struct pool *pool = tc->pool; 665 unsigned long flags; 666 667 spin_lock_irqsave(&pool->lock, flags); 668 __requeue_bio_list(tc, &pool->deferred_bios); 669 __requeue_bio_list(tc, &pool->retry_on_resume_list); 670 spin_unlock_irqrestore(&pool->lock, flags); 671 } 672 673 /* 674 * This section of code contains the logic for processing a thin device's IO. 675 * Much of the code depends on pool object resources (lists, workqueues, etc) 676 * but most is exclusively called from the thin target rather than the thin-pool 677 * target. 678 */ 679 680 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 681 { 682 return bio->bi_sector >> tc->pool->block_shift; 683 } 684 685 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 686 { 687 struct pool *pool = tc->pool; 688 689 bio->bi_bdev = tc->pool_dev->bdev; 690 bio->bi_sector = (block << pool->block_shift) + 691 (bio->bi_sector & pool->offset_mask); 692 } 693 694 static void remap_to_origin(struct thin_c *tc, struct bio *bio) 695 { 696 bio->bi_bdev = tc->origin_dev->bdev; 697 } 698 699 static void issue(struct thin_c *tc, struct bio *bio) 700 { 701 struct pool *pool = tc->pool; 702 unsigned long flags; 703 704 /* 705 * Batch together any FUA/FLUSH bios we find and then issue 706 * a single commit for them in process_deferred_bios(). 
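 *
 * Deferring them matters because a FLUSH/FUA should not complete before
 * the metadata describing any freshly provisioned blocks it depends on is
 * durable; process_deferred_bios() therefore commits the metadata first
 * and only then reissues these bios (or errors them if the commit fails).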
707 */ 708 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 709 spin_lock_irqsave(&pool->lock, flags); 710 bio_list_add(&pool->deferred_flush_bios, bio); 711 spin_unlock_irqrestore(&pool->lock, flags); 712 } else 713 generic_make_request(bio); 714 } 715 716 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 717 { 718 remap_to_origin(tc, bio); 719 issue(tc, bio); 720 } 721 722 static void remap_and_issue(struct thin_c *tc, struct bio *bio, 723 dm_block_t block) 724 { 725 remap(tc, bio, block); 726 issue(tc, bio); 727 } 728 729 /* 730 * wake_worker() is used when new work is queued and when pool_resume is 731 * ready to continue deferred IO processing. 732 */ 733 static void wake_worker(struct pool *pool) 734 { 735 queue_work(pool->wq, &pool->worker); 736 } 737 738 /*----------------------------------------------------------------*/ 739 740 /* 741 * Bio endio functions. 742 */ 743 struct dm_thin_new_mapping { 744 struct list_head list; 745 746 unsigned quiesced:1; 747 unsigned prepared:1; 748 unsigned pass_discard:1; 749 750 struct thin_c *tc; 751 dm_block_t virt_block; 752 dm_block_t data_block; 753 struct dm_bio_prison_cell *cell, *cell2; 754 int err; 755 756 /* 757 * If the bio covers the whole area of a block then we can avoid 758 * zeroing or copying. Instead this bio is hooked. The bio will 759 * still be in the cell, so care has to be taken to avoid issuing 760 * the bio twice. 761 */ 762 struct bio *bio; 763 bio_end_io_t *saved_bi_end_io; 764 }; 765 766 static void __maybe_add_mapping(struct dm_thin_new_mapping *m) 767 { 768 struct pool *pool = m->tc->pool; 769 770 if (m->quiesced && m->prepared) { 771 list_add(&m->list, &pool->prepared_mappings); 772 wake_worker(pool); 773 } 774 } 775 776 static void copy_complete(int read_err, unsigned long write_err, void *context) 777 { 778 unsigned long flags; 779 struct dm_thin_new_mapping *m = context; 780 struct pool *pool = m->tc->pool; 781 782 m->err = read_err || write_err ? -EIO : 0; 783 784 spin_lock_irqsave(&pool->lock, flags); 785 m->prepared = 1; 786 __maybe_add_mapping(m); 787 spin_unlock_irqrestore(&pool->lock, flags); 788 } 789 790 static void overwrite_endio(struct bio *bio, int err) 791 { 792 unsigned long flags; 793 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 794 struct dm_thin_new_mapping *m = h->overwrite_mapping; 795 struct pool *pool = m->tc->pool; 796 797 m->err = err; 798 799 spin_lock_irqsave(&pool->lock, flags); 800 m->prepared = 1; 801 __maybe_add_mapping(m); 802 spin_unlock_irqrestore(&pool->lock, flags); 803 } 804 805 /*----------------------------------------------------------------*/ 806 807 /* 808 * Workqueue. 809 */ 810 811 /* 812 * Prepared mapping jobs. 813 */ 814 815 /* 816 * This sends the bios in the cell back to the deferred_bios list. 817 */ 818 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell, 819 dm_block_t data_block) 820 { 821 struct pool *pool = tc->pool; 822 unsigned long flags; 823 824 spin_lock_irqsave(&pool->lock, flags); 825 cell_release(cell, &pool->deferred_bios); 826 spin_unlock_irqrestore(&tc->pool->lock, flags); 827 828 wake_worker(pool); 829 } 830 831 /* 832 * Same as cell_defer above, except it omits one particular detainee, 833 * a write bio that covers the block and has already been processed. 
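 *
 * For the overwrite case, the ordering in process_prepared_mapping()
 * below is, roughly: restore the bio's saved end_io, insert the new
 * mapping into the btree, release the other detainees via this helper so
 * they get remapped to the newly inserted block, and finally complete the
 * overwrite bio itself. (A sketch of the code below, not a change to it.)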
834 */ 835 static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell) 836 { 837 struct bio_list bios; 838 struct pool *pool = tc->pool; 839 unsigned long flags; 840 841 bio_list_init(&bios); 842 843 spin_lock_irqsave(&pool->lock, flags); 844 cell_release_no_holder(cell, &pool->deferred_bios); 845 spin_unlock_irqrestore(&pool->lock, flags); 846 847 wake_worker(pool); 848 } 849 850 static void process_prepared_mapping(struct dm_thin_new_mapping *m) 851 { 852 struct thin_c *tc = m->tc; 853 struct bio *bio; 854 int r; 855 856 bio = m->bio; 857 if (bio) 858 bio->bi_end_io = m->saved_bi_end_io; 859 860 if (m->err) { 861 cell_error(m->cell); 862 return; 863 } 864 865 /* 866 * Commit the prepared block into the mapping btree. 867 * Any I/O for this block arriving after this point will get 868 * remapped to it directly. 869 */ 870 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 871 if (r) { 872 DMERR("dm_thin_insert_block() failed"); 873 cell_error(m->cell); 874 return; 875 } 876 877 /* 878 * Release any bios held while the block was being provisioned. 879 * If we are processing a write bio that completely covers the block, 880 * we already processed it so can ignore it now when processing 881 * the bios in the cell. 882 */ 883 if (bio) { 884 cell_defer_except(tc, m->cell); 885 bio_endio(bio, 0); 886 } else 887 cell_defer(tc, m->cell, m->data_block); 888 889 list_del(&m->list); 890 mempool_free(m, tc->pool->mapping_pool); 891 } 892 893 static void process_prepared_discard(struct dm_thin_new_mapping *m) 894 { 895 int r; 896 struct thin_c *tc = m->tc; 897 898 r = dm_thin_remove_block(tc->td, m->virt_block); 899 if (r) 900 DMERR("dm_thin_remove_block() failed"); 901 902 /* 903 * Pass the discard down to the underlying device? 904 */ 905 if (m->pass_discard) 906 remap_and_issue(tc, m->bio, m->data_block); 907 else 908 bio_endio(m->bio, 0); 909 910 cell_defer_except(tc, m->cell); 911 cell_defer_except(tc, m->cell2); 912 mempool_free(m, tc->pool->mapping_pool); 913 } 914 915 static void process_prepared(struct pool *pool, struct list_head *head, 916 void (*fn)(struct dm_thin_new_mapping *)) 917 { 918 unsigned long flags; 919 struct list_head maps; 920 struct dm_thin_new_mapping *m, *tmp; 921 922 INIT_LIST_HEAD(&maps); 923 spin_lock_irqsave(&pool->lock, flags); 924 list_splice_init(head, &maps); 925 spin_unlock_irqrestore(&pool->lock, flags); 926 927 list_for_each_entry_safe(m, tmp, &maps, list) 928 fn(m); 929 } 930 931 /* 932 * Deferred bio jobs. 933 */ 934 static int io_overlaps_block(struct pool *pool, struct bio *bio) 935 { 936 return !(bio->bi_sector & pool->offset_mask) && 937 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 938 939 } 940 941 static int io_overwrites_block(struct pool *pool, struct bio *bio) 942 { 943 return (bio_data_dir(bio) == WRITE) && 944 io_overlaps_block(pool, bio); 945 } 946 947 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 948 bio_end_io_t *fn) 949 { 950 *save = bio->bi_end_io; 951 bio->bi_end_io = fn; 952 } 953 954 static int ensure_next_mapping(struct pool *pool) 955 { 956 if (pool->next_mapping) 957 return 0; 958 959 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); 960 961 return pool->next_mapping ? 
0 : -ENOMEM; 962 } 963 964 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) 965 { 966 struct dm_thin_new_mapping *r = pool->next_mapping; 967 968 BUG_ON(!pool->next_mapping); 969 970 pool->next_mapping = NULL; 971 972 return r; 973 } 974 975 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 976 struct dm_dev *origin, dm_block_t data_origin, 977 dm_block_t data_dest, 978 struct dm_bio_prison_cell *cell, struct bio *bio) 979 { 980 int r; 981 struct pool *pool = tc->pool; 982 struct dm_thin_new_mapping *m = get_next_mapping(pool); 983 984 INIT_LIST_HEAD(&m->list); 985 m->quiesced = 0; 986 m->prepared = 0; 987 m->tc = tc; 988 m->virt_block = virt_block; 989 m->data_block = data_dest; 990 m->cell = cell; 991 m->err = 0; 992 m->bio = NULL; 993 994 if (!ds_add_work(&pool->shared_read_ds, &m->list)) 995 m->quiesced = 1; 996 997 /* 998 * IO to pool_dev remaps to the pool target's data_dev. 999 * 1000 * If the whole block of data is being overwritten, we can issue the 1001 * bio immediately. Otherwise we use kcopyd to clone the data first. 1002 */ 1003 if (io_overwrites_block(pool, bio)) { 1004 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1005 1006 h->overwrite_mapping = m; 1007 m->bio = bio; 1008 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1009 remap_and_issue(tc, bio, data_dest); 1010 } else { 1011 struct dm_io_region from, to; 1012 1013 from.bdev = origin->bdev; 1014 from.sector = data_origin * pool->sectors_per_block; 1015 from.count = pool->sectors_per_block; 1016 1017 to.bdev = tc->pool_dev->bdev; 1018 to.sector = data_dest * pool->sectors_per_block; 1019 to.count = pool->sectors_per_block; 1020 1021 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 1022 0, copy_complete, m); 1023 if (r < 0) { 1024 mempool_free(m, pool->mapping_pool); 1025 DMERR("dm_kcopyd_copy() failed"); 1026 cell_error(cell); 1027 } 1028 } 1029 } 1030 1031 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1032 dm_block_t data_origin, dm_block_t data_dest, 1033 struct dm_bio_prison_cell *cell, struct bio *bio) 1034 { 1035 schedule_copy(tc, virt_block, tc->pool_dev, 1036 data_origin, data_dest, cell, bio); 1037 } 1038 1039 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1040 dm_block_t data_dest, 1041 struct dm_bio_prison_cell *cell, struct bio *bio) 1042 { 1043 schedule_copy(tc, virt_block, tc->origin_dev, 1044 virt_block, data_dest, cell, bio); 1045 } 1046 1047 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1048 dm_block_t data_block, struct dm_bio_prison_cell *cell, 1049 struct bio *bio) 1050 { 1051 struct pool *pool = tc->pool; 1052 struct dm_thin_new_mapping *m = get_next_mapping(pool); 1053 1054 INIT_LIST_HEAD(&m->list); 1055 m->quiesced = 1; 1056 m->prepared = 0; 1057 m->tc = tc; 1058 m->virt_block = virt_block; 1059 m->data_block = data_block; 1060 m->cell = cell; 1061 m->err = 0; 1062 m->bio = NULL; 1063 1064 /* 1065 * If the whole block of data is being overwritten or we are not 1066 * zeroing pre-existing data, we can issue the bio immediately. 1067 * Otherwise we use kcopyd to zero the data first. 
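 *
 * Skipping the zero (the skip_block_zeroing pool feature) trades safety
 * for speed: a reader of a freshly provisioned block may then see
 * whatever stale data the underlying device held there, so it is only
 * appropriate when upper layers never read blocks they have not written.
 * This is an observation, not something enforced here.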
1068 */ 1069 if (!pool->pf.zero_new_blocks) 1070 process_prepared_mapping(m); 1071 1072 else if (io_overwrites_block(pool, bio)) { 1073 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1074 1075 h->overwrite_mapping = m; 1076 m->bio = bio; 1077 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1078 remap_and_issue(tc, bio, data_block); 1079 } else { 1080 int r; 1081 struct dm_io_region to; 1082 1083 to.bdev = tc->pool_dev->bdev; 1084 to.sector = data_block * pool->sectors_per_block; 1085 to.count = pool->sectors_per_block; 1086 1087 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 1088 if (r < 0) { 1089 mempool_free(m, pool->mapping_pool); 1090 DMERR("dm_kcopyd_zero() failed"); 1091 cell_error(cell); 1092 } 1093 } 1094 } 1095 1096 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1097 { 1098 int r; 1099 dm_block_t free_blocks; 1100 unsigned long flags; 1101 struct pool *pool = tc->pool; 1102 1103 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1104 if (r) 1105 return r; 1106 1107 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1108 DMWARN("%s: reached low water mark, sending event.", 1109 dm_device_name(pool->pool_md)); 1110 spin_lock_irqsave(&pool->lock, flags); 1111 pool->low_water_triggered = 1; 1112 spin_unlock_irqrestore(&pool->lock, flags); 1113 dm_table_event(pool->ti->table); 1114 } 1115 1116 if (!free_blocks) { 1117 if (pool->no_free_space) 1118 return -ENOSPC; 1119 else { 1120 /* 1121 * Try to commit to see if that will free up some 1122 * more space. 1123 */ 1124 r = dm_pool_commit_metadata(pool->pmd); 1125 if (r) { 1126 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1127 __func__, r); 1128 return r; 1129 } 1130 1131 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1132 if (r) 1133 return r; 1134 1135 /* 1136 * If we still have no space we set a flag to avoid 1137 * doing all this checking and return -ENOSPC. 1138 */ 1139 if (!free_blocks) { 1140 DMWARN("%s: no free space available.", 1141 dm_device_name(pool->pool_md)); 1142 spin_lock_irqsave(&pool->lock, flags); 1143 pool->no_free_space = 1; 1144 spin_unlock_irqrestore(&pool->lock, flags); 1145 return -ENOSPC; 1146 } 1147 } 1148 } 1149 1150 r = dm_pool_alloc_data_block(pool->pmd, result); 1151 if (r) 1152 return r; 1153 1154 return 0; 1155 } 1156 1157 /* 1158 * If we have run out of space, queue bios until the device is 1159 * resumed, presumably after having been reloaded with more space. 
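 *
 * An illustrative recovery flow (device name and table are examples
 * only): grow the data device, then reload and resume the pool, e.g.
 *
 *	dmsetup suspend pool
 *	dmsetup reload pool --table "<new table with the larger data dev>"
 *	dmsetup resume pool
 *
 * pool_resume() then clears no_free_space and __requeue_bios() feeds the
 * queued bios back through the deferred path.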
1160 */ 1161 static void retry_on_resume(struct bio *bio) 1162 { 1163 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1164 struct thin_c *tc = h->tc; 1165 struct pool *pool = tc->pool; 1166 unsigned long flags; 1167 1168 spin_lock_irqsave(&pool->lock, flags); 1169 bio_list_add(&pool->retry_on_resume_list, bio); 1170 spin_unlock_irqrestore(&pool->lock, flags); 1171 } 1172 1173 static void no_space(struct dm_bio_prison_cell *cell) 1174 { 1175 struct bio *bio; 1176 struct bio_list bios; 1177 1178 bio_list_init(&bios); 1179 cell_release(cell, &bios); 1180 1181 while ((bio = bio_list_pop(&bios))) 1182 retry_on_resume(bio); 1183 } 1184 1185 static void process_discard(struct thin_c *tc, struct bio *bio) 1186 { 1187 int r; 1188 unsigned long flags; 1189 struct pool *pool = tc->pool; 1190 struct dm_bio_prison_cell *cell, *cell2; 1191 struct cell_key key, key2; 1192 dm_block_t block = get_bio_block(tc, bio); 1193 struct dm_thin_lookup_result lookup_result; 1194 struct dm_thin_new_mapping *m; 1195 1196 build_virtual_key(tc->td, block, &key); 1197 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1198 return; 1199 1200 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1201 switch (r) { 1202 case 0: 1203 /* 1204 * Check nobody is fiddling with this pool block. This can 1205 * happen if someone's in the process of breaking sharing 1206 * on this block. 1207 */ 1208 build_data_key(tc->td, lookup_result.block, &key2); 1209 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1210 cell_release_singleton(cell, bio); 1211 break; 1212 } 1213 1214 if (io_overlaps_block(pool, bio)) { 1215 /* 1216 * IO may still be going to the destination block. We must 1217 * quiesce before we can do the removal. 1218 */ 1219 m = get_next_mapping(pool); 1220 m->tc = tc; 1221 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; 1222 m->virt_block = block; 1223 m->data_block = lookup_result.block; 1224 m->cell = cell; 1225 m->cell2 = cell2; 1226 m->err = 0; 1227 m->bio = bio; 1228 1229 if (!ds_add_work(&pool->all_io_ds, &m->list)) { 1230 spin_lock_irqsave(&pool->lock, flags); 1231 list_add(&m->list, &pool->prepared_discards); 1232 spin_unlock_irqrestore(&pool->lock, flags); 1233 wake_worker(pool); 1234 } 1235 } else { 1236 /* 1237 * This path is hit if people are ignoring 1238 * limits->discard_granularity. It ignores any 1239 * part of the discard that is in a subsequent 1240 * block. 1241 */ 1242 sector_t offset = bio->bi_sector - (block << pool->block_shift); 1243 unsigned remaining = (pool->sectors_per_block - offset) << 9; 1244 bio->bi_size = min(bio->bi_size, remaining); 1245 1246 cell_release_singleton(cell, bio); 1247 cell_release_singleton(cell2, bio); 1248 remap_and_issue(tc, bio, lookup_result.block); 1249 } 1250 break; 1251 1252 case -ENODATA: 1253 /* 1254 * It isn't provisioned, just forget it. 
1255 */ 1256 cell_release_singleton(cell, bio); 1257 bio_endio(bio, 0); 1258 break; 1259 1260 default: 1261 DMERR("discard: find block unexpectedly returned %d", r); 1262 cell_release_singleton(cell, bio); 1263 bio_io_error(bio); 1264 break; 1265 } 1266 } 1267 1268 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1269 struct cell_key *key, 1270 struct dm_thin_lookup_result *lookup_result, 1271 struct dm_bio_prison_cell *cell) 1272 { 1273 int r; 1274 dm_block_t data_block; 1275 1276 r = alloc_data_block(tc, &data_block); 1277 switch (r) { 1278 case 0: 1279 schedule_internal_copy(tc, block, lookup_result->block, 1280 data_block, cell, bio); 1281 break; 1282 1283 case -ENOSPC: 1284 no_space(cell); 1285 break; 1286 1287 default: 1288 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1289 cell_error(cell); 1290 break; 1291 } 1292 } 1293 1294 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1295 dm_block_t block, 1296 struct dm_thin_lookup_result *lookup_result) 1297 { 1298 struct dm_bio_prison_cell *cell; 1299 struct pool *pool = tc->pool; 1300 struct cell_key key; 1301 1302 /* 1303 * If cell is already occupied, then sharing is already in the process 1304 * of being broken so we have nothing further to do here. 1305 */ 1306 build_data_key(tc->td, lookup_result->block, &key); 1307 if (bio_detain(pool->prison, &key, bio, &cell)) 1308 return; 1309 1310 if (bio_data_dir(bio) == WRITE) 1311 break_sharing(tc, bio, block, &key, lookup_result, cell); 1312 else { 1313 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1314 1315 h->shared_read_entry = ds_inc(&pool->shared_read_ds); 1316 1317 cell_release_singleton(cell, bio); 1318 remap_and_issue(tc, bio, lookup_result->block); 1319 } 1320 } 1321 1322 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1323 struct dm_bio_prison_cell *cell) 1324 { 1325 int r; 1326 dm_block_t data_block; 1327 1328 /* 1329 * Remap empty bios (flushes) immediately, without provisioning. 1330 */ 1331 if (!bio->bi_size) { 1332 cell_release_singleton(cell, bio); 1333 remap_and_issue(tc, bio, 0); 1334 return; 1335 } 1336 1337 /* 1338 * Fill read bios with zeroes and complete them immediately. 1339 */ 1340 if (bio_data_dir(bio) == READ) { 1341 zero_fill_bio(bio); 1342 cell_release_singleton(cell, bio); 1343 bio_endio(bio, 0); 1344 return; 1345 } 1346 1347 r = alloc_data_block(tc, &data_block); 1348 switch (r) { 1349 case 0: 1350 if (tc->origin_dev) 1351 schedule_external_copy(tc, block, data_block, cell, bio); 1352 else 1353 schedule_zero(tc, block, data_block, cell, bio); 1354 break; 1355 1356 case -ENOSPC: 1357 no_space(cell); 1358 break; 1359 1360 default: 1361 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1362 cell_error(cell); 1363 break; 1364 } 1365 } 1366 1367 static void process_bio(struct thin_c *tc, struct bio *bio) 1368 { 1369 int r; 1370 dm_block_t block = get_bio_block(tc, bio); 1371 struct dm_bio_prison_cell *cell; 1372 struct cell_key key; 1373 struct dm_thin_lookup_result lookup_result; 1374 1375 /* 1376 * If cell is already occupied, then the block is already 1377 * being provisioned so we have nothing further to do here. 1378 */ 1379 build_virtual_key(tc->td, block, &key); 1380 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1381 return; 1382 1383 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1384 switch (r) { 1385 case 0: 1386 /* 1387 * We can release this cell now. 
This thread is the only 1388 * one that puts bios into a cell, and we know there were 1389 * no preceding bios. 1390 */ 1391 /* 1392 * TODO: this will probably have to change when discard goes 1393 * back in. 1394 */ 1395 cell_release_singleton(cell, bio); 1396 1397 if (lookup_result.shared) 1398 process_shared_bio(tc, bio, block, &lookup_result); 1399 else 1400 remap_and_issue(tc, bio, lookup_result.block); 1401 break; 1402 1403 case -ENODATA: 1404 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1405 cell_release_singleton(cell, bio); 1406 remap_to_origin_and_issue(tc, bio); 1407 } else 1408 provision_block(tc, bio, block, cell); 1409 break; 1410 1411 default: 1412 DMERR("dm_thin_find_block() failed, error = %d", r); 1413 cell_release_singleton(cell, bio); 1414 bio_io_error(bio); 1415 break; 1416 } 1417 } 1418 1419 static int need_commit_due_to_time(struct pool *pool) 1420 { 1421 return jiffies < pool->last_commit_jiffies || 1422 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1423 } 1424 1425 static void process_deferred_bios(struct pool *pool) 1426 { 1427 unsigned long flags; 1428 struct bio *bio; 1429 struct bio_list bios; 1430 int r; 1431 1432 bio_list_init(&bios); 1433 1434 spin_lock_irqsave(&pool->lock, flags); 1435 bio_list_merge(&bios, &pool->deferred_bios); 1436 bio_list_init(&pool->deferred_bios); 1437 spin_unlock_irqrestore(&pool->lock, flags); 1438 1439 while ((bio = bio_list_pop(&bios))) { 1440 struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; 1441 struct thin_c *tc = h->tc; 1442 1443 /* 1444 * If we've got no free new_mapping structs, and processing 1445 * this bio might require one, we pause until there are some 1446 * prepared mappings to process. 1447 */ 1448 if (ensure_next_mapping(pool)) { 1449 spin_lock_irqsave(&pool->lock, flags); 1450 bio_list_merge(&pool->deferred_bios, &bios); 1451 spin_unlock_irqrestore(&pool->lock, flags); 1452 1453 break; 1454 } 1455 1456 if (bio->bi_rw & REQ_DISCARD) 1457 process_discard(tc, bio); 1458 else 1459 process_bio(tc, bio); 1460 } 1461 1462 /* 1463 * If there are any deferred flush bios, we must commit 1464 * the metadata before issuing them. 1465 */ 1466 bio_list_init(&bios); 1467 spin_lock_irqsave(&pool->lock, flags); 1468 bio_list_merge(&bios, &pool->deferred_flush_bios); 1469 bio_list_init(&pool->deferred_flush_bios); 1470 spin_unlock_irqrestore(&pool->lock, flags); 1471 1472 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1473 return; 1474 1475 r = dm_pool_commit_metadata(pool->pmd); 1476 if (r) { 1477 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1478 __func__, r); 1479 while ((bio = bio_list_pop(&bios))) 1480 bio_io_error(bio); 1481 return; 1482 } 1483 pool->last_commit_jiffies = jiffies; 1484 1485 while ((bio = bio_list_pop(&bios))) 1486 generic_make_request(bio); 1487 } 1488 1489 static void do_worker(struct work_struct *ws) 1490 { 1491 struct pool *pool = container_of(ws, struct pool, worker); 1492 1493 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); 1494 process_prepared(pool, &pool->prepared_discards, process_prepared_discard); 1495 process_deferred_bios(pool); 1496 } 1497 1498 /* 1499 * We want to commit periodically so that not too much 1500 * unwritten data builds up. 
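 *
 * With COMMIT_PERIOD == HZ this works out to roughly one commit per
 * second while there is activity: do_waker() re-arms itself every
 * COMMIT_PERIOD and kicks the worker, and process_deferred_bios() then
 * commits if need_commit_due_to_time() sees that the last commit is older
 * than COMMIT_PERIOD (the extra jiffies comparison just copes with
 * wrap-around).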
1501 */ 1502 static void do_waker(struct work_struct *ws) 1503 { 1504 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 1505 wake_worker(pool); 1506 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 1507 } 1508 1509 /*----------------------------------------------------------------*/ 1510 1511 /* 1512 * Mapping functions. 1513 */ 1514 1515 /* 1516 * Called only while mapping a thin bio to hand it over to the workqueue. 1517 */ 1518 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 1519 { 1520 unsigned long flags; 1521 struct pool *pool = tc->pool; 1522 1523 spin_lock_irqsave(&pool->lock, flags); 1524 bio_list_add(&pool->deferred_bios, bio); 1525 spin_unlock_irqrestore(&pool->lock, flags); 1526 1527 wake_worker(pool); 1528 } 1529 1530 static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) 1531 { 1532 struct pool *pool = tc->pool; 1533 struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); 1534 1535 h->tc = tc; 1536 h->shared_read_entry = NULL; 1537 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); 1538 h->overwrite_mapping = NULL; 1539 1540 return h; 1541 } 1542 1543 /* 1544 * Non-blocking function called from the thin target's map function. 1545 */ 1546 static int thin_bio_map(struct dm_target *ti, struct bio *bio, 1547 union map_info *map_context) 1548 { 1549 int r; 1550 struct thin_c *tc = ti->private; 1551 dm_block_t block = get_bio_block(tc, bio); 1552 struct dm_thin_device *td = tc->td; 1553 struct dm_thin_lookup_result result; 1554 1555 map_context->ptr = thin_hook_bio(tc, bio); 1556 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1557 thin_defer_bio(tc, bio); 1558 return DM_MAPIO_SUBMITTED; 1559 } 1560 1561 r = dm_thin_find_block(td, block, 0, &result); 1562 1563 /* 1564 * Note that we defer readahead too. 1565 */ 1566 switch (r) { 1567 case 0: 1568 if (unlikely(result.shared)) { 1569 /* 1570 * We have a race condition here between the 1571 * result.shared value returned by the lookup and 1572 * snapshot creation, which may cause new 1573 * sharing. 1574 * 1575 * To avoid this always quiesce the origin before 1576 * taking the snap. You want to do this anyway to 1577 * ensure a consistent application view 1578 * (i.e. lockfs). 1579 * 1580 * More distant ancestors are irrelevant. The 1581 * shared flag will be set in their case. 1582 */ 1583 thin_defer_bio(tc, bio); 1584 r = DM_MAPIO_SUBMITTED; 1585 } else { 1586 remap(tc, bio, result.block); 1587 r = DM_MAPIO_REMAPPED; 1588 } 1589 break; 1590 1591 case -ENODATA: 1592 /* 1593 * In future, the failed dm_thin_find_block above could 1594 * provide the hint to load the metadata into cache. 
1595 */ 1596 case -EWOULDBLOCK: 1597 thin_defer_bio(tc, bio); 1598 r = DM_MAPIO_SUBMITTED; 1599 break; 1600 } 1601 1602 return r; 1603 } 1604 1605 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1606 { 1607 int r; 1608 unsigned long flags; 1609 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1610 1611 spin_lock_irqsave(&pt->pool->lock, flags); 1612 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1613 spin_unlock_irqrestore(&pt->pool->lock, flags); 1614 1615 if (!r) { 1616 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1617 r = bdi_congested(&q->backing_dev_info, bdi_bits); 1618 } 1619 1620 return r; 1621 } 1622 1623 static void __requeue_bios(struct pool *pool) 1624 { 1625 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1626 bio_list_init(&pool->retry_on_resume_list); 1627 } 1628 1629 /*---------------------------------------------------------------- 1630 * Binding of control targets to a pool object 1631 *--------------------------------------------------------------*/ 1632 static int bind_control_target(struct pool *pool, struct dm_target *ti) 1633 { 1634 struct pool_c *pt = ti->private; 1635 1636 pool->ti = ti; 1637 pool->low_water_blocks = pt->low_water_blocks; 1638 pool->pf = pt->pf; 1639 1640 /* 1641 * If discard_passdown was enabled verify that the data device 1642 * supports discards. Disable discard_passdown if not; otherwise 1643 * -EOPNOTSUPP will be returned. 1644 */ 1645 if (pt->pf.discard_passdown) { 1646 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1647 if (!q || !blk_queue_discard(q)) { 1648 char buf[BDEVNAME_SIZE]; 1649 DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", 1650 bdevname(pt->data_dev->bdev, buf)); 1651 pool->pf.discard_passdown = 0; 1652 } 1653 } 1654 1655 return 0; 1656 } 1657 1658 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 1659 { 1660 if (pool->ti == ti) 1661 pool->ti = NULL; 1662 } 1663 1664 /*---------------------------------------------------------------- 1665 * Pool creation 1666 *--------------------------------------------------------------*/ 1667 /* Initialize pool features. 
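 *
 * The defaults are all-on: zero newly provisioned blocks, support
 * discards, and pass discards down to the data device. The ctr feature
 * arguments skip_block_zeroing, ignore_discard and no_discard_passdown
 * (parsed in parse_pool_features()) clear the corresponding flag.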
*/ 1668 static void pool_features_init(struct pool_features *pf) 1669 { 1670 pf->zero_new_blocks = 1; 1671 pf->discard_enabled = 1; 1672 pf->discard_passdown = 1; 1673 } 1674 1675 static void __pool_destroy(struct pool *pool) 1676 { 1677 __pool_table_remove(pool); 1678 1679 if (dm_pool_metadata_close(pool->pmd) < 0) 1680 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1681 1682 prison_destroy(pool->prison); 1683 dm_kcopyd_client_destroy(pool->copier); 1684 1685 if (pool->wq) 1686 destroy_workqueue(pool->wq); 1687 1688 if (pool->next_mapping) 1689 mempool_free(pool->next_mapping, pool->mapping_pool); 1690 mempool_destroy(pool->mapping_pool); 1691 mempool_destroy(pool->endio_hook_pool); 1692 kfree(pool); 1693 } 1694 1695 static struct kmem_cache *_new_mapping_cache; 1696 static struct kmem_cache *_endio_hook_cache; 1697 1698 static struct pool *pool_create(struct mapped_device *pool_md, 1699 struct block_device *metadata_dev, 1700 unsigned long block_size, char **error) 1701 { 1702 int r; 1703 void *err_p; 1704 struct pool *pool; 1705 struct dm_pool_metadata *pmd; 1706 1707 pmd = dm_pool_metadata_open(metadata_dev, block_size); 1708 if (IS_ERR(pmd)) { 1709 *error = "Error creating metadata object"; 1710 return (struct pool *)pmd; 1711 } 1712 1713 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 1714 if (!pool) { 1715 *error = "Error allocating memory for pool"; 1716 err_p = ERR_PTR(-ENOMEM); 1717 goto bad_pool; 1718 } 1719 1720 pool->pmd = pmd; 1721 pool->sectors_per_block = block_size; 1722 pool->block_shift = ffs(block_size) - 1; 1723 pool->offset_mask = block_size - 1; 1724 pool->low_water_blocks = 0; 1725 pool_features_init(&pool->pf); 1726 pool->prison = prison_create(PRISON_CELLS); 1727 if (!pool->prison) { 1728 *error = "Error creating pool's bio prison"; 1729 err_p = ERR_PTR(-ENOMEM); 1730 goto bad_prison; 1731 } 1732 1733 pool->copier = dm_kcopyd_client_create(); 1734 if (IS_ERR(pool->copier)) { 1735 r = PTR_ERR(pool->copier); 1736 *error = "Error creating pool's kcopyd client"; 1737 err_p = ERR_PTR(r); 1738 goto bad_kcopyd_client; 1739 } 1740 1741 /* 1742 * Create singlethreaded workqueue that will service all devices 1743 * that use this metadata. 
1744 */ 1745 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 1746 if (!pool->wq) { 1747 *error = "Error creating pool's workqueue"; 1748 err_p = ERR_PTR(-ENOMEM); 1749 goto bad_wq; 1750 } 1751 1752 INIT_WORK(&pool->worker, do_worker); 1753 INIT_DELAYED_WORK(&pool->waker, do_waker); 1754 spin_lock_init(&pool->lock); 1755 bio_list_init(&pool->deferred_bios); 1756 bio_list_init(&pool->deferred_flush_bios); 1757 INIT_LIST_HEAD(&pool->prepared_mappings); 1758 INIT_LIST_HEAD(&pool->prepared_discards); 1759 pool->low_water_triggered = 0; 1760 pool->no_free_space = 0; 1761 bio_list_init(&pool->retry_on_resume_list); 1762 ds_init(&pool->shared_read_ds); 1763 ds_init(&pool->all_io_ds); 1764 1765 pool->next_mapping = NULL; 1766 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, 1767 _new_mapping_cache); 1768 if (!pool->mapping_pool) { 1769 *error = "Error creating pool's mapping mempool"; 1770 err_p = ERR_PTR(-ENOMEM); 1771 goto bad_mapping_pool; 1772 } 1773 1774 pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE, 1775 _endio_hook_cache); 1776 if (!pool->endio_hook_pool) { 1777 *error = "Error creating pool's endio_hook mempool"; 1778 err_p = ERR_PTR(-ENOMEM); 1779 goto bad_endio_hook_pool; 1780 } 1781 pool->ref_count = 1; 1782 pool->last_commit_jiffies = jiffies; 1783 pool->pool_md = pool_md; 1784 pool->md_dev = metadata_dev; 1785 __pool_table_insert(pool); 1786 1787 return pool; 1788 1789 bad_endio_hook_pool: 1790 mempool_destroy(pool->mapping_pool); 1791 bad_mapping_pool: 1792 destroy_workqueue(pool->wq); 1793 bad_wq: 1794 dm_kcopyd_client_destroy(pool->copier); 1795 bad_kcopyd_client: 1796 prison_destroy(pool->prison); 1797 bad_prison: 1798 kfree(pool); 1799 bad_pool: 1800 if (dm_pool_metadata_close(pmd)) 1801 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1802 1803 return err_p; 1804 } 1805 1806 static void __pool_inc(struct pool *pool) 1807 { 1808 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1809 pool->ref_count++; 1810 } 1811 1812 static void __pool_dec(struct pool *pool) 1813 { 1814 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1815 BUG_ON(!pool->ref_count); 1816 if (!--pool->ref_count) 1817 __pool_destroy(pool); 1818 } 1819 1820 static struct pool *__pool_find(struct mapped_device *pool_md, 1821 struct block_device *metadata_dev, 1822 unsigned long block_size, char **error, 1823 int *created) 1824 { 1825 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1826 1827 if (pool) { 1828 if (pool->pool_md != pool_md) 1829 return ERR_PTR(-EBUSY); 1830 __pool_inc(pool); 1831 1832 } else { 1833 pool = __pool_table_lookup(pool_md); 1834 if (pool) { 1835 if (pool->md_dev != metadata_dev) 1836 return ERR_PTR(-EINVAL); 1837 __pool_inc(pool); 1838 1839 } else { 1840 pool = pool_create(pool_md, metadata_dev, block_size, error); 1841 *created = 1; 1842 } 1843 } 1844 1845 return pool; 1846 } 1847 1848 /*---------------------------------------------------------------- 1849 * Pool target methods 1850 *--------------------------------------------------------------*/ 1851 static void pool_dtr(struct dm_target *ti) 1852 { 1853 struct pool_c *pt = ti->private; 1854 1855 mutex_lock(&dm_thin_pool_table.mutex); 1856 1857 unbind_control_target(pt->pool, ti); 1858 __pool_dec(pt->pool); 1859 dm_put_device(ti, pt->metadata_dev); 1860 dm_put_device(ti, pt->data_dev); 1861 kfree(pt); 1862 1863 mutex_unlock(&dm_thin_pool_table.mutex); 1864 } 1865 1866 static int parse_pool_features(struct dm_arg_set *as, struct pool_features 
*pf, 1867 struct dm_target *ti) 1868 { 1869 int r; 1870 unsigned argc; 1871 const char *arg_name; 1872 1873 static struct dm_arg _args[] = { 1874 {0, 3, "Invalid number of pool feature arguments"}, 1875 }; 1876 1877 /* 1878 * No feature arguments supplied. 1879 */ 1880 if (!as->argc) 1881 return 0; 1882 1883 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1884 if (r) 1885 return -EINVAL; 1886 1887 while (argc && !r) { 1888 arg_name = dm_shift_arg(as); 1889 argc--; 1890 1891 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1892 pf->zero_new_blocks = 0; 1893 continue; 1894 } else if (!strcasecmp(arg_name, "ignore_discard")) { 1895 pf->discard_enabled = 0; 1896 continue; 1897 } else if (!strcasecmp(arg_name, "no_discard_passdown")) { 1898 pf->discard_passdown = 0; 1899 continue; 1900 } 1901 1902 ti->error = "Unrecognised pool feature requested"; 1903 r = -EINVAL; 1904 } 1905 1906 return r; 1907 } 1908 1909 /* 1910 * thin-pool <metadata dev> <data dev> 1911 * <data block size (sectors)> 1912 * <low water mark (blocks)> 1913 * [<#feature args> [<arg>]*] 1914 * 1915 * Optional feature arguments are: 1916 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1917 * ignore_discard: disable discard 1918 * no_discard_passdown: don't pass discards down to the data device 1919 */ 1920 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1921 { 1922 int r, pool_created = 0; 1923 struct pool_c *pt; 1924 struct pool *pool; 1925 struct pool_features pf; 1926 struct dm_arg_set as; 1927 struct dm_dev *data_dev; 1928 unsigned long block_size; 1929 dm_block_t low_water_blocks; 1930 struct dm_dev *metadata_dev; 1931 sector_t metadata_dev_size; 1932 char b[BDEVNAME_SIZE]; 1933 1934 /* 1935 * FIXME Remove validation from scope of lock. 1936 */ 1937 mutex_lock(&dm_thin_pool_table.mutex); 1938 1939 if (argc < 4) { 1940 ti->error = "Invalid argument count"; 1941 r = -EINVAL; 1942 goto out_unlock; 1943 } 1944 as.argc = argc; 1945 as.argv = argv; 1946 1947 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); 1948 if (r) { 1949 ti->error = "Error opening metadata block device"; 1950 goto out_unlock; 1951 } 1952 1953 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1954 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 1955 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1956 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1957 1958 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1959 if (r) { 1960 ti->error = "Error getting data device"; 1961 goto out_metadata; 1962 } 1963 1964 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 1965 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1966 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 1967 !is_power_of_2(block_size)) { 1968 ti->error = "Invalid block size"; 1969 r = -EINVAL; 1970 goto out; 1971 } 1972 1973 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 1974 ti->error = "Invalid low water mark"; 1975 r = -EINVAL; 1976 goto out; 1977 } 1978 1979 /* 1980 * Set default pool features. 
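 *
 * These defaults correspond to a table line with no feature arguments.
 * As a purely illustrative example (names and sizes invented):
 *
 *	0 2097152 thin-pool /dev/mapper/vg-tmeta /dev/mapper/vg-tdata 128 16384
 *
 * i.e. a 64KiB data block size (128 sectors, the minimum allowed) and a
 * low water mark of 16384 blocks; appending e.g. "1 skip_block_zeroing"
 * would clear pf.zero_new_blocks below.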
1981 */ 1982 pool_features_init(&pf); 1983 1984 dm_consume_args(&as, 4); 1985 r = parse_pool_features(&as, &pf, ti); 1986 if (r) 1987 goto out; 1988 1989 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 1990 if (!pt) { 1991 r = -ENOMEM; 1992 goto out; 1993 } 1994 1995 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1996 block_size, &ti->error, &pool_created); 1997 if (IS_ERR(pool)) { 1998 r = PTR_ERR(pool); 1999 goto out_free_pt; 2000 } 2001 2002 /* 2003 * 'pool_created' reflects whether this is the first table load. 2004 * Top level discard support is not allowed to be changed after 2005 * initial load. This would require a pool reload to trigger thin 2006 * device changes. 2007 */ 2008 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 2009 ti->error = "Discard support cannot be disabled once enabled"; 2010 r = -EINVAL; 2011 goto out_flags_changed; 2012 } 2013 2014 pt->pool = pool; 2015 pt->ti = ti; 2016 pt->metadata_dev = metadata_dev; 2017 pt->data_dev = data_dev; 2018 pt->low_water_blocks = low_water_blocks; 2019 pt->pf = pf; 2020 ti->num_flush_requests = 1; 2021 /* 2022 * Only need to enable discards if the pool should pass 2023 * them down to the data device. The thin device's discard 2024 * processing will cause mappings to be removed from the btree. 2025 */ 2026 if (pf.discard_enabled && pf.discard_passdown) { 2027 ti->num_discard_requests = 1; 2028 /* 2029 * Setting 'discards_supported' circumvents the normal 2030 * stacking of discard limits (this keeps the pool and 2031 * thin devices' discard limits consistent). 2032 */ 2033 ti->discards_supported = 1; 2034 } 2035 ti->private = pt; 2036 2037 pt->callbacks.congested_fn = pool_is_congested; 2038 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2039 2040 mutex_unlock(&dm_thin_pool_table.mutex); 2041 2042 return 0; 2043 2044 out_flags_changed: 2045 __pool_dec(pool); 2046 out_free_pt: 2047 kfree(pt); 2048 out: 2049 dm_put_device(ti, data_dev); 2050 out_metadata: 2051 dm_put_device(ti, metadata_dev); 2052 out_unlock: 2053 mutex_unlock(&dm_thin_pool_table.mutex); 2054 2055 return r; 2056 } 2057 2058 static int pool_map(struct dm_target *ti, struct bio *bio, 2059 union map_info *map_context) 2060 { 2061 int r; 2062 struct pool_c *pt = ti->private; 2063 struct pool *pool = pt->pool; 2064 unsigned long flags; 2065 2066 /* 2067 * As this is a singleton target, ti->begin is always zero. 2068 */ 2069 spin_lock_irqsave(&pool->lock, flags); 2070 bio->bi_bdev = pt->data_dev->bdev; 2071 r = DM_MAPIO_REMAPPED; 2072 spin_unlock_irqrestore(&pool->lock, flags); 2073 2074 return r; 2075 } 2076 2077 /* 2078 * Retrieves the number of blocks of the data device from 2079 * the superblock and compares it to the actual device size, 2080 * thus resizing the data device in case it has grown. 2081 * 2082 * This both copes with opening preallocated data devices in the ctr 2083 * being followed by a resume 2084 * -and- 2085 * calling the resume method individually after userspace has 2086 * grown the data device in reaction to a table event. 2087 */ 2088 static int pool_preresume(struct dm_target *ti) 2089 { 2090 int r; 2091 struct pool_c *pt = ti->private; 2092 struct pool *pool = pt->pool; 2093 dm_block_t data_size, sb_data_size; 2094 2095 /* 2096 * Take control of the pool object. 
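 *
 * After binding, the size check below compares ti->len (in sectors),
 * shifted down by block_shift, with the block count recorded in the
 * superblock. For example (numbers illustrative), with 128-sector blocks
 * block_shift is 7, so a 2097152-sector pool target is 16384 blocks; if
 * the superblock records fewer, dm_pool_resize_data_dev() grows the data
 * space map and the result is committed straight away.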

static int pool_map(struct dm_target *ti, struct bio *bio,
                    union map_info *map_context)
{
        int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
        unsigned long flags;

        /*
         * As this is a singleton target, ti->begin is always zero.
         */
        spin_lock_irqsave(&pool->lock, flags);
        bio->bi_bdev = pt->data_dev->bdev;
        r = DM_MAPIO_REMAPPED;
        spin_unlock_irqrestore(&pool->lock, flags);

        return r;
}

/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This handles both the case where a preallocated data device is opened in
 * the ctr and then followed by a resume, and the case where the resume
 * method is called on its own after userspace has grown the data device in
 * reaction to a table event.
 */
static int pool_preresume(struct dm_target *ti)
{
        int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
        dm_block_t data_size, sb_data_size;

        /*
         * Take control of the pool object.
         */
        r = bind_control_target(pool, ti);
        if (r)
                return r;

        data_size = ti->len >> pool->block_shift;
        r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
        if (r) {
                DMERR("failed to retrieve data device size");
                return r;
        }

        if (data_size < sb_data_size) {
                DMERR("pool target too small, is %llu blocks (expected %llu)",
                      data_size, sb_data_size);
                return -EINVAL;

        } else if (data_size > sb_data_size) {
                r = dm_pool_resize_data_dev(pool->pmd, data_size);
                if (r) {
                        DMERR("failed to resize data device");
                        return r;
                }

                r = dm_pool_commit_metadata(pool->pmd);
                if (r) {
                        DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
                              __func__, r);
                        return r;
                }
        }

        return 0;
}

static void pool_resume(struct dm_target *ti)
{
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        pool->low_water_triggered = 0;
        pool->no_free_space = 0;
        __requeue_bios(pool);
        spin_unlock_irqrestore(&pool->lock, flags);

        do_waker(&pool->waker.work);
}

static void pool_postsuspend(struct dm_target *ti)
{
        int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        cancel_delayed_work(&pool->waker);
        flush_workqueue(pool->wq);

        r = dm_pool_commit_metadata(pool->pmd);
        if (r < 0) {
                DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
                      __func__, r);
                /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
        }
}

static int check_arg_count(unsigned argc, unsigned args_required)
{
        if (argc != args_required) {
                DMWARN("Message received with %u arguments instead of %u.",
                       argc, args_required);
                return -EINVAL;
        }

        return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
        if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
            *dev_id <= MAX_DEV_ID)
                return 0;

        if (warning)
                DMWARN("Message received with invalid device id: %s", arg);

        return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id dev_id;
        int r;

        r = check_arg_count(argc, 2);
        if (r)
                return r;

        r = read_dev_id(argv[1], &dev_id, 1);
        if (r)
                return r;

        r = dm_pool_create_thin(pool->pmd, dev_id);
        if (r) {
                DMWARN("Creation of new thinly-provisioned device with id %s failed.",
                       argv[1]);
                return r;
        }

        return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id dev_id;
        dm_thin_id origin_dev_id;
        int r;

        r = check_arg_count(argc, 3);
        if (r)
                return r;

        r = read_dev_id(argv[1], &dev_id, 1);
        if (r)
                return r;

        r = read_dev_id(argv[2], &origin_dev_id, 1);
        if (r)
                return r;

        r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
        if (r) {
                DMWARN("Creation of new snapshot %s of device %s failed.",
                       argv[1], argv[2]);
                return r;
        }

        return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id dev_id;
        int r;

        r = check_arg_count(argc, 2);
        if (r)
                return r;

        r = read_dev_id(argv[1], &dev_id, 1);
        if (r)
                return r;

        r = dm_pool_delete_thin_device(pool->pmd, dev_id);
        if (r)
                DMWARN("Deletion of thin device %s failed.", argv[1]);

        return r;
}

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id old_id, new_id;
        int r;

        r = check_arg_count(argc, 3);
        if (r)
                return r;

        if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
                DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
                return -EINVAL;
        }

        if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
                DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
                return -EINVAL;
        }

        r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
        if (r) {
                DMWARN("Failed to change transaction id from %s to %s.",
                       argv[1], argv[2]);
                return r;
        }

        return 0;
}

static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
        int r;

        r = check_arg_count(argc, 1);
        if (r)
                return r;

        r = dm_pool_commit_metadata(pool->pmd);
        if (r) {
                DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
                      __func__, r);
                return r;
        }

        r = dm_pool_reserve_metadata_snap(pool->pmd);
        if (r)
                DMWARN("reserve_metadata_snap message failed.");

        return r;
}

static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
        int r;

        r = check_arg_count(argc, 1);
        if (r)
                return r;

        r = dm_pool_release_metadata_snap(pool->pmd);
        if (r)
                DMWARN("release_metadata_snap message failed.");

        return r;
}

/*
 * Messages supported:
 *   create_thin        <dev_id>
 *   create_snap        <dev_id> <origin_id>
 *   delete             <dev_id>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 *   reserve_metadata_snap
 *   release_metadata_snap
 */
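/*
 * Illustrative example only -- the pool device name and the ids below are
 * hypothetical.  These messages might be sent with dmsetup roughly as
 * follows:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 */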
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
        int r = -EINVAL;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        if (!strcasecmp(argv[0], "create_thin"))
                r = process_create_thin_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "create_snap"))
                r = process_create_snap_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "delete"))
                r = process_delete_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "set_transaction_id"))
                r = process_set_transaction_id_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
                r = process_reserve_metadata_snap_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "release_metadata_snap"))
                r = process_release_metadata_snap_mesg(argc, argv, pool);

        else
                DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

        if (!r) {
                r = dm_pool_commit_metadata(pool->pmd);
                if (r)
                        DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
                              argv[0], r);
        }

        return r;
}

/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 */
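/*
 * For example (the values are purely illustrative), 'dmsetup status' on a
 * pool might report a line such as:
 *
 *   0 141/4161600 10240/262144 -
 *
 * i.e. transaction id 0, 141 of 4161600 metadata blocks used, 10240 of
 * 262144 data blocks used, and no held metadata root ('-').
 */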
static int pool_status(struct dm_target *ti, status_type_t type,
                       char *result, unsigned maxlen)
{
        int r, count;
        unsigned sz = 0;
        uint64_t transaction_id;
        dm_block_t nr_free_blocks_data;
        dm_block_t nr_free_blocks_metadata;
        dm_block_t nr_blocks_data;
        dm_block_t nr_blocks_metadata;
        dm_block_t held_root;
        char buf[BDEVNAME_SIZE];
        char buf2[BDEVNAME_SIZE];
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        switch (type) {
        case STATUSTYPE_INFO:
                r = dm_pool_get_metadata_transaction_id(pool->pmd,
                                                        &transaction_id);
                if (r)
                        return r;

                r = dm_pool_get_free_metadata_block_count(pool->pmd,
                                                          &nr_free_blocks_metadata);
                if (r)
                        return r;

                r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
                if (r)
                        return r;

                r = dm_pool_get_free_block_count(pool->pmd,
                                                 &nr_free_blocks_data);
                if (r)
                        return r;

                r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
                if (r)
                        return r;

                r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
                if (r)
                        return r;

                DMEMIT("%llu %llu/%llu %llu/%llu ",
                       (unsigned long long)transaction_id,
                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
                       (unsigned long long)nr_blocks_metadata,
                       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
                       (unsigned long long)nr_blocks_data);

                if (held_root)
                        DMEMIT("%llu", held_root);
                else
                        DMEMIT("-");

                break;

        case STATUSTYPE_TABLE:
                DMEMIT("%s %s %lu %llu ",
                       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
                       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
                       (unsigned long)pool->sectors_per_block,
                       (unsigned long long)pt->low_water_blocks);

                count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
                        !pt->pf.discard_passdown;
                DMEMIT("%u ", count);

                if (!pool->pf.zero_new_blocks)
                        DMEMIT("skip_block_zeroing ");

                if (!pool->pf.discard_enabled)
                        DMEMIT("ignore_discard ");

                if (!pt->pf.discard_passdown)
                        DMEMIT("no_discard_passdown ");

                break;
        }

        return 0;
}

static int pool_iterate_devices(struct dm_target *ti,
                                iterate_devices_callout_fn fn, void *data)
{
        struct pool_c *pt = ti->private;

        return fn(ti, pt->data_dev, 0, ti->len, data);
}

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
                      struct bio_vec *biovec, int max_size)
{
        struct pool_c *pt = ti->private;
        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

        if (!q->merge_bvec_fn)
                return max_size;

        bvm->bi_bdev = pt->data_dev->bdev;

        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
{
        /*
         * FIXME: these limits may be incompatible with the pool's data device
         */
        limits->max_discard_sectors = pool->sectors_per_block;

        /*
         * This is just a hint, and not enforced.  We have to cope with
         * bios that overlap 2 blocks.
         */
        limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
        limits->discard_zeroes_data = pool->pf.zero_new_blocks;
}

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        blk_limits_io_min(limits, 0);
        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
        if (pool->pf.discard_enabled)
                set_discard_limits(pool, limits);
}

static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
        .version = {1, 2, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
        .map = pool_map,
        .postsuspend = pool_postsuspend,
        .preresume = pool_preresume,
        .resume = pool_resume,
        .message = pool_message,
        .status = pool_status,
        .merge = pool_merge,
        .iterate_devices = pool_iterate_devices,
        .io_hints = pool_io_hints,
};

/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
static void thin_dtr(struct dm_target *ti)
{
        struct thin_c *tc = ti->private;

        mutex_lock(&dm_thin_pool_table.mutex);

        __pool_dec(tc->pool);
        dm_pool_close_thin_device(tc->td);
        dm_put_device(ti, tc->pool_dev);
        if (tc->origin_dev)
                dm_put_device(ti, tc->origin_dev);
        kfree(tc);

        mutex_unlock(&dm_thin_pool_table.mutex);
}

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
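/*
 * Illustrative example only -- the pool path, device ids and sizes below
 * are hypothetical.  A thin device of the above form might be activated
 * with dmsetup roughly as follows, optionally naming an external origin:
 *
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *   dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1 /dev/vg/origin"
 */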
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        int r;
        struct thin_c *tc;
        struct dm_dev *pool_dev, *origin_dev;
        struct mapped_device *pool_md;

        mutex_lock(&dm_thin_pool_table.mutex);

        if (argc != 2 && argc != 3) {
                ti->error = "Invalid argument count";
                r = -EINVAL;
                goto out_unlock;
        }

        tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
        if (!tc) {
                ti->error = "Out of memory";
                r = -ENOMEM;
                goto out_unlock;
        }

        if (argc == 3) {
                r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
                if (r) {
                        ti->error = "Error opening origin device";
                        goto bad_origin_dev;
                }
                tc->origin_dev = origin_dev;
        }

        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
        if (r) {
                ti->error = "Error opening pool device";
                goto bad_pool_dev;
        }
        tc->pool_dev = pool_dev;

        if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
                ti->error = "Invalid device id";
                r = -EINVAL;
                goto bad_common;
        }

        pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
        if (!pool_md) {
                ti->error = "Couldn't get pool mapped device";
                r = -EINVAL;
                goto bad_common;
        }

        tc->pool = __pool_table_lookup(pool_md);
        if (!tc->pool) {
                ti->error = "Couldn't find pool object";
                r = -EINVAL;
                goto bad_pool_lookup;
        }
        __pool_inc(tc->pool);

        r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
        if (r) {
                ti->error = "Couldn't open thin internal device";
                goto bad_thin_open;
        }

        ti->split_io = tc->pool->sectors_per_block;
        ti->num_flush_requests = 1;

        /* In case the pool supports discards, pass them on. */
        if (tc->pool->pf.discard_enabled) {
                ti->discards_supported = 1;
                ti->num_discard_requests = 1;
        }

        dm_put(pool_md);

        mutex_unlock(&dm_thin_pool_table.mutex);

        return 0;

bad_thin_open:
        __pool_dec(tc->pool);
bad_pool_lookup:
        dm_put(pool_md);
bad_common:
        dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
        if (tc->origin_dev)
                dm_put_device(ti, tc->origin_dev);
bad_origin_dev:
        kfree(tc);
out_unlock:
        mutex_unlock(&dm_thin_pool_table.mutex);

        return r;
}

static int thin_map(struct dm_target *ti, struct bio *bio,
                    union map_info *map_context)
{
        bio->bi_sector = dm_target_offset(ti, bio->bi_sector);

        return thin_bio_map(ti, bio, map_context);
}

static int thin_endio(struct dm_target *ti,
                      struct bio *bio, int err,
                      union map_info *map_context)
{
        unsigned long flags;
        struct dm_thin_endio_hook *h = map_context->ptr;
        struct list_head work;
        struct dm_thin_new_mapping *m, *tmp;
        struct pool *pool = h->tc->pool;

        if (h->shared_read_entry) {
                INIT_LIST_HEAD(&work);
                ds_dec(h->shared_read_entry, &work);

                spin_lock_irqsave(&pool->lock, flags);
                list_for_each_entry_safe(m, tmp, &work, list) {
                        list_del(&m->list);
                        m->quiesced = 1;
                        __maybe_add_mapping(m);
                }
                spin_unlock_irqrestore(&pool->lock, flags);
        }

        if (h->all_io_entry) {
                INIT_LIST_HEAD(&work);
                ds_dec(h->all_io_entry, &work);
                spin_lock_irqsave(&pool->lock, flags);
                list_for_each_entry_safe(m, tmp, &work, list)
                        list_add(&m->list, &pool->prepared_discards);
                spin_unlock_irqrestore(&pool->lock, flags);
        }

        mempool_free(h, pool->endio_hook_pool);

        return 0;
}

static void thin_postsuspend(struct dm_target *ti)
{
        if (dm_noflush_suspending(ti))
                requeue_io((struct thin_c *)ti->private);
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
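/*
 * For example (the values are purely illustrative), 'dmsetup status' on a
 * thin device might report:
 *
 *   1024000 1048575
 *
 * i.e. 1024000 sectors are mapped and the highest mapped sector is 1048575;
 * a '-' replaces the second field when nothing is mapped.
 */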
static int thin_status(struct dm_target *ti, status_type_t type,
                       char *result, unsigned maxlen)
{
        int r;
        ssize_t sz = 0;
        dm_block_t mapped, highest;
        char buf[BDEVNAME_SIZE];
        struct thin_c *tc = ti->private;

        if (!tc->td)
                DMEMIT("-");
        else {
                switch (type) {
                case STATUSTYPE_INFO:
                        r = dm_thin_get_mapped_count(tc->td, &mapped);
                        if (r)
                                return r;

                        r = dm_thin_get_highest_mapped_block(tc->td, &highest);
                        if (r < 0)
                                return r;

                        DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
                        if (r)
                                DMEMIT("%llu", ((highest + 1) *
                                                tc->pool->sectors_per_block) - 1);
                        else
                                DMEMIT("-");
                        break;

                case STATUSTYPE_TABLE:
                        DMEMIT("%s %lu",
                               format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
                               (unsigned long) tc->dev_id);
                        if (tc->origin_dev)
                                DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
                        break;
                }
        }

        return 0;
}

static int thin_iterate_devices(struct dm_target *ti,
                                iterate_devices_callout_fn fn, void *data)
{
        dm_block_t blocks;
        struct thin_c *tc = ti->private;

        /*
         * We can't call dm_pool_get_data_dev_size() since that blocks.  So
         * we follow a more convoluted path through to the pool's target.
         */
        if (!tc->pool->ti)
                return 0;       /* nothing is bound */

        blocks = tc->pool->ti->len >> tc->pool->block_shift;
        if (blocks)
                return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);

        return 0;
}

static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
        struct thin_c *tc = ti->private;
        struct pool *pool = tc->pool;

        blk_limits_io_min(limits, 0);
        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
        set_discard_limits(pool, limits);
}

static struct target_type thin_target = {
        .name = "thin",
        .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
        .map = thin_map,
        .end_io = thin_endio,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
        .iterate_devices = thin_iterate_devices,
        .io_hints = thin_io_hints,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
        int r;

        pool_table_init();

        r = dm_register_target(&thin_target);
        if (r)
                return r;

        r = dm_register_target(&pool_target);
        if (r)
                goto bad_pool_target;

        r = -ENOMEM;

        _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
        if (!_cell_cache)
                goto bad_cell_cache;

        _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
        if (!_new_mapping_cache)
                goto bad_new_mapping_cache;

        _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
        if (!_endio_hook_cache)
                goto bad_endio_hook_cache;

        return 0;

bad_endio_hook_cache:
        kmem_cache_destroy(_new_mapping_cache);
bad_new_mapping_cache:
        kmem_cache_destroy(_cell_cache);
bad_cell_cache:
        dm_unregister_target(&pool_target);
bad_pool_target:
        dm_unregister_target(&thin_target);

        return r;
}

static void dm_thin_exit(void)
{
        dm_unregister_target(&thin_target);
        dm_unregister_target(&pool_target);

        kmem_cache_destroy(_cell_cache);
        kmem_cache_destroy(_new_mapping_cache);
        kmem_cache_destroy(_endio_hook_cache);
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");