/*
 * Copyright (C) 2011 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug further io to this physical block (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block, including from all
 * devices that share this block (see deferred_set code).
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * skipped if the io covers the whole block (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block, as it would after
 * the commit.
 *
 * The downside of this scheme is that the timestamp magic isn't perfect:
 * it will continue to think that the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.  I suspect
 * data blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away, so we put it in prison
 * where it can't cause any mischief.  Bios are put in a cell identified
 * by a key; multiple bios can be in the same cell.  When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	struct bio *holder;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
							sizeof(struct cell));
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct cell *__search_bucket(struct hlist_head *bucket,
				    struct cell_key *key)
{
	struct cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}

/*
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
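 *
 * Illustrative usage only (not a real call site): callers in this file
 * build a key, try to detain the bio, and back off if another bio already
 * holds the cell, e.g.
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(tc->pool->prison, &key, bio, &cell))
 *		return;	/* someone else owns this block for now */
 *	/* ... do the work, then cell_release()/cell_error() the cell */
 *
 * Every path must eventually release (or error) the cell, otherwise the
 * detained bios are never re-issued.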
215 */ 216 static int bio_detain(struct bio_prison *prison, struct cell_key *key, 217 struct bio *inmate, struct cell **ref) 218 { 219 int r = 1; 220 unsigned long flags; 221 uint32_t hash = hash_key(prison, key); 222 struct cell *cell, *cell2; 223 224 BUG_ON(hash > prison->nr_buckets); 225 226 spin_lock_irqsave(&prison->lock, flags); 227 228 cell = __search_bucket(prison->cells + hash, key); 229 if (cell) { 230 bio_list_add(&cell->bios, inmate); 231 goto out; 232 } 233 234 /* 235 * Allocate a new cell 236 */ 237 spin_unlock_irqrestore(&prison->lock, flags); 238 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 239 spin_lock_irqsave(&prison->lock, flags); 240 241 /* 242 * We've been unlocked, so we have to double check that 243 * nobody else has inserted this cell in the meantime. 244 */ 245 cell = __search_bucket(prison->cells + hash, key); 246 if (cell) { 247 mempool_free(cell2, prison->cell_pool); 248 bio_list_add(&cell->bios, inmate); 249 goto out; 250 } 251 252 /* 253 * Use new cell. 254 */ 255 cell = cell2; 256 257 cell->prison = prison; 258 memcpy(&cell->key, key, sizeof(cell->key)); 259 cell->holder = inmate; 260 bio_list_init(&cell->bios); 261 hlist_add_head(&cell->list, prison->cells + hash); 262 263 r = 0; 264 265 out: 266 spin_unlock_irqrestore(&prison->lock, flags); 267 268 *ref = cell; 269 270 return r; 271 } 272 273 /* 274 * @inmates must have been initialised prior to this call 275 */ 276 static void __cell_release(struct cell *cell, struct bio_list *inmates) 277 { 278 struct bio_prison *prison = cell->prison; 279 280 hlist_del(&cell->list); 281 282 bio_list_add(inmates, cell->holder); 283 bio_list_merge(inmates, &cell->bios); 284 285 mempool_free(cell, prison->cell_pool); 286 } 287 288 static void cell_release(struct cell *cell, struct bio_list *bios) 289 { 290 unsigned long flags; 291 struct bio_prison *prison = cell->prison; 292 293 spin_lock_irqsave(&prison->lock, flags); 294 __cell_release(cell, bios); 295 spin_unlock_irqrestore(&prison->lock, flags); 296 } 297 298 /* 299 * There are a couple of places where we put a bio into a cell briefly 300 * before taking it out again. In these situations we know that no other 301 * bio may be in the cell. This function releases the cell, and also does 302 * a sanity check. 303 */ 304 static void __cell_release_singleton(struct cell *cell, struct bio *bio) 305 { 306 hlist_del(&cell->list); 307 BUG_ON(cell->holder != bio); 308 BUG_ON(!bio_list_empty(&cell->bios)); 309 } 310 311 static void cell_release_singleton(struct cell *cell, struct bio *bio) 312 { 313 unsigned long flags; 314 struct bio_prison *prison = cell->prison; 315 316 spin_lock_irqsave(&prison->lock, flags); 317 __cell_release_singleton(cell, bio); 318 spin_unlock_irqrestore(&prison->lock, flags); 319 } 320 321 /* 322 * Sometimes we don't want the holder, just the additional bios. 
323 */ 324 static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) 325 { 326 struct bio_prison *prison = cell->prison; 327 328 hlist_del(&cell->list); 329 bio_list_merge(inmates, &cell->bios); 330 331 mempool_free(cell, prison->cell_pool); 332 } 333 334 static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) 335 { 336 unsigned long flags; 337 struct bio_prison *prison = cell->prison; 338 339 spin_lock_irqsave(&prison->lock, flags); 340 __cell_release_no_holder(cell, inmates); 341 spin_unlock_irqrestore(&prison->lock, flags); 342 } 343 344 static void cell_error(struct cell *cell) 345 { 346 struct bio_prison *prison = cell->prison; 347 struct bio_list bios; 348 struct bio *bio; 349 unsigned long flags; 350 351 bio_list_init(&bios); 352 353 spin_lock_irqsave(&prison->lock, flags); 354 __cell_release(cell, &bios); 355 spin_unlock_irqrestore(&prison->lock, flags); 356 357 while ((bio = bio_list_pop(&bios))) 358 bio_io_error(bio); 359 } 360 361 /*----------------------------------------------------------------*/ 362 363 /* 364 * We use the deferred set to keep track of pending reads to shared blocks. 365 * We do this to ensure the new mapping caused by a write isn't performed 366 * until these prior reads have completed. Otherwise the insertion of the 367 * new mapping could free the old block that the read bios are mapped to. 368 */ 369 370 struct deferred_set; 371 struct deferred_entry { 372 struct deferred_set *ds; 373 unsigned count; 374 struct list_head work_items; 375 }; 376 377 struct deferred_set { 378 spinlock_t lock; 379 unsigned current_entry; 380 unsigned sweeper; 381 struct deferred_entry entries[DEFERRED_SET_SIZE]; 382 }; 383 384 static void ds_init(struct deferred_set *ds) 385 { 386 int i; 387 388 spin_lock_init(&ds->lock); 389 ds->current_entry = 0; 390 ds->sweeper = 0; 391 for (i = 0; i < DEFERRED_SET_SIZE; i++) { 392 ds->entries[i].ds = ds; 393 ds->entries[i].count = 0; 394 INIT_LIST_HEAD(&ds->entries[i].work_items); 395 } 396 } 397 398 static struct deferred_entry *ds_inc(struct deferred_set *ds) 399 { 400 unsigned long flags; 401 struct deferred_entry *entry; 402 403 spin_lock_irqsave(&ds->lock, flags); 404 entry = ds->entries + ds->current_entry; 405 entry->count++; 406 spin_unlock_irqrestore(&ds->lock, flags); 407 408 return entry; 409 } 410 411 static unsigned ds_next(unsigned index) 412 { 413 return (index + 1) % DEFERRED_SET_SIZE; 414 } 415 416 static void __sweep(struct deferred_set *ds, struct list_head *head) 417 { 418 while ((ds->sweeper != ds->current_entry) && 419 !ds->entries[ds->sweeper].count) { 420 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 421 ds->sweeper = ds_next(ds->sweeper); 422 } 423 424 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) 425 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 426 } 427 428 static void ds_dec(struct deferred_entry *entry, struct list_head *head) 429 { 430 unsigned long flags; 431 432 spin_lock_irqsave(&entry->ds->lock, flags); 433 BUG_ON(!entry->count); 434 --entry->count; 435 __sweep(entry->ds, head); 436 spin_unlock_irqrestore(&entry->ds->lock, flags); 437 } 438 439 /* 440 * Returns 1 if deferred or 0 if no pending items to delay job. 
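 *
 * Roughly, the deferred set is a small ring of DEFERRED_SET_SIZE entries.
 * In-flight io takes a reference on the current entry (ds_inc) and drops
 * it on completion (ds_dec); deferred work is queued against the current
 * entry here and is only handed back by __sweep() once the entry it was
 * queued on, and every earlier entry, has dropped its count to zero.
 * For example, a write that needs to break sharing is parked until all
 * reads that were already in flight against the shared block have
 * completed.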
441 */ 442 static int ds_add_work(struct deferred_set *ds, struct list_head *work) 443 { 444 int r = 1; 445 unsigned long flags; 446 unsigned next_entry; 447 448 spin_lock_irqsave(&ds->lock, flags); 449 if ((ds->sweeper == ds->current_entry) && 450 !ds->entries[ds->current_entry].count) 451 r = 0; 452 else { 453 list_add(work, &ds->entries[ds->current_entry].work_items); 454 next_entry = ds_next(ds->current_entry); 455 if (!ds->entries[next_entry].count) 456 ds->current_entry = next_entry; 457 } 458 spin_unlock_irqrestore(&ds->lock, flags); 459 460 return r; 461 } 462 463 /*----------------------------------------------------------------*/ 464 465 /* 466 * Key building. 467 */ 468 static void build_data_key(struct dm_thin_device *td, 469 dm_block_t b, struct cell_key *key) 470 { 471 key->virtual = 0; 472 key->dev = dm_thin_dev_id(td); 473 key->block = b; 474 } 475 476 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 477 struct cell_key *key) 478 { 479 key->virtual = 1; 480 key->dev = dm_thin_dev_id(td); 481 key->block = b; 482 } 483 484 /*----------------------------------------------------------------*/ 485 486 /* 487 * A pool device ties together a metadata device and a data device. It 488 * also provides the interface for creating and destroying internal 489 * devices. 490 */ 491 struct new_mapping; 492 493 struct pool_features { 494 unsigned zero_new_blocks:1; 495 unsigned discard_enabled:1; 496 unsigned discard_passdown:1; 497 }; 498 499 struct pool { 500 struct list_head list; 501 struct dm_target *ti; /* Only set if a pool target is bound */ 502 503 struct mapped_device *pool_md; 504 struct block_device *md_dev; 505 struct dm_pool_metadata *pmd; 506 507 uint32_t sectors_per_block; 508 unsigned block_shift; 509 dm_block_t offset_mask; 510 dm_block_t low_water_blocks; 511 512 struct pool_features pf; 513 unsigned low_water_triggered:1; /* A dm event has been sent */ 514 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 515 516 struct bio_prison *prison; 517 struct dm_kcopyd_client *copier; 518 519 struct workqueue_struct *wq; 520 struct work_struct worker; 521 struct delayed_work waker; 522 523 unsigned ref_count; 524 unsigned long last_commit_jiffies; 525 526 spinlock_t lock; 527 struct bio_list deferred_bios; 528 struct bio_list deferred_flush_bios; 529 struct list_head prepared_mappings; 530 struct list_head prepared_discards; 531 532 struct bio_list retry_on_resume_list; 533 534 struct deferred_set shared_read_ds; 535 struct deferred_set all_io_ds; 536 537 struct new_mapping *next_mapping; 538 mempool_t *mapping_pool; 539 mempool_t *endio_hook_pool; 540 }; 541 542 /* 543 * Target context for a pool. 544 */ 545 struct pool_c { 546 struct dm_target *ti; 547 struct pool *pool; 548 struct dm_dev *data_dev; 549 struct dm_dev *metadata_dev; 550 struct dm_target_callbacks callbacks; 551 552 dm_block_t low_water_blocks; 553 struct pool_features pf; 554 }; 555 556 /* 557 * Target context for a thin. 558 */ 559 struct thin_c { 560 struct dm_dev *pool_dev; 561 struct dm_dev *origin_dev; 562 dm_thin_id dev_id; 563 564 struct pool *pool; 565 struct dm_thin_device *td; 566 }; 567 568 /*----------------------------------------------------------------*/ 569 570 /* 571 * A global list of pools that uses a struct mapped_device as a key. 
572 */ 573 static struct dm_thin_pool_table { 574 struct mutex mutex; 575 struct list_head pools; 576 } dm_thin_pool_table; 577 578 static void pool_table_init(void) 579 { 580 mutex_init(&dm_thin_pool_table.mutex); 581 INIT_LIST_HEAD(&dm_thin_pool_table.pools); 582 } 583 584 static void __pool_table_insert(struct pool *pool) 585 { 586 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 587 list_add(&pool->list, &dm_thin_pool_table.pools); 588 } 589 590 static void __pool_table_remove(struct pool *pool) 591 { 592 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 593 list_del(&pool->list); 594 } 595 596 static struct pool *__pool_table_lookup(struct mapped_device *md) 597 { 598 struct pool *pool = NULL, *tmp; 599 600 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 601 602 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 603 if (tmp->pool_md == md) { 604 pool = tmp; 605 break; 606 } 607 } 608 609 return pool; 610 } 611 612 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) 613 { 614 struct pool *pool = NULL, *tmp; 615 616 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 617 618 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 619 if (tmp->md_dev == md_dev) { 620 pool = tmp; 621 break; 622 } 623 } 624 625 return pool; 626 } 627 628 /*----------------------------------------------------------------*/ 629 630 struct endio_hook { 631 struct thin_c *tc; 632 struct deferred_entry *shared_read_entry; 633 struct deferred_entry *all_io_entry; 634 struct new_mapping *overwrite_mapping; 635 }; 636 637 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 638 { 639 struct bio *bio; 640 struct bio_list bios; 641 642 bio_list_init(&bios); 643 bio_list_merge(&bios, master); 644 bio_list_init(master); 645 646 while ((bio = bio_list_pop(&bios))) { 647 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 648 if (h->tc == tc) 649 bio_endio(bio, DM_ENDIO_REQUEUE); 650 else 651 bio_list_add(master, bio); 652 } 653 } 654 655 static void requeue_io(struct thin_c *tc) 656 { 657 struct pool *pool = tc->pool; 658 unsigned long flags; 659 660 spin_lock_irqsave(&pool->lock, flags); 661 __requeue_bio_list(tc, &pool->deferred_bios); 662 __requeue_bio_list(tc, &pool->retry_on_resume_list); 663 spin_unlock_irqrestore(&pool->lock, flags); 664 } 665 666 /* 667 * This section of code contains the logic for processing a thin device's IO. 668 * Much of the code depends on pool object resources (lists, workqueues, etc) 669 * but most is exclusively called from the thin target rather than the thin-pool 670 * target. 671 */ 672 673 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 674 { 675 return bio->bi_sector >> tc->pool->block_shift; 676 } 677 678 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 679 { 680 struct pool *pool = tc->pool; 681 682 bio->bi_bdev = tc->pool_dev->bdev; 683 bio->bi_sector = (block << pool->block_shift) + 684 (bio->bi_sector & pool->offset_mask); 685 } 686 687 static void remap_to_origin(struct thin_c *tc, struct bio *bio) 688 { 689 bio->bi_bdev = tc->origin_dev->bdev; 690 } 691 692 static void issue(struct thin_c *tc, struct bio *bio) 693 { 694 struct pool *pool = tc->pool; 695 unsigned long flags; 696 697 /* 698 * Batch together any FUA/FLUSH bios we find and then issue 699 * a single commit for them in process_deferred_bios(). 
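 *
 * The commit happens before these bios are issued (see the
 * deferred_flush_bios handling in process_deferred_bios()), so the
 * mappings they depend on are stable on disk by the time the flush or
 * FUA completes.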
700 */ 701 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 702 spin_lock_irqsave(&pool->lock, flags); 703 bio_list_add(&pool->deferred_flush_bios, bio); 704 spin_unlock_irqrestore(&pool->lock, flags); 705 } else 706 generic_make_request(bio); 707 } 708 709 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 710 { 711 remap_to_origin(tc, bio); 712 issue(tc, bio); 713 } 714 715 static void remap_and_issue(struct thin_c *tc, struct bio *bio, 716 dm_block_t block) 717 { 718 remap(tc, bio, block); 719 issue(tc, bio); 720 } 721 722 /* 723 * wake_worker() is used when new work is queued and when pool_resume is 724 * ready to continue deferred IO processing. 725 */ 726 static void wake_worker(struct pool *pool) 727 { 728 queue_work(pool->wq, &pool->worker); 729 } 730 731 /*----------------------------------------------------------------*/ 732 733 /* 734 * Bio endio functions. 735 */ 736 struct new_mapping { 737 struct list_head list; 738 739 unsigned quiesced:1; 740 unsigned prepared:1; 741 unsigned pass_discard:1; 742 743 struct thin_c *tc; 744 dm_block_t virt_block; 745 dm_block_t data_block; 746 struct cell *cell, *cell2; 747 int err; 748 749 /* 750 * If the bio covers the whole area of a block then we can avoid 751 * zeroing or copying. Instead this bio is hooked. The bio will 752 * still be in the cell, so care has to be taken to avoid issuing 753 * the bio twice. 754 */ 755 struct bio *bio; 756 bio_end_io_t *saved_bi_end_io; 757 }; 758 759 static void __maybe_add_mapping(struct new_mapping *m) 760 { 761 struct pool *pool = m->tc->pool; 762 763 if (m->quiesced && m->prepared) { 764 list_add(&m->list, &pool->prepared_mappings); 765 wake_worker(pool); 766 } 767 } 768 769 static void copy_complete(int read_err, unsigned long write_err, void *context) 770 { 771 unsigned long flags; 772 struct new_mapping *m = context; 773 struct pool *pool = m->tc->pool; 774 775 m->err = read_err || write_err ? -EIO : 0; 776 777 spin_lock_irqsave(&pool->lock, flags); 778 m->prepared = 1; 779 __maybe_add_mapping(m); 780 spin_unlock_irqrestore(&pool->lock, flags); 781 } 782 783 static void overwrite_endio(struct bio *bio, int err) 784 { 785 unsigned long flags; 786 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 787 struct new_mapping *m = h->overwrite_mapping; 788 struct pool *pool = m->tc->pool; 789 790 m->err = err; 791 792 spin_lock_irqsave(&pool->lock, flags); 793 m->prepared = 1; 794 __maybe_add_mapping(m); 795 spin_unlock_irqrestore(&pool->lock, flags); 796 } 797 798 /*----------------------------------------------------------------*/ 799 800 /* 801 * Workqueue. 802 */ 803 804 /* 805 * Prepared mapping jobs. 806 */ 807 808 /* 809 * This sends the bios in the cell back to the deferred_bios list. 810 */ 811 static void cell_defer(struct thin_c *tc, struct cell *cell, 812 dm_block_t data_block) 813 { 814 struct pool *pool = tc->pool; 815 unsigned long flags; 816 817 spin_lock_irqsave(&pool->lock, flags); 818 cell_release(cell, &pool->deferred_bios); 819 spin_unlock_irqrestore(&tc->pool->lock, flags); 820 821 wake_worker(pool); 822 } 823 824 /* 825 * Same as cell_defer above, except it omits one particular detainee, 826 * a write bio that covers the block and has already been processed. 
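 *
 * The omitted bio is the cell's holder: cell_release_no_holder() hands
 * back only the additional detainees, so the holder, which has already
 * been remapped or completed elsewhere, is not issued a second time.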
827 */ 828 static void cell_defer_except(struct thin_c *tc, struct cell *cell) 829 { 830 struct bio_list bios; 831 struct pool *pool = tc->pool; 832 unsigned long flags; 833 834 bio_list_init(&bios); 835 836 spin_lock_irqsave(&pool->lock, flags); 837 cell_release_no_holder(cell, &pool->deferred_bios); 838 spin_unlock_irqrestore(&pool->lock, flags); 839 840 wake_worker(pool); 841 } 842 843 static void process_prepared_mapping(struct new_mapping *m) 844 { 845 struct thin_c *tc = m->tc; 846 struct bio *bio; 847 int r; 848 849 bio = m->bio; 850 if (bio) 851 bio->bi_end_io = m->saved_bi_end_io; 852 853 if (m->err) { 854 cell_error(m->cell); 855 return; 856 } 857 858 /* 859 * Commit the prepared block into the mapping btree. 860 * Any I/O for this block arriving after this point will get 861 * remapped to it directly. 862 */ 863 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 864 if (r) { 865 DMERR("dm_thin_insert_block() failed"); 866 cell_error(m->cell); 867 return; 868 } 869 870 /* 871 * Release any bios held while the block was being provisioned. 872 * If we are processing a write bio that completely covers the block, 873 * we already processed it so can ignore it now when processing 874 * the bios in the cell. 875 */ 876 if (bio) { 877 cell_defer_except(tc, m->cell); 878 bio_endio(bio, 0); 879 } else 880 cell_defer(tc, m->cell, m->data_block); 881 882 list_del(&m->list); 883 mempool_free(m, tc->pool->mapping_pool); 884 } 885 886 static void process_prepared_discard(struct new_mapping *m) 887 { 888 int r; 889 struct thin_c *tc = m->tc; 890 891 r = dm_thin_remove_block(tc->td, m->virt_block); 892 if (r) 893 DMERR("dm_thin_remove_block() failed"); 894 895 /* 896 * Pass the discard down to the underlying device? 897 */ 898 if (m->pass_discard) 899 remap_and_issue(tc, m->bio, m->data_block); 900 else 901 bio_endio(m->bio, 0); 902 903 cell_defer_except(tc, m->cell); 904 cell_defer_except(tc, m->cell2); 905 mempool_free(m, tc->pool->mapping_pool); 906 } 907 908 static void process_prepared(struct pool *pool, struct list_head *head, 909 void (*fn)(struct new_mapping *)) 910 { 911 unsigned long flags; 912 struct list_head maps; 913 struct new_mapping *m, *tmp; 914 915 INIT_LIST_HEAD(&maps); 916 spin_lock_irqsave(&pool->lock, flags); 917 list_splice_init(head, &maps); 918 spin_unlock_irqrestore(&pool->lock, flags); 919 920 list_for_each_entry_safe(m, tmp, &maps, list) 921 fn(m); 922 } 923 924 /* 925 * Deferred bio jobs. 926 */ 927 static int io_overlaps_block(struct pool *pool, struct bio *bio) 928 { 929 return !(bio->bi_sector & pool->offset_mask) && 930 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 931 932 } 933 934 static int io_overwrites_block(struct pool *pool, struct bio *bio) 935 { 936 return (bio_data_dir(bio) == WRITE) && 937 io_overlaps_block(pool, bio); 938 } 939 940 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 941 bio_end_io_t *fn) 942 { 943 *save = bio->bi_end_io; 944 bio->bi_end_io = fn; 945 } 946 947 static int ensure_next_mapping(struct pool *pool) 948 { 949 if (pool->next_mapping) 950 return 0; 951 952 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); 953 954 return pool->next_mapping ? 
0 : -ENOMEM; 955 } 956 957 static struct new_mapping *get_next_mapping(struct pool *pool) 958 { 959 struct new_mapping *r = pool->next_mapping; 960 961 BUG_ON(!pool->next_mapping); 962 963 pool->next_mapping = NULL; 964 965 return r; 966 } 967 968 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 969 struct dm_dev *origin, dm_block_t data_origin, 970 dm_block_t data_dest, 971 struct cell *cell, struct bio *bio) 972 { 973 int r; 974 struct pool *pool = tc->pool; 975 struct new_mapping *m = get_next_mapping(pool); 976 977 INIT_LIST_HEAD(&m->list); 978 m->quiesced = 0; 979 m->prepared = 0; 980 m->tc = tc; 981 m->virt_block = virt_block; 982 m->data_block = data_dest; 983 m->cell = cell; 984 m->err = 0; 985 m->bio = NULL; 986 987 if (!ds_add_work(&pool->shared_read_ds, &m->list)) 988 m->quiesced = 1; 989 990 /* 991 * IO to pool_dev remaps to the pool target's data_dev. 992 * 993 * If the whole block of data is being overwritten, we can issue the 994 * bio immediately. Otherwise we use kcopyd to clone the data first. 995 */ 996 if (io_overwrites_block(pool, bio)) { 997 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 998 h->overwrite_mapping = m; 999 m->bio = bio; 1000 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1001 remap_and_issue(tc, bio, data_dest); 1002 } else { 1003 struct dm_io_region from, to; 1004 1005 from.bdev = origin->bdev; 1006 from.sector = data_origin * pool->sectors_per_block; 1007 from.count = pool->sectors_per_block; 1008 1009 to.bdev = tc->pool_dev->bdev; 1010 to.sector = data_dest * pool->sectors_per_block; 1011 to.count = pool->sectors_per_block; 1012 1013 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 1014 0, copy_complete, m); 1015 if (r < 0) { 1016 mempool_free(m, pool->mapping_pool); 1017 DMERR("dm_kcopyd_copy() failed"); 1018 cell_error(cell); 1019 } 1020 } 1021 } 1022 1023 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1024 dm_block_t data_origin, dm_block_t data_dest, 1025 struct cell *cell, struct bio *bio) 1026 { 1027 schedule_copy(tc, virt_block, tc->pool_dev, 1028 data_origin, data_dest, cell, bio); 1029 } 1030 1031 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1032 dm_block_t data_dest, 1033 struct cell *cell, struct bio *bio) 1034 { 1035 schedule_copy(tc, virt_block, tc->origin_dev, 1036 virt_block, data_dest, cell, bio); 1037 } 1038 1039 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1040 dm_block_t data_block, struct cell *cell, 1041 struct bio *bio) 1042 { 1043 struct pool *pool = tc->pool; 1044 struct new_mapping *m = get_next_mapping(pool); 1045 1046 INIT_LIST_HEAD(&m->list); 1047 m->quiesced = 1; 1048 m->prepared = 0; 1049 m->tc = tc; 1050 m->virt_block = virt_block; 1051 m->data_block = data_block; 1052 m->cell = cell; 1053 m->err = 0; 1054 m->bio = NULL; 1055 1056 /* 1057 * If the whole block of data is being overwritten or we are not 1058 * zeroing pre-existing data, we can issue the bio immediately. 1059 * Otherwise we use kcopyd to zero the data first. 
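 *
 * In other words there are three cases below: zeroing disabled (treat the
 * mapping as already prepared), a write that covers the whole block (hook
 * its endio and let the write itself initialise the block), or a partial
 * io (ask kcopyd to zero the new block and finish the mapping from
 * copy_complete()).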
1060 */ 1061 if (!pool->pf.zero_new_blocks) 1062 process_prepared_mapping(m); 1063 1064 else if (io_overwrites_block(pool, bio)) { 1065 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1066 h->overwrite_mapping = m; 1067 m->bio = bio; 1068 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1069 remap_and_issue(tc, bio, data_block); 1070 1071 } else { 1072 int r; 1073 struct dm_io_region to; 1074 1075 to.bdev = tc->pool_dev->bdev; 1076 to.sector = data_block * pool->sectors_per_block; 1077 to.count = pool->sectors_per_block; 1078 1079 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 1080 if (r < 0) { 1081 mempool_free(m, pool->mapping_pool); 1082 DMERR("dm_kcopyd_zero() failed"); 1083 cell_error(cell); 1084 } 1085 } 1086 } 1087 1088 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1089 { 1090 int r; 1091 dm_block_t free_blocks; 1092 unsigned long flags; 1093 struct pool *pool = tc->pool; 1094 1095 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1096 if (r) 1097 return r; 1098 1099 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1100 DMWARN("%s: reached low water mark, sending event.", 1101 dm_device_name(pool->pool_md)); 1102 spin_lock_irqsave(&pool->lock, flags); 1103 pool->low_water_triggered = 1; 1104 spin_unlock_irqrestore(&pool->lock, flags); 1105 dm_table_event(pool->ti->table); 1106 } 1107 1108 if (!free_blocks) { 1109 if (pool->no_free_space) 1110 return -ENOSPC; 1111 else { 1112 /* 1113 * Try to commit to see if that will free up some 1114 * more space. 1115 */ 1116 r = dm_pool_commit_metadata(pool->pmd); 1117 if (r) { 1118 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1119 __func__, r); 1120 return r; 1121 } 1122 1123 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1124 if (r) 1125 return r; 1126 1127 /* 1128 * If we still have no space we set a flag to avoid 1129 * doing all this checking and return -ENOSPC. 1130 */ 1131 if (!free_blocks) { 1132 DMWARN("%s: no free space available.", 1133 dm_device_name(pool->pool_md)); 1134 spin_lock_irqsave(&pool->lock, flags); 1135 pool->no_free_space = 1; 1136 spin_unlock_irqrestore(&pool->lock, flags); 1137 return -ENOSPC; 1138 } 1139 } 1140 } 1141 1142 r = dm_pool_alloc_data_block(pool->pmd, result); 1143 if (r) 1144 return r; 1145 1146 return 0; 1147 } 1148 1149 /* 1150 * If we have run out of space, queue bios until the device is 1151 * resumed, presumably after having been reloaded with more space. 
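 *
 * These bios sit on pool->retry_on_resume_list and are merged back onto
 * the deferred list by __requeue_bios() when the pool is resumed, at
 * which point they get a fresh attempt at allocation.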
1152 */ 1153 static void retry_on_resume(struct bio *bio) 1154 { 1155 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1156 struct thin_c *tc = h->tc; 1157 struct pool *pool = tc->pool; 1158 unsigned long flags; 1159 1160 spin_lock_irqsave(&pool->lock, flags); 1161 bio_list_add(&pool->retry_on_resume_list, bio); 1162 spin_unlock_irqrestore(&pool->lock, flags); 1163 } 1164 1165 static void no_space(struct cell *cell) 1166 { 1167 struct bio *bio; 1168 struct bio_list bios; 1169 1170 bio_list_init(&bios); 1171 cell_release(cell, &bios); 1172 1173 while ((bio = bio_list_pop(&bios))) 1174 retry_on_resume(bio); 1175 } 1176 1177 static void process_discard(struct thin_c *tc, struct bio *bio) 1178 { 1179 int r; 1180 struct pool *pool = tc->pool; 1181 struct cell *cell, *cell2; 1182 struct cell_key key, key2; 1183 dm_block_t block = get_bio_block(tc, bio); 1184 struct dm_thin_lookup_result lookup_result; 1185 struct new_mapping *m; 1186 1187 build_virtual_key(tc->td, block, &key); 1188 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1189 return; 1190 1191 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1192 switch (r) { 1193 case 0: 1194 /* 1195 * Check nobody is fiddling with this pool block. This can 1196 * happen if someone's in the process of breaking sharing 1197 * on this block. 1198 */ 1199 build_data_key(tc->td, lookup_result.block, &key2); 1200 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1201 cell_release_singleton(cell, bio); 1202 break; 1203 } 1204 1205 if (io_overlaps_block(pool, bio)) { 1206 /* 1207 * IO may still be going to the destination block. We must 1208 * quiesce before we can do the removal. 1209 */ 1210 m = get_next_mapping(pool); 1211 m->tc = tc; 1212 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; 1213 m->virt_block = block; 1214 m->data_block = lookup_result.block; 1215 m->cell = cell; 1216 m->cell2 = cell2; 1217 m->err = 0; 1218 m->bio = bio; 1219 1220 if (!ds_add_work(&pool->all_io_ds, &m->list)) { 1221 list_add(&m->list, &pool->prepared_discards); 1222 wake_worker(pool); 1223 } 1224 } else { 1225 /* 1226 * This path is hit if people are ignoring 1227 * limits->discard_granularity. It ignores any 1228 * part of the discard that is in a subsequent 1229 * block. 1230 */ 1231 sector_t offset = bio->bi_sector - (block << pool->block_shift); 1232 unsigned remaining = (pool->sectors_per_block - offset) << 9; 1233 bio->bi_size = min(bio->bi_size, remaining); 1234 1235 cell_release_singleton(cell, bio); 1236 cell_release_singleton(cell2, bio); 1237 remap_and_issue(tc, bio, lookup_result.block); 1238 } 1239 break; 1240 1241 case -ENODATA: 1242 /* 1243 * It isn't provisioned, just forget it. 
1244 */ 1245 cell_release_singleton(cell, bio); 1246 bio_endio(bio, 0); 1247 break; 1248 1249 default: 1250 DMERR("discard: find block unexpectedly returned %d", r); 1251 cell_release_singleton(cell, bio); 1252 bio_io_error(bio); 1253 break; 1254 } 1255 } 1256 1257 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1258 struct cell_key *key, 1259 struct dm_thin_lookup_result *lookup_result, 1260 struct cell *cell) 1261 { 1262 int r; 1263 dm_block_t data_block; 1264 1265 r = alloc_data_block(tc, &data_block); 1266 switch (r) { 1267 case 0: 1268 schedule_internal_copy(tc, block, lookup_result->block, 1269 data_block, cell, bio); 1270 break; 1271 1272 case -ENOSPC: 1273 no_space(cell); 1274 break; 1275 1276 default: 1277 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1278 cell_error(cell); 1279 break; 1280 } 1281 } 1282 1283 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1284 dm_block_t block, 1285 struct dm_thin_lookup_result *lookup_result) 1286 { 1287 struct cell *cell; 1288 struct pool *pool = tc->pool; 1289 struct cell_key key; 1290 1291 /* 1292 * If cell is already occupied, then sharing is already in the process 1293 * of being broken so we have nothing further to do here. 1294 */ 1295 build_data_key(tc->td, lookup_result->block, &key); 1296 if (bio_detain(pool->prison, &key, bio, &cell)) 1297 return; 1298 1299 if (bio_data_dir(bio) == WRITE) 1300 break_sharing(tc, bio, block, &key, lookup_result, cell); 1301 else { 1302 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1303 1304 h->shared_read_entry = ds_inc(&pool->shared_read_ds); 1305 1306 cell_release_singleton(cell, bio); 1307 remap_and_issue(tc, bio, lookup_result->block); 1308 } 1309 } 1310 1311 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1312 struct cell *cell) 1313 { 1314 int r; 1315 dm_block_t data_block; 1316 1317 /* 1318 * Remap empty bios (flushes) immediately, without provisioning. 1319 */ 1320 if (!bio->bi_size) { 1321 cell_release_singleton(cell, bio); 1322 remap_and_issue(tc, bio, 0); 1323 return; 1324 } 1325 1326 /* 1327 * Fill read bios with zeroes and complete them immediately. 1328 */ 1329 if (bio_data_dir(bio) == READ) { 1330 zero_fill_bio(bio); 1331 cell_release_singleton(cell, bio); 1332 bio_endio(bio, 0); 1333 return; 1334 } 1335 1336 r = alloc_data_block(tc, &data_block); 1337 switch (r) { 1338 case 0: 1339 if (tc->origin_dev) 1340 schedule_external_copy(tc, block, data_block, cell, bio); 1341 else 1342 schedule_zero(tc, block, data_block, cell, bio); 1343 break; 1344 1345 case -ENOSPC: 1346 no_space(cell); 1347 break; 1348 1349 default: 1350 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1351 cell_error(cell); 1352 break; 1353 } 1354 } 1355 1356 static void process_bio(struct thin_c *tc, struct bio *bio) 1357 { 1358 int r; 1359 dm_block_t block = get_bio_block(tc, bio); 1360 struct cell *cell; 1361 struct cell_key key; 1362 struct dm_thin_lookup_result lookup_result; 1363 1364 /* 1365 * If cell is already occupied, then the block is already 1366 * being provisioned so we have nothing further to do here. 1367 */ 1368 build_virtual_key(tc->td, block, &key); 1369 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1370 return; 1371 1372 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1373 switch (r) { 1374 case 0: 1375 /* 1376 * We can release this cell now. This thread is the only 1377 * one that puts bios into a cell, and we know there were 1378 * no preceding bios. 
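 *
 * Contrast this with the -ENODATA path below, where the cell is kept and
 * passed to provision_block() so that any bios arriving for the same
 * virtual block while it is being provisioned stay detained behind it.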
1379 */ 1380 /* 1381 * TODO: this will probably have to change when discard goes 1382 * back in. 1383 */ 1384 cell_release_singleton(cell, bio); 1385 1386 if (lookup_result.shared) 1387 process_shared_bio(tc, bio, block, &lookup_result); 1388 else 1389 remap_and_issue(tc, bio, lookup_result.block); 1390 break; 1391 1392 case -ENODATA: 1393 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1394 cell_release_singleton(cell, bio); 1395 remap_to_origin_and_issue(tc, bio); 1396 } else 1397 provision_block(tc, bio, block, cell); 1398 break; 1399 1400 default: 1401 DMERR("dm_thin_find_block() failed, error = %d", r); 1402 cell_release_singleton(cell, bio); 1403 bio_io_error(bio); 1404 break; 1405 } 1406 } 1407 1408 static int need_commit_due_to_time(struct pool *pool) 1409 { 1410 return jiffies < pool->last_commit_jiffies || 1411 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1412 } 1413 1414 static void process_deferred_bios(struct pool *pool) 1415 { 1416 unsigned long flags; 1417 struct bio *bio; 1418 struct bio_list bios; 1419 int r; 1420 1421 bio_list_init(&bios); 1422 1423 spin_lock_irqsave(&pool->lock, flags); 1424 bio_list_merge(&bios, &pool->deferred_bios); 1425 bio_list_init(&pool->deferred_bios); 1426 spin_unlock_irqrestore(&pool->lock, flags); 1427 1428 while ((bio = bio_list_pop(&bios))) { 1429 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1430 struct thin_c *tc = h->tc; 1431 1432 /* 1433 * If we've got no free new_mapping structs, and processing 1434 * this bio might require one, we pause until there are some 1435 * prepared mappings to process. 1436 */ 1437 if (ensure_next_mapping(pool)) { 1438 spin_lock_irqsave(&pool->lock, flags); 1439 bio_list_merge(&pool->deferred_bios, &bios); 1440 spin_unlock_irqrestore(&pool->lock, flags); 1441 1442 break; 1443 } 1444 1445 if (bio->bi_rw & REQ_DISCARD) 1446 process_discard(tc, bio); 1447 else 1448 process_bio(tc, bio); 1449 } 1450 1451 /* 1452 * If there are any deferred flush bios, we must commit 1453 * the metadata before issuing them. 1454 */ 1455 bio_list_init(&bios); 1456 spin_lock_irqsave(&pool->lock, flags); 1457 bio_list_merge(&bios, &pool->deferred_flush_bios); 1458 bio_list_init(&pool->deferred_flush_bios); 1459 spin_unlock_irqrestore(&pool->lock, flags); 1460 1461 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1462 return; 1463 1464 r = dm_pool_commit_metadata(pool->pmd); 1465 if (r) { 1466 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1467 __func__, r); 1468 while ((bio = bio_list_pop(&bios))) 1469 bio_io_error(bio); 1470 return; 1471 } 1472 pool->last_commit_jiffies = jiffies; 1473 1474 while ((bio = bio_list_pop(&bios))) 1475 generic_make_request(bio); 1476 } 1477 1478 static void do_worker(struct work_struct *ws) 1479 { 1480 struct pool *pool = container_of(ws, struct pool, worker); 1481 1482 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); 1483 process_prepared(pool, &pool->prepared_discards, process_prepared_discard); 1484 process_deferred_bios(pool); 1485 } 1486 1487 /* 1488 * We want to commit periodically so that not too much 1489 * unwritten data builds up. 1490 */ 1491 static void do_waker(struct work_struct *ws) 1492 { 1493 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 1494 wake_worker(pool); 1495 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 1496 } 1497 1498 /*----------------------------------------------------------------*/ 1499 1500 /* 1501 * Mapping functions. 
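 *
 * The fast path in thin_bio_map() remaps a bio directly when the mapping
 * is already known and unshared; anything that needs metadata io,
 * provisioning or quiescing is deferred to the worker via
 * thin_defer_bio().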
1502 */ 1503 1504 /* 1505 * Called only while mapping a thin bio to hand it over to the workqueue. 1506 */ 1507 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 1508 { 1509 unsigned long flags; 1510 struct pool *pool = tc->pool; 1511 1512 spin_lock_irqsave(&pool->lock, flags); 1513 bio_list_add(&pool->deferred_bios, bio); 1514 spin_unlock_irqrestore(&pool->lock, flags); 1515 1516 wake_worker(pool); 1517 } 1518 1519 static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) 1520 { 1521 struct pool *pool = tc->pool; 1522 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); 1523 1524 h->tc = tc; 1525 h->shared_read_entry = NULL; 1526 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); 1527 h->overwrite_mapping = NULL; 1528 1529 return h; 1530 } 1531 1532 /* 1533 * Non-blocking function called from the thin target's map function. 1534 */ 1535 static int thin_bio_map(struct dm_target *ti, struct bio *bio, 1536 union map_info *map_context) 1537 { 1538 int r; 1539 struct thin_c *tc = ti->private; 1540 dm_block_t block = get_bio_block(tc, bio); 1541 struct dm_thin_device *td = tc->td; 1542 struct dm_thin_lookup_result result; 1543 1544 map_context->ptr = thin_hook_bio(tc, bio); 1545 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1546 thin_defer_bio(tc, bio); 1547 return DM_MAPIO_SUBMITTED; 1548 } 1549 1550 r = dm_thin_find_block(td, block, 0, &result); 1551 1552 /* 1553 * Note that we defer readahead too. 1554 */ 1555 switch (r) { 1556 case 0: 1557 if (unlikely(result.shared)) { 1558 /* 1559 * We have a race condition here between the 1560 * result.shared value returned by the lookup and 1561 * snapshot creation, which may cause new 1562 * sharing. 1563 * 1564 * To avoid this always quiesce the origin before 1565 * taking the snap. You want to do this anyway to 1566 * ensure a consistent application view 1567 * (i.e. lockfs). 1568 * 1569 * More distant ancestors are irrelevant. The 1570 * shared flag will be set in their case. 1571 */ 1572 thin_defer_bio(tc, bio); 1573 r = DM_MAPIO_SUBMITTED; 1574 } else { 1575 remap(tc, bio, result.block); 1576 r = DM_MAPIO_REMAPPED; 1577 } 1578 break; 1579 1580 case -ENODATA: 1581 /* 1582 * In future, the failed dm_thin_find_block above could 1583 * provide the hint to load the metadata into cache. 
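 *
 * For now both -ENODATA (unprovisioned) and -EWOULDBLOCK (metadata not in
 * core, and we may not block here) fall through to the same slow path:
 * hand the bio to the worker thread.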
1584 */ 1585 case -EWOULDBLOCK: 1586 thin_defer_bio(tc, bio); 1587 r = DM_MAPIO_SUBMITTED; 1588 break; 1589 } 1590 1591 return r; 1592 } 1593 1594 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1595 { 1596 int r; 1597 unsigned long flags; 1598 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1599 1600 spin_lock_irqsave(&pt->pool->lock, flags); 1601 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1602 spin_unlock_irqrestore(&pt->pool->lock, flags); 1603 1604 if (!r) { 1605 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1606 r = bdi_congested(&q->backing_dev_info, bdi_bits); 1607 } 1608 1609 return r; 1610 } 1611 1612 static void __requeue_bios(struct pool *pool) 1613 { 1614 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1615 bio_list_init(&pool->retry_on_resume_list); 1616 } 1617 1618 /*---------------------------------------------------------------- 1619 * Binding of control targets to a pool object 1620 *--------------------------------------------------------------*/ 1621 static int bind_control_target(struct pool *pool, struct dm_target *ti) 1622 { 1623 struct pool_c *pt = ti->private; 1624 1625 pool->ti = ti; 1626 pool->low_water_blocks = pt->low_water_blocks; 1627 pool->pf = pt->pf; 1628 1629 return 0; 1630 } 1631 1632 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 1633 { 1634 if (pool->ti == ti) 1635 pool->ti = NULL; 1636 } 1637 1638 /*---------------------------------------------------------------- 1639 * Pool creation 1640 *--------------------------------------------------------------*/ 1641 /* Initialize pool features. */ 1642 static void pool_features_init(struct pool_features *pf) 1643 { 1644 pf->zero_new_blocks = 1; 1645 pf->discard_enabled = 1; 1646 pf->discard_passdown = 1; 1647 } 1648 1649 static void __pool_destroy(struct pool *pool) 1650 { 1651 __pool_table_remove(pool); 1652 1653 if (dm_pool_metadata_close(pool->pmd) < 0) 1654 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1655 1656 prison_destroy(pool->prison); 1657 dm_kcopyd_client_destroy(pool->copier); 1658 1659 if (pool->wq) 1660 destroy_workqueue(pool->wq); 1661 1662 if (pool->next_mapping) 1663 mempool_free(pool->next_mapping, pool->mapping_pool); 1664 mempool_destroy(pool->mapping_pool); 1665 mempool_destroy(pool->endio_hook_pool); 1666 kfree(pool); 1667 } 1668 1669 static struct pool *pool_create(struct mapped_device *pool_md, 1670 struct block_device *metadata_dev, 1671 unsigned long block_size, char **error) 1672 { 1673 int r; 1674 void *err_p; 1675 struct pool *pool; 1676 struct dm_pool_metadata *pmd; 1677 1678 pmd = dm_pool_metadata_open(metadata_dev, block_size); 1679 if (IS_ERR(pmd)) { 1680 *error = "Error creating metadata object"; 1681 return (struct pool *)pmd; 1682 } 1683 1684 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 1685 if (!pool) { 1686 *error = "Error allocating memory for pool"; 1687 err_p = ERR_PTR(-ENOMEM); 1688 goto bad_pool; 1689 } 1690 1691 pool->pmd = pmd; 1692 pool->sectors_per_block = block_size; 1693 pool->block_shift = ffs(block_size) - 1; 1694 pool->offset_mask = block_size - 1; 1695 pool->low_water_blocks = 0; 1696 pool_features_init(&pool->pf); 1697 pool->prison = prison_create(PRISON_CELLS); 1698 if (!pool->prison) { 1699 *error = "Error creating pool's bio prison"; 1700 err_p = ERR_PTR(-ENOMEM); 1701 goto bad_prison; 1702 } 1703 1704 pool->copier = dm_kcopyd_client_create(); 1705 if (IS_ERR(pool->copier)) { 1706 r = PTR_ERR(pool->copier); 1707 *error = "Error 
creating pool's kcopyd client"; 1708 err_p = ERR_PTR(r); 1709 goto bad_kcopyd_client; 1710 } 1711 1712 /* 1713 * Create singlethreaded workqueue that will service all devices 1714 * that use this metadata. 1715 */ 1716 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 1717 if (!pool->wq) { 1718 *error = "Error creating pool's workqueue"; 1719 err_p = ERR_PTR(-ENOMEM); 1720 goto bad_wq; 1721 } 1722 1723 INIT_WORK(&pool->worker, do_worker); 1724 INIT_DELAYED_WORK(&pool->waker, do_waker); 1725 spin_lock_init(&pool->lock); 1726 bio_list_init(&pool->deferred_bios); 1727 bio_list_init(&pool->deferred_flush_bios); 1728 INIT_LIST_HEAD(&pool->prepared_mappings); 1729 INIT_LIST_HEAD(&pool->prepared_discards); 1730 pool->low_water_triggered = 0; 1731 pool->no_free_space = 0; 1732 bio_list_init(&pool->retry_on_resume_list); 1733 ds_init(&pool->shared_read_ds); 1734 ds_init(&pool->all_io_ds); 1735 1736 pool->next_mapping = NULL; 1737 pool->mapping_pool = 1738 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); 1739 if (!pool->mapping_pool) { 1740 *error = "Error creating pool's mapping mempool"; 1741 err_p = ERR_PTR(-ENOMEM); 1742 goto bad_mapping_pool; 1743 } 1744 1745 pool->endio_hook_pool = 1746 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); 1747 if (!pool->endio_hook_pool) { 1748 *error = "Error creating pool's endio_hook mempool"; 1749 err_p = ERR_PTR(-ENOMEM); 1750 goto bad_endio_hook_pool; 1751 } 1752 pool->ref_count = 1; 1753 pool->last_commit_jiffies = jiffies; 1754 pool->pool_md = pool_md; 1755 pool->md_dev = metadata_dev; 1756 __pool_table_insert(pool); 1757 1758 return pool; 1759 1760 bad_endio_hook_pool: 1761 mempool_destroy(pool->mapping_pool); 1762 bad_mapping_pool: 1763 destroy_workqueue(pool->wq); 1764 bad_wq: 1765 dm_kcopyd_client_destroy(pool->copier); 1766 bad_kcopyd_client: 1767 prison_destroy(pool->prison); 1768 bad_prison: 1769 kfree(pool); 1770 bad_pool: 1771 if (dm_pool_metadata_close(pmd)) 1772 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1773 1774 return err_p; 1775 } 1776 1777 static void __pool_inc(struct pool *pool) 1778 { 1779 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1780 pool->ref_count++; 1781 } 1782 1783 static void __pool_dec(struct pool *pool) 1784 { 1785 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1786 BUG_ON(!pool->ref_count); 1787 if (!--pool->ref_count) 1788 __pool_destroy(pool); 1789 } 1790 1791 static struct pool *__pool_find(struct mapped_device *pool_md, 1792 struct block_device *metadata_dev, 1793 unsigned long block_size, char **error, 1794 int *created) 1795 { 1796 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1797 1798 if (pool) { 1799 if (pool->pool_md != pool_md) 1800 return ERR_PTR(-EBUSY); 1801 __pool_inc(pool); 1802 1803 } else { 1804 pool = __pool_table_lookup(pool_md); 1805 if (pool) { 1806 if (pool->md_dev != metadata_dev) 1807 return ERR_PTR(-EINVAL); 1808 __pool_inc(pool); 1809 1810 } else { 1811 pool = pool_create(pool_md, metadata_dev, block_size, error); 1812 *created = 1; 1813 } 1814 } 1815 1816 return pool; 1817 } 1818 1819 /*---------------------------------------------------------------- 1820 * Pool target methods 1821 *--------------------------------------------------------------*/ 1822 static void pool_dtr(struct dm_target *ti) 1823 { 1824 struct pool_c *pt = ti->private; 1825 1826 mutex_lock(&dm_thin_pool_table.mutex); 1827 1828 unbind_control_target(pt->pool, ti); 1829 __pool_dec(pt->pool); 1830 
dm_put_device(ti, pt->metadata_dev); 1831 dm_put_device(ti, pt->data_dev); 1832 kfree(pt); 1833 1834 mutex_unlock(&dm_thin_pool_table.mutex); 1835 } 1836 1837 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1838 struct dm_target *ti) 1839 { 1840 int r; 1841 unsigned argc; 1842 const char *arg_name; 1843 1844 static struct dm_arg _args[] = { 1845 {0, 3, "Invalid number of pool feature arguments"}, 1846 }; 1847 1848 /* 1849 * No feature arguments supplied. 1850 */ 1851 if (!as->argc) 1852 return 0; 1853 1854 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1855 if (r) 1856 return -EINVAL; 1857 1858 while (argc && !r) { 1859 arg_name = dm_shift_arg(as); 1860 argc--; 1861 1862 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1863 pf->zero_new_blocks = 0; 1864 continue; 1865 } else if (!strcasecmp(arg_name, "ignore_discard")) { 1866 pf->discard_enabled = 0; 1867 continue; 1868 } else if (!strcasecmp(arg_name, "no_discard_passdown")) { 1869 pf->discard_passdown = 0; 1870 continue; 1871 } 1872 1873 ti->error = "Unrecognised pool feature requested"; 1874 r = -EINVAL; 1875 } 1876 1877 return r; 1878 } 1879 1880 /* 1881 * thin-pool <metadata dev> <data dev> 1882 * <data block size (sectors)> 1883 * <low water mark (blocks)> 1884 * [<#feature args> [<arg>]*] 1885 * 1886 * Optional feature arguments are: 1887 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1888 * ignore_discard: disable discard 1889 * no_discard_passdown: don't pass discards down to the data device 1890 */ 1891 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1892 { 1893 int r, pool_created = 0; 1894 struct pool_c *pt; 1895 struct pool *pool; 1896 struct pool_features pf; 1897 struct dm_arg_set as; 1898 struct dm_dev *data_dev; 1899 unsigned long block_size; 1900 dm_block_t low_water_blocks; 1901 struct dm_dev *metadata_dev; 1902 sector_t metadata_dev_size; 1903 char b[BDEVNAME_SIZE]; 1904 1905 /* 1906 * FIXME Remove validation from scope of lock. 1907 */ 1908 mutex_lock(&dm_thin_pool_table.mutex); 1909 1910 if (argc < 4) { 1911 ti->error = "Invalid argument count"; 1912 r = -EINVAL; 1913 goto out_unlock; 1914 } 1915 as.argc = argc; 1916 as.argv = argv; 1917 1918 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); 1919 if (r) { 1920 ti->error = "Error opening metadata block device"; 1921 goto out_unlock; 1922 } 1923 1924 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1925 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 1926 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1927 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1928 1929 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1930 if (r) { 1931 ti->error = "Error getting data device"; 1932 goto out_metadata; 1933 } 1934 1935 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 1936 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1937 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 1938 !is_power_of_2(block_size)) { 1939 ti->error = "Invalid block size"; 1940 r = -EINVAL; 1941 goto out; 1942 } 1943 1944 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 1945 ti->error = "Invalid low water mark"; 1946 r = -EINVAL; 1947 goto out; 1948 } 1949 1950 /* 1951 * Set default pool features. 
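 *
 * i.e. block zeroing, discard and discard passdown all start enabled
 * (see pool_features_init()); the optional feature arguments parsed
 * below can only switch individual features off.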
1952 */ 1953 pool_features_init(&pf); 1954 1955 dm_consume_args(&as, 4); 1956 r = parse_pool_features(&as, &pf, ti); 1957 if (r) 1958 goto out; 1959 1960 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 1961 if (!pt) { 1962 r = -ENOMEM; 1963 goto out; 1964 } 1965 1966 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1967 block_size, &ti->error, &pool_created); 1968 if (IS_ERR(pool)) { 1969 r = PTR_ERR(pool); 1970 goto out_free_pt; 1971 } 1972 1973 /* 1974 * 'pool_created' reflects whether this is the first table load. 1975 * Top level discard support is not allowed to be changed after 1976 * initial load. This would require a pool reload to trigger thin 1977 * device changes. 1978 */ 1979 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 1980 ti->error = "Discard support cannot be disabled once enabled"; 1981 r = -EINVAL; 1982 goto out_flags_changed; 1983 } 1984 1985 /* 1986 * If discard_passdown was enabled verify that the data device 1987 * supports discards. Disable discard_passdown if not; otherwise 1988 * -EOPNOTSUPP will be returned. 1989 */ 1990 if (pf.discard_passdown) { 1991 struct request_queue *q = bdev_get_queue(data_dev->bdev); 1992 if (!q || !blk_queue_discard(q)) { 1993 DMWARN("Discard unsupported by data device: Disabling discard passdown."); 1994 pf.discard_passdown = 0; 1995 } 1996 } 1997 1998 pt->pool = pool; 1999 pt->ti = ti; 2000 pt->metadata_dev = metadata_dev; 2001 pt->data_dev = data_dev; 2002 pt->low_water_blocks = low_water_blocks; 2003 pt->pf = pf; 2004 ti->num_flush_requests = 1; 2005 /* 2006 * Only need to enable discards if the pool should pass 2007 * them down to the data device. The thin device's discard 2008 * processing will cause mappings to be removed from the btree. 2009 */ 2010 if (pf.discard_enabled && pf.discard_passdown) { 2011 ti->num_discard_requests = 1; 2012 /* 2013 * Setting 'discards_supported' circumvents the normal 2014 * stacking of discard limits (this keeps the pool and 2015 * thin devices' discard limits consistent). 2016 */ 2017 ti->discards_supported = 1; 2018 } 2019 ti->private = pt; 2020 2021 pt->callbacks.congested_fn = pool_is_congested; 2022 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2023 2024 mutex_unlock(&dm_thin_pool_table.mutex); 2025 2026 return 0; 2027 2028 out_flags_changed: 2029 __pool_dec(pool); 2030 out_free_pt: 2031 kfree(pt); 2032 out: 2033 dm_put_device(ti, data_dev); 2034 out_metadata: 2035 dm_put_device(ti, metadata_dev); 2036 out_unlock: 2037 mutex_unlock(&dm_thin_pool_table.mutex); 2038 2039 return r; 2040 } 2041 2042 static int pool_map(struct dm_target *ti, struct bio *bio, 2043 union map_info *map_context) 2044 { 2045 int r; 2046 struct pool_c *pt = ti->private; 2047 struct pool *pool = pt->pool; 2048 unsigned long flags; 2049 2050 /* 2051 * As this is a singleton target, ti->begin is always zero. 2052 */ 2053 spin_lock_irqsave(&pool->lock, flags); 2054 bio->bi_bdev = pt->data_dev->bdev; 2055 r = DM_MAPIO_REMAPPED; 2056 spin_unlock_irqrestore(&pool->lock, flags); 2057 2058 return r; 2059 } 2060 2061 /* 2062 * Retrieves the number of blocks of the data device from 2063 * the superblock and compares it to the actual device size, 2064 * thus resizing the data device in case it has grown. 2065 * 2066 * This both copes with opening preallocated data devices in the ctr 2067 * being followed by a resume 2068 * -and- 2069 * calling the resume method individually after userspace has 2070 * grown the data device in reaction to a table event. 
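 *
 * For example, after userspace has extended the data device, a table
 * reload followed by a resume lands here: ti->len reflects the new size,
 * the superblock still records the old block count, and
 * dm_pool_resize_data_dev() grows the metadata to match before the
 * commit.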
2071 */ 2072 static int pool_preresume(struct dm_target *ti) 2073 { 2074 int r; 2075 struct pool_c *pt = ti->private; 2076 struct pool *pool = pt->pool; 2077 dm_block_t data_size, sb_data_size; 2078 2079 /* 2080 * Take control of the pool object. 2081 */ 2082 r = bind_control_target(pool, ti); 2083 if (r) 2084 return r; 2085 2086 data_size = ti->len >> pool->block_shift; 2087 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2088 if (r) { 2089 DMERR("failed to retrieve data device size"); 2090 return r; 2091 } 2092 2093 if (data_size < sb_data_size) { 2094 DMERR("pool target too small, is %llu blocks (expected %llu)", 2095 data_size, sb_data_size); 2096 return -EINVAL; 2097 2098 } else if (data_size > sb_data_size) { 2099 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2100 if (r) { 2101 DMERR("failed to resize data device"); 2102 return r; 2103 } 2104 2105 r = dm_pool_commit_metadata(pool->pmd); 2106 if (r) { 2107 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 2108 __func__, r); 2109 return r; 2110 } 2111 } 2112 2113 return 0; 2114 } 2115 2116 static void pool_resume(struct dm_target *ti) 2117 { 2118 struct pool_c *pt = ti->private; 2119 struct pool *pool = pt->pool; 2120 unsigned long flags; 2121 2122 spin_lock_irqsave(&pool->lock, flags); 2123 pool->low_water_triggered = 0; 2124 pool->no_free_space = 0; 2125 __requeue_bios(pool); 2126 spin_unlock_irqrestore(&pool->lock, flags); 2127 2128 do_waker(&pool->waker.work); 2129 } 2130 2131 static void pool_postsuspend(struct dm_target *ti) 2132 { 2133 int r; 2134 struct pool_c *pt = ti->private; 2135 struct pool *pool = pt->pool; 2136 2137 cancel_delayed_work(&pool->waker); 2138 flush_workqueue(pool->wq); 2139 2140 r = dm_pool_commit_metadata(pool->pmd); 2141 if (r < 0) { 2142 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 2143 __func__, r); 2144 /* FIXME: invalidate device? 
static void pool_resume(struct dm_target *ti)
{
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        pool->low_water_triggered = 0;
        pool->no_free_space = 0;
        __requeue_bios(pool);
        spin_unlock_irqrestore(&pool->lock, flags);

        do_waker(&pool->waker.work);
}

static void pool_postsuspend(struct dm_target *ti)
{
        int r;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        cancel_delayed_work(&pool->waker);
        flush_workqueue(pool->wq);

        r = dm_pool_commit_metadata(pool->pmd);
        if (r < 0) {
                DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
                      __func__, r);
                /*
                 * FIXME: invalidate device? error the next FUA or FLUSH bio?
                 */
        }
}

static int check_arg_count(unsigned argc, unsigned args_required)
{
        if (argc != args_required) {
                DMWARN("Message received with %u arguments instead of %u.",
                       argc, args_required);
                return -EINVAL;
        }

        return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
        if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
            *dev_id <= MAX_DEV_ID)
                return 0;

        if (warning)
                DMWARN("Message received with invalid device id: %s", arg);

        return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id dev_id;
        int r;

        r = check_arg_count(argc, 2);
        if (r)
                return r;

        r = read_dev_id(argv[1], &dev_id, 1);
        if (r)
                return r;

        r = dm_pool_create_thin(pool->pmd, dev_id);
        if (r) {
                DMWARN("Creation of new thinly-provisioned device with id %s failed.",
                       argv[1]);
                return r;
        }

        return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id dev_id;
        dm_thin_id origin_dev_id;
        int r;

        r = check_arg_count(argc, 3);
        if (r)
                return r;

        r = read_dev_id(argv[1], &dev_id, 1);
        if (r)
                return r;

        r = read_dev_id(argv[2], &origin_dev_id, 1);
        if (r)
                return r;

        r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
        if (r) {
                DMWARN("Creation of new snapshot %s of device %s failed.",
                       argv[1], argv[2]);
                return r;
        }

        return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id dev_id;
        int r;

        r = check_arg_count(argc, 2);
        if (r)
                return r;

        r = read_dev_id(argv[1], &dev_id, 1);
        if (r)
                return r;

        r = dm_pool_delete_thin_device(pool->pmd, dev_id);
        if (r)
                DMWARN("Deletion of thin device %s failed.", argv[1]);

        return r;
}

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
        dm_thin_id old_id, new_id;
        int r;

        r = check_arg_count(argc, 3);
        if (r)
                return r;

        if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
                DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
                return -EINVAL;
        }

        if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
                DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
                return -EINVAL;
        }

        r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
        if (r) {
                DMWARN("Failed to change transaction id from %s to %s.",
                       argv[1], argv[2]);
                return r;
        }

        return 0;
}

/*
 * Messages supported:
 *   create_thin <dev_id>
 *   create_snap <dev_id> <origin_id>
 *   delete <dev_id>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 */
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
        int r = -EINVAL;
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        if (!strcasecmp(argv[0], "create_thin"))
                r = process_create_thin_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "create_snap"))
                r = process_create_snap_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "delete"))
                r = process_delete_mesg(argc, argv, pool);

        else if (!strcasecmp(argv[0], "set_transaction_id"))
                r = process_set_transaction_id_mesg(argc, argv, pool);

        else
                DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

        if (!r) {
                r = dm_pool_commit_metadata(pool->pmd);
                if (r)
                        DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
                              argv[0], r);
        }

        return r;
}
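
/*
 * Example (editor's addition, illustrative only): messages are delivered
 * with the target_msg ioctl, e.g. "dmsetup message pool 0 create_thin 0".
 * The sketch below shows the equivalent through libdevmapper; the pool
 * name and device ids are made up.
 */
#if 0   /* userspace sketch, never built as part of this module */
#include <libdevmapper.h>

static int pool_send_message(const char *pool_name, const char *msg)
{
        int ok = 0;
        struct dm_task *dmt = dm_task_create(DM_DEVICE_TARGET_MSG);

        if (!dmt)
                return 1;

        /* sector 0 addresses the (single) thin-pool target */
        if (dm_task_set_name(dmt, pool_name) &&
            dm_task_set_sector(dmt, 0) &&
            dm_task_set_message(dmt, msg) &&
            dm_task_run(dmt))
                ok = 1;

        dm_task_destroy(dmt);
        return ok ? 0 : 1;
}

/*
 * e.g. pool_send_message("pool", "create_thin 0");
 *      pool_send_message("pool", "create_snap 1 0");
 */
#endif
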
/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 */
static int pool_status(struct dm_target *ti, status_type_t type,
                       char *result, unsigned maxlen)
{
        int r, count;
        unsigned sz = 0;
        uint64_t transaction_id;
        dm_block_t nr_free_blocks_data;
        dm_block_t nr_free_blocks_metadata;
        dm_block_t nr_blocks_data;
        dm_block_t nr_blocks_metadata;
        dm_block_t held_root;
        char buf[BDEVNAME_SIZE];
        char buf2[BDEVNAME_SIZE];
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        switch (type) {
        case STATUSTYPE_INFO:
                r = dm_pool_get_metadata_transaction_id(pool->pmd,
                                                        &transaction_id);
                if (r)
                        return r;

                r = dm_pool_get_free_metadata_block_count(pool->pmd,
                                                          &nr_free_blocks_metadata);
                if (r)
                        return r;

                r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
                if (r)
                        return r;

                r = dm_pool_get_free_block_count(pool->pmd,
                                                 &nr_free_blocks_data);
                if (r)
                        return r;

                r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
                if (r)
                        return r;

                r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
                if (r)
                        return r;

                DMEMIT("%llu %llu/%llu %llu/%llu ",
                       (unsigned long long)transaction_id,
                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
                       (unsigned long long)nr_blocks_metadata,
                       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
                       (unsigned long long)nr_blocks_data);

                if (held_root)
                        DMEMIT("%llu", held_root);
                else
                        DMEMIT("-");

                break;

        case STATUSTYPE_TABLE:
                DMEMIT("%s %s %lu %llu ",
                       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
                       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
                       (unsigned long)pool->sectors_per_block,
                       (unsigned long long)pt->low_water_blocks);

                count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
                        !pool->pf.discard_passdown;
                DMEMIT("%u ", count);

                if (!pool->pf.zero_new_blocks)
                        DMEMIT("skip_block_zeroing ");

                if (!pool->pf.discard_enabled)
                        DMEMIT("ignore_discard ");

                if (!pool->pf.discard_passdown)
                        DMEMIT("no_discard_passdown ");

                break;
        }

        return 0;
}
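
/*
 * Example (editor's addition, illustrative only): "dmsetup status pool"
 * prints the line built above, e.g.
 *
 *   0 41943040 thin-pool 1 176/4096 10240/327680 -
 *
 * (transaction id 1, 176 of 4096 metadata blocks used, 10240 of 327680
 * data blocks used, no held metadata root).  The numbers are invented;
 * the sketch below shows one way userspace might fetch the raw params
 * string via libdevmapper.
 */
#if 0   /* userspace sketch, never built as part of this module */
#include <stdio.h>
#include <stdint.h>
#include <libdevmapper.h>

static int print_pool_status(const char *pool_name)
{
        uint64_t start, length;
        char *target_type, *params;
        struct dm_task *dmt = dm_task_create(DM_DEVICE_STATUS);

        if (!dmt)
                return 1;

        if (!dm_task_set_name(dmt, pool_name) || !dm_task_run(dmt)) {
                dm_task_destroy(dmt);
                return 1;
        }

        /* A pool is a singleton target, so there is exactly one entry. */
        dm_get_next_target(dmt, NULL, &start, &length, &target_type, &params);
        printf("%s: %s\n", target_type, params);

        dm_task_destroy(dmt);
        return 0;
}
#endif
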
static int pool_iterate_devices(struct dm_target *ti,
                                iterate_devices_callout_fn fn, void *data)
{
        struct pool_c *pt = ti->private;

        return fn(ti, pt->data_dev, 0, ti->len, data);
}

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
                      struct bio_vec *biovec, int max_size)
{
        struct pool_c *pt = ti->private;
        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

        if (!q->merge_bvec_fn)
                return max_size;

        bvm->bi_bdev = pt->data_dev->bdev;

        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
{
        /*
         * FIXME: these limits may be incompatible with the pool's data device
         */
        limits->max_discard_sectors = pool->sectors_per_block;

        /*
         * This is just a hint, and not enforced.  We have to cope with
         * bios that overlap 2 blocks.
         */
        limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
        limits->discard_zeroes_data = pool->pf.zero_new_blocks;
}

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;

        blk_limits_io_min(limits, 0);
        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
        if (pool->pf.discard_enabled)
                set_discard_limits(pool, limits);
}

static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
        .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
        .map = pool_map,
        .postsuspend = pool_postsuspend,
        .preresume = pool_preresume,
        .resume = pool_resume,
        .message = pool_message,
        .status = pool_status,
        .merge = pool_merge,
        .iterate_devices = pool_iterate_devices,
        .io_hints = pool_io_hints,
};

/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
static void thin_dtr(struct dm_target *ti)
{
        struct thin_c *tc = ti->private;

        mutex_lock(&dm_thin_pool_table.mutex);

        __pool_dec(tc->pool);
        dm_pool_close_thin_device(tc->td);
        dm_put_device(ti, tc->pool_dev);
        if (tc->origin_dev)
                dm_put_device(ti, tc->origin_dev);
        kfree(tc);

        mutex_unlock(&dm_thin_pool_table.mutex);
}

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
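
/*
 * Example (editor's addition, illustrative only): once "create_thin 0"
 * has been sent to the pool, an active thin volume of, say, 2GiB can be
 * stacked on top of it with a table such as:
 *
 *   0 4194304 thin /dev/mapper/pool 0
 *
 * The sketch below does the same through libdevmapper; the names, size
 * and device id are made up.
 */
#if 0   /* userspace sketch, never built as part of this module */
#include <libdevmapper.h>

static int create_thin_volume(void)
{
        struct dm_task *dmt = dm_task_create(DM_DEVICE_CREATE);

        if (!dmt)
                return 1;

        /* device id 0 must already exist in the pool's metadata */
        if (!dm_task_set_name(dmt, "thin0") ||
            !dm_task_add_target(dmt, 0, 4194304, "thin",
                                "/dev/mapper/pool 0") ||
            !dm_task_run(dmt)) {
                dm_task_destroy(dmt);
                return 1;
        }

        dm_task_destroy(dmt);
        return 0;
}
#endif
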
2497 */ 2498 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2499 { 2500 int r; 2501 struct thin_c *tc; 2502 struct dm_dev *pool_dev, *origin_dev; 2503 struct mapped_device *pool_md; 2504 2505 mutex_lock(&dm_thin_pool_table.mutex); 2506 2507 if (argc != 2 && argc != 3) { 2508 ti->error = "Invalid argument count"; 2509 r = -EINVAL; 2510 goto out_unlock; 2511 } 2512 2513 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); 2514 if (!tc) { 2515 ti->error = "Out of memory"; 2516 r = -ENOMEM; 2517 goto out_unlock; 2518 } 2519 2520 if (argc == 3) { 2521 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 2522 if (r) { 2523 ti->error = "Error opening origin device"; 2524 goto bad_origin_dev; 2525 } 2526 tc->origin_dev = origin_dev; 2527 } 2528 2529 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2530 if (r) { 2531 ti->error = "Error opening pool device"; 2532 goto bad_pool_dev; 2533 } 2534 tc->pool_dev = pool_dev; 2535 2536 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { 2537 ti->error = "Invalid device id"; 2538 r = -EINVAL; 2539 goto bad_common; 2540 } 2541 2542 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); 2543 if (!pool_md) { 2544 ti->error = "Couldn't get pool mapped device"; 2545 r = -EINVAL; 2546 goto bad_common; 2547 } 2548 2549 tc->pool = __pool_table_lookup(pool_md); 2550 if (!tc->pool) { 2551 ti->error = "Couldn't find pool object"; 2552 r = -EINVAL; 2553 goto bad_pool_lookup; 2554 } 2555 __pool_inc(tc->pool); 2556 2557 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 2558 if (r) { 2559 ti->error = "Couldn't open thin internal device"; 2560 goto bad_thin_open; 2561 } 2562 2563 ti->split_io = tc->pool->sectors_per_block; 2564 ti->num_flush_requests = 1; 2565 2566 /* In case the pool supports discards, pass them on. 
static int thin_map(struct dm_target *ti, struct bio *bio,
                    union map_info *map_context)
{
        bio->bi_sector = dm_target_offset(ti, bio->bi_sector);

        return thin_bio_map(ti, bio, map_context);
}

static int thin_endio(struct dm_target *ti,
                      struct bio *bio, int err,
                      union map_info *map_context)
{
        unsigned long flags;
        struct endio_hook *h = map_context->ptr;
        struct list_head work;
        struct new_mapping *m, *tmp;
        struct pool *pool = h->tc->pool;

        if (h->shared_read_entry) {
                INIT_LIST_HEAD(&work);
                ds_dec(h->shared_read_entry, &work);

                spin_lock_irqsave(&pool->lock, flags);
                list_for_each_entry_safe(m, tmp, &work, list) {
                        list_del(&m->list);
                        m->quiesced = 1;
                        __maybe_add_mapping(m);
                }
                spin_unlock_irqrestore(&pool->lock, flags);
        }

        if (h->all_io_entry) {
                INIT_LIST_HEAD(&work);
                ds_dec(h->all_io_entry, &work);
                list_for_each_entry_safe(m, tmp, &work, list)
                        list_add(&m->list, &pool->prepared_discards);
        }

        mempool_free(h, pool->endio_hook_pool);

        return 0;
}

static void thin_postsuspend(struct dm_target *ti)
{
        if (dm_noflush_suspending(ti))
                requeue_io((struct thin_c *)ti->private);
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
static int thin_status(struct dm_target *ti, status_type_t type,
                       char *result, unsigned maxlen)
{
        int r;
        ssize_t sz = 0;
        dm_block_t mapped, highest;
        char buf[BDEVNAME_SIZE];
        struct thin_c *tc = ti->private;

        if (!tc->td)
                DMEMIT("-");
        else {
                switch (type) {
                case STATUSTYPE_INFO:
                        r = dm_thin_get_mapped_count(tc->td, &mapped);
                        if (r)
                                return r;

                        r = dm_thin_get_highest_mapped_block(tc->td, &highest);
                        if (r < 0)
                                return r;

                        DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
                        if (r)
                                DMEMIT("%llu", ((highest + 1) *
                                                tc->pool->sectors_per_block) - 1);
                        else
                                DMEMIT("-");
                        break;

                case STATUSTYPE_TABLE:
                        DMEMIT("%s %lu",
                               format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
                               (unsigned long) tc->dev_id);
                        if (tc->origin_dev)
                                DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
                        break;
                }
        }

        return 0;
}
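
/*
 * Example (editor's addition, illustrative only): with 64KiB (128 sector)
 * blocks and the first 16 blocks provisioned, "dmsetup status thin0"
 * would report
 *
 *   0 4194304 thin 2048 2047
 *
 * i.e. 2048 mapped sectors and 2047 as the highest mapped sector.  A
 * userspace consumer might pull the two fields apart as sketched below;
 * the helper is made up and not part of this driver.
 */
#if 0   /* userspace sketch, never built as part of this module */
#include <stdio.h>

/* @params is the status string emitted by thin_status(), e.g. "2048 2047" */
static int parse_thin_status(const char *params,
                             unsigned long long *mapped_sectors,
                             unsigned long long *highest_sector)
{
        /* A device with nothing provisioned reports "0 -" instead. */
        if (sscanf(params, "%llu %llu", mapped_sectors, highest_sector) == 2)
                return 0;

        return -1;
}
#endif
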
2699 */ 2700 if (!tc->pool->ti) 2701 return 0; /* nothing is bound */ 2702 2703 blocks = tc->pool->ti->len >> tc->pool->block_shift; 2704 if (blocks) 2705 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); 2706 2707 return 0; 2708 } 2709 2710 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2711 { 2712 struct thin_c *tc = ti->private; 2713 struct pool *pool = tc->pool; 2714 2715 blk_limits_io_min(limits, 0); 2716 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2717 set_discard_limits(pool, limits); 2718 } 2719 2720 static struct target_type thin_target = { 2721 .name = "thin", 2722 .version = {1, 1, 0}, 2723 .module = THIS_MODULE, 2724 .ctr = thin_ctr, 2725 .dtr = thin_dtr, 2726 .map = thin_map, 2727 .end_io = thin_endio, 2728 .postsuspend = thin_postsuspend, 2729 .status = thin_status, 2730 .iterate_devices = thin_iterate_devices, 2731 .io_hints = thin_io_hints, 2732 }; 2733 2734 /*----------------------------------------------------------------*/ 2735 2736 static int __init dm_thin_init(void) 2737 { 2738 int r; 2739 2740 pool_table_init(); 2741 2742 r = dm_register_target(&thin_target); 2743 if (r) 2744 return r; 2745 2746 r = dm_register_target(&pool_target); 2747 if (r) 2748 dm_unregister_target(&thin_target); 2749 2750 return r; 2751 } 2752 2753 static void dm_thin_exit(void) 2754 { 2755 dm_unregister_target(&thin_target); 2756 dm_unregister_target(&pool_target); 2757 } 2758 2759 module_init(dm_thin_init); 2760 module_exit(dm_thin_exit); 2761 2762 MODULE_DESCRIPTION(DM_NAME "device-mapper thin provisioning target"); 2763 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 2764 MODULE_LICENSE("GPL"); 2765