/*
 * Copyright (C) 2011 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * skipped if the io covers the whole block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */
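/*
 * Editor's illustrative walk-through (not part of the original driver; the
 * block numbers are invented): suppose an origin and a snapshot both map
 * virtual block 5 to data block 17, and a write arrives for the origin's
 * block 5.  bio_detain() plugs further io to data block 17 (i), the
 * deferred_set quiesces in-flight reads of block 17 (ii), the data is
 * copied to a freshly allocated block, say 99 (iii), the origin's btree is
 * updated so virtual block 5 now points at data block 99 (iv), and finally
 * the cell is released so the plugged io - including the write that
 * started it all - is issued against block 99 (v).  The snapshot continues
 * to map virtual block 5 to data block 17 throughout.
 */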

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away. We put them in prison
 * where they can't cause any mischief. Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell. When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	struct bio *holder;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
							sizeof(struct cell));
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct cell *__search_bucket(struct hlist_head *bucket,
				    struct cell_key *key)
{
	struct cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}

/*
 * This may block if a new cell needs allocating. You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
 */
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
		      struct bio *inmate, struct cell **ref)
{
	int r = 1;
	unsigned long flags;
	uint32_t hash = hash_key(prison, key);
	struct cell *cell, *cell2;

	BUG_ON(hash > prison->nr_buckets);

	spin_lock_irqsave(&prison->lock, flags);

	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Allocate a new cell
	 */
	spin_unlock_irqrestore(&prison->lock, flags);
	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
	spin_lock_irqsave(&prison->lock, flags);

	/*
	 * We've been unlocked, so we have to double check that
	 * nobody else has inserted this cell in the meantime.
	 */
	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		mempool_free(cell2, prison->cell_pool);
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Use new cell.
	 */
	cell = cell2;

	cell->prison = prison;
	memcpy(&cell->key, key, sizeof(cell->key));
	cell->holder = inmate;
	bio_list_init(&cell->bios);
	hlist_add_head(&cell->list, prison->cells + hash);

	r = 0;

out:
	spin_unlock_irqrestore(&prison->lock, flags);

	*ref = cell;

	return r;
}

/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct cell *cell, struct bio_list *inmates)
{
	struct bio_prison *prison = cell->prison;

	hlist_del(&cell->list);

	if (inmates) {
		bio_list_add(inmates, cell->holder);
		bio_list_merge(inmates, &cell->bios);
	}

	mempool_free(cell, prison->cell_pool);
}

static void cell_release(struct cell *cell, struct bio_list *bios)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, bios);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * There are a couple of places where we put a bio into a cell briefly
 * before taking it out again. In these situations we know that no other
 * bio may be in the cell. This function releases the cell, and also does
 * a sanity check.
 */
static void __cell_release_singleton(struct cell *cell, struct bio *bio)
{
	BUG_ON(cell->holder != bio);
	BUG_ON(!bio_list_empty(&cell->bios));

	__cell_release(cell, NULL);
}

static void cell_release_singleton(struct cell *cell, struct bio *bio)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release_singleton(cell, bio);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * Sometimes we don't want the holder, just the additional bios.
 */
static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
{
	struct bio_prison *prison = cell->prison;

	hlist_del(&cell->list);
	bio_list_merge(inmates, &cell->bios);

	mempool_free(cell, prison->cell_pool);
}

static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release_no_holder(cell, inmates);
	spin_unlock_irqrestore(&prison->lock, flags);
}

static void cell_error(struct cell *cell)
{
	struct bio_prison *prison = cell->prison;
	struct bio_list bios;
	struct bio *bio;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, &bios);
	spin_unlock_irqrestore(&prison->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_io_error(bio);
}

/*----------------------------------------------------------------*/
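
/*
 * Editor's illustrative sketch (not part of the original driver): the
 * bio_detain()/cell_release() pattern used throughout the rest of this
 * file.  The key values and the helper name are invented for the example.
 */
static void __maybe_unused example_prison_usage(struct bio_prison *prison,
						struct bio *bio)
{
	struct cell *cell;
	struct bio_list bios;
	struct cell_key key = { .virtual = 0, .dev = 1, .block = 17 };

	bio_list_init(&bios);

	if (bio_detain(prison, &key, bio, &cell))
		return;	/* cell already held; bio now waits inside it */

	/* ... work that must be exclusive for this block happens here ... */

	cell_release(cell, &bios);	/* holder and waiters handed back */
	/* the caller would then reissue everything on &bios */
}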

/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed. Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */

struct deferred_set;
struct deferred_entry {
	struct deferred_set *ds;
	unsigned count;
	struct list_head work_items;
};

struct deferred_set {
	spinlock_t lock;
	unsigned current_entry;
	unsigned sweeper;
	struct deferred_entry entries[DEFERRED_SET_SIZE];
};

static void ds_init(struct deferred_set *ds)
{
	int i;

	spin_lock_init(&ds->lock);
	ds->current_entry = 0;
	ds->sweeper = 0;
	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
		ds->entries[i].ds = ds;
		ds->entries[i].count = 0;
		INIT_LIST_HEAD(&ds->entries[i].work_items);
	}
}

static struct deferred_entry *ds_inc(struct deferred_set *ds)
{
	unsigned long flags;
	struct deferred_entry *entry;

	spin_lock_irqsave(&ds->lock, flags);
	entry = ds->entries + ds->current_entry;
	entry->count++;
	spin_unlock_irqrestore(&ds->lock, flags);

	return entry;
}

static unsigned ds_next(unsigned index)
{
	return (index + 1) % DEFERRED_SET_SIZE;
}

static void __sweep(struct deferred_set *ds, struct list_head *head)
{
	while ((ds->sweeper != ds->current_entry) &&
	       !ds->entries[ds->sweeper].count) {
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
		ds->sweeper = ds_next(ds->sweeper);
	}

	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
}

static void ds_dec(struct deferred_entry *entry, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&entry->ds->lock, flags);
	BUG_ON(!entry->count);
	--entry->count;
	__sweep(entry->ds, head);
	spin_unlock_irqrestore(&entry->ds->lock, flags);
}

/*
 * Returns 1 if deferred or 0 if no pending items to delay job.
 */
static int ds_add_work(struct deferred_set *ds, struct list_head *work)
{
	int r = 1;
	unsigned long flags;
	unsigned next_entry;

	spin_lock_irqsave(&ds->lock, flags);
	if ((ds->sweeper == ds->current_entry) &&
	    !ds->entries[ds->current_entry].count)
		r = 0;
	else {
		list_add(work, &ds->entries[ds->current_entry].work_items);
		next_entry = ds_next(ds->current_entry);
		if (!ds->entries[next_entry].count)
			ds->current_entry = next_entry;
	}
	spin_unlock_irqrestore(&ds->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/
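
/*
 * Editor's illustrative sketch (not in the original): how the deferred set
 * is used further down this file.  The flow shown here is an assumption
 * drawn from the callers (schedule_copy() and the endio path):
 *
 *	entry = ds_inc(ds);		 * io to a shared block starts
 *	...				 * io completes
 *	ds_dec(entry, &work);		 * may sweep quiesced work onto &work
 *
 *	if (!ds_add_work(ds, &m->list))	 * nothing in flight: proceed now
 *		m->quiesced = 1;	 * else m waits until ds_dec() sweeps it
 */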

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct new_mapping;

struct pool_features {
	unsigned zero_new_blocks:1;
	unsigned discard_enabled:1;
	unsigned discard_passdown:1;
};

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	uint32_t sectors_per_block;
	unsigned block_shift;
	dm_block_t offset_mask;
	dm_block_t low_water_blocks;

	struct pool_features pf;
	unsigned low_water_triggered:1;	/* A dm event has been sent */
	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

	struct bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	unsigned ref_count;
	unsigned long last_commit_jiffies;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;

	struct bio_list retry_on_resume_list;

	struct deferred_set shared_read_ds;
	struct deferred_set all_io_ds;

	struct new_mapping *next_mapping;
	mempool_t *mapping_pool;
	mempool_t *endio_hook_pool;
};

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features pf;
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct endio_hook {
	struct thin_c *tc;
	struct deferred_entry *shared_read_entry;
	struct deferred_entry *all_io_entry;
	struct new_mapping *overwrite_mapping;
};

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, master);
	bio_list_init(master);

	while ((bio = bio_list_pop(&bios))) {
		struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
		if (h->tc == tc)
			bio_endio(bio, DM_ENDIO_REQUEUE);
		else
			bio_list_add(master, bio);
	}
}

static void requeue_io(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	__requeue_bio_list(tc, &pool->deferred_bios);
	__requeue_bio_list(tc, &pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	return bio->bi_sector >> tc->pool->block_shift;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;

	bio->bi_bdev = tc->pool_dev->bdev;
	bio->bi_sector = (block << pool->block_shift) +
		(bio->bi_sector & pool->offset_mask);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}
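
/*
 * Editor's worked example (not in the original): with a 64k block size,
 * sectors_per_block = 128, so block_shift = 7 and offset_mask = 127.
 * A bio at sector 1000 maps to virtual block 1000 >> 7 = 7 at offset
 * 1000 & 127 = 104; if that block is provisioned at data block 3,
 * remap() rewrites bi_sector to (3 << 7) + 104 = 488 on the pool device.
 */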

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	/*
	 * Batch together any FUA/FLUSH bios we find and then issue
	 * a single commit for them in process_deferred_bios().
	 */
	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		spin_lock_irqsave(&pool->lock, flags);
		bio_list_add(&pool->deferred_flush_bios, bio);
		spin_unlock_irqrestore(&pool->lock, flags);
	} else
		generic_make_request(bio);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct new_mapping {
	struct list_head list;

	unsigned quiesced:1;
	unsigned prepared:1;
	unsigned pass_discard:1;

	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct cell *cell, *cell2;
	int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __maybe_add_mapping(struct new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (m->quiesced && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct new_mapping *m = h->overwrite_mapping;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct cell *cell,
		       dm_block_t data_block)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	cell_release(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&tc->pool->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits one particular detainee,
 * a write bio that covers the block and has already been processed.
830 */ 831 static void cell_defer_except(struct thin_c *tc, struct cell *cell) 832 { 833 struct bio_list bios; 834 struct pool *pool = tc->pool; 835 unsigned long flags; 836 837 bio_list_init(&bios); 838 839 spin_lock_irqsave(&pool->lock, flags); 840 cell_release_no_holder(cell, &pool->deferred_bios); 841 spin_unlock_irqrestore(&pool->lock, flags); 842 843 wake_worker(pool); 844 } 845 846 static void process_prepared_mapping(struct new_mapping *m) 847 { 848 struct thin_c *tc = m->tc; 849 struct bio *bio; 850 int r; 851 852 bio = m->bio; 853 if (bio) 854 bio->bi_end_io = m->saved_bi_end_io; 855 856 if (m->err) { 857 cell_error(m->cell); 858 return; 859 } 860 861 /* 862 * Commit the prepared block into the mapping btree. 863 * Any I/O for this block arriving after this point will get 864 * remapped to it directly. 865 */ 866 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 867 if (r) { 868 DMERR("dm_thin_insert_block() failed"); 869 cell_error(m->cell); 870 return; 871 } 872 873 /* 874 * Release any bios held while the block was being provisioned. 875 * If we are processing a write bio that completely covers the block, 876 * we already processed it so can ignore it now when processing 877 * the bios in the cell. 878 */ 879 if (bio) { 880 cell_defer_except(tc, m->cell); 881 bio_endio(bio, 0); 882 } else 883 cell_defer(tc, m->cell, m->data_block); 884 885 list_del(&m->list); 886 mempool_free(m, tc->pool->mapping_pool); 887 } 888 889 static void process_prepared_discard(struct new_mapping *m) 890 { 891 int r; 892 struct thin_c *tc = m->tc; 893 894 r = dm_thin_remove_block(tc->td, m->virt_block); 895 if (r) 896 DMERR("dm_thin_remove_block() failed"); 897 898 /* 899 * Pass the discard down to the underlying device? 900 */ 901 if (m->pass_discard) 902 remap_and_issue(tc, m->bio, m->data_block); 903 else 904 bio_endio(m->bio, 0); 905 906 cell_defer_except(tc, m->cell); 907 cell_defer_except(tc, m->cell2); 908 mempool_free(m, tc->pool->mapping_pool); 909 } 910 911 static void process_prepared(struct pool *pool, struct list_head *head, 912 void (*fn)(struct new_mapping *)) 913 { 914 unsigned long flags; 915 struct list_head maps; 916 struct new_mapping *m, *tmp; 917 918 INIT_LIST_HEAD(&maps); 919 spin_lock_irqsave(&pool->lock, flags); 920 list_splice_init(head, &maps); 921 spin_unlock_irqrestore(&pool->lock, flags); 922 923 list_for_each_entry_safe(m, tmp, &maps, list) 924 fn(m); 925 } 926 927 /* 928 * Deferred bio jobs. 929 */ 930 static int io_overlaps_block(struct pool *pool, struct bio *bio) 931 { 932 return !(bio->bi_sector & pool->offset_mask) && 933 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 934 935 } 936 937 static int io_overwrites_block(struct pool *pool, struct bio *bio) 938 { 939 return (bio_data_dir(bio) == WRITE) && 940 io_overlaps_block(pool, bio); 941 } 942 943 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 944 bio_end_io_t *fn) 945 { 946 *save = bio->bi_end_io; 947 bio->bi_end_io = fn; 948 } 949 950 static int ensure_next_mapping(struct pool *pool) 951 { 952 if (pool->next_mapping) 953 return 0; 954 955 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); 956 957 return pool->next_mapping ? 
0 : -ENOMEM; 958 } 959 960 static struct new_mapping *get_next_mapping(struct pool *pool) 961 { 962 struct new_mapping *r = pool->next_mapping; 963 964 BUG_ON(!pool->next_mapping); 965 966 pool->next_mapping = NULL; 967 968 return r; 969 } 970 971 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 972 struct dm_dev *origin, dm_block_t data_origin, 973 dm_block_t data_dest, 974 struct cell *cell, struct bio *bio) 975 { 976 int r; 977 struct pool *pool = tc->pool; 978 struct new_mapping *m = get_next_mapping(pool); 979 980 INIT_LIST_HEAD(&m->list); 981 m->quiesced = 0; 982 m->prepared = 0; 983 m->tc = tc; 984 m->virt_block = virt_block; 985 m->data_block = data_dest; 986 m->cell = cell; 987 m->err = 0; 988 m->bio = NULL; 989 990 if (!ds_add_work(&pool->shared_read_ds, &m->list)) 991 m->quiesced = 1; 992 993 /* 994 * IO to pool_dev remaps to the pool target's data_dev. 995 * 996 * If the whole block of data is being overwritten, we can issue the 997 * bio immediately. Otherwise we use kcopyd to clone the data first. 998 */ 999 if (io_overwrites_block(pool, bio)) { 1000 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1001 h->overwrite_mapping = m; 1002 m->bio = bio; 1003 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1004 remap_and_issue(tc, bio, data_dest); 1005 } else { 1006 struct dm_io_region from, to; 1007 1008 from.bdev = origin->bdev; 1009 from.sector = data_origin * pool->sectors_per_block; 1010 from.count = pool->sectors_per_block; 1011 1012 to.bdev = tc->pool_dev->bdev; 1013 to.sector = data_dest * pool->sectors_per_block; 1014 to.count = pool->sectors_per_block; 1015 1016 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 1017 0, copy_complete, m); 1018 if (r < 0) { 1019 mempool_free(m, pool->mapping_pool); 1020 DMERR("dm_kcopyd_copy() failed"); 1021 cell_error(cell); 1022 } 1023 } 1024 } 1025 1026 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1027 dm_block_t data_origin, dm_block_t data_dest, 1028 struct cell *cell, struct bio *bio) 1029 { 1030 schedule_copy(tc, virt_block, tc->pool_dev, 1031 data_origin, data_dest, cell, bio); 1032 } 1033 1034 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1035 dm_block_t data_dest, 1036 struct cell *cell, struct bio *bio) 1037 { 1038 schedule_copy(tc, virt_block, tc->origin_dev, 1039 virt_block, data_dest, cell, bio); 1040 } 1041 1042 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1043 dm_block_t data_block, struct cell *cell, 1044 struct bio *bio) 1045 { 1046 struct pool *pool = tc->pool; 1047 struct new_mapping *m = get_next_mapping(pool); 1048 1049 INIT_LIST_HEAD(&m->list); 1050 m->quiesced = 1; 1051 m->prepared = 0; 1052 m->tc = tc; 1053 m->virt_block = virt_block; 1054 m->data_block = data_block; 1055 m->cell = cell; 1056 m->err = 0; 1057 m->bio = NULL; 1058 1059 /* 1060 * If the whole block of data is being overwritten or we are not 1061 * zeroing pre-existing data, we can issue the bio immediately. 1062 * Otherwise we use kcopyd to zero the data first. 
1063 */ 1064 if (!pool->pf.zero_new_blocks) 1065 process_prepared_mapping(m); 1066 1067 else if (io_overwrites_block(pool, bio)) { 1068 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1069 h->overwrite_mapping = m; 1070 m->bio = bio; 1071 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1072 remap_and_issue(tc, bio, data_block); 1073 1074 } else { 1075 int r; 1076 struct dm_io_region to; 1077 1078 to.bdev = tc->pool_dev->bdev; 1079 to.sector = data_block * pool->sectors_per_block; 1080 to.count = pool->sectors_per_block; 1081 1082 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 1083 if (r < 0) { 1084 mempool_free(m, pool->mapping_pool); 1085 DMERR("dm_kcopyd_zero() failed"); 1086 cell_error(cell); 1087 } 1088 } 1089 } 1090 1091 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1092 { 1093 int r; 1094 dm_block_t free_blocks; 1095 unsigned long flags; 1096 struct pool *pool = tc->pool; 1097 1098 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1099 if (r) 1100 return r; 1101 1102 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1103 DMWARN("%s: reached low water mark, sending event.", 1104 dm_device_name(pool->pool_md)); 1105 spin_lock_irqsave(&pool->lock, flags); 1106 pool->low_water_triggered = 1; 1107 spin_unlock_irqrestore(&pool->lock, flags); 1108 dm_table_event(pool->ti->table); 1109 } 1110 1111 if (!free_blocks) { 1112 if (pool->no_free_space) 1113 return -ENOSPC; 1114 else { 1115 /* 1116 * Try to commit to see if that will free up some 1117 * more space. 1118 */ 1119 r = dm_pool_commit_metadata(pool->pmd); 1120 if (r) { 1121 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1122 __func__, r); 1123 return r; 1124 } 1125 1126 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1127 if (r) 1128 return r; 1129 1130 /* 1131 * If we still have no space we set a flag to avoid 1132 * doing all this checking and return -ENOSPC. 1133 */ 1134 if (!free_blocks) { 1135 DMWARN("%s: no free space available.", 1136 dm_device_name(pool->pool_md)); 1137 spin_lock_irqsave(&pool->lock, flags); 1138 pool->no_free_space = 1; 1139 spin_unlock_irqrestore(&pool->lock, flags); 1140 return -ENOSPC; 1141 } 1142 } 1143 } 1144 1145 r = dm_pool_alloc_data_block(pool->pmd, result); 1146 if (r) 1147 return r; 1148 1149 return 0; 1150 } 1151 1152 /* 1153 * If we have run out of space, queue bios until the device is 1154 * resumed, presumably after having been reloaded with more space. 
1155 */ 1156 static void retry_on_resume(struct bio *bio) 1157 { 1158 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1159 struct thin_c *tc = h->tc; 1160 struct pool *pool = tc->pool; 1161 unsigned long flags; 1162 1163 spin_lock_irqsave(&pool->lock, flags); 1164 bio_list_add(&pool->retry_on_resume_list, bio); 1165 spin_unlock_irqrestore(&pool->lock, flags); 1166 } 1167 1168 static void no_space(struct cell *cell) 1169 { 1170 struct bio *bio; 1171 struct bio_list bios; 1172 1173 bio_list_init(&bios); 1174 cell_release(cell, &bios); 1175 1176 while ((bio = bio_list_pop(&bios))) 1177 retry_on_resume(bio); 1178 } 1179 1180 static void process_discard(struct thin_c *tc, struct bio *bio) 1181 { 1182 int r; 1183 unsigned long flags; 1184 struct pool *pool = tc->pool; 1185 struct cell *cell, *cell2; 1186 struct cell_key key, key2; 1187 dm_block_t block = get_bio_block(tc, bio); 1188 struct dm_thin_lookup_result lookup_result; 1189 struct new_mapping *m; 1190 1191 build_virtual_key(tc->td, block, &key); 1192 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1193 return; 1194 1195 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1196 switch (r) { 1197 case 0: 1198 /* 1199 * Check nobody is fiddling with this pool block. This can 1200 * happen if someone's in the process of breaking sharing 1201 * on this block. 1202 */ 1203 build_data_key(tc->td, lookup_result.block, &key2); 1204 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1205 cell_release_singleton(cell, bio); 1206 break; 1207 } 1208 1209 if (io_overlaps_block(pool, bio)) { 1210 /* 1211 * IO may still be going to the destination block. We must 1212 * quiesce before we can do the removal. 1213 */ 1214 m = get_next_mapping(pool); 1215 m->tc = tc; 1216 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; 1217 m->virt_block = block; 1218 m->data_block = lookup_result.block; 1219 m->cell = cell; 1220 m->cell2 = cell2; 1221 m->err = 0; 1222 m->bio = bio; 1223 1224 if (!ds_add_work(&pool->all_io_ds, &m->list)) { 1225 spin_lock_irqsave(&pool->lock, flags); 1226 list_add(&m->list, &pool->prepared_discards); 1227 spin_unlock_irqrestore(&pool->lock, flags); 1228 wake_worker(pool); 1229 } 1230 } else { 1231 /* 1232 * This path is hit if people are ignoring 1233 * limits->discard_granularity. It ignores any 1234 * part of the discard that is in a subsequent 1235 * block. 1236 */ 1237 sector_t offset = bio->bi_sector - (block << pool->block_shift); 1238 unsigned remaining = (pool->sectors_per_block - offset) << 9; 1239 bio->bi_size = min(bio->bi_size, remaining); 1240 1241 cell_release_singleton(cell, bio); 1242 cell_release_singleton(cell2, bio); 1243 remap_and_issue(tc, bio, lookup_result.block); 1244 } 1245 break; 1246 1247 case -ENODATA: 1248 /* 1249 * It isn't provisioned, just forget it. 
1250 */ 1251 cell_release_singleton(cell, bio); 1252 bio_endio(bio, 0); 1253 break; 1254 1255 default: 1256 DMERR("discard: find block unexpectedly returned %d", r); 1257 cell_release_singleton(cell, bio); 1258 bio_io_error(bio); 1259 break; 1260 } 1261 } 1262 1263 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1264 struct cell_key *key, 1265 struct dm_thin_lookup_result *lookup_result, 1266 struct cell *cell) 1267 { 1268 int r; 1269 dm_block_t data_block; 1270 1271 r = alloc_data_block(tc, &data_block); 1272 switch (r) { 1273 case 0: 1274 schedule_internal_copy(tc, block, lookup_result->block, 1275 data_block, cell, bio); 1276 break; 1277 1278 case -ENOSPC: 1279 no_space(cell); 1280 break; 1281 1282 default: 1283 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1284 cell_error(cell); 1285 break; 1286 } 1287 } 1288 1289 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1290 dm_block_t block, 1291 struct dm_thin_lookup_result *lookup_result) 1292 { 1293 struct cell *cell; 1294 struct pool *pool = tc->pool; 1295 struct cell_key key; 1296 1297 /* 1298 * If cell is already occupied, then sharing is already in the process 1299 * of being broken so we have nothing further to do here. 1300 */ 1301 build_data_key(tc->td, lookup_result->block, &key); 1302 if (bio_detain(pool->prison, &key, bio, &cell)) 1303 return; 1304 1305 if (bio_data_dir(bio) == WRITE) 1306 break_sharing(tc, bio, block, &key, lookup_result, cell); 1307 else { 1308 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1309 1310 h->shared_read_entry = ds_inc(&pool->shared_read_ds); 1311 1312 cell_release_singleton(cell, bio); 1313 remap_and_issue(tc, bio, lookup_result->block); 1314 } 1315 } 1316 1317 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1318 struct cell *cell) 1319 { 1320 int r; 1321 dm_block_t data_block; 1322 1323 /* 1324 * Remap empty bios (flushes) immediately, without provisioning. 1325 */ 1326 if (!bio->bi_size) { 1327 cell_release_singleton(cell, bio); 1328 remap_and_issue(tc, bio, 0); 1329 return; 1330 } 1331 1332 /* 1333 * Fill read bios with zeroes and complete them immediately. 1334 */ 1335 if (bio_data_dir(bio) == READ) { 1336 zero_fill_bio(bio); 1337 cell_release_singleton(cell, bio); 1338 bio_endio(bio, 0); 1339 return; 1340 } 1341 1342 r = alloc_data_block(tc, &data_block); 1343 switch (r) { 1344 case 0: 1345 if (tc->origin_dev) 1346 schedule_external_copy(tc, block, data_block, cell, bio); 1347 else 1348 schedule_zero(tc, block, data_block, cell, bio); 1349 break; 1350 1351 case -ENOSPC: 1352 no_space(cell); 1353 break; 1354 1355 default: 1356 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1357 cell_error(cell); 1358 break; 1359 } 1360 } 1361 1362 static void process_bio(struct thin_c *tc, struct bio *bio) 1363 { 1364 int r; 1365 dm_block_t block = get_bio_block(tc, bio); 1366 struct cell *cell; 1367 struct cell_key key; 1368 struct dm_thin_lookup_result lookup_result; 1369 1370 /* 1371 * If cell is already occupied, then the block is already 1372 * being provisioned so we have nothing further to do here. 1373 */ 1374 build_virtual_key(tc->td, block, &key); 1375 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1376 return; 1377 1378 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1379 switch (r) { 1380 case 0: 1381 /* 1382 * We can release this cell now. This thread is the only 1383 * one that puts bios into a cell, and we know there were 1384 * no preceding bios. 
1385 */ 1386 /* 1387 * TODO: this will probably have to change when discard goes 1388 * back in. 1389 */ 1390 cell_release_singleton(cell, bio); 1391 1392 if (lookup_result.shared) 1393 process_shared_bio(tc, bio, block, &lookup_result); 1394 else 1395 remap_and_issue(tc, bio, lookup_result.block); 1396 break; 1397 1398 case -ENODATA: 1399 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1400 cell_release_singleton(cell, bio); 1401 remap_to_origin_and_issue(tc, bio); 1402 } else 1403 provision_block(tc, bio, block, cell); 1404 break; 1405 1406 default: 1407 DMERR("dm_thin_find_block() failed, error = %d", r); 1408 cell_release_singleton(cell, bio); 1409 bio_io_error(bio); 1410 break; 1411 } 1412 } 1413 1414 static int need_commit_due_to_time(struct pool *pool) 1415 { 1416 return jiffies < pool->last_commit_jiffies || 1417 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1418 } 1419 1420 static void process_deferred_bios(struct pool *pool) 1421 { 1422 unsigned long flags; 1423 struct bio *bio; 1424 struct bio_list bios; 1425 int r; 1426 1427 bio_list_init(&bios); 1428 1429 spin_lock_irqsave(&pool->lock, flags); 1430 bio_list_merge(&bios, &pool->deferred_bios); 1431 bio_list_init(&pool->deferred_bios); 1432 spin_unlock_irqrestore(&pool->lock, flags); 1433 1434 while ((bio = bio_list_pop(&bios))) { 1435 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1436 struct thin_c *tc = h->tc; 1437 1438 /* 1439 * If we've got no free new_mapping structs, and processing 1440 * this bio might require one, we pause until there are some 1441 * prepared mappings to process. 1442 */ 1443 if (ensure_next_mapping(pool)) { 1444 spin_lock_irqsave(&pool->lock, flags); 1445 bio_list_merge(&pool->deferred_bios, &bios); 1446 spin_unlock_irqrestore(&pool->lock, flags); 1447 1448 break; 1449 } 1450 1451 if (bio->bi_rw & REQ_DISCARD) 1452 process_discard(tc, bio); 1453 else 1454 process_bio(tc, bio); 1455 } 1456 1457 /* 1458 * If there are any deferred flush bios, we must commit 1459 * the metadata before issuing them. 1460 */ 1461 bio_list_init(&bios); 1462 spin_lock_irqsave(&pool->lock, flags); 1463 bio_list_merge(&bios, &pool->deferred_flush_bios); 1464 bio_list_init(&pool->deferred_flush_bios); 1465 spin_unlock_irqrestore(&pool->lock, flags); 1466 1467 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1468 return; 1469 1470 r = dm_pool_commit_metadata(pool->pmd); 1471 if (r) { 1472 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1473 __func__, r); 1474 while ((bio = bio_list_pop(&bios))) 1475 bio_io_error(bio); 1476 return; 1477 } 1478 pool->last_commit_jiffies = jiffies; 1479 1480 while ((bio = bio_list_pop(&bios))) 1481 generic_make_request(bio); 1482 } 1483 1484 static void do_worker(struct work_struct *ws) 1485 { 1486 struct pool *pool = container_of(ws, struct pool, worker); 1487 1488 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); 1489 process_prepared(pool, &pool->prepared_discards, process_prepared_discard); 1490 process_deferred_bios(pool); 1491 } 1492 1493 /* 1494 * We want to commit periodically so that not too much 1495 * unwritten data builds up. 1496 */ 1497 static void do_waker(struct work_struct *ws) 1498 { 1499 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 1500 wake_worker(pool); 1501 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 1502 } 1503 1504 /*----------------------------------------------------------------*/ 1505 1506 /* 1507 * Mapping functions. 
1508 */ 1509 1510 /* 1511 * Called only while mapping a thin bio to hand it over to the workqueue. 1512 */ 1513 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 1514 { 1515 unsigned long flags; 1516 struct pool *pool = tc->pool; 1517 1518 spin_lock_irqsave(&pool->lock, flags); 1519 bio_list_add(&pool->deferred_bios, bio); 1520 spin_unlock_irqrestore(&pool->lock, flags); 1521 1522 wake_worker(pool); 1523 } 1524 1525 static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) 1526 { 1527 struct pool *pool = tc->pool; 1528 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); 1529 1530 h->tc = tc; 1531 h->shared_read_entry = NULL; 1532 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); 1533 h->overwrite_mapping = NULL; 1534 1535 return h; 1536 } 1537 1538 /* 1539 * Non-blocking function called from the thin target's map function. 1540 */ 1541 static int thin_bio_map(struct dm_target *ti, struct bio *bio, 1542 union map_info *map_context) 1543 { 1544 int r; 1545 struct thin_c *tc = ti->private; 1546 dm_block_t block = get_bio_block(tc, bio); 1547 struct dm_thin_device *td = tc->td; 1548 struct dm_thin_lookup_result result; 1549 1550 map_context->ptr = thin_hook_bio(tc, bio); 1551 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1552 thin_defer_bio(tc, bio); 1553 return DM_MAPIO_SUBMITTED; 1554 } 1555 1556 r = dm_thin_find_block(td, block, 0, &result); 1557 1558 /* 1559 * Note that we defer readahead too. 1560 */ 1561 switch (r) { 1562 case 0: 1563 if (unlikely(result.shared)) { 1564 /* 1565 * We have a race condition here between the 1566 * result.shared value returned by the lookup and 1567 * snapshot creation, which may cause new 1568 * sharing. 1569 * 1570 * To avoid this always quiesce the origin before 1571 * taking the snap. You want to do this anyway to 1572 * ensure a consistent application view 1573 * (i.e. lockfs). 1574 * 1575 * More distant ancestors are irrelevant. The 1576 * shared flag will be set in their case. 1577 */ 1578 thin_defer_bio(tc, bio); 1579 r = DM_MAPIO_SUBMITTED; 1580 } else { 1581 remap(tc, bio, result.block); 1582 r = DM_MAPIO_REMAPPED; 1583 } 1584 break; 1585 1586 case -ENODATA: 1587 /* 1588 * In future, the failed dm_thin_find_block above could 1589 * provide the hint to load the metadata into cache. 
1590 */ 1591 case -EWOULDBLOCK: 1592 thin_defer_bio(tc, bio); 1593 r = DM_MAPIO_SUBMITTED; 1594 break; 1595 } 1596 1597 return r; 1598 } 1599 1600 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1601 { 1602 int r; 1603 unsigned long flags; 1604 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1605 1606 spin_lock_irqsave(&pt->pool->lock, flags); 1607 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1608 spin_unlock_irqrestore(&pt->pool->lock, flags); 1609 1610 if (!r) { 1611 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1612 r = bdi_congested(&q->backing_dev_info, bdi_bits); 1613 } 1614 1615 return r; 1616 } 1617 1618 static void __requeue_bios(struct pool *pool) 1619 { 1620 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1621 bio_list_init(&pool->retry_on_resume_list); 1622 } 1623 1624 /*---------------------------------------------------------------- 1625 * Binding of control targets to a pool object 1626 *--------------------------------------------------------------*/ 1627 static int bind_control_target(struct pool *pool, struct dm_target *ti) 1628 { 1629 struct pool_c *pt = ti->private; 1630 1631 pool->ti = ti; 1632 pool->low_water_blocks = pt->low_water_blocks; 1633 pool->pf = pt->pf; 1634 1635 /* 1636 * If discard_passdown was enabled verify that the data device 1637 * supports discards. Disable discard_passdown if not; otherwise 1638 * -EOPNOTSUPP will be returned. 1639 */ 1640 if (pt->pf.discard_passdown) { 1641 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1642 if (!q || !blk_queue_discard(q)) { 1643 char buf[BDEVNAME_SIZE]; 1644 DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", 1645 bdevname(pt->data_dev->bdev, buf)); 1646 pool->pf.discard_passdown = 0; 1647 } 1648 } 1649 1650 return 0; 1651 } 1652 1653 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 1654 { 1655 if (pool->ti == ti) 1656 pool->ti = NULL; 1657 } 1658 1659 /*---------------------------------------------------------------- 1660 * Pool creation 1661 *--------------------------------------------------------------*/ 1662 /* Initialize pool features. 
*/ 1663 static void pool_features_init(struct pool_features *pf) 1664 { 1665 pf->zero_new_blocks = 1; 1666 pf->discard_enabled = 1; 1667 pf->discard_passdown = 1; 1668 } 1669 1670 static void __pool_destroy(struct pool *pool) 1671 { 1672 __pool_table_remove(pool); 1673 1674 if (dm_pool_metadata_close(pool->pmd) < 0) 1675 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1676 1677 prison_destroy(pool->prison); 1678 dm_kcopyd_client_destroy(pool->copier); 1679 1680 if (pool->wq) 1681 destroy_workqueue(pool->wq); 1682 1683 if (pool->next_mapping) 1684 mempool_free(pool->next_mapping, pool->mapping_pool); 1685 mempool_destroy(pool->mapping_pool); 1686 mempool_destroy(pool->endio_hook_pool); 1687 kfree(pool); 1688 } 1689 1690 static struct pool *pool_create(struct mapped_device *pool_md, 1691 struct block_device *metadata_dev, 1692 unsigned long block_size, char **error) 1693 { 1694 int r; 1695 void *err_p; 1696 struct pool *pool; 1697 struct dm_pool_metadata *pmd; 1698 1699 pmd = dm_pool_metadata_open(metadata_dev, block_size); 1700 if (IS_ERR(pmd)) { 1701 *error = "Error creating metadata object"; 1702 return (struct pool *)pmd; 1703 } 1704 1705 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 1706 if (!pool) { 1707 *error = "Error allocating memory for pool"; 1708 err_p = ERR_PTR(-ENOMEM); 1709 goto bad_pool; 1710 } 1711 1712 pool->pmd = pmd; 1713 pool->sectors_per_block = block_size; 1714 pool->block_shift = ffs(block_size) - 1; 1715 pool->offset_mask = block_size - 1; 1716 pool->low_water_blocks = 0; 1717 pool_features_init(&pool->pf); 1718 pool->prison = prison_create(PRISON_CELLS); 1719 if (!pool->prison) { 1720 *error = "Error creating pool's bio prison"; 1721 err_p = ERR_PTR(-ENOMEM); 1722 goto bad_prison; 1723 } 1724 1725 pool->copier = dm_kcopyd_client_create(); 1726 if (IS_ERR(pool->copier)) { 1727 r = PTR_ERR(pool->copier); 1728 *error = "Error creating pool's kcopyd client"; 1729 err_p = ERR_PTR(r); 1730 goto bad_kcopyd_client; 1731 } 1732 1733 /* 1734 * Create singlethreaded workqueue that will service all devices 1735 * that use this metadata. 
1736 */ 1737 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 1738 if (!pool->wq) { 1739 *error = "Error creating pool's workqueue"; 1740 err_p = ERR_PTR(-ENOMEM); 1741 goto bad_wq; 1742 } 1743 1744 INIT_WORK(&pool->worker, do_worker); 1745 INIT_DELAYED_WORK(&pool->waker, do_waker); 1746 spin_lock_init(&pool->lock); 1747 bio_list_init(&pool->deferred_bios); 1748 bio_list_init(&pool->deferred_flush_bios); 1749 INIT_LIST_HEAD(&pool->prepared_mappings); 1750 INIT_LIST_HEAD(&pool->prepared_discards); 1751 pool->low_water_triggered = 0; 1752 pool->no_free_space = 0; 1753 bio_list_init(&pool->retry_on_resume_list); 1754 ds_init(&pool->shared_read_ds); 1755 ds_init(&pool->all_io_ds); 1756 1757 pool->next_mapping = NULL; 1758 pool->mapping_pool = 1759 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); 1760 if (!pool->mapping_pool) { 1761 *error = "Error creating pool's mapping mempool"; 1762 err_p = ERR_PTR(-ENOMEM); 1763 goto bad_mapping_pool; 1764 } 1765 1766 pool->endio_hook_pool = 1767 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); 1768 if (!pool->endio_hook_pool) { 1769 *error = "Error creating pool's endio_hook mempool"; 1770 err_p = ERR_PTR(-ENOMEM); 1771 goto bad_endio_hook_pool; 1772 } 1773 pool->ref_count = 1; 1774 pool->last_commit_jiffies = jiffies; 1775 pool->pool_md = pool_md; 1776 pool->md_dev = metadata_dev; 1777 __pool_table_insert(pool); 1778 1779 return pool; 1780 1781 bad_endio_hook_pool: 1782 mempool_destroy(pool->mapping_pool); 1783 bad_mapping_pool: 1784 destroy_workqueue(pool->wq); 1785 bad_wq: 1786 dm_kcopyd_client_destroy(pool->copier); 1787 bad_kcopyd_client: 1788 prison_destroy(pool->prison); 1789 bad_prison: 1790 kfree(pool); 1791 bad_pool: 1792 if (dm_pool_metadata_close(pmd)) 1793 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1794 1795 return err_p; 1796 } 1797 1798 static void __pool_inc(struct pool *pool) 1799 { 1800 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1801 pool->ref_count++; 1802 } 1803 1804 static void __pool_dec(struct pool *pool) 1805 { 1806 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1807 BUG_ON(!pool->ref_count); 1808 if (!--pool->ref_count) 1809 __pool_destroy(pool); 1810 } 1811 1812 static struct pool *__pool_find(struct mapped_device *pool_md, 1813 struct block_device *metadata_dev, 1814 unsigned long block_size, char **error, 1815 int *created) 1816 { 1817 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1818 1819 if (pool) { 1820 if (pool->pool_md != pool_md) 1821 return ERR_PTR(-EBUSY); 1822 __pool_inc(pool); 1823 1824 } else { 1825 pool = __pool_table_lookup(pool_md); 1826 if (pool) { 1827 if (pool->md_dev != metadata_dev) 1828 return ERR_PTR(-EINVAL); 1829 __pool_inc(pool); 1830 1831 } else { 1832 pool = pool_create(pool_md, metadata_dev, block_size, error); 1833 *created = 1; 1834 } 1835 } 1836 1837 return pool; 1838 } 1839 1840 /*---------------------------------------------------------------- 1841 * Pool target methods 1842 *--------------------------------------------------------------*/ 1843 static void pool_dtr(struct dm_target *ti) 1844 { 1845 struct pool_c *pt = ti->private; 1846 1847 mutex_lock(&dm_thin_pool_table.mutex); 1848 1849 unbind_control_target(pt->pool, ti); 1850 __pool_dec(pt->pool); 1851 dm_put_device(ti, pt->metadata_dev); 1852 dm_put_device(ti, pt->data_dev); 1853 kfree(pt); 1854 1855 mutex_unlock(&dm_thin_pool_table.mutex); 1856 } 1857 1858 static int parse_pool_features(struct dm_arg_set *as, 
struct pool_features *pf, 1859 struct dm_target *ti) 1860 { 1861 int r; 1862 unsigned argc; 1863 const char *arg_name; 1864 1865 static struct dm_arg _args[] = { 1866 {0, 3, "Invalid number of pool feature arguments"}, 1867 }; 1868 1869 /* 1870 * No feature arguments supplied. 1871 */ 1872 if (!as->argc) 1873 return 0; 1874 1875 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1876 if (r) 1877 return -EINVAL; 1878 1879 while (argc && !r) { 1880 arg_name = dm_shift_arg(as); 1881 argc--; 1882 1883 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1884 pf->zero_new_blocks = 0; 1885 continue; 1886 } else if (!strcasecmp(arg_name, "ignore_discard")) { 1887 pf->discard_enabled = 0; 1888 continue; 1889 } else if (!strcasecmp(arg_name, "no_discard_passdown")) { 1890 pf->discard_passdown = 0; 1891 continue; 1892 } 1893 1894 ti->error = "Unrecognised pool feature requested"; 1895 r = -EINVAL; 1896 } 1897 1898 return r; 1899 } 1900 1901 /* 1902 * thin-pool <metadata dev> <data dev> 1903 * <data block size (sectors)> 1904 * <low water mark (blocks)> 1905 * [<#feature args> [<arg>]*] 1906 * 1907 * Optional feature arguments are: 1908 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1909 * ignore_discard: disable discard 1910 * no_discard_passdown: don't pass discards down to the data device 1911 */ 1912 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1913 { 1914 int r, pool_created = 0; 1915 struct pool_c *pt; 1916 struct pool *pool; 1917 struct pool_features pf; 1918 struct dm_arg_set as; 1919 struct dm_dev *data_dev; 1920 unsigned long block_size; 1921 dm_block_t low_water_blocks; 1922 struct dm_dev *metadata_dev; 1923 sector_t metadata_dev_size; 1924 char b[BDEVNAME_SIZE]; 1925 1926 /* 1927 * FIXME Remove validation from scope of lock. 1928 */ 1929 mutex_lock(&dm_thin_pool_table.mutex); 1930 1931 if (argc < 4) { 1932 ti->error = "Invalid argument count"; 1933 r = -EINVAL; 1934 goto out_unlock; 1935 } 1936 as.argc = argc; 1937 as.argv = argv; 1938 1939 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); 1940 if (r) { 1941 ti->error = "Error opening metadata block device"; 1942 goto out_unlock; 1943 } 1944 1945 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1946 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 1947 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1948 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1949 1950 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1951 if (r) { 1952 ti->error = "Error getting data device"; 1953 goto out_metadata; 1954 } 1955 1956 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 1957 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1958 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 1959 !is_power_of_2(block_size)) { 1960 ti->error = "Invalid block size"; 1961 r = -EINVAL; 1962 goto out; 1963 } 1964 1965 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 1966 ti->error = "Invalid low water mark"; 1967 r = -EINVAL; 1968 goto out; 1969 } 1970 1971 /* 1972 * Set default pool features. 
1973 */ 1974 pool_features_init(&pf); 1975 1976 dm_consume_args(&as, 4); 1977 r = parse_pool_features(&as, &pf, ti); 1978 if (r) 1979 goto out; 1980 1981 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 1982 if (!pt) { 1983 r = -ENOMEM; 1984 goto out; 1985 } 1986 1987 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1988 block_size, &ti->error, &pool_created); 1989 if (IS_ERR(pool)) { 1990 r = PTR_ERR(pool); 1991 goto out_free_pt; 1992 } 1993 1994 /* 1995 * 'pool_created' reflects whether this is the first table load. 1996 * Top level discard support is not allowed to be changed after 1997 * initial load. This would require a pool reload to trigger thin 1998 * device changes. 1999 */ 2000 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 2001 ti->error = "Discard support cannot be disabled once enabled"; 2002 r = -EINVAL; 2003 goto out_flags_changed; 2004 } 2005 2006 pt->pool = pool; 2007 pt->ti = ti; 2008 pt->metadata_dev = metadata_dev; 2009 pt->data_dev = data_dev; 2010 pt->low_water_blocks = low_water_blocks; 2011 pt->pf = pf; 2012 ti->num_flush_requests = 1; 2013 /* 2014 * Only need to enable discards if the pool should pass 2015 * them down to the data device. The thin device's discard 2016 * processing will cause mappings to be removed from the btree. 2017 */ 2018 if (pf.discard_enabled && pf.discard_passdown) { 2019 ti->num_discard_requests = 1; 2020 /* 2021 * Setting 'discards_supported' circumvents the normal 2022 * stacking of discard limits (this keeps the pool and 2023 * thin devices' discard limits consistent). 2024 */ 2025 ti->discards_supported = 1; 2026 } 2027 ti->private = pt; 2028 2029 pt->callbacks.congested_fn = pool_is_congested; 2030 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2031 2032 mutex_unlock(&dm_thin_pool_table.mutex); 2033 2034 return 0; 2035 2036 out_flags_changed: 2037 __pool_dec(pool); 2038 out_free_pt: 2039 kfree(pt); 2040 out: 2041 dm_put_device(ti, data_dev); 2042 out_metadata: 2043 dm_put_device(ti, metadata_dev); 2044 out_unlock: 2045 mutex_unlock(&dm_thin_pool_table.mutex); 2046 2047 return r; 2048 } 2049 2050 static int pool_map(struct dm_target *ti, struct bio *bio, 2051 union map_info *map_context) 2052 { 2053 int r; 2054 struct pool_c *pt = ti->private; 2055 struct pool *pool = pt->pool; 2056 unsigned long flags; 2057 2058 /* 2059 * As this is a singleton target, ti->begin is always zero. 2060 */ 2061 spin_lock_irqsave(&pool->lock, flags); 2062 bio->bi_bdev = pt->data_dev->bdev; 2063 r = DM_MAPIO_REMAPPED; 2064 spin_unlock_irqrestore(&pool->lock, flags); 2065 2066 return r; 2067 } 2068 2069 /* 2070 * Retrieves the number of blocks of the data device from 2071 * the superblock and compares it to the actual device size, 2072 * thus resizing the data device in case it has grown. 2073 * 2074 * This both copes with opening preallocated data devices in the ctr 2075 * being followed by a resume 2076 * -and- 2077 * calling the resume method individually after userspace has 2078 * grown the data device in reaction to a table event. 2079 */ 2080 static int pool_preresume(struct dm_target *ti) 2081 { 2082 int r; 2083 struct pool_c *pt = ti->private; 2084 struct pool *pool = pt->pool; 2085 dm_block_t data_size, sb_data_size; 2086 2087 /* 2088 * Take control of the pool object. 
static int pool_preresume(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	dm_block_t data_size, sb_data_size;

	/*
	 * Take control of the pool object.
	 */
	r = bind_control_target(pool, ti);
	if (r)
		return r;

	data_size = ti->len >> pool->block_shift;
	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
	if (r) {
		DMERR("failed to retrieve data device size");
		return r;
	}

	if (data_size < sb_data_size) {
		DMERR("pool target too small, is %llu blocks (expected %llu)",
		      data_size, sb_data_size);
		return -EINVAL;

	} else if (data_size > sb_data_size) {
		r = dm_pool_resize_data_dev(pool->pmd, data_size);
		if (r) {
			DMERR("failed to resize data device");
			return r;
		}

		r = dm_pool_commit_metadata(pool->pmd);
		if (r) {
			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
			      __func__, r);
			return r;
		}
	}

	return 0;
}

static void pool_resume(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	__requeue_bios(pool);
	spin_unlock_irqrestore(&pool->lock, flags);

	do_waker(&pool->waker.work);
}

static void pool_postsuspend(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	cancel_delayed_work(&pool->waker);
	flush_workqueue(pool->wq);

	r = dm_pool_commit_metadata(pool->pmd);
	if (r < 0) {
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		      __func__, r);
		/* FIXME: invalidate device? error the next FUA or FLUSH bio? */
	}
}

static int check_arg_count(unsigned argc, unsigned args_required)
{
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);
		return -EINVAL;
	}

	return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	    *dev_id <= MAX_DEV_ID)
		return 0;

	if (warning)
		DMWARN("Message received with invalid device id: %s", arg);

	return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_thin(pool->pmd, dev_id);
	if (r) {
		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
		       argv[1]);
		return r;
	}

	return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	dm_thin_id origin_dev_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = read_dev_id(argv[2], &origin_dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
	if (r) {
		DMWARN("Creation of new snapshot %s of device %s failed.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	if (r)
		DMWARN("Deletion of thin device %s failed.", argv[1]);

	return r;
}

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id old_id, new_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
		return -EINVAL;
	}

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
		return -EINVAL;
	}

	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
	if (r) {
		DMWARN("Failed to change transaction id from %s to %s.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 */
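/*
 * Illustrative examples only (device name and ids are hypothetical);
 * messages are sent to the active pool device with dmsetup, e.g.:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 *
 * Device ids must not exceed MAX_DEV_ID (2^24 - 1).
 */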
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

	else
		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	if (!r) {
		r = dm_pool_commit_metadata(pool->pmd);
		if (r)
			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
			      argv[0], r);
	}

	return r;
}

/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 */
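/*
 * For example (hypothetical values): a pool at transaction id 1 that has
 * used 79 of 2048 metadata blocks and 198 of 409600 data blocks, with no
 * held metadata root, reports:
 *
 *   1 79/2048 198/409600 -
 */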
static int pool_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r, count;
	unsigned sz = 0;
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	switch (type) {
	case STATUSTYPE_INFO:
		r = dm_pool_get_metadata_transaction_id(pool->pmd,
							&transaction_id);
		if (r)
			return r;

		r = dm_pool_get_free_metadata_block_count(pool->pmd,
							  &nr_free_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd,
						 &nr_free_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
		if (r)
			return r;

		DMEMIT("%llu %llu/%llu %llu/%llu ",
		       (unsigned long long)transaction_id,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
		       (unsigned long long)nr_blocks_data);

		if (held_root)
			DMEMIT("%llu", held_root);
		else
			DMEMIT("-");

		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s %lu %llu ",
		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
		       (unsigned long)pool->sectors_per_block,
		       (unsigned long long)pt->low_water_blocks);

		count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
			!pt->pf.discard_passdown;
		DMEMIT("%u ", count);

		if (!pool->pf.zero_new_blocks)
			DMEMIT("skip_block_zeroing ");

		if (!pool->pf.discard_enabled)
			DMEMIT("ignore_discard ");

		if (!pt->pf.discard_passdown)
			DMEMIT("no_discard_passdown ");

		break;
	}

	return 0;
}

static int pool_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct pool_c *pt = ti->private;

	return fn(ti, pt->data_dev, 0, ti->len, data);
}

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
		      struct bio_vec *biovec, int max_size)
{
	struct pool_c *pt = ti->private;
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = pt->data_dev->bdev;

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the pool's data device
	 */
	limits->max_discard_sectors = pool->sectors_per_block;

	/*
	 * This is just a hint, and not enforced.  We have to cope with
	 * bios that overlap 2 blocks.
	 */
	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
}
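/*
 * For example (assuming 128-sector data blocks): max_discard_sectors is
 * 128 (one block, i.e. 64KiB), and discard_granularity is
 * 128 << SECTOR_SHIFT = 65536, since the granularity is expressed in
 * bytes rather than sectors.
 */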
2440 */ 2441 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2442 limits->discard_zeroes_data = pool->pf.zero_new_blocks; 2443 } 2444 2445 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2446 { 2447 struct pool_c *pt = ti->private; 2448 struct pool *pool = pt->pool; 2449 2450 blk_limits_io_min(limits, 0); 2451 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2452 if (pool->pf.discard_enabled) 2453 set_discard_limits(pool, limits); 2454 } 2455 2456 static struct target_type pool_target = { 2457 .name = "thin-pool", 2458 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2459 DM_TARGET_IMMUTABLE, 2460 .version = {1, 1, 0}, 2461 .module = THIS_MODULE, 2462 .ctr = pool_ctr, 2463 .dtr = pool_dtr, 2464 .map = pool_map, 2465 .postsuspend = pool_postsuspend, 2466 .preresume = pool_preresume, 2467 .resume = pool_resume, 2468 .message = pool_message, 2469 .status = pool_status, 2470 .merge = pool_merge, 2471 .iterate_devices = pool_iterate_devices, 2472 .io_hints = pool_io_hints, 2473 }; 2474 2475 /*---------------------------------------------------------------- 2476 * Thin target methods 2477 *--------------------------------------------------------------*/ 2478 static void thin_dtr(struct dm_target *ti) 2479 { 2480 struct thin_c *tc = ti->private; 2481 2482 mutex_lock(&dm_thin_pool_table.mutex); 2483 2484 __pool_dec(tc->pool); 2485 dm_pool_close_thin_device(tc->td); 2486 dm_put_device(ti, tc->pool_dev); 2487 if (tc->origin_dev) 2488 dm_put_device(ti, tc->origin_dev); 2489 kfree(tc); 2490 2491 mutex_unlock(&dm_thin_pool_table.mutex); 2492 } 2493 2494 /* 2495 * Thin target parameters: 2496 * 2497 * <pool_dev> <dev_id> [origin_dev] 2498 * 2499 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2500 * dev_id: the internal device identifier 2501 * origin_dev: a device external to the pool that should act as the origin 2502 * 2503 * If the pool device has discards disabled, they get disabled for the thin 2504 * device as well. 
2505 */ 2506 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2507 { 2508 int r; 2509 struct thin_c *tc; 2510 struct dm_dev *pool_dev, *origin_dev; 2511 struct mapped_device *pool_md; 2512 2513 mutex_lock(&dm_thin_pool_table.mutex); 2514 2515 if (argc != 2 && argc != 3) { 2516 ti->error = "Invalid argument count"; 2517 r = -EINVAL; 2518 goto out_unlock; 2519 } 2520 2521 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); 2522 if (!tc) { 2523 ti->error = "Out of memory"; 2524 r = -ENOMEM; 2525 goto out_unlock; 2526 } 2527 2528 if (argc == 3) { 2529 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 2530 if (r) { 2531 ti->error = "Error opening origin device"; 2532 goto bad_origin_dev; 2533 } 2534 tc->origin_dev = origin_dev; 2535 } 2536 2537 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2538 if (r) { 2539 ti->error = "Error opening pool device"; 2540 goto bad_pool_dev; 2541 } 2542 tc->pool_dev = pool_dev; 2543 2544 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { 2545 ti->error = "Invalid device id"; 2546 r = -EINVAL; 2547 goto bad_common; 2548 } 2549 2550 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); 2551 if (!pool_md) { 2552 ti->error = "Couldn't get pool mapped device"; 2553 r = -EINVAL; 2554 goto bad_common; 2555 } 2556 2557 tc->pool = __pool_table_lookup(pool_md); 2558 if (!tc->pool) { 2559 ti->error = "Couldn't find pool object"; 2560 r = -EINVAL; 2561 goto bad_pool_lookup; 2562 } 2563 __pool_inc(tc->pool); 2564 2565 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 2566 if (r) { 2567 ti->error = "Couldn't open thin internal device"; 2568 goto bad_thin_open; 2569 } 2570 2571 ti->split_io = tc->pool->sectors_per_block; 2572 ti->num_flush_requests = 1; 2573 2574 /* In case the pool supports discards, pass them on. 
static int thin_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r)
				return r;

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0)
				return r;

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			if (tc->origin_dev)
				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
			break;
		}
	}

	return 0;
}

static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	dm_block_t blocks;
	struct thin_c *tc = ti->private;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
	 * we follow a more convoluted path through to the pool's target.
	 */
2709 */ 2710 if (!tc->pool->ti) 2711 return 0; /* nothing is bound */ 2712 2713 blocks = tc->pool->ti->len >> tc->pool->block_shift; 2714 if (blocks) 2715 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); 2716 2717 return 0; 2718 } 2719 2720 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2721 { 2722 struct thin_c *tc = ti->private; 2723 struct pool *pool = tc->pool; 2724 2725 blk_limits_io_min(limits, 0); 2726 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2727 set_discard_limits(pool, limits); 2728 } 2729 2730 static struct target_type thin_target = { 2731 .name = "thin", 2732 .version = {1, 1, 0}, 2733 .module = THIS_MODULE, 2734 .ctr = thin_ctr, 2735 .dtr = thin_dtr, 2736 .map = thin_map, 2737 .end_io = thin_endio, 2738 .postsuspend = thin_postsuspend, 2739 .status = thin_status, 2740 .iterate_devices = thin_iterate_devices, 2741 .io_hints = thin_io_hints, 2742 }; 2743 2744 /*----------------------------------------------------------------*/ 2745 2746 static int __init dm_thin_init(void) 2747 { 2748 int r; 2749 2750 pool_table_init(); 2751 2752 r = dm_register_target(&thin_target); 2753 if (r) 2754 return r; 2755 2756 r = dm_register_target(&pool_target); 2757 if (r) 2758 dm_unregister_target(&thin_target); 2759 2760 return r; 2761 } 2762 2763 static void dm_thin_exit(void) 2764 { 2765 dm_unregister_target(&thin_target); 2766 dm_unregister_target(&pool_target); 2767 } 2768 2769 module_init(dm_thin_init); 2770 module_exit(dm_thin_exit); 2771 2772 MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); 2773 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 2774 MODULE_LICENSE("GPL"); 2775