// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011-2012 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "persistent-data/dm-btree.h"
#include "persistent-data/dm-space-map.h"
#include "persistent-data/dm-space-map-disk.h"
#include "persistent-data/dm-transaction-manager.h"

#include <linux/list.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>

/*
 *--------------------------------------------------------------------------
 * As far as the metadata goes, there is:
 *
 * - A superblock in block zero, taking up fewer than 512 bytes for
 *   atomic writes.
 *
 * - A space map managing the metadata blocks.
 *
 * - A space map managing the data blocks.
 *
 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
 *
 * - A hierarchical btree, with 2 levels, which effectively maps (thin
 *   dev id, virtual block) -> block_time.  Block time is a 64-bit
 *   field holding the time in the low 24 bits, and the block in the top 40
 *   bits.
 *
 * BTrees consist solely of btree_nodes, each of which fills a block.  Some
 * are internal nodes, so their values are __le64s pointing to other
 * nodes.  Leaf nodes can store data of any reasonable size (i.e. much
 * smaller than the block size).  The nodes consist of the header,
 * followed by an array of keys, followed by an array of values.  We have
 * to binary search on the keys, so they're all held together to help the
 * cpu cache.
 *
 * Space maps have 2 btrees:
 *
 * - One maps a uint64_t onto a struct index_entry, which points to a
 *   bitmap block and holds some details about how many free entries there
 *   are etc.
 *
 * - The bitmap blocks have a header (for the checksum).  Then the rest
 *   of the block is pairs of bits, with the meaning being:
 *
 *   0 - ref count is 0
 *   1 - ref count is 1
 *   2 - ref count is 2
 *   3 - ref count is higher than 2
 *
 * - If the count is higher than 2 then the ref count is entered in a
 *   second btree that directly maps the block_address to a uint32_t ref
 *   count.
 *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has a single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size such as 64k the
 * metadata can support data devices that are hundreds of terabytes.
 *
 * The space maps allocate space linearly from front to back.  Space that
 * is freed in a transaction is never recycled within that transaction.
 * To try to avoid fragmenting _free_ space the allocator always goes
 * back and fills in gaps.
 *
 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
 * from the block manager.
 *--------------------------------------------------------------------------
 */

#define DM_MSG_PREFIX   "thin metadata"

#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
#define THIN_VERSION 2
#define SECTOR_TO_BLOCK_SHIFT 3

/*
 * For btree insert:
 *  3 for btree insert +
 *  2 for btree lookup used within the space map
 * For btree remove:
 *  2 for shadow spine +
 *  4 for rebalancing 3 child nodes
 */
#define THIN_MAX_CONCURRENT_LOCKS 6

/* This should be plenty */
#define SPACE_MAP_ROOT_SIZE 128
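/*
 * Illustrative sketch only: the pairs-of-bits encoding described in the
 * layout comment above could be decoded along these lines.  This helper is
 * hypothetical (the real decoding lives in persistent-data/), and the
 * ordering of entries within a byte is an assumption made for clarity:
 *
 *	static unsigned decode_bitmap_entry(const uint8_t *bits, unsigned i)
 *	{
 *		unsigned v = (bits[i / 4] >> ((i % 4) * 2)) & 3;
 *
 *		// 0, 1 and 2 are literal reference counts; 3 means the real
 *		// count is held in the overflow btree keyed by block address.
 *		return v;
 *	}
 */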
/*
 * Little endian on-disk superblock and device details.
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	__u8 uuid[16];
	__le64 magic;
	__le32 version;
	__le32 time;

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, virtual block) -> (data block, time)
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size;		/* In 512-byte sectors. */

	__le32 metadata_block_size;	/* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;

struct disk_device_details {
	__le64 mapped_blocks;
	__le64 transaction_id;		/* When created. */
	__le32 creation_time;
	__le32 snapshotted_time;
} __packed;

struct dm_pool_metadata {
	struct hlist_node hash;

	struct block_device *bdev;
	struct dm_block_manager *bm;
	struct dm_space_map *metadata_sm;
	struct dm_space_map *data_sm;
	struct dm_transaction_manager *tm;
	struct dm_transaction_manager *nb_tm;

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock;
	uint32_t time;
	dm_block_t root;
	dm_block_t details_root;
	struct list_head thin_devices;
	uint64_t trans_id;
	unsigned long flags;
	sector_t data_block_size;

	/*
	 * Pre-commit callback.
	 *
	 * This allows the thin provisioning target to run a callback before
	 * the metadata are committed.
	 */
	dm_pool_pre_commit_fn pre_commit_fn;
	void *pre_commit_context;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
	 * operation possible in this state is the closing of the device.
	 */
	bool fail_io:1;

	/*
	 * Set once a thin-pool has been accessed through one of the interfaces
	 * that imply the pool is in-service (e.g. thin devices created/deleted,
	 * thin-pool message, metadata snapshots, etc).
	 */
	bool in_service:1;

	/*
	 * Reading the space map roots can fail, so we read them into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
struct dm_thin_device {
	struct list_head list;
	struct dm_pool_metadata *pmd;
	dm_thin_id id;

	int open_count;
	bool changed:1;
	bool aborted_with_changes:1;
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};

/*
 *--------------------------------------------------------------
 * superblock validator
 *--------------------------------------------------------------
 */
#define SUPERBLOCK_CSUM_XOR 160774

static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);

	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
						      block_size - sizeof(__le32),
						      SUPERBLOCK_CSUM_XOR));
}

static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);
	__le32 csum_le;

	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
		DMERR("%s failed: blocknr %llu: wanted %llu",
		      __func__, le64_to_cpu(disk_super->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
		DMERR("%s failed: magic %llu: wanted %llu",
		      __func__, le64_to_cpu(disk_super->magic),
		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
					     block_size - sizeof(__le32),
					     SUPERBLOCK_CSUM_XOR));
	if (csum_le != disk_super->csum) {
		DMERR("%s failed: csum %u: wanted %u",
		      __func__, le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
		return -EILSEQ;
	}

	return 0;
}

static struct dm_block_validator sb_validator = {
	.name = "superblock",
	.prepare_for_write = sb_prepare_for_write,
	.check = sb_check
};

/*
 *--------------------------------------------------------------
 * Methods for the btree value types
 *--------------------------------------------------------------
 */
static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
	return (b << 24) | t;
}

static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
	*b = v >> 24;
	*t = v & ((1 << 24) - 1);
}
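/*
 * Worked example of the packing above: pack_block_time(0x123456789a, 7)
 * yields (0x123456789a << 24) | 7 == 0x123456789a000007.
 * unpack_block_time() then recovers the block from the top 40 bits and
 * the time (7) from the low 24 bits.
 */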
/*
 * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
 * possible.  'with_runs' reads contiguous runs of blocks, and calls the
 * given sm function.
 */
typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);

static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned int count, run_fn fn)
{
	uint64_t b, begin, end;
	uint32_t t;
	bool in_run = false;
	unsigned int i;

	for (i = 0; i < count; i++, value_le++) {
		/* We know value_le is 8 byte aligned */
		unpack_block_time(le64_to_cpu(*value_le), &b, &t);

		if (in_run) {
			if (b == end) {
				end++;
			} else {
				fn(sm, begin, end);
				begin = b;
				end = b + 1;
			}
		} else {
			in_run = true;
			begin = b;
			end = b + 1;
		}
	}

	if (in_run)
		fn(sm, begin, end);
}
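/*
 * For example, given packed values whose data blocks are 100, 101, 102,
 * 200 and 201, with_runs() makes exactly two calls: fn(sm, 100, 103) and
 * fn(sm, 200, 202), the end of each run being one past its last block.
 */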
static void data_block_inc(void *context, const void *value_le, unsigned int count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_inc_blocks);
}

static void data_block_dec(void *context, const void *value_le, unsigned int count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_dec_blocks);
}

static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;
	uint64_t b1, b2;
	uint32_t t;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));
	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);

	return b1 == b2;
}

static void subtree_inc(void *context, const void *value, unsigned int count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned int i;

	for (i = 0; i < count; i++, root_le++)
		dm_tm_inc(info->tm, le64_to_cpu(*root_le));
}

static void subtree_dec(void *context, const void *value, unsigned int count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned int i;

	for (i = 0; i < count; i++, root_le++)
		if (dm_btree_del(info, le64_to_cpu(*root_le)))
			DMERR("btree delete failed");
}

static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));

	return v1_le == v2_le;
}

/*----------------------------------------------------------------*/

/*
 * Variant that is used for in-core only changes or code that
 * shouldn't put the pool in service on its own (e.g. commit).
 */
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
	__acquires(pmd->root_lock)
{
	down_write(&pmd->root_lock);
}

static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	if (unlikely(!pmd->in_service))
		pmd->in_service = true;
}

static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
	__releases(pmd->root_lock)
{
	up_write(&pmd->root_lock);
}

/*----------------------------------------------------------------*/

static int superblock_lock_zero(struct dm_pool_metadata *pmd,
				struct dm_block **sblock)
{
	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				     &sb_validator, sblock);
}

static int superblock_lock(struct dm_pool_metadata *pmd,
			   struct dm_block **sblock)
{
	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				&sb_validator, sblock);
}

static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
{
	int r;
	unsigned int i;
	struct dm_block *b;
	__le64 *data_le, zero = cpu_to_le64(0);
	unsigned int block_size = dm_bm_block_size(bm) / sizeof(__le64);

	/*
	 * We can't use a validator here - it may be all zeroes.
	 */
	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
	if (r)
		return r;

	data_le = dm_block_data(b);
	*result = 1;
	for (i = 0; i < block_size; i++) {
		if (data_le[i] != zero) {
			*result = 0;
			break;
		}
	}

	dm_bm_unlock(b);

	return 0;
}

static void __setup_btree_details(struct dm_pool_metadata *pmd)
{
	pmd->info.tm = pmd->tm;
	pmd->info.levels = 2;
	pmd->info.value_type.context = pmd->data_sm;
	pmd->info.value_type.size = sizeof(__le64);
	pmd->info.value_type.inc = data_block_inc;
	pmd->info.value_type.dec = data_block_dec;
	pmd->info.value_type.equal = data_block_equal;

	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
	pmd->nb_info.tm = pmd->nb_tm;

	pmd->tl_info.tm = pmd->tm;
	pmd->tl_info.levels = 1;
	pmd->tl_info.value_type.context = &pmd->bl_info;
	pmd->tl_info.value_type.size = sizeof(__le64);
	pmd->tl_info.value_type.inc = subtree_inc;
	pmd->tl_info.value_type.dec = subtree_dec;
	pmd->tl_info.value_type.equal = subtree_equal;

	pmd->bl_info.tm = pmd->tm;
	pmd->bl_info.levels = 1;
	pmd->bl_info.value_type.context = pmd->data_sm;
	pmd->bl_info.value_type.size = sizeof(__le64);
	pmd->bl_info.value_type.inc = data_block_inc;
	pmd->bl_info.value_type.dec = data_block_dec;
	pmd->bl_info.value_type.equal = data_block_equal;

	pmd->details_info.tm = pmd->tm;
	pmd->details_info.levels = 1;
	pmd->details_info.value_type.context = NULL;
	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
	pmd->details_info.value_type.inc = NULL;
	pmd->details_info.value_type.dec = NULL;
	pmd->details_info.value_type.equal = NULL;
}

static int save_sm_roots(struct dm_pool_metadata *pmd)
{
	int r;
	size_t len;

	r = dm_sm_root_size(pmd->metadata_sm, &len);
	if (r < 0)
		return r;

	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
	if (r < 0)
		return r;

	r = dm_sm_root_size(pmd->data_sm, &len);
	if (r < 0)
		return r;

	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
}
static void copy_sm_roots(struct dm_pool_metadata *pmd,
			  struct thin_disk_superblock *disk)
{
	memcpy(&disk->metadata_space_map_root,
	       &pmd->metadata_space_map_root,
	       sizeof(pmd->metadata_space_map_root));

	memcpy(&disk->data_space_map_root,
	       &pmd->data_space_map_root,
	       sizeof(pmd->data_space_map_root));
}

static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;
	sector_t bdev_size = bdev_nr_sectors(pmd->bdev);

	if (bdev_size > THIN_METADATA_MAX_SECTORS)
		bdev_size = THIN_METADATA_MAX_SECTORS;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock_zero(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->flags = 0;
	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
	disk_super->version = cpu_to_le32(THIN_VERSION);
	disk_super->time = 0;
	disk_super->trans_id = 0;
	disk_super->held_root = 0;

	copy_sm_roots(pmd, disk_super);

	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);

	return dm_tm_commit(pmd->tm, sblock);
}

static int __format_metadata(struct dm_pool_metadata *pmd)
{
	int r;

	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				 &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		pmd->tm = NULL;
		pmd->metadata_sm = NULL;
		DMERR("tm_create_with_sm failed");
		return r;
	}

	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_create failed");
		r = PTR_ERR(pmd->data_sm);
		pmd->data_sm = NULL;
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	__setup_btree_details(pmd);

	r = dm_btree_empty(&pmd->info, &pmd->root);
	if (r < 0)
		goto bad_cleanup_nb_tm;

	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
	if (r < 0) {
		DMERR("couldn't create devices root");
		goto bad_cleanup_nb_tm;
	}

	r = __write_initial_superblock(pmd);
	if (r)
		goto bad_cleanup_nb_tm;

	return 0;

bad_cleanup_nb_tm:
	dm_tm_destroy(pmd->nb_tm);
	pmd->nb_tm = NULL;
bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
	pmd->data_sm = NULL;
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	pmd->tm = NULL;
	dm_sm_destroy(pmd->metadata_sm);
	pmd->metadata_sm = NULL;

	return r;
}
static int __check_incompat_features(struct thin_disk_superblock *disk_super,
				     struct dm_pool_metadata *pmd)
{
	uint32_t features;

	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
	if (features) {
		DMERR("could not access metadata due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	/*
	 * Check for read-only metadata to skip the following RDWR checks.
	 */
	if (bdev_read_only(pmd->bdev))
		return 0;

	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
	if (features) {
		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	return 0;
}

static int __open_metadata(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r < 0) {
		DMERR("couldn't read superblock");
		return r;
	}

	disk_super = dm_block_data(sblock);

	/* Verify the data block size hasn't changed */
	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
		DMERR("changing the data block size (from %u to %llu) is not supported",
		      le32_to_cpu(disk_super->data_block_size),
		      (unsigned long long)pmd->data_block_size);
		r = -EINVAL;
		goto bad_unlock_sblock;
	}

	r = __check_incompat_features(disk_super, pmd);
	if (r < 0)
		goto bad_unlock_sblock;

	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			       disk_super->metadata_space_map_root,
			       sizeof(disk_super->metadata_space_map_root),
			       &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		pmd->tm = NULL;
		pmd->metadata_sm = NULL;
		DMERR("tm_open_with_sm failed");
		goto bad_unlock_sblock;
	}

	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
				       sizeof(disk_super->data_space_map_root));
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_open failed");
		r = PTR_ERR(pmd->data_sm);
		pmd->data_sm = NULL;
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	/*
	 * For the pool metadata opening process, setting the roots here is
	 * redundant because they will be set again in __begin_transaction().
	 * But the dm pool aborting process really needs to get the last
	 * transaction's roots to avoid accessing a broken btree.
	 */
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);

	__setup_btree_details(pmd);
	dm_bm_unlock(sblock);

	return 0;

bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
	pmd->data_sm = NULL;
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	pmd->tm = NULL;
	dm_sm_destroy(pmd->metadata_sm);
	pmd->metadata_sm = NULL;
bad_unlock_sblock:
	dm_bm_unlock(sblock);

	return r;
}
static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
{
	int r, unformatted;

	r = __superblock_all_zeroes(pmd->bm, &unformatted);
	if (r)
		return r;

	if (unformatted)
		return format_device ? __format_metadata(pmd) : -EPERM;

	return __open_metadata(pmd);
}

static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
{
	int r;

	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					  THIN_MAX_CONCURRENT_LOCKS);
	if (IS_ERR(pmd->bm)) {
		DMERR("could not create block manager");
		r = PTR_ERR(pmd->bm);
		pmd->bm = NULL;
		return r;
	}

	r = __open_or_format_metadata(pmd, format_device);
	if (r) {
		dm_block_manager_destroy(pmd->bm);
		pmd->bm = NULL;
	}

	return r;
}

static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
					      bool destroy_bm)
{
	dm_sm_destroy(pmd->data_sm);
	pmd->data_sm = NULL;
	dm_sm_destroy(pmd->metadata_sm);
	pmd->metadata_sm = NULL;
	dm_tm_destroy(pmd->nb_tm);
	pmd->nb_tm = NULL;
	dm_tm_destroy(pmd->tm);
	pmd->tm = NULL;
	if (destroy_bm)
		dm_block_manager_destroy(pmd->bm);
}

static int __begin_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We re-read the superblock every time.  Shouldn't need to do this
	 * really.
	 */
	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	pmd->time = le32_to_cpu(disk_super->time);
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
	pmd->flags = le32_to_cpu(disk_super->flags);
	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);

	dm_bm_unlock(sblock);
	return 0;
}

static int __write_changed_details(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_thin_device *td, *tmp;
	struct disk_device_details details;
	uint64_t key;

	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (!td->changed)
			continue;

		key = td->id;

		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
		details.transaction_id = cpu_to_le64(td->transaction_id);
		details.creation_time = cpu_to_le32(td->creation_time);
		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
		__dm_bless_for_disk(&details);

		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
				    &key, &details, &pmd->details_root);
		if (r)
			return r;

		if (td->open_count)
			td->changed = false;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}

	return 0;
}
static int __commit_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
	 */
	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
	BUG_ON(!rwsem_is_locked(&pmd->root_lock));

	if (unlikely(!pmd->in_service))
		return 0;

	if (pmd->pre_commit_fn) {
		r = pmd->pre_commit_fn(pmd->pre_commit_context);
		if (r < 0) {
			DMERR("pre-commit callback failed");
			return r;
		}
	}

	r = __write_changed_details(pmd);
	if (r < 0)
		return r;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->time = cpu_to_le32(pmd->time);
	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
	disk_super->flags = cpu_to_le32(pmd->flags);

	copy_sm_roots(pmd, disk_super);

	return dm_tm_commit(pmd->tm, sblock);
}

static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	int r;
	dm_block_t total;
	dm_block_t max_blocks = 4096;	/* 16M */

	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
	if (r) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = max_blocks;
	} else
		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
}

struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
{
	int r;
	struct dm_pool_metadata *pmd;

	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
	if (!pmd) {
		DMERR("could not allocate metadata struct");
		return ERR_PTR(-ENOMEM);
	}

	init_rwsem(&pmd->root_lock);
	pmd->time = 0;
	INIT_LIST_HEAD(&pmd->thin_devices);
	pmd->fail_io = false;
	pmd->in_service = false;
	pmd->bdev = bdev;
	pmd->data_block_size = data_block_size;
	pmd->pre_commit_fn = NULL;
	pmd->pre_commit_context = NULL;

	r = __create_persistent_data_objects(pmd, format_device);
	if (r) {
		kfree(pmd);
		return ERR_PTR(r);
	}

	r = __begin_transaction(pmd);
	if (r < 0) {
		if (dm_pool_metadata_close(pmd) < 0)
			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}

int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
{
	int r;
	unsigned int open_devices = 0;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->open_count)
			open_devices++;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}
	up_read(&pmd->root_lock);

	if (open_devices) {
		DMERR("attempt to close pmd when %u device(s) are still open",
		      open_devices);
		return -EBUSY;
	}

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
		r = __commit_transaction(pmd);
		if (r < 0)
			DMWARN("%s: __commit_transaction() failed, error = %d",
			       __func__, r);
	}
	pmd_write_unlock(pmd);
	__destroy_persistent_data_objects(pmd, true);

	kfree(pmd);
	return 0;
}
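/*
 * Typical caller-side sequence for the open/close interface above (a
 * sketch only, not code taken from the thin-pool target; error handling
 * elided):
 *
 *	struct dm_pool_metadata *pmd;
 *
 *	pmd = dm_pool_metadata_open(bdev, data_block_size, true);
 *	dm_pool_create_thin(pmd, 0);		// thin dev id 0
 *	dm_pool_commit_metadata(pmd);		// persist the transaction
 *	dm_pool_metadata_close(pmd);
 */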
/*
 * __open_device: Returns @td corresponding to the device with id @dev,
 * creating it if @create is set and incrementing @td->open_count.
 * On failure, @td is undefined.
 */
static int __open_device(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, int create,
			 struct dm_thin_device **td)
{
	int r, changed = 0;
	struct dm_thin_device *td2;
	uint64_t key = dev;
	struct disk_device_details details_le;

	/*
	 * If the device is already open, return it.
	 */
	list_for_each_entry(td2, &pmd->thin_devices, list)
		if (td2->id == dev) {
			/*
			 * May not create an already-open device.
			 */
			if (create)
				return -EEXIST;

			td2->open_count++;
			*td = td2;
			return 0;
		}

	/*
	 * Check the device exists.
	 */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, &details_le);
	if (r) {
		if (r != -ENODATA || !create)
			return r;

		/*
		 * Create new device.
		 */
		changed = 1;
		details_le.mapped_blocks = 0;
		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
		details_le.creation_time = cpu_to_le32(pmd->time);
		details_le.snapshotted_time = cpu_to_le32(pmd->time);
	}

	*td = kmalloc(sizeof(**td), GFP_NOIO);
	if (!*td)
		return -ENOMEM;

	(*td)->pmd = pmd;
	(*td)->id = dev;
	(*td)->open_count = 1;
	(*td)->changed = changed;
	(*td)->aborted_with_changes = false;
	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);

	list_add(&(*td)->list, &pmd->thin_devices);

	return 0;
}

static void __close_device(struct dm_thin_device *td)
{
	--td->open_count;
}
static int __create_thin(struct dm_pool_metadata *pmd,
			 dm_thin_id dev)
{
	int r;
	dm_block_t dev_root;
	uint64_t key = dev;
	struct dm_thin_device *td;
	__le64 value;

	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, NULL);
	if (!r)
		return -EEXIST;

	/*
	 * Create an empty btree for the mappings.
	 */
	r = dm_btree_empty(&pmd->bl_info, &dev_root);
	if (r)
		return r;

	/*
	 * Insert it into the main mapping tree.
	 */
	value = cpu_to_le64(dev_root);
	__dm_bless_for_disk(&value);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}

	r = __open_device(pmd, dev, 1, &td);
	if (r) {
		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}
	__close_device(td);

	return r;
}

int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_thin(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

static int __set_snapshot_details(struct dm_pool_metadata *pmd,
				  struct dm_thin_device *snap,
				  dm_thin_id origin, uint32_t time)
{
	int r;
	struct dm_thin_device *td;

	r = __open_device(pmd, origin, 0, &td);
	if (r)
		return r;

	td->changed = true;
	td->snapshotted_time = time;

	snap->mapped_blocks = td->mapped_blocks;
	snap->snapshotted_time = time;
	__close_device(td);

	return 0;
}

static int __create_snap(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, dm_thin_id origin)
{
	int r;
	dm_block_t origin_root;
	uint64_t key = origin, dev_key = dev;
	struct dm_thin_device *td;
	__le64 value;

	/* check this device is unused */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &dev_key, NULL);
	if (!r)
		return -EEXIST;

	/* find the mapping tree for the origin */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
	if (r)
		return r;
	origin_root = le64_to_cpu(value);

	/* clone the origin, an inc will do */
	dm_tm_inc(pmd->tm, origin_root);

	/* insert into the main mapping tree */
	value = cpu_to_le64(origin_root);
	__dm_bless_for_disk(&value);
	key = dev;
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_tm_dec(pmd->tm, origin_root);
		return r;
	}

	pmd->time++;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto bad;

	r = __set_snapshot_details(pmd, td, origin, pmd->time);
	__close_device(td);

	if (r)
		goto bad;

	return 0;

bad:
	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	dm_btree_remove(&pmd->details_info, pmd->details_root,
			&key, &pmd->details_root);
	return r;
}

int dm_pool_create_snap(struct dm_pool_metadata *pmd,
			dm_thin_id dev,
			dm_thin_id origin)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_snap(pmd, dev, origin);
	pmd_write_unlock(pmd);

	return r;
}
static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;
	uint64_t key = dev;
	struct dm_thin_device *td;

	/* TODO: failure should mark the transaction invalid */
	r = __open_device(pmd, dev, 0, &td);
	if (r)
		return r;

	if (td->open_count > 1) {
		__close_device(td);
		return -EBUSY;
	}

	list_del(&td->list);
	kfree(td);
	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
			    &key, &pmd->details_root);
	if (r)
		return r;

	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	if (r)
		return r;

	return 0;
}

int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
			       dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __delete_device(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t current_id,
					uint64_t new_id)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);

	if (pmd->fail_io)
		goto out;

	if (pmd->trans_id != current_id) {
		DMERR("mismatched transaction id");
		goto out;
	}

	pmd->trans_id = new_id;
	r = 0;

out:
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = pmd->trans_id;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}
static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r, inc;
	struct thin_disk_superblock *disk_super;
	struct dm_block *copy, *sblock;
	dm_block_t held_root;

	/*
	 * We commit to ensure the btree roots which we increment in a
	 * moment are up to date.
	 */
	r = __commit_transaction(pmd);
	if (r < 0) {
		DMWARN("%s: __commit_transaction() failed, error = %d",
		       __func__, r);
		return r;
	}

	/*
	 * Copy the superblock.
	 */
	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
			       &sb_validator, &copy, &inc);
	if (r)
		return r;

	BUG_ON(!inc);

	held_root = dm_block_location(copy);
	disk_super = dm_block_data(copy);

	if (le64_to_cpu(disk_super->held_root)) {
		DMWARN("Pool metadata snapshot already exists: release this before taking another.");

		dm_tm_dec(pmd->tm, held_root);
		dm_tm_unlock(pmd->tm, copy);
		return -EBUSY;
	}

	/*
	 * Wipe the space map roots since we're not publishing these.
	 */
	memset(&disk_super->data_space_map_root, 0,
	       sizeof(disk_super->data_space_map_root));
	memset(&disk_super->metadata_space_map_root, 0,
	       sizeof(disk_super->metadata_space_map_root));

	/*
	 * Increment the data structures that need to be preserved.
	 */
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
	dm_tm_unlock(pmd->tm, copy);

	/*
	 * Write the held root into the superblock.
	 */
	r = superblock_lock(pmd, &sblock);
	if (r) {
		dm_tm_dec(pmd->tm, held_root);
		return r;
	}

	disk_super = dm_block_data(sblock);
	disk_super->held_root = cpu_to_le64(held_root);
	dm_bm_unlock(sblock);
	return 0;
}

int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __reserve_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock, *copy;
	dm_block_t held_root;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	held_root = le64_to_cpu(disk_super->held_root);
	disk_super->held_root = cpu_to_le64(0);

	dm_bm_unlock(sblock);

	if (!held_root) {
		DMWARN("No pool metadata snapshot found: nothing to release.");
		return -EINVAL;
	}

	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
	if (r)
		return r;

	disk_super = dm_block_data(copy);
	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
	dm_sm_dec_block(pmd->metadata_sm, held_root);

	dm_tm_unlock(pmd->tm, copy);

	return 0;
}

int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __release_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __get_metadata_snap(struct dm_pool_metadata *pmd,
			       dm_block_t *result)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	*result = le64_to_cpu(disk_super->held_root);

	dm_bm_unlock(sblock);

	return 0;
}

int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
			      dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __get_metadata_snap(pmd, result);
	up_read(&pmd->root_lock);

	return r;
}
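/*
 * The three calls above combine into the usual metadata-snapshot flow
 * (a sketch of caller usage, not code from the thin-pool target; error
 * handling elided):
 *
 *	dm_block_t held;
 *
 *	dm_pool_reserve_metadata_snap(pmd);	// commit + pin a copy
 *	dm_pool_get_metadata_snap(pmd, &held);	// report the held root, e.g.
 *						// for userspace thin_dump
 *	...
 *	dm_pool_release_metadata_snap(pmd);	// drop the pinned copy
 */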
int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
			     struct dm_thin_device **td)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io)
		r = __open_device(pmd, dev, 0, td);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_close_thin_device(struct dm_thin_device *td)
{
	pmd_write_lock_in_core(td->pmd);
	__close_device(td);
	pmd_write_unlock(td->pmd);

	return 0;
}

dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
{
	return td->id;
}

/*
 * Check whether @time (of block creation) is older than @td's last snapshot.
 * If so then the associated block is shared with the last snapshot device.
 * Any block on a device created *after* the device last got snapshotted is
 * necessarily not shared.
 */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
	return td->snapshotted_time > time;
}

static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
				 struct dm_thin_lookup_result *result)
{
	uint64_t block_time = 0;
	dm_block_t exception_block;
	uint32_t exception_time;

	block_time = le64_to_cpu(value);
	unpack_block_time(block_time, &exception_block, &exception_time);
	result->block = exception_block;
	result->shared = __snapshotted_since(td, exception_time);
}

static int __find_block(struct dm_thin_device *td, dm_block_t block,
			int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	struct dm_btree_info *info;

	if (can_issue_io)
		info = &pmd->info;
	else
		info = &pmd->nb_info;

	r = dm_btree_lookup(info, pmd->root, keys, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
		       int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (pmd->fail_io) {
		up_read(&pmd->root_lock);
		return -EINVAL;
	}

	r = __find_block(td, block, can_issue_io, result);

	up_read(&pmd->root_lock);
	return r;
}

static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
				    dm_block_t *vblock,
				    struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

static int __find_mapped_range(struct dm_thin_device *td,
			       dm_block_t begin, dm_block_t end,
			       dm_block_t *thin_begin, dm_block_t *thin_end,
			       dm_block_t *pool_begin, bool *maybe_shared)
{
	int r;
	dm_block_t pool_end;
	struct dm_thin_lookup_result lookup;

	if (end < begin)
		return -ENODATA;

	r = __find_next_mapped_block(td, begin, &begin, &lookup);
	if (r)
		return r;

	if (begin >= end)
		return -ENODATA;

	*thin_begin = begin;
	*pool_begin = lookup.block;
	*maybe_shared = lookup.shared;

	begin++;
	pool_end = *pool_begin + 1;
	while (begin != end) {
		r = __find_block(td, begin, true, &lookup);
		if (r) {
			if (r == -ENODATA)
				break;

			return r;
		}

		if ((lookup.block != pool_end) ||
		    (lookup.shared != *maybe_shared))
			break;

		pool_end++;
		begin++;
	}

	*thin_end = begin;
	return 0;
}

int dm_thin_find_mapped_range(struct dm_thin_device *td,
			      dm_block_t begin, dm_block_t end,
			      dm_block_t *thin_begin, dm_block_t *thin_end,
			      dm_block_t *pool_begin, bool *maybe_shared)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
					pool_begin, maybe_shared);
	}
	up_read(&pmd->root_lock);

	return r;
}
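/*
 * Example of the range lookup above: if thin blocks 5..7 map to pool
 * blocks 40..42 (all unshared) and thin block 8 is unmapped, then
 * dm_thin_find_mapped_range(td, 5, 10, ...) returns thin_begin = 5,
 * thin_end = 8, pool_begin = 40 and maybe_shared = false.  The run stops
 * where the pool blocks stop being contiguous, the shared flag flips, or
 * the mapping ends.
 */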
static int __insert(struct dm_thin_device *td, dm_block_t block,
		    dm_block_t data_block)
{
	int r, inserted;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
	__dm_bless_for_disk(&value);

	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
				   &pmd->root, &inserted);
	if (r)
		return r;

	td->changed = true;
	if (inserted)
		td->mapped_blocks++;

	return 0;
}

int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
			 dm_block_t data_block)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __insert(td, block, data_block);
	pmd_write_unlock(td->pmd);

	return r;
}

static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
	int r;
	unsigned int count, total_count = 0;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[1] = { td->id };
	__le64 value;
	dm_block_t mapping_root;

	/*
	 * Find the mapping tree.
	 */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
	if (r)
		return r;

	/*
	 * Remove it from the top-level tree, taking care to inc its
	 * ref count so it doesn't get deleted.
	 */
	mapping_root = le64_to_cpu(value);
	dm_tm_inc(pmd->tm, mapping_root);
	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	/*
	 * dm_btree_remove_leaves() stops at the first unmapped entry, so
	 * we have to loop round finding mapped ranges.
	 */
	while (begin < end) {
		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
		if (r == -ENODATA)
			break;

		if (r)
			return r;

		if (begin >= end)
			break;

		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
		if (r)
			return r;

		total_count += count;
	}

	td->mapped_blocks -= total_count;
	td->changed = true;

	/*
	 * Reinsert the mapping tree.
	 */
	value = cpu_to_le64(mapping_root);
	__dm_bless_for_disk(&value);
	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
}

int dm_thin_remove_range(struct dm_thin_device *td,
			 dm_block_t begin, dm_block_t end)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __remove_range(td, begin, end);
	pmd_write_unlock(td->pmd);

	return r;
}
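/*
 * Example of the loop in __remove_range(): with mappings at thin blocks
 * 10-19 and 30-39, removing the range [0, 50) takes two passes -
 * dm_btree_lookup_next() first lands on block 10 and the leaves for
 * 10-19 are removed, then it lands on block 30 and 30-39 go too, after
 * which -ENODATA ends the loop with total_count == 20.
 */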
int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
	int r = -EINVAL;
	uint32_t ref_count;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
		if (!r)
			*result = (ref_count > 1);
	}
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_inc_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_dec_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->changed;
	up_read(&td->pmd->root_lock);

	return r;
}

bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
{
	bool r = false;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->changed) {
			r = td->changed;
			break;
		}
	}
	up_read(&pmd->root_lock);

	return r;
}

bool dm_thin_aborted_changes(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->aborted_with_changes;
	up_read(&td->pmd->root_lock);

	return r;
}

int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_new_block(pmd->data_sm, result);
	pmd_write_unlock(pmd);

	return r;
}
int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/*
	 * Care is taken to not have commit be what
	 * triggers putting the thin-pool in-service.
	 */
	pmd_write_lock_in_core(pmd);
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);
	if (r < 0)
		goto out;

	/*
	 * Open the next transaction.
	 */
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);
	return r;
}

static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
{
	struct dm_thin_device *td;

	list_for_each_entry(td, &pmd->thin_devices, list)
		td->aborted_with_changes = td->changed;
}

int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/* fail_io is double-checked with pmd->root_lock held below */
	if (unlikely(pmd->fail_io))
		return r;

	pmd_write_lock(pmd);
	if (pmd->fail_io) {
		pmd_write_unlock(pmd);
		return r;
	}
	__set_abort_with_changes_flags(pmd);

	/* destroy data_sm/metadata_sm/nb_tm/tm */
	__destroy_persistent_data_objects(pmd, false);

	/* reset bm */
	dm_block_manager_reset(pmd->bm);

	/* rebuild data_sm/metadata_sm/nb_tm/tm */
	r = __open_or_format_metadata(pmd, false);
	if (r)
		pmd->fail_io = true;
	pmd_write_unlock(pmd);
	return r;
}

int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = td->mapped_blocks;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	int r;
	__le64 value_le;
	dm_block_t thin_root;
	struct dm_pool_metadata *pmd = td->pmd;

	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
	if (r)
		return r;

	thin_root = le64_to_cpu(value_le);

	return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
}

int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __highest_block(td, result);
	up_read(&pmd->root_lock);

	return r;
}
static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	int r;
	dm_block_t old_count;

	r = dm_sm_get_nr_blocks(sm, &old_count);
	if (r)
		return r;

	if (new_count == old_count)
		return 0;

	if (new_count < old_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	return dm_sm_extend(sm, new_count - old_count);
}

int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}

void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_write(pmd->bm);
	pmd_write_unlock(pmd);
}

int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io) {
		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
						      threshold, fn, context);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
					  dm_pool_pre_commit_fn fn,
					  void *context)
{
	pmd_write_lock_in_core(pmd);
	pmd->pre_commit_fn = fn;
	pmd->pre_commit_context = context;
	pmd_write_unlock(pmd);
}

int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;

	r = superblock_lock(pmd, &sblock);
	if (r) {
		DMERR("couldn't lock superblock");
		goto out;
	}

	disk_super = dm_block_data(sblock);
	disk_super->flags = cpu_to_le32(pmd->flags);

	dm_bm_unlock(sblock);
out:
	pmd_write_unlock(pmd);
	return r;
}

bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
{
	bool needs_check;

	down_read(&pmd->root_lock);
	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
	up_read(&pmd->root_lock);

	return needs_check;
}

void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		dm_tm_issue_prefetches(pmd->tm);
	up_read(&pmd->root_lock);
}