xref: /linux/drivers/md/dm-thin-metadata.c (revision a1ff5a7d78a036d6c2178ee5acd6ba4946243800)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Copyright (C) 2011-2012 Red Hat, Inc.
4   *
5   * This file is released under the GPL.
6   */
7  
8  #include "dm-thin-metadata.h"
9  #include "persistent-data/dm-btree.h"
10  #include "persistent-data/dm-space-map.h"
11  #include "persistent-data/dm-space-map-disk.h"
12  #include "persistent-data/dm-transaction-manager.h"
13  
14  #include <linux/list.h>
15  #include <linux/device-mapper.h>
16  #include <linux/workqueue.h>
17  
18  /*
19   *--------------------------------------------------------------------------
20   * As far as the metadata goes, there is:
21   *
22   * - A superblock in block zero, taking up fewer than 512 bytes for
23   *   atomic writes.
24   *
25   * - A space map managing the metadata blocks.
26   *
27   * - A space map managing the data blocks.
28   *
29   * - A btree mapping our internal thin dev ids onto struct disk_device_details.
30   *
31   * - A hierarchical btree with 2 levels, which effectively maps (thin
32   *   dev id, virtual block) -> block_time.  Block time is a 64-bit
33   *   field holding the time in the low 24 bits, and the block in the
34   *   top 40 bits.
35   *
36   * BTrees consist solely of btree_nodes, each of which fills a block.
37   * Some are internal nodes, whose values are __le64s pointing to other
38   * nodes.  Leaf nodes can store values of any reasonable size (i.e.
39   * much smaller than the block size).  A node consists of a header,
40   * followed by an array of keys, followed by an array of values.  We
41   * have to binary search on the keys, so they are all held together to
42   * help the cpu cache.
43   *
44   * Space maps have 2 btrees:
45   *
46   * - One maps a uint64_t onto a struct index_entry, which points to a
47   *   bitmap block and records details such as how many free entries
48   *   there are.
49   *
50   * - The bitmap blocks have a header (for the checksum).  The rest of
51   *   the block is pairs of bits, with the following meanings:
52   *
53   *   0 - ref count is 0
54   *   1 - ref count is 1
55   *   2 - ref count is 2
56   *   3 - ref count is higher than 2
57   *
58   * - If the count is higher than 2 then the ref count is entered in a
59   *   second btree that directly maps the block_address to a uint32_t ref
60   *   count.
61   *
62   * The space map metadata variant doesn't have a bitmaps btree.  Instead
63   * it has a single block's worth of index_entries.  This avoids
64   * recursive issues with the bitmap btree needing to allocate space in
65   * order to insert.  Even with a small data block size such as 64k, the
66   * metadata can support data devices that are hundreds of terabytes.
67   *
68   * The space maps allocate space linearly from front to back.  Space that
69   * is freed in a transaction is never recycled within that transaction.
70   * To try and avoid fragmenting _free_ space the allocator always goes
71   * back and fills in gaps.
72   *
73   * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
74   * from the block manager.
75   *--------------------------------------------------------------------------
76   */
77  
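/*
 * Illustrative sketch (an assumption-laden example, not part of this
 * file): decoding one of the two-bit bitmap entries described above.
 * The real code lives in the persistent-data space-map sources, and the
 * exact bit-packing order shown here is assumed for illustration only.
 * Entries 0-2 are literal reference counts; 3 redirects to the overflow
 * btree, represented here by a hypothetical lookup callback.
 */
static inline uint32_t example_sm_ref_count(const uint64_t *bitmap_words,
					    dm_block_t b,
					    uint32_t (*overflow_lookup)(dm_block_t))
{
	/* 32 two-bit entries per 64-bit word; pick out entry b. */
	unsigned int entry = (bitmap_words[b / 32] >> ((b % 32) * 2)) & 3;

	return entry < 3 ? entry : overflow_lookup(b);
}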
78  #define DM_MSG_PREFIX   "thin metadata"
79  
80  #define THIN_SUPERBLOCK_MAGIC 27022010
81  #define THIN_SUPERBLOCK_LOCATION 0
82  #define THIN_VERSION 2
83  #define SECTOR_TO_BLOCK_SHIFT 3
84  
85  /*
86   * For btree insert:
87   *  3 for the btree insert +
88   *  2 for the btree lookup used within the space map
89   * For btree remove:
90   *  2 for the shadow spine +
91   *  4 for rebalancing 3 child nodes (the larger total, hence 6)
92   */
93  #define THIN_MAX_CONCURRENT_LOCKS 6
94  
95  /* This should be plenty */
96  #define SPACE_MAP_ROOT_SIZE 128
97  
98  /*
99   * Little endian on-disk superblock and device details.
100   */
101  struct thin_disk_superblock {
102  	__le32 csum;	/* Checksum of superblock except for this field. */
103  	__le32 flags;
104  	__le64 blocknr;	/* This block number, dm_block_t. */
105  
106  	__u8 uuid[16];
107  	__le64 magic;
108  	__le32 version;
109  	__le32 time;
110  
111  	__le64 trans_id;
112  
113  	/*
114  	 * Root held by userspace transactions.
115  	 */
116  	__le64 held_root;
117  
118  	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
119  	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
120  
121  	/*
122  	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
123  	 */
124  	__le64 data_mapping_root;
125  
126  	/*
127  	 * Device detail root mapping dev_id -> device_details
128  	 */
129  	__le64 device_details_root;
130  
131  	__le32 data_block_size;		/* In 512-byte sectors. */
132  
133  	__le32 metadata_block_size;	/* In 512-byte sectors. */
134  	__le64 metadata_nr_blocks;
135  
136  	__le32 compat_flags;
137  	__le32 compat_ro_flags;
138  	__le32 incompat_flags;
139  } __packed;
140  
141  struct disk_device_details {
142  	__le64 mapped_blocks;
143  	__le64 transaction_id;		/* When created. */
144  	__le32 creation_time;
145  	__le32 snapshotted_time;
146  } __packed;
147  
148  struct dm_pool_metadata {
149  	struct hlist_node hash;
150  
151  	struct block_device *bdev;
152  	struct dm_block_manager *bm;
153  	struct dm_space_map *metadata_sm;
154  	struct dm_space_map *data_sm;
155  	struct dm_transaction_manager *tm;
156  	struct dm_transaction_manager *nb_tm;
157  
158  	/*
159  	 * Two-level btree.
160  	 * First level holds thin_dev_t.
161  	 * Second level holds mappings.
162  	 */
163  	struct dm_btree_info info;
164  
165  	/*
166  	 * Non-blocking version of the above.
167  	 */
168  	struct dm_btree_info nb_info;
169  
170  	/*
171  	 * Just the top level for deleting whole devices.
172  	 */
173  	struct dm_btree_info tl_info;
174  
175  	/*
176  	 * Just the bottom level for creating new devices.
177  	 */
178  	struct dm_btree_info bl_info;
179  
180  	/*
181  	 * Describes the device details btree.
182  	 */
183  	struct dm_btree_info details_info;
184  
185  	struct rw_semaphore root_lock;
186  	uint32_t time;
187  	dm_block_t root;
188  	dm_block_t details_root;
189  	struct list_head thin_devices;
190  	uint64_t trans_id;
191  	unsigned long flags;
192  	sector_t data_block_size;
193  
194  	/*
195  	 * Pre-commit callback.
196  	 *
197  	 * This allows the thin provisioning target to run a callback before
198  	 * the metadata are committed.
199  	 */
200  	dm_pool_pre_commit_fn pre_commit_fn;
201  	void *pre_commit_context;
202  
203  	/*
204  	 * We reserve a section of the metadata for commit overhead.
205  	 * All reported space does *not* include this.
206  	 */
207  	dm_block_t metadata_reserve;
208  
209  	/*
210  	 * Set if a transaction has to be aborted but the attempt to roll back
211  	 * to the previous (good) transaction failed.  The only pool metadata
212  	 * operation possible in this state is the closing of the device.
213  	 */
214  	bool fail_io:1;
215  
216  	/*
217  	 * Set once a thin-pool has been accessed through one of the interfaces
218  	 * that imply the pool is in-service (e.g. thin devices created/deleted,
219  	 * thin-pool message, metadata snapshots, etc).
220  	 */
221  	bool in_service:1;
222  
223  	/*
224  	 * Reading the space map roots can fail, so we read them into these
225  	 * buffers before the superblock is locked and updated.
226  	 */
227  	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
228  	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
229  };
230  
231  struct dm_thin_device {
232  	struct list_head list;
233  	struct dm_pool_metadata *pmd;
234  	dm_thin_id id;
235  
236  	int open_count;
237  	bool changed:1;
238  	bool aborted_with_changes:1;
239  	uint64_t mapped_blocks;
240  	uint64_t transaction_id;
241  	uint32_t creation_time;
242  	uint32_t snapshotted_time;
243  };
244  
245  /*
246   *--------------------------------------------------------------
247   * superblock validator
248   *--------------------------------------------------------------
249   */
250  #define SUPERBLOCK_CSUM_XOR 160774
251  
252  static void sb_prepare_for_write(const struct dm_block_validator *v,
253  				 struct dm_block *b,
254  				 size_t block_size)
255  {
256  	struct thin_disk_superblock *disk_super = dm_block_data(b);
257  
258  	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
259  	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
260  						      block_size - sizeof(__le32),
261  						      SUPERBLOCK_CSUM_XOR));
262  }
263  
264  static int sb_check(const struct dm_block_validator *v,
265  		    struct dm_block *b,
266  		    size_t block_size)
267  {
268  	struct thin_disk_superblock *disk_super = dm_block_data(b);
269  	__le32 csum_le;
270  
271  	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
272  		DMERR("%s failed: blocknr %llu: wanted %llu",
273  		      __func__, le64_to_cpu(disk_super->blocknr),
274  		      (unsigned long long)dm_block_location(b));
275  		return -ENOTBLK;
276  	}
277  
278  	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
279  		DMERR("%s failed: magic %llu: wanted %llu",
280  		      __func__, le64_to_cpu(disk_super->magic),
281  		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
282  		return -EILSEQ;
283  	}
284  
285  	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
286  					     block_size - sizeof(__le32),
287  					     SUPERBLOCK_CSUM_XOR));
288  	if (csum_le != disk_super->csum) {
289  		DMERR("%s failed: csum %u: wanted %u",
290  		      __func__, le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
291  		return -EILSEQ;
292  	}
293  
294  	return 0;
295  }
296  
297  static const struct dm_block_validator sb_validator = {
298  	.name = "superblock",
299  	.prepare_for_write = sb_prepare_for_write,
300  	.check = sb_check
301  };
302  
303  /*
304   *--------------------------------------------------------------
305   * Methods for the btree value types
306   *--------------------------------------------------------------
307   */
308  static uint64_t pack_block_time(dm_block_t b, uint32_t t)
309  {
310  	return (b << 24) | t;
311  }
312  
313  static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
314  {
315  	*b = v >> 24;
316  	*t = v & ((1 << 24) - 1);
317  }
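/*
 * Worked example (illustrative sketch, not used by this file): a
 * round-trip through the packing above.  pack_block_time(0x12345, 7)
 * yields (0x12345ULL << 24) | 7 = 0x0000012345000007; unpacking shifts
 * the block back out of the top 40 bits and masks the time out of the
 * low 24.  Note that nothing masks the inputs, so callers must ensure
 * the block fits in 40 bits and the time in 24.
 */
static inline bool example_block_time_roundtrip(void)
{
	dm_block_t b;
	uint32_t t;

	unpack_block_time(pack_block_time(0x12345, 7), &b, &t);
	return b == 0x12345 && t == 7;	/* always true */
}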
318  
319  /*
320   * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
321   * possible.  'with_runs' reads contiguous runs of blocks, and calls the
322   * given sm function.
323   */
324  typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);
325  
326  static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned int count, run_fn fn)
327  {
328  	uint64_t b, begin, end;
329  	uint32_t t;
330  	bool in_run = false;
331  	unsigned int i;
332  
333  	for (i = 0; i < count; i++, value_le++) {
334  		/* We know value_le is 8 byte aligned */
335  		unpack_block_time(le64_to_cpu(*value_le), &b, &t);
336  
337  		if (in_run) {
338  			if (b == end) {
339  				end++;
340  			} else {
341  				fn(sm, begin, end);
342  				begin = b;
343  				end = b + 1;
344  			}
345  		} else {
346  			in_run = true;
347  			begin = b;
348  			end = b + 1;
349  		}
350  	}
351  
352  	if (in_run)
353  		fn(sm, begin, end);
354  }
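/*
 * Example: packed values whose blocks decode to 5, 6, 7, 10 produce just
 * two calls, fn(sm, 5, 8) and then fn(sm, 10, 11), each covering a
 * half-open [begin, end) run.
 */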
355  
356  static void data_block_inc(void *context, const void *value_le, unsigned int count)
357  {
358  	with_runs((struct dm_space_map *) context,
359  		  (const __le64 *) value_le, count, dm_sm_inc_blocks);
360  }
361  
362  static void data_block_dec(void *context, const void *value_le, unsigned int count)
363  {
364  	with_runs((struct dm_space_map *) context,
365  		  (const __le64 *) value_le, count, dm_sm_dec_blocks);
366  }
367  
368  static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
369  {
370  	__le64 v1_le, v2_le;
371  	uint64_t b1, b2;
372  	uint32_t t;
373  
374  	memcpy(&v1_le, value1_le, sizeof(v1_le));
375  	memcpy(&v2_le, value2_le, sizeof(v2_le));
376  	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
377  	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
378  
379  	return b1 == b2;
380  }
381  
382  static void subtree_inc(void *context, const void *value, unsigned int count)
383  {
384  	struct dm_btree_info *info = context;
385  	const __le64 *root_le = value;
386  	unsigned int i;
387  
388  	for (i = 0; i < count; i++, root_le++)
389  		dm_tm_inc(info->tm, le64_to_cpu(*root_le));
390  }
391  
392  static void subtree_dec(void *context, const void *value, unsigned int count)
393  {
394  	struct dm_btree_info *info = context;
395  	const __le64 *root_le = value;
396  	unsigned int i;
397  
398  	for (i = 0; i < count; i++, root_le++)
399  		if (dm_btree_del(info, le64_to_cpu(*root_le)))
400  			DMERR("btree delete failed");
401  }
402  
403  static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
404  {
405  	__le64 v1_le, v2_le;
406  
407  	memcpy(&v1_le, value1_le, sizeof(v1_le));
408  	memcpy(&v2_le, value2_le, sizeof(v2_le));
409  
410  	return v1_le == v2_le;
411  }
412  
413  /*----------------------------------------------------------------*/
414  
415  /*
416   * Variant that is used for in-core only changes or code that
417   * shouldn't put the pool in service on its own (e.g. commit).
418   */
419  static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
420  	__acquires(pmd->root_lock)
421  {
422  	down_write(&pmd->root_lock);
423  }
424  
425  static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
426  {
427  	pmd_write_lock_in_core(pmd);
428  	if (unlikely(!pmd->in_service))
429  		pmd->in_service = true;
430  }
431  
432  static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
433  	__releases(pmd->root_lock)
434  {
435  	up_write(&pmd->root_lock);
436  }
437  
438  /*----------------------------------------------------------------*/
439  
440  static int superblock_lock_zero(struct dm_pool_metadata *pmd,
441  				struct dm_block **sblock)
442  {
443  	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
444  				     &sb_validator, sblock);
445  }
446  
447  static int superblock_lock(struct dm_pool_metadata *pmd,
448  			   struct dm_block **sblock)
449  {
450  	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
451  				&sb_validator, sblock);
452  }
453  
454  static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
455  {
456  	int r;
457  	unsigned int i;
458  	struct dm_block *b;
459  	__le64 *data_le, zero = cpu_to_le64(0);
460  	unsigned int block_size = dm_bm_block_size(bm) / sizeof(__le64);
461  
462  	/*
463  	 * We can't use a validator here - it may be all zeroes.
464  	 */
465  	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
466  	if (r)
467  		return r;
468  
469  	data_le = dm_block_data(b);
470  	*result = 1;
471  	for (i = 0; i < block_size; i++) {
472  		if (data_le[i] != zero) {
473  			*result = 0;
474  			break;
475  		}
476  	}
477  
478  	dm_bm_unlock(b);
479  
480  	return 0;
481  }
482  
483  static void __setup_btree_details(struct dm_pool_metadata *pmd)
484  {
485  	pmd->info.tm = pmd->tm;
486  	pmd->info.levels = 2;
487  	pmd->info.value_type.context = pmd->data_sm;
488  	pmd->info.value_type.size = sizeof(__le64);
489  	pmd->info.value_type.inc = data_block_inc;
490  	pmd->info.value_type.dec = data_block_dec;
491  	pmd->info.value_type.equal = data_block_equal;
492  
493  	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
494  	pmd->nb_info.tm = pmd->nb_tm;
495  
496  	pmd->tl_info.tm = pmd->tm;
497  	pmd->tl_info.levels = 1;
498  	pmd->tl_info.value_type.context = &pmd->bl_info;
499  	pmd->tl_info.value_type.size = sizeof(__le64);
500  	pmd->tl_info.value_type.inc = subtree_inc;
501  	pmd->tl_info.value_type.dec = subtree_dec;
502  	pmd->tl_info.value_type.equal = subtree_equal;
503  
504  	pmd->bl_info.tm = pmd->tm;
505  	pmd->bl_info.levels = 1;
506  	pmd->bl_info.value_type.context = pmd->data_sm;
507  	pmd->bl_info.value_type.size = sizeof(__le64);
508  	pmd->bl_info.value_type.inc = data_block_inc;
509  	pmd->bl_info.value_type.dec = data_block_dec;
510  	pmd->bl_info.value_type.equal = data_block_equal;
511  
512  	pmd->details_info.tm = pmd->tm;
513  	pmd->details_info.levels = 1;
514  	pmd->details_info.value_type.context = NULL;
515  	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
516  	pmd->details_info.value_type.inc = NULL;
517  	pmd->details_info.value_type.dec = NULL;
518  	pmd->details_info.value_type.equal = NULL;
519  }
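/*
 * To summarise the trees configured above: 'info' is the full two-level
 * (dev id, virtual block) -> block_time mapping, and 'nb_info' is the
 * same tree accessed through the non-blocking tm clone.  'tl_info'
 * covers just the top level, whose values are the roots of bottom-level
 * trees (hence the subtree_* ops, which inc or delete whole subtrees),
 * while 'bl_info' covers just a bottom-level tree.  'details_info' maps
 * dev id -> disk_device_details and needs no ref-counting ops.
 */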
520  
521  static int save_sm_roots(struct dm_pool_metadata *pmd)
522  {
523  	int r;
524  	size_t len;
525  
526  	r = dm_sm_root_size(pmd->metadata_sm, &len);
527  	if (r < 0)
528  		return r;
529  
530  	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
531  	if (r < 0)
532  		return r;
533  
534  	r = dm_sm_root_size(pmd->data_sm, &len);
535  	if (r < 0)
536  		return r;
537  
538  	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
539  }
540  
541  static void copy_sm_roots(struct dm_pool_metadata *pmd,
542  			  struct thin_disk_superblock *disk)
543  {
544  	memcpy(&disk->metadata_space_map_root,
545  	       &pmd->metadata_space_map_root,
546  	       sizeof(pmd->metadata_space_map_root));
547  
548  	memcpy(&disk->data_space_map_root,
549  	       &pmd->data_space_map_root,
550  	       sizeof(pmd->data_space_map_root));
551  }
552  
553  static int __write_initial_superblock(struct dm_pool_metadata *pmd)
554  {
555  	int r;
556  	struct dm_block *sblock;
557  	struct thin_disk_superblock *disk_super;
558  	sector_t bdev_size = bdev_nr_sectors(pmd->bdev);
559  
560  	if (bdev_size > THIN_METADATA_MAX_SECTORS)
561  		bdev_size = THIN_METADATA_MAX_SECTORS;
562  
563  	r = dm_sm_commit(pmd->data_sm);
564  	if (r < 0)
565  		return r;
566  
567  	r = dm_tm_pre_commit(pmd->tm);
568  	if (r < 0)
569  		return r;
570  
571  	r = save_sm_roots(pmd);
572  	if (r < 0)
573  		return r;
574  
575  	r = superblock_lock_zero(pmd, &sblock);
576  	if (r)
577  		return r;
578  
579  	disk_super = dm_block_data(sblock);
580  	disk_super->flags = 0;
581  	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
582  	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
583  	disk_super->version = cpu_to_le32(THIN_VERSION);
584  	disk_super->time = 0;
585  	disk_super->trans_id = 0;
586  	disk_super->held_root = 0;
587  
588  	copy_sm_roots(pmd, disk_super);
589  
590  	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
591  	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
592  	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
593  	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
594  	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
595  
596  	return dm_tm_commit(pmd->tm, sblock);
597  }
598  
599  static int __format_metadata(struct dm_pool_metadata *pmd)
600  {
601  	int r;
602  
603  	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
604  				 &pmd->tm, &pmd->metadata_sm);
605  	if (r < 0) {
606  		pmd->tm = NULL;
607  		pmd->metadata_sm = NULL;
608  		DMERR("tm_create_with_sm failed");
609  		return r;
610  	}
611  
612  	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
613  	if (IS_ERR(pmd->data_sm)) {
614  		DMERR("sm_disk_create failed");
615  		r = PTR_ERR(pmd->data_sm);
616  		pmd->data_sm = NULL;
617  		goto bad_cleanup_tm;
618  	}
619  
620  	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
621  	if (!pmd->nb_tm) {
622  		DMERR("could not create non-blocking clone tm");
623  		r = -ENOMEM;
624  		goto bad_cleanup_data_sm;
625  	}
626  
627  	__setup_btree_details(pmd);
628  
629  	r = dm_btree_empty(&pmd->info, &pmd->root);
630  	if (r < 0)
631  		goto bad_cleanup_nb_tm;
632  
633  	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
634  	if (r < 0) {
635  		DMERR("couldn't create devices root");
636  		goto bad_cleanup_nb_tm;
637  	}
638  
639  	r = __write_initial_superblock(pmd);
640  	if (r)
641  		goto bad_cleanup_nb_tm;
642  
643  	return 0;
644  
645  bad_cleanup_nb_tm:
646  	dm_tm_destroy(pmd->nb_tm);
647  	pmd->nb_tm = NULL;
648  bad_cleanup_data_sm:
649  	dm_sm_destroy(pmd->data_sm);
650  	pmd->data_sm = NULL;
651  bad_cleanup_tm:
652  	dm_tm_destroy(pmd->tm);
653  	pmd->tm = NULL;
654  	dm_sm_destroy(pmd->metadata_sm);
655  	pmd->metadata_sm = NULL;
656  
657  	return r;
658  }
659  
660  static int __check_incompat_features(struct thin_disk_superblock *disk_super,
661  				     struct dm_pool_metadata *pmd)
662  {
663  	uint32_t features;
664  
665  	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
666  	if (features) {
667  		DMERR("could not access metadata due to unsupported optional features (%lx).",
668  		      (unsigned long)features);
669  		return -EINVAL;
670  	}
671  
672  	/*
673  	 * Check for read-only metadata to skip the following RDWR checks.
674  	 */
675  	if (bdev_read_only(pmd->bdev))
676  		return 0;
677  
678  	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
679  	if (features) {
680  		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
681  		      (unsigned long)features);
682  		return -EINVAL;
683  	}
684  
685  	return 0;
686  }
687  
688  static int __open_metadata(struct dm_pool_metadata *pmd)
689  {
690  	int r;
691  	struct dm_block *sblock;
692  	struct thin_disk_superblock *disk_super;
693  
694  	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
695  			    &sb_validator, &sblock);
696  	if (r < 0) {
697  		DMERR("couldn't read superblock");
698  		return r;
699  	}
700  
701  	disk_super = dm_block_data(sblock);
702  
703  	/* Verify the data block size hasn't changed */
704  	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
705  		DMERR("changing the data block size (from %u to %llu) is not supported",
706  		      le32_to_cpu(disk_super->data_block_size),
707  		      (unsigned long long)pmd->data_block_size);
708  		r = -EINVAL;
709  		goto bad_unlock_sblock;
710  	}
711  
712  	r = __check_incompat_features(disk_super, pmd);
713  	if (r < 0)
714  		goto bad_unlock_sblock;
715  
716  	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
717  			       disk_super->metadata_space_map_root,
718  			       sizeof(disk_super->metadata_space_map_root),
719  			       &pmd->tm, &pmd->metadata_sm);
720  	if (r < 0) {
721  		pmd->tm = NULL;
722  		pmd->metadata_sm = NULL;
723  		DMERR("tm_open_with_sm failed");
724  		goto bad_unlock_sblock;
725  	}
726  
727  	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
728  				       sizeof(disk_super->data_space_map_root));
729  	if (IS_ERR(pmd->data_sm)) {
730  		DMERR("sm_disk_open failed");
731  		r = PTR_ERR(pmd->data_sm);
732  		pmd->data_sm = NULL;
733  		goto bad_cleanup_tm;
734  	}
735  
736  	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
737  	if (!pmd->nb_tm) {
738  		DMERR("could not create non-blocking clone tm");
739  		r = -ENOMEM;
740  		goto bad_cleanup_data_sm;
741  	}
742  
743  	/*
744  	 * When opening the pool metadata, setting the roots here is
745  	 * redundant because they are set again in __begin_transaction().
746  	 * But the pool abort path really does need the last transaction's
747  	 * roots, to avoid accessing a broken btree.
748  	 */
749  	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
750  	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
751  
752  	__setup_btree_details(pmd);
753  	dm_bm_unlock(sblock);
754  
755  	return 0;
756  
757  bad_cleanup_data_sm:
758  	dm_sm_destroy(pmd->data_sm);
759  	pmd->data_sm = NULL;
760  bad_cleanup_tm:
761  	dm_tm_destroy(pmd->tm);
762  	pmd->tm = NULL;
763  	dm_sm_destroy(pmd->metadata_sm);
764  	pmd->metadata_sm = NULL;
765  bad_unlock_sblock:
766  	dm_bm_unlock(sblock);
767  
768  	return r;
769  }
770  
771  static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
772  {
773  	int r, unformatted;
774  
775  	r = __superblock_all_zeroes(pmd->bm, &unformatted);
776  	if (r)
777  		return r;
778  
779  	if (unformatted)
780  		return format_device ? __format_metadata(pmd) : -EPERM;
781  
782  	return __open_metadata(pmd);
783  }
784  
785  static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
786  {
787  	int r;
788  
789  	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
790  					  THIN_MAX_CONCURRENT_LOCKS);
791  	if (IS_ERR(pmd->bm)) {
792  		DMERR("could not create block manager");
793  		r = PTR_ERR(pmd->bm);
794  		pmd->bm = NULL;
795  		return r;
796  	}
797  
798  	r = __open_or_format_metadata(pmd, format_device);
799  	if (r) {
800  		dm_block_manager_destroy(pmd->bm);
801  		pmd->bm = NULL;
802  	}
803  
804  	return r;
805  }
806  
807  static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
808  					      bool destroy_bm)
809  {
810  	dm_sm_destroy(pmd->data_sm);
811  	pmd->data_sm = NULL;
812  	dm_sm_destroy(pmd->metadata_sm);
813  	pmd->metadata_sm = NULL;
814  	dm_tm_destroy(pmd->nb_tm);
815  	pmd->nb_tm = NULL;
816  	dm_tm_destroy(pmd->tm);
817  	pmd->tm = NULL;
818  	if (destroy_bm)
819  		dm_block_manager_destroy(pmd->bm);
820  }
821  
822  static int __begin_transaction(struct dm_pool_metadata *pmd)
823  {
824  	int r;
825  	struct thin_disk_superblock *disk_super;
826  	struct dm_block *sblock;
827  
828  	/*
829  	 * We re-read the superblock every time.  Shouldn't need to do this
830  	 * really.
831  	 */
832  	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
833  			    &sb_validator, &sblock);
834  	if (r)
835  		return r;
836  
837  	disk_super = dm_block_data(sblock);
838  	pmd->time = le32_to_cpu(disk_super->time);
839  	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
840  	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
841  	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
842  	pmd->flags = le32_to_cpu(disk_super->flags);
843  	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
844  
845  	dm_bm_unlock(sblock);
846  	return 0;
847  }
848  
849  static int __write_changed_details(struct dm_pool_metadata *pmd)
850  {
851  	int r;
852  	struct dm_thin_device *td, *tmp;
853  	struct disk_device_details details;
854  	uint64_t key;
855  
856  	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
857  		if (!td->changed)
858  			continue;
859  
860  		key = td->id;
861  
862  		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
863  		details.transaction_id = cpu_to_le64(td->transaction_id);
864  		details.creation_time = cpu_to_le32(td->creation_time);
865  		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
866  		__dm_bless_for_disk(&details);
867  
868  		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
869  				    &key, &details, &pmd->details_root);
870  		if (r)
871  			return r;
872  
873  		if (td->open_count)
874  			td->changed = false;
875  		else {
876  			list_del(&td->list);
877  			kfree(td);
878  		}
879  	}
880  
881  	return 0;
882  }
883  
884  static int __commit_transaction(struct dm_pool_metadata *pmd)
885  {
886  	int r;
887  	struct thin_disk_superblock *disk_super;
888  	struct dm_block *sblock;
889  
890  	/*
891  	 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
892  	 */
893  	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
894  	BUG_ON(!rwsem_is_locked(&pmd->root_lock));
895  
896  	if (unlikely(!pmd->in_service))
897  		return 0;
898  
899  	if (pmd->pre_commit_fn) {
900  		r = pmd->pre_commit_fn(pmd->pre_commit_context);
901  		if (r < 0) {
902  			DMERR("pre-commit callback failed");
903  			return r;
904  		}
905  	}
906  
907  	r = __write_changed_details(pmd);
908  	if (r < 0)
909  		return r;
910  
911  	r = dm_sm_commit(pmd->data_sm);
912  	if (r < 0)
913  		return r;
914  
915  	r = dm_tm_pre_commit(pmd->tm);
916  	if (r < 0)
917  		return r;
918  
919  	r = save_sm_roots(pmd);
920  	if (r < 0)
921  		return r;
922  
923  	r = superblock_lock(pmd, &sblock);
924  	if (r)
925  		return r;
926  
927  	disk_super = dm_block_data(sblock);
928  	disk_super->time = cpu_to_le32(pmd->time);
929  	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
930  	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
931  	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
932  	disk_super->flags = cpu_to_le32(pmd->flags);
933  
934  	copy_sm_roots(pmd, disk_super);
935  
936  	return dm_tm_commit(pmd->tm, sblock);
937  }
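/*
 * Note the two-phase shape of the commit above: all btree and space map
 * changes are flushed first, and only then is the superblock rewritten
 * via dm_tm_commit().  Because the superblock update is a single
 * sub-512-byte write (atomic, per the comment at the top of this file),
 * a crash on either side of it leaves the metadata pointing at a
 * consistent transaction - the old one or the new one.
 */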
938  
939  static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
940  {
941  	int r;
942  	dm_block_t total;
943  	dm_block_t max_blocks = 4096; /* 16M */
944  
945  	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
946  	if (r) {
947  		DMERR("could not get size of metadata device");
948  		pmd->metadata_reserve = max_blocks;
949  	} else
950  		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
951  }
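/*
 * Worked example: with 4KiB metadata blocks, a 1GiB metadata device has
 * 262144 blocks, so the reserve is min(4096, 26214) = 4096 blocks
 * (16MiB).  A 64MiB device has 16384 blocks and instead gets
 * min(4096, 1638) = 1638 blocks reserved.
 */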
952  
953  struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
954  					       sector_t data_block_size,
955  					       bool format_device)
956  {
957  	int r;
958  	struct dm_pool_metadata *pmd;
959  
960  	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
961  	if (!pmd) {
962  		DMERR("could not allocate metadata struct");
963  		return ERR_PTR(-ENOMEM);
964  	}
965  
966  	init_rwsem(&pmd->root_lock);
967  	pmd->time = 0;
968  	INIT_LIST_HEAD(&pmd->thin_devices);
969  	pmd->fail_io = false;
970  	pmd->in_service = false;
971  	pmd->bdev = bdev;
972  	pmd->data_block_size = data_block_size;
973  	pmd->pre_commit_fn = NULL;
974  	pmd->pre_commit_context = NULL;
975  
976  	r = __create_persistent_data_objects(pmd, format_device);
977  	if (r) {
978  		kfree(pmd);
979  		return ERR_PTR(r);
980  	}
981  
982  	r = __begin_transaction(pmd);
983  	if (r < 0) {
984  		if (dm_pool_metadata_close(pmd) < 0)
985  			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
986  		return ERR_PTR(r);
987  	}
988  
989  	__set_metadata_reserve(pmd);
990  
991  	return pmd;
992  }
993  
994  int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
995  {
996  	int r;
997  	unsigned int open_devices = 0;
998  	struct dm_thin_device *td, *tmp;
999  
1000  	down_read(&pmd->root_lock);
1001  	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1002  		if (td->open_count)
1003  			open_devices++;
1004  		else {
1005  			list_del(&td->list);
1006  			kfree(td);
1007  		}
1008  	}
1009  	up_read(&pmd->root_lock);
1010  
1011  	if (open_devices) {
1012  		DMERR("attempt to close pmd when %u device(s) are still open",
1013  		       open_devices);
1014  		return -EBUSY;
1015  	}
1016  
1017  	pmd_write_lock_in_core(pmd);
1018  	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
1019  		r = __commit_transaction(pmd);
1020  		if (r < 0)
1021  			DMWARN("%s: __commit_transaction() failed, error = %d",
1022  			       __func__, r);
1023  	}
1024  	pmd_write_unlock(pmd);
1025  	__destroy_persistent_data_objects(pmd, true);
1026  
1027  	kfree(pmd);
1028  	return 0;
1029  }
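/*
 * Usage sketch (illustrative only; the real caller is the thin-pool
 * target, and error handling is trimmed here): the normal open ->
 * create -> map -> commit -> close lifecycle using this file's API.
 * The data_block_size of 128 sectors (64KiB) is just an example value.
 */
static int example_pool_lifecycle(struct block_device *bdev)
{
	struct dm_pool_metadata *pmd;
	struct dm_thin_device *td;
	dm_block_t data;

	pmd = dm_pool_metadata_open(bdev, 128, true);
	if (IS_ERR(pmd))
		return PTR_ERR(pmd);

	dm_pool_create_thin(pmd, 0);		/* thin device id 0 */
	dm_pool_open_thin_device(pmd, 0, &td);
	dm_pool_alloc_data_block(pmd, &data);	/* grab a fresh data block */
	dm_thin_insert_block(td, 0, data);	/* map virtual block 0 to it */
	dm_pool_commit_metadata(pmd);		/* persist the transaction */
	dm_pool_close_thin_device(td);

	return dm_pool_metadata_close(pmd);
}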
1030  
1031  /*
1032   * __open_device: Returns @td corresponding to device with id @dev,
1033   * creating it if @create is set and incrementing @td->open_count.
1034   * On failure, @td is undefined.
1035   */
1036  static int __open_device(struct dm_pool_metadata *pmd,
1037  			 dm_thin_id dev, int create,
1038  			 struct dm_thin_device **td)
1039  {
1040  	int r, changed = 0;
1041  	struct dm_thin_device *td2;
1042  	uint64_t key = dev;
1043  	struct disk_device_details details_le;
1044  
1045  	/*
1046  	 * If the device is already open, return it.
1047  	 */
1048  	list_for_each_entry(td2, &pmd->thin_devices, list)
1049  		if (td2->id == dev) {
1050  			/*
1051  			 * May not create an already-open device.
1052  			 */
1053  			if (create)
1054  				return -EEXIST;
1055  
1056  			td2->open_count++;
1057  			*td = td2;
1058  			return 0;
1059  		}
1060  
1061  	/*
1062  	 * Check the device exists.
1063  	 */
1064  	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1065  			    &key, &details_le);
1066  	if (r) {
1067  		if (r != -ENODATA || !create)
1068  			return r;
1069  
1070  		/*
1071  		 * Create new device.
1072  		 */
1073  		changed = 1;
1074  		details_le.mapped_blocks = 0;
1075  		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
1076  		details_le.creation_time = cpu_to_le32(pmd->time);
1077  		details_le.snapshotted_time = cpu_to_le32(pmd->time);
1078  	}
1079  
1080  	*td = kmalloc(sizeof(**td), GFP_NOIO);
1081  	if (!*td)
1082  		return -ENOMEM;
1083  
1084  	(*td)->pmd = pmd;
1085  	(*td)->id = dev;
1086  	(*td)->open_count = 1;
1087  	(*td)->changed = changed;
1088  	(*td)->aborted_with_changes = false;
1089  	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
1090  	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
1091  	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
1092  	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
1093  
1094  	list_add(&(*td)->list, &pmd->thin_devices);
1095  
1096  	return 0;
1097  }
1098  
1099  static void __close_device(struct dm_thin_device *td)
1100  {
1101  	--td->open_count;
1102  }
1103  
1104  static int __create_thin(struct dm_pool_metadata *pmd,
1105  			 dm_thin_id dev)
1106  {
1107  	int r;
1108  	dm_block_t dev_root;
1109  	uint64_t key = dev;
1110  	struct dm_thin_device *td;
1111  	__le64 value;
1112  
1113  	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1114  			    &key, NULL);
1115  	if (!r)
1116  		return -EEXIST;
1117  
1118  	/*
1119  	 * Create an empty btree for the mappings.
1120  	 */
1121  	r = dm_btree_empty(&pmd->bl_info, &dev_root);
1122  	if (r)
1123  		return r;
1124  
1125  	/*
1126  	 * Insert it into the main mapping tree.
1127  	 */
1128  	value = cpu_to_le64(dev_root);
1129  	__dm_bless_for_disk(&value);
1130  	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1131  	if (r) {
1132  		dm_btree_del(&pmd->bl_info, dev_root);
1133  		return r;
1134  	}
1135  
1136  	r = __open_device(pmd, dev, 1, &td);
1137  	if (r) {
1138  		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1139  		dm_btree_del(&pmd->bl_info, dev_root);
1140  		return r;
1141  	}
1142  	__close_device(td);
1143  
1144  	return r;
1145  }
1146  
1147  int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
1148  {
1149  	int r = -EINVAL;
1150  
1151  	pmd_write_lock(pmd);
1152  	if (!pmd->fail_io)
1153  		r = __create_thin(pmd, dev);
1154  	pmd_write_unlock(pmd);
1155  
1156  	return r;
1157  }
1158  
1159  static int __set_snapshot_details(struct dm_pool_metadata *pmd,
1160  				  struct dm_thin_device *snap,
1161  				  dm_thin_id origin, uint32_t time)
1162  {
1163  	int r;
1164  	struct dm_thin_device *td;
1165  
1166  	r = __open_device(pmd, origin, 0, &td);
1167  	if (r)
1168  		return r;
1169  
1170  	td->changed = true;
1171  	td->snapshotted_time = time;
1172  
1173  	snap->mapped_blocks = td->mapped_blocks;
1174  	snap->snapshotted_time = time;
1175  	__close_device(td);
1176  
1177  	return 0;
1178  }
1179  
1180  static int __create_snap(struct dm_pool_metadata *pmd,
1181  			 dm_thin_id dev, dm_thin_id origin)
1182  {
1183  	int r;
1184  	dm_block_t origin_root;
1185  	uint64_t key = origin, dev_key = dev;
1186  	struct dm_thin_device *td;
1187  	__le64 value;
1188  
1189  	/* check this device is unused */
1190  	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1191  			    &dev_key, NULL);
1192  	if (!r)
1193  		return -EEXIST;
1194  
1195  	/* find the mapping tree for the origin */
1196  	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
1197  	if (r)
1198  		return r;
1199  	origin_root = le64_to_cpu(value);
1200  
1201  	/* clone the origin, an inc will do */
1202  	dm_tm_inc(pmd->tm, origin_root);
1203  
1204  	/* insert into the main mapping tree */
1205  	value = cpu_to_le64(origin_root);
1206  	__dm_bless_for_disk(&value);
1207  	key = dev;
1208  	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1209  	if (r) {
1210  		dm_tm_dec(pmd->tm, origin_root);
1211  		return r;
1212  	}
1213  
1214  	pmd->time++;
1215  
1216  	r = __open_device(pmd, dev, 1, &td);
1217  	if (r)
1218  		goto bad;
1219  
1220  	r = __set_snapshot_details(pmd, td, origin, pmd->time);
1221  	__close_device(td);
1222  
1223  	if (r)
1224  		goto bad;
1225  
1226  	return 0;
1227  
1228  bad:
1229  	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1230  	dm_btree_remove(&pmd->details_info, pmd->details_root,
1231  			&key, &pmd->details_root);
1232  	return r;
1233  }
1234  
1235  int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1236  				 dm_thin_id dev,
1237  				 dm_thin_id origin)
1238  {
1239  	int r = -EINVAL;
1240  
1241  	pmd_write_lock(pmd);
1242  	if (!pmd->fail_io)
1243  		r = __create_snap(pmd, dev, origin);
1244  	pmd_write_unlock(pmd);
1245  
1246  	return r;
1247  }
1248  
1249  static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1250  {
1251  	int r;
1252  	uint64_t key = dev;
1253  	struct dm_thin_device *td;
1254  
1255  	/* TODO: failure should mark the transaction invalid */
1256  	r = __open_device(pmd, dev, 0, &td);
1257  	if (r)
1258  		return r;
1259  
1260  	if (td->open_count > 1) {
1261  		__close_device(td);
1262  		return -EBUSY;
1263  	}
1264  
1265  	list_del(&td->list);
1266  	kfree(td);
1267  	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1268  			    &key, &pmd->details_root);
1269  	if (r)
1270  		return r;
1271  
1272  	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1273  	if (r)
1274  		return r;
1275  
1276  	return 0;
1277  }
1278  
1279  int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1280  			       dm_thin_id dev)
1281  {
1282  	int r = -EINVAL;
1283  
1284  	pmd_write_lock(pmd);
1285  	if (!pmd->fail_io)
1286  		r = __delete_device(pmd, dev);
1287  	pmd_write_unlock(pmd);
1288  
1289  	return r;
1290  }
1291  
1292  int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1293  					uint64_t current_id,
1294  					uint64_t new_id)
1295  {
1296  	int r = -EINVAL;
1297  
1298  	pmd_write_lock(pmd);
1299  
1300  	if (pmd->fail_io)
1301  		goto out;
1302  
1303  	if (pmd->trans_id != current_id) {
1304  		DMERR("mismatched transaction id");
1305  		goto out;
1306  	}
1307  
1308  	pmd->trans_id = new_id;
1309  	r = 0;
1310  
1311  out:
1312  	pmd_write_unlock(pmd);
1313  
1314  	return r;
1315  }
1316  
1317  int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1318  					uint64_t *result)
1319  {
1320  	int r = -EINVAL;
1321  
1322  	down_read(&pmd->root_lock);
1323  	if (!pmd->fail_io) {
1324  		*result = pmd->trans_id;
1325  		r = 0;
1326  	}
1327  	up_read(&pmd->root_lock);
1328  
1329  	return r;
1330  }
1331  
1332  static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1333  {
1334  	int r, inc;
1335  	struct thin_disk_superblock *disk_super;
1336  	struct dm_block *copy, *sblock;
1337  	dm_block_t held_root;
1338  
1339  	/*
1340  	 * We commit to ensure the btree roots which we increment in a
1341  	 * moment are up to date.
1342  	 */
1343  	r = __commit_transaction(pmd);
1344  	if (r < 0) {
1345  		DMWARN("%s: __commit_transaction() failed, error = %d",
1346  		       __func__, r);
1347  		return r;
1348  	}
1349  
1350  	/*
1351  	 * Copy the superblock.
1352  	 */
1353  	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1354  	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1355  			       &sb_validator, &copy, &inc);
1356  	if (r)
1357  		return r;
1358  
1359  	BUG_ON(!inc);
1360  
1361  	held_root = dm_block_location(copy);
1362  	disk_super = dm_block_data(copy);
1363  
1364  	if (le64_to_cpu(disk_super->held_root)) {
1365  		DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1366  
1367  		dm_tm_dec(pmd->tm, held_root);
1368  		dm_tm_unlock(pmd->tm, copy);
1369  		return -EBUSY;
1370  	}
1371  
1372  	/*
1373  	 * Wipe the space map roots since we're not publishing this copy.
1374  	 */
1375  	memset(&disk_super->data_space_map_root, 0,
1376  	       sizeof(disk_super->data_space_map_root));
1377  	memset(&disk_super->metadata_space_map_root, 0,
1378  	       sizeof(disk_super->metadata_space_map_root));
1379  
1380  	/*
1381  	 * Increment the data structures that need to be preserved.
1382  	 */
1383  	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1384  	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1385  	dm_tm_unlock(pmd->tm, copy);
1386  
1387  	/*
1388  	 * Write the held root into the superblock.
1389  	 */
1390  	r = superblock_lock(pmd, &sblock);
1391  	if (r) {
1392  		dm_tm_dec(pmd->tm, held_root);
1393  		return r;
1394  	}
1395  
1396  	disk_super = dm_block_data(sblock);
1397  	disk_super->held_root = cpu_to_le64(held_root);
1398  	dm_bm_unlock(sblock);
1399  	return 0;
1400  }
1401  
1402  int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1403  {
1404  	int r = -EINVAL;
1405  
1406  	pmd_write_lock(pmd);
1407  	if (!pmd->fail_io)
1408  		r = __reserve_metadata_snap(pmd);
1409  	pmd_write_unlock(pmd);
1410  
1411  	return r;
1412  }
1413  
1414  static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1415  {
1416  	int r;
1417  	struct thin_disk_superblock *disk_super;
1418  	struct dm_block *sblock, *copy;
1419  	dm_block_t held_root;
1420  
1421  	r = superblock_lock(pmd, &sblock);
1422  	if (r)
1423  		return r;
1424  
1425  	disk_super = dm_block_data(sblock);
1426  	held_root = le64_to_cpu(disk_super->held_root);
1427  	disk_super->held_root = cpu_to_le64(0);
1428  
1429  	dm_bm_unlock(sblock);
1430  
1431  	if (!held_root) {
1432  		DMWARN("No pool metadata snapshot found: nothing to release.");
1433  		return -EINVAL;
1434  	}
1435  
1436  	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
1437  	if (r)
1438  		return r;
1439  
1440  	disk_super = dm_block_data(copy);
1441  	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
1442  	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
1443  	dm_sm_dec_block(pmd->metadata_sm, held_root);
1444  
1445  	dm_tm_unlock(pmd->tm, copy);
1446  
1447  	return 0;
1448  }
1449  
1450  int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1451  {
1452  	int r = -EINVAL;
1453  
1454  	pmd_write_lock(pmd);
1455  	if (!pmd->fail_io)
1456  		r = __release_metadata_snap(pmd);
1457  	pmd_write_unlock(pmd);
1458  
1459  	return r;
1460  }
1461  
1462  static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1463  			       dm_block_t *result)
1464  {
1465  	int r;
1466  	struct thin_disk_superblock *disk_super;
1467  	struct dm_block *sblock;
1468  
1469  	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1470  			    &sb_validator, &sblock);
1471  	if (r)
1472  		return r;
1473  
1474  	disk_super = dm_block_data(sblock);
1475  	*result = le64_to_cpu(disk_super->held_root);
1476  
1477  	dm_bm_unlock(sblock);
1478  
1479  	return 0;
1480  }
1481  
1482  int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1483  			      dm_block_t *result)
1484  {
1485  	int r = -EINVAL;
1486  
1487  	down_read(&pmd->root_lock);
1488  	if (!pmd->fail_io)
1489  		r = __get_metadata_snap(pmd, result);
1490  	up_read(&pmd->root_lock);
1491  
1492  	return r;
1493  }
1494  
1495  int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1496  			     struct dm_thin_device **td)
1497  {
1498  	int r = -EINVAL;
1499  
1500  	pmd_write_lock_in_core(pmd);
1501  	if (!pmd->fail_io)
1502  		r = __open_device(pmd, dev, 0, td);
1503  	pmd_write_unlock(pmd);
1504  
1505  	return r;
1506  }
1507  
1508  int dm_pool_close_thin_device(struct dm_thin_device *td)
1509  {
1510  	pmd_write_lock_in_core(td->pmd);
1511  	__close_device(td);
1512  	pmd_write_unlock(td->pmd);
1513  
1514  	return 0;
1515  }
1516  
1517  dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1518  {
1519  	return td->id;
1520  }
1521  
1522  /*
1523   * Check whether @time (of block creation) is older than @td's last snapshot.
1524   * If so then the associated block is shared with the last snapshot device.
1525   * Any block on a device created *after* the device last got snapshotted is
1526   * necessarily not shared.
1527   */
1528  static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1529  {
1530  	return td->snapshotted_time > time;
1531  }
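/*
 * Example: a block created at time 5 on a device last snapshotted at
 * time 8 satisfies 8 > 5, so it predates the snapshot and is shared
 * with it; a block created afterwards at time 9 is not shared.
 */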
1532  
1533  static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
1534  				 struct dm_thin_lookup_result *result)
1535  {
1536  	uint64_t block_time = 0;
1537  	dm_block_t exception_block;
1538  	uint32_t exception_time;
1539  
1540  	block_time = le64_to_cpu(value);
1541  	unpack_block_time(block_time, &exception_block, &exception_time);
1542  	result->block = exception_block;
1543  	result->shared = __snapshotted_since(td, exception_time);
1544  }
1545  
1546  static int __find_block(struct dm_thin_device *td, dm_block_t block,
1547  			int can_issue_io, struct dm_thin_lookup_result *result)
1548  {
1549  	int r;
1550  	__le64 value;
1551  	struct dm_pool_metadata *pmd = td->pmd;
1552  	dm_block_t keys[2] = { td->id, block };
1553  	struct dm_btree_info *info;
1554  
1555  	if (can_issue_io)
1556  		info = &pmd->info;
1557  	else
1558  		info = &pmd->nb_info;
1559  
1560  	r = dm_btree_lookup(info, pmd->root, keys, &value);
1561  	if (!r)
1562  		unpack_lookup_result(td, value, result);
1563  
1564  	return r;
1565  }
1566  
1567  int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1568  		       int can_issue_io, struct dm_thin_lookup_result *result)
1569  {
1570  	int r;
1571  	struct dm_pool_metadata *pmd = td->pmd;
1572  
1573  	down_read(&pmd->root_lock);
1574  	if (pmd->fail_io) {
1575  		up_read(&pmd->root_lock);
1576  		return -EINVAL;
1577  	}
1578  
1579  	r = __find_block(td, block, can_issue_io, result);
1580  
1581  	up_read(&pmd->root_lock);
1582  	return r;
1583  }
1584  
1585  static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
1586  					  dm_block_t *vblock,
1587  					  struct dm_thin_lookup_result *result)
1588  {
1589  	int r;
1590  	__le64 value;
1591  	struct dm_pool_metadata *pmd = td->pmd;
1592  	dm_block_t keys[2] = { td->id, block };
1593  
1594  	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
1595  	if (!r)
1596  		unpack_lookup_result(td, value, result);
1597  
1598  	return r;
1599  }
1600  
1601  static int __find_mapped_range(struct dm_thin_device *td,
1602  			       dm_block_t begin, dm_block_t end,
1603  			       dm_block_t *thin_begin, dm_block_t *thin_end,
1604  			       dm_block_t *pool_begin, bool *maybe_shared)
1605  {
1606  	int r;
1607  	dm_block_t pool_end;
1608  	struct dm_thin_lookup_result lookup;
1609  
1610  	if (end < begin)
1611  		return -ENODATA;
1612  
1613  	r = __find_next_mapped_block(td, begin, &begin, &lookup);
1614  	if (r)
1615  		return r;
1616  
1617  	if (begin >= end)
1618  		return -ENODATA;
1619  
1620  	*thin_begin = begin;
1621  	*pool_begin = lookup.block;
1622  	*maybe_shared = lookup.shared;
1623  
1624  	begin++;
1625  	pool_end = *pool_begin + 1;
1626  	while (begin != end) {
1627  		r = __find_block(td, begin, true, &lookup);
1628  		if (r) {
1629  			if (r == -ENODATA)
1630  				break;
1631  
1632  			return r;
1633  		}
1634  
1635  		if ((lookup.block != pool_end) ||
1636  		    (lookup.shared != *maybe_shared))
1637  			break;
1638  
1639  		pool_end++;
1640  		begin++;
1641  	}
1642  
1643  	*thin_end = begin;
1644  	return 0;
1645  }
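/*
 * Example: with thin block 10 -> pool 100, 11 -> pool 101 and
 * 12 -> pool 200 mapped, searching [10, 15) yields thin [10, 12) with
 * pool_begin 100; the run stops at 12 because pool block 200 does not
 * continue the 100, 101 run.
 */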
1646  
1647  int dm_thin_find_mapped_range(struct dm_thin_device *td,
1648  			      dm_block_t begin, dm_block_t end,
1649  			      dm_block_t *thin_begin, dm_block_t *thin_end,
1650  			      dm_block_t *pool_begin, bool *maybe_shared)
1651  {
1652  	int r = -EINVAL;
1653  	struct dm_pool_metadata *pmd = td->pmd;
1654  
1655  	down_read(&pmd->root_lock);
1656  	if (!pmd->fail_io) {
1657  		r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
1658  					pool_begin, maybe_shared);
1659  	}
1660  	up_read(&pmd->root_lock);
1661  
1662  	return r;
1663  }
1664  
1665  static int __insert(struct dm_thin_device *td, dm_block_t block,
1666  		    dm_block_t data_block)
1667  {
1668  	int r, inserted;
1669  	__le64 value;
1670  	struct dm_pool_metadata *pmd = td->pmd;
1671  	dm_block_t keys[2] = { td->id, block };
1672  
1673  	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1674  	__dm_bless_for_disk(&value);
1675  
1676  	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1677  				   &pmd->root, &inserted);
1678  	if (r)
1679  		return r;
1680  
1681  	td->changed = true;
1682  	if (inserted)
1683  		td->mapped_blocks++;
1684  
1685  	return 0;
1686  }
1687  
1688  int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1689  			 dm_block_t data_block)
1690  {
1691  	int r = -EINVAL;
1692  
1693  	pmd_write_lock(td->pmd);
1694  	if (!td->pmd->fail_io)
1695  		r = __insert(td, block, data_block);
1696  	pmd_write_unlock(td->pmd);
1697  
1698  	return r;
1699  }
1700  
1701  static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
1702  {
1703  	int r;
1704  	unsigned int count, total_count = 0;
1705  	struct dm_pool_metadata *pmd = td->pmd;
1706  	dm_block_t keys[1] = { td->id };
1707  	__le64 value;
1708  	dm_block_t mapping_root;
1709  
1710  	/*
1711  	 * Find the mapping tree
1712  	 */
1713  	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
1714  	if (r)
1715  		return r;
1716  
1717  	/*
1718  	 * Remove from the mapping tree, taking care to inc the
1719  	 * ref count so it doesn't get deleted.
1720  	 */
1721  	mapping_root = le64_to_cpu(value);
1722  	dm_tm_inc(pmd->tm, mapping_root);
1723  	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
1724  	if (r)
1725  		return r;
1726  
1727  	/*
1728  	 * dm_btree_remove_leaves() stops at the first unmapped entry, so we
1729  	 * loop round finding mapped ranges.
1730  	 */
1731  	while (begin < end) {
1732  		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
1733  		if (r == -ENODATA)
1734  			break;
1735  
1736  		if (r)
1737  			return r;
1738  
1739  		if (begin >= end)
1740  			break;
1741  
1742  		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
1743  		if (r)
1744  			return r;
1745  
1746  		total_count += count;
1747  	}
1748  
1749  	td->mapped_blocks -= total_count;
1750  	td->changed = true;
1751  
1752  	/*
1753  	 * Reinsert the mapping tree.
1754  	 */
1755  	value = cpu_to_le64(mapping_root);
1756  	__dm_bless_for_disk(&value);
1757  	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
1758  }
1759  
1760  int dm_thin_remove_range(struct dm_thin_device *td,
1761  			 dm_block_t begin, dm_block_t end)
1762  {
1763  	int r = -EINVAL;
1764  
1765  	pmd_write_lock(td->pmd);
1766  	if (!td->pmd->fail_io)
1767  		r = __remove_range(td, begin, end);
1768  	pmd_write_unlock(td->pmd);
1769  
1770  	return r;
1771  }
1772  
1773  int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1774  {
1775  	int r = -EINVAL;
1776  	uint32_t ref_count;
1777  
1778  	down_read(&pmd->root_lock);
1779  	if (!pmd->fail_io) {
1780  		r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1781  		if (!r)
1782  			*result = (ref_count > 1);
1783  	}
1784  	up_read(&pmd->root_lock);
1785  
1786  	return r;
1787  }
1788  
1789  int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
1790  {
1791  	int r = -EINVAL;
1792  
1793  	pmd_write_lock(pmd);
1794  	if (!pmd->fail_io)
1795  		r = dm_sm_inc_blocks(pmd->data_sm, b, e);
1796  	pmd_write_unlock(pmd);
1797  
1798  	return r;
1799  }
1800  
dm_pool_dec_data_range(struct dm_pool_metadata * pmd,dm_block_t b,dm_block_t e)1801  int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
1802  {
1803  	int r = -EINVAL;
1804  
1805  	pmd_write_lock(pmd);
1806  	if (!pmd->fail_io)
1807  		r = dm_sm_dec_blocks(pmd->data_sm, b, e);
1808  	pmd_write_unlock(pmd);
1809  
1810  	return r;
1811  }
1812  
bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->changed;
	up_read(&td->pmd->root_lock);

	return r;
}

bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
{
	bool r = false;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->changed) {
			r = true;
			break;
		}
	}
	up_read(&pmd->root_lock);

	return r;
}

bool dm_thin_aborted_changes(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->aborted_with_changes;
	up_read(&td->pmd->root_lock);

	return r;
}

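/*
 * Allocate a single free block from the data space map.
 */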
int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_new_block(pmd->data_sm, result);
	pmd_write_unlock(pmd);

	return r;
}

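/*
 * Commit the open transaction to disk and immediately begin the next
 * one, so there is always a transaction in flight.
 */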
int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/*
	 * Care is taken not to have commit be what triggers putting
	 * the thin-pool in-service.
	 */
	pmd_write_lock_in_core(pmd);
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);
	if (r < 0)
		goto out;

	/*
	 * Open the next transaction.
	 */
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);
	return r;
}

static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
{
	struct dm_thin_device *td;

	list_for_each_entry(td, &pmd->thin_devices, list)
		td->aborted_with_changes = td->changed;
}

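/*
 * Throw away all uncommitted changes: tear down the in-core
 * persistent-data objects, reset the block manager and rebuild
 * everything from the last committed superblock.  fail_io is checked
 * twice because the first check is done without the lock held.
 */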
int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/* fail_io is double-checked with pmd->root_lock held below */
	if (unlikely(pmd->fail_io))
		return r;

	pmd_write_lock(pmd);
	if (pmd->fail_io) {
		pmd_write_unlock(pmd);
		return r;
	}
	__set_abort_with_changes_flags(pmd);

	/* destroy data_sm/metadata_sm/nb_tm/tm */
	__destroy_persistent_data_objects(pmd, false);

	/* reset bm */
	dm_block_manager_reset(pmd->bm);

	/* rebuild data_sm/metadata_sm/nb_tm/tm */
	r = __open_or_format_metadata(pmd, false);
	if (r)
		pmd->fail_io = true;
	pmd_write_unlock(pmd);
	return r;
}

int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

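/*
 * As dm_pool_get_free_block_count(), but for the metadata device, with
 * pmd->metadata_reserve subtracted from the raw free count (clamped at
 * zero) before it is reported to the caller.
 */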
int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = td->mapped_blocks;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

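/*
 * Find the highest mapped virtual block for a device by walking its
 * mapping subtree with dm_btree_find_highest_key().
 */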
static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	int r;
	__le64 value_le;
	dm_block_t thin_root;
	struct dm_pool_metadata *pmd = td->pmd;

	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
	if (r)
		return r;

	thin_root = le64_to_cpu(value_le);

	return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
}

int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __highest_block(td, result);
	up_read(&pmd->root_lock);

	return r;
}

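/*
 * Space maps may only grow; a request for the current size is a no-op
 * and any attempt to shrink fails with -EINVAL.
 */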
static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	int r;
	dm_block_t old_count;

	r = dm_sm_get_nr_blocks(sm, &old_count);
	if (r)
		return r;

	if (new_count == old_count)
		return 0;

	if (new_count < old_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	return dm_sm_extend(sm, new_count - old_count);
}

int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}

void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_write(pmd->bm);
	pmd_write_unlock(pmd);
}

int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io) {
		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
						      threshold, fn, context);
	}
	pmd_write_unlock(pmd);

	return r;
}

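/*
 * Register a callback to be run just before each metadata commit,
 * typically used to flush the data device so committed metadata never
 * references unwritten data.
 */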
void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
					  dm_pool_pre_commit_fn fn,
					  void *context)
{
	pmd_write_lock_in_core(pmd);
	pmd->pre_commit_fn = fn;
	pmd->pre_commit_context = context;
	pmd_write_unlock(pmd);
}

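/*
 * Persist THIN_METADATA_NEEDS_CHECK_FLAG to the superblock so that
 * userspace tooling (e.g. thin_check) knows the metadata needs
 * verification before further use.
 */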
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;

	r = superblock_lock(pmd, &sblock);
	if (r) {
		DMERR("couldn't lock superblock");
		goto out;
	}

	disk_super = dm_block_data(sblock);
	disk_super->flags = cpu_to_le32(pmd->flags);

	dm_bm_unlock(sblock);
out:
	pmd_write_unlock(pmd);
	return r;
}

bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
{
	bool needs_check;

	down_read(&pmd->root_lock);
	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
	up_read(&pmd->root_lock);

	return needs_check;
}

void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		dm_tm_issue_prefetches(pmd->tm);
	up_read(&pmd->root_lock);
}