xref: /freebsd/sys/contrib/openzfs/module/zfs/brt.c (revision 63f537551380d2dab29fa402ad1269feae17e594)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/ddt.h>
32 #include <sys/bitmap.h>
33 #include <sys/zap.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/arc.h>
36 #include <sys/dsl_pool.h>
37 #include <sys/dsl_scan.h>
38 #include <sys/vdev_impl.h>
39 #include <sys/kstat.h>
40 #include <sys/wmsum.h>
41 
42 /*
43  * Block Cloning design.
44  *
45  * Block Cloning allows to manually clone a file (or a subset of its blocks)
46  * into another (or the same) file by just creating additional references to
47  * the data blocks without copying the data itself. Those references are kept
48  * in the Block Reference Tables (BRTs).
49  *
50  * In many ways this is similar to the existing deduplication, but there are
51  * some important differences:
52  *
53  * - Deduplication is automatic and Block Cloning is not - one has to use a
54  *   dedicated system call(s) to clone the given file/blocks.
55  * - Deduplication keeps all data blocks in its table, even those referenced
56  *   just once. Block Cloning creates an entry in its tables only when there
57  *   are at least two references to the given data block. If the block was
58  *   never explicitly cloned or the second to last reference was dropped,
59  *   there will be neither space nor performance overhead.
60  * - Deduplication needs data to work - one needs to pass real data to the
61  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
62  *   data, just block pointers to the data, so it is extremely fast, as we pay
63  *   neither the cost of reading the data, nor the cost of writing the data -
64  *   we operate exclusively on metadata.
65  * - If the D (dedup) bit is not set in the block pointer, it means that
66  *   the block is not in the dedup table (DDT) and we won't consult the DDT
67  *   when we need to free the block. Block Cloning must be consulted on every
68  *   free, because we cannot modify the source BP (eg. by setting something
69  *   similar to the D bit), thus we have no hint if the block is in the
70  *   Block Reference Table (BRT), so we need to look into the BRT. There is
71  *   an optimization in place that allows us to eliminate the majority of BRT
72  *   lookups which is described below in the "Minimizing free penalty" section.
73  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
74  *   64bit offset and 64bit reference counter.
75  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
76  *   other on disk are most likely in totally different parts of the DDT.
77  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
78  *   from one file should have BRT entries close to each other.
79  * - Scrub will only do a single pass over a block that is referenced multiple
80  *   times in the DDT. Unfortunately it is not currently (if at all) possible
81  *   with Block Cloning and block referenced multiple times will be scrubbed
82  *   multiple times. The new, sorted scrub should be able to eliminate
83  *   duplicated reads given enough memory.
84  * - Deduplication requires cryptographically strong hash as a checksum or
85  *   additional data verification. Block Cloning works with any checksum
86  *   algorithm or even with checksumming disabled.
87  *
88  * As mentioned above, the BRT entries are much smaller than the DDT entries.
89  * To uniquely identify a block we just need its vdev id and offset. We also
90  * need to maintain a reference counter. The vdev id will often repeat, as there
91  * is a small number of top-level VDEVs and a large number of blocks stored in
92  * each VDEV. We take advantage of that to reduce the BRT entry size further by
93  * maintaining one BRT for each top-level VDEV, so we can then have only offset
94  * and counter as the BRT entry.
95  *
96  * Minimizing free penalty.
97  *
98  * Block Cloning allows creating additional references to any existing block.
99  * When we free a block there is no hint in the block pointer whether the block
100  * was cloned or not, so on each free we have to check if there is a
101  * corresponding entry in the BRT or not. If there is, we need to decrease
102  * the reference counter. Doing BRT lookup on every free can potentially be
103  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
104  * This is the main problem with deduplication, so we've learned our lesson and
105  * try not to repeat the same mistake here. How do we do that? We divide each
106  * top-level VDEV into 16MB regions. For each region we maintain a counter that
107  * is a sum of all the BRT entries that have offsets within the region. This
108  * creates the entries count array of 16bit numbers for each top-level VDEV.
109  * The entries count array is always kept in memory and updated on disk in the
110  * same transaction group as the BRT updates to keep everything in-sync. We can
111  * keep the array in memory, because it is very small. With 16MB regions and
112  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
113  * the region size even further in the future). Now, when we want to free
114  * a block, we first consult the array. If the counter for the whole region is
115  * zero, there is no need to look for the BRT entry, as there isn't one for
116  * sure. If the counter for the region is greater than zero, only then we will
117  * do a BRT lookup and if an entry is found we will decrease the reference
118  * counter in the BRT entry and in the entry counters array.
119  *
120  * The entry counters array is small, but can potentially be larger for very
121  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
122  * array on every change. We then divide the array into 32kB block and keep
123  * a bitmap of dirty blocks within a transaction group. When we sync the
124  * transaction group we can only update the parts of the entry counters array
125  * that were modified. Note: Keeping track of the dirty parts of the entry
126  * counters array is implemented, but updating only parts of the array on disk
127  * is not yet implemented - for now we will update entire array if there was
128  * any change.
129  *
130  * The implementation tries to be economic: if BRT is not used, or no longer
131  * used, there will be no entries in the MOS and no additional memory used (eg.
132  * the entry counters array is only allocated if needed).
133  *
134  * Interaction between Deduplication and Block Cloning.
135  *
136  * If both functionalities are in use, we could end up with a block that is
137  * referenced multiple times in both DDT and BRT. When we free one of the
138  * references we couldn't tell where it belongs, so we would have to decide
139  * what table takes the precedence: do we first clear DDT references or BRT
140  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
141  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
142  * lookup DDT entry instead and increase the counter there. No BRT entry
143  * will be created for a block which has the D (dedup) bit set.
144  * BRT may be more efficient for manual deduplication, but if the block is
145  * already in the DDT, then creating additional BRT entry would be less
146  * efficient. This clever idea was proposed by Allan Jude.
147  *
148  * Block Cloning across datasets.
149  *
150  * Block Cloning is not limited to cloning blocks within the same dataset.
151  * It is possible (and very useful) to clone blocks between different datasets.
152  * One use case is recovering files from snapshots. By cloning the files into
153  * dataset we need no additional storage. Without Block Cloning we would need
154  * additional space for those files.
155  * Another interesting use case is moving the files between datasets
156  * (copying the file content to the new dataset and removing the source file).
157  * In that case Block Cloning will only be used briefly, because the BRT entries
158  * will be removed when the source is removed.
159  * Note: currently it is not possible to clone blocks between encrypted
160  * datasets, even if those datasets use the same encryption key (this includes
161  * snapshots of encrypted datasets). Cloning blocks between datasets that use
162  * the same keys should be possible and should be implemented in the future.
163  *
164  * Block Cloning flow through ZFS layers.
165  *
166  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
167  * blocks. As of this writing no interface is implemented that allows for block
168  * cloning within a ZVOL.
169  * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
170  * for blocking cloning.
171  *
172  *	ssize_t
173  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
174  *	                size_t len, unsigned int flags);
175  *
176  * Even though offsets and length represent bytes, they have to be
177  * block-aligned or we will return an error so the upper layer can
178  * fallback to the generic mechanism that will just copy the data.
179  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
180  * This function was implemented based on zfs_write(), but instead of writing
181  * the given data we first read block pointers using the new dmu_read_l0_bps()
182  * function from the source file. Once we have BPs from the source file we call
183  * the dmu_brt_clone() function on the destination file. This function
184  * allocates BPs for us. We iterate over all source BPs. If the given BP is
185  * a hole or an embedded block, we just copy BP as-is. If it points to a real
186  * data we place this BP on a BRT pending list using the brt_pending_add()
187  * function.
188  *
189  * We use this pending list to keep track of all BPs that got new references
190  * within this transaction group.
191  *
192  * Some special cases to consider and how we address them:
193  * - The block we want to clone may have been created within the same
194  *   transaction group that we are trying to clone. Such block has no BP
195  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
196  * - The block we want to clone may have been modified within the same
197  *   transaction group. We return EAGAIN.
198  * - A block may be cloned multiple times during one transaction group (that's
199  *   why pending list is actually a tree and not an append-only list - this
200  *   way we can figure out faster if this block is cloned for the first time
201  *   in this txg or consecutive time).
202  * - A block may be cloned and freed within the same transaction group
203  *   (see dbuf_undirty()).
204  * - A block may be cloned and within the same transaction group the clone
205  *   can be cloned again (see dmu_read_l0_bps()).
206  * - A file might have been deleted, but the caller still has a file descriptor
207  *   open to this file and clones it.
208  *
209  * When we free a block we have an additional step in the ZIO pipeline where we
210  * call the zio_brt_free() function. We then call the brt_entry_decref()
211  * that loads the corresponding BRT entry (if one exists) and decreases
212  * reference counter. If this is not the last reference we will stop ZIO
213  * pipeline here. If this is the last reference or the block is not in the
214  * BRT, we continue the pipeline and free the block as usual.
215  *
216  * At the beginning of spa_sync() where there can be no more block cloning,
217  * but before issuing frees we call brt_pending_apply(). This function applies
218  * all the new clones to the BRT table - we load BRT entries and update
219  * reference counters. To sync new BRT entries to disk, we use brt_sync()
220  * function. This function will sync all dirty per-top-level-vdev BRTs,
221  * the entry counters arrays, etc.
222  *
223  * Block Cloning and ZIL.
224  *
225  * Every clone operation is divided into chunks (similar to write) and each
226  * chunk is cloned in a separate transaction. The chunk size is determined by
227  * how many BPs we can fit into a single ZIL entry.
228  * Replaying clone operation is different from the regular clone operation,
229  * as when we log clone operations we cannot use the source object - it may
230  * reside on a different dataset, so we log BPs we want to clone.
231  * The ZIL is replayed when we mount the given dataset, not when the pool is
232  * imported. Taking this into account it is possible that the pool is imported
233  * without mounting datasets and the source dataset is destroyed before the
234  * destination dataset is mounted and its ZIL replayed.
235  * To address this situation we leverage zil_claim() mechanism where ZFS will
236  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
237  * entries, we will bump reference counters for their BPs in the BRT and then
238  * on mount and ZIL replay we will just attach BPs to the file without
239  * bumping reference counters.
240  * Note it is still possible that after zil_claim() we never mount the
241  * destination, so we never replay its ZIL and we destroy it. This way we would
242  * end up with leaked references in BRT. We address that too as ZFS gives us
243  * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
244  */
245 
246 /*
247  * BRT - Block Reference Table.
248  */
249 #define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
250 
251 /*
252  * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
253  * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
254  * Each element in this array represents how many BRT entries do we have in this
255  * chunk of storage. We always load this entire array into memory and update as
256  * needed. By having it in memory we can quickly tell (during zio_free()) if
257  * there are any BRT entries that we might need to update.
258  *
259  * This value cannot be larger than 16MB, at least as long as we support
260  * 512 byte block sizes. With 512 byte block size we can have exactly
261  * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
262  * many for a 16bit counter.
263  */
264 #define	BRT_RANGESIZE	(16 * 1024 * 1024)
265 _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
266 	"BRT_RANGESIZE is too large.");
267 /*
268  * We don't want to update the whole structure every time. Maintain bitmap
269  * of dirty blocks within the regions, so that a single bit represents a
270  * block size of entcounts. For example if we have a 1PB vdev then all
271  * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
272  * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
273  * the whole 128MB on disk when we have updated only a single entcount.
274  * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
275  * is represented by a single bit. This gives us 4096 bits. A set bit in the
276  * bitmap means that we had a change in at least one of the 16384 entcounts
277  * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
278  */
279 #define	BRT_BLOCKSIZE	(32 * 1024)
280 #define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
281 	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
282 
283 #define	BRT_LITTLE_ENDIAN	0
284 #define	BRT_BIG_ENDIAN		1
285 #ifdef _ZFS_LITTLE_ENDIAN
286 #define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
287 #define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
288 #else
289 #define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
290 #define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
291 #endif
292 
293 typedef struct brt_vdev_phys {
294 	uint64_t	bvp_mos_entries;
295 	uint64_t	bvp_size;
296 	uint64_t	bvp_byteorder;
297 	uint64_t	bvp_totalcount;
298 	uint64_t	bvp_rangesize;
299 	uint64_t	bvp_usedspace;
300 	uint64_t	bvp_savedspace;
301 } brt_vdev_phys_t;
302 
303 typedef struct brt_vdev {
304 	/*
305 	 * VDEV id.
306 	 */
307 	uint64_t	bv_vdevid;
308 	/*
309 	 * Is the structure initiated?
310 	 * (bv_entcount and bv_bitmap are allocated?)
311 	 */
312 	boolean_t	bv_initiated;
313 	/*
314 	 * Object number in the MOS for the entcount array and brt_vdev_phys.
315 	 */
316 	uint64_t	bv_mos_brtvdev;
317 	/*
318 	 * Object number in the MOS for the entries table.
319 	 */
320 	uint64_t	bv_mos_entries;
321 	/*
322 	 * Entries to sync.
323 	 */
324 	avl_tree_t	bv_tree;
325 	/*
326 	 * Does the bv_entcount[] array needs byte swapping?
327 	 */
328 	boolean_t	bv_need_byteswap;
329 	/*
330 	 * Number of entries in the bv_entcount[] array.
331 	 */
332 	uint64_t	bv_size;
333 	/*
334 	 * This is the array with BRT entry count per BRT_RANGESIZE.
335 	 */
336 	uint16_t	*bv_entcount;
337 	/*
338 	 * Sum of all bv_entcount[]s.
339 	 */
340 	uint64_t	bv_totalcount;
341 	/*
342 	 * Space on disk occupied by cloned blocks (without compression).
343 	 */
344 	uint64_t	bv_usedspace;
345 	/*
346 	 * How much additional space would be occupied without block cloning.
347 	 */
348 	uint64_t	bv_savedspace;
349 	/*
350 	 * brt_vdev_phys needs updating on disk.
351 	 */
352 	boolean_t	bv_meta_dirty;
353 	/*
354 	 * bv_entcount[] needs updating on disk.
355 	 */
356 	boolean_t	bv_entcount_dirty;
357 	/*
358 	 * bv_entcount[] potentially can be a bit too big to sychronize it all
359 	 * when we just changed few entcounts. The fields below allow us to
360 	 * track updates to bv_entcount[] array since the last sync.
361 	 * A single bit in the bv_bitmap represents as many entcounts as can
362 	 * fit into a single BRT_BLOCKSIZE.
363 	 * For example we have 65536 entcounts in the bv_entcount array
364 	 * (so the whole array is 128kB). We updated bv_entcount[2] and
365 	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
366 	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
367 	 */
368 	ulong_t		*bv_bitmap;
369 	uint64_t	bv_nblocks;
370 } brt_vdev_t;
371 
372 /*
373  * In-core brt
374  */
375 typedef struct brt {
376 	krwlock_t	brt_lock;
377 	spa_t		*brt_spa;
378 #define	brt_mos		brt_spa->spa_meta_objset
379 	uint64_t	brt_rangesize;
380 	uint64_t	brt_usedspace;
381 	uint64_t	brt_savedspace;
382 	avl_tree_t	brt_pending_tree[TXG_SIZE];
383 	kmutex_t	brt_pending_lock[TXG_SIZE];
384 	/* Sum of all entries across all bv_trees. */
385 	uint64_t	brt_nentries;
386 	brt_vdev_t	*brt_vdevs;
387 	uint64_t	brt_nvdevs;
388 } brt_t;
389 
390 /* Size of bre_offset / sizeof (uint64_t). */
391 #define	BRT_KEY_WORDS	(1)
392 
393 /*
394  * In-core brt entry.
395  * On-disk we use bre_offset as the key and bre_refcount as the value.
396  */
397 typedef struct brt_entry {
398 	uint64_t	bre_offset;
399 	uint64_t	bre_refcount;
400 	avl_node_t	bre_node;
401 } brt_entry_t;
402 
403 typedef struct brt_pending_entry {
404 	blkptr_t	bpe_bp;
405 	int		bpe_count;
406 	avl_node_t	bpe_node;
407 } brt_pending_entry_t;
408 
409 static kmem_cache_t *brt_entry_cache;
410 static kmem_cache_t *brt_pending_entry_cache;
411 
412 /*
413  * Enable/disable prefetching of BRT entries that we are going to modify.
414  */
415 int zfs_brt_prefetch = 1;
416 
417 #ifdef ZFS_DEBUG
418 #define	BRT_DEBUG(...)	do {						\
419 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
420 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
421 	}								\
422 } while (0)
423 #else
424 #define	BRT_DEBUG(...)	do { } while (0)
425 #endif
426 
427 int brt_zap_leaf_blockshift = 12;
428 int brt_zap_indirect_blockshift = 12;
429 
430 static kstat_t	*brt_ksp;
431 
432 typedef struct brt_stats {
433 	kstat_named_t brt_addref_entry_in_memory;
434 	kstat_named_t brt_addref_entry_not_on_disk;
435 	kstat_named_t brt_addref_entry_on_disk;
436 	kstat_named_t brt_addref_entry_read_lost_race;
437 	kstat_named_t brt_decref_entry_in_memory;
438 	kstat_named_t brt_decref_entry_loaded_from_disk;
439 	kstat_named_t brt_decref_entry_not_in_memory;
440 	kstat_named_t brt_decref_entry_not_on_disk;
441 	kstat_named_t brt_decref_entry_read_lost_race;
442 	kstat_named_t brt_decref_entry_still_referenced;
443 	kstat_named_t brt_decref_free_data_later;
444 	kstat_named_t brt_decref_free_data_now;
445 	kstat_named_t brt_decref_no_entry;
446 } brt_stats_t;
447 
448 static brt_stats_t brt_stats = {
449 	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
450 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
451 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
452 	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
453 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
454 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
455 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
456 	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
457 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
458 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
459 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
460 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
461 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
462 };
463 
464 struct {
465 	wmsum_t brt_addref_entry_in_memory;
466 	wmsum_t brt_addref_entry_not_on_disk;
467 	wmsum_t brt_addref_entry_on_disk;
468 	wmsum_t brt_addref_entry_read_lost_race;
469 	wmsum_t brt_decref_entry_in_memory;
470 	wmsum_t brt_decref_entry_loaded_from_disk;
471 	wmsum_t brt_decref_entry_not_in_memory;
472 	wmsum_t brt_decref_entry_not_on_disk;
473 	wmsum_t brt_decref_entry_read_lost_race;
474 	wmsum_t brt_decref_entry_still_referenced;
475 	wmsum_t brt_decref_free_data_later;
476 	wmsum_t brt_decref_free_data_now;
477 	wmsum_t brt_decref_no_entry;
478 } brt_sums;
479 
480 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
481 
482 static int brt_entry_compare(const void *x1, const void *x2);
483 static int brt_pending_entry_compare(const void *x1, const void *x2);
484 
485 static void
486 brt_rlock(brt_t *brt)
487 {
488 	rw_enter(&brt->brt_lock, RW_READER);
489 }
490 
491 static void
492 brt_wlock(brt_t *brt)
493 {
494 	rw_enter(&brt->brt_lock, RW_WRITER);
495 }
496 
497 static void
498 brt_unlock(brt_t *brt)
499 {
500 	rw_exit(&brt->brt_lock);
501 }
502 
503 static uint16_t
504 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
505 {
506 
507 	ASSERT3U(idx, <, brtvd->bv_size);
508 
509 	if (brtvd->bv_need_byteswap) {
510 		return (BSWAP_16(brtvd->bv_entcount[idx]));
511 	} else {
512 		return (brtvd->bv_entcount[idx]);
513 	}
514 }
515 
516 static void
517 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
518 {
519 
520 	ASSERT3U(idx, <, brtvd->bv_size);
521 
522 	if (brtvd->bv_need_byteswap) {
523 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
524 	} else {
525 		brtvd->bv_entcount[idx] = entcnt;
526 	}
527 }
528 
529 static void
530 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
531 {
532 	uint16_t entcnt;
533 
534 	ASSERT3U(idx, <, brtvd->bv_size);
535 
536 	entcnt = brt_vdev_entcount_get(brtvd, idx);
537 	ASSERT(entcnt < UINT16_MAX);
538 
539 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
540 }
541 
542 static void
543 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
544 {
545 	uint16_t entcnt;
546 
547 	ASSERT3U(idx, <, brtvd->bv_size);
548 
549 	entcnt = brt_vdev_entcount_get(brtvd, idx);
550 	ASSERT(entcnt > 0);
551 
552 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
553 }
554 
555 #ifdef ZFS_DEBUG
556 static void
557 brt_vdev_dump(brt_t *brt)
558 {
559 	brt_vdev_t *brtvd;
560 	uint64_t vdevid;
561 
562 	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
563 		return;
564 	}
565 
566 	if (brt->brt_nvdevs == 0) {
567 		zfs_dbgmsg("BRT empty");
568 		return;
569 	}
570 
571 	zfs_dbgmsg("BRT vdev dump:");
572 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
573 		uint64_t idx;
574 
575 		brtvd = &brt->brt_vdevs[vdevid];
576 		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
577 		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
578 		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
579 		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
580 		    (u_longlong_t)brtvd->bv_size,
581 		    (u_longlong_t)brtvd->bv_totalcount,
582 		    (u_longlong_t)brtvd->bv_nblocks,
583 		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
584 		if (brtvd->bv_totalcount > 0) {
585 			zfs_dbgmsg("    entcounts:");
586 			for (idx = 0; idx < brtvd->bv_size; idx++) {
587 				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
588 					zfs_dbgmsg("      [%04llu] %hu",
589 					    (u_longlong_t)idx,
590 					    brt_vdev_entcount_get(brtvd, idx));
591 				}
592 			}
593 		}
594 		if (brtvd->bv_entcount_dirty) {
595 			char *bitmap;
596 
597 			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
598 			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
599 				bitmap[idx] =
600 				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
601 			}
602 			bitmap[idx] = '\0';
603 			zfs_dbgmsg("    bitmap: %s", bitmap);
604 			kmem_free(bitmap, brtvd->bv_nblocks + 1);
605 		}
606 	}
607 }
608 #endif
609 
610 static brt_vdev_t *
611 brt_vdev(brt_t *brt, uint64_t vdevid)
612 {
613 	brt_vdev_t *brtvd;
614 
615 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
616 
617 	if (vdevid < brt->brt_nvdevs) {
618 		brtvd = &brt->brt_vdevs[vdevid];
619 	} else {
620 		brtvd = NULL;
621 	}
622 
623 	return (brtvd);
624 }
625 
626 static void
627 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
628 {
629 	char name[64];
630 
631 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
632 	ASSERT0(brtvd->bv_mos_brtvdev);
633 	ASSERT0(brtvd->bv_mos_entries);
634 	ASSERT(brtvd->bv_entcount != NULL);
635 	ASSERT(brtvd->bv_size > 0);
636 	ASSERT(brtvd->bv_bitmap != NULL);
637 	ASSERT(brtvd->bv_nblocks > 0);
638 
639 	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
640 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
641 	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
642 	    0, tx);
643 	VERIFY(brtvd->bv_mos_entries != 0);
644 	BRT_DEBUG("MOS entries created, object=%llu",
645 	    (u_longlong_t)brtvd->bv_mos_entries);
646 
647 	/*
648 	 * We allocate DMU buffer to store the bv_entcount[] array.
649 	 * We will keep array size (bv_size) and cummulative count for all
650 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
651 	 */
652 	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
653 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
654 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
655 	VERIFY(brtvd->bv_mos_brtvdev != 0);
656 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
657 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
658 
659 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
660 	    (u_longlong_t)brtvd->bv_vdevid);
661 	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
662 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
663 	BRT_DEBUG("Pool directory object created, object=%s", name);
664 
665 	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
666 }
667 
668 static void
669 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
670 {
671 	vdev_t *vd;
672 	uint16_t *entcount;
673 	ulong_t *bitmap;
674 	uint64_t nblocks, size;
675 
676 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
677 
678 	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
679 	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
680 	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
681 	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
682 
683 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
684 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
685 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
686 
687 	if (!brtvd->bv_initiated) {
688 		ASSERT0(brtvd->bv_size);
689 		ASSERT(brtvd->bv_entcount == NULL);
690 		ASSERT(brtvd->bv_bitmap == NULL);
691 		ASSERT0(brtvd->bv_nblocks);
692 
693 		avl_create(&brtvd->bv_tree, brt_entry_compare,
694 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
695 	} else {
696 		ASSERT(brtvd->bv_size > 0);
697 		ASSERT(brtvd->bv_entcount != NULL);
698 		ASSERT(brtvd->bv_bitmap != NULL);
699 		ASSERT(brtvd->bv_nblocks > 0);
700 		/*
701 		 * TODO: Allow vdev shrinking. We only need to implement
702 		 * shrinking the on-disk BRT VDEV object.
703 		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
704 		 *     size, tx);
705 		 */
706 		ASSERT3U(brtvd->bv_size, <=, size);
707 
708 		memcpy(entcount, brtvd->bv_entcount,
709 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
710 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
711 		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
712 		vmem_free(brtvd->bv_entcount,
713 		    sizeof (entcount[0]) * brtvd->bv_size);
714 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
715 	}
716 
717 	brtvd->bv_size = size;
718 	brtvd->bv_entcount = entcount;
719 	brtvd->bv_bitmap = bitmap;
720 	brtvd->bv_nblocks = nblocks;
721 	if (!brtvd->bv_initiated) {
722 		brtvd->bv_need_byteswap = FALSE;
723 		brtvd->bv_initiated = TRUE;
724 		BRT_DEBUG("BRT VDEV %llu initiated.",
725 		    (u_longlong_t)brtvd->bv_vdevid);
726 	}
727 }
728 
729 static void
730 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
731 {
732 	char name[64];
733 	dmu_buf_t *db;
734 	brt_vdev_phys_t *bvphys;
735 	int error;
736 
737 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
738 	    (u_longlong_t)brtvd->bv_vdevid);
739 	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
740 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
741 	if (error != 0)
742 		return;
743 	ASSERT(brtvd->bv_mos_brtvdev != 0);
744 
745 	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
746 	ASSERT0(error);
747 	if (error != 0)
748 		return;
749 
750 	bvphys = db->db_data;
751 	if (brt->brt_rangesize == 0) {
752 		brt->brt_rangesize = bvphys->bvp_rangesize;
753 	} else {
754 		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
755 	}
756 
757 	ASSERT(!brtvd->bv_initiated);
758 	brt_vdev_realloc(brt, brtvd);
759 
760 	/* TODO: We don't support VDEV shrinking. */
761 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
762 
763 	/*
764 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
765 	 */
766 	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
767 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
768 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
769 	ASSERT0(error);
770 
771 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
772 	ASSERT(brtvd->bv_mos_entries != 0);
773 	brtvd->bv_need_byteswap =
774 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
775 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
776 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
777 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
778 	brt->brt_usedspace += brtvd->bv_usedspace;
779 	brt->brt_savedspace += brtvd->bv_savedspace;
780 
781 	dmu_buf_rele(db, FTAG);
782 
783 	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
784 	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
785 	    (u_longlong_t)brtvd->bv_mos_entries);
786 }
787 
788 static void
789 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
790 {
791 
792 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
793 	ASSERT(brtvd->bv_initiated);
794 
795 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
796 	brtvd->bv_entcount = NULL;
797 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
798 	brtvd->bv_bitmap = NULL;
799 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
800 	avl_destroy(&brtvd->bv_tree);
801 
802 	brtvd->bv_size = 0;
803 	brtvd->bv_nblocks = 0;
804 
805 	brtvd->bv_initiated = FALSE;
806 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
807 }
808 
809 static void
810 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
811 {
812 	char name[64];
813 	uint64_t count;
814 	dmu_buf_t *db;
815 	brt_vdev_phys_t *bvphys;
816 
817 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
818 	ASSERT(brtvd->bv_mos_brtvdev != 0);
819 	ASSERT(brtvd->bv_mos_entries != 0);
820 
821 	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
822 	VERIFY0(count);
823 	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
824 	BRT_DEBUG("MOS entries destroyed, object=%llu",
825 	    (u_longlong_t)brtvd->bv_mos_entries);
826 	brtvd->bv_mos_entries = 0;
827 
828 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
829 	bvphys = db->db_data;
830 	ASSERT0(bvphys->bvp_totalcount);
831 	ASSERT0(bvphys->bvp_usedspace);
832 	ASSERT0(bvphys->bvp_savedspace);
833 	dmu_buf_rele(db, FTAG);
834 
835 	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
836 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
837 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
838 	brtvd->bv_mos_brtvdev = 0;
839 
840 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
841 	    (u_longlong_t)brtvd->bv_vdevid);
842 	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
843 	BRT_DEBUG("Pool directory object removed, object=%s", name);
844 
845 	brt_vdev_dealloc(brt, brtvd);
846 
847 	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
848 }
849 
850 static void
851 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
852 {
853 	brt_vdev_t *brtvd, *vdevs;
854 	uint64_t vdevid;
855 
856 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
857 	ASSERT3U(nvdevs, >, brt->brt_nvdevs);
858 
859 	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
860 	if (brt->brt_nvdevs > 0) {
861 		ASSERT(brt->brt_vdevs != NULL);
862 
863 		memcpy(vdevs, brt->brt_vdevs,
864 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
865 		kmem_free(brt->brt_vdevs,
866 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
867 	}
868 	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
869 		brtvd = &vdevs[vdevid];
870 
871 		brtvd->bv_vdevid = vdevid;
872 		brtvd->bv_initiated = FALSE;
873 	}
874 
875 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
876 	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
877 
878 	brt->brt_vdevs = vdevs;
879 	brt->brt_nvdevs = nvdevs;
880 }
881 
882 static boolean_t
883 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
884 {
885 	uint64_t idx;
886 
887 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
888 
889 	idx = bre->bre_offset / brt->brt_rangesize;
890 	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
891 		/* VDEV wasn't expanded. */
892 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
893 	}
894 
895 	return (FALSE);
896 }
897 
898 static void
899 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
900     uint64_t dsize)
901 {
902 	uint64_t idx;
903 
904 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
905 	ASSERT(brtvd != NULL);
906 	ASSERT(brtvd->bv_entcount != NULL);
907 
908 	brt->brt_savedspace += dsize;
909 	brtvd->bv_savedspace += dsize;
910 	brtvd->bv_meta_dirty = TRUE;
911 
912 	if (bre->bre_refcount > 1) {
913 		return;
914 	}
915 
916 	brt->brt_usedspace += dsize;
917 	brtvd->bv_usedspace += dsize;
918 
919 	idx = bre->bre_offset / brt->brt_rangesize;
920 	if (idx >= brtvd->bv_size) {
921 		/* VDEV has been expanded. */
922 		brt_vdev_realloc(brt, brtvd);
923 	}
924 
925 	ASSERT3U(idx, <, brtvd->bv_size);
926 
927 	brtvd->bv_totalcount++;
928 	brt_vdev_entcount_inc(brtvd, idx);
929 	brtvd->bv_entcount_dirty = TRUE;
930 	idx = idx / BRT_BLOCKSIZE / 8;
931 	BT_SET(brtvd->bv_bitmap, idx);
932 
933 #ifdef ZFS_DEBUG
934 	brt_vdev_dump(brt);
935 #endif
936 }
937 
938 static void
939 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
940     uint64_t dsize)
941 {
942 	uint64_t idx;
943 
944 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
945 	ASSERT(brtvd != NULL);
946 	ASSERT(brtvd->bv_entcount != NULL);
947 
948 	brt->brt_savedspace -= dsize;
949 	brtvd->bv_savedspace -= dsize;
950 	brtvd->bv_meta_dirty = TRUE;
951 
952 	if (bre->bre_refcount > 0) {
953 		return;
954 	}
955 
956 	brt->brt_usedspace -= dsize;
957 	brtvd->bv_usedspace -= dsize;
958 
959 	idx = bre->bre_offset / brt->brt_rangesize;
960 	ASSERT3U(idx, <, brtvd->bv_size);
961 
962 	ASSERT(brtvd->bv_totalcount > 0);
963 	brtvd->bv_totalcount--;
964 	brt_vdev_entcount_dec(brtvd, idx);
965 	brtvd->bv_entcount_dirty = TRUE;
966 	idx = idx / BRT_BLOCKSIZE / 8;
967 	BT_SET(brtvd->bv_bitmap, idx);
968 
969 #ifdef ZFS_DEBUG
970 	brt_vdev_dump(brt);
971 #endif
972 }
973 
974 static void
975 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
976 {
977 	dmu_buf_t *db;
978 	brt_vdev_phys_t *bvphys;
979 
980 	ASSERT(brtvd->bv_meta_dirty);
981 	ASSERT(brtvd->bv_mos_brtvdev != 0);
982 	ASSERT(dmu_tx_is_syncing(tx));
983 
984 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
985 
986 	if (brtvd->bv_entcount_dirty) {
987 		/*
988 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
989 		 */
990 		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
991 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
992 		    brtvd->bv_entcount, tx);
993 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
994 		brtvd->bv_entcount_dirty = FALSE;
995 	}
996 
997 	dmu_buf_will_dirty(db, tx);
998 	bvphys = db->db_data;
999 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
1000 	bvphys->bvp_size = brtvd->bv_size;
1001 	if (brtvd->bv_need_byteswap) {
1002 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
1003 	} else {
1004 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
1005 	}
1006 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
1007 	bvphys->bvp_rangesize = brt->brt_rangesize;
1008 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
1009 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
1010 	dmu_buf_rele(db, FTAG);
1011 
1012 	brtvd->bv_meta_dirty = FALSE;
1013 }
1014 
1015 static void
1016 brt_vdevs_alloc(brt_t *brt, boolean_t load)
1017 {
1018 	brt_vdev_t *brtvd;
1019 	uint64_t vdevid;
1020 
1021 	brt_wlock(brt);
1022 
1023 	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
1024 
1025 	if (load) {
1026 		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1027 			brtvd = &brt->brt_vdevs[vdevid];
1028 			ASSERT(brtvd->bv_entcount == NULL);
1029 
1030 			brt_vdev_load(brt, brtvd);
1031 		}
1032 	}
1033 
1034 	if (brt->brt_rangesize == 0) {
1035 		brt->brt_rangesize = BRT_RANGESIZE;
1036 	}
1037 
1038 	brt_unlock(brt);
1039 }
1040 
1041 static void
1042 brt_vdevs_free(brt_t *brt)
1043 {
1044 	brt_vdev_t *brtvd;
1045 	uint64_t vdevid;
1046 
1047 	brt_wlock(brt);
1048 
1049 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1050 		brtvd = &brt->brt_vdevs[vdevid];
1051 		if (brtvd->bv_initiated)
1052 			brt_vdev_dealloc(brt, brtvd);
1053 	}
1054 	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
1055 
1056 	brt_unlock(brt);
1057 }
1058 
1059 static void
1060 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
1061 {
1062 
1063 	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
1064 	bre->bre_refcount = 0;
1065 
1066 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
1067 }
1068 
1069 static int
1070 brt_entry_compare(const void *x1, const void *x2)
1071 {
1072 	const brt_entry_t *bre1 = x1;
1073 	const brt_entry_t *bre2 = x2;
1074 
1075 	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
1076 }
1077 
1078 static int
1079 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
1080 {
1081 	uint64_t mos_entries;
1082 	uint64_t one, physsize;
1083 	int error;
1084 
1085 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1086 
1087 	if (!brt_vdev_lookup(brt, brtvd, bre))
1088 		return (SET_ERROR(ENOENT));
1089 
1090 	/*
1091 	 * Remember mos_entries object number. After we reacquire the BRT lock,
1092 	 * the brtvd pointer may be invalid.
1093 	 */
1094 	mos_entries = brtvd->bv_mos_entries;
1095 	if (mos_entries == 0)
1096 		return (SET_ERROR(ENOENT));
1097 
1098 	brt_unlock(brt);
1099 
1100 	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
1101 	    BRT_KEY_WORDS, &one, &physsize);
1102 	if (error == 0) {
1103 		ASSERT3U(one, ==, 1);
1104 		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
1105 
1106 		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
1107 		    &bre->bre_offset, BRT_KEY_WORDS, 1,
1108 		    sizeof (bre->bre_refcount), &bre->bre_refcount);
1109 		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
1110 		    "count=%llu error=%d", (u_longlong_t)mos_entries,
1111 		    (u_longlong_t)brtvd->bv_vdevid,
1112 		    (u_longlong_t)bre->bre_offset,
1113 		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
1114 	}
1115 
1116 	brt_wlock(brt);
1117 
1118 	return (error);
1119 }
1120 
1121 static void
1122 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
1123 {
1124 	brt_vdev_t *brtvd;
1125 	uint64_t mos_entries = 0;
1126 
1127 	brt_rlock(brt);
1128 	brtvd = brt_vdev(brt, vdevid);
1129 	if (brtvd != NULL)
1130 		mos_entries = brtvd->bv_mos_entries;
1131 	brt_unlock(brt);
1132 
1133 	if (mos_entries == 0)
1134 		return;
1135 
1136 	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
1137 	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
1138 	    (u_longlong_t)bre->bre_offset);
1139 	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
1140 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
1141 }
1142 
1143 static int
1144 brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1145 {
1146 	int error;
1147 
1148 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1149 	ASSERT(brtvd->bv_mos_entries != 0);
1150 	ASSERT(bre->bre_refcount > 0);
1151 
1152 	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
1153 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
1154 	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
1155 	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
1156 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
1157 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
1158 	    (u_longlong_t)bre->bre_refcount, error);
1159 
1160 	return (error);
1161 }
1162 
1163 static int
1164 brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1165 {
1166 	int error;
1167 
1168 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1169 	ASSERT(brtvd->bv_mos_entries != 0);
1170 	ASSERT0(bre->bre_refcount);
1171 
1172 	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
1173 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
1174 	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
1175 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
1176 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
1177 	    (u_longlong_t)bre->bre_refcount, error);
1178 
1179 	return (error);
1180 }
1181 
1182 /*
1183  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
1184  * positive, but gives us quick answer if we should look into BRT, which
1185  * may require reads and thus will be more expensive.
1186  */
1187 boolean_t
1188 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
1189 {
1190 	brt_t *brt = spa->spa_brt;
1191 	brt_vdev_t *brtvd;
1192 	brt_entry_t bre_search;
1193 	boolean_t mayexists = FALSE;
1194 	uint64_t vdevid;
1195 
1196 	brt_entry_fill(bp, &bre_search, &vdevid);
1197 
1198 	brt_rlock(brt);
1199 
1200 	brtvd = brt_vdev(brt, vdevid);
1201 	if (brtvd != NULL && brtvd->bv_initiated) {
1202 		if (!avl_is_empty(&brtvd->bv_tree) ||
1203 		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
1204 			mayexists = TRUE;
1205 		}
1206 	}
1207 
1208 	brt_unlock(brt);
1209 
1210 	return (mayexists);
1211 }
1212 
1213 uint64_t
1214 brt_get_dspace(spa_t *spa)
1215 {
1216 	brt_t *brt = spa->spa_brt;
1217 
1218 	if (brt == NULL)
1219 		return (0);
1220 
1221 	return (brt->brt_savedspace);
1222 }
1223 
1224 uint64_t
1225 brt_get_used(spa_t *spa)
1226 {
1227 	brt_t *brt = spa->spa_brt;
1228 
1229 	if (brt == NULL)
1230 		return (0);
1231 
1232 	return (brt->brt_usedspace);
1233 }
1234 
1235 uint64_t
1236 brt_get_saved(spa_t *spa)
1237 {
1238 	brt_t *brt = spa->spa_brt;
1239 
1240 	if (brt == NULL)
1241 		return (0);
1242 
1243 	return (brt->brt_savedspace);
1244 }
1245 
1246 uint64_t
1247 brt_get_ratio(spa_t *spa)
1248 {
1249 	brt_t *brt = spa->spa_brt;
1250 
1251 	if (brt->brt_usedspace == 0)
1252 		return (100);
1253 
1254 	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1255 	    brt->brt_usedspace);
1256 }
1257 
1258 static int
1259 brt_kstats_update(kstat_t *ksp, int rw)
1260 {
1261 	brt_stats_t *bs = ksp->ks_data;
1262 
1263 	if (rw == KSTAT_WRITE)
1264 		return (EACCES);
1265 
1266 	bs->brt_addref_entry_in_memory.value.ui64 =
1267 	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1268 	bs->brt_addref_entry_not_on_disk.value.ui64 =
1269 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1270 	bs->brt_addref_entry_on_disk.value.ui64 =
1271 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1272 	bs->brt_addref_entry_read_lost_race.value.ui64 =
1273 	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1274 	bs->brt_decref_entry_in_memory.value.ui64 =
1275 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1276 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1277 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1278 	bs->brt_decref_entry_not_in_memory.value.ui64 =
1279 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1280 	bs->brt_decref_entry_not_on_disk.value.ui64 =
1281 	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1282 	bs->brt_decref_entry_read_lost_race.value.ui64 =
1283 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1284 	bs->brt_decref_entry_still_referenced.value.ui64 =
1285 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1286 	bs->brt_decref_free_data_later.value.ui64 =
1287 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
1288 	bs->brt_decref_free_data_now.value.ui64 =
1289 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
1290 	bs->brt_decref_no_entry.value.ui64 =
1291 	    wmsum_value(&brt_sums.brt_decref_no_entry);
1292 
1293 	return (0);
1294 }
1295 
1296 static void
1297 brt_stat_init(void)
1298 {
1299 
1300 	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1301 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1302 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1303 	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1304 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1305 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1306 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1307 	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1308 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1309 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1310 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1311 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1312 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1313 
1314 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1315 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1316 	if (brt_ksp != NULL) {
1317 		brt_ksp->ks_data = &brt_stats;
1318 		brt_ksp->ks_update = brt_kstats_update;
1319 		kstat_install(brt_ksp);
1320 	}
1321 }
1322 
1323 static void
1324 brt_stat_fini(void)
1325 {
1326 	if (brt_ksp != NULL) {
1327 		kstat_delete(brt_ksp);
1328 		brt_ksp = NULL;
1329 	}
1330 
1331 	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1332 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1333 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1334 	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1335 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1336 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1337 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1338 	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1339 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1340 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1341 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1342 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1343 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1344 }
1345 
1346 void
1347 brt_init(void)
1348 {
1349 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1350 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1351 	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1352 	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1353 
1354 	brt_stat_init();
1355 }
1356 
1357 void
1358 brt_fini(void)
1359 {
1360 	brt_stat_fini();
1361 
1362 	kmem_cache_destroy(brt_entry_cache);
1363 	kmem_cache_destroy(brt_pending_entry_cache);
1364 }
1365 
1366 static brt_entry_t *
1367 brt_entry_alloc(const brt_entry_t *bre_init)
1368 {
1369 	brt_entry_t *bre;
1370 
1371 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1372 	bre->bre_offset = bre_init->bre_offset;
1373 	bre->bre_refcount = bre_init->bre_refcount;
1374 
1375 	return (bre);
1376 }
1377 
1378 static void
1379 brt_entry_free(brt_entry_t *bre)
1380 {
1381 
1382 	kmem_cache_free(brt_entry_cache, bre);
1383 }
1384 
1385 static void
1386 brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1387 {
1388 	brt_vdev_t *brtvd;
1389 	brt_entry_t *bre, *racebre;
1390 	brt_entry_t bre_search;
1391 	avl_index_t where;
1392 	uint64_t vdevid;
1393 	int error;
1394 
1395 	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1396 
1397 	brt_entry_fill(bp, &bre_search, &vdevid);
1398 
1399 	brt_wlock(brt);
1400 
1401 	brtvd = brt_vdev(brt, vdevid);
1402 	if (brtvd == NULL) {
1403 		ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1404 
1405 		/* New VDEV was added. */
1406 		brt_vdevs_expand(brt, vdevid + 1);
1407 		brtvd = brt_vdev(brt, vdevid);
1408 	}
1409 	ASSERT(brtvd != NULL);
1410 	if (!brtvd->bv_initiated)
1411 		brt_vdev_realloc(brt, brtvd);
1412 
1413 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1414 	if (bre != NULL) {
1415 		BRTSTAT_BUMP(brt_addref_entry_in_memory);
1416 	} else {
1417 		/*
1418 		 * brt_entry_lookup() may drop the BRT (read) lock and
1419 		 * reacquire it (write).
1420 		 */
1421 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1422 		/* bre_search now contains correct bre_refcount */
1423 		ASSERT(error == 0 || error == ENOENT);
1424 		if (error == 0)
1425 			BRTSTAT_BUMP(brt_addref_entry_on_disk);
1426 		else
1427 			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1428 		/*
1429 		 * When the BRT lock was dropped, brt_vdevs[] may have been
1430 		 * expanded and reallocated, we need to update brtvd's pointer.
1431 		 */
1432 		brtvd = brt_vdev(brt, vdevid);
1433 		ASSERT(brtvd != NULL);
1434 
1435 		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1436 		if (racebre == NULL) {
1437 			bre = brt_entry_alloc(&bre_search);
1438 			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1439 			avl_insert(&brtvd->bv_tree, bre, where);
1440 			brt->brt_nentries++;
1441 		} else {
1442 			/*
1443 			 * The entry was added when the BRT lock was dropped in
1444 			 * brt_entry_lookup().
1445 			 */
1446 			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1447 			bre = racebre;
1448 		}
1449 	}
1450 	bre->bre_refcount++;
1451 	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1452 
1453 	brt_unlock(brt);
1454 }
1455 
1456 /* Return TRUE if block should be freed immediately. */
1457 boolean_t
1458 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1459 {
1460 	brt_t *brt = spa->spa_brt;
1461 	brt_vdev_t *brtvd;
1462 	brt_entry_t *bre, *racebre;
1463 	brt_entry_t bre_search;
1464 	avl_index_t where;
1465 	uint64_t vdevid;
1466 	int error;
1467 
1468 	brt_entry_fill(bp, &bre_search, &vdevid);
1469 
1470 	brt_wlock(brt);
1471 
1472 	brtvd = brt_vdev(brt, vdevid);
1473 	ASSERT(brtvd != NULL);
1474 
1475 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1476 	if (bre != NULL) {
1477 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
1478 		goto out;
1479 	} else {
1480 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1481 	}
1482 
1483 	/*
1484 	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1485 	 */
1486 	error = brt_entry_lookup(brt, brtvd, &bre_search);
1487 	/* bre_search now contains correct bre_refcount */
1488 	ASSERT(error == 0 || error == ENOENT);
1489 	/*
1490 	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1491 	 * and reallocated, we need to update brtvd's pointer.
1492 	 */
1493 	brtvd = brt_vdev(brt, vdevid);
1494 	ASSERT(brtvd != NULL);
1495 
1496 	if (error == ENOENT) {
1497 		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1498 		bre = NULL;
1499 		goto out;
1500 	}
1501 
1502 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1503 	if (racebre != NULL) {
1504 		/*
1505 		 * The entry was added when the BRT lock was dropped in
1506 		 * brt_entry_lookup().
1507 		 */
1508 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1509 		bre = racebre;
1510 		goto out;
1511 	}
1512 
1513 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1514 	bre = brt_entry_alloc(&bre_search);
1515 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1516 	avl_insert(&brtvd->bv_tree, bre, where);
1517 	brt->brt_nentries++;
1518 
1519 out:
1520 	if (bre == NULL) {
1521 		/*
1522 		 * This is a free of a regular (not cloned) block.
1523 		 */
1524 		brt_unlock(brt);
1525 		BRTSTAT_BUMP(brt_decref_no_entry);
1526 		return (B_TRUE);
1527 	}
1528 	if (bre->bre_refcount == 0) {
1529 		brt_unlock(brt);
1530 		BRTSTAT_BUMP(brt_decref_free_data_now);
1531 		return (B_TRUE);
1532 	}
1533 
1534 	ASSERT(bre->bre_refcount > 0);
1535 	bre->bre_refcount--;
1536 	if (bre->bre_refcount == 0)
1537 		BRTSTAT_BUMP(brt_decref_free_data_later);
1538 	else
1539 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1540 	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1541 
1542 	brt_unlock(brt);
1543 
1544 	return (B_FALSE);
1545 }
1546 
1547 uint64_t
1548 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1549 {
1550 	brt_t *brt = spa->spa_brt;
1551 	brt_vdev_t *brtvd;
1552 	brt_entry_t bre_search, *bre;
1553 	uint64_t vdevid, refcnt;
1554 	int error;
1555 
1556 	brt_entry_fill(bp, &bre_search, &vdevid);
1557 
1558 	brt_rlock(brt);
1559 
1560 	brtvd = brt_vdev(brt, vdevid);
1561 	ASSERT(brtvd != NULL);
1562 
1563 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1564 	if (bre == NULL) {
1565 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1566 		ASSERT(error == 0 || error == ENOENT);
1567 		if (error == ENOENT)
1568 			refcnt = 0;
1569 		else
1570 			refcnt = bre_search.bre_refcount;
1571 	} else
1572 		refcnt = bre->bre_refcount;
1573 
1574 	brt_unlock(brt);
1575 	return (refcnt);
1576 }
1577 
1578 static void
1579 brt_prefetch(brt_t *brt, const blkptr_t *bp)
1580 {
1581 	brt_entry_t bre;
1582 	uint64_t vdevid;
1583 
1584 	ASSERT(bp != NULL);
1585 
1586 	if (!zfs_brt_prefetch)
1587 		return;
1588 
1589 	brt_entry_fill(bp, &bre, &vdevid);
1590 
1591 	brt_entry_prefetch(brt, vdevid, &bre);
1592 }
1593 
1594 static int
1595 brt_pending_entry_compare(const void *x1, const void *x2)
1596 {
1597 	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1598 	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1599 	int cmp;
1600 
1601 	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
1602 	if (cmp == 0) {
1603 		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1604 		    DVA_GET_VDEV(&bp2->blk_dva[0]));
1605 		if (cmp == 0) {
1606 			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1607 			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
1608 		}
1609 	}
1610 
1611 	return (cmp);
1612 }
1613 
1614 void
1615 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1616 {
1617 	brt_t *brt;
1618 	avl_tree_t *pending_tree;
1619 	kmutex_t *pending_lock;
1620 	brt_pending_entry_t *bpe, *newbpe;
1621 	avl_index_t where;
1622 	uint64_t txg;
1623 
1624 	brt = spa->spa_brt;
1625 	txg = dmu_tx_get_txg(tx);
1626 	ASSERT3U(txg, !=, 0);
1627 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1628 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1629 
1630 	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1631 	newbpe->bpe_bp = *bp;
1632 	newbpe->bpe_count = 1;
1633 
1634 	mutex_enter(pending_lock);
1635 
1636 	bpe = avl_find(pending_tree, newbpe, &where);
1637 	if (bpe == NULL) {
1638 		avl_insert(pending_tree, newbpe, where);
1639 		newbpe = NULL;
1640 	} else {
1641 		bpe->bpe_count++;
1642 	}
1643 
1644 	mutex_exit(pending_lock);
1645 
1646 	if (newbpe != NULL) {
1647 		ASSERT(bpe != NULL);
1648 		ASSERT(bpe != newbpe);
1649 		kmem_cache_free(brt_pending_entry_cache, newbpe);
1650 	} else {
1651 		ASSERT(bpe == NULL);
1652 	}
1653 
1654 	/* Prefetch BRT entry, as we will need it in the syncing context. */
1655 	brt_prefetch(brt, bp);
1656 }
1657 
1658 void
1659 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1660 {
1661 	brt_t *brt;
1662 	avl_tree_t *pending_tree;
1663 	kmutex_t *pending_lock;
1664 	brt_pending_entry_t *bpe, bpe_search;
1665 	uint64_t txg;
1666 
1667 	brt = spa->spa_brt;
1668 	txg = dmu_tx_get_txg(tx);
1669 	ASSERT3U(txg, !=, 0);
1670 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1671 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1672 
1673 	bpe_search.bpe_bp = *bp;
1674 
1675 	mutex_enter(pending_lock);
1676 
1677 	bpe = avl_find(pending_tree, &bpe_search, NULL);
1678 	/* I believe we should always find bpe when this function is called. */
1679 	if (bpe != NULL) {
1680 		ASSERT(bpe->bpe_count > 0);
1681 
1682 		bpe->bpe_count--;
1683 		if (bpe->bpe_count == 0) {
1684 			avl_remove(pending_tree, bpe);
1685 			kmem_cache_free(brt_pending_entry_cache, bpe);
1686 		}
1687 	}
1688 
1689 	mutex_exit(pending_lock);
1690 }
1691 
1692 void
1693 brt_pending_apply(spa_t *spa, uint64_t txg)
1694 {
1695 	brt_t *brt;
1696 	brt_pending_entry_t *bpe;
1697 	avl_tree_t *pending_tree;
1698 	kmutex_t *pending_lock;
1699 	void *c;
1700 
1701 	ASSERT3U(txg, !=, 0);
1702 
1703 	brt = spa->spa_brt;
1704 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1705 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1706 
1707 	mutex_enter(pending_lock);
1708 
1709 	c = NULL;
1710 	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1711 		boolean_t added_to_ddt;
1712 
1713 		mutex_exit(pending_lock);
1714 
1715 		for (int i = 0; i < bpe->bpe_count; i++) {
1716 			/*
1717 			 * If the block has DEDUP bit set, it means that it
1718 			 * already exists in the DEDUP table, so we can just
1719 			 * use that instead of creating new entry in
1720 			 * the BRT table.
1721 			 */
1722 			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1723 				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1724 			} else {
1725 				added_to_ddt = B_FALSE;
1726 			}
1727 			if (!added_to_ddt)
1728 				brt_entry_addref(brt, &bpe->bpe_bp);
1729 		}
1730 
1731 		kmem_cache_free(brt_pending_entry_cache, bpe);
1732 		mutex_enter(pending_lock);
1733 	}
1734 
1735 	mutex_exit(pending_lock);
1736 }
1737 
1738 static void
1739 brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1740 {
1741 
1742 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1743 	ASSERT(brtvd->bv_mos_entries != 0);
1744 
1745 	if (bre->bre_refcount == 0) {
1746 		int error;
1747 
1748 		error = brt_entry_remove(brt, brtvd, bre, tx);
1749 		ASSERT(error == 0 || error == ENOENT);
1750 		/*
1751 		 * If error == ENOENT then zfs_clone_range() was done from a
1752 		 * removed (but opened) file (open(), unlink()).
1753 		 */
1754 		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
1755 	} else {
1756 		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
1757 	}
1758 }
1759 
1760 static void
1761 brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1762 {
1763 	brt_vdev_t *brtvd;
1764 	brt_entry_t *bre;
1765 	uint64_t vdevid;
1766 	void *c;
1767 
1768 	brt_wlock(brt);
1769 
1770 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1771 		brtvd = &brt->brt_vdevs[vdevid];
1772 
1773 		if (!brtvd->bv_initiated)
1774 			continue;
1775 
1776 		if (!brtvd->bv_meta_dirty) {
1777 			ASSERT(!brtvd->bv_entcount_dirty);
1778 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1779 			continue;
1780 		}
1781 
1782 		ASSERT(!brtvd->bv_entcount_dirty ||
1783 		    avl_numnodes(&brtvd->bv_tree) != 0);
1784 
1785 		if (brtvd->bv_mos_brtvdev == 0)
1786 			brt_vdev_create(brt, brtvd, tx);
1787 
1788 		c = NULL;
1789 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1790 			brt_sync_entry(brt, brtvd, bre, tx);
1791 			brt_entry_free(bre);
1792 			ASSERT(brt->brt_nentries > 0);
1793 			brt->brt_nentries--;
1794 		}
1795 
1796 		brt_vdev_sync(brt, brtvd, tx);
1797 
1798 		if (brtvd->bv_totalcount == 0)
1799 			brt_vdev_destroy(brt, brtvd, tx);
1800 	}
1801 
1802 	ASSERT0(brt->brt_nentries);
1803 
1804 	brt_unlock(brt);
1805 }
1806 
1807 void
1808 brt_sync(spa_t *spa, uint64_t txg)
1809 {
1810 	dmu_tx_t *tx;
1811 	brt_t *brt;
1812 
1813 	ASSERT(spa_syncing_txg(spa) == txg);
1814 
1815 	brt = spa->spa_brt;
1816 	brt_rlock(brt);
1817 	if (brt->brt_nentries == 0) {
1818 		/* No changes. */
1819 		brt_unlock(brt);
1820 		return;
1821 	}
1822 	brt_unlock(brt);
1823 
1824 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1825 
1826 	brt_sync_table(brt, tx);
1827 
1828 	dmu_tx_commit(tx);
1829 }
1830 
1831 static void
1832 brt_table_alloc(brt_t *brt)
1833 {
1834 
1835 	for (int i = 0; i < TXG_SIZE; i++) {
1836 		avl_create(&brt->brt_pending_tree[i],
1837 		    brt_pending_entry_compare,
1838 		    sizeof (brt_pending_entry_t),
1839 		    offsetof(brt_pending_entry_t, bpe_node));
1840 		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1841 		    NULL);
1842 	}
1843 }
1844 
1845 static void
1846 brt_table_free(brt_t *brt)
1847 {
1848 
1849 	for (int i = 0; i < TXG_SIZE; i++) {
1850 		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1851 
1852 		avl_destroy(&brt->brt_pending_tree[i]);
1853 		mutex_destroy(&brt->brt_pending_lock[i]);
1854 	}
1855 }
1856 
1857 static void
1858 brt_alloc(spa_t *spa)
1859 {
1860 	brt_t *brt;
1861 
1862 	ASSERT(spa->spa_brt == NULL);
1863 
1864 	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1865 	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1866 	brt->brt_spa = spa;
1867 	brt->brt_rangesize = 0;
1868 	brt->brt_nentries = 0;
1869 	brt->brt_vdevs = NULL;
1870 	brt->brt_nvdevs = 0;
1871 	brt_table_alloc(brt);
1872 
1873 	spa->spa_brt = brt;
1874 }
1875 
1876 void
1877 brt_create(spa_t *spa)
1878 {
1879 
1880 	brt_alloc(spa);
1881 	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1882 }
1883 
1884 int
1885 brt_load(spa_t *spa)
1886 {
1887 
1888 	brt_alloc(spa);
1889 	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1890 
1891 	return (0);
1892 }
1893 
1894 void
1895 brt_unload(spa_t *spa)
1896 {
1897 	brt_t *brt = spa->spa_brt;
1898 
1899 	if (brt == NULL)
1900 		return;
1901 
1902 	brt_vdevs_free(brt);
1903 	brt_table_free(brt);
1904 	rw_destroy(&brt->brt_lock);
1905 	kmem_free(brt, sizeof (*brt));
1906 	spa->spa_brt = NULL;
1907 }
1908 
1909 /* BEGIN CSTYLED */
1910 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
1911     "Enable prefetching of BRT entries");
1912 #ifdef ZFS_BRT_DEBUG
1913 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
1914 #endif
1915 /* END CSTYLED */
1916