xref: /freebsd/sys/contrib/openzfs/module/zfs/brt.c (revision 069ac18495ad8fde2748bc94b0f80a50250bb01d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42 
43 /*
44  * Block Cloning design.
45  *
46  * Block Cloning allows to manually clone a file (or a subset of its blocks)
47  * into another (or the same) file by just creating additional references to
48  * the data blocks without copying the data itself. Those references are kept
49  * in the Block Reference Tables (BRTs).
50  *
51  * In many ways this is similar to the existing deduplication, but there are
52  * some important differences:
53  *
54  * - Deduplication is automatic and Block Cloning is not - one has to use a
55  *   dedicated system call(s) to clone the given file/blocks.
56  * - Deduplication keeps all data blocks in its table, even those referenced
57  *   just once. Block Cloning creates an entry in its tables only when there
58  *   are at least two references to the given data block. If the block was
59  *   never explicitly cloned or the second to last reference was dropped,
60  *   there will be neither space nor performance overhead.
61  * - Deduplication needs data to work - one needs to pass real data to the
62  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63  *   data, just block pointers to the data, so it is extremely fast, as we pay
64  *   neither the cost of reading the data, nor the cost of writing the data -
65  *   we operate exclusively on metadata.
66  * - If the D (dedup) bit is not set in the block pointer, it means that
67  *   the block is not in the dedup table (DDT) and we won't consult the DDT
68  *   when we need to free the block. Block Cloning must be consulted on every
69  *   free, because we cannot modify the source BP (eg. by setting something
70  *   similar to the D bit), thus we have no hint if the block is in the
71  *   Block Reference Table (BRT), so we need to look into the BRT. There is
72  *   an optimization in place that allows us to eliminate the majority of BRT
73  *   lookups which is described below in the "Minimizing free penalty" section.
74  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75  *   64bit offset and 64bit reference counter.
76  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77  *   other on disk are most likely in totally different parts of the DDT.
78  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79  *   from one file should have BRT entries close to each other.
80  * - Scrub will only do a single pass over a block that is referenced multiple
81  *   times in the DDT. Unfortunately it is not currently (if at all) possible
82  *   with Block Cloning and block referenced multiple times will be scrubbed
83  *   multiple times. The new, sorted scrub should be able to eliminate
84  *   duplicated reads given enough memory.
85  * - Deduplication requires cryptographically strong hash as a checksum or
86  *   additional data verification. Block Cloning works with any checksum
87  *   algorithm or even with checksumming disabled.
88  *
89  * As mentioned above, the BRT entries are much smaller than the DDT entries.
90  * To uniquely identify a block we just need its vdev id and offset. We also
91  * need to maintain a reference counter. The vdev id will often repeat, as there
92  * is a small number of top-level VDEVs and a large number of blocks stored in
93  * each VDEV. We take advantage of that to reduce the BRT entry size further by
94  * maintaining one BRT for each top-level VDEV, so we can then have only offset
95  * and counter as the BRT entry.
96  *
97  * Minimizing free penalty.
98  *
99  * Block Cloning allows creating additional references to any existing block.
100  * When we free a block there is no hint in the block pointer whether the block
101  * was cloned or not, so on each free we have to check if there is a
102  * corresponding entry in the BRT or not. If there is, we need to decrease
103  * the reference counter. Doing BRT lookup on every free can potentially be
104  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105  * This is the main problem with deduplication, so we've learned our lesson and
106  * try not to repeat the same mistake here. How do we do that? We divide each
107  * top-level VDEV into 16MB regions. For each region we maintain a counter that
108  * is a sum of all the BRT entries that have offsets within the region. This
109  * creates the entries count array of 16bit numbers for each top-level VDEV.
110  * The entries count array is always kept in memory and updated on disk in the
111  * same transaction group as the BRT updates to keep everything in-sync. We can
112  * keep the array in memory, because it is very small. With 16MB regions and
113  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114  * the region size even further in the future). Now, when we want to free
115  * a block, we first consult the array. If the counter for the whole region is
116  * zero, there is no need to look for the BRT entry, as there isn't one for
117  * sure. If the counter for the region is greater than zero, only then we will
118  * do a BRT lookup and if an entry is found we will decrease the reference
119  * counter in the BRT entry and in the entry counters array.
120  *
121  * The entry counters array is small, but can potentially be larger for very
122  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123  * array on every change. We then divide the array into 32kB block and keep
124  * a bitmap of dirty blocks within a transaction group. When we sync the
125  * transaction group we can only update the parts of the entry counters array
126  * that were modified. Note: Keeping track of the dirty parts of the entry
127  * counters array is implemented, but updating only parts of the array on disk
128  * is not yet implemented - for now we will update entire array if there was
129  * any change.
130  *
131  * The implementation tries to be economic: if BRT is not used, or no longer
132  * used, there will be no entries in the MOS and no additional memory used (eg.
133  * the entry counters array is only allocated if needed).
134  *
135  * Interaction between Deduplication and Block Cloning.
136  *
137  * If both functionalities are in use, we could end up with a block that is
138  * referenced multiple times in both DDT and BRT. When we free one of the
139  * references we couldn't tell where it belongs, so we would have to decide
140  * what table takes the precedence: do we first clear DDT references or BRT
141  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143  * lookup DDT entry instead and increase the counter there. No BRT entry
144  * will be created for a block which has the D (dedup) bit set.
145  * BRT may be more efficient for manual deduplication, but if the block is
146  * already in the DDT, then creating additional BRT entry would be less
147  * efficient. This clever idea was proposed by Allan Jude.
148  *
149  * Block Cloning across datasets.
150  *
151  * Block Cloning is not limited to cloning blocks within the same dataset.
152  * It is possible (and very useful) to clone blocks between different datasets.
153  * One use case is recovering files from snapshots. By cloning the files into
154  * dataset we need no additional storage. Without Block Cloning we would need
155  * additional space for those files.
156  * Another interesting use case is moving the files between datasets
157  * (copying the file content to the new dataset and removing the source file).
158  * In that case Block Cloning will only be used briefly, because the BRT entries
159  * will be removed when the source is removed.
160  * Block Cloning across encrypted datasets is supported as long as both
161  * datasets share the same master key (e.g. snapshots and clones)
162  *
163  * Block Cloning flow through ZFS layers.
164  *
165  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
166  * blocks. As of this writing no interface is implemented that allows for block
167  * cloning within a ZVOL.
168  * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
169  * for blocking cloning.
170  *
171  *	ssize_t
172  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
173  *	                size_t len, unsigned int flags);
174  *
175  * Even though offsets and length represent bytes, they have to be
176  * block-aligned or we will return an error so the upper layer can
177  * fallback to the generic mechanism that will just copy the data.
178  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
179  * This function was implemented based on zfs_write(), but instead of writing
180  * the given data we first read block pointers using the new dmu_read_l0_bps()
181  * function from the source file. Once we have BPs from the source file we call
182  * the dmu_brt_clone() function on the destination file. This function
183  * allocates BPs for us. We iterate over all source BPs. If the given BP is
184  * a hole or an embedded block, we just copy BP as-is. If it points to a real
185  * data we place this BP on a BRT pending list using the brt_pending_add()
186  * function.
187  *
188  * We use this pending list to keep track of all BPs that got new references
189  * within this transaction group.
190  *
191  * Some special cases to consider and how we address them:
192  * - The block we want to clone may have been created within the same
193  *   transaction group that we are trying to clone. Such block has no BP
194  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
195  * - The block we want to clone may have been modified within the same
196  *   transaction group. We return EAGAIN.
197  * - A block may be cloned multiple times during one transaction group (that's
198  *   why pending list is actually a tree and not an append-only list - this
199  *   way we can figure out faster if this block is cloned for the first time
200  *   in this txg or consecutive time).
201  * - A block may be cloned and freed within the same transaction group
202  *   (see dbuf_undirty()).
203  * - A block may be cloned and within the same transaction group the clone
204  *   can be cloned again (see dmu_read_l0_bps()).
205  * - A file might have been deleted, but the caller still has a file descriptor
206  *   open to this file and clones it.
207  *
208  * When we free a block we have an additional step in the ZIO pipeline where we
209  * call the zio_brt_free() function. We then call the brt_entry_decref()
210  * that loads the corresponding BRT entry (if one exists) and decreases
211  * reference counter. If this is not the last reference we will stop ZIO
212  * pipeline here. If this is the last reference or the block is not in the
213  * BRT, we continue the pipeline and free the block as usual.
214  *
215  * At the beginning of spa_sync() where there can be no more block cloning,
216  * but before issuing frees we call brt_pending_apply(). This function applies
217  * all the new clones to the BRT table - we load BRT entries and update
218  * reference counters. To sync new BRT entries to disk, we use brt_sync()
219  * function. This function will sync all dirty per-top-level-vdev BRTs,
220  * the entry counters arrays, etc.
221  *
222  * Block Cloning and ZIL.
223  *
224  * Every clone operation is divided into chunks (similar to write) and each
225  * chunk is cloned in a separate transaction. The chunk size is determined by
226  * how many BPs we can fit into a single ZIL entry.
227  * Replaying clone operation is different from the regular clone operation,
228  * as when we log clone operations we cannot use the source object - it may
229  * reside on a different dataset, so we log BPs we want to clone.
230  * The ZIL is replayed when we mount the given dataset, not when the pool is
231  * imported. Taking this into account it is possible that the pool is imported
232  * without mounting datasets and the source dataset is destroyed before the
233  * destination dataset is mounted and its ZIL replayed.
234  * To address this situation we leverage zil_claim() mechanism where ZFS will
235  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236  * entries, we will bump reference counters for their BPs in the BRT.  Then
237  * on mount and ZIL replay we bump the reference counters once more, while the
238  * first references are dropped during ZIL destroy by zil_free_clone_range().
239  * It is possible that after zil_claim() we never mount the destination, so
240  * we never replay its ZIL and just destroy it.  In this case the only taken
241  * references will be dropped by zil_free_clone_range(), since the cloning is
242  * not going to ever take place.
243  */
244 
245 static kmem_cache_t *brt_entry_cache;
246 static kmem_cache_t *brt_pending_entry_cache;
247 
248 /*
249  * Enable/disable prefetching of BRT entries that we are going to modify.
250  */
251 int zfs_brt_prefetch = 1;
252 
253 #ifdef ZFS_DEBUG
254 #define	BRT_DEBUG(...)	do {						\
255 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
256 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
257 	}								\
258 } while (0)
259 #else
260 #define	BRT_DEBUG(...)	do { } while (0)
261 #endif
262 
263 int brt_zap_leaf_blockshift = 12;
264 int brt_zap_indirect_blockshift = 12;
265 
266 static kstat_t	*brt_ksp;
267 
268 typedef struct brt_stats {
269 	kstat_named_t brt_addref_entry_in_memory;
270 	kstat_named_t brt_addref_entry_not_on_disk;
271 	kstat_named_t brt_addref_entry_on_disk;
272 	kstat_named_t brt_addref_entry_read_lost_race;
273 	kstat_named_t brt_decref_entry_in_memory;
274 	kstat_named_t brt_decref_entry_loaded_from_disk;
275 	kstat_named_t brt_decref_entry_not_in_memory;
276 	kstat_named_t brt_decref_entry_not_on_disk;
277 	kstat_named_t brt_decref_entry_read_lost_race;
278 	kstat_named_t brt_decref_entry_still_referenced;
279 	kstat_named_t brt_decref_free_data_later;
280 	kstat_named_t brt_decref_free_data_now;
281 	kstat_named_t brt_decref_no_entry;
282 } brt_stats_t;
283 
284 static brt_stats_t brt_stats = {
285 	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
286 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
287 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
288 	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
289 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
290 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
291 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
292 	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
293 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
294 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
295 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
296 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
297 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
298 };
299 
300 struct {
301 	wmsum_t brt_addref_entry_in_memory;
302 	wmsum_t brt_addref_entry_not_on_disk;
303 	wmsum_t brt_addref_entry_on_disk;
304 	wmsum_t brt_addref_entry_read_lost_race;
305 	wmsum_t brt_decref_entry_in_memory;
306 	wmsum_t brt_decref_entry_loaded_from_disk;
307 	wmsum_t brt_decref_entry_not_in_memory;
308 	wmsum_t brt_decref_entry_not_on_disk;
309 	wmsum_t brt_decref_entry_read_lost_race;
310 	wmsum_t brt_decref_entry_still_referenced;
311 	wmsum_t brt_decref_free_data_later;
312 	wmsum_t brt_decref_free_data_now;
313 	wmsum_t brt_decref_no_entry;
314 } brt_sums;
315 
316 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
317 
318 static int brt_entry_compare(const void *x1, const void *x2);
319 static int brt_pending_entry_compare(const void *x1, const void *x2);
320 
321 static void
322 brt_rlock(brt_t *brt)
323 {
324 	rw_enter(&brt->brt_lock, RW_READER);
325 }
326 
327 static void
328 brt_wlock(brt_t *brt)
329 {
330 	rw_enter(&brt->brt_lock, RW_WRITER);
331 }
332 
333 static void
334 brt_unlock(brt_t *brt)
335 {
336 	rw_exit(&brt->brt_lock);
337 }
338 
339 static uint16_t
340 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
341 {
342 
343 	ASSERT3U(idx, <, brtvd->bv_size);
344 
345 	if (unlikely(brtvd->bv_need_byteswap)) {
346 		return (BSWAP_16(brtvd->bv_entcount[idx]));
347 	} else {
348 		return (brtvd->bv_entcount[idx]);
349 	}
350 }
351 
352 static void
353 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
354 {
355 
356 	ASSERT3U(idx, <, brtvd->bv_size);
357 
358 	if (unlikely(brtvd->bv_need_byteswap)) {
359 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
360 	} else {
361 		brtvd->bv_entcount[idx] = entcnt;
362 	}
363 }
364 
365 static void
366 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
367 {
368 	uint16_t entcnt;
369 
370 	ASSERT3U(idx, <, brtvd->bv_size);
371 
372 	entcnt = brt_vdev_entcount_get(brtvd, idx);
373 	ASSERT(entcnt < UINT16_MAX);
374 
375 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
376 }
377 
378 static void
379 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
380 {
381 	uint16_t entcnt;
382 
383 	ASSERT3U(idx, <, brtvd->bv_size);
384 
385 	entcnt = brt_vdev_entcount_get(brtvd, idx);
386 	ASSERT(entcnt > 0);
387 
388 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
389 }
390 
391 #ifdef ZFS_DEBUG
392 static void
393 brt_vdev_dump(brt_vdev_t *brtvd)
394 {
395 	uint64_t idx;
396 
397 	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
398 	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
399 	    (u_longlong_t)brtvd->bv_vdevid,
400 	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
401 	    (u_longlong_t)brtvd->bv_size,
402 	    (u_longlong_t)brtvd->bv_totalcount,
403 	    (u_longlong_t)brtvd->bv_nblocks,
404 	    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
405 	if (brtvd->bv_totalcount > 0) {
406 		zfs_dbgmsg("    entcounts:");
407 		for (idx = 0; idx < brtvd->bv_size; idx++) {
408 			uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
409 			if (entcnt > 0) {
410 				zfs_dbgmsg("      [%04llu] %hu",
411 				    (u_longlong_t)idx, entcnt);
412 			}
413 		}
414 	}
415 	if (brtvd->bv_entcount_dirty) {
416 		char *bitmap;
417 
418 		bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
419 		for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
420 			bitmap[idx] =
421 			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
422 		}
423 		bitmap[idx] = '\0';
424 		zfs_dbgmsg("    dirty: %s", bitmap);
425 		kmem_free(bitmap, brtvd->bv_nblocks + 1);
426 	}
427 }
428 #endif
429 
430 static brt_vdev_t *
431 brt_vdev(brt_t *brt, uint64_t vdevid)
432 {
433 	brt_vdev_t *brtvd;
434 
435 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
436 
437 	if (vdevid < brt->brt_nvdevs) {
438 		brtvd = &brt->brt_vdevs[vdevid];
439 	} else {
440 		brtvd = NULL;
441 	}
442 
443 	return (brtvd);
444 }
445 
446 static void
447 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
448 {
449 	char name[64];
450 
451 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
452 	ASSERT0(brtvd->bv_mos_brtvdev);
453 	ASSERT0(brtvd->bv_mos_entries);
454 	ASSERT(brtvd->bv_entcount != NULL);
455 	ASSERT(brtvd->bv_size > 0);
456 	ASSERT(brtvd->bv_bitmap != NULL);
457 	ASSERT(brtvd->bv_nblocks > 0);
458 
459 	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
460 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
461 	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
462 	    0, tx);
463 	VERIFY(brtvd->bv_mos_entries != 0);
464 	BRT_DEBUG("MOS entries created, object=%llu",
465 	    (u_longlong_t)brtvd->bv_mos_entries);
466 
467 	/*
468 	 * We allocate DMU buffer to store the bv_entcount[] array.
469 	 * We will keep array size (bv_size) and cummulative count for all
470 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
471 	 */
472 	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
473 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
474 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
475 	VERIFY(brtvd->bv_mos_brtvdev != 0);
476 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
477 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
478 
479 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
480 	    (u_longlong_t)brtvd->bv_vdevid);
481 	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
482 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
483 	BRT_DEBUG("Pool directory object created, object=%s", name);
484 
485 	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
486 }
487 
488 static void
489 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
490 {
491 	vdev_t *vd;
492 	uint16_t *entcount;
493 	ulong_t *bitmap;
494 	uint64_t nblocks, size;
495 
496 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
497 
498 	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
499 	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
500 	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
501 	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
502 
503 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
504 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
505 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
506 
507 	if (!brtvd->bv_initiated) {
508 		ASSERT0(brtvd->bv_size);
509 		ASSERT(brtvd->bv_entcount == NULL);
510 		ASSERT(brtvd->bv_bitmap == NULL);
511 		ASSERT0(brtvd->bv_nblocks);
512 
513 		avl_create(&brtvd->bv_tree, brt_entry_compare,
514 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
515 	} else {
516 		ASSERT(brtvd->bv_size > 0);
517 		ASSERT(brtvd->bv_entcount != NULL);
518 		ASSERT(brtvd->bv_bitmap != NULL);
519 		ASSERT(brtvd->bv_nblocks > 0);
520 		/*
521 		 * TODO: Allow vdev shrinking. We only need to implement
522 		 * shrinking the on-disk BRT VDEV object.
523 		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
524 		 *     size, tx);
525 		 */
526 		ASSERT3U(brtvd->bv_size, <=, size);
527 
528 		memcpy(entcount, brtvd->bv_entcount,
529 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
530 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
531 		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
532 		vmem_free(brtvd->bv_entcount,
533 		    sizeof (entcount[0]) * brtvd->bv_size);
534 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
535 	}
536 
537 	brtvd->bv_size = size;
538 	brtvd->bv_entcount = entcount;
539 	brtvd->bv_bitmap = bitmap;
540 	brtvd->bv_nblocks = nblocks;
541 	if (!brtvd->bv_initiated) {
542 		brtvd->bv_need_byteswap = FALSE;
543 		brtvd->bv_initiated = TRUE;
544 		BRT_DEBUG("BRT VDEV %llu initiated.",
545 		    (u_longlong_t)brtvd->bv_vdevid);
546 	}
547 }
548 
549 static void
550 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
551 {
552 	char name[64];
553 	dmu_buf_t *db;
554 	brt_vdev_phys_t *bvphys;
555 	int error;
556 
557 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
558 	    (u_longlong_t)brtvd->bv_vdevid);
559 	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
560 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
561 	if (error != 0)
562 		return;
563 	ASSERT(brtvd->bv_mos_brtvdev != 0);
564 
565 	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
566 	ASSERT0(error);
567 	if (error != 0)
568 		return;
569 
570 	bvphys = db->db_data;
571 	if (brt->brt_rangesize == 0) {
572 		brt->brt_rangesize = bvphys->bvp_rangesize;
573 	} else {
574 		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
575 	}
576 
577 	ASSERT(!brtvd->bv_initiated);
578 	brt_vdev_realloc(brt, brtvd);
579 
580 	/* TODO: We don't support VDEV shrinking. */
581 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
582 
583 	/*
584 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
585 	 */
586 	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
587 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
588 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
589 	ASSERT0(error);
590 
591 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
592 	ASSERT(brtvd->bv_mos_entries != 0);
593 	brtvd->bv_need_byteswap =
594 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
595 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
596 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
597 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
598 	brt->brt_usedspace += brtvd->bv_usedspace;
599 	brt->brt_savedspace += brtvd->bv_savedspace;
600 
601 	dmu_buf_rele(db, FTAG);
602 
603 	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
604 	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
605 	    (u_longlong_t)brtvd->bv_mos_entries);
606 }
607 
608 static void
609 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
610 {
611 
612 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
613 	ASSERT(brtvd->bv_initiated);
614 
615 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
616 	brtvd->bv_entcount = NULL;
617 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
618 	brtvd->bv_bitmap = NULL;
619 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
620 	avl_destroy(&brtvd->bv_tree);
621 
622 	brtvd->bv_size = 0;
623 	brtvd->bv_nblocks = 0;
624 
625 	brtvd->bv_initiated = FALSE;
626 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
627 }
628 
629 static void
630 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
631 {
632 	char name[64];
633 	uint64_t count;
634 	dmu_buf_t *db;
635 	brt_vdev_phys_t *bvphys;
636 
637 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
638 	ASSERT(brtvd->bv_mos_brtvdev != 0);
639 	ASSERT(brtvd->bv_mos_entries != 0);
640 
641 	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
642 	VERIFY0(count);
643 	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
644 	BRT_DEBUG("MOS entries destroyed, object=%llu",
645 	    (u_longlong_t)brtvd->bv_mos_entries);
646 	brtvd->bv_mos_entries = 0;
647 
648 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
649 	bvphys = db->db_data;
650 	ASSERT0(bvphys->bvp_totalcount);
651 	ASSERT0(bvphys->bvp_usedspace);
652 	ASSERT0(bvphys->bvp_savedspace);
653 	dmu_buf_rele(db, FTAG);
654 
655 	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
656 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
657 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
658 	brtvd->bv_mos_brtvdev = 0;
659 
660 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
661 	    (u_longlong_t)brtvd->bv_vdevid);
662 	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
663 	BRT_DEBUG("Pool directory object removed, object=%s", name);
664 
665 	brt_vdev_dealloc(brt, brtvd);
666 
667 	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
668 }
669 
670 static void
671 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
672 {
673 	brt_vdev_t *brtvd, *vdevs;
674 	uint64_t vdevid;
675 
676 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
677 	ASSERT3U(nvdevs, >, brt->brt_nvdevs);
678 
679 	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
680 	if (brt->brt_nvdevs > 0) {
681 		ASSERT(brt->brt_vdevs != NULL);
682 
683 		memcpy(vdevs, brt->brt_vdevs,
684 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
685 		kmem_free(brt->brt_vdevs,
686 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
687 	}
688 	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
689 		brtvd = &vdevs[vdevid];
690 
691 		brtvd->bv_vdevid = vdevid;
692 		brtvd->bv_initiated = FALSE;
693 	}
694 
695 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
696 	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
697 
698 	brt->brt_vdevs = vdevs;
699 	brt->brt_nvdevs = nvdevs;
700 }
701 
702 static boolean_t
703 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
704 {
705 	uint64_t idx;
706 
707 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
708 
709 	idx = bre->bre_offset / brt->brt_rangesize;
710 	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
711 		/* VDEV wasn't expanded. */
712 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
713 	}
714 
715 	return (FALSE);
716 }
717 
718 static void
719 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
720     uint64_t dsize)
721 {
722 	uint64_t idx;
723 
724 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
725 	ASSERT(brtvd != NULL);
726 	ASSERT(brtvd->bv_entcount != NULL);
727 
728 	brt->brt_savedspace += dsize;
729 	brtvd->bv_savedspace += dsize;
730 	brtvd->bv_meta_dirty = TRUE;
731 
732 	if (bre->bre_refcount > 1) {
733 		return;
734 	}
735 
736 	brt->brt_usedspace += dsize;
737 	brtvd->bv_usedspace += dsize;
738 
739 	idx = bre->bre_offset / brt->brt_rangesize;
740 	if (idx >= brtvd->bv_size) {
741 		/* VDEV has been expanded. */
742 		brt_vdev_realloc(brt, brtvd);
743 	}
744 
745 	ASSERT3U(idx, <, brtvd->bv_size);
746 
747 	brtvd->bv_totalcount++;
748 	brt_vdev_entcount_inc(brtvd, idx);
749 	brtvd->bv_entcount_dirty = TRUE;
750 	idx = idx / BRT_BLOCKSIZE / 8;
751 	BT_SET(brtvd->bv_bitmap, idx);
752 
753 #ifdef ZFS_DEBUG
754 	if (zfs_flags & ZFS_DEBUG_BRT)
755 		brt_vdev_dump(brtvd);
756 #endif
757 }
758 
759 static void
760 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
761     uint64_t dsize)
762 {
763 	uint64_t idx;
764 
765 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
766 	ASSERT(brtvd != NULL);
767 	ASSERT(brtvd->bv_entcount != NULL);
768 
769 	brt->brt_savedspace -= dsize;
770 	brtvd->bv_savedspace -= dsize;
771 	brtvd->bv_meta_dirty = TRUE;
772 
773 	if (bre->bre_refcount > 0) {
774 		return;
775 	}
776 
777 	brt->brt_usedspace -= dsize;
778 	brtvd->bv_usedspace -= dsize;
779 
780 	idx = bre->bre_offset / brt->brt_rangesize;
781 	ASSERT3U(idx, <, brtvd->bv_size);
782 
783 	ASSERT(brtvd->bv_totalcount > 0);
784 	brtvd->bv_totalcount--;
785 	brt_vdev_entcount_dec(brtvd, idx);
786 	brtvd->bv_entcount_dirty = TRUE;
787 	idx = idx / BRT_BLOCKSIZE / 8;
788 	BT_SET(brtvd->bv_bitmap, idx);
789 
790 #ifdef ZFS_DEBUG
791 	if (zfs_flags & ZFS_DEBUG_BRT)
792 		brt_vdev_dump(brtvd);
793 #endif
794 }
795 
796 static void
797 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
798 {
799 	dmu_buf_t *db;
800 	brt_vdev_phys_t *bvphys;
801 
802 	ASSERT(brtvd->bv_meta_dirty);
803 	ASSERT(brtvd->bv_mos_brtvdev != 0);
804 	ASSERT(dmu_tx_is_syncing(tx));
805 
806 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
807 
808 	if (brtvd->bv_entcount_dirty) {
809 		/*
810 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
811 		 */
812 		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
813 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
814 		    brtvd->bv_entcount, tx);
815 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
816 		brtvd->bv_entcount_dirty = FALSE;
817 	}
818 
819 	dmu_buf_will_dirty(db, tx);
820 	bvphys = db->db_data;
821 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
822 	bvphys->bvp_size = brtvd->bv_size;
823 	if (brtvd->bv_need_byteswap) {
824 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
825 	} else {
826 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
827 	}
828 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
829 	bvphys->bvp_rangesize = brt->brt_rangesize;
830 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
831 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
832 	dmu_buf_rele(db, FTAG);
833 
834 	brtvd->bv_meta_dirty = FALSE;
835 }
836 
837 static void
838 brt_vdevs_alloc(brt_t *brt, boolean_t load)
839 {
840 	brt_vdev_t *brtvd;
841 	uint64_t vdevid;
842 
843 	brt_wlock(brt);
844 
845 	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
846 
847 	if (load) {
848 		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
849 			brtvd = &brt->brt_vdevs[vdevid];
850 			ASSERT(brtvd->bv_entcount == NULL);
851 
852 			brt_vdev_load(brt, brtvd);
853 		}
854 	}
855 
856 	if (brt->brt_rangesize == 0) {
857 		brt->brt_rangesize = BRT_RANGESIZE;
858 	}
859 
860 	brt_unlock(brt);
861 }
862 
863 static void
864 brt_vdevs_free(brt_t *brt)
865 {
866 	brt_vdev_t *brtvd;
867 	uint64_t vdevid;
868 
869 	brt_wlock(brt);
870 
871 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
872 		brtvd = &brt->brt_vdevs[vdevid];
873 		if (brtvd->bv_initiated)
874 			brt_vdev_dealloc(brt, brtvd);
875 	}
876 	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
877 
878 	brt_unlock(brt);
879 }
880 
881 static void
882 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
883 {
884 
885 	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
886 	bre->bre_refcount = 0;
887 
888 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
889 }
890 
891 static int
892 brt_entry_compare(const void *x1, const void *x2)
893 {
894 	const brt_entry_t *bre1 = x1;
895 	const brt_entry_t *bre2 = x2;
896 
897 	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
898 }
899 
900 static int
901 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
902 {
903 	uint64_t mos_entries;
904 	uint64_t one, physsize;
905 	int error;
906 
907 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
908 
909 	if (!brt_vdev_lookup(brt, brtvd, bre))
910 		return (SET_ERROR(ENOENT));
911 
912 	/*
913 	 * Remember mos_entries object number. After we reacquire the BRT lock,
914 	 * the brtvd pointer may be invalid.
915 	 */
916 	mos_entries = brtvd->bv_mos_entries;
917 	if (mos_entries == 0)
918 		return (SET_ERROR(ENOENT));
919 
920 	brt_unlock(brt);
921 
922 	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
923 	    BRT_KEY_WORDS, &one, &physsize);
924 	if (error == 0) {
925 		ASSERT3U(one, ==, 1);
926 		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
927 
928 		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
929 		    &bre->bre_offset, BRT_KEY_WORDS, 1,
930 		    sizeof (bre->bre_refcount), &bre->bre_refcount);
931 		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
932 		    "count=%llu error=%d", (u_longlong_t)mos_entries,
933 		    (u_longlong_t)brtvd->bv_vdevid,
934 		    (u_longlong_t)bre->bre_offset,
935 		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
936 	}
937 
938 	brt_wlock(brt);
939 
940 	return (error);
941 }
942 
943 static void
944 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
945 {
946 	brt_vdev_t *brtvd;
947 	uint64_t mos_entries = 0;
948 
949 	brt_rlock(brt);
950 	brtvd = brt_vdev(brt, vdevid);
951 	if (brtvd != NULL)
952 		mos_entries = brtvd->bv_mos_entries;
953 	brt_unlock(brt);
954 
955 	if (mos_entries == 0)
956 		return;
957 
958 	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
959 	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
960 	    (u_longlong_t)bre->bre_offset);
961 	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
962 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
963 }
964 
965 static int
966 brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
967 {
968 	int error;
969 
970 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
971 	ASSERT(brtvd->bv_mos_entries != 0);
972 	ASSERT(bre->bre_refcount > 0);
973 
974 	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
975 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
976 	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
977 	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
978 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
979 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
980 	    (u_longlong_t)bre->bre_refcount, error);
981 
982 	return (error);
983 }
984 
985 static int
986 brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
987 {
988 	int error;
989 
990 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
991 	ASSERT(brtvd->bv_mos_entries != 0);
992 	ASSERT0(bre->bre_refcount);
993 
994 	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
995 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
996 	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
997 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
998 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
999 	    (u_longlong_t)bre->bre_refcount, error);
1000 
1001 	return (error);
1002 }
1003 
1004 /*
1005  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
1006  * positive, but gives us quick answer if we should look into BRT, which
1007  * may require reads and thus will be more expensive.
1008  */
1009 boolean_t
1010 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
1011 {
1012 	brt_t *brt = spa->spa_brt;
1013 	brt_vdev_t *brtvd;
1014 	brt_entry_t bre_search;
1015 	boolean_t mayexists = FALSE;
1016 	uint64_t vdevid;
1017 
1018 	brt_entry_fill(bp, &bre_search, &vdevid);
1019 
1020 	brt_rlock(brt);
1021 
1022 	brtvd = brt_vdev(brt, vdevid);
1023 	if (brtvd != NULL && brtvd->bv_initiated) {
1024 		if (!avl_is_empty(&brtvd->bv_tree) ||
1025 		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
1026 			mayexists = TRUE;
1027 		}
1028 	}
1029 
1030 	brt_unlock(brt);
1031 
1032 	return (mayexists);
1033 }
1034 
1035 uint64_t
1036 brt_get_dspace(spa_t *spa)
1037 {
1038 	brt_t *brt = spa->spa_brt;
1039 
1040 	if (brt == NULL)
1041 		return (0);
1042 
1043 	return (brt->brt_savedspace);
1044 }
1045 
1046 uint64_t
1047 brt_get_used(spa_t *spa)
1048 {
1049 	brt_t *brt = spa->spa_brt;
1050 
1051 	if (brt == NULL)
1052 		return (0);
1053 
1054 	return (brt->brt_usedspace);
1055 }
1056 
1057 uint64_t
1058 brt_get_saved(spa_t *spa)
1059 {
1060 	brt_t *brt = spa->spa_brt;
1061 
1062 	if (brt == NULL)
1063 		return (0);
1064 
1065 	return (brt->brt_savedspace);
1066 }
1067 
1068 uint64_t
1069 brt_get_ratio(spa_t *spa)
1070 {
1071 	brt_t *brt = spa->spa_brt;
1072 
1073 	if (brt->brt_usedspace == 0)
1074 		return (100);
1075 
1076 	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1077 	    brt->brt_usedspace);
1078 }
1079 
1080 static int
1081 brt_kstats_update(kstat_t *ksp, int rw)
1082 {
1083 	brt_stats_t *bs = ksp->ks_data;
1084 
1085 	if (rw == KSTAT_WRITE)
1086 		return (EACCES);
1087 
1088 	bs->brt_addref_entry_in_memory.value.ui64 =
1089 	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1090 	bs->brt_addref_entry_not_on_disk.value.ui64 =
1091 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1092 	bs->brt_addref_entry_on_disk.value.ui64 =
1093 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1094 	bs->brt_addref_entry_read_lost_race.value.ui64 =
1095 	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1096 	bs->brt_decref_entry_in_memory.value.ui64 =
1097 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1098 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1099 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1100 	bs->brt_decref_entry_not_in_memory.value.ui64 =
1101 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1102 	bs->brt_decref_entry_not_on_disk.value.ui64 =
1103 	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1104 	bs->brt_decref_entry_read_lost_race.value.ui64 =
1105 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1106 	bs->brt_decref_entry_still_referenced.value.ui64 =
1107 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1108 	bs->brt_decref_free_data_later.value.ui64 =
1109 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
1110 	bs->brt_decref_free_data_now.value.ui64 =
1111 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
1112 	bs->brt_decref_no_entry.value.ui64 =
1113 	    wmsum_value(&brt_sums.brt_decref_no_entry);
1114 
1115 	return (0);
1116 }
1117 
1118 static void
1119 brt_stat_init(void)
1120 {
1121 
1122 	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1123 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1124 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1125 	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1126 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1127 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1128 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1129 	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1130 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1131 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1132 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1133 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1134 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1135 
1136 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1137 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1138 	if (brt_ksp != NULL) {
1139 		brt_ksp->ks_data = &brt_stats;
1140 		brt_ksp->ks_update = brt_kstats_update;
1141 		kstat_install(brt_ksp);
1142 	}
1143 }
1144 
1145 static void
1146 brt_stat_fini(void)
1147 {
1148 	if (brt_ksp != NULL) {
1149 		kstat_delete(brt_ksp);
1150 		brt_ksp = NULL;
1151 	}
1152 
1153 	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1154 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1155 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1156 	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1157 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1158 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1159 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1160 	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1161 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1162 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1163 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1164 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1165 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1166 }
1167 
1168 void
1169 brt_init(void)
1170 {
1171 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1172 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1173 	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1174 	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1175 
1176 	brt_stat_init();
1177 }
1178 
1179 void
1180 brt_fini(void)
1181 {
1182 	brt_stat_fini();
1183 
1184 	kmem_cache_destroy(brt_entry_cache);
1185 	kmem_cache_destroy(brt_pending_entry_cache);
1186 }
1187 
1188 static brt_entry_t *
1189 brt_entry_alloc(const brt_entry_t *bre_init)
1190 {
1191 	brt_entry_t *bre;
1192 
1193 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1194 	bre->bre_offset = bre_init->bre_offset;
1195 	bre->bre_refcount = bre_init->bre_refcount;
1196 
1197 	return (bre);
1198 }
1199 
1200 static void
1201 brt_entry_free(brt_entry_t *bre)
1202 {
1203 
1204 	kmem_cache_free(brt_entry_cache, bre);
1205 }
1206 
1207 static void
1208 brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1209 {
1210 	brt_vdev_t *brtvd;
1211 	brt_entry_t *bre, *racebre;
1212 	brt_entry_t bre_search;
1213 	avl_index_t where;
1214 	uint64_t vdevid;
1215 	int error;
1216 
1217 	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1218 
1219 	brt_entry_fill(bp, &bre_search, &vdevid);
1220 
1221 	brt_wlock(brt);
1222 
1223 	brtvd = brt_vdev(brt, vdevid);
1224 	if (brtvd == NULL) {
1225 		ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1226 
1227 		/* New VDEV was added. */
1228 		brt_vdevs_expand(brt, vdevid + 1);
1229 		brtvd = brt_vdev(brt, vdevid);
1230 	}
1231 	ASSERT(brtvd != NULL);
1232 	if (!brtvd->bv_initiated)
1233 		brt_vdev_realloc(brt, brtvd);
1234 
1235 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1236 	if (bre != NULL) {
1237 		BRTSTAT_BUMP(brt_addref_entry_in_memory);
1238 	} else {
1239 		/*
1240 		 * brt_entry_lookup() may drop the BRT (read) lock and
1241 		 * reacquire it (write).
1242 		 */
1243 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1244 		/* bre_search now contains correct bre_refcount */
1245 		ASSERT(error == 0 || error == ENOENT);
1246 		if (error == 0)
1247 			BRTSTAT_BUMP(brt_addref_entry_on_disk);
1248 		else
1249 			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1250 		/*
1251 		 * When the BRT lock was dropped, brt_vdevs[] may have been
1252 		 * expanded and reallocated, we need to update brtvd's pointer.
1253 		 */
1254 		brtvd = brt_vdev(brt, vdevid);
1255 		ASSERT(brtvd != NULL);
1256 
1257 		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1258 		if (racebre == NULL) {
1259 			bre = brt_entry_alloc(&bre_search);
1260 			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1261 			avl_insert(&brtvd->bv_tree, bre, where);
1262 			brt->brt_nentries++;
1263 		} else {
1264 			/*
1265 			 * The entry was added when the BRT lock was dropped in
1266 			 * brt_entry_lookup().
1267 			 */
1268 			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1269 			bre = racebre;
1270 		}
1271 	}
1272 	bre->bre_refcount++;
1273 	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1274 
1275 	brt_unlock(brt);
1276 }
1277 
1278 /* Return TRUE if block should be freed immediately. */
1279 boolean_t
1280 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1281 {
1282 	brt_t *brt = spa->spa_brt;
1283 	brt_vdev_t *brtvd;
1284 	brt_entry_t *bre, *racebre;
1285 	brt_entry_t bre_search;
1286 	avl_index_t where;
1287 	uint64_t vdevid;
1288 	int error;
1289 
1290 	brt_entry_fill(bp, &bre_search, &vdevid);
1291 
1292 	brt_wlock(brt);
1293 
1294 	brtvd = brt_vdev(brt, vdevid);
1295 	ASSERT(brtvd != NULL);
1296 
1297 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1298 	if (bre != NULL) {
1299 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
1300 		goto out;
1301 	} else {
1302 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1303 	}
1304 
1305 	/*
1306 	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1307 	 */
1308 	error = brt_entry_lookup(brt, brtvd, &bre_search);
1309 	/* bre_search now contains correct bre_refcount */
1310 	ASSERT(error == 0 || error == ENOENT);
1311 	/*
1312 	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1313 	 * and reallocated, we need to update brtvd's pointer.
1314 	 */
1315 	brtvd = brt_vdev(brt, vdevid);
1316 	ASSERT(brtvd != NULL);
1317 
1318 	if (error == ENOENT) {
1319 		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1320 		bre = NULL;
1321 		goto out;
1322 	}
1323 
1324 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1325 	if (racebre != NULL) {
1326 		/*
1327 		 * The entry was added when the BRT lock was dropped in
1328 		 * brt_entry_lookup().
1329 		 */
1330 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1331 		bre = racebre;
1332 		goto out;
1333 	}
1334 
1335 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1336 	bre = brt_entry_alloc(&bre_search);
1337 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1338 	avl_insert(&brtvd->bv_tree, bre, where);
1339 	brt->brt_nentries++;
1340 
1341 out:
1342 	if (bre == NULL) {
1343 		/*
1344 		 * This is a free of a regular (not cloned) block.
1345 		 */
1346 		brt_unlock(brt);
1347 		BRTSTAT_BUMP(brt_decref_no_entry);
1348 		return (B_TRUE);
1349 	}
1350 	if (bre->bre_refcount == 0) {
1351 		brt_unlock(brt);
1352 		BRTSTAT_BUMP(brt_decref_free_data_now);
1353 		return (B_TRUE);
1354 	}
1355 
1356 	ASSERT(bre->bre_refcount > 0);
1357 	bre->bre_refcount--;
1358 	if (bre->bre_refcount == 0)
1359 		BRTSTAT_BUMP(brt_decref_free_data_later);
1360 	else
1361 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1362 	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1363 
1364 	brt_unlock(brt);
1365 
1366 	return (B_FALSE);
1367 }
1368 
1369 uint64_t
1370 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1371 {
1372 	brt_t *brt = spa->spa_brt;
1373 	brt_vdev_t *brtvd;
1374 	brt_entry_t bre_search, *bre;
1375 	uint64_t vdevid, refcnt;
1376 	int error;
1377 
1378 	brt_entry_fill(bp, &bre_search, &vdevid);
1379 
1380 	brt_rlock(brt);
1381 
1382 	brtvd = brt_vdev(brt, vdevid);
1383 	ASSERT(brtvd != NULL);
1384 
1385 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1386 	if (bre == NULL) {
1387 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1388 		ASSERT(error == 0 || error == ENOENT);
1389 		if (error == ENOENT)
1390 			refcnt = 0;
1391 		else
1392 			refcnt = bre_search.bre_refcount;
1393 	} else
1394 		refcnt = bre->bre_refcount;
1395 
1396 	brt_unlock(brt);
1397 	return (refcnt);
1398 }
1399 
1400 static void
1401 brt_prefetch(brt_t *brt, const blkptr_t *bp)
1402 {
1403 	brt_entry_t bre;
1404 	uint64_t vdevid;
1405 
1406 	ASSERT(bp != NULL);
1407 
1408 	if (!zfs_brt_prefetch)
1409 		return;
1410 
1411 	brt_entry_fill(bp, &bre, &vdevid);
1412 
1413 	brt_entry_prefetch(brt, vdevid, &bre);
1414 }
1415 
1416 static int
1417 brt_pending_entry_compare(const void *x1, const void *x2)
1418 {
1419 	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1420 	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1421 	int cmp;
1422 
1423 	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
1424 	if (cmp == 0) {
1425 		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1426 		    DVA_GET_VDEV(&bp2->blk_dva[0]));
1427 		if (cmp == 0) {
1428 			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1429 			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
1430 		}
1431 	}
1432 
1433 	return (cmp);
1434 }
1435 
1436 void
1437 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1438 {
1439 	brt_t *brt;
1440 	avl_tree_t *pending_tree;
1441 	kmutex_t *pending_lock;
1442 	brt_pending_entry_t *bpe, *newbpe;
1443 	avl_index_t where;
1444 	uint64_t txg;
1445 
1446 	brt = spa->spa_brt;
1447 	txg = dmu_tx_get_txg(tx);
1448 	ASSERT3U(txg, !=, 0);
1449 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1450 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1451 
1452 	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1453 	newbpe->bpe_bp = *bp;
1454 	newbpe->bpe_count = 1;
1455 
1456 	mutex_enter(pending_lock);
1457 
1458 	bpe = avl_find(pending_tree, newbpe, &where);
1459 	if (bpe == NULL) {
1460 		avl_insert(pending_tree, newbpe, where);
1461 		newbpe = NULL;
1462 	} else {
1463 		bpe->bpe_count++;
1464 	}
1465 
1466 	mutex_exit(pending_lock);
1467 
1468 	if (newbpe != NULL) {
1469 		ASSERT(bpe != NULL);
1470 		ASSERT(bpe != newbpe);
1471 		kmem_cache_free(brt_pending_entry_cache, newbpe);
1472 	} else {
1473 		ASSERT(bpe == NULL);
1474 	}
1475 
1476 	/* Prefetch BRT entry, as we will need it in the syncing context. */
1477 	brt_prefetch(brt, bp);
1478 }
1479 
1480 void
1481 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1482 {
1483 	brt_t *brt;
1484 	avl_tree_t *pending_tree;
1485 	kmutex_t *pending_lock;
1486 	brt_pending_entry_t *bpe, bpe_search;
1487 	uint64_t txg;
1488 
1489 	brt = spa->spa_brt;
1490 	txg = dmu_tx_get_txg(tx);
1491 	ASSERT3U(txg, !=, 0);
1492 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1493 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1494 
1495 	bpe_search.bpe_bp = *bp;
1496 
1497 	mutex_enter(pending_lock);
1498 
1499 	bpe = avl_find(pending_tree, &bpe_search, NULL);
1500 	/* I believe we should always find bpe when this function is called. */
1501 	if (bpe != NULL) {
1502 		ASSERT(bpe->bpe_count > 0);
1503 
1504 		bpe->bpe_count--;
1505 		if (bpe->bpe_count == 0) {
1506 			avl_remove(pending_tree, bpe);
1507 			kmem_cache_free(brt_pending_entry_cache, bpe);
1508 		}
1509 	}
1510 
1511 	mutex_exit(pending_lock);
1512 }
1513 
1514 void
1515 brt_pending_apply(spa_t *spa, uint64_t txg)
1516 {
1517 	brt_t *brt;
1518 	brt_pending_entry_t *bpe;
1519 	avl_tree_t *pending_tree;
1520 	kmutex_t *pending_lock;
1521 	void *c;
1522 
1523 	ASSERT3U(txg, !=, 0);
1524 
1525 	brt = spa->spa_brt;
1526 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1527 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1528 
1529 	mutex_enter(pending_lock);
1530 
1531 	c = NULL;
1532 	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1533 		boolean_t added_to_ddt;
1534 
1535 		mutex_exit(pending_lock);
1536 
1537 		for (int i = 0; i < bpe->bpe_count; i++) {
1538 			/*
1539 			 * If the block has DEDUP bit set, it means that it
1540 			 * already exists in the DEDUP table, so we can just
1541 			 * use that instead of creating new entry in
1542 			 * the BRT table.
1543 			 */
1544 			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1545 				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1546 			} else {
1547 				added_to_ddt = B_FALSE;
1548 			}
1549 			if (!added_to_ddt)
1550 				brt_entry_addref(brt, &bpe->bpe_bp);
1551 		}
1552 
1553 		kmem_cache_free(brt_pending_entry_cache, bpe);
1554 		mutex_enter(pending_lock);
1555 	}
1556 
1557 	mutex_exit(pending_lock);
1558 }
1559 
1560 static void
1561 brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1562 {
1563 
1564 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1565 	ASSERT(brtvd->bv_mos_entries != 0);
1566 
1567 	if (bre->bre_refcount == 0) {
1568 		int error;
1569 
1570 		error = brt_entry_remove(brt, brtvd, bre, tx);
1571 		ASSERT(error == 0 || error == ENOENT);
1572 		/*
1573 		 * If error == ENOENT then zfs_clone_range() was done from a
1574 		 * removed (but opened) file (open(), unlink()).
1575 		 */
1576 		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
1577 	} else {
1578 		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
1579 	}
1580 }
1581 
1582 static void
1583 brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1584 {
1585 	brt_vdev_t *brtvd;
1586 	brt_entry_t *bre;
1587 	uint64_t vdevid;
1588 	void *c;
1589 
1590 	brt_wlock(brt);
1591 
1592 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1593 		brtvd = &brt->brt_vdevs[vdevid];
1594 
1595 		if (!brtvd->bv_initiated)
1596 			continue;
1597 
1598 		if (!brtvd->bv_meta_dirty) {
1599 			ASSERT(!brtvd->bv_entcount_dirty);
1600 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1601 			continue;
1602 		}
1603 
1604 		ASSERT(!brtvd->bv_entcount_dirty ||
1605 		    avl_numnodes(&brtvd->bv_tree) != 0);
1606 
1607 		if (brtvd->bv_mos_brtvdev == 0)
1608 			brt_vdev_create(brt, brtvd, tx);
1609 
1610 		c = NULL;
1611 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1612 			brt_sync_entry(brt, brtvd, bre, tx);
1613 			brt_entry_free(bre);
1614 			ASSERT(brt->brt_nentries > 0);
1615 			brt->brt_nentries--;
1616 		}
1617 
1618 		brt_vdev_sync(brt, brtvd, tx);
1619 
1620 		if (brtvd->bv_totalcount == 0)
1621 			brt_vdev_destroy(brt, brtvd, tx);
1622 	}
1623 
1624 	ASSERT0(brt->brt_nentries);
1625 
1626 	brt_unlock(brt);
1627 }
1628 
1629 void
1630 brt_sync(spa_t *spa, uint64_t txg)
1631 {
1632 	dmu_tx_t *tx;
1633 	brt_t *brt;
1634 
1635 	ASSERT(spa_syncing_txg(spa) == txg);
1636 
1637 	brt = spa->spa_brt;
1638 	brt_rlock(brt);
1639 	if (brt->brt_nentries == 0) {
1640 		/* No changes. */
1641 		brt_unlock(brt);
1642 		return;
1643 	}
1644 	brt_unlock(brt);
1645 
1646 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1647 
1648 	brt_sync_table(brt, tx);
1649 
1650 	dmu_tx_commit(tx);
1651 }
1652 
1653 static void
1654 brt_table_alloc(brt_t *brt)
1655 {
1656 
1657 	for (int i = 0; i < TXG_SIZE; i++) {
1658 		avl_create(&brt->brt_pending_tree[i],
1659 		    brt_pending_entry_compare,
1660 		    sizeof (brt_pending_entry_t),
1661 		    offsetof(brt_pending_entry_t, bpe_node));
1662 		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1663 		    NULL);
1664 	}
1665 }
1666 
1667 static void
1668 brt_table_free(brt_t *brt)
1669 {
1670 
1671 	for (int i = 0; i < TXG_SIZE; i++) {
1672 		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1673 
1674 		avl_destroy(&brt->brt_pending_tree[i]);
1675 		mutex_destroy(&brt->brt_pending_lock[i]);
1676 	}
1677 }
1678 
1679 static void
1680 brt_alloc(spa_t *spa)
1681 {
1682 	brt_t *brt;
1683 
1684 	ASSERT(spa->spa_brt == NULL);
1685 
1686 	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1687 	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1688 	brt->brt_spa = spa;
1689 	brt->brt_rangesize = 0;
1690 	brt->brt_nentries = 0;
1691 	brt->brt_vdevs = NULL;
1692 	brt->brt_nvdevs = 0;
1693 	brt_table_alloc(brt);
1694 
1695 	spa->spa_brt = brt;
1696 }
1697 
1698 void
1699 brt_create(spa_t *spa)
1700 {
1701 
1702 	brt_alloc(spa);
1703 	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1704 }
1705 
1706 int
1707 brt_load(spa_t *spa)
1708 {
1709 
1710 	brt_alloc(spa);
1711 	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1712 
1713 	return (0);
1714 }
1715 
1716 void
1717 brt_unload(spa_t *spa)
1718 {
1719 	brt_t *brt = spa->spa_brt;
1720 
1721 	if (brt == NULL)
1722 		return;
1723 
1724 	brt_vdevs_free(brt);
1725 	brt_table_free(brt);
1726 	rw_destroy(&brt->brt_lock);
1727 	kmem_free(brt, sizeof (*brt));
1728 	spa->spa_brt = NULL;
1729 }
1730 
1731 /* BEGIN CSTYLED */
1732 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
1733     "Enable prefetching of BRT entries");
1734 #ifdef ZFS_BRT_DEBUG
1735 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
1736 #endif
1737 /* END CSTYLED */
1738