xref: /freebsd/sys/contrib/openzfs/module/zfs/brt.c (revision 1db64f89363c97858961c4df0b7d02f3223723cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42 
43 /*
44  * Block Cloning design.
45  *
46  * Block Cloning allows to manually clone a file (or a subset of its blocks)
47  * into another (or the same) file by just creating additional references to
48  * the data blocks without copying the data itself. Those references are kept
49  * in the Block Reference Tables (BRTs).
50  *
51  * In many ways this is similar to the existing deduplication, but there are
52  * some important differences:
53  *
54  * - Deduplication is automatic and Block Cloning is not - one has to use a
55  *   dedicated system call(s) to clone the given file/blocks.
56  * - Deduplication keeps all data blocks in its table, even those referenced
57  *   just once. Block Cloning creates an entry in its tables only when there
58  *   are at least two references to the given data block. If the block was
59  *   never explicitly cloned or the second to last reference was dropped,
60  *   there will be neither space nor performance overhead.
61  * - Deduplication needs data to work - one needs to pass real data to the
62  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63  *   data, just block pointers to the data, so it is extremely fast, as we pay
64  *   neither the cost of reading the data, nor the cost of writing the data -
65  *   we operate exclusively on metadata.
66  * - If the D (dedup) bit is not set in the block pointer, it means that
67  *   the block is not in the dedup table (DDT) and we won't consult the DDT
68  *   when we need to free the block. Block Cloning must be consulted on every
69  *   free, because we cannot modify the source BP (eg. by setting something
70  *   similar to the D bit), thus we have no hint if the block is in the
71  *   Block Reference Table (BRT), so we need to look into the BRT. There is
72  *   an optimization in place that allows us to eliminate the majority of BRT
73  *   lookups which is described below in the "Minimizing free penalty" section.
74  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75  *   64bit offset and 64bit reference counter.
76  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77  *   other on disk are most likely in totally different parts of the DDT.
78  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79  *   from one file should have BRT entries close to each other.
80  * - Scrub will only do a single pass over a block that is referenced multiple
81  *   times in the DDT. Unfortunately it is not currently (if at all) possible
82  *   with Block Cloning and block referenced multiple times will be scrubbed
83  *   multiple times. The new, sorted scrub should be able to eliminate
84  *   duplicated reads given enough memory.
85  * - Deduplication requires cryptographically strong hash as a checksum or
86  *   additional data verification. Block Cloning works with any checksum
87  *   algorithm or even with checksumming disabled.
88  *
89  * As mentioned above, the BRT entries are much smaller than the DDT entries.
90  * To uniquely identify a block we just need its vdev id and offset. We also
91  * need to maintain a reference counter. The vdev id will often repeat, as there
92  * is a small number of top-level VDEVs and a large number of blocks stored in
93  * each VDEV. We take advantage of that to reduce the BRT entry size further by
94  * maintaining one BRT for each top-level VDEV, so we can then have only offset
95  * and counter as the BRT entry.
96  *
97  * Minimizing free penalty.
98  *
99  * Block Cloning allows creating additional references to any existing block.
100  * When we free a block there is no hint in the block pointer whether the block
101  * was cloned or not, so on each free we have to check if there is a
102  * corresponding entry in the BRT or not. If there is, we need to decrease
103  * the reference counter. Doing BRT lookup on every free can potentially be
104  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105  * This is the main problem with deduplication, so we've learned our lesson and
106  * try not to repeat the same mistake here. How do we do that? We divide each
107  * top-level VDEV into 16MB regions. For each region we maintain a counter that
108  * is a sum of all the BRT entries that have offsets within the region. This
109  * creates the entries count array of 16bit numbers for each top-level VDEV.
110  * The entries count array is always kept in memory and updated on disk in the
111  * same transaction group as the BRT updates to keep everything in-sync. We can
112  * keep the array in memory, because it is very small. With 16MB regions and
113  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114  * the region size even further in the future). Now, when we want to free
115  * a block, we first consult the array. If the counter for the whole region is
116  * zero, there is no need to look for the BRT entry, as there isn't one for
117  * sure. If the counter for the region is greater than zero, only then we will
118  * do a BRT lookup and if an entry is found we will decrease the reference
119  * counter in the BRT entry and in the entry counters array.
120  *
121  * The entry counters array is small, but can potentially be larger for very
122  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123  * array on every change. We then divide the array into 32kB block and keep
124  * a bitmap of dirty blocks within a transaction group. When we sync the
125  * transaction group we can only update the parts of the entry counters array
126  * that were modified. Note: Keeping track of the dirty parts of the entry
127  * counters array is implemented, but updating only parts of the array on disk
128  * is not yet implemented - for now we will update entire array if there was
129  * any change.
130  *
131  * The implementation tries to be economic: if BRT is not used, or no longer
132  * used, there will be no entries in the MOS and no additional memory used (eg.
133  * the entry counters array is only allocated if needed).
134  *
135  * Interaction between Deduplication and Block Cloning.
136  *
137  * If both functionalities are in use, we could end up with a block that is
138  * referenced multiple times in both DDT and BRT. When we free one of the
139  * references we couldn't tell where it belongs, so we would have to decide
140  * what table takes the precedence: do we first clear DDT references or BRT
141  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143  * lookup DDT entry instead and increase the counter there. No BRT entry
144  * will be created for a block which has the D (dedup) bit set.
145  * BRT may be more efficient for manual deduplication, but if the block is
146  * already in the DDT, then creating additional BRT entry would be less
147  * efficient. This clever idea was proposed by Allan Jude.
148  *
149  * Block Cloning across datasets.
150  *
151  * Block Cloning is not limited to cloning blocks within the same dataset.
152  * It is possible (and very useful) to clone blocks between different datasets.
153  * One use case is recovering files from snapshots. By cloning the files into
154  * dataset we need no additional storage. Without Block Cloning we would need
155  * additional space for those files.
156  * Another interesting use case is moving the files between datasets
157  * (copying the file content to the new dataset and removing the source file).
158  * In that case Block Cloning will only be used briefly, because the BRT entries
159  * will be removed when the source is removed.
160  * Note: currently it is not possible to clone blocks between encrypted
161  * datasets, even if those datasets use the same encryption key (this includes
162  * snapshots of encrypted datasets). Cloning blocks between datasets that use
163  * the same keys should be possible and should be implemented in the future.
164  *
165  * Block Cloning flow through ZFS layers.
166  *
167  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
168  * blocks. As of this writing no interface is implemented that allows for block
169  * cloning within a ZVOL.
170  * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
171  * for blocking cloning.
172  *
173  *	ssize_t
174  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
175  *	                size_t len, unsigned int flags);
176  *
177  * Even though offsets and length represent bytes, they have to be
178  * block-aligned or we will return an error so the upper layer can
179  * fallback to the generic mechanism that will just copy the data.
180  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
181  * This function was implemented based on zfs_write(), but instead of writing
182  * the given data we first read block pointers using the new dmu_read_l0_bps()
183  * function from the source file. Once we have BPs from the source file we call
184  * the dmu_brt_clone() function on the destination file. This function
185  * allocates BPs for us. We iterate over all source BPs. If the given BP is
186  * a hole or an embedded block, we just copy BP as-is. If it points to a real
187  * data we place this BP on a BRT pending list using the brt_pending_add()
188  * function.
189  *
190  * We use this pending list to keep track of all BPs that got new references
191  * within this transaction group.
192  *
193  * Some special cases to consider and how we address them:
194  * - The block we want to clone may have been created within the same
195  *   transaction group that we are trying to clone. Such block has no BP
196  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
197  * - The block we want to clone may have been modified within the same
198  *   transaction group. We return EAGAIN.
199  * - A block may be cloned multiple times during one transaction group (that's
200  *   why pending list is actually a tree and not an append-only list - this
201  *   way we can figure out faster if this block is cloned for the first time
202  *   in this txg or consecutive time).
203  * - A block may be cloned and freed within the same transaction group
204  *   (see dbuf_undirty()).
205  * - A block may be cloned and within the same transaction group the clone
206  *   can be cloned again (see dmu_read_l0_bps()).
207  * - A file might have been deleted, but the caller still has a file descriptor
208  *   open to this file and clones it.
209  *
210  * When we free a block we have an additional step in the ZIO pipeline where we
211  * call the zio_brt_free() function. We then call the brt_entry_decref()
212  * that loads the corresponding BRT entry (if one exists) and decreases
213  * reference counter. If this is not the last reference we will stop ZIO
214  * pipeline here. If this is the last reference or the block is not in the
215  * BRT, we continue the pipeline and free the block as usual.
216  *
217  * At the beginning of spa_sync() where there can be no more block cloning,
218  * but before issuing frees we call brt_pending_apply(). This function applies
219  * all the new clones to the BRT table - we load BRT entries and update
220  * reference counters. To sync new BRT entries to disk, we use brt_sync()
221  * function. This function will sync all dirty per-top-level-vdev BRTs,
222  * the entry counters arrays, etc.
223  *
224  * Block Cloning and ZIL.
225  *
226  * Every clone operation is divided into chunks (similar to write) and each
227  * chunk is cloned in a separate transaction. The chunk size is determined by
228  * how many BPs we can fit into a single ZIL entry.
229  * Replaying clone operation is different from the regular clone operation,
230  * as when we log clone operations we cannot use the source object - it may
231  * reside on a different dataset, so we log BPs we want to clone.
232  * The ZIL is replayed when we mount the given dataset, not when the pool is
233  * imported. Taking this into account it is possible that the pool is imported
234  * without mounting datasets and the source dataset is destroyed before the
235  * destination dataset is mounted and its ZIL replayed.
236  * To address this situation we leverage zil_claim() mechanism where ZFS will
237  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
238  * entries, we will bump reference counters for their BPs in the BRT and then
239  * on mount and ZIL replay we will just attach BPs to the file without
240  * bumping reference counters.
241  * Note it is still possible that after zil_claim() we never mount the
242  * destination, so we never replay its ZIL and we destroy it. This way we would
243  * end up with leaked references in BRT. We address that too as ZFS gives us
244  * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
245  */
246 
247 static kmem_cache_t *brt_entry_cache;
248 static kmem_cache_t *brt_pending_entry_cache;
249 
250 /*
251  * Enable/disable prefetching of BRT entries that we are going to modify.
252  */
253 int zfs_brt_prefetch = 1;
254 
255 #ifdef ZFS_DEBUG
256 #define	BRT_DEBUG(...)	do {						\
257 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
258 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
259 	}								\
260 } while (0)
261 #else
262 #define	BRT_DEBUG(...)	do { } while (0)
263 #endif
264 
265 int brt_zap_leaf_blockshift = 12;
266 int brt_zap_indirect_blockshift = 12;
267 
268 static kstat_t	*brt_ksp;
269 
270 typedef struct brt_stats {
271 	kstat_named_t brt_addref_entry_in_memory;
272 	kstat_named_t brt_addref_entry_not_on_disk;
273 	kstat_named_t brt_addref_entry_on_disk;
274 	kstat_named_t brt_addref_entry_read_lost_race;
275 	kstat_named_t brt_decref_entry_in_memory;
276 	kstat_named_t brt_decref_entry_loaded_from_disk;
277 	kstat_named_t brt_decref_entry_not_in_memory;
278 	kstat_named_t brt_decref_entry_not_on_disk;
279 	kstat_named_t brt_decref_entry_read_lost_race;
280 	kstat_named_t brt_decref_entry_still_referenced;
281 	kstat_named_t brt_decref_free_data_later;
282 	kstat_named_t brt_decref_free_data_now;
283 	kstat_named_t brt_decref_no_entry;
284 } brt_stats_t;
285 
286 static brt_stats_t brt_stats = {
287 	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
288 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
289 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
290 	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
291 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
292 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
293 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
294 	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
295 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
296 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
297 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
298 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
299 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
300 };
301 
302 struct {
303 	wmsum_t brt_addref_entry_in_memory;
304 	wmsum_t brt_addref_entry_not_on_disk;
305 	wmsum_t brt_addref_entry_on_disk;
306 	wmsum_t brt_addref_entry_read_lost_race;
307 	wmsum_t brt_decref_entry_in_memory;
308 	wmsum_t brt_decref_entry_loaded_from_disk;
309 	wmsum_t brt_decref_entry_not_in_memory;
310 	wmsum_t brt_decref_entry_not_on_disk;
311 	wmsum_t brt_decref_entry_read_lost_race;
312 	wmsum_t brt_decref_entry_still_referenced;
313 	wmsum_t brt_decref_free_data_later;
314 	wmsum_t brt_decref_free_data_now;
315 	wmsum_t brt_decref_no_entry;
316 } brt_sums;
317 
318 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
319 
320 static int brt_entry_compare(const void *x1, const void *x2);
321 static int brt_pending_entry_compare(const void *x1, const void *x2);
322 
323 static void
324 brt_rlock(brt_t *brt)
325 {
326 	rw_enter(&brt->brt_lock, RW_READER);
327 }
328 
329 static void
330 brt_wlock(brt_t *brt)
331 {
332 	rw_enter(&brt->brt_lock, RW_WRITER);
333 }
334 
335 static void
336 brt_unlock(brt_t *brt)
337 {
338 	rw_exit(&brt->brt_lock);
339 }
340 
341 static uint16_t
342 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
343 {
344 
345 	ASSERT3U(idx, <, brtvd->bv_size);
346 
347 	if (brtvd->bv_need_byteswap) {
348 		return (BSWAP_16(brtvd->bv_entcount[idx]));
349 	} else {
350 		return (brtvd->bv_entcount[idx]);
351 	}
352 }
353 
354 static void
355 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
356 {
357 
358 	ASSERT3U(idx, <, brtvd->bv_size);
359 
360 	if (brtvd->bv_need_byteswap) {
361 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
362 	} else {
363 		brtvd->bv_entcount[idx] = entcnt;
364 	}
365 }
366 
367 static void
368 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
369 {
370 	uint16_t entcnt;
371 
372 	ASSERT3U(idx, <, brtvd->bv_size);
373 
374 	entcnt = brt_vdev_entcount_get(brtvd, idx);
375 	ASSERT(entcnt < UINT16_MAX);
376 
377 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
378 }
379 
380 static void
381 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
382 {
383 	uint16_t entcnt;
384 
385 	ASSERT3U(idx, <, brtvd->bv_size);
386 
387 	entcnt = brt_vdev_entcount_get(brtvd, idx);
388 	ASSERT(entcnt > 0);
389 
390 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
391 }
392 
393 #ifdef ZFS_DEBUG
394 static void
395 brt_vdev_dump(brt_t *brt)
396 {
397 	brt_vdev_t *brtvd;
398 	uint64_t vdevid;
399 
400 	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
401 		return;
402 	}
403 
404 	if (brt->brt_nvdevs == 0) {
405 		zfs_dbgmsg("BRT empty");
406 		return;
407 	}
408 
409 	zfs_dbgmsg("BRT vdev dump:");
410 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
411 		uint64_t idx;
412 
413 		brtvd = &brt->brt_vdevs[vdevid];
414 		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
415 		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
416 		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
417 		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
418 		    (u_longlong_t)brtvd->bv_size,
419 		    (u_longlong_t)brtvd->bv_totalcount,
420 		    (u_longlong_t)brtvd->bv_nblocks,
421 		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
422 		if (brtvd->bv_totalcount > 0) {
423 			zfs_dbgmsg("    entcounts:");
424 			for (idx = 0; idx < brtvd->bv_size; idx++) {
425 				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
426 					zfs_dbgmsg("      [%04llu] %hu",
427 					    (u_longlong_t)idx,
428 					    brt_vdev_entcount_get(brtvd, idx));
429 				}
430 			}
431 		}
432 		if (brtvd->bv_entcount_dirty) {
433 			char *bitmap;
434 
435 			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
436 			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
437 				bitmap[idx] =
438 				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
439 			}
440 			bitmap[idx] = '\0';
441 			zfs_dbgmsg("    bitmap: %s", bitmap);
442 			kmem_free(bitmap, brtvd->bv_nblocks + 1);
443 		}
444 	}
445 }
446 #endif
447 
448 static brt_vdev_t *
449 brt_vdev(brt_t *brt, uint64_t vdevid)
450 {
451 	brt_vdev_t *brtvd;
452 
453 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
454 
455 	if (vdevid < brt->brt_nvdevs) {
456 		brtvd = &brt->brt_vdevs[vdevid];
457 	} else {
458 		brtvd = NULL;
459 	}
460 
461 	return (brtvd);
462 }
463 
464 static void
465 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
466 {
467 	char name[64];
468 
469 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
470 	ASSERT0(brtvd->bv_mos_brtvdev);
471 	ASSERT0(brtvd->bv_mos_entries);
472 	ASSERT(brtvd->bv_entcount != NULL);
473 	ASSERT(brtvd->bv_size > 0);
474 	ASSERT(brtvd->bv_bitmap != NULL);
475 	ASSERT(brtvd->bv_nblocks > 0);
476 
477 	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
478 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
479 	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
480 	    0, tx);
481 	VERIFY(brtvd->bv_mos_entries != 0);
482 	BRT_DEBUG("MOS entries created, object=%llu",
483 	    (u_longlong_t)brtvd->bv_mos_entries);
484 
485 	/*
486 	 * We allocate DMU buffer to store the bv_entcount[] array.
487 	 * We will keep array size (bv_size) and cummulative count for all
488 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
489 	 */
490 	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
491 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
492 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
493 	VERIFY(brtvd->bv_mos_brtvdev != 0);
494 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
495 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
496 
497 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
498 	    (u_longlong_t)brtvd->bv_vdevid);
499 	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
500 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
501 	BRT_DEBUG("Pool directory object created, object=%s", name);
502 
503 	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
504 }
505 
506 static void
507 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
508 {
509 	vdev_t *vd;
510 	uint16_t *entcount;
511 	ulong_t *bitmap;
512 	uint64_t nblocks, size;
513 
514 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
515 
516 	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
517 	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
518 	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
519 	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
520 
521 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
522 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
523 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
524 
525 	if (!brtvd->bv_initiated) {
526 		ASSERT0(brtvd->bv_size);
527 		ASSERT(brtvd->bv_entcount == NULL);
528 		ASSERT(brtvd->bv_bitmap == NULL);
529 		ASSERT0(brtvd->bv_nblocks);
530 
531 		avl_create(&brtvd->bv_tree, brt_entry_compare,
532 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
533 	} else {
534 		ASSERT(brtvd->bv_size > 0);
535 		ASSERT(brtvd->bv_entcount != NULL);
536 		ASSERT(brtvd->bv_bitmap != NULL);
537 		ASSERT(brtvd->bv_nblocks > 0);
538 		/*
539 		 * TODO: Allow vdev shrinking. We only need to implement
540 		 * shrinking the on-disk BRT VDEV object.
541 		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
542 		 *     size, tx);
543 		 */
544 		ASSERT3U(brtvd->bv_size, <=, size);
545 
546 		memcpy(entcount, brtvd->bv_entcount,
547 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
548 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
549 		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
550 		vmem_free(brtvd->bv_entcount,
551 		    sizeof (entcount[0]) * brtvd->bv_size);
552 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
553 	}
554 
555 	brtvd->bv_size = size;
556 	brtvd->bv_entcount = entcount;
557 	brtvd->bv_bitmap = bitmap;
558 	brtvd->bv_nblocks = nblocks;
559 	if (!brtvd->bv_initiated) {
560 		brtvd->bv_need_byteswap = FALSE;
561 		brtvd->bv_initiated = TRUE;
562 		BRT_DEBUG("BRT VDEV %llu initiated.",
563 		    (u_longlong_t)brtvd->bv_vdevid);
564 	}
565 }
566 
567 static void
568 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
569 {
570 	char name[64];
571 	dmu_buf_t *db;
572 	brt_vdev_phys_t *bvphys;
573 	int error;
574 
575 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
576 	    (u_longlong_t)brtvd->bv_vdevid);
577 	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
578 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
579 	if (error != 0)
580 		return;
581 	ASSERT(brtvd->bv_mos_brtvdev != 0);
582 
583 	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
584 	ASSERT0(error);
585 	if (error != 0)
586 		return;
587 
588 	bvphys = db->db_data;
589 	if (brt->brt_rangesize == 0) {
590 		brt->brt_rangesize = bvphys->bvp_rangesize;
591 	} else {
592 		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
593 	}
594 
595 	ASSERT(!brtvd->bv_initiated);
596 	brt_vdev_realloc(brt, brtvd);
597 
598 	/* TODO: We don't support VDEV shrinking. */
599 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
600 
601 	/*
602 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
603 	 */
604 	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
605 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
606 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
607 	ASSERT0(error);
608 
609 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
610 	ASSERT(brtvd->bv_mos_entries != 0);
611 	brtvd->bv_need_byteswap =
612 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
613 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
614 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
615 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
616 	brt->brt_usedspace += brtvd->bv_usedspace;
617 	brt->brt_savedspace += brtvd->bv_savedspace;
618 
619 	dmu_buf_rele(db, FTAG);
620 
621 	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
622 	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
623 	    (u_longlong_t)brtvd->bv_mos_entries);
624 }
625 
626 static void
627 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
628 {
629 
630 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
631 	ASSERT(brtvd->bv_initiated);
632 
633 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
634 	brtvd->bv_entcount = NULL;
635 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
636 	brtvd->bv_bitmap = NULL;
637 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
638 	avl_destroy(&brtvd->bv_tree);
639 
640 	brtvd->bv_size = 0;
641 	brtvd->bv_nblocks = 0;
642 
643 	brtvd->bv_initiated = FALSE;
644 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
645 }
646 
647 static void
648 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
649 {
650 	char name[64];
651 	uint64_t count;
652 	dmu_buf_t *db;
653 	brt_vdev_phys_t *bvphys;
654 
655 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
656 	ASSERT(brtvd->bv_mos_brtvdev != 0);
657 	ASSERT(brtvd->bv_mos_entries != 0);
658 
659 	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
660 	VERIFY0(count);
661 	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
662 	BRT_DEBUG("MOS entries destroyed, object=%llu",
663 	    (u_longlong_t)brtvd->bv_mos_entries);
664 	brtvd->bv_mos_entries = 0;
665 
666 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
667 	bvphys = db->db_data;
668 	ASSERT0(bvphys->bvp_totalcount);
669 	ASSERT0(bvphys->bvp_usedspace);
670 	ASSERT0(bvphys->bvp_savedspace);
671 	dmu_buf_rele(db, FTAG);
672 
673 	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
674 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
675 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
676 	brtvd->bv_mos_brtvdev = 0;
677 
678 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
679 	    (u_longlong_t)brtvd->bv_vdevid);
680 	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
681 	BRT_DEBUG("Pool directory object removed, object=%s", name);
682 
683 	brt_vdev_dealloc(brt, brtvd);
684 
685 	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
686 }
687 
688 static void
689 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
690 {
691 	brt_vdev_t *brtvd, *vdevs;
692 	uint64_t vdevid;
693 
694 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
695 	ASSERT3U(nvdevs, >, brt->brt_nvdevs);
696 
697 	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
698 	if (brt->brt_nvdevs > 0) {
699 		ASSERT(brt->brt_vdevs != NULL);
700 
701 		memcpy(vdevs, brt->brt_vdevs,
702 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
703 		kmem_free(brt->brt_vdevs,
704 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
705 	}
706 	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
707 		brtvd = &vdevs[vdevid];
708 
709 		brtvd->bv_vdevid = vdevid;
710 		brtvd->bv_initiated = FALSE;
711 	}
712 
713 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
714 	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
715 
716 	brt->brt_vdevs = vdevs;
717 	brt->brt_nvdevs = nvdevs;
718 }
719 
720 static boolean_t
721 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
722 {
723 	uint64_t idx;
724 
725 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
726 
727 	idx = bre->bre_offset / brt->brt_rangesize;
728 	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
729 		/* VDEV wasn't expanded. */
730 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
731 	}
732 
733 	return (FALSE);
734 }
735 
736 static void
737 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
738     uint64_t dsize)
739 {
740 	uint64_t idx;
741 
742 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
743 	ASSERT(brtvd != NULL);
744 	ASSERT(brtvd->bv_entcount != NULL);
745 
746 	brt->brt_savedspace += dsize;
747 	brtvd->bv_savedspace += dsize;
748 	brtvd->bv_meta_dirty = TRUE;
749 
750 	if (bre->bre_refcount > 1) {
751 		return;
752 	}
753 
754 	brt->brt_usedspace += dsize;
755 	brtvd->bv_usedspace += dsize;
756 
757 	idx = bre->bre_offset / brt->brt_rangesize;
758 	if (idx >= brtvd->bv_size) {
759 		/* VDEV has been expanded. */
760 		brt_vdev_realloc(brt, brtvd);
761 	}
762 
763 	ASSERT3U(idx, <, brtvd->bv_size);
764 
765 	brtvd->bv_totalcount++;
766 	brt_vdev_entcount_inc(brtvd, idx);
767 	brtvd->bv_entcount_dirty = TRUE;
768 	idx = idx / BRT_BLOCKSIZE / 8;
769 	BT_SET(brtvd->bv_bitmap, idx);
770 
771 #ifdef ZFS_DEBUG
772 	brt_vdev_dump(brt);
773 #endif
774 }
775 
776 static void
777 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
778     uint64_t dsize)
779 {
780 	uint64_t idx;
781 
782 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
783 	ASSERT(brtvd != NULL);
784 	ASSERT(brtvd->bv_entcount != NULL);
785 
786 	brt->brt_savedspace -= dsize;
787 	brtvd->bv_savedspace -= dsize;
788 	brtvd->bv_meta_dirty = TRUE;
789 
790 	if (bre->bre_refcount > 0) {
791 		return;
792 	}
793 
794 	brt->brt_usedspace -= dsize;
795 	brtvd->bv_usedspace -= dsize;
796 
797 	idx = bre->bre_offset / brt->brt_rangesize;
798 	ASSERT3U(idx, <, brtvd->bv_size);
799 
800 	ASSERT(brtvd->bv_totalcount > 0);
801 	brtvd->bv_totalcount--;
802 	brt_vdev_entcount_dec(brtvd, idx);
803 	brtvd->bv_entcount_dirty = TRUE;
804 	idx = idx / BRT_BLOCKSIZE / 8;
805 	BT_SET(brtvd->bv_bitmap, idx);
806 
807 #ifdef ZFS_DEBUG
808 	brt_vdev_dump(brt);
809 #endif
810 }
811 
812 static void
813 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
814 {
815 	dmu_buf_t *db;
816 	brt_vdev_phys_t *bvphys;
817 
818 	ASSERT(brtvd->bv_meta_dirty);
819 	ASSERT(brtvd->bv_mos_brtvdev != 0);
820 	ASSERT(dmu_tx_is_syncing(tx));
821 
822 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
823 
824 	if (brtvd->bv_entcount_dirty) {
825 		/*
826 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
827 		 */
828 		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
829 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
830 		    brtvd->bv_entcount, tx);
831 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
832 		brtvd->bv_entcount_dirty = FALSE;
833 	}
834 
835 	dmu_buf_will_dirty(db, tx);
836 	bvphys = db->db_data;
837 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
838 	bvphys->bvp_size = brtvd->bv_size;
839 	if (brtvd->bv_need_byteswap) {
840 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
841 	} else {
842 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
843 	}
844 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
845 	bvphys->bvp_rangesize = brt->brt_rangesize;
846 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
847 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
848 	dmu_buf_rele(db, FTAG);
849 
850 	brtvd->bv_meta_dirty = FALSE;
851 }
852 
853 static void
854 brt_vdevs_alloc(brt_t *brt, boolean_t load)
855 {
856 	brt_vdev_t *brtvd;
857 	uint64_t vdevid;
858 
859 	brt_wlock(brt);
860 
861 	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
862 
863 	if (load) {
864 		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
865 			brtvd = &brt->brt_vdevs[vdevid];
866 			ASSERT(brtvd->bv_entcount == NULL);
867 
868 			brt_vdev_load(brt, brtvd);
869 		}
870 	}
871 
872 	if (brt->brt_rangesize == 0) {
873 		brt->brt_rangesize = BRT_RANGESIZE;
874 	}
875 
876 	brt_unlock(brt);
877 }
878 
879 static void
880 brt_vdevs_free(brt_t *brt)
881 {
882 	brt_vdev_t *brtvd;
883 	uint64_t vdevid;
884 
885 	brt_wlock(brt);
886 
887 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
888 		brtvd = &brt->brt_vdevs[vdevid];
889 		if (brtvd->bv_initiated)
890 			brt_vdev_dealloc(brt, brtvd);
891 	}
892 	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
893 
894 	brt_unlock(brt);
895 }
896 
897 static void
898 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
899 {
900 
901 	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
902 	bre->bre_refcount = 0;
903 
904 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
905 }
906 
907 static int
908 brt_entry_compare(const void *x1, const void *x2)
909 {
910 	const brt_entry_t *bre1 = x1;
911 	const brt_entry_t *bre2 = x2;
912 
913 	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
914 }
915 
916 static int
917 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
918 {
919 	uint64_t mos_entries;
920 	uint64_t one, physsize;
921 	int error;
922 
923 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
924 
925 	if (!brt_vdev_lookup(brt, brtvd, bre))
926 		return (SET_ERROR(ENOENT));
927 
928 	/*
929 	 * Remember mos_entries object number. After we reacquire the BRT lock,
930 	 * the brtvd pointer may be invalid.
931 	 */
932 	mos_entries = brtvd->bv_mos_entries;
933 	if (mos_entries == 0)
934 		return (SET_ERROR(ENOENT));
935 
936 	brt_unlock(brt);
937 
938 	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
939 	    BRT_KEY_WORDS, &one, &physsize);
940 	if (error == 0) {
941 		ASSERT3U(one, ==, 1);
942 		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
943 
944 		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
945 		    &bre->bre_offset, BRT_KEY_WORDS, 1,
946 		    sizeof (bre->bre_refcount), &bre->bre_refcount);
947 		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
948 		    "count=%llu error=%d", (u_longlong_t)mos_entries,
949 		    (u_longlong_t)brtvd->bv_vdevid,
950 		    (u_longlong_t)bre->bre_offset,
951 		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
952 	}
953 
954 	brt_wlock(brt);
955 
956 	return (error);
957 }
958 
959 static void
960 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
961 {
962 	brt_vdev_t *brtvd;
963 	uint64_t mos_entries = 0;
964 
965 	brt_rlock(brt);
966 	brtvd = brt_vdev(brt, vdevid);
967 	if (brtvd != NULL)
968 		mos_entries = brtvd->bv_mos_entries;
969 	brt_unlock(brt);
970 
971 	if (mos_entries == 0)
972 		return;
973 
974 	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
975 	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
976 	    (u_longlong_t)bre->bre_offset);
977 	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
978 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
979 }
980 
981 static int
982 brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
983 {
984 	int error;
985 
986 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
987 	ASSERT(brtvd->bv_mos_entries != 0);
988 	ASSERT(bre->bre_refcount > 0);
989 
990 	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
991 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
992 	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
993 	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
994 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
995 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
996 	    (u_longlong_t)bre->bre_refcount, error);
997 
998 	return (error);
999 }
1000 
1001 static int
1002 brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1003 {
1004 	int error;
1005 
1006 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
1007 	ASSERT(brtvd->bv_mos_entries != 0);
1008 	ASSERT0(bre->bre_refcount);
1009 
1010 	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
1011 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
1012 	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
1013 	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
1014 	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
1015 	    (u_longlong_t)bre->bre_refcount, error);
1016 
1017 	return (error);
1018 }
1019 
1020 /*
1021  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
1022  * positive, but gives us quick answer if we should look into BRT, which
1023  * may require reads and thus will be more expensive.
1024  */
1025 boolean_t
1026 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
1027 {
1028 	brt_t *brt = spa->spa_brt;
1029 	brt_vdev_t *brtvd;
1030 	brt_entry_t bre_search;
1031 	boolean_t mayexists = FALSE;
1032 	uint64_t vdevid;
1033 
1034 	brt_entry_fill(bp, &bre_search, &vdevid);
1035 
1036 	brt_rlock(brt);
1037 
1038 	brtvd = brt_vdev(brt, vdevid);
1039 	if (brtvd != NULL && brtvd->bv_initiated) {
1040 		if (!avl_is_empty(&brtvd->bv_tree) ||
1041 		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
1042 			mayexists = TRUE;
1043 		}
1044 	}
1045 
1046 	brt_unlock(brt);
1047 
1048 	return (mayexists);
1049 }
1050 
1051 uint64_t
1052 brt_get_dspace(spa_t *spa)
1053 {
1054 	brt_t *brt = spa->spa_brt;
1055 
1056 	if (brt == NULL)
1057 		return (0);
1058 
1059 	return (brt->brt_savedspace);
1060 }
1061 
1062 uint64_t
1063 brt_get_used(spa_t *spa)
1064 {
1065 	brt_t *brt = spa->spa_brt;
1066 
1067 	if (brt == NULL)
1068 		return (0);
1069 
1070 	return (brt->brt_usedspace);
1071 }
1072 
1073 uint64_t
1074 brt_get_saved(spa_t *spa)
1075 {
1076 	brt_t *brt = spa->spa_brt;
1077 
1078 	if (brt == NULL)
1079 		return (0);
1080 
1081 	return (brt->brt_savedspace);
1082 }
1083 
1084 uint64_t
1085 brt_get_ratio(spa_t *spa)
1086 {
1087 	brt_t *brt = spa->spa_brt;
1088 
1089 	if (brt->brt_usedspace == 0)
1090 		return (100);
1091 
1092 	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1093 	    brt->brt_usedspace);
1094 }
1095 
1096 static int
1097 brt_kstats_update(kstat_t *ksp, int rw)
1098 {
1099 	brt_stats_t *bs = ksp->ks_data;
1100 
1101 	if (rw == KSTAT_WRITE)
1102 		return (EACCES);
1103 
1104 	bs->brt_addref_entry_in_memory.value.ui64 =
1105 	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1106 	bs->brt_addref_entry_not_on_disk.value.ui64 =
1107 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1108 	bs->brt_addref_entry_on_disk.value.ui64 =
1109 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1110 	bs->brt_addref_entry_read_lost_race.value.ui64 =
1111 	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1112 	bs->brt_decref_entry_in_memory.value.ui64 =
1113 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1114 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1115 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1116 	bs->brt_decref_entry_not_in_memory.value.ui64 =
1117 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1118 	bs->brt_decref_entry_not_on_disk.value.ui64 =
1119 	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1120 	bs->brt_decref_entry_read_lost_race.value.ui64 =
1121 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1122 	bs->brt_decref_entry_still_referenced.value.ui64 =
1123 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1124 	bs->brt_decref_free_data_later.value.ui64 =
1125 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
1126 	bs->brt_decref_free_data_now.value.ui64 =
1127 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
1128 	bs->brt_decref_no_entry.value.ui64 =
1129 	    wmsum_value(&brt_sums.brt_decref_no_entry);
1130 
1131 	return (0);
1132 }
1133 
1134 static void
1135 brt_stat_init(void)
1136 {
1137 
1138 	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1139 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1140 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1141 	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1142 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1143 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1144 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1145 	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1146 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1147 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1148 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1149 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1150 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1151 
1152 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1153 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1154 	if (brt_ksp != NULL) {
1155 		brt_ksp->ks_data = &brt_stats;
1156 		brt_ksp->ks_update = brt_kstats_update;
1157 		kstat_install(brt_ksp);
1158 	}
1159 }
1160 
1161 static void
1162 brt_stat_fini(void)
1163 {
1164 	if (brt_ksp != NULL) {
1165 		kstat_delete(brt_ksp);
1166 		brt_ksp = NULL;
1167 	}
1168 
1169 	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1170 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1171 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1172 	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1173 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1174 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1175 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1176 	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1177 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1178 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1179 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1180 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1181 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1182 }
1183 
1184 void
1185 brt_init(void)
1186 {
1187 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1188 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1189 	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1190 	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1191 
1192 	brt_stat_init();
1193 }
1194 
1195 void
1196 brt_fini(void)
1197 {
1198 	brt_stat_fini();
1199 
1200 	kmem_cache_destroy(brt_entry_cache);
1201 	kmem_cache_destroy(brt_pending_entry_cache);
1202 }
1203 
1204 static brt_entry_t *
1205 brt_entry_alloc(const brt_entry_t *bre_init)
1206 {
1207 	brt_entry_t *bre;
1208 
1209 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1210 	bre->bre_offset = bre_init->bre_offset;
1211 	bre->bre_refcount = bre_init->bre_refcount;
1212 
1213 	return (bre);
1214 }
1215 
1216 static void
1217 brt_entry_free(brt_entry_t *bre)
1218 {
1219 
1220 	kmem_cache_free(brt_entry_cache, bre);
1221 }
1222 
1223 static void
1224 brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1225 {
1226 	brt_vdev_t *brtvd;
1227 	brt_entry_t *bre, *racebre;
1228 	brt_entry_t bre_search;
1229 	avl_index_t where;
1230 	uint64_t vdevid;
1231 	int error;
1232 
1233 	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1234 
1235 	brt_entry_fill(bp, &bre_search, &vdevid);
1236 
1237 	brt_wlock(brt);
1238 
1239 	brtvd = brt_vdev(brt, vdevid);
1240 	if (brtvd == NULL) {
1241 		ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1242 
1243 		/* New VDEV was added. */
1244 		brt_vdevs_expand(brt, vdevid + 1);
1245 		brtvd = brt_vdev(brt, vdevid);
1246 	}
1247 	ASSERT(brtvd != NULL);
1248 	if (!brtvd->bv_initiated)
1249 		brt_vdev_realloc(brt, brtvd);
1250 
1251 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1252 	if (bre != NULL) {
1253 		BRTSTAT_BUMP(brt_addref_entry_in_memory);
1254 	} else {
1255 		/*
1256 		 * brt_entry_lookup() may drop the BRT (read) lock and
1257 		 * reacquire it (write).
1258 		 */
1259 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1260 		/* bre_search now contains correct bre_refcount */
1261 		ASSERT(error == 0 || error == ENOENT);
1262 		if (error == 0)
1263 			BRTSTAT_BUMP(brt_addref_entry_on_disk);
1264 		else
1265 			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1266 		/*
1267 		 * When the BRT lock was dropped, brt_vdevs[] may have been
1268 		 * expanded and reallocated, we need to update brtvd's pointer.
1269 		 */
1270 		brtvd = brt_vdev(brt, vdevid);
1271 		ASSERT(brtvd != NULL);
1272 
1273 		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1274 		if (racebre == NULL) {
1275 			bre = brt_entry_alloc(&bre_search);
1276 			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1277 			avl_insert(&brtvd->bv_tree, bre, where);
1278 			brt->brt_nentries++;
1279 		} else {
1280 			/*
1281 			 * The entry was added when the BRT lock was dropped in
1282 			 * brt_entry_lookup().
1283 			 */
1284 			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1285 			bre = racebre;
1286 		}
1287 	}
1288 	bre->bre_refcount++;
1289 	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1290 
1291 	brt_unlock(brt);
1292 }
1293 
1294 /* Return TRUE if block should be freed immediately. */
1295 boolean_t
1296 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1297 {
1298 	brt_t *brt = spa->spa_brt;
1299 	brt_vdev_t *brtvd;
1300 	brt_entry_t *bre, *racebre;
1301 	brt_entry_t bre_search;
1302 	avl_index_t where;
1303 	uint64_t vdevid;
1304 	int error;
1305 
1306 	brt_entry_fill(bp, &bre_search, &vdevid);
1307 
1308 	brt_wlock(brt);
1309 
1310 	brtvd = brt_vdev(brt, vdevid);
1311 	ASSERT(brtvd != NULL);
1312 
1313 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1314 	if (bre != NULL) {
1315 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
1316 		goto out;
1317 	} else {
1318 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1319 	}
1320 
1321 	/*
1322 	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1323 	 */
1324 	error = brt_entry_lookup(brt, brtvd, &bre_search);
1325 	/* bre_search now contains correct bre_refcount */
1326 	ASSERT(error == 0 || error == ENOENT);
1327 	/*
1328 	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1329 	 * and reallocated, we need to update brtvd's pointer.
1330 	 */
1331 	brtvd = brt_vdev(brt, vdevid);
1332 	ASSERT(brtvd != NULL);
1333 
1334 	if (error == ENOENT) {
1335 		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1336 		bre = NULL;
1337 		goto out;
1338 	}
1339 
1340 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1341 	if (racebre != NULL) {
1342 		/*
1343 		 * The entry was added when the BRT lock was dropped in
1344 		 * brt_entry_lookup().
1345 		 */
1346 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1347 		bre = racebre;
1348 		goto out;
1349 	}
1350 
1351 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1352 	bre = brt_entry_alloc(&bre_search);
1353 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1354 	avl_insert(&brtvd->bv_tree, bre, where);
1355 	brt->brt_nentries++;
1356 
1357 out:
1358 	if (bre == NULL) {
1359 		/*
1360 		 * This is a free of a regular (not cloned) block.
1361 		 */
1362 		brt_unlock(brt);
1363 		BRTSTAT_BUMP(brt_decref_no_entry);
1364 		return (B_TRUE);
1365 	}
1366 	if (bre->bre_refcount == 0) {
1367 		brt_unlock(brt);
1368 		BRTSTAT_BUMP(brt_decref_free_data_now);
1369 		return (B_TRUE);
1370 	}
1371 
1372 	ASSERT(bre->bre_refcount > 0);
1373 	bre->bre_refcount--;
1374 	if (bre->bre_refcount == 0)
1375 		BRTSTAT_BUMP(brt_decref_free_data_later);
1376 	else
1377 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1378 	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1379 
1380 	brt_unlock(brt);
1381 
1382 	return (B_FALSE);
1383 }
1384 
1385 uint64_t
1386 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1387 {
1388 	brt_t *brt = spa->spa_brt;
1389 	brt_vdev_t *brtvd;
1390 	brt_entry_t bre_search, *bre;
1391 	uint64_t vdevid, refcnt;
1392 	int error;
1393 
1394 	brt_entry_fill(bp, &bre_search, &vdevid);
1395 
1396 	brt_rlock(brt);
1397 
1398 	brtvd = brt_vdev(brt, vdevid);
1399 	ASSERT(brtvd != NULL);
1400 
1401 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1402 	if (bre == NULL) {
1403 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1404 		ASSERT(error == 0 || error == ENOENT);
1405 		if (error == ENOENT)
1406 			refcnt = 0;
1407 		else
1408 			refcnt = bre_search.bre_refcount;
1409 	} else
1410 		refcnt = bre->bre_refcount;
1411 
1412 	brt_unlock(brt);
1413 	return (refcnt);
1414 }
1415 
1416 static void
1417 brt_prefetch(brt_t *brt, const blkptr_t *bp)
1418 {
1419 	brt_entry_t bre;
1420 	uint64_t vdevid;
1421 
1422 	ASSERT(bp != NULL);
1423 
1424 	if (!zfs_brt_prefetch)
1425 		return;
1426 
1427 	brt_entry_fill(bp, &bre, &vdevid);
1428 
1429 	brt_entry_prefetch(brt, vdevid, &bre);
1430 }
1431 
1432 static int
1433 brt_pending_entry_compare(const void *x1, const void *x2)
1434 {
1435 	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1436 	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1437 	int cmp;
1438 
1439 	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
1440 	if (cmp == 0) {
1441 		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1442 		    DVA_GET_VDEV(&bp2->blk_dva[0]));
1443 		if (cmp == 0) {
1444 			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1445 			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
1446 		}
1447 	}
1448 
1449 	return (cmp);
1450 }
1451 
1452 void
1453 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1454 {
1455 	brt_t *brt;
1456 	avl_tree_t *pending_tree;
1457 	kmutex_t *pending_lock;
1458 	brt_pending_entry_t *bpe, *newbpe;
1459 	avl_index_t where;
1460 	uint64_t txg;
1461 
1462 	brt = spa->spa_brt;
1463 	txg = dmu_tx_get_txg(tx);
1464 	ASSERT3U(txg, !=, 0);
1465 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1466 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1467 
1468 	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1469 	newbpe->bpe_bp = *bp;
1470 	newbpe->bpe_count = 1;
1471 
1472 	mutex_enter(pending_lock);
1473 
1474 	bpe = avl_find(pending_tree, newbpe, &where);
1475 	if (bpe == NULL) {
1476 		avl_insert(pending_tree, newbpe, where);
1477 		newbpe = NULL;
1478 	} else {
1479 		bpe->bpe_count++;
1480 	}
1481 
1482 	mutex_exit(pending_lock);
1483 
1484 	if (newbpe != NULL) {
1485 		ASSERT(bpe != NULL);
1486 		ASSERT(bpe != newbpe);
1487 		kmem_cache_free(brt_pending_entry_cache, newbpe);
1488 	} else {
1489 		ASSERT(bpe == NULL);
1490 	}
1491 
1492 	/* Prefetch BRT entry, as we will need it in the syncing context. */
1493 	brt_prefetch(brt, bp);
1494 }
1495 
1496 void
1497 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1498 {
1499 	brt_t *brt;
1500 	avl_tree_t *pending_tree;
1501 	kmutex_t *pending_lock;
1502 	brt_pending_entry_t *bpe, bpe_search;
1503 	uint64_t txg;
1504 
1505 	brt = spa->spa_brt;
1506 	txg = dmu_tx_get_txg(tx);
1507 	ASSERT3U(txg, !=, 0);
1508 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1509 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1510 
1511 	bpe_search.bpe_bp = *bp;
1512 
1513 	mutex_enter(pending_lock);
1514 
1515 	bpe = avl_find(pending_tree, &bpe_search, NULL);
1516 	/* I believe we should always find bpe when this function is called. */
1517 	if (bpe != NULL) {
1518 		ASSERT(bpe->bpe_count > 0);
1519 
1520 		bpe->bpe_count--;
1521 		if (bpe->bpe_count == 0) {
1522 			avl_remove(pending_tree, bpe);
1523 			kmem_cache_free(brt_pending_entry_cache, bpe);
1524 		}
1525 	}
1526 
1527 	mutex_exit(pending_lock);
1528 }
1529 
1530 void
1531 brt_pending_apply(spa_t *spa, uint64_t txg)
1532 {
1533 	brt_t *brt;
1534 	brt_pending_entry_t *bpe;
1535 	avl_tree_t *pending_tree;
1536 	kmutex_t *pending_lock;
1537 	void *c;
1538 
1539 	ASSERT3U(txg, !=, 0);
1540 
1541 	brt = spa->spa_brt;
1542 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1543 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1544 
1545 	mutex_enter(pending_lock);
1546 
1547 	c = NULL;
1548 	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1549 		boolean_t added_to_ddt;
1550 
1551 		mutex_exit(pending_lock);
1552 
1553 		for (int i = 0; i < bpe->bpe_count; i++) {
1554 			/*
1555 			 * If the block has DEDUP bit set, it means that it
1556 			 * already exists in the DEDUP table, so we can just
1557 			 * use that instead of creating new entry in
1558 			 * the BRT table.
1559 			 */
1560 			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1561 				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1562 			} else {
1563 				added_to_ddt = B_FALSE;
1564 			}
1565 			if (!added_to_ddt)
1566 				brt_entry_addref(brt, &bpe->bpe_bp);
1567 		}
1568 
1569 		kmem_cache_free(brt_pending_entry_cache, bpe);
1570 		mutex_enter(pending_lock);
1571 	}
1572 
1573 	mutex_exit(pending_lock);
1574 }
1575 
1576 static void
1577 brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1578 {
1579 
1580 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1581 	ASSERT(brtvd->bv_mos_entries != 0);
1582 
1583 	if (bre->bre_refcount == 0) {
1584 		int error;
1585 
1586 		error = brt_entry_remove(brt, brtvd, bre, tx);
1587 		ASSERT(error == 0 || error == ENOENT);
1588 		/*
1589 		 * If error == ENOENT then zfs_clone_range() was done from a
1590 		 * removed (but opened) file (open(), unlink()).
1591 		 */
1592 		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
1593 	} else {
1594 		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
1595 	}
1596 }
1597 
1598 static void
1599 brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1600 {
1601 	brt_vdev_t *brtvd;
1602 	brt_entry_t *bre;
1603 	uint64_t vdevid;
1604 	void *c;
1605 
1606 	brt_wlock(brt);
1607 
1608 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1609 		brtvd = &brt->brt_vdevs[vdevid];
1610 
1611 		if (!brtvd->bv_initiated)
1612 			continue;
1613 
1614 		if (!brtvd->bv_meta_dirty) {
1615 			ASSERT(!brtvd->bv_entcount_dirty);
1616 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1617 			continue;
1618 		}
1619 
1620 		ASSERT(!brtvd->bv_entcount_dirty ||
1621 		    avl_numnodes(&brtvd->bv_tree) != 0);
1622 
1623 		if (brtvd->bv_mos_brtvdev == 0)
1624 			brt_vdev_create(brt, brtvd, tx);
1625 
1626 		c = NULL;
1627 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1628 			brt_sync_entry(brt, brtvd, bre, tx);
1629 			brt_entry_free(bre);
1630 			ASSERT(brt->brt_nentries > 0);
1631 			brt->brt_nentries--;
1632 		}
1633 
1634 		brt_vdev_sync(brt, brtvd, tx);
1635 
1636 		if (brtvd->bv_totalcount == 0)
1637 			brt_vdev_destroy(brt, brtvd, tx);
1638 	}
1639 
1640 	ASSERT0(brt->brt_nentries);
1641 
1642 	brt_unlock(brt);
1643 }
1644 
1645 void
1646 brt_sync(spa_t *spa, uint64_t txg)
1647 {
1648 	dmu_tx_t *tx;
1649 	brt_t *brt;
1650 
1651 	ASSERT(spa_syncing_txg(spa) == txg);
1652 
1653 	brt = spa->spa_brt;
1654 	brt_rlock(brt);
1655 	if (brt->brt_nentries == 0) {
1656 		/* No changes. */
1657 		brt_unlock(brt);
1658 		return;
1659 	}
1660 	brt_unlock(brt);
1661 
1662 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1663 
1664 	brt_sync_table(brt, tx);
1665 
1666 	dmu_tx_commit(tx);
1667 }
1668 
1669 static void
1670 brt_table_alloc(brt_t *brt)
1671 {
1672 
1673 	for (int i = 0; i < TXG_SIZE; i++) {
1674 		avl_create(&brt->brt_pending_tree[i],
1675 		    brt_pending_entry_compare,
1676 		    sizeof (brt_pending_entry_t),
1677 		    offsetof(brt_pending_entry_t, bpe_node));
1678 		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1679 		    NULL);
1680 	}
1681 }
1682 
1683 static void
1684 brt_table_free(brt_t *brt)
1685 {
1686 
1687 	for (int i = 0; i < TXG_SIZE; i++) {
1688 		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1689 
1690 		avl_destroy(&brt->brt_pending_tree[i]);
1691 		mutex_destroy(&brt->brt_pending_lock[i]);
1692 	}
1693 }
1694 
1695 static void
1696 brt_alloc(spa_t *spa)
1697 {
1698 	brt_t *brt;
1699 
1700 	ASSERT(spa->spa_brt == NULL);
1701 
1702 	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1703 	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1704 	brt->brt_spa = spa;
1705 	brt->brt_rangesize = 0;
1706 	brt->brt_nentries = 0;
1707 	brt->brt_vdevs = NULL;
1708 	brt->brt_nvdevs = 0;
1709 	brt_table_alloc(brt);
1710 
1711 	spa->spa_brt = brt;
1712 }
1713 
1714 void
1715 brt_create(spa_t *spa)
1716 {
1717 
1718 	brt_alloc(spa);
1719 	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1720 }
1721 
1722 int
1723 brt_load(spa_t *spa)
1724 {
1725 
1726 	brt_alloc(spa);
1727 	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1728 
1729 	return (0);
1730 }
1731 
1732 void
1733 brt_unload(spa_t *spa)
1734 {
1735 	brt_t *brt = spa->spa_brt;
1736 
1737 	if (brt == NULL)
1738 		return;
1739 
1740 	brt_vdevs_free(brt);
1741 	brt_table_free(brt);
1742 	rw_destroy(&brt->brt_lock);
1743 	kmem_free(brt, sizeof (*brt));
1744 	spa->spa_brt = NULL;
1745 }
1746 
1747 /* BEGIN CSTYLED */
1748 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
1749     "Enable prefetching of BRT entries");
1750 #ifdef ZFS_BRT_DEBUG
1751 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
1752 #endif
1753 /* END CSTYLED */
1754