xref: /freebsd/sys/contrib/openzfs/module/zfs/brt.c (revision 2f9966ff63d65bd474478888c9088eeae3f9c669)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42 
43 /*
44  * Block Cloning design.
45  *
46  * Block Cloning allows to manually clone a file (or a subset of its blocks)
47  * into another (or the same) file by just creating additional references to
48  * the data blocks without copying the data itself. Those references are kept
49  * in the Block Reference Tables (BRTs).
50  *
51  * In many ways this is similar to the existing deduplication, but there are
52  * some important differences:
53  *
54  * - Deduplication is automatic and Block Cloning is not - one has to use a
55  *   dedicated system call(s) to clone the given file/blocks.
56  * - Deduplication keeps all data blocks in its table, even those referenced
57  *   just once. Block Cloning creates an entry in its tables only when there
58  *   are at least two references to the given data block. If the block was
59  *   never explicitly cloned or the second to last reference was dropped,
60  *   there will be neither space nor performance overhead.
61  * - Deduplication needs data to work - one needs to pass real data to the
62  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63  *   data, just block pointers to the data, so it is extremely fast, as we pay
64  *   neither the cost of reading the data, nor the cost of writing the data -
65  *   we operate exclusively on metadata.
66  * - If the D (dedup) bit is not set in the block pointer, it means that
67  *   the block is not in the dedup table (DDT) and we won't consult the DDT
68  *   when we need to free the block. Block Cloning must be consulted on every
69  *   free, because we cannot modify the source BP (eg. by setting something
70  *   similar to the D bit), thus we have no hint if the block is in the
71  *   Block Reference Table (BRT), so we need to look into the BRT. There is
72  *   an optimization in place that allows us to eliminate the majority of BRT
73  *   lookups which is described below in the "Minimizing free penalty" section.
74  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75  *   64bit offset and 64bit reference counter.
76  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77  *   other on disk are most likely in totally different parts of the DDT.
78  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79  *   from one file should have BRT entries close to each other.
80  * - Scrub will only do a single pass over a block that is referenced multiple
81  *   times in the DDT. Unfortunately it is not currently (if at all) possible
82  *   with Block Cloning and block referenced multiple times will be scrubbed
83  *   multiple times. The new, sorted scrub should be able to eliminate
84  *   duplicated reads given enough memory.
85  * - Deduplication requires cryptographically strong hash as a checksum or
86  *   additional data verification. Block Cloning works with any checksum
87  *   algorithm or even with checksumming disabled.
88  *
89  * As mentioned above, the BRT entries are much smaller than the DDT entries.
90  * To uniquely identify a block we just need its vdev id and offset. We also
91  * need to maintain a reference counter. The vdev id will often repeat, as there
92  * is a small number of top-level VDEVs and a large number of blocks stored in
93  * each VDEV. We take advantage of that to reduce the BRT entry size further by
94  * maintaining one BRT for each top-level VDEV, so we can then have only offset
95  * and counter as the BRT entry.
96  *
97  * Minimizing free penalty.
98  *
99  * Block Cloning allows creating additional references to any existing block.
100  * When we free a block there is no hint in the block pointer whether the block
101  * was cloned or not, so on each free we have to check if there is a
102  * corresponding entry in the BRT or not. If there is, we need to decrease
103  * the reference counter. Doing BRT lookup on every free can potentially be
104  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105  * This is the main problem with deduplication, so we've learned our lesson and
106  * try not to repeat the same mistake here. How do we do that? We divide each
107  * top-level VDEV into 16MB regions. For each region we maintain a counter that
108  * is a sum of all the BRT entries that have offsets within the region. This
109  * creates the entries count array of 16bit numbers for each top-level VDEV.
110  * The entries count array is always kept in memory and updated on disk in the
111  * same transaction group as the BRT updates to keep everything in-sync. We can
112  * keep the array in memory, because it is very small. With 16MB regions and
113  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114  * the region size even further in the future). Now, when we want to free
115  * a block, we first consult the array. If the counter for the whole region is
116  * zero, there is no need to look for the BRT entry, as there isn't one for
117  * sure. If the counter for the region is greater than zero, only then we will
118  * do a BRT lookup and if an entry is found we will decrease the reference
119  * counter in the BRT entry and in the entry counters array.
120  *
121  * The entry counters array is small, but can potentially be larger for very
122  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123  * array on every change. We then divide the array into 32kB block and keep
124  * a bitmap of dirty blocks within a transaction group. When we sync the
125  * transaction group we can only update the parts of the entry counters array
126  * that were modified. Note: Keeping track of the dirty parts of the entry
127  * counters array is implemented, but updating only parts of the array on disk
128  * is not yet implemented - for now we will update entire array if there was
129  * any change.
130  *
131  * The implementation tries to be economic: if BRT is not used, or no longer
132  * used, there will be no entries in the MOS and no additional memory used (eg.
133  * the entry counters array is only allocated if needed).
134  *
135  * Interaction between Deduplication and Block Cloning.
136  *
137  * If both functionalities are in use, we could end up with a block that is
138  * referenced multiple times in both DDT and BRT. When we free one of the
139  * references we couldn't tell where it belongs, so we would have to decide
140  * what table takes the precedence: do we first clear DDT references or BRT
141  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143  * lookup DDT entry instead and increase the counter there. No BRT entry
144  * will be created for a block which has the D (dedup) bit set.
145  * BRT may be more efficient for manual deduplication, but if the block is
146  * already in the DDT, then creating additional BRT entry would be less
147  * efficient. This clever idea was proposed by Allan Jude.
148  *
149  * Block Cloning across datasets.
150  *
151  * Block Cloning is not limited to cloning blocks within the same dataset.
152  * It is possible (and very useful) to clone blocks between different datasets.
153  * One use case is recovering files from snapshots. By cloning the files into
154  * dataset we need no additional storage. Without Block Cloning we would need
155  * additional space for those files.
156  * Another interesting use case is moving the files between datasets
157  * (copying the file content to the new dataset and removing the source file).
158  * In that case Block Cloning will only be used briefly, because the BRT entries
159  * will be removed when the source is removed.
160  * Block Cloning across encrypted datasets is supported as long as both
161  * datasets share the same master key (e.g. snapshots and clones)
162  *
163  * Block Cloning flow through ZFS layers.
164  *
165  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
166  * blocks. As of this writing no interface is implemented that allows for block
167  * cloning within a ZVOL.
168  * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
169  * for blocking cloning.
170  *
171  *	ssize_t
172  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
173  *	                size_t len, unsigned int flags);
174  *
175  * Even though offsets and length represent bytes, they have to be
176  * block-aligned or we will return an error so the upper layer can
177  * fallback to the generic mechanism that will just copy the data.
178  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
179  * This function was implemented based on zfs_write(), but instead of writing
180  * the given data we first read block pointers using the new dmu_read_l0_bps()
181  * function from the source file. Once we have BPs from the source file we call
182  * the dmu_brt_clone() function on the destination file. This function
183  * allocates BPs for us. We iterate over all source BPs. If the given BP is
184  * a hole or an embedded block, we just copy BP as-is. If it points to a real
185  * data we place this BP on a BRT pending list using the brt_pending_add()
186  * function.
187  *
188  * We use this pending list to keep track of all BPs that got new references
189  * within this transaction group.
190  *
191  * Some special cases to consider and how we address them:
192  * - The block we want to clone may have been created within the same
193  *   transaction group that we are trying to clone. Such block has no BP
194  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
195  * - The block we want to clone may have been modified within the same
196  *   transaction group. We return EAGAIN.
197  * - A block may be cloned multiple times during one transaction group (that's
198  *   why pending list is actually a tree and not an append-only list - this
199  *   way we can figure out faster if this block is cloned for the first time
200  *   in this txg or consecutive time).
201  * - A block may be cloned and freed within the same transaction group
202  *   (see dbuf_undirty()).
203  * - A block may be cloned and within the same transaction group the clone
204  *   can be cloned again (see dmu_read_l0_bps()).
205  * - A file might have been deleted, but the caller still has a file descriptor
206  *   open to this file and clones it.
207  *
208  * When we free a block we have an additional step in the ZIO pipeline where we
209  * call the zio_brt_free() function. We then call the brt_entry_decref()
210  * that loads the corresponding BRT entry (if one exists) and decreases
211  * reference counter. If this is not the last reference we will stop ZIO
212  * pipeline here. If this is the last reference or the block is not in the
213  * BRT, we continue the pipeline and free the block as usual.
214  *
215  * At the beginning of spa_sync() where there can be no more block cloning,
216  * but before issuing frees we call brt_pending_apply(). This function applies
217  * all the new clones to the BRT table - we load BRT entries and update
218  * reference counters. To sync new BRT entries to disk, we use brt_sync()
219  * function. This function will sync all dirty per-top-level-vdev BRTs,
220  * the entry counters arrays, etc.
221  *
222  * Block Cloning and ZIL.
223  *
224  * Every clone operation is divided into chunks (similar to write) and each
225  * chunk is cloned in a separate transaction. The chunk size is determined by
226  * how many BPs we can fit into a single ZIL entry.
227  * Replaying clone operation is different from the regular clone operation,
228  * as when we log clone operations we cannot use the source object - it may
229  * reside on a different dataset, so we log BPs we want to clone.
230  * The ZIL is replayed when we mount the given dataset, not when the pool is
231  * imported. Taking this into account it is possible that the pool is imported
232  * without mounting datasets and the source dataset is destroyed before the
233  * destination dataset is mounted and its ZIL replayed.
234  * To address this situation we leverage zil_claim() mechanism where ZFS will
235  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236  * entries, we will bump reference counters for their BPs in the BRT.  Then
237  * on mount and ZIL replay we bump the reference counters once more, while the
238  * first references are dropped during ZIL destroy by zil_free_clone_range().
239  * It is possible that after zil_claim() we never mount the destination, so
240  * we never replay its ZIL and just destroy it.  In this case the only taken
241  * references will be dropped by zil_free_clone_range(), since the cloning is
242  * not going to ever take place.
243  */
244 
245 static kmem_cache_t *brt_entry_cache;
246 static kmem_cache_t *brt_pending_entry_cache;
247 
248 /*
249  * Enable/disable prefetching of BRT entries that we are going to modify.
250  */
251 static int brt_zap_prefetch = 1;
252 
253 #ifdef ZFS_DEBUG
254 #define	BRT_DEBUG(...)	do {						\
255 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
256 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
257 	}								\
258 } while (0)
259 #else
260 #define	BRT_DEBUG(...)	do { } while (0)
261 #endif
262 
263 static int brt_zap_default_bs = 12;
264 static int brt_zap_default_ibs = 12;
265 
266 static kstat_t	*brt_ksp;
267 
268 typedef struct brt_stats {
269 	kstat_named_t brt_addref_entry_in_memory;
270 	kstat_named_t brt_addref_entry_not_on_disk;
271 	kstat_named_t brt_addref_entry_on_disk;
272 	kstat_named_t brt_addref_entry_read_lost_race;
273 	kstat_named_t brt_decref_entry_in_memory;
274 	kstat_named_t brt_decref_entry_loaded_from_disk;
275 	kstat_named_t brt_decref_entry_not_in_memory;
276 	kstat_named_t brt_decref_entry_not_on_disk;
277 	kstat_named_t brt_decref_entry_read_lost_race;
278 	kstat_named_t brt_decref_entry_still_referenced;
279 	kstat_named_t brt_decref_free_data_later;
280 	kstat_named_t brt_decref_free_data_now;
281 	kstat_named_t brt_decref_no_entry;
282 } brt_stats_t;
283 
284 static brt_stats_t brt_stats = {
285 	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
286 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
287 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
288 	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
289 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
290 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
291 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
292 	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
293 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
294 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
295 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
296 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
297 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
298 };
299 
300 struct {
301 	wmsum_t brt_addref_entry_in_memory;
302 	wmsum_t brt_addref_entry_not_on_disk;
303 	wmsum_t brt_addref_entry_on_disk;
304 	wmsum_t brt_addref_entry_read_lost_race;
305 	wmsum_t brt_decref_entry_in_memory;
306 	wmsum_t brt_decref_entry_loaded_from_disk;
307 	wmsum_t brt_decref_entry_not_in_memory;
308 	wmsum_t brt_decref_entry_not_on_disk;
309 	wmsum_t brt_decref_entry_read_lost_race;
310 	wmsum_t brt_decref_entry_still_referenced;
311 	wmsum_t brt_decref_free_data_later;
312 	wmsum_t brt_decref_free_data_now;
313 	wmsum_t brt_decref_no_entry;
314 } brt_sums;
315 
316 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
317 
318 static int brt_entry_compare(const void *x1, const void *x2);
319 static int brt_pending_entry_compare(const void *x1, const void *x2);
320 
321 static void
322 brt_rlock(brt_t *brt)
323 {
324 	rw_enter(&brt->brt_lock, RW_READER);
325 }
326 
327 static void
328 brt_wlock(brt_t *brt)
329 {
330 	rw_enter(&brt->brt_lock, RW_WRITER);
331 }
332 
333 static void
334 brt_unlock(brt_t *brt)
335 {
336 	rw_exit(&brt->brt_lock);
337 }
338 
339 static uint16_t
340 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
341 {
342 
343 	ASSERT3U(idx, <, brtvd->bv_size);
344 
345 	if (unlikely(brtvd->bv_need_byteswap)) {
346 		return (BSWAP_16(brtvd->bv_entcount[idx]));
347 	} else {
348 		return (brtvd->bv_entcount[idx]);
349 	}
350 }
351 
352 static void
353 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
354 {
355 
356 	ASSERT3U(idx, <, brtvd->bv_size);
357 
358 	if (unlikely(brtvd->bv_need_byteswap)) {
359 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
360 	} else {
361 		brtvd->bv_entcount[idx] = entcnt;
362 	}
363 }
364 
365 static void
366 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
367 {
368 	uint16_t entcnt;
369 
370 	ASSERT3U(idx, <, brtvd->bv_size);
371 
372 	entcnt = brt_vdev_entcount_get(brtvd, idx);
373 	ASSERT(entcnt < UINT16_MAX);
374 
375 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
376 }
377 
378 static void
379 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
380 {
381 	uint16_t entcnt;
382 
383 	ASSERT3U(idx, <, brtvd->bv_size);
384 
385 	entcnt = brt_vdev_entcount_get(brtvd, idx);
386 	ASSERT(entcnt > 0);
387 
388 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
389 }
390 
391 #ifdef ZFS_DEBUG
392 static void
393 brt_vdev_dump(brt_vdev_t *brtvd)
394 {
395 	uint64_t idx;
396 
397 	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
398 	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
399 	    (u_longlong_t)brtvd->bv_vdevid,
400 	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
401 	    (u_longlong_t)brtvd->bv_size,
402 	    (u_longlong_t)brtvd->bv_totalcount,
403 	    (u_longlong_t)brtvd->bv_nblocks,
404 	    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
405 	if (brtvd->bv_totalcount > 0) {
406 		zfs_dbgmsg("    entcounts:");
407 		for (idx = 0; idx < brtvd->bv_size; idx++) {
408 			uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
409 			if (entcnt > 0) {
410 				zfs_dbgmsg("      [%04llu] %hu",
411 				    (u_longlong_t)idx, entcnt);
412 			}
413 		}
414 	}
415 	if (brtvd->bv_entcount_dirty) {
416 		char *bitmap;
417 
418 		bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
419 		for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
420 			bitmap[idx] =
421 			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
422 		}
423 		bitmap[idx] = '\0';
424 		zfs_dbgmsg("    dirty: %s", bitmap);
425 		kmem_free(bitmap, brtvd->bv_nblocks + 1);
426 	}
427 }
428 #endif
429 
430 static brt_vdev_t *
431 brt_vdev(brt_t *brt, uint64_t vdevid)
432 {
433 	brt_vdev_t *brtvd;
434 
435 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
436 
437 	if (vdevid < brt->brt_nvdevs) {
438 		brtvd = &brt->brt_vdevs[vdevid];
439 	} else {
440 		brtvd = NULL;
441 	}
442 
443 	return (brtvd);
444 }
445 
446 static void
447 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
448 {
449 	char name[64];
450 
451 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
452 	ASSERT0(brtvd->bv_mos_brtvdev);
453 	ASSERT0(brtvd->bv_mos_entries);
454 	ASSERT(brtvd->bv_entcount != NULL);
455 	ASSERT(brtvd->bv_size > 0);
456 	ASSERT(brtvd->bv_bitmap != NULL);
457 	ASSERT(brtvd->bv_nblocks > 0);
458 
459 	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
460 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
461 	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
462 	VERIFY(brtvd->bv_mos_entries != 0);
463 	BRT_DEBUG("MOS entries created, object=%llu",
464 	    (u_longlong_t)brtvd->bv_mos_entries);
465 
466 	/*
467 	 * We allocate DMU buffer to store the bv_entcount[] array.
468 	 * We will keep array size (bv_size) and cummulative count for all
469 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
470 	 */
471 	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
472 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
473 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
474 	VERIFY(brtvd->bv_mos_brtvdev != 0);
475 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
476 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
477 
478 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
479 	    (u_longlong_t)brtvd->bv_vdevid);
480 	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
481 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
482 	BRT_DEBUG("Pool directory object created, object=%s", name);
483 
484 	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
485 }
486 
487 static void
488 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
489 {
490 	vdev_t *vd;
491 	uint16_t *entcount;
492 	ulong_t *bitmap;
493 	uint64_t nblocks, size;
494 
495 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
496 
497 	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
498 	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
499 	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
500 	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
501 
502 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
503 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
504 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
505 
506 	if (!brtvd->bv_initiated) {
507 		ASSERT0(brtvd->bv_size);
508 		ASSERT(brtvd->bv_entcount == NULL);
509 		ASSERT(brtvd->bv_bitmap == NULL);
510 		ASSERT0(brtvd->bv_nblocks);
511 
512 		avl_create(&brtvd->bv_tree, brt_entry_compare,
513 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
514 	} else {
515 		ASSERT(brtvd->bv_size > 0);
516 		ASSERT(brtvd->bv_entcount != NULL);
517 		ASSERT(brtvd->bv_bitmap != NULL);
518 		ASSERT(brtvd->bv_nblocks > 0);
519 		/*
520 		 * TODO: Allow vdev shrinking. We only need to implement
521 		 * shrinking the on-disk BRT VDEV object.
522 		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
523 		 *     size, tx);
524 		 */
525 		ASSERT3U(brtvd->bv_size, <=, size);
526 
527 		memcpy(entcount, brtvd->bv_entcount,
528 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
529 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
530 		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
531 		vmem_free(brtvd->bv_entcount,
532 		    sizeof (entcount[0]) * brtvd->bv_size);
533 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
534 	}
535 
536 	brtvd->bv_size = size;
537 	brtvd->bv_entcount = entcount;
538 	brtvd->bv_bitmap = bitmap;
539 	brtvd->bv_nblocks = nblocks;
540 	if (!brtvd->bv_initiated) {
541 		brtvd->bv_need_byteswap = FALSE;
542 		brtvd->bv_initiated = TRUE;
543 		BRT_DEBUG("BRT VDEV %llu initiated.",
544 		    (u_longlong_t)brtvd->bv_vdevid);
545 	}
546 }
547 
548 static void
549 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
550 {
551 	char name[64];
552 	dmu_buf_t *db;
553 	brt_vdev_phys_t *bvphys;
554 	int error;
555 
556 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
557 	    (u_longlong_t)brtvd->bv_vdevid);
558 	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
559 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
560 	if (error != 0)
561 		return;
562 	ASSERT(brtvd->bv_mos_brtvdev != 0);
563 
564 	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
565 	ASSERT0(error);
566 	if (error != 0)
567 		return;
568 
569 	bvphys = db->db_data;
570 	if (brt->brt_rangesize == 0) {
571 		brt->brt_rangesize = bvphys->bvp_rangesize;
572 	} else {
573 		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
574 	}
575 
576 	ASSERT(!brtvd->bv_initiated);
577 	brt_vdev_realloc(brt, brtvd);
578 
579 	/* TODO: We don't support VDEV shrinking. */
580 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
581 
582 	/*
583 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
584 	 */
585 	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
586 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
587 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
588 	ASSERT0(error);
589 
590 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
591 	ASSERT(brtvd->bv_mos_entries != 0);
592 	brtvd->bv_need_byteswap =
593 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
594 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
595 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
596 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
597 	brt->brt_usedspace += brtvd->bv_usedspace;
598 	brt->brt_savedspace += brtvd->bv_savedspace;
599 
600 	dmu_buf_rele(db, FTAG);
601 
602 	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
603 	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
604 	    (u_longlong_t)brtvd->bv_mos_entries);
605 }
606 
607 static void
608 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
609 {
610 
611 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
612 	ASSERT(brtvd->bv_initiated);
613 
614 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
615 	brtvd->bv_entcount = NULL;
616 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
617 	brtvd->bv_bitmap = NULL;
618 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
619 	avl_destroy(&brtvd->bv_tree);
620 
621 	brtvd->bv_size = 0;
622 	brtvd->bv_nblocks = 0;
623 
624 	brtvd->bv_initiated = FALSE;
625 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
626 }
627 
628 static void
629 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
630 {
631 	char name[64];
632 	uint64_t count;
633 	dmu_buf_t *db;
634 	brt_vdev_phys_t *bvphys;
635 
636 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
637 	ASSERT(brtvd->bv_mos_brtvdev != 0);
638 	ASSERT(brtvd->bv_mos_entries != 0);
639 
640 	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
641 	VERIFY0(count);
642 	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
643 	BRT_DEBUG("MOS entries destroyed, object=%llu",
644 	    (u_longlong_t)brtvd->bv_mos_entries);
645 	brtvd->bv_mos_entries = 0;
646 
647 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
648 	bvphys = db->db_data;
649 	ASSERT0(bvphys->bvp_totalcount);
650 	ASSERT0(bvphys->bvp_usedspace);
651 	ASSERT0(bvphys->bvp_savedspace);
652 	dmu_buf_rele(db, FTAG);
653 
654 	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
655 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
656 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
657 	brtvd->bv_mos_brtvdev = 0;
658 
659 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
660 	    (u_longlong_t)brtvd->bv_vdevid);
661 	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
662 	BRT_DEBUG("Pool directory object removed, object=%s", name);
663 
664 	brt_vdev_dealloc(brt, brtvd);
665 
666 	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
667 }
668 
669 static void
670 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
671 {
672 	brt_vdev_t *brtvd, *vdevs;
673 	uint64_t vdevid;
674 
675 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
676 	ASSERT3U(nvdevs, >, brt->brt_nvdevs);
677 
678 	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
679 	if (brt->brt_nvdevs > 0) {
680 		ASSERT(brt->brt_vdevs != NULL);
681 
682 		memcpy(vdevs, brt->brt_vdevs,
683 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
684 		kmem_free(brt->brt_vdevs,
685 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
686 	}
687 	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
688 		brtvd = &vdevs[vdevid];
689 
690 		brtvd->bv_vdevid = vdevid;
691 		brtvd->bv_initiated = FALSE;
692 	}
693 
694 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
695 	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
696 
697 	brt->brt_vdevs = vdevs;
698 	brt->brt_nvdevs = nvdevs;
699 }
700 
701 static boolean_t
702 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
703 {
704 	uint64_t idx;
705 
706 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
707 
708 	idx = bre->bre_offset / brt->brt_rangesize;
709 	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
710 		/* VDEV wasn't expanded. */
711 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
712 	}
713 
714 	return (FALSE);
715 }
716 
717 static void
718 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
719     uint64_t dsize)
720 {
721 	uint64_t idx;
722 
723 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
724 	ASSERT(brtvd != NULL);
725 	ASSERT(brtvd->bv_entcount != NULL);
726 
727 	brt->brt_savedspace += dsize;
728 	brtvd->bv_savedspace += dsize;
729 	brtvd->bv_meta_dirty = TRUE;
730 
731 	if (bre->bre_refcount > 1) {
732 		return;
733 	}
734 
735 	brt->brt_usedspace += dsize;
736 	brtvd->bv_usedspace += dsize;
737 
738 	idx = bre->bre_offset / brt->brt_rangesize;
739 	if (idx >= brtvd->bv_size) {
740 		/* VDEV has been expanded. */
741 		brt_vdev_realloc(brt, brtvd);
742 	}
743 
744 	ASSERT3U(idx, <, brtvd->bv_size);
745 
746 	brtvd->bv_totalcount++;
747 	brt_vdev_entcount_inc(brtvd, idx);
748 	brtvd->bv_entcount_dirty = TRUE;
749 	idx = idx / BRT_BLOCKSIZE / 8;
750 	BT_SET(brtvd->bv_bitmap, idx);
751 
752 #ifdef ZFS_DEBUG
753 	if (zfs_flags & ZFS_DEBUG_BRT)
754 		brt_vdev_dump(brtvd);
755 #endif
756 }
757 
758 static void
759 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
760     uint64_t dsize)
761 {
762 	uint64_t idx;
763 
764 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
765 	ASSERT(brtvd != NULL);
766 	ASSERT(brtvd->bv_entcount != NULL);
767 
768 	brt->brt_savedspace -= dsize;
769 	brtvd->bv_savedspace -= dsize;
770 	brtvd->bv_meta_dirty = TRUE;
771 
772 	if (bre->bre_refcount > 0) {
773 		return;
774 	}
775 
776 	brt->brt_usedspace -= dsize;
777 	brtvd->bv_usedspace -= dsize;
778 
779 	idx = bre->bre_offset / brt->brt_rangesize;
780 	ASSERT3U(idx, <, brtvd->bv_size);
781 
782 	ASSERT(brtvd->bv_totalcount > 0);
783 	brtvd->bv_totalcount--;
784 	brt_vdev_entcount_dec(brtvd, idx);
785 	brtvd->bv_entcount_dirty = TRUE;
786 	idx = idx / BRT_BLOCKSIZE / 8;
787 	BT_SET(brtvd->bv_bitmap, idx);
788 
789 #ifdef ZFS_DEBUG
790 	if (zfs_flags & ZFS_DEBUG_BRT)
791 		brt_vdev_dump(brtvd);
792 #endif
793 }
794 
795 static void
796 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
797 {
798 	dmu_buf_t *db;
799 	brt_vdev_phys_t *bvphys;
800 
801 	ASSERT(brtvd->bv_meta_dirty);
802 	ASSERT(brtvd->bv_mos_brtvdev != 0);
803 	ASSERT(dmu_tx_is_syncing(tx));
804 
805 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
806 
807 	if (brtvd->bv_entcount_dirty) {
808 		/*
809 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
810 		 */
811 		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
812 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
813 		    brtvd->bv_entcount, tx);
814 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
815 		brtvd->bv_entcount_dirty = FALSE;
816 	}
817 
818 	dmu_buf_will_dirty(db, tx);
819 	bvphys = db->db_data;
820 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
821 	bvphys->bvp_size = brtvd->bv_size;
822 	if (brtvd->bv_need_byteswap) {
823 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
824 	} else {
825 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
826 	}
827 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
828 	bvphys->bvp_rangesize = brt->brt_rangesize;
829 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
830 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
831 	dmu_buf_rele(db, FTAG);
832 
833 	brtvd->bv_meta_dirty = FALSE;
834 }
835 
836 static void
837 brt_vdevs_alloc(brt_t *brt, boolean_t load)
838 {
839 	brt_vdev_t *brtvd;
840 	uint64_t vdevid;
841 
842 	brt_wlock(brt);
843 
844 	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
845 
846 	if (load) {
847 		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
848 			brtvd = &brt->brt_vdevs[vdevid];
849 			ASSERT(brtvd->bv_entcount == NULL);
850 
851 			brt_vdev_load(brt, brtvd);
852 		}
853 	}
854 
855 	if (brt->brt_rangesize == 0) {
856 		brt->brt_rangesize = BRT_RANGESIZE;
857 	}
858 
859 	brt_unlock(brt);
860 }
861 
862 static void
863 brt_vdevs_free(brt_t *brt)
864 {
865 	brt_vdev_t *brtvd;
866 	uint64_t vdevid;
867 
868 	brt_wlock(brt);
869 
870 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
871 		brtvd = &brt->brt_vdevs[vdevid];
872 		if (brtvd->bv_initiated)
873 			brt_vdev_dealloc(brt, brtvd);
874 	}
875 	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
876 
877 	brt_unlock(brt);
878 }
879 
880 static void
881 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
882 {
883 
884 	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
885 	bre->bre_refcount = 0;
886 
887 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
888 }
889 
890 static int
891 brt_entry_compare(const void *x1, const void *x2)
892 {
893 	const brt_entry_t *bre1 = x1;
894 	const brt_entry_t *bre2 = x2;
895 
896 	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
897 }
898 
899 static int
900 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
901 {
902 	uint64_t mos_entries;
903 	int error;
904 
905 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
906 
907 	if (!brt_vdev_lookup(brt, brtvd, bre))
908 		return (SET_ERROR(ENOENT));
909 
910 	/*
911 	 * Remember mos_entries object number. After we reacquire the BRT lock,
912 	 * the brtvd pointer may be invalid.
913 	 */
914 	mos_entries = brtvd->bv_mos_entries;
915 	if (mos_entries == 0)
916 		return (SET_ERROR(ENOENT));
917 
918 	brt_unlock(brt);
919 
920 	error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
921 	    BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount);
922 
923 	brt_wlock(brt);
924 
925 	return (error);
926 }
927 
928 static void
929 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
930 {
931 	brt_vdev_t *brtvd;
932 	uint64_t mos_entries = 0;
933 
934 	brt_rlock(brt);
935 	brtvd = brt_vdev(brt, vdevid);
936 	if (brtvd != NULL)
937 		mos_entries = brtvd->bv_mos_entries;
938 	brt_unlock(brt);
939 
940 	if (mos_entries == 0)
941 		return;
942 
943 	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
944 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
945 }
946 
947 /*
948  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
949  * positive, but gives us quick answer if we should look into BRT, which
950  * may require reads and thus will be more expensive.
951  */
952 boolean_t
953 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
954 {
955 	brt_t *brt = spa->spa_brt;
956 	brt_vdev_t *brtvd;
957 	brt_entry_t bre_search;
958 	boolean_t mayexists = FALSE;
959 	uint64_t vdevid;
960 
961 	brt_entry_fill(bp, &bre_search, &vdevid);
962 
963 	brt_rlock(brt);
964 
965 	brtvd = brt_vdev(brt, vdevid);
966 	if (brtvd != NULL && brtvd->bv_initiated) {
967 		if (!avl_is_empty(&brtvd->bv_tree) ||
968 		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
969 			mayexists = TRUE;
970 		}
971 	}
972 
973 	brt_unlock(brt);
974 
975 	return (mayexists);
976 }
977 
978 uint64_t
979 brt_get_dspace(spa_t *spa)
980 {
981 	brt_t *brt = spa->spa_brt;
982 
983 	if (brt == NULL)
984 		return (0);
985 
986 	return (brt->brt_savedspace);
987 }
988 
989 uint64_t
990 brt_get_used(spa_t *spa)
991 {
992 	brt_t *brt = spa->spa_brt;
993 
994 	if (brt == NULL)
995 		return (0);
996 
997 	return (brt->brt_usedspace);
998 }
999 
1000 uint64_t
1001 brt_get_saved(spa_t *spa)
1002 {
1003 	brt_t *brt = spa->spa_brt;
1004 
1005 	if (brt == NULL)
1006 		return (0);
1007 
1008 	return (brt->brt_savedspace);
1009 }
1010 
1011 uint64_t
1012 brt_get_ratio(spa_t *spa)
1013 {
1014 	brt_t *brt = spa->spa_brt;
1015 
1016 	if (brt->brt_usedspace == 0)
1017 		return (100);
1018 
1019 	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
1020 	    brt->brt_usedspace);
1021 }
1022 
1023 static int
1024 brt_kstats_update(kstat_t *ksp, int rw)
1025 {
1026 	brt_stats_t *bs = ksp->ks_data;
1027 
1028 	if (rw == KSTAT_WRITE)
1029 		return (EACCES);
1030 
1031 	bs->brt_addref_entry_in_memory.value.ui64 =
1032 	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
1033 	bs->brt_addref_entry_not_on_disk.value.ui64 =
1034 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
1035 	bs->brt_addref_entry_on_disk.value.ui64 =
1036 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
1037 	bs->brt_addref_entry_read_lost_race.value.ui64 =
1038 	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
1039 	bs->brt_decref_entry_in_memory.value.ui64 =
1040 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
1041 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
1042 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
1043 	bs->brt_decref_entry_not_in_memory.value.ui64 =
1044 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
1045 	bs->brt_decref_entry_not_on_disk.value.ui64 =
1046 	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
1047 	bs->brt_decref_entry_read_lost_race.value.ui64 =
1048 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
1049 	bs->brt_decref_entry_still_referenced.value.ui64 =
1050 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
1051 	bs->brt_decref_free_data_later.value.ui64 =
1052 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
1053 	bs->brt_decref_free_data_now.value.ui64 =
1054 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
1055 	bs->brt_decref_no_entry.value.ui64 =
1056 	    wmsum_value(&brt_sums.brt_decref_no_entry);
1057 
1058 	return (0);
1059 }
1060 
1061 static void
1062 brt_stat_init(void)
1063 {
1064 
1065 	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
1066 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1067 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1068 	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
1069 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1070 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1071 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1072 	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
1073 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1074 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1075 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1076 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1077 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1078 
1079 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1080 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1081 	if (brt_ksp != NULL) {
1082 		brt_ksp->ks_data = &brt_stats;
1083 		brt_ksp->ks_update = brt_kstats_update;
1084 		kstat_install(brt_ksp);
1085 	}
1086 }
1087 
1088 static void
1089 brt_stat_fini(void)
1090 {
1091 	if (brt_ksp != NULL) {
1092 		kstat_delete(brt_ksp);
1093 		brt_ksp = NULL;
1094 	}
1095 
1096 	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
1097 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1098 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1099 	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
1100 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1101 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1102 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1103 	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
1104 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1105 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1106 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1107 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1108 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1109 }
1110 
1111 void
1112 brt_init(void)
1113 {
1114 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1115 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1116 	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
1117 	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1118 
1119 	brt_stat_init();
1120 }
1121 
1122 void
1123 brt_fini(void)
1124 {
1125 	brt_stat_fini();
1126 
1127 	kmem_cache_destroy(brt_entry_cache);
1128 	kmem_cache_destroy(brt_pending_entry_cache);
1129 }
1130 
1131 static brt_entry_t *
1132 brt_entry_alloc(const brt_entry_t *bre_init)
1133 {
1134 	brt_entry_t *bre;
1135 
1136 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1137 	bre->bre_offset = bre_init->bre_offset;
1138 	bre->bre_refcount = bre_init->bre_refcount;
1139 
1140 	return (bre);
1141 }
1142 
1143 static void
1144 brt_entry_free(brt_entry_t *bre)
1145 {
1146 
1147 	kmem_cache_free(brt_entry_cache, bre);
1148 }
1149 
1150 static void
1151 brt_entry_addref(brt_t *brt, const blkptr_t *bp)
1152 {
1153 	brt_vdev_t *brtvd;
1154 	brt_entry_t *bre, *racebre;
1155 	brt_entry_t bre_search;
1156 	avl_index_t where;
1157 	uint64_t vdevid;
1158 	int error;
1159 
1160 	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
1161 
1162 	brt_entry_fill(bp, &bre_search, &vdevid);
1163 
1164 	brt_wlock(brt);
1165 
1166 	brtvd = brt_vdev(brt, vdevid);
1167 	if (brtvd == NULL) {
1168 		ASSERT3U(vdevid, >=, brt->brt_nvdevs);
1169 
1170 		/* New VDEV was added. */
1171 		brt_vdevs_expand(brt, vdevid + 1);
1172 		brtvd = brt_vdev(brt, vdevid);
1173 	}
1174 	ASSERT(brtvd != NULL);
1175 	if (!brtvd->bv_initiated)
1176 		brt_vdev_realloc(brt, brtvd);
1177 
1178 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1179 	if (bre != NULL) {
1180 		BRTSTAT_BUMP(brt_addref_entry_in_memory);
1181 	} else {
1182 		/*
1183 		 * brt_entry_lookup() may drop the BRT (read) lock and
1184 		 * reacquire it (write).
1185 		 */
1186 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1187 		/* bre_search now contains correct bre_refcount */
1188 		ASSERT(error == 0 || error == ENOENT);
1189 		if (error == 0)
1190 			BRTSTAT_BUMP(brt_addref_entry_on_disk);
1191 		else
1192 			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1193 		/*
1194 		 * When the BRT lock was dropped, brt_vdevs[] may have been
1195 		 * expanded and reallocated, we need to update brtvd's pointer.
1196 		 */
1197 		brtvd = brt_vdev(brt, vdevid);
1198 		ASSERT(brtvd != NULL);
1199 
1200 		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1201 		if (racebre == NULL) {
1202 			bre = brt_entry_alloc(&bre_search);
1203 			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1204 			avl_insert(&brtvd->bv_tree, bre, where);
1205 			brt->brt_nentries++;
1206 		} else {
1207 			/*
1208 			 * The entry was added when the BRT lock was dropped in
1209 			 * brt_entry_lookup().
1210 			 */
1211 			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
1212 			bre = racebre;
1213 		}
1214 	}
1215 	bre->bre_refcount++;
1216 	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1217 
1218 	brt_unlock(brt);
1219 }
1220 
1221 /* Return TRUE if block should be freed immediately. */
1222 boolean_t
1223 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1224 {
1225 	brt_t *brt = spa->spa_brt;
1226 	brt_vdev_t *brtvd;
1227 	brt_entry_t *bre, *racebre;
1228 	brt_entry_t bre_search;
1229 	avl_index_t where;
1230 	uint64_t vdevid;
1231 	int error;
1232 
1233 	brt_entry_fill(bp, &bre_search, &vdevid);
1234 
1235 	brt_wlock(brt);
1236 
1237 	brtvd = brt_vdev(brt, vdevid);
1238 	ASSERT(brtvd != NULL);
1239 
1240 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1241 	if (bre != NULL) {
1242 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
1243 		goto out;
1244 	} else {
1245 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1246 	}
1247 
1248 	/*
1249 	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
1250 	 */
1251 	error = brt_entry_lookup(brt, brtvd, &bre_search);
1252 	/* bre_search now contains correct bre_refcount */
1253 	ASSERT(error == 0 || error == ENOENT);
1254 	/*
1255 	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
1256 	 * and reallocated, we need to update brtvd's pointer.
1257 	 */
1258 	brtvd = brt_vdev(brt, vdevid);
1259 	ASSERT(brtvd != NULL);
1260 
1261 	if (error == ENOENT) {
1262 		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
1263 		bre = NULL;
1264 		goto out;
1265 	}
1266 
1267 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1268 	if (racebre != NULL) {
1269 		/*
1270 		 * The entry was added when the BRT lock was dropped in
1271 		 * brt_entry_lookup().
1272 		 */
1273 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1274 		bre = racebre;
1275 		goto out;
1276 	}
1277 
1278 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1279 	bre = brt_entry_alloc(&bre_search);
1280 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1281 	avl_insert(&brtvd->bv_tree, bre, where);
1282 	brt->brt_nentries++;
1283 
1284 out:
1285 	if (bre == NULL) {
1286 		/*
1287 		 * This is a free of a regular (not cloned) block.
1288 		 */
1289 		brt_unlock(brt);
1290 		BRTSTAT_BUMP(brt_decref_no_entry);
1291 		return (B_TRUE);
1292 	}
1293 	if (bre->bre_refcount == 0) {
1294 		brt_unlock(brt);
1295 		BRTSTAT_BUMP(brt_decref_free_data_now);
1296 		return (B_TRUE);
1297 	}
1298 
1299 	ASSERT(bre->bre_refcount > 0);
1300 	bre->bre_refcount--;
1301 	if (bre->bre_refcount == 0)
1302 		BRTSTAT_BUMP(brt_decref_free_data_later);
1303 	else
1304 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1305 	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
1306 
1307 	brt_unlock(brt);
1308 
1309 	return (B_FALSE);
1310 }
1311 
1312 uint64_t
1313 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1314 {
1315 	brt_t *brt = spa->spa_brt;
1316 	brt_vdev_t *brtvd;
1317 	brt_entry_t bre_search, *bre;
1318 	uint64_t vdevid, refcnt;
1319 	int error;
1320 
1321 	brt_entry_fill(bp, &bre_search, &vdevid);
1322 
1323 	brt_rlock(brt);
1324 
1325 	brtvd = brt_vdev(brt, vdevid);
1326 	ASSERT(brtvd != NULL);
1327 
1328 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1329 	if (bre == NULL) {
1330 		error = brt_entry_lookup(brt, brtvd, &bre_search);
1331 		ASSERT(error == 0 || error == ENOENT);
1332 		if (error == ENOENT)
1333 			refcnt = 0;
1334 		else
1335 			refcnt = bre_search.bre_refcount;
1336 	} else
1337 		refcnt = bre->bre_refcount;
1338 
1339 	brt_unlock(brt);
1340 	return (refcnt);
1341 }
1342 
1343 static void
1344 brt_prefetch(brt_t *brt, const blkptr_t *bp)
1345 {
1346 	brt_entry_t bre;
1347 	uint64_t vdevid;
1348 
1349 	ASSERT(bp != NULL);
1350 
1351 	if (!brt_zap_prefetch)
1352 		return;
1353 
1354 	brt_entry_fill(bp, &bre, &vdevid);
1355 
1356 	brt_entry_prefetch(brt, vdevid, &bre);
1357 }
1358 
1359 static int
1360 brt_pending_entry_compare(const void *x1, const void *x2)
1361 {
1362 	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
1363 	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
1364 	int cmp;
1365 
1366 	cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
1367 	    DVA_GET_VDEV(&bp2->blk_dva[0]));
1368 	if (cmp == 0) {
1369 		cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1370 		    DVA_GET_OFFSET(&bp2->blk_dva[0]));
1371 		if (unlikely(cmp == 0)) {
1372 			cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
1373 		}
1374 	}
1375 
1376 	return (cmp);
1377 }
1378 
1379 void
1380 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1381 {
1382 	brt_t *brt;
1383 	avl_tree_t *pending_tree;
1384 	kmutex_t *pending_lock;
1385 	brt_pending_entry_t *bpe, *newbpe;
1386 	avl_index_t where;
1387 	uint64_t txg;
1388 
1389 	brt = spa->spa_brt;
1390 	txg = dmu_tx_get_txg(tx);
1391 	ASSERT3U(txg, !=, 0);
1392 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1393 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1394 
1395 	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
1396 	newbpe->bpe_bp = *bp;
1397 	newbpe->bpe_count = 1;
1398 
1399 	mutex_enter(pending_lock);
1400 
1401 	bpe = avl_find(pending_tree, newbpe, &where);
1402 	if (bpe == NULL) {
1403 		avl_insert(pending_tree, newbpe, where);
1404 		newbpe = NULL;
1405 	} else {
1406 		bpe->bpe_count++;
1407 	}
1408 
1409 	mutex_exit(pending_lock);
1410 
1411 	if (newbpe != NULL) {
1412 		ASSERT(bpe != NULL);
1413 		ASSERT(bpe != newbpe);
1414 		kmem_cache_free(brt_pending_entry_cache, newbpe);
1415 	} else {
1416 		ASSERT(bpe == NULL);
1417 
1418 		/* Prefetch BRT entry for the syncing context. */
1419 		brt_prefetch(brt, bp);
1420 	}
1421 }
1422 
1423 void
1424 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1425 {
1426 	brt_t *brt;
1427 	avl_tree_t *pending_tree;
1428 	kmutex_t *pending_lock;
1429 	brt_pending_entry_t *bpe, bpe_search;
1430 	uint64_t txg;
1431 
1432 	brt = spa->spa_brt;
1433 	txg = dmu_tx_get_txg(tx);
1434 	ASSERT3U(txg, !=, 0);
1435 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1436 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
1437 
1438 	bpe_search.bpe_bp = *bp;
1439 
1440 	mutex_enter(pending_lock);
1441 
1442 	bpe = avl_find(pending_tree, &bpe_search, NULL);
1443 	/* I believe we should always find bpe when this function is called. */
1444 	if (bpe != NULL) {
1445 		ASSERT(bpe->bpe_count > 0);
1446 
1447 		bpe->bpe_count--;
1448 		if (bpe->bpe_count == 0) {
1449 			avl_remove(pending_tree, bpe);
1450 			kmem_cache_free(brt_pending_entry_cache, bpe);
1451 		}
1452 	}
1453 
1454 	mutex_exit(pending_lock);
1455 }
1456 
1457 void
1458 brt_pending_apply(spa_t *spa, uint64_t txg)
1459 {
1460 	brt_t *brt = spa->spa_brt;
1461 	brt_pending_entry_t *bpe;
1462 	avl_tree_t *pending_tree;
1463 	void *c;
1464 
1465 	ASSERT3U(txg, !=, 0);
1466 
1467 	/*
1468 	 * We are in syncing context, so no other brt_pending_tree accesses
1469 	 * are possible for the TXG. Don't need to acquire brt_pending_lock.
1470 	 */
1471 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
1472 
1473 	c = NULL;
1474 	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
1475 		boolean_t added_to_ddt;
1476 
1477 		for (int i = 0; i < bpe->bpe_count; i++) {
1478 			/*
1479 			 * If the block has DEDUP bit set, it means that it
1480 			 * already exists in the DEDUP table, so we can just
1481 			 * use that instead of creating new entry in
1482 			 * the BRT table.
1483 			 */
1484 			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
1485 				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
1486 			} else {
1487 				added_to_ddt = B_FALSE;
1488 			}
1489 			if (!added_to_ddt)
1490 				brt_entry_addref(brt, &bpe->bpe_bp);
1491 		}
1492 
1493 		kmem_cache_free(brt_pending_entry_cache, bpe);
1494 	}
1495 }
1496 
1497 static void
1498 brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1499 {
1500 	if (bre->bre_refcount == 0) {
1501 		int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
1502 		    BRT_KEY_WORDS, tx);
1503 		VERIFY(error == 0 || error == ENOENT);
1504 	} else {
1505 		VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
1506 		    BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
1507 		    &bre->bre_refcount, tx));
1508 	}
1509 }
1510 
1511 static void
1512 brt_sync_table(brt_t *brt, dmu_tx_t *tx)
1513 {
1514 	brt_vdev_t *brtvd;
1515 	brt_entry_t *bre;
1516 	dnode_t *dn;
1517 	uint64_t vdevid;
1518 	void *c;
1519 
1520 	brt_wlock(brt);
1521 
1522 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
1523 		brtvd = &brt->brt_vdevs[vdevid];
1524 
1525 		if (!brtvd->bv_initiated)
1526 			continue;
1527 
1528 		if (!brtvd->bv_meta_dirty) {
1529 			ASSERT(!brtvd->bv_entcount_dirty);
1530 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1531 			continue;
1532 		}
1533 
1534 		ASSERT(!brtvd->bv_entcount_dirty ||
1535 		    avl_numnodes(&brtvd->bv_tree) != 0);
1536 
1537 		if (brtvd->bv_mos_brtvdev == 0)
1538 			brt_vdev_create(brt, brtvd, tx);
1539 
1540 		VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
1541 		    FTAG, &dn));
1542 
1543 		c = NULL;
1544 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1545 			brt_sync_entry(dn, bre, tx);
1546 			brt_entry_free(bre);
1547 			ASSERT(brt->brt_nentries > 0);
1548 			brt->brt_nentries--;
1549 		}
1550 
1551 		dnode_rele(dn, FTAG);
1552 
1553 		brt_vdev_sync(brt, brtvd, tx);
1554 
1555 		if (brtvd->bv_totalcount == 0)
1556 			brt_vdev_destroy(brt, brtvd, tx);
1557 	}
1558 
1559 	ASSERT0(brt->brt_nentries);
1560 
1561 	brt_unlock(brt);
1562 }
1563 
1564 void
1565 brt_sync(spa_t *spa, uint64_t txg)
1566 {
1567 	dmu_tx_t *tx;
1568 	brt_t *brt;
1569 
1570 	ASSERT(spa_syncing_txg(spa) == txg);
1571 
1572 	brt = spa->spa_brt;
1573 	brt_rlock(brt);
1574 	if (brt->brt_nentries == 0) {
1575 		/* No changes. */
1576 		brt_unlock(brt);
1577 		return;
1578 	}
1579 	brt_unlock(brt);
1580 
1581 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1582 
1583 	brt_sync_table(brt, tx);
1584 
1585 	dmu_tx_commit(tx);
1586 }
1587 
1588 static void
1589 brt_table_alloc(brt_t *brt)
1590 {
1591 
1592 	for (int i = 0; i < TXG_SIZE; i++) {
1593 		avl_create(&brt->brt_pending_tree[i],
1594 		    brt_pending_entry_compare,
1595 		    sizeof (brt_pending_entry_t),
1596 		    offsetof(brt_pending_entry_t, bpe_node));
1597 		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
1598 		    NULL);
1599 	}
1600 }
1601 
1602 static void
1603 brt_table_free(brt_t *brt)
1604 {
1605 
1606 	for (int i = 0; i < TXG_SIZE; i++) {
1607 		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
1608 
1609 		avl_destroy(&brt->brt_pending_tree[i]);
1610 		mutex_destroy(&brt->brt_pending_lock[i]);
1611 	}
1612 }
1613 
1614 static void
1615 brt_alloc(spa_t *spa)
1616 {
1617 	brt_t *brt;
1618 
1619 	ASSERT(spa->spa_brt == NULL);
1620 
1621 	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
1622 	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
1623 	brt->brt_spa = spa;
1624 	brt->brt_rangesize = 0;
1625 	brt->brt_nentries = 0;
1626 	brt->brt_vdevs = NULL;
1627 	brt->brt_nvdevs = 0;
1628 	brt_table_alloc(brt);
1629 
1630 	spa->spa_brt = brt;
1631 }
1632 
1633 void
1634 brt_create(spa_t *spa)
1635 {
1636 
1637 	brt_alloc(spa);
1638 	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
1639 }
1640 
1641 int
1642 brt_load(spa_t *spa)
1643 {
1644 
1645 	brt_alloc(spa);
1646 	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
1647 
1648 	return (0);
1649 }
1650 
1651 void
1652 brt_unload(spa_t *spa)
1653 {
1654 	brt_t *brt = spa->spa_brt;
1655 
1656 	if (brt == NULL)
1657 		return;
1658 
1659 	brt_vdevs_free(brt);
1660 	brt_table_free(brt);
1661 	rw_destroy(&brt->brt_lock);
1662 	kmem_free(brt, sizeof (*brt));
1663 	spa->spa_brt = NULL;
1664 }
1665 
1666 /* BEGIN CSTYLED */
1667 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1668 	"Enable prefetching of BRT ZAP entries");
1669 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1670 	"BRT ZAP leaf blockshift");
1671 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1672 	"BRT ZAP indirect blockshift");
1673 /* END CSTYLED */
1674