xref: /freebsd/sys/contrib/openzfs/module/zfs/brt.c (revision b196276c20b577b364372f1aa1a646b9ce34bf5c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42 
43 /*
44  * Block Cloning design.
45  *
46  * Block Cloning allows to manually clone a file (or a subset of its blocks)
47  * into another (or the same) file by just creating additional references to
48  * the data blocks without copying the data itself. Those references are kept
49  * in the Block Reference Tables (BRTs).
50  *
51  * In many ways this is similar to the existing deduplication, but there are
52  * some important differences:
53  *
54  * - Deduplication is automatic and Block Cloning is not - one has to use a
55  *   dedicated system call(s) to clone the given file/blocks.
56  * - Deduplication keeps all data blocks in its table, even those referenced
57  *   just once. Block Cloning creates an entry in its tables only when there
58  *   are at least two references to the given data block. If the block was
59  *   never explicitly cloned or the second to last reference was dropped,
60  *   there will be neither space nor performance overhead.
61  * - Deduplication needs data to work - one needs to pass real data to the
62  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63  *   data, just block pointers to the data, so it is extremely fast, as we pay
64  *   neither the cost of reading the data, nor the cost of writing the data -
65  *   we operate exclusively on metadata.
66  * - If the D (dedup) bit is not set in the block pointer, it means that
67  *   the block is not in the dedup table (DDT) and we won't consult the DDT
68  *   when we need to free the block. Block Cloning must be consulted on every
69  *   free, because we cannot modify the source BP (eg. by setting something
70  *   similar to the D bit), thus we have no hint if the block is in the
71  *   Block Reference Table (BRT), so we need to look into the BRT. There is
72  *   an optimization in place that allows us to eliminate the majority of BRT
73  *   lookups which is described below in the "Minimizing free penalty" section.
74  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75  *   64bit offset and 64bit reference counter.
76  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77  *   other on disk are most likely in totally different parts of the DDT.
78  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79  *   from one file should have BRT entries close to each other.
80  * - Scrub will only do a single pass over a block that is referenced multiple
81  *   times in the DDT. Unfortunately it is not currently (if at all) possible
82  *   with Block Cloning and block referenced multiple times will be scrubbed
83  *   multiple times. The new, sorted scrub should be able to eliminate
84  *   duplicated reads given enough memory.
85  * - Deduplication requires cryptographically strong hash as a checksum or
86  *   additional data verification. Block Cloning works with any checksum
87  *   algorithm or even with checksumming disabled.
88  *
89  * As mentioned above, the BRT entries are much smaller than the DDT entries.
90  * To uniquely identify a block we just need its vdev id and offset. We also
91  * need to maintain a reference counter. The vdev id will often repeat, as there
92  * is a small number of top-level VDEVs and a large number of blocks stored in
93  * each VDEV. We take advantage of that to reduce the BRT entry size further by
94  * maintaining one BRT for each top-level VDEV, so we can then have only offset
95  * and counter as the BRT entry.
96  *
97  * Minimizing free penalty.
98  *
99  * Block Cloning allows creating additional references to any existing block.
100  * When we free a block there is no hint in the block pointer whether the block
101  * was cloned or not, so on each free we have to check if there is a
102  * corresponding entry in the BRT or not. If there is, we need to decrease
103  * the reference counter. Doing BRT lookup on every free can potentially be
104  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105  * This is the main problem with deduplication, so we've learned our lesson and
106  * try not to repeat the same mistake here. How do we do that? We divide each
107  * top-level VDEV into 16MB regions. For each region we maintain a counter that
108  * is a sum of all the BRT entries that have offsets within the region. This
109  * creates the entries count array of 16bit numbers for each top-level VDEV.
110  * The entries count array is always kept in memory and updated on disk in the
111  * same transaction group as the BRT updates to keep everything in-sync. We can
112  * keep the array in memory, because it is very small. With 16MB regions and
113  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114  * the region size even further in the future). Now, when we want to free
115  * a block, we first consult the array. If the counter for the whole region is
116  * zero, there is no need to look for the BRT entry, as there isn't one for
117  * sure. If the counter for the region is greater than zero, only then we will
118  * do a BRT lookup and if an entry is found we will decrease the reference
119  * counter in the BRT entry and in the entry counters array.
120  *
121  * The entry counters array is small, but can potentially be larger for very
122  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123  * array on every change. We then divide the array into 32kB block and keep
124  * a bitmap of dirty blocks within a transaction group. When we sync the
125  * transaction group we can only update the parts of the entry counters array
126  * that were modified. Note: Keeping track of the dirty parts of the entry
127  * counters array is implemented, but updating only parts of the array on disk
128  * is not yet implemented - for now we will update entire array if there was
129  * any change.
130  *
131  * The implementation tries to be economic: if BRT is not used, or no longer
132  * used, there will be no entries in the MOS and no additional memory used (eg.
133  * the entry counters array is only allocated if needed).
134  *
135  * Interaction between Deduplication and Block Cloning.
136  *
137  * If both functionalities are in use, we could end up with a block that is
138  * referenced multiple times in both DDT and BRT. When we free one of the
139  * references we couldn't tell where it belongs, so we would have to decide
140  * what table takes the precedence: do we first clear DDT references or BRT
141  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143  * lookup DDT entry instead and increase the counter there. No BRT entry
144  * will be created for a block which has the D (dedup) bit set.
145  * BRT may be more efficient for manual deduplication, but if the block is
146  * already in the DDT, then creating additional BRT entry would be less
147  * efficient. This clever idea was proposed by Allan Jude.
148  *
149  * Block Cloning across datasets.
150  *
151  * Block Cloning is not limited to cloning blocks within the same dataset.
152  * It is possible (and very useful) to clone blocks between different datasets.
153  * One use case is recovering files from snapshots. By cloning the files into
154  * dataset we need no additional storage. Without Block Cloning we would need
155  * additional space for those files.
156  * Another interesting use case is moving the files between datasets
157  * (copying the file content to the new dataset and removing the source file).
158  * In that case Block Cloning will only be used briefly, because the BRT entries
159  * will be removed when the source is removed.
160  * Block Cloning across encrypted datasets is supported as long as both
161  * datasets share the same master key (e.g. snapshots and clones)
162  *
163  * Block Cloning flow through ZFS layers.
164  *
165  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
166  * blocks. As of this writing no interface is implemented that allows for block
167  * cloning within a ZVOL.
168  * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
169  * for blocking cloning.
170  *
171  *	ssize_t
172  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
173  *	                size_t len, unsigned int flags);
174  *
175  * Even though offsets and length represent bytes, they have to be
176  * block-aligned or we will return an error so the upper layer can
177  * fallback to the generic mechanism that will just copy the data.
178  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
179  * This function was implemented based on zfs_write(), but instead of writing
180  * the given data we first read block pointers using the new dmu_read_l0_bps()
181  * function from the source file. Once we have BPs from the source file we call
182  * the dmu_brt_clone() function on the destination file. This function
183  * allocates BPs for us. We iterate over all source BPs. If the given BP is
184  * a hole or an embedded block, we just copy BP as-is. If it points to a real
185  * data we place this BP on a BRT pending list using the brt_pending_add()
186  * function.
187  *
188  * We use this pending list to keep track of all BPs that got new references
189  * within this transaction group.
190  *
191  * Some special cases to consider and how we address them:
192  * - The block we want to clone may have been created within the same
193  *   transaction group that we are trying to clone. Such block has no BP
194  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
195  * - The block we want to clone may have been modified within the same
196  *   transaction group. We return EAGAIN.
197  * - A block may be cloned multiple times during one transaction group (that's
198  *   why pending list is actually a tree and not an append-only list - this
199  *   way we can figure out faster if this block is cloned for the first time
200  *   in this txg or consecutive time).
201  * - A block may be cloned and freed within the same transaction group
202  *   (see dbuf_undirty()).
203  * - A block may be cloned and within the same transaction group the clone
204  *   can be cloned again (see dmu_read_l0_bps()).
205  * - A file might have been deleted, but the caller still has a file descriptor
206  *   open to this file and clones it.
207  *
208  * When we free a block we have an additional step in the ZIO pipeline where we
209  * call the zio_brt_free() function. We then call the brt_entry_decref()
210  * that loads the corresponding BRT entry (if one exists) and decreases
211  * reference counter. If this is not the last reference we will stop ZIO
212  * pipeline here. If this is the last reference or the block is not in the
213  * BRT, we continue the pipeline and free the block as usual.
214  *
215  * At the beginning of spa_sync() where there can be no more block cloning,
216  * but before issuing frees we call brt_pending_apply(). This function applies
217  * all the new clones to the BRT table - we load BRT entries and update
218  * reference counters. To sync new BRT entries to disk, we use brt_sync()
219  * function. This function will sync all dirty per-top-level-vdev BRTs,
220  * the entry counters arrays, etc.
221  *
222  * Block Cloning and ZIL.
223  *
224  * Every clone operation is divided into chunks (similar to write) and each
225  * chunk is cloned in a separate transaction. The chunk size is determined by
226  * how many BPs we can fit into a single ZIL entry.
227  * Replaying clone operation is different from the regular clone operation,
228  * as when we log clone operations we cannot use the source object - it may
229  * reside on a different dataset, so we log BPs we want to clone.
230  * The ZIL is replayed when we mount the given dataset, not when the pool is
231  * imported. Taking this into account it is possible that the pool is imported
232  * without mounting datasets and the source dataset is destroyed before the
233  * destination dataset is mounted and its ZIL replayed.
234  * To address this situation we leverage zil_claim() mechanism where ZFS will
235  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236  * entries, we will bump reference counters for their BPs in the BRT.  Then
237  * on mount and ZIL replay we bump the reference counters once more, while the
238  * first references are dropped during ZIL destroy by zil_free_clone_range().
239  * It is possible that after zil_claim() we never mount the destination, so
240  * we never replay its ZIL and just destroy it.  In this case the only taken
241  * references will be dropped by zil_free_clone_range(), since the cloning is
242  * not going to ever take place.
243  */
244 
245 static kmem_cache_t *brt_entry_cache;
246 
247 /*
248  * Enable/disable prefetching of BRT entries that we are going to modify.
249  */
250 static int brt_zap_prefetch = 1;
251 
252 #ifdef ZFS_DEBUG
253 #define	BRT_DEBUG(...)	do {						\
254 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
255 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
256 	}								\
257 } while (0)
258 #else
259 #define	BRT_DEBUG(...)	do { } while (0)
260 #endif
261 
262 static int brt_zap_default_bs = 12;
263 static int brt_zap_default_ibs = 12;
264 
265 static kstat_t	*brt_ksp;
266 
267 typedef struct brt_stats {
268 	kstat_named_t brt_addref_entry_not_on_disk;
269 	kstat_named_t brt_addref_entry_on_disk;
270 	kstat_named_t brt_decref_entry_in_memory;
271 	kstat_named_t brt_decref_entry_loaded_from_disk;
272 	kstat_named_t brt_decref_entry_not_in_memory;
273 	kstat_named_t brt_decref_entry_read_lost_race;
274 	kstat_named_t brt_decref_entry_still_referenced;
275 	kstat_named_t brt_decref_free_data_later;
276 	kstat_named_t brt_decref_free_data_now;
277 	kstat_named_t brt_decref_no_entry;
278 } brt_stats_t;
279 
280 static brt_stats_t brt_stats = {
281 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
282 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
283 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
284 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
285 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
286 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
287 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
288 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
289 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
290 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
291 };
292 
293 struct {
294 	wmsum_t brt_addref_entry_not_on_disk;
295 	wmsum_t brt_addref_entry_on_disk;
296 	wmsum_t brt_decref_entry_in_memory;
297 	wmsum_t brt_decref_entry_loaded_from_disk;
298 	wmsum_t brt_decref_entry_not_in_memory;
299 	wmsum_t brt_decref_entry_read_lost_race;
300 	wmsum_t brt_decref_entry_still_referenced;
301 	wmsum_t brt_decref_free_data_later;
302 	wmsum_t brt_decref_free_data_now;
303 	wmsum_t brt_decref_no_entry;
304 } brt_sums;
305 
306 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
307 
308 static int brt_entry_compare(const void *x1, const void *x2);
309 static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
310 
311 static void
312 brt_rlock(spa_t *spa)
313 {
314 	rw_enter(&spa->spa_brt_lock, RW_READER);
315 }
316 
317 static void
318 brt_wlock(spa_t *spa)
319 {
320 	rw_enter(&spa->spa_brt_lock, RW_WRITER);
321 }
322 
323 static void
324 brt_unlock(spa_t *spa)
325 {
326 	rw_exit(&spa->spa_brt_lock);
327 }
328 
329 static uint16_t
330 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
331 {
332 
333 	ASSERT3U(idx, <, brtvd->bv_size);
334 
335 	if (unlikely(brtvd->bv_need_byteswap)) {
336 		return (BSWAP_16(brtvd->bv_entcount[idx]));
337 	} else {
338 		return (brtvd->bv_entcount[idx]);
339 	}
340 }
341 
342 static void
343 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
344 {
345 
346 	ASSERT3U(idx, <, brtvd->bv_size);
347 
348 	if (unlikely(brtvd->bv_need_byteswap)) {
349 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
350 	} else {
351 		brtvd->bv_entcount[idx] = entcnt;
352 	}
353 }
354 
355 static void
356 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
357 {
358 	uint16_t entcnt;
359 
360 	ASSERT3U(idx, <, brtvd->bv_size);
361 
362 	entcnt = brt_vdev_entcount_get(brtvd, idx);
363 	ASSERT(entcnt < UINT16_MAX);
364 
365 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
366 }
367 
368 static void
369 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
370 {
371 	uint16_t entcnt;
372 
373 	ASSERT3U(idx, <, brtvd->bv_size);
374 
375 	entcnt = brt_vdev_entcount_get(brtvd, idx);
376 	ASSERT(entcnt > 0);
377 
378 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
379 }
380 
381 #ifdef ZFS_DEBUG
382 static void
383 brt_vdev_dump(brt_vdev_t *brtvd)
384 {
385 	uint64_t idx;
386 
387 	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
388 	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
389 	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
390 	    (u_longlong_t)brtvd->bv_vdevid,
391 	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
392 	    (u_longlong_t)brtvd->bv_size,
393 	    (u_longlong_t)brtvd->bv_totalcount,
394 	    (u_longlong_t)nblocks,
395 	    (size_t)BT_SIZEOFMAP(nblocks));
396 	if (brtvd->bv_totalcount > 0) {
397 		zfs_dbgmsg("    entcounts:");
398 		for (idx = 0; idx < brtvd->bv_size; idx++) {
399 			uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
400 			if (entcnt > 0) {
401 				zfs_dbgmsg("      [%04llu] %hu",
402 				    (u_longlong_t)idx, entcnt);
403 			}
404 		}
405 	}
406 	if (brtvd->bv_entcount_dirty) {
407 		char *bitmap;
408 
409 		bitmap = kmem_alloc(nblocks + 1, KM_SLEEP);
410 		for (idx = 0; idx < nblocks; idx++) {
411 			bitmap[idx] =
412 			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
413 		}
414 		bitmap[idx] = '\0';
415 		zfs_dbgmsg("    dirty: %s", bitmap);
416 		kmem_free(bitmap, nblocks + 1);
417 	}
418 }
419 #endif
420 
421 static brt_vdev_t *
422 brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
423 {
424 	brt_vdev_t *brtvd = NULL;
425 
426 	brt_rlock(spa);
427 	if (vdevid < spa->spa_brt_nvdevs) {
428 		brtvd = spa->spa_brt_vdevs[vdevid];
429 	} else if (alloc) {
430 		/* New VDEV was added. */
431 		brt_unlock(spa);
432 		brt_wlock(spa);
433 		if (vdevid >= spa->spa_brt_nvdevs)
434 			brt_vdevs_expand(spa, vdevid + 1);
435 		brtvd = spa->spa_brt_vdevs[vdevid];
436 	}
437 	brt_unlock(spa);
438 	return (brtvd);
439 }
440 
441 static void
442 brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
443 {
444 	char name[64];
445 
446 	ASSERT(brtvd->bv_initiated);
447 	ASSERT0(brtvd->bv_mos_brtvdev);
448 	ASSERT0(brtvd->bv_mos_entries);
449 
450 	uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
451 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
452 	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
453 	VERIFY(mos_entries != 0);
454 	VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
455 	    &brtvd->bv_mos_entries_dnode));
456 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
457 	brtvd->bv_mos_entries = mos_entries;
458 	rw_exit(&brtvd->bv_mos_entries_lock);
459 	BRT_DEBUG("MOS entries created, object=%llu",
460 	    (u_longlong_t)brtvd->bv_mos_entries);
461 
462 	/*
463 	 * We allocate DMU buffer to store the bv_entcount[] array.
464 	 * We will keep array size (bv_size) and cummulative count for all
465 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
466 	 */
467 	brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
468 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
469 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
470 	VERIFY(brtvd->bv_mos_brtvdev != 0);
471 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
472 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
473 
474 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
475 	    (u_longlong_t)brtvd->bv_vdevid);
476 	VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
477 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
478 	BRT_DEBUG("Pool directory object created, object=%s", name);
479 
480 	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
481 }
482 
483 static void
484 brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
485 {
486 	vdev_t *vd;
487 	uint16_t *entcount;
488 	ulong_t *bitmap;
489 	uint64_t nblocks, onblocks, size;
490 
491 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
492 
493 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
494 	vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
495 	size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
496 	spa_config_exit(spa, SCL_VDEV, FTAG);
497 
498 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
499 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
500 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
501 
502 	if (!brtvd->bv_initiated) {
503 		ASSERT0(brtvd->bv_size);
504 		ASSERT0P(brtvd->bv_entcount);
505 		ASSERT0P(brtvd->bv_bitmap);
506 	} else {
507 		ASSERT(brtvd->bv_size > 0);
508 		ASSERT(brtvd->bv_entcount != NULL);
509 		ASSERT(brtvd->bv_bitmap != NULL);
510 		/*
511 		 * TODO: Allow vdev shrinking. We only need to implement
512 		 * shrinking the on-disk BRT VDEV object.
513 		 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
514 		 *     offset, size, tx);
515 		 */
516 		ASSERT3U(brtvd->bv_size, <=, size);
517 
518 		memcpy(entcount, brtvd->bv_entcount,
519 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
520 		vmem_free(brtvd->bv_entcount,
521 		    sizeof (entcount[0]) * brtvd->bv_size);
522 		onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
523 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
524 		    BT_SIZEOFMAP(onblocks)));
525 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
526 	}
527 
528 	brtvd->bv_size = size;
529 	brtvd->bv_entcount = entcount;
530 	brtvd->bv_bitmap = bitmap;
531 	if (!brtvd->bv_initiated) {
532 		brtvd->bv_need_byteswap = FALSE;
533 		brtvd->bv_initiated = TRUE;
534 		BRT_DEBUG("BRT VDEV %llu initiated.",
535 		    (u_longlong_t)brtvd->bv_vdevid);
536 	}
537 }
538 
539 static int
540 brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
541 {
542 	dmu_buf_t *db;
543 	brt_vdev_phys_t *bvphys;
544 	int error;
545 
546 	ASSERT(!brtvd->bv_initiated);
547 	ASSERT(brtvd->bv_mos_brtvdev != 0);
548 
549 	error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
550 	    FTAG, &db);
551 	if (error != 0)
552 		return (error);
553 
554 	bvphys = db->db_data;
555 	if (spa->spa_brt_rangesize == 0) {
556 		spa->spa_brt_rangesize = bvphys->bvp_rangesize;
557 	} else {
558 		ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
559 	}
560 
561 	brt_vdev_realloc(spa, brtvd);
562 
563 	/* TODO: We don't support VDEV shrinking. */
564 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
565 
566 	/*
567 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
568 	 */
569 	error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
570 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
571 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
572 	if (error != 0)
573 		return (error);
574 
575 	ASSERT(bvphys->bvp_mos_entries != 0);
576 	VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
577 	    &brtvd->bv_mos_entries_dnode));
578 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
579 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
580 	rw_exit(&brtvd->bv_mos_entries_lock);
581 	brtvd->bv_need_byteswap =
582 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
583 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
584 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
585 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
586 
587 	dmu_buf_rele(db, FTAG);
588 
589 	BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
590 	    (u_longlong_t)brtvd->bv_vdevid,
591 	    (u_longlong_t)brtvd->bv_mos_brtvdev,
592 	    (u_longlong_t)brtvd->bv_mos_entries);
593 	return (0);
594 }
595 
596 static void
597 brt_vdev_dealloc(brt_vdev_t *brtvd)
598 {
599 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
600 	ASSERT(brtvd->bv_initiated);
601 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
602 
603 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
604 	brtvd->bv_entcount = NULL;
605 	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
606 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
607 	brtvd->bv_bitmap = NULL;
608 
609 	brtvd->bv_size = 0;
610 
611 	brtvd->bv_initiated = FALSE;
612 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
613 }
614 
615 static void
616 brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
617 {
618 	char name[64];
619 	uint64_t count;
620 
621 	ASSERT(brtvd->bv_initiated);
622 	ASSERT(brtvd->bv_mos_brtvdev != 0);
623 	ASSERT(brtvd->bv_mos_entries != 0);
624 	ASSERT0(brtvd->bv_totalcount);
625 	ASSERT0(brtvd->bv_usedspace);
626 	ASSERT0(brtvd->bv_savedspace);
627 
628 	uint64_t mos_entries = brtvd->bv_mos_entries;
629 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
630 	brtvd->bv_mos_entries = 0;
631 	rw_exit(&brtvd->bv_mos_entries_lock);
632 	dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
633 	brtvd->bv_mos_entries_dnode = NULL;
634 	ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
635 	ASSERT0(count);
636 	VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
637 	BRT_DEBUG("MOS entries destroyed, object=%llu",
638 	    (u_longlong_t)mos_entries);
639 
640 	VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
641 	    tx));
642 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
643 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
644 	brtvd->bv_mos_brtvdev = 0;
645 	brtvd->bv_entcount_dirty = FALSE;
646 
647 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
648 	    (u_longlong_t)brtvd->bv_vdevid);
649 	VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
650 	    name, tx));
651 	BRT_DEBUG("Pool directory object removed, object=%s", name);
652 
653 	brtvd->bv_meta_dirty = FALSE;
654 
655 	rw_enter(&brtvd->bv_lock, RW_WRITER);
656 	brt_vdev_dealloc(brtvd);
657 	rw_exit(&brtvd->bv_lock);
658 
659 	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
660 }
661 
662 static void
663 brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
664 {
665 	brt_vdev_t **vdevs;
666 
667 	ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
668 	ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
669 
670 	if (nvdevs == spa->spa_brt_nvdevs)
671 		return;
672 
673 	vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
674 	if (spa->spa_brt_nvdevs > 0) {
675 		ASSERT(spa->spa_brt_vdevs != NULL);
676 
677 		memcpy(vdevs, spa->spa_brt_vdevs,
678 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
679 		kmem_free(spa->spa_brt_vdevs,
680 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
681 	}
682 	spa->spa_brt_vdevs = vdevs;
683 
684 	for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
685 		brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
686 		rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
687 		brtvd->bv_vdevid = vdevid;
688 		brtvd->bv_initiated = FALSE;
689 		rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
690 		avl_create(&brtvd->bv_tree, brt_entry_compare,
691 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
692 		for (int i = 0; i < TXG_SIZE; i++) {
693 			avl_create(&brtvd->bv_pending_tree[i],
694 			    brt_entry_compare, sizeof (brt_entry_t),
695 			    offsetof(brt_entry_t, bre_node));
696 		}
697 		mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
698 		spa->spa_brt_vdevs[vdevid] = brtvd;
699 	}
700 
701 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
702 	    (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
703 	spa->spa_brt_nvdevs = nvdevs;
704 }
705 
706 static boolean_t
707 brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
708 {
709 	uint64_t idx = offset / spa->spa_brt_rangesize;
710 	if (idx < brtvd->bv_size) {
711 		/* VDEV wasn't expanded. */
712 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
713 	}
714 	return (FALSE);
715 }
716 
717 static void
718 brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
719     uint64_t dsize, uint64_t count)
720 {
721 	uint64_t idx;
722 
723 	ASSERT(brtvd->bv_initiated);
724 
725 	brtvd->bv_savedspace += dsize * count;
726 	brtvd->bv_meta_dirty = TRUE;
727 
728 	if (bre->bre_count > 0)
729 		return;
730 
731 	brtvd->bv_usedspace += dsize;
732 
733 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
734 	if (idx >= brtvd->bv_size) {
735 		/* VDEV has been expanded. */
736 		rw_enter(&brtvd->bv_lock, RW_WRITER);
737 		brt_vdev_realloc(spa, brtvd);
738 		rw_exit(&brtvd->bv_lock);
739 	}
740 
741 	ASSERT3U(idx, <, brtvd->bv_size);
742 
743 	brtvd->bv_totalcount++;
744 	brt_vdev_entcount_inc(brtvd, idx);
745 	brtvd->bv_entcount_dirty = TRUE;
746 	idx = idx / BRT_BLOCKSIZE / 8;
747 	BT_SET(brtvd->bv_bitmap, idx);
748 }
749 
750 static void
751 brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
752     uint64_t dsize)
753 {
754 	uint64_t idx;
755 
756 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
757 	ASSERT(brtvd->bv_initiated);
758 
759 	brtvd->bv_savedspace -= dsize;
760 	brtvd->bv_meta_dirty = TRUE;
761 
762 	if (bre->bre_count > 0)
763 		return;
764 
765 	brtvd->bv_usedspace -= dsize;
766 
767 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
768 	ASSERT3U(idx, <, brtvd->bv_size);
769 
770 	ASSERT(brtvd->bv_totalcount > 0);
771 	brtvd->bv_totalcount--;
772 	brt_vdev_entcount_dec(brtvd, idx);
773 	brtvd->bv_entcount_dirty = TRUE;
774 	idx = idx / BRT_BLOCKSIZE / 8;
775 	BT_SET(brtvd->bv_bitmap, idx);
776 }
777 
778 static void
779 brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
780 {
781 	dmu_buf_t *db;
782 	brt_vdev_phys_t *bvphys;
783 
784 	ASSERT(brtvd->bv_meta_dirty);
785 	ASSERT(brtvd->bv_mos_brtvdev != 0);
786 	ASSERT(dmu_tx_is_syncing(tx));
787 
788 	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
789 	    FTAG, &db));
790 
791 	if (brtvd->bv_entcount_dirty) {
792 		/*
793 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
794 		 */
795 		dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
796 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
797 		    brtvd->bv_entcount, tx);
798 		uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
799 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
800 		brtvd->bv_entcount_dirty = FALSE;
801 	}
802 
803 	dmu_buf_will_dirty(db, tx);
804 	bvphys = db->db_data;
805 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
806 	bvphys->bvp_size = brtvd->bv_size;
807 	if (brtvd->bv_need_byteswap) {
808 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
809 	} else {
810 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
811 	}
812 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
813 	bvphys->bvp_rangesize = spa->spa_brt_rangesize;
814 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
815 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
816 	dmu_buf_rele(db, FTAG);
817 
818 	brtvd->bv_meta_dirty = FALSE;
819 }
820 
821 static void
822 brt_vdevs_free(spa_t *spa)
823 {
824 	if (spa->spa_brt_vdevs == 0)
825 		return;
826 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
827 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
828 		rw_enter(&brtvd->bv_lock, RW_WRITER);
829 		if (brtvd->bv_initiated)
830 			brt_vdev_dealloc(brtvd);
831 		rw_exit(&brtvd->bv_lock);
832 		rw_destroy(&brtvd->bv_lock);
833 		if (brtvd->bv_mos_entries != 0)
834 			dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
835 		rw_destroy(&brtvd->bv_mos_entries_lock);
836 		avl_destroy(&brtvd->bv_tree);
837 		for (int i = 0; i < TXG_SIZE; i++)
838 			avl_destroy(&brtvd->bv_pending_tree[i]);
839 		mutex_destroy(&brtvd->bv_pending_lock);
840 		kmem_free(brtvd, sizeof (*brtvd));
841 	}
842 	kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
843 	    spa->spa_brt_nvdevs);
844 }
845 
846 static void
847 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
848 {
849 
850 	bre->bre_bp = *bp;
851 	bre->bre_count = 0;
852 	bre->bre_pcount = 0;
853 
854 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
855 }
856 
857 static int
858 brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
859 {
860 	uint64_t off = BRE_OFFSET(bre);
861 
862 	return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
863 	    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
864 }
865 
866 /*
867  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
868  * positive, but gives us quick answer if we should look into BRT, which
869  * may require reads and thus will be more expensive.
870  */
871 boolean_t
872 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
873 {
874 
875 	if (spa->spa_brt_nvdevs == 0)
876 		return (B_FALSE);
877 
878 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
879 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
880 	if (brtvd == NULL || !brtvd->bv_initiated)
881 		return (FALSE);
882 
883 	/*
884 	 * We don't need locks here, since bv_entcount pointer must be
885 	 * stable at this point, and we don't care about false positive
886 	 * races here, while false negative should be impossible, since
887 	 * all brt_vdev_addref() have already completed by this point.
888 	 */
889 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
890 	return (brt_vdev_lookup(spa, brtvd, off));
891 }
892 
893 uint64_t
894 brt_get_dspace(spa_t *spa)
895 {
896 	if (spa->spa_brt_nvdevs == 0)
897 		return (0);
898 
899 	brt_rlock(spa);
900 	uint64_t s = 0;
901 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
902 		s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
903 	brt_unlock(spa);
904 	return (s);
905 }
906 
907 uint64_t
908 brt_get_used(spa_t *spa)
909 {
910 	if (spa->spa_brt_nvdevs == 0)
911 		return (0);
912 
913 	brt_rlock(spa);
914 	uint64_t s = 0;
915 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
916 		s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
917 	brt_unlock(spa);
918 	return (s);
919 }
920 
921 uint64_t
922 brt_get_saved(spa_t *spa)
923 {
924 	return (brt_get_dspace(spa));
925 }
926 
927 uint64_t
928 brt_get_ratio(spa_t *spa)
929 {
930 	uint64_t used = brt_get_used(spa);
931 	if (used == 0)
932 		return (100);
933 	return ((used + brt_get_saved(spa)) * 100 / used);
934 }
935 
936 static int
937 brt_kstats_update(kstat_t *ksp, int rw)
938 {
939 	brt_stats_t *bs = ksp->ks_data;
940 
941 	if (rw == KSTAT_WRITE)
942 		return (EACCES);
943 
944 	bs->brt_addref_entry_not_on_disk.value.ui64 =
945 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
946 	bs->brt_addref_entry_on_disk.value.ui64 =
947 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
948 	bs->brt_decref_entry_in_memory.value.ui64 =
949 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
950 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
951 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
952 	bs->brt_decref_entry_not_in_memory.value.ui64 =
953 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
954 	bs->brt_decref_entry_read_lost_race.value.ui64 =
955 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
956 	bs->brt_decref_entry_still_referenced.value.ui64 =
957 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
958 	bs->brt_decref_free_data_later.value.ui64 =
959 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
960 	bs->brt_decref_free_data_now.value.ui64 =
961 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
962 	bs->brt_decref_no_entry.value.ui64 =
963 	    wmsum_value(&brt_sums.brt_decref_no_entry);
964 
965 	return (0);
966 }
967 
968 static void
969 brt_stat_init(void)
970 {
971 
972 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
973 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
974 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
975 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
976 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
977 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
978 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
979 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
980 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
981 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
982 
983 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
984 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
985 	if (brt_ksp != NULL) {
986 		brt_ksp->ks_data = &brt_stats;
987 		brt_ksp->ks_update = brt_kstats_update;
988 		kstat_install(brt_ksp);
989 	}
990 }
991 
992 static void
993 brt_stat_fini(void)
994 {
995 	if (brt_ksp != NULL) {
996 		kstat_delete(brt_ksp);
997 		brt_ksp = NULL;
998 	}
999 
1000 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1001 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1002 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1003 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1004 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1005 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1006 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1007 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1008 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1009 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1010 }
1011 
1012 void
1013 brt_init(void)
1014 {
1015 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1016 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1017 
1018 	brt_stat_init();
1019 }
1020 
1021 void
1022 brt_fini(void)
1023 {
1024 	brt_stat_fini();
1025 
1026 	kmem_cache_destroy(brt_entry_cache);
1027 }
1028 
1029 /* Return TRUE if block should be freed immediately. */
1030 boolean_t
1031 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1032 {
1033 	brt_entry_t *bre, *racebre;
1034 	brt_entry_t bre_search;
1035 	avl_index_t where;
1036 	uint64_t vdevid;
1037 	int error;
1038 
1039 	brt_entry_fill(bp, &bre_search, &vdevid);
1040 
1041 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1042 	ASSERT(brtvd != NULL);
1043 
1044 	rw_enter(&brtvd->bv_lock, RW_WRITER);
1045 	ASSERT(brtvd->bv_initiated);
1046 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1047 	if (bre != NULL) {
1048 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
1049 		goto out;
1050 	} else {
1051 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1052 	}
1053 	rw_exit(&brtvd->bv_lock);
1054 
1055 	error = brt_entry_lookup(brtvd, &bre_search);
1056 	/* bre_search now contains correct bre_count */
1057 	if (error == ENOENT) {
1058 		BRTSTAT_BUMP(brt_decref_no_entry);
1059 		return (B_TRUE);
1060 	}
1061 	ASSERT0(error);
1062 
1063 	rw_enter(&brtvd->bv_lock, RW_WRITER);
1064 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1065 	if (racebre != NULL) {
1066 		/* The entry was added when the lock was dropped. */
1067 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1068 		bre = racebre;
1069 		goto out;
1070 	}
1071 
1072 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1073 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1074 	bre->bre_bp = bre_search.bre_bp;
1075 	bre->bre_count = bre_search.bre_count;
1076 	bre->bre_pcount = 0;
1077 	avl_insert(&brtvd->bv_tree, bre, where);
1078 
1079 out:
1080 	if (bre->bre_count == 0) {
1081 		rw_exit(&brtvd->bv_lock);
1082 		BRTSTAT_BUMP(brt_decref_free_data_now);
1083 		return (B_TRUE);
1084 	}
1085 
1086 	bre->bre_pcount--;
1087 	ASSERT(bre->bre_count > 0);
1088 	bre->bre_count--;
1089 	if (bre->bre_count == 0)
1090 		BRTSTAT_BUMP(brt_decref_free_data_later);
1091 	else
1092 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1093 	brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
1094 
1095 	rw_exit(&brtvd->bv_lock);
1096 
1097 	return (B_FALSE);
1098 }
1099 
1100 uint64_t
1101 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1102 {
1103 	brt_entry_t bre_search, *bre;
1104 	uint64_t vdevid, refcnt;
1105 	int error;
1106 
1107 	brt_entry_fill(bp, &bre_search, &vdevid);
1108 
1109 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1110 	ASSERT(brtvd != NULL);
1111 
1112 	rw_enter(&brtvd->bv_lock, RW_READER);
1113 	ASSERT(brtvd->bv_initiated);
1114 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1115 	if (bre == NULL) {
1116 		rw_exit(&brtvd->bv_lock);
1117 		error = brt_entry_lookup(brtvd, &bre_search);
1118 		if (error == ENOENT) {
1119 			refcnt = 0;
1120 		} else {
1121 			ASSERT0(error);
1122 			refcnt = bre_search.bre_count;
1123 		}
1124 	} else {
1125 		refcnt = bre->bre_count;
1126 		rw_exit(&brtvd->bv_lock);
1127 	}
1128 
1129 	return (refcnt);
1130 }
1131 
1132 static void
1133 brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
1134 {
1135 	if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
1136 		return;
1137 
1138 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1139 	rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1140 	if (brtvd->bv_mos_entries != 0) {
1141 		(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1142 		    &off, BRT_KEY_WORDS);
1143 	}
1144 	rw_exit(&brtvd->bv_mos_entries_lock);
1145 }
1146 
1147 static int
1148 brt_entry_compare(const void *x1, const void *x2)
1149 {
1150 	const brt_entry_t *bre1 = x1, *bre2 = x2;
1151 	const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
1152 
1153 	return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1154 	    DVA_GET_OFFSET(&bp2->blk_dva[0])));
1155 }
1156 
1157 void
1158 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1159 {
1160 	brt_entry_t *bre, *newbre;
1161 	avl_index_t where;
1162 	uint64_t txg;
1163 
1164 	txg = dmu_tx_get_txg(tx);
1165 	ASSERT3U(txg, !=, 0);
1166 
1167 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1168 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1169 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1170 
1171 	newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1172 	newbre->bre_bp = *bp;
1173 	newbre->bre_count = 0;
1174 	newbre->bre_pcount = 1;
1175 
1176 	mutex_enter(&brtvd->bv_pending_lock);
1177 	bre = avl_find(pending_tree, newbre, &where);
1178 	if (bre == NULL) {
1179 		avl_insert(pending_tree, newbre, where);
1180 		newbre = NULL;
1181 	} else {
1182 		bre->bre_pcount++;
1183 	}
1184 	mutex_exit(&brtvd->bv_pending_lock);
1185 
1186 	if (newbre != NULL) {
1187 		ASSERT(bre != NULL);
1188 		ASSERT(bre != newbre);
1189 		kmem_cache_free(brt_entry_cache, newbre);
1190 	} else {
1191 		ASSERT0P(bre);
1192 
1193 		/* Prefetch BRT entry for the syncing context. */
1194 		brt_prefetch(brtvd, bp);
1195 	}
1196 }
1197 
1198 void
1199 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1200 {
1201 	brt_entry_t *bre, bre_search;
1202 	uint64_t txg;
1203 
1204 	txg = dmu_tx_get_txg(tx);
1205 	ASSERT3U(txg, !=, 0);
1206 
1207 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1208 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1209 	ASSERT(brtvd != NULL);
1210 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1211 
1212 	bre_search.bre_bp = *bp;
1213 
1214 	mutex_enter(&brtvd->bv_pending_lock);
1215 	bre = avl_find(pending_tree, &bre_search, NULL);
1216 	ASSERT(bre != NULL);
1217 	ASSERT(bre->bre_pcount > 0);
1218 	bre->bre_pcount--;
1219 	if (bre->bre_pcount == 0)
1220 		avl_remove(pending_tree, bre);
1221 	else
1222 		bre = NULL;
1223 	mutex_exit(&brtvd->bv_pending_lock);
1224 
1225 	if (bre)
1226 		kmem_cache_free(brt_entry_cache, bre);
1227 }
1228 
1229 static void
1230 brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1231 {
1232 	brt_entry_t *bre, *nbre;
1233 
1234 	/*
1235 	 * We are in syncing context, so no other bv_pending_tree accesses
1236 	 * are possible for the TXG.  So we don't need bv_pending_lock.
1237 	 */
1238 	ASSERT(avl_is_empty(&brtvd->bv_tree));
1239 	avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1240 
1241 	for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1242 		nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1243 
1244 		/*
1245 		 * If the block has DEDUP bit set, it means that it
1246 		 * already exists in the DEDUP table, so we can just
1247 		 * use that instead of creating new entry in the BRT.
1248 		 */
1249 		if (BP_GET_DEDUP(&bre->bre_bp)) {
1250 			while (bre->bre_pcount > 0) {
1251 				if (!ddt_addref(spa, &bre->bre_bp))
1252 					break;
1253 				bre->bre_pcount--;
1254 			}
1255 			if (bre->bre_pcount == 0) {
1256 				avl_remove(&brtvd->bv_tree, bre);
1257 				kmem_cache_free(brt_entry_cache, bre);
1258 				continue;
1259 			}
1260 		}
1261 
1262 		/*
1263 		 * Unless we know that the block is definitely not in ZAP,
1264 		 * try to get its reference count from there.
1265 		 */
1266 		uint64_t off = BRE_OFFSET(bre);
1267 		if (brtvd->bv_mos_entries != 0 &&
1268 		    brt_vdev_lookup(spa, brtvd, off)) {
1269 			int error = zap_lookup_uint64_by_dnode(
1270 			    brtvd->bv_mos_entries_dnode, &off,
1271 			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1272 			    &bre->bre_count);
1273 			if (error == 0) {
1274 				BRTSTAT_BUMP(brt_addref_entry_on_disk);
1275 			} else {
1276 				ASSERT3U(error, ==, ENOENT);
1277 				BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1278 			}
1279 		}
1280 	}
1281 
1282 	/*
1283 	 * If all the cloned blocks we had were handled by DDT, we don't need
1284 	 * to initiate the vdev.
1285 	 */
1286 	if (avl_is_empty(&brtvd->bv_tree))
1287 		return;
1288 
1289 	if (!brtvd->bv_initiated) {
1290 		rw_enter(&brtvd->bv_lock, RW_WRITER);
1291 		brt_vdev_realloc(spa, brtvd);
1292 		rw_exit(&brtvd->bv_lock);
1293 	}
1294 
1295 	/*
1296 	 * Convert pending references into proper ones.  This has to be a
1297 	 * separate loop, since entcount modifications would cause false
1298 	 * positives for brt_vdev_lookup() on following iterations.
1299 	 */
1300 	for (bre = avl_first(&brtvd->bv_tree); bre;
1301 	    bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1302 		brt_vdev_addref(spa, brtvd, bre,
1303 		    bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1304 		bre->bre_count += bre->bre_pcount;
1305 	}
1306 }
1307 
1308 void
1309 brt_pending_apply(spa_t *spa, uint64_t txg)
1310 {
1311 
1312 	brt_rlock(spa);
1313 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1314 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1315 		brt_unlock(spa);
1316 
1317 		brt_pending_apply_vdev(spa, brtvd, txg);
1318 
1319 		brt_rlock(spa);
1320 	}
1321 	brt_unlock(spa);
1322 }
1323 
1324 static void
1325 brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1326 {
1327 	uint64_t off = BRE_OFFSET(bre);
1328 
1329 	if (bre->bre_pcount == 0) {
1330 		/* The net change is zero, nothing to do in ZAP. */
1331 	} else if (bre->bre_count == 0) {
1332 		int error = zap_remove_uint64_by_dnode(dn, &off,
1333 		    BRT_KEY_WORDS, tx);
1334 		VERIFY(error == 0 || error == ENOENT);
1335 	} else {
1336 		VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1337 		    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1338 		    &bre->bre_count, tx));
1339 	}
1340 }
1341 
1342 static void
1343 brt_sync_table(spa_t *spa, dmu_tx_t *tx)
1344 {
1345 	brt_entry_t *bre;
1346 
1347 	brt_rlock(spa);
1348 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1349 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1350 		brt_unlock(spa);
1351 
1352 		if (!brtvd->bv_meta_dirty) {
1353 			ASSERT(!brtvd->bv_entcount_dirty);
1354 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1355 			brt_rlock(spa);
1356 			continue;
1357 		}
1358 
1359 		ASSERT(!brtvd->bv_entcount_dirty ||
1360 		    avl_numnodes(&brtvd->bv_tree) != 0);
1361 
1362 		if (brtvd->bv_mos_brtvdev == 0)
1363 			brt_vdev_create(spa, brtvd, tx);
1364 
1365 		void *c = NULL;
1366 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1367 			brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
1368 			kmem_cache_free(brt_entry_cache, bre);
1369 		}
1370 
1371 #ifdef ZFS_DEBUG
1372 		if (zfs_flags & ZFS_DEBUG_BRT)
1373 			brt_vdev_dump(brtvd);
1374 #endif
1375 		if (brtvd->bv_totalcount == 0)
1376 			brt_vdev_destroy(spa, brtvd, tx);
1377 		else
1378 			brt_vdev_sync(spa, brtvd, tx);
1379 		brt_rlock(spa);
1380 	}
1381 	brt_unlock(spa);
1382 }
1383 
1384 void
1385 brt_sync(spa_t *spa, uint64_t txg)
1386 {
1387 	dmu_tx_t *tx;
1388 	uint64_t vdevid;
1389 
1390 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
1391 
1392 	brt_rlock(spa);
1393 	for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1394 		if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1395 			break;
1396 	}
1397 	if (vdevid >= spa->spa_brt_nvdevs) {
1398 		brt_unlock(spa);
1399 		return;
1400 	}
1401 	brt_unlock(spa);
1402 
1403 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1404 	brt_sync_table(spa, tx);
1405 	dmu_tx_commit(tx);
1406 }
1407 
1408 static void
1409 brt_alloc(spa_t *spa)
1410 {
1411 	rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1412 	spa->spa_brt_vdevs = NULL;
1413 	spa->spa_brt_nvdevs = 0;
1414 	spa->spa_brt_rangesize = 0;
1415 }
1416 
1417 void
1418 brt_create(spa_t *spa)
1419 {
1420 	brt_alloc(spa);
1421 	spa->spa_brt_rangesize = BRT_RANGESIZE;
1422 }
1423 
1424 int
1425 brt_load(spa_t *spa)
1426 {
1427 	int error = 0;
1428 
1429 	brt_alloc(spa);
1430 	brt_wlock(spa);
1431 	for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1432 	    vdevid++) {
1433 		char name[64];
1434 		uint64_t mos_brtvdev;
1435 
1436 		/* Look if this vdev had active block cloning. */
1437 		snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1438 		    (u_longlong_t)vdevid);
1439 		error = zap_lookup(spa->spa_meta_objset,
1440 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1441 		    &mos_brtvdev);
1442 		if (error == ENOENT) {
1443 			error = 0;
1444 			continue;
1445 		}
1446 		if (error != 0)
1447 			break;
1448 
1449 		/* If it did, then allocate them all and load this one. */
1450 		brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1451 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1452 		rw_enter(&brtvd->bv_lock, RW_WRITER);
1453 		brtvd->bv_mos_brtvdev = mos_brtvdev;
1454 		error = brt_vdev_load(spa, brtvd);
1455 		rw_exit(&brtvd->bv_lock);
1456 		if (error != 0)
1457 			break;
1458 	}
1459 
1460 	if (spa->spa_brt_rangesize == 0)
1461 		spa->spa_brt_rangesize = BRT_RANGESIZE;
1462 	brt_unlock(spa);
1463 	return (error);
1464 }
1465 
1466 void
1467 brt_unload(spa_t *spa)
1468 {
1469 	if (spa->spa_brt_rangesize == 0)
1470 		return;
1471 	brt_vdevs_free(spa);
1472 	rw_destroy(&spa->spa_brt_lock);
1473 	spa->spa_brt_rangesize = 0;
1474 }
1475 
1476 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1477 	"Enable prefetching of BRT ZAP entries");
1478 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1479 	"BRT ZAP leaf blockshift");
1480 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1481 	"BRT ZAP indirect blockshift");
1482