xref: /freebsd/sys/contrib/openzfs/module/zfs/brt.c (revision 8ac904ce090b1c2e355da8aa122ca2252183f4e1)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/spa_impl.h>
30 #include <sys/zio.h>
31 #include <sys/brt.h>
32 #include <sys/brt_impl.h>
33 #include <sys/ddt.h>
34 #include <sys/bitmap.h>
35 #include <sys/zap.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/arc.h>
38 #include <sys/dsl_pool.h>
39 #include <sys/dsl_scan.h>
40 #include <sys/vdev_impl.h>
41 #include <sys/kstat.h>
42 #include <sys/wmsum.h>
43 
44 /*
45  * Block Cloning design.
46  *
47  * Block Cloning allows manually cloning a file (or a subset of its blocks)
48  * into another (or the same) file by just creating additional references to
49  * the data blocks without copying the data itself. Those references are kept
50  * in the Block Reference Tables (BRTs).
51  *
52  * In many ways this is similar to the existing deduplication, but there are
53  * some important differences:
54  *
55  * - Deduplication is automatic and Block Cloning is not - one has to use a
56  *   dedicated system call(s) to clone the given file/blocks.
57  * - Deduplication keeps all data blocks in its table, even those referenced
58  *   just once. Block Cloning creates an entry in its tables only when there
59  *   are at least two references to the given data block. If the block was
60  *   never explicitly cloned or the second to last reference was dropped,
61  *   there will be neither space nor performance overhead.
62  * - Deduplication needs data to work - one needs to pass real data to the
63  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
64  *   data, just block pointers to the data, so it is extremely fast, as we pay
65  *   neither the cost of reading the data, nor the cost of writing the data -
66  *   we operate exclusively on metadata.
67  * - If the D (dedup) bit is not set in the block pointer, it means that
68  *   the block is not in the dedup table (DDT) and we won't consult the DDT
69  *   when we need to free the block. Block Cloning must be consulted on every
70  *   free, because we cannot modify the source BP (eg. by setting something
71  *   similar to the D bit), thus we have no hint if the block is in the
72  *   Block Reference Table (BRT), so we need to look into the BRT. There is
73  *   an optimization in place that allows us to eliminate the majority of BRT
74  *   lookups which is described below in the "Minimizing free penalty" section.
75  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
76  *   64bit offset and 64bit reference counter.
77  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
78  *   other on disk are most likely in totally different parts of the DDT.
79  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
80  *   from one file should have BRT entries close to each other.
81  * - Scrub will only do a single pass over a block that is referenced multiple
82  *   times in the DDT. Unfortunately it is not currently (if at all) possible
83  *   with Block Cloning and a block referenced multiple times will be scrubbed
84  *   multiple times. The new, sorted scrub should be able to eliminate
85  *   duplicated reads given enough memory.
86  * - Deduplication requires cryptographically strong hash as a checksum or
87  *   additional data verification. Block Cloning works with any checksum
88  *   algorithm or even with checksumming disabled.
89  *
90  * As mentioned above, the BRT entries are much smaller than the DDT entries.
91  * To uniquely identify a block we just need its vdev id and offset. We also
92  * need to maintain a reference counter. The vdev id will often repeat, as there
93  * is a small number of top-level VDEVs and a large number of blocks stored in
94  * each VDEV. We take advantage of that to reduce the BRT entry size further by
95  * maintaining one BRT for each top-level VDEV, so we can then have only offset
96  * and counter as the BRT entry.
97  *
98  * Minimizing free penalty.
99  *
100  * Block Cloning allows creating additional references to any existing block.
101  * When we free a block there is no hint in the block pointer whether the block
102  * was cloned or not, so on each free we have to check if there is a
103  * corresponding entry in the BRT or not. If there is, we need to decrease
104  * the reference counter. Doing BRT lookup on every free can potentially be
105  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
106  * This is the main problem with deduplication, so we've learned our lesson and
107  * try not to repeat the same mistake here. How do we do that? We divide each
108  * top-level VDEV into 16MB regions. For each region we maintain a counter that
109  * is a sum of all the BRT entries that have offsets within the region. This
110  * creates the entries count array of 16bit numbers for each top-level VDEV.
111  * The entries count array is always kept in memory and updated on disk in the
112  * same transaction group as the BRT updates to keep everything in-sync. We can
113  * keep the array in memory, because it is very small. With 16MB regions and
114  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
115  * the region size even further in the future). Now, when we want to free
116  * a block, we first consult the array. If the counter for the whole region is
117  * zero, there is no need to look for the BRT entry, as there isn't one for
118  * sure. If the counter for the region is greater than zero, only then we will
119  * do a BRT lookup and if an entry is found we will decrease the reference
120  * counter in the BRT entry and in the entry counters array.
121  *
122  * The entry counters array is small, but can potentially be larger for very
123  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
124  * array on every change. We then divide the array into 32kB blocks and keep
125  * a bitmap of dirty blocks within a transaction group. When we sync the
126  * transaction group we can only update the parts of the entry counters array
127  * that were modified. Note: Keeping track of the dirty parts of the entry
128  * counters array is implemented, but updating only parts of the array on disk
129  * is not yet implemented - for now we will update entire array if there was
130  * any change.
131  *
132  * The implementation tries to be economical: if BRT is not used, or no longer
133  * used, there will be no entries in the MOS and no additional memory used (eg.
134  * the entry counters array is only allocated if needed).
135  *
136  * Interaction between Deduplication and Block Cloning.
137  *
138  * If both functionalities are in use, we could end up with a block that is
139  * referenced multiple times in both DDT and BRT. When we free one of the
140  * references we couldn't tell where it belongs, so we would have to decide
141  * what table takes the precedence: do we first clear DDT references or BRT
142  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
143  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
144  * look up the DDT entry instead and increase the counter there. No BRT entry
145  * will be created for a block which has the D (dedup) bit set.
146  * BRT may be more efficient for manual deduplication, but if the block is
147  * already in the DDT, then creating additional BRT entry would be less
148  * efficient. This clever idea was proposed by Allan Jude.
149  *
150  * Block Cloning across datasets.
151  *
152  * Block Cloning is not limited to cloning blocks within the same dataset.
153  * It is possible (and very useful) to clone blocks between different datasets.
154  * One use case is recovering files from snapshots. By cloning the files into
155  * dataset we need no additional storage. Without Block Cloning we would need
156  * additional space for those files.
157  * Another interesting use case is moving the files between datasets
158  * (copying the file content to the new dataset and removing the source file).
159  * In that case Block Cloning will only be used briefly, because the BRT entries
160  * will be removed when the source is removed.
161  * Block Cloning across encrypted datasets is supported as long as both
162  * datasets share the same master key (e.g. snapshots and clones).
163  *
164  * Block Cloning flow through ZFS layers.
165  *
166  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
167  * blocks. As of this writing no interface is implemented that allows for block
168  * cloning within a ZVOL.
169  * FreeBSD and Linux provide the copy_file_range(2) system call and we will use
170  * it for block cloning.
171  *
172  *	ssize_t
173  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
174  *	                size_t len, unsigned int flags);
175  *
176  * Even though offsets and length represent bytes, they have to be
177  * block-aligned or we will return an error so the upper layer can
178  * fallback to the generic mechanism that will just copy the data.
179  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
180  * This function was implemented based on zfs_write(), but instead of writing
181  * the given data we first read block pointers using the new dmu_read_l0_bps()
182  * function from the source file. Once we have BPs from the source file we call
183  * the dmu_brt_clone() function on the destination file. This function
184  * allocates BPs for us. We iterate over all source BPs. If the given BP is
185  * a hole or an embedded block, we just copy BP as-is. If it points to a real
186  * data we place this BP on a BRT pending list using the brt_pending_add()
187  * function.
188  *
189  * We use this pending list to keep track of all BPs that got new references
190  * within this transaction group.
191  *
192  * Some special cases to consider and how we address them:
193  * - The block we want to clone may have been created within the same
194  *   transaction group that we are trying to clone. Such block has no BP
195  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
196  * - The block we want to clone may have been modified within the same
197  *   transaction group. We return EAGAIN.
198  * - A block may be cloned multiple times during one transaction group (that's
199  *   why pending list is actually a tree and not an append-only list - this
200  *   way we can figure out faster if this block is cloned for the first time
201  *   in this txg or consecutive time).
202  * - A block may be cloned and freed within the same transaction group
203  *   (see dbuf_undirty()).
204  * - A block may be cloned and within the same transaction group the clone
205  *   can be cloned again (see dmu_read_l0_bps()).
206  * - A file might have been deleted, but the caller still has a file descriptor
207  *   open to this file and clones it.
208  *
209  * When we free a block we have an additional step in the ZIO pipeline where we
210  * call the zio_brt_free() function. We then call the brt_entry_decref()
211  * that loads the corresponding BRT entry (if one exists) and decreases
212  * reference counter. If this is not the last reference we will stop ZIO
213  * pipeline here. If this is the last reference or the block is not in the
214  * BRT, we continue the pipeline and free the block as usual.
215  *
216  * At the beginning of spa_sync() where there can be no more block cloning,
217  * but before issuing frees we call brt_pending_apply(). This function applies
218  * all the new clones to the BRT table - we load BRT entries and update
219  * reference counters. To sync new BRT entries to disk, we use brt_sync()
220  * function. This function will sync all dirty per-top-level-vdev BRTs,
221  * the entry counters arrays, etc.
222  *
223  * Block Cloning and ZIL.
224  *
225  * Every clone operation is divided into chunks (similar to write) and each
226  * chunk is cloned in a separate transaction. The chunk size is determined by
227  * how many BPs we can fit into a single ZIL entry.
228  * Replaying clone operation is different from the regular clone operation,
229  * as when we log clone operations we cannot use the source object - it may
230  * reside on a different dataset, so we log BPs we want to clone.
231  * The ZIL is replayed when we mount the given dataset, not when the pool is
232  * imported. Taking this into account it is possible that the pool is imported
233  * without mounting datasets and the source dataset is destroyed before the
234  * destination dataset is mounted and its ZIL replayed.
235  * To address this situation we leverage zil_claim() mechanism where ZFS will
236  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
237  * entries, we will bump reference counters for their BPs in the BRT.  Then
238  * on mount and ZIL replay we bump the reference counters once more, while the
239  * first references are dropped during ZIL destroy by zil_free_clone_range().
240  * It is possible that after zil_claim() we never mount the destination, so
241  * we never replay its ZIL and just destroy it.  In this case the only taken
242  * references will be dropped by zil_free_clone_range(), since the cloning is
243  * not going to ever take place.
244  */
245 
246 static kmem_cache_t *brt_entry_cache;
247 
248 /*
249  * Enable/disable prefetching of BRT entries that we are going to modify.
250  */
251 static int brt_zap_prefetch = 1;
252 
253 #ifdef ZFS_DEBUG
254 #define	BRT_DEBUG(...)	do {						\
255 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
256 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
257 	}								\
258 } while (0)
259 #else
260 #define	BRT_DEBUG(...)	do { } while (0)
261 #endif
262 
263 static int brt_zap_default_bs = 13;
264 static int brt_zap_default_ibs = 13;
265 
266 static kstat_t	*brt_ksp;
267 
268 typedef struct brt_stats {
269 	kstat_named_t brt_addref_entry_not_on_disk;
270 	kstat_named_t brt_addref_entry_on_disk;
271 	kstat_named_t brt_decref_entry_in_memory;
272 	kstat_named_t brt_decref_entry_loaded_from_disk;
273 	kstat_named_t brt_decref_entry_not_in_memory;
274 	kstat_named_t brt_decref_entry_read_lost_race;
275 	kstat_named_t brt_decref_entry_still_referenced;
276 	kstat_named_t brt_decref_free_data_later;
277 	kstat_named_t brt_decref_free_data_now;
278 	kstat_named_t brt_decref_no_entry;
279 } brt_stats_t;
280 
281 static brt_stats_t brt_stats = {
282 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
283 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
284 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
285 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
286 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
287 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
288 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
289 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
290 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
291 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
292 };
293 
294 struct {
295 	wmsum_t brt_addref_entry_not_on_disk;
296 	wmsum_t brt_addref_entry_on_disk;
297 	wmsum_t brt_decref_entry_in_memory;
298 	wmsum_t brt_decref_entry_loaded_from_disk;
299 	wmsum_t brt_decref_entry_not_in_memory;
300 	wmsum_t brt_decref_entry_read_lost_race;
301 	wmsum_t brt_decref_entry_still_referenced;
302 	wmsum_t brt_decref_free_data_later;
303 	wmsum_t brt_decref_free_data_now;
304 	wmsum_t brt_decref_no_entry;
305 } brt_sums;
306 
307 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
308 
309 static int brt_entry_compare(const void *x1, const void *x2);
310 static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
311 
312 static void
brt_rlock(spa_t * spa)313 brt_rlock(spa_t *spa)
314 {
315 	rw_enter(&spa->spa_brt_lock, RW_READER);
316 }
317 
318 static void
brt_wlock(spa_t * spa)319 brt_wlock(spa_t *spa)
320 {
321 	rw_enter(&spa->spa_brt_lock, RW_WRITER);
322 }
323 
324 static void
brt_unlock(spa_t * spa)325 brt_unlock(spa_t *spa)
326 {
327 	rw_exit(&spa->spa_brt_lock);
328 }
329 
330 static uint16_t
brt_vdev_entcount_get(const brt_vdev_t * brtvd,uint64_t idx)331 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
332 {
333 
334 	ASSERT3U(idx, <, brtvd->bv_size);
335 
336 	if (unlikely(brtvd->bv_need_byteswap)) {
337 		return (BSWAP_16(brtvd->bv_entcount[idx]));
338 	} else {
339 		return (brtvd->bv_entcount[idx]);
340 	}
341 }
342 
343 static void
brt_vdev_entcount_set(brt_vdev_t * brtvd,uint64_t idx,uint16_t entcnt)344 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
345 {
346 
347 	ASSERT3U(idx, <, brtvd->bv_size);
348 
349 	if (unlikely(brtvd->bv_need_byteswap)) {
350 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
351 	} else {
352 		brtvd->bv_entcount[idx] = entcnt;
353 	}
354 }
355 
356 static void
brt_vdev_entcount_inc(brt_vdev_t * brtvd,uint64_t idx)357 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
358 {
359 	uint16_t entcnt;
360 
361 	ASSERT3U(idx, <, brtvd->bv_size);
362 
363 	entcnt = brt_vdev_entcount_get(brtvd, idx);
364 	ASSERT(entcnt < UINT16_MAX);
365 
366 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
367 }
368 
369 static void
brt_vdev_entcount_dec(brt_vdev_t * brtvd,uint64_t idx)370 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
371 {
372 	uint16_t entcnt;
373 
374 	ASSERT3U(idx, <, brtvd->bv_size);
375 
376 	entcnt = brt_vdev_entcount_get(brtvd, idx);
377 	ASSERT(entcnt > 0);
378 
379 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
380 }
381 
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: log the state of one BRT vdev through zfs_dbgmsg().
 *
 * Prints a summary line, then every non-zero entry counter (only when
 * bv_totalcount is non-zero), then the dirty-block bitmap rendered as a
 * string of 'x' (bit set) and '.' (bit clear) when the entry counters
 * are dirty.
 */
static void
brt_vdev_dump(brt_vdev_t *brtvd)
{
	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);

	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
	    (u_longlong_t)brtvd->bv_vdevid,
	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
	    (u_longlong_t)brtvd->bv_size,
	    (u_longlong_t)brtvd->bv_totalcount,
	    (u_longlong_t)nblocks,
	    (size_t)BT_SIZEOFMAP(nblocks));

	if (brtvd->bv_totalcount > 0) {
		zfs_dbgmsg("    entcounts:");
		for (uint64_t i = 0; i < brtvd->bv_size; i++) {
			uint16_t cnt = brt_vdev_entcount_get(brtvd, i);

			if (cnt == 0)
				continue;
			zfs_dbgmsg("      [%04llu] %hu",
			    (u_longlong_t)i, cnt);
		}
	}

	if (!brtvd->bv_entcount_dirty)
		return;

	/* One character per block plus the terminating NUL. */
	char *text = kmem_alloc(nblocks + 1, KM_SLEEP);

	for (uint64_t blk = 0; blk < nblocks; blk++)
		text[blk] = BT_TEST(brtvd->bv_bitmap, blk) ? 'x' : '.';
	text[nblocks] = '\0';
	zfs_dbgmsg("    dirty: %s", text);
	kmem_free(text, nblocks + 1);
}
#endif
421 
422 static brt_vdev_t *
brt_vdev(spa_t * spa,uint64_t vdevid,boolean_t alloc)423 brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
424 {
425 	brt_vdev_t *brtvd = NULL;
426 
427 	brt_rlock(spa);
428 	if (vdevid < spa->spa_brt_nvdevs) {
429 		brtvd = spa->spa_brt_vdevs[vdevid];
430 	} else if (alloc) {
431 		/* New VDEV was added. */
432 		brt_unlock(spa);
433 		brt_wlock(spa);
434 		if (vdevid >= spa->spa_brt_nvdevs)
435 			brt_vdevs_expand(spa, vdevid + 1);
436 		brtvd = spa->spa_brt_vdevs[vdevid];
437 	}
438 	brt_unlock(spa);
439 	return (brtvd);
440 }
441 
442 static void
brt_vdev_create(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)443 brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
444 {
445 	char name[64];
446 
447 	ASSERT(brtvd->bv_initiated);
448 	ASSERT0(brtvd->bv_mos_brtvdev);
449 	ASSERT0(brtvd->bv_mos_entries);
450 
451 	uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
452 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
453 	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
454 	VERIFY(mos_entries != 0);
455 	VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
456 	    &brtvd->bv_mos_entries_dnode));
457 	dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP);
458 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
459 	brtvd->bv_mos_entries = mos_entries;
460 	rw_exit(&brtvd->bv_mos_entries_lock);
461 	BRT_DEBUG("MOS entries created, object=%llu",
462 	    (u_longlong_t)brtvd->bv_mos_entries);
463 
464 	/*
465 	 * We allocate DMU buffer to store the bv_entcount[] array.
466 	 * We will keep array size (bv_size) and cummulative count for all
467 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
468 	 */
469 	brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
470 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
471 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
472 	VERIFY(brtvd->bv_mos_brtvdev != 0);
473 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
474 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
475 
476 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
477 	    (u_longlong_t)brtvd->bv_vdevid);
478 	VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
479 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
480 	BRT_DEBUG("Pool directory object created, object=%s", name);
481 
482 	/*
483 	 * Activate the endian-fixed feature if this is the first BRT ZAP
484 	 * (i.e., BLOCK_CLONING is not yet active) and the feature is enabled.
485 	 */
486 	if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) &&
487 	    !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
488 		spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
489 	} else if (spa_feature_is_active(spa,
490 	    SPA_FEATURE_BLOCK_CLONING_ENDIAN)) {
491 		spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
492 	}
493 
494 	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
495 }
496 
497 static void
brt_vdev_realloc(spa_t * spa,brt_vdev_t * brtvd)498 brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
499 {
500 	vdev_t *vd;
501 	uint16_t *entcount;
502 	ulong_t *bitmap;
503 	uint64_t nblocks, onblocks, size;
504 
505 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
506 
507 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
508 	vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
509 	size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
510 	spa_config_exit(spa, SCL_VDEV, FTAG);
511 
512 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
513 	entcount = vmem_zalloc(nblocks * BRT_BLOCKSIZE, KM_SLEEP);
514 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
515 
516 	if (!brtvd->bv_initiated) {
517 		ASSERT0(brtvd->bv_size);
518 		ASSERT0P(brtvd->bv_entcount);
519 		ASSERT0P(brtvd->bv_bitmap);
520 	} else {
521 		ASSERT(brtvd->bv_size > 0);
522 		ASSERT(brtvd->bv_entcount != NULL);
523 		ASSERT(brtvd->bv_bitmap != NULL);
524 		/*
525 		 * TODO: Allow vdev shrinking. We only need to implement
526 		 * shrinking the on-disk BRT VDEV object.
527 		 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
528 		 *     offset, size, tx);
529 		 */
530 		ASSERT3U(brtvd->bv_size, <=, size);
531 
532 		memcpy(entcount, brtvd->bv_entcount,
533 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
534 		onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
535 		vmem_free(brtvd->bv_entcount, onblocks * BRT_BLOCKSIZE);
536 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
537 		    BT_SIZEOFMAP(onblocks)));
538 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
539 	}
540 
541 	brtvd->bv_size = size;
542 	brtvd->bv_entcount = entcount;
543 	brtvd->bv_bitmap = bitmap;
544 	if (!brtvd->bv_initiated) {
545 		brtvd->bv_need_byteswap = FALSE;
546 		brtvd->bv_initiated = TRUE;
547 		BRT_DEBUG("BRT VDEV %llu initiated.",
548 		    (u_longlong_t)brtvd->bv_vdevid);
549 	}
550 }
551 
552 static int
brt_vdev_load(spa_t * spa,brt_vdev_t * brtvd)553 brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
554 {
555 	dmu_buf_t *db;
556 	brt_vdev_phys_t *bvphys;
557 	int error;
558 
559 	ASSERT(!brtvd->bv_initiated);
560 	ASSERT(brtvd->bv_mos_brtvdev != 0);
561 
562 	error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
563 	    FTAG, &db);
564 	if (error != 0)
565 		return (error);
566 
567 	bvphys = db->db_data;
568 	if (spa->spa_brt_rangesize == 0) {
569 		spa->spa_brt_rangesize = bvphys->bvp_rangesize;
570 	} else {
571 		ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
572 	}
573 
574 	brt_vdev_realloc(spa, brtvd);
575 
576 	/* TODO: We don't support VDEV shrinking. */
577 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
578 
579 	/*
580 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
581 	 */
582 	error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
583 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
584 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO);
585 	if (error != 0)
586 		return (error);
587 
588 	ASSERT(bvphys->bvp_mos_entries != 0);
589 	VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
590 	    &brtvd->bv_mos_entries_dnode));
591 	dnode_set_storage_type(brtvd->bv_mos_entries_dnode, DMU_OT_DDT_ZAP);
592 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
593 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
594 	rw_exit(&brtvd->bv_mos_entries_lock);
595 	brtvd->bv_need_byteswap =
596 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
597 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
598 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
599 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
600 
601 	dmu_buf_rele(db, FTAG);
602 
603 	BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
604 	    (u_longlong_t)brtvd->bv_vdevid,
605 	    (u_longlong_t)brtvd->bv_mos_brtvdev,
606 	    (u_longlong_t)brtvd->bv_mos_entries);
607 	return (0);
608 }
609 
610 static void
brt_vdev_dealloc(brt_vdev_t * brtvd)611 brt_vdev_dealloc(brt_vdev_t *brtvd)
612 {
613 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
614 	ASSERT(brtvd->bv_initiated);
615 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
616 
617 	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
618 	vmem_free(brtvd->bv_entcount, nblocks * BRT_BLOCKSIZE);
619 	brtvd->bv_entcount = NULL;
620 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
621 	brtvd->bv_bitmap = NULL;
622 
623 	brtvd->bv_size = 0;
624 
625 	brtvd->bv_initiated = FALSE;
626 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
627 }
628 
629 static void
brt_vdev_destroy(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)630 brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
631 {
632 	char name[64];
633 	uint64_t count;
634 
635 	ASSERT(brtvd->bv_initiated);
636 	ASSERT(brtvd->bv_mos_brtvdev != 0);
637 	ASSERT(brtvd->bv_mos_entries != 0);
638 	ASSERT0(brtvd->bv_totalcount);
639 	ASSERT0(brtvd->bv_usedspace);
640 	ASSERT0(brtvd->bv_savedspace);
641 
642 	uint64_t mos_entries = brtvd->bv_mos_entries;
643 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
644 	brtvd->bv_mos_entries = 0;
645 	rw_exit(&brtvd->bv_mos_entries_lock);
646 	dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
647 	brtvd->bv_mos_entries_dnode = NULL;
648 	ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
649 	ASSERT0(count);
650 	VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
651 	BRT_DEBUG("MOS entries destroyed, object=%llu",
652 	    (u_longlong_t)mos_entries);
653 
654 	VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
655 	    tx));
656 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
657 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
658 	brtvd->bv_mos_brtvdev = 0;
659 	brtvd->bv_entcount_dirty = FALSE;
660 
661 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
662 	    (u_longlong_t)brtvd->bv_vdevid);
663 	VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
664 	    name, tx));
665 	BRT_DEBUG("Pool directory object removed, object=%s", name);
666 
667 	brtvd->bv_meta_dirty = FALSE;
668 
669 	rw_enter(&brtvd->bv_lock, RW_WRITER);
670 	brt_vdev_dealloc(brtvd);
671 	rw_exit(&brtvd->bv_lock);
672 
673 	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
674 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN))
675 		spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
676 }
677 
678 static void
brt_vdevs_expand(spa_t * spa,uint64_t nvdevs)679 brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
680 {
681 	brt_vdev_t **vdevs;
682 
683 	ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
684 	ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
685 
686 	if (nvdevs == spa->spa_brt_nvdevs)
687 		return;
688 
689 	vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
690 	if (spa->spa_brt_nvdevs > 0) {
691 		ASSERT(spa->spa_brt_vdevs != NULL);
692 
693 		memcpy(vdevs, spa->spa_brt_vdevs,
694 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
695 		kmem_free(spa->spa_brt_vdevs,
696 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
697 	}
698 	spa->spa_brt_vdevs = vdevs;
699 
700 	for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
701 		brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
702 		rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
703 		brtvd->bv_vdevid = vdevid;
704 		brtvd->bv_initiated = FALSE;
705 		rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
706 		avl_create(&brtvd->bv_tree, brt_entry_compare,
707 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
708 		for (int i = 0; i < TXG_SIZE; i++) {
709 			avl_create(&brtvd->bv_pending_tree[i],
710 			    brt_entry_compare, sizeof (brt_entry_t),
711 			    offsetof(brt_entry_t, bre_node));
712 		}
713 		mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
714 		spa->spa_brt_vdevs[vdevid] = brtvd;
715 	}
716 
717 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
718 	    (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
719 	spa->spa_brt_nvdevs = nvdevs;
720 }
721 
722 static boolean_t
brt_vdev_lookup(spa_t * spa,brt_vdev_t * brtvd,uint64_t offset)723 brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
724 {
725 	uint64_t idx = offset / spa->spa_brt_rangesize;
726 	if (idx < brtvd->bv_size) {
727 		/* VDEV wasn't expanded. */
728 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
729 	}
730 	return (FALSE);
731 }
732 
733 static void
brt_vdev_addref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize,uint64_t count)734 brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
735     uint64_t dsize, uint64_t count)
736 {
737 	uint64_t idx;
738 
739 	ASSERT(brtvd->bv_initiated);
740 
741 	brtvd->bv_savedspace += dsize * count;
742 	brtvd->bv_meta_dirty = TRUE;
743 
744 	if (bre->bre_count > 0)
745 		return;
746 
747 	brtvd->bv_usedspace += dsize;
748 
749 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
750 	if (idx >= brtvd->bv_size) {
751 		/* VDEV has been expanded. */
752 		rw_enter(&brtvd->bv_lock, RW_WRITER);
753 		brt_vdev_realloc(spa, brtvd);
754 		rw_exit(&brtvd->bv_lock);
755 	}
756 
757 	ASSERT3U(idx, <, brtvd->bv_size);
758 
759 	brtvd->bv_totalcount++;
760 	brt_vdev_entcount_inc(brtvd, idx);
761 	brtvd->bv_entcount_dirty = TRUE;
762 	idx = idx / BRT_BLOCKSIZE / 8;
763 	BT_SET(brtvd->bv_bitmap, idx);
764 }
765 
766 static void
brt_vdev_decref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize)767 brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
768     uint64_t dsize)
769 {
770 	uint64_t idx;
771 
772 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
773 	ASSERT(brtvd->bv_initiated);
774 
775 	brtvd->bv_savedspace -= dsize;
776 	brtvd->bv_meta_dirty = TRUE;
777 
778 	if (bre->bre_count > 0)
779 		return;
780 
781 	brtvd->bv_usedspace -= dsize;
782 
783 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
784 	ASSERT3U(idx, <, brtvd->bv_size);
785 
786 	ASSERT(brtvd->bv_totalcount > 0);
787 	brtvd->bv_totalcount--;
788 	brt_vdev_entcount_dec(brtvd, idx);
789 	brtvd->bv_entcount_dirty = TRUE;
790 	idx = idx / BRT_BLOCKSIZE / 8;
791 	BT_SET(brtvd->bv_bitmap, idx);
792 }
793 
794 static void
brt_vdev_sync(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)795 brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
796 {
797 	dmu_buf_t *db;
798 	brt_vdev_phys_t *bvphys;
799 
800 	ASSERT(brtvd->bv_meta_dirty);
801 	ASSERT(brtvd->bv_mos_brtvdev != 0);
802 	ASSERT(dmu_tx_is_syncing(tx));
803 
804 	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
805 	    FTAG, &db));
806 
807 	if (brtvd->bv_entcount_dirty) {
808 		/*
809 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
810 		 */
811 		uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
812 		dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
813 		    nblocks * BRT_BLOCKSIZE, brtvd->bv_entcount, tx,
814 		    DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO);
815 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
816 		brtvd->bv_entcount_dirty = FALSE;
817 	}
818 
819 	dmu_buf_will_dirty(db, tx);
820 	bvphys = db->db_data;
821 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
822 	bvphys->bvp_size = brtvd->bv_size;
823 	if (brtvd->bv_need_byteswap) {
824 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
825 	} else {
826 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
827 	}
828 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
829 	bvphys->bvp_rangesize = spa->spa_brt_rangesize;
830 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
831 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
832 	dmu_buf_rele(db, FTAG);
833 
834 	brtvd->bv_meta_dirty = FALSE;
835 }
836 
837 static void
brt_vdevs_free(spa_t * spa)838 brt_vdevs_free(spa_t *spa)
839 {
840 	if (spa->spa_brt_vdevs == 0)
841 		return;
842 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
843 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
844 		rw_enter(&brtvd->bv_lock, RW_WRITER);
845 		if (brtvd->bv_initiated)
846 			brt_vdev_dealloc(brtvd);
847 		rw_exit(&brtvd->bv_lock);
848 		rw_destroy(&brtvd->bv_lock);
849 		if (brtvd->bv_mos_entries != 0)
850 			dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
851 		rw_destroy(&brtvd->bv_mos_entries_lock);
852 		avl_destroy(&brtvd->bv_tree);
853 		for (int i = 0; i < TXG_SIZE; i++)
854 			avl_destroy(&brtvd->bv_pending_tree[i]);
855 		mutex_destroy(&brtvd->bv_pending_lock);
856 		kmem_free(brtvd, sizeof (*brtvd));
857 	}
858 	kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
859 	    spa->spa_brt_nvdevs);
860 }
861 
862 static void
brt_entry_fill(const blkptr_t * bp,brt_entry_t * bre,uint64_t * vdevidp)863 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
864 {
865 
866 	bre->bre_bp = *bp;
867 	bre->bre_count = 0;
868 	bre->bre_pcount = 0;
869 
870 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
871 }
872 
873 static boolean_t
brt_has_endian_fixed(spa_t * spa)874 brt_has_endian_fixed(spa_t *spa)
875 {
876 	return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN));
877 }
878 
879 static int
brt_entry_lookup(spa_t * spa,brt_vdev_t * brtvd,brt_entry_t * bre)880 brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre)
881 {
882 	uint64_t off = BRE_OFFSET(bre);
883 
884 	if (brtvd->bv_mos_entries == 0)
885 		return (SET_ERROR(ENOENT));
886 
887 	if (brt_has_endian_fixed(spa)) {
888 		return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
889 		    &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
890 		    &bre->bre_count));
891 	} else {
892 		return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
893 		    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
894 		    &bre->bre_count));
895 	}
896 }
897 
898 /*
899  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
900  * positive, but gives us quick answer if we should look into BRT, which
901  * may require reads and thus will be more expensive.
902  */
903 boolean_t
brt_maybe_exists(spa_t * spa,const blkptr_t * bp)904 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
905 {
906 
907 	if (spa->spa_brt_nvdevs == 0)
908 		return (B_FALSE);
909 
910 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
911 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
912 	if (brtvd == NULL || !brtvd->bv_initiated)
913 		return (FALSE);
914 
915 	/*
916 	 * We don't need locks here, since bv_entcount pointer must be
917 	 * stable at this point, and we don't care about false positive
918 	 * races here, while false negative should be impossible, since
919 	 * all brt_vdev_addref() have already completed by this point.
920 	 */
921 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
922 	return (brt_vdev_lookup(spa, brtvd, off));
923 }
924 
925 uint64_t
brt_get_dspace(spa_t * spa)926 brt_get_dspace(spa_t *spa)
927 {
928 	if (spa->spa_brt_nvdevs == 0)
929 		return (0);
930 
931 	brt_rlock(spa);
932 	uint64_t s = 0;
933 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
934 		s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
935 	brt_unlock(spa);
936 	return (s);
937 }
938 
939 uint64_t
brt_get_used(spa_t * spa)940 brt_get_used(spa_t *spa)
941 {
942 	if (spa->spa_brt_nvdevs == 0)
943 		return (0);
944 
945 	brt_rlock(spa);
946 	uint64_t s = 0;
947 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
948 		s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
949 	brt_unlock(spa);
950 	return (s);
951 }
952 
953 uint64_t
brt_get_saved(spa_t * spa)954 brt_get_saved(spa_t *spa)
955 {
956 	return (brt_get_dspace(spa));
957 }
958 
959 uint64_t
brt_get_ratio(spa_t * spa)960 brt_get_ratio(spa_t *spa)
961 {
962 	uint64_t used = brt_get_used(spa);
963 	if (used == 0)
964 		return (100);
965 	return ((used + brt_get_saved(spa)) * 100 / used);
966 }
967 
968 static int
brt_kstats_update(kstat_t * ksp,int rw)969 brt_kstats_update(kstat_t *ksp, int rw)
970 {
971 	brt_stats_t *bs = ksp->ks_data;
972 
973 	if (rw == KSTAT_WRITE)
974 		return (EACCES);
975 
976 	bs->brt_addref_entry_not_on_disk.value.ui64 =
977 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
978 	bs->brt_addref_entry_on_disk.value.ui64 =
979 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
980 	bs->brt_decref_entry_in_memory.value.ui64 =
981 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
982 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
983 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
984 	bs->brt_decref_entry_not_in_memory.value.ui64 =
985 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
986 	bs->brt_decref_entry_read_lost_race.value.ui64 =
987 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
988 	bs->brt_decref_entry_still_referenced.value.ui64 =
989 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
990 	bs->brt_decref_free_data_later.value.ui64 =
991 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
992 	bs->brt_decref_free_data_now.value.ui64 =
993 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
994 	bs->brt_decref_no_entry.value.ui64 =
995 	    wmsum_value(&brt_sums.brt_decref_no_entry);
996 
997 	return (0);
998 }
999 
1000 static void
brt_stat_init(void)1001 brt_stat_init(void)
1002 {
1003 
1004 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1005 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1006 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1007 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1008 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1009 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1010 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1011 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1012 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1013 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1014 
1015 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1016 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1017 	if (brt_ksp != NULL) {
1018 		brt_ksp->ks_data = &brt_stats;
1019 		brt_ksp->ks_update = brt_kstats_update;
1020 		kstat_install(brt_ksp);
1021 	}
1022 }
1023 
1024 static void
brt_stat_fini(void)1025 brt_stat_fini(void)
1026 {
1027 	if (brt_ksp != NULL) {
1028 		kstat_delete(brt_ksp);
1029 		brt_ksp = NULL;
1030 	}
1031 
1032 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1033 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1034 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1035 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1036 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1037 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1038 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1039 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1040 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1041 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1042 }
1043 
1044 void
brt_init(void)1045 brt_init(void)
1046 {
1047 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1048 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1049 
1050 	brt_stat_init();
1051 }
1052 
1053 void
brt_fini(void)1054 brt_fini(void)
1055 {
1056 	brt_stat_fini();
1057 
1058 	kmem_cache_destroy(brt_entry_cache);
1059 }
1060 
/*
 * Drop one BRT reference to the block described by bp.
 *
 * Return TRUE if block should be freed immediately.  That is the case when
 * the entries ZAP has no record for the block (ENOENT) or the entry's
 * count is already zero.  Otherwise the count is decremented, the vdev
 * accounting is updated and FALSE is returned.
 *
 * The entry is looked up in the in-memory tree first.  On a miss the vdev
 * lock is dropped around the on-disk lookup and the tree is re-checked
 * after the lock is retaken, since another thread may have inserted the
 * entry in the meantime.
 */
boolean_t
brt_entry_decref(spa_t *spa, const blkptr_t *bp)
{
	brt_entry_t *bre, *racebre;
	brt_entry_t bre_search;
	avl_index_t where;
	uint64_t vdevid;
	int error;

	brt_entry_fill(bp, &bre_search, &vdevid);

	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
	ASSERT(brtvd != NULL);

	/* Fast path: the entry is already in the in-memory tree. */
	rw_enter(&brtvd->bv_lock, RW_WRITER);
	ASSERT(brtvd->bv_initiated);
	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
	if (bre != NULL) {
		BRTSTAT_BUMP(brt_decref_entry_in_memory);
		goto out;
	} else {
		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
	}
	/* The on-disk lookup below runs without bv_lock held. */
	rw_exit(&brtvd->bv_lock);

	error = brt_entry_lookup(spa, brtvd, &bre_search);
	/* bre_search now contains correct bre_count */
	if (error == ENOENT) {
		BRTSTAT_BUMP(brt_decref_no_entry);
		return (B_TRUE);
	}
	ASSERT0(error);

	rw_enter(&brtvd->bv_lock, RW_WRITER);
	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
	if (racebre != NULL) {
		/* The entry was added when the lock was dropped. */
		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
		bre = racebre;
		goto out;
	}

	/* Cache the on-disk count in a new in-memory entry. */
	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
	bre->bre_bp = bre_search.bre_bp;
	bre->bre_count = bre_search.bre_count;
	bre->bre_pcount = 0;
	avl_insert(&brtvd->bv_tree, bre, where);

out:
	/* Reached with bv_lock held as writer and bre pointing into bv_tree. */
	if (bre->bre_count == 0) {
		rw_exit(&brtvd->bv_lock);
		BRTSTAT_BUMP(brt_decref_free_data_now);
		return (B_TRUE);
	}

	/*
	 * bre_pcount is decremented together with bre_count; brt_sync_entry()
	 * later treats bre_pcount == 0 as "no net change".
	 * NOTE(review): for an entry just loaded with bre_pcount == 0 this
	 * goes below zero; presumably intentional modular arithmetic if the
	 * field is unsigned - confirm against brt_impl.h.
	 */
	bre->bre_pcount--;
	ASSERT(bre->bre_count > 0);
	bre->bre_count--;
	if (bre->bre_count == 0)
		BRTSTAT_BUMP(brt_decref_free_data_later);
	else
		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
	brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));

	rw_exit(&brtvd->bv_lock);

	return (B_FALSE);
}
1131 
1132 uint64_t
brt_entry_get_refcount(spa_t * spa,const blkptr_t * bp)1133 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1134 {
1135 	brt_entry_t bre_search, *bre;
1136 	uint64_t vdevid, refcnt;
1137 	int error;
1138 
1139 	brt_entry_fill(bp, &bre_search, &vdevid);
1140 
1141 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1142 	ASSERT(brtvd != NULL);
1143 
1144 	rw_enter(&brtvd->bv_lock, RW_READER);
1145 	ASSERT(brtvd->bv_initiated);
1146 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1147 	if (bre == NULL) {
1148 		rw_exit(&brtvd->bv_lock);
1149 		error = brt_entry_lookup(spa, brtvd, &bre_search);
1150 		if (error == ENOENT) {
1151 			refcnt = 0;
1152 		} else {
1153 			ASSERT0(error);
1154 			refcnt = bre_search.bre_count;
1155 		}
1156 	} else {
1157 		refcnt = bre->bre_count;
1158 		rw_exit(&brtvd->bv_lock);
1159 	}
1160 
1161 	return (refcnt);
1162 }
1163 
1164 static void
brt_prefetch(brt_vdev_t * brtvd,const blkptr_t * bp)1165 brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
1166 {
1167 	if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
1168 		return;
1169 
1170 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1171 	rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1172 	if (brtvd->bv_mos_entries != 0) {
1173 		(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1174 		    &off, BRT_KEY_WORDS);
1175 	}
1176 	rw_exit(&brtvd->bv_mos_entries_lock);
1177 }
1178 
1179 static int
brt_entry_compare(const void * x1,const void * x2)1180 brt_entry_compare(const void *x1, const void *x2)
1181 {
1182 	const brt_entry_t *bre1 = x1, *bre2 = x2;
1183 	const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
1184 
1185 	return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1186 	    DVA_GET_OFFSET(&bp2->blk_dva[0])));
1187 }
1188 
1189 void
brt_pending_add(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1190 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1191 {
1192 	brt_entry_t *bre, *newbre;
1193 	avl_index_t where;
1194 	uint64_t txg;
1195 
1196 	txg = dmu_tx_get_txg(tx);
1197 	ASSERT3U(txg, !=, 0);
1198 
1199 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1200 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1201 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1202 
1203 	newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1204 	newbre->bre_bp = *bp;
1205 	newbre->bre_count = 0;
1206 	newbre->bre_pcount = 1;
1207 
1208 	mutex_enter(&brtvd->bv_pending_lock);
1209 	bre = avl_find(pending_tree, newbre, &where);
1210 	if (bre == NULL) {
1211 		avl_insert(pending_tree, newbre, where);
1212 		newbre = NULL;
1213 	} else {
1214 		bre->bre_pcount++;
1215 	}
1216 	mutex_exit(&brtvd->bv_pending_lock);
1217 
1218 	if (newbre != NULL) {
1219 		ASSERT(bre != NULL);
1220 		ASSERT(bre != newbre);
1221 		kmem_cache_free(brt_entry_cache, newbre);
1222 	} else {
1223 		ASSERT0P(bre);
1224 
1225 		/* Prefetch BRT entry for the syncing context. */
1226 		brt_prefetch(brtvd, bp);
1227 	}
1228 }
1229 
1230 void
brt_pending_remove(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1231 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1232 {
1233 	brt_entry_t *bre, bre_search;
1234 	uint64_t txg;
1235 
1236 	txg = dmu_tx_get_txg(tx);
1237 	ASSERT3U(txg, !=, 0);
1238 
1239 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1240 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1241 	ASSERT(brtvd != NULL);
1242 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1243 
1244 	bre_search.bre_bp = *bp;
1245 
1246 	mutex_enter(&brtvd->bv_pending_lock);
1247 	bre = avl_find(pending_tree, &bre_search, NULL);
1248 	ASSERT(bre != NULL);
1249 	ASSERT(bre->bre_pcount > 0);
1250 	bre->bre_pcount--;
1251 	if (bre->bre_pcount == 0)
1252 		avl_remove(pending_tree, bre);
1253 	else
1254 		bre = NULL;
1255 	mutex_exit(&brtvd->bv_pending_lock);
1256 
1257 	if (bre)
1258 		kmem_cache_free(brt_entry_cache, bre);
1259 }
1260 
/*
 * Turn the pending clone references collected for `txg` on one vdev into
 * real BRT references.
 *
 * The pending tree for the txg is swapped into bv_tree and processed in
 * two passes:
 *  1. entries whose block has the DEDUP bit set are handed to the DDT
 *     where possible, and for the rest the current on-disk reference
 *     count is fetched when the entcount says an entry may exist;
 *  2. the pending counts are added to the vdev accounting and to
 *     bre_count.
 */
static void
brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
{
	brt_entry_t *bre, *nbre;

	/*
	 * We are in syncing context, so no other bv_pending_tree accesses
	 * are possible for the TXG.  So we don't need bv_pending_lock.
	 */
	ASSERT(avl_is_empty(&brtvd->bv_tree));
	avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);

	/* The successor is fetched up front since bre may be removed. */
	for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
		nbre = AVL_NEXT(&brtvd->bv_tree, bre);

		/*
		 * If the block has DEDUP bit set, it means that it
		 * already exists in the DEDUP table, so we can just
		 * use that instead of creating new entry in the BRT.
		 */
		if (BP_GET_DEDUP(&bre->bre_bp)) {
			/* Move references to the DDT until it refuses one. */
			while (bre->bre_pcount > 0) {
				if (!ddt_addref(spa, &bre->bre_bp))
					break;
				bre->bre_pcount--;
			}
			/* Everything went to the DDT; drop the BRT entry. */
			if (bre->bre_pcount == 0) {
				avl_remove(&brtvd->bv_tree, bre);
				kmem_cache_free(brt_entry_cache, bre);
				continue;
			}
		}

		/*
		 * Unless we know that the block is definitely not in ZAP,
		 * try to get its reference count from there.
		 */
		uint64_t off = BRE_OFFSET(bre);
		if (brtvd->bv_mos_entries != 0 &&
		    brt_vdev_lookup(spa, brtvd, off)) {
			int error;
			/* Same size/count layout choice as brt_entry_lookup(). */
			if (brt_has_endian_fixed(spa)) {
				error = zap_lookup_uint64_by_dnode(
				    brtvd->bv_mos_entries_dnode, &off,
				    BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
				    &bre->bre_count);
			} else {
				error = zap_lookup_uint64_by_dnode(
				    brtvd->bv_mos_entries_dnode, &off,
				    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
				    &bre->bre_count);
			}
			if (error == 0) {
				BRTSTAT_BUMP(brt_addref_entry_on_disk);
			} else {
				ASSERT3U(error, ==, ENOENT);
				BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
			}
		}
	}

	/*
	 * If all the cloned blocks we had were handled by DDT, we don't need
	 * to initiate the vdev.
	 */
	if (avl_is_empty(&brtvd->bv_tree))
		return;

	/* First use of this vdev: set up its in-core arrays. */
	if (!brtvd->bv_initiated) {
		rw_enter(&brtvd->bv_lock, RW_WRITER);
		brt_vdev_realloc(spa, brtvd);
		rw_exit(&brtvd->bv_lock);
	}

	/*
	 * Convert pending references into proper ones.  This has to be a
	 * separate loop, since entcount modifications would cause false
	 * positives for brt_vdev_lookup() on following iterations.
	 */
	for (bre = avl_first(&brtvd->bv_tree); bre;
	    bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
		/* addref must see the old bre_count, so it runs first. */
		brt_vdev_addref(spa, brtvd, bre,
		    bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
		bre->bre_count += bre->bre_pcount;
	}
}
1347 
1348 void
brt_pending_apply(spa_t * spa,uint64_t txg)1349 brt_pending_apply(spa_t *spa, uint64_t txg)
1350 {
1351 
1352 	brt_rlock(spa);
1353 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1354 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1355 		brt_unlock(spa);
1356 
1357 		brt_pending_apply_vdev(spa, brtvd, txg);
1358 
1359 		brt_rlock(spa);
1360 	}
1361 	brt_unlock(spa);
1362 }
1363 
1364 static void
brt_sync_entry(spa_t * spa,dnode_t * dn,brt_entry_t * bre,dmu_tx_t * tx)1365 brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1366 {
1367 	uint64_t off = BRE_OFFSET(bre);
1368 
1369 	if (bre->bre_pcount == 0) {
1370 		/* The net change is zero, nothing to do in ZAP. */
1371 	} else if (bre->bre_count == 0) {
1372 		int error = zap_remove_uint64_by_dnode(dn, &off,
1373 		    BRT_KEY_WORDS, tx);
1374 		VERIFY(error == 0 || error == ENOENT);
1375 	} else {
1376 		if (brt_has_endian_fixed(spa)) {
1377 			VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1378 			    BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
1379 			    &bre->bre_count, tx));
1380 		} else {
1381 			VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1382 			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1383 			    &bre->bre_count, tx));
1384 		}
1385 	}
1386 }
1387 
/*
 * Sync every BRT vdev whose metadata is dirty.
 *
 * For each such vdev: create its MOS object if it has none yet, push all
 * in-memory entries through brt_sync_entry() and free them, then either
 * destroy the vdev's BRT object (no entries left) or write its state with
 * brt_vdev_sync().
 *
 * The BRT read lock is held only while indexing spa_brt_vdevs and testing
 * the loop condition; it is dropped for the per-vdev work and retaken
 * before each next iteration.
 */
static void
brt_sync_table(spa_t *spa, dmu_tx_t *tx)
{
	brt_entry_t *bre;

	brt_rlock(spa);
	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
		brt_unlock(spa);

		if (!brtvd->bv_meta_dirty) {
			ASSERT(!brtvd->bv_entcount_dirty);
			ASSERT0(avl_numnodes(&brtvd->bv_tree));
			/* Retake the lock for the loop condition. */
			brt_rlock(spa);
			continue;
		}

		ASSERT(!brtvd->bv_entcount_dirty ||
		    avl_numnodes(&brtvd->bv_tree) != 0);

		/* No MOS object recorded for this vdev yet: create it. */
		if (brtvd->bv_mos_brtvdev == 0)
			brt_vdev_create(spa, brtvd, tx);

		/* Drain the tree, writing each entry out before freeing it. */
		void *c = NULL;
		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
			brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre,
			    tx);
			kmem_cache_free(brt_entry_cache, bre);
		}

#ifdef ZFS_DEBUG
		if (zfs_flags & ZFS_DEBUG_BRT)
			brt_vdev_dump(brtvd);
#endif
		/* Nothing referenced anymore: drop the on-disk object. */
		if (brtvd->bv_totalcount == 0)
			brt_vdev_destroy(spa, brtvd, tx);
		else
			brt_vdev_sync(spa, brtvd, tx);
		brt_rlock(spa);
	}
	brt_unlock(spa);
}
1430 
1431 void
brt_sync(spa_t * spa,uint64_t txg)1432 brt_sync(spa_t *spa, uint64_t txg)
1433 {
1434 	dmu_tx_t *tx;
1435 	uint64_t vdevid;
1436 
1437 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
1438 
1439 	brt_rlock(spa);
1440 	for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1441 		if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1442 			break;
1443 	}
1444 	if (vdevid >= spa->spa_brt_nvdevs) {
1445 		brt_unlock(spa);
1446 		return;
1447 	}
1448 	brt_unlock(spa);
1449 
1450 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1451 	brt_sync_table(spa, tx);
1452 	dmu_tx_commit(tx);
1453 }
1454 
1455 static void
brt_alloc(spa_t * spa)1456 brt_alloc(spa_t *spa)
1457 {
1458 	rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1459 	spa->spa_brt_vdevs = NULL;
1460 	spa->spa_brt_nvdevs = 0;
1461 	spa->spa_brt_rangesize = 0;
1462 }
1463 
1464 void
brt_create(spa_t * spa)1465 brt_create(spa_t *spa)
1466 {
1467 	brt_alloc(spa);
1468 	spa->spa_brt_rangesize = BRT_RANGESIZE;
1469 }
1470 
1471 int
brt_load(spa_t * spa)1472 brt_load(spa_t *spa)
1473 {
1474 	int error = 0;
1475 
1476 	brt_alloc(spa);
1477 	brt_wlock(spa);
1478 	for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1479 	    vdevid++) {
1480 		char name[64];
1481 		uint64_t mos_brtvdev;
1482 
1483 		/* Look if this vdev had active block cloning. */
1484 		snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1485 		    (u_longlong_t)vdevid);
1486 		error = zap_lookup(spa->spa_meta_objset,
1487 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1488 		    &mos_brtvdev);
1489 		if (error == ENOENT) {
1490 			error = 0;
1491 			continue;
1492 		}
1493 		if (error != 0)
1494 			break;
1495 
1496 		/* If it did, then allocate them all and load this one. */
1497 		brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1498 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1499 		rw_enter(&brtvd->bv_lock, RW_WRITER);
1500 		brtvd->bv_mos_brtvdev = mos_brtvdev;
1501 		error = brt_vdev_load(spa, brtvd);
1502 		rw_exit(&brtvd->bv_lock);
1503 		if (error != 0)
1504 			break;
1505 	}
1506 
1507 	if (spa->spa_brt_rangesize == 0)
1508 		spa->spa_brt_rangesize = BRT_RANGESIZE;
1509 	brt_unlock(spa);
1510 	return (error);
1511 }
1512 
1513 void
brt_prefetch_all(spa_t * spa)1514 brt_prefetch_all(spa_t *spa)
1515 {
1516 	/*
1517 	 * Load all BRT entries for each vdev. This is intended to perform
1518 	 * a prefetch on all such blocks. For the same reason that brt_prefetch
1519 	 * (called from brt_pending_add) isn't locked, this is also not locked.
1520 	 */
1521 	brt_rlock(spa);
1522 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1523 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1524 		brt_unlock(spa);
1525 
1526 		rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1527 		if (brtvd->bv_mos_entries != 0) {
1528 			(void) zap_prefetch_object(spa->spa_meta_objset,
1529 			    brtvd->bv_mos_entries);
1530 		}
1531 		rw_exit(&brtvd->bv_mos_entries_lock);
1532 
1533 		brt_rlock(spa);
1534 	}
1535 	brt_unlock(spa);
1536 }
1537 
1538 void
brt_unload(spa_t * spa)1539 brt_unload(spa_t *spa)
1540 {
1541 	if (spa->spa_brt_rangesize == 0)
1542 		return;
1543 	brt_vdevs_free(spa);
1544 	rw_destroy(&spa->spa_brt_lock);
1545 	spa->spa_brt_rangesize = 0;
1546 }
1547 
/*
 * Tunables registered under the zfs_brt scope.  brt_zap_prefetch gates
 * the ZAP prefetch done by brt_prefetch(); the two blockshift values are
 * described below as the leaf and indirect blockshift of the BRT ZAP.
 * NOTE(review): ZMOD_RW presumably makes them writable at runtime -
 * confirm against the ZFS_MODULE_PARAM definition.
 */
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
	"Enable prefetching of BRT ZAP entries");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
	"BRT ZAP leaf blockshift");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
	"BRT ZAP indirect blockshift");
1554