xref: /freebsd/sys/contrib/openzfs/module/zfs/brt.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42 
43 /*
44  * Block Cloning design.
45  *
46  * Block Cloning allows to manually clone a file (or a subset of its blocks)
47  * into another (or the same) file by just creating additional references to
48  * the data blocks without copying the data itself. Those references are kept
49  * in the Block Reference Tables (BRTs).
50  *
51  * In many ways this is similar to the existing deduplication, but there are
52  * some important differences:
53  *
54  * - Deduplication is automatic and Block Cloning is not - one has to use a
55  *   dedicated system call(s) to clone the given file/blocks.
56  * - Deduplication keeps all data blocks in its table, even those referenced
57  *   just once. Block Cloning creates an entry in its tables only when there
58  *   are at least two references to the given data block. If the block was
59  *   never explicitly cloned or the second to last reference was dropped,
60  *   there will be neither space nor performance overhead.
61  * - Deduplication needs data to work - one needs to pass real data to the
62  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63  *   data, just block pointers to the data, so it is extremely fast, as we pay
64  *   neither the cost of reading the data, nor the cost of writing the data -
65  *   we operate exclusively on metadata.
66  * - If the D (dedup) bit is not set in the block pointer, it means that
67  *   the block is not in the dedup table (DDT) and we won't consult the DDT
68  *   when we need to free the block. Block Cloning must be consulted on every
69  *   free, because we cannot modify the source BP (eg. by setting something
70  *   similar to the D bit), thus we have no hint if the block is in the
71  *   Block Reference Table (BRT), so we need to look into the BRT. There is
72  *   an optimization in place that allows us to eliminate the majority of BRT
73  *   lookups which is described below in the "Minimizing free penalty" section.
74  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75  *   64bit offset and 64bit reference counter.
76  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77  *   other on disk are most likely in totally different parts of the DDT.
78  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79  *   from one file should have BRT entries close to each other.
80  * - Scrub will only do a single pass over a block that is referenced multiple
81  *   times in the DDT. Unfortunately it is not currently (if at all) possible
82  *   with Block Cloning and block referenced multiple times will be scrubbed
83  *   multiple times. The new, sorted scrub should be able to eliminate
84  *   duplicated reads given enough memory.
85  * - Deduplication requires cryptographically strong hash as a checksum or
86  *   additional data verification. Block Cloning works with any checksum
87  *   algorithm or even with checksumming disabled.
88  *
89  * As mentioned above, the BRT entries are much smaller than the DDT entries.
90  * To uniquely identify a block we just need its vdev id and offset. We also
91  * need to maintain a reference counter. The vdev id will often repeat, as there
92  * is a small number of top-level VDEVs and a large number of blocks stored in
93  * each VDEV. We take advantage of that to reduce the BRT entry size further by
94  * maintaining one BRT for each top-level VDEV, so we can then have only offset
95  * and counter as the BRT entry.
96  *
97  * Minimizing free penalty.
98  *
99  * Block Cloning allows creating additional references to any existing block.
100  * When we free a block there is no hint in the block pointer whether the block
101  * was cloned or not, so on each free we have to check if there is a
102  * corresponding entry in the BRT or not. If there is, we need to decrease
103  * the reference counter. Doing BRT lookup on every free can potentially be
104  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105  * This is the main problem with deduplication, so we've learned our lesson and
106  * try not to repeat the same mistake here. How do we do that? We divide each
107  * top-level VDEV into 16MB regions. For each region we maintain a counter that
108  * is a sum of all the BRT entries that have offsets within the region. This
109  * creates the entries count array of 16bit numbers for each top-level VDEV.
110  * The entries count array is always kept in memory and updated on disk in the
111  * same transaction group as the BRT updates to keep everything in-sync. We can
112  * keep the array in memory, because it is very small. With 16MB regions and
113  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114  * the region size even further in the future). Now, when we want to free
115  * a block, we first consult the array. If the counter for the whole region is
116  * zero, there is no need to look for the BRT entry, as there isn't one for
117  * sure. If the counter for the region is greater than zero, only then we will
118  * do a BRT lookup and if an entry is found we will decrease the reference
119  * counter in the BRT entry and in the entry counters array.
120  *
121  * The entry counters array is small, but can potentially be larger for very
122  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123  * array on every change. We then divide the array into 32kB block and keep
124  * a bitmap of dirty blocks within a transaction group. When we sync the
125  * transaction group we can only update the parts of the entry counters array
126  * that were modified. Note: Keeping track of the dirty parts of the entry
127  * counters array is implemented, but updating only parts of the array on disk
128  * is not yet implemented - for now we will update entire array if there was
129  * any change.
130  *
131  * The implementation tries to be economic: if BRT is not used, or no longer
132  * used, there will be no entries in the MOS and no additional memory used (eg.
133  * the entry counters array is only allocated if needed).
134  *
135  * Interaction between Deduplication and Block Cloning.
136  *
137  * If both functionalities are in use, we could end up with a block that is
138  * referenced multiple times in both DDT and BRT. When we free one of the
139  * references we couldn't tell where it belongs, so we would have to decide
140  * what table takes the precedence: do we first clear DDT references or BRT
141  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143  * lookup DDT entry instead and increase the counter there. No BRT entry
144  * will be created for a block which has the D (dedup) bit set.
145  * BRT may be more efficient for manual deduplication, but if the block is
146  * already in the DDT, then creating additional BRT entry would be less
147  * efficient. This clever idea was proposed by Allan Jude.
148  *
149  * Block Cloning across datasets.
150  *
151  * Block Cloning is not limited to cloning blocks within the same dataset.
152  * It is possible (and very useful) to clone blocks between different datasets.
153  * One use case is recovering files from snapshots. By cloning the files into
154  * dataset we need no additional storage. Without Block Cloning we would need
155  * additional space for those files.
156  * Another interesting use case is moving the files between datasets
157  * (copying the file content to the new dataset and removing the source file).
158  * In that case Block Cloning will only be used briefly, because the BRT entries
159  * will be removed when the source is removed.
160  * Block Cloning across encrypted datasets is supported as long as both
161  * datasets share the same master key (e.g. snapshots and clones)
162  *
163  * Block Cloning flow through ZFS layers.
164  *
165  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
166  * blocks. As of this writing no interface is implemented that allows for block
167  * cloning within a ZVOL.
168  * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
169  * for blocking cloning.
170  *
171  *	ssize_t
172  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
173  *	                size_t len, unsigned int flags);
174  *
175  * Even though offsets and length represent bytes, they have to be
176  * block-aligned or we will return an error so the upper layer can
177  * fallback to the generic mechanism that will just copy the data.
178  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
179  * This function was implemented based on zfs_write(), but instead of writing
180  * the given data we first read block pointers using the new dmu_read_l0_bps()
181  * function from the source file. Once we have BPs from the source file we call
182  * the dmu_brt_clone() function on the destination file. This function
183  * allocates BPs for us. We iterate over all source BPs. If the given BP is
184  * a hole or an embedded block, we just copy BP as-is. If it points to a real
185  * data we place this BP on a BRT pending list using the brt_pending_add()
186  * function.
187  *
188  * We use this pending list to keep track of all BPs that got new references
189  * within this transaction group.
190  *
191  * Some special cases to consider and how we address them:
192  * - The block we want to clone may have been created within the same
193  *   transaction group that we are trying to clone. Such block has no BP
194  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
195  * - The block we want to clone may have been modified within the same
196  *   transaction group. We return EAGAIN.
197  * - A block may be cloned multiple times during one transaction group (that's
198  *   why pending list is actually a tree and not an append-only list - this
199  *   way we can figure out faster if this block is cloned for the first time
200  *   in this txg or consecutive time).
201  * - A block may be cloned and freed within the same transaction group
202  *   (see dbuf_undirty()).
203  * - A block may be cloned and within the same transaction group the clone
204  *   can be cloned again (see dmu_read_l0_bps()).
205  * - A file might have been deleted, but the caller still has a file descriptor
206  *   open to this file and clones it.
207  *
208  * When we free a block we have an additional step in the ZIO pipeline where we
209  * call the zio_brt_free() function. We then call the brt_entry_decref()
210  * that loads the corresponding BRT entry (if one exists) and decreases
211  * reference counter. If this is not the last reference we will stop ZIO
212  * pipeline here. If this is the last reference or the block is not in the
213  * BRT, we continue the pipeline and free the block as usual.
214  *
215  * At the beginning of spa_sync() where there can be no more block cloning,
216  * but before issuing frees we call brt_pending_apply(). This function applies
217  * all the new clones to the BRT table - we load BRT entries and update
218  * reference counters. To sync new BRT entries to disk, we use brt_sync()
219  * function. This function will sync all dirty per-top-level-vdev BRTs,
220  * the entry counters arrays, etc.
221  *
222  * Block Cloning and ZIL.
223  *
224  * Every clone operation is divided into chunks (similar to write) and each
225  * chunk is cloned in a separate transaction. The chunk size is determined by
226  * how many BPs we can fit into a single ZIL entry.
227  * Replaying clone operation is different from the regular clone operation,
228  * as when we log clone operations we cannot use the source object - it may
229  * reside on a different dataset, so we log BPs we want to clone.
230  * The ZIL is replayed when we mount the given dataset, not when the pool is
231  * imported. Taking this into account it is possible that the pool is imported
232  * without mounting datasets and the source dataset is destroyed before the
233  * destination dataset is mounted and its ZIL replayed.
234  * To address this situation we leverage zil_claim() mechanism where ZFS will
235  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236  * entries, we will bump reference counters for their BPs in the BRT.  Then
237  * on mount and ZIL replay we bump the reference counters once more, while the
238  * first references are dropped during ZIL destroy by zil_free_clone_range().
239  * It is possible that after zil_claim() we never mount the destination, so
240  * we never replay its ZIL and just destroy it.  In this case the only taken
241  * references will be dropped by zil_free_clone_range(), since the cloning is
242  * not going to ever take place.
243  */
244 
245 static kmem_cache_t *brt_entry_cache;
246 
247 /*
248  * Enable/disable prefetching of BRT entries that we are going to modify.
249  */
250 static int brt_zap_prefetch = 1;
251 
252 #ifdef ZFS_DEBUG
253 #define	BRT_DEBUG(...)	do {						\
254 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
255 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
256 	}								\
257 } while (0)
258 #else
259 #define	BRT_DEBUG(...)	do { } while (0)
260 #endif
261 
262 static int brt_zap_default_bs = 12;
263 static int brt_zap_default_ibs = 12;
264 
265 static kstat_t	*brt_ksp;
266 
267 typedef struct brt_stats {
268 	kstat_named_t brt_addref_entry_not_on_disk;
269 	kstat_named_t brt_addref_entry_on_disk;
270 	kstat_named_t brt_decref_entry_in_memory;
271 	kstat_named_t brt_decref_entry_loaded_from_disk;
272 	kstat_named_t brt_decref_entry_not_in_memory;
273 	kstat_named_t brt_decref_entry_read_lost_race;
274 	kstat_named_t brt_decref_entry_still_referenced;
275 	kstat_named_t brt_decref_free_data_later;
276 	kstat_named_t brt_decref_free_data_now;
277 	kstat_named_t brt_decref_no_entry;
278 } brt_stats_t;
279 
280 static brt_stats_t brt_stats = {
281 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
282 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
283 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
284 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
285 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
286 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
287 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
288 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
289 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
290 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
291 };
292 
293 struct {
294 	wmsum_t brt_addref_entry_not_on_disk;
295 	wmsum_t brt_addref_entry_on_disk;
296 	wmsum_t brt_decref_entry_in_memory;
297 	wmsum_t brt_decref_entry_loaded_from_disk;
298 	wmsum_t brt_decref_entry_not_in_memory;
299 	wmsum_t brt_decref_entry_read_lost_race;
300 	wmsum_t brt_decref_entry_still_referenced;
301 	wmsum_t brt_decref_free_data_later;
302 	wmsum_t brt_decref_free_data_now;
303 	wmsum_t brt_decref_no_entry;
304 } brt_sums;
305 
306 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
307 
308 static int brt_entry_compare(const void *x1, const void *x2);
309 static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
310 
311 static void
brt_rlock(spa_t * spa)312 brt_rlock(spa_t *spa)
313 {
314 	rw_enter(&spa->spa_brt_lock, RW_READER);
315 }
316 
317 static void
brt_wlock(spa_t * spa)318 brt_wlock(spa_t *spa)
319 {
320 	rw_enter(&spa->spa_brt_lock, RW_WRITER);
321 }
322 
323 static void
brt_unlock(spa_t * spa)324 brt_unlock(spa_t *spa)
325 {
326 	rw_exit(&spa->spa_brt_lock);
327 }
328 
329 static uint16_t
brt_vdev_entcount_get(const brt_vdev_t * brtvd,uint64_t idx)330 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
331 {
332 
333 	ASSERT3U(idx, <, brtvd->bv_size);
334 
335 	if (unlikely(brtvd->bv_need_byteswap)) {
336 		return (BSWAP_16(brtvd->bv_entcount[idx]));
337 	} else {
338 		return (brtvd->bv_entcount[idx]);
339 	}
340 }
341 
342 static void
brt_vdev_entcount_set(brt_vdev_t * brtvd,uint64_t idx,uint16_t entcnt)343 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
344 {
345 
346 	ASSERT3U(idx, <, brtvd->bv_size);
347 
348 	if (unlikely(brtvd->bv_need_byteswap)) {
349 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
350 	} else {
351 		brtvd->bv_entcount[idx] = entcnt;
352 	}
353 }
354 
355 static void
brt_vdev_entcount_inc(brt_vdev_t * brtvd,uint64_t idx)356 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
357 {
358 	uint16_t entcnt;
359 
360 	ASSERT3U(idx, <, brtvd->bv_size);
361 
362 	entcnt = brt_vdev_entcount_get(brtvd, idx);
363 	ASSERT(entcnt < UINT16_MAX);
364 
365 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
366 }
367 
368 static void
brt_vdev_entcount_dec(brt_vdev_t * brtvd,uint64_t idx)369 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
370 {
371 	uint16_t entcnt;
372 
373 	ASSERT3U(idx, <, brtvd->bv_size);
374 
375 	entcnt = brt_vdev_entcount_get(brtvd, idx);
376 	ASSERT(entcnt > 0);
377 
378 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
379 }
380 
381 #ifdef ZFS_DEBUG
382 static void
brt_vdev_dump(brt_vdev_t * brtvd)383 brt_vdev_dump(brt_vdev_t *brtvd)
384 {
385 	uint64_t idx;
386 
387 	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
388 	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
389 	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
390 	    (u_longlong_t)brtvd->bv_vdevid,
391 	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
392 	    (u_longlong_t)brtvd->bv_size,
393 	    (u_longlong_t)brtvd->bv_totalcount,
394 	    (u_longlong_t)nblocks,
395 	    (size_t)BT_SIZEOFMAP(nblocks));
396 	if (brtvd->bv_totalcount > 0) {
397 		zfs_dbgmsg("    entcounts:");
398 		for (idx = 0; idx < brtvd->bv_size; idx++) {
399 			uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
400 			if (entcnt > 0) {
401 				zfs_dbgmsg("      [%04llu] %hu",
402 				    (u_longlong_t)idx, entcnt);
403 			}
404 		}
405 	}
406 	if (brtvd->bv_entcount_dirty) {
407 		char *bitmap;
408 
409 		bitmap = kmem_alloc(nblocks + 1, KM_SLEEP);
410 		for (idx = 0; idx < nblocks; idx++) {
411 			bitmap[idx] =
412 			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
413 		}
414 		bitmap[idx] = '\0';
415 		zfs_dbgmsg("    dirty: %s", bitmap);
416 		kmem_free(bitmap, nblocks + 1);
417 	}
418 }
419 #endif
420 
421 static brt_vdev_t *
brt_vdev(spa_t * spa,uint64_t vdevid,boolean_t alloc)422 brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
423 {
424 	brt_vdev_t *brtvd = NULL;
425 
426 	brt_rlock(spa);
427 	if (vdevid < spa->spa_brt_nvdevs) {
428 		brtvd = spa->spa_brt_vdevs[vdevid];
429 	} else if (alloc) {
430 		/* New VDEV was added. */
431 		brt_unlock(spa);
432 		brt_wlock(spa);
433 		if (vdevid >= spa->spa_brt_nvdevs)
434 			brt_vdevs_expand(spa, vdevid + 1);
435 		brtvd = spa->spa_brt_vdevs[vdevid];
436 	}
437 	brt_unlock(spa);
438 	return (brtvd);
439 }
440 
441 static void
brt_vdev_create(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)442 brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
443 {
444 	char name[64];
445 
446 	ASSERT(brtvd->bv_initiated);
447 	ASSERT0(brtvd->bv_mos_brtvdev);
448 	ASSERT0(brtvd->bv_mos_entries);
449 
450 	uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
451 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
452 	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
453 	VERIFY(mos_entries != 0);
454 	VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
455 	    &brtvd->bv_mos_entries_dnode));
456 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
457 	brtvd->bv_mos_entries = mos_entries;
458 	rw_exit(&brtvd->bv_mos_entries_lock);
459 	BRT_DEBUG("MOS entries created, object=%llu",
460 	    (u_longlong_t)brtvd->bv_mos_entries);
461 
462 	/*
463 	 * We allocate DMU buffer to store the bv_entcount[] array.
464 	 * We will keep array size (bv_size) and cummulative count for all
465 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
466 	 */
467 	brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
468 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
469 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
470 	VERIFY(brtvd->bv_mos_brtvdev != 0);
471 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
472 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
473 
474 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
475 	    (u_longlong_t)brtvd->bv_vdevid);
476 	VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
477 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
478 	BRT_DEBUG("Pool directory object created, object=%s", name);
479 
480 	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
481 }
482 
483 static void
brt_vdev_realloc(spa_t * spa,brt_vdev_t * brtvd)484 brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
485 {
486 	vdev_t *vd;
487 	uint16_t *entcount;
488 	ulong_t *bitmap;
489 	uint64_t nblocks, onblocks, size;
490 
491 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
492 
493 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
494 	vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
495 	size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
496 	spa_config_exit(spa, SCL_VDEV, FTAG);
497 
498 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
499 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
500 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
501 
502 	if (!brtvd->bv_initiated) {
503 		ASSERT0(brtvd->bv_size);
504 		ASSERT0P(brtvd->bv_entcount);
505 		ASSERT0P(brtvd->bv_bitmap);
506 	} else {
507 		ASSERT(brtvd->bv_size > 0);
508 		ASSERT(brtvd->bv_entcount != NULL);
509 		ASSERT(brtvd->bv_bitmap != NULL);
510 		/*
511 		 * TODO: Allow vdev shrinking. We only need to implement
512 		 * shrinking the on-disk BRT VDEV object.
513 		 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
514 		 *     offset, size, tx);
515 		 */
516 		ASSERT3U(brtvd->bv_size, <=, size);
517 
518 		memcpy(entcount, brtvd->bv_entcount,
519 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
520 		vmem_free(brtvd->bv_entcount,
521 		    sizeof (entcount[0]) * brtvd->bv_size);
522 		onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
523 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
524 		    BT_SIZEOFMAP(onblocks)));
525 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
526 	}
527 
528 	brtvd->bv_size = size;
529 	brtvd->bv_entcount = entcount;
530 	brtvd->bv_bitmap = bitmap;
531 	if (!brtvd->bv_initiated) {
532 		brtvd->bv_need_byteswap = FALSE;
533 		brtvd->bv_initiated = TRUE;
534 		BRT_DEBUG("BRT VDEV %llu initiated.",
535 		    (u_longlong_t)brtvd->bv_vdevid);
536 	}
537 }
538 
539 static int
brt_vdev_load(spa_t * spa,brt_vdev_t * brtvd)540 brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
541 {
542 	dmu_buf_t *db;
543 	brt_vdev_phys_t *bvphys;
544 	int error;
545 
546 	ASSERT(!brtvd->bv_initiated);
547 	ASSERT(brtvd->bv_mos_brtvdev != 0);
548 
549 	error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
550 	    FTAG, &db);
551 	if (error != 0)
552 		return (error);
553 
554 	bvphys = db->db_data;
555 	if (spa->spa_brt_rangesize == 0) {
556 		spa->spa_brt_rangesize = bvphys->bvp_rangesize;
557 	} else {
558 		ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
559 	}
560 
561 	brt_vdev_realloc(spa, brtvd);
562 
563 	/* TODO: We don't support VDEV shrinking. */
564 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
565 
566 	/*
567 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
568 	 */
569 	error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
570 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
571 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
572 	if (error != 0)
573 		return (error);
574 
575 	ASSERT(bvphys->bvp_mos_entries != 0);
576 	VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
577 	    &brtvd->bv_mos_entries_dnode));
578 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
579 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
580 	rw_exit(&brtvd->bv_mos_entries_lock);
581 	brtvd->bv_need_byteswap =
582 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
583 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
584 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
585 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
586 
587 	dmu_buf_rele(db, FTAG);
588 
589 	BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
590 	    (u_longlong_t)brtvd->bv_vdevid,
591 	    (u_longlong_t)brtvd->bv_mos_brtvdev,
592 	    (u_longlong_t)brtvd->bv_mos_entries);
593 	return (0);
594 }
595 
596 static void
brt_vdev_dealloc(brt_vdev_t * brtvd)597 brt_vdev_dealloc(brt_vdev_t *brtvd)
598 {
599 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
600 	ASSERT(brtvd->bv_initiated);
601 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
602 
603 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
604 	brtvd->bv_entcount = NULL;
605 	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
606 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
607 	brtvd->bv_bitmap = NULL;
608 
609 	brtvd->bv_size = 0;
610 
611 	brtvd->bv_initiated = FALSE;
612 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
613 }
614 
615 static void
brt_vdev_destroy(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)616 brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
617 {
618 	char name[64];
619 	uint64_t count;
620 
621 	ASSERT(brtvd->bv_initiated);
622 	ASSERT(brtvd->bv_mos_brtvdev != 0);
623 	ASSERT(brtvd->bv_mos_entries != 0);
624 	ASSERT0(brtvd->bv_totalcount);
625 	ASSERT0(brtvd->bv_usedspace);
626 	ASSERT0(brtvd->bv_savedspace);
627 
628 	uint64_t mos_entries = brtvd->bv_mos_entries;
629 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
630 	brtvd->bv_mos_entries = 0;
631 	rw_exit(&brtvd->bv_mos_entries_lock);
632 	dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
633 	brtvd->bv_mos_entries_dnode = NULL;
634 	ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
635 	ASSERT0(count);
636 	VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
637 	BRT_DEBUG("MOS entries destroyed, object=%llu",
638 	    (u_longlong_t)mos_entries);
639 
640 	VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
641 	    tx));
642 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
643 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
644 	brtvd->bv_mos_brtvdev = 0;
645 	brtvd->bv_entcount_dirty = FALSE;
646 
647 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
648 	    (u_longlong_t)brtvd->bv_vdevid);
649 	VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
650 	    name, tx));
651 	BRT_DEBUG("Pool directory object removed, object=%s", name);
652 
653 	brtvd->bv_meta_dirty = FALSE;
654 
655 	rw_enter(&brtvd->bv_lock, RW_WRITER);
656 	brt_vdev_dealloc(brtvd);
657 	rw_exit(&brtvd->bv_lock);
658 
659 	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
660 }
661 
662 static void
brt_vdevs_expand(spa_t * spa,uint64_t nvdevs)663 brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
664 {
665 	brt_vdev_t **vdevs;
666 
667 	ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
668 	ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
669 
670 	if (nvdevs == spa->spa_brt_nvdevs)
671 		return;
672 
673 	vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
674 	if (spa->spa_brt_nvdevs > 0) {
675 		ASSERT(spa->spa_brt_vdevs != NULL);
676 
677 		memcpy(vdevs, spa->spa_brt_vdevs,
678 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
679 		kmem_free(spa->spa_brt_vdevs,
680 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
681 	}
682 	spa->spa_brt_vdevs = vdevs;
683 
684 	for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
685 		brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
686 		rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
687 		brtvd->bv_vdevid = vdevid;
688 		brtvd->bv_initiated = FALSE;
689 		rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
690 		avl_create(&brtvd->bv_tree, brt_entry_compare,
691 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
692 		for (int i = 0; i < TXG_SIZE; i++) {
693 			avl_create(&brtvd->bv_pending_tree[i],
694 			    brt_entry_compare, sizeof (brt_entry_t),
695 			    offsetof(brt_entry_t, bre_node));
696 		}
697 		mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
698 		spa->spa_brt_vdevs[vdevid] = brtvd;
699 	}
700 
701 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
702 	    (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
703 	spa->spa_brt_nvdevs = nvdevs;
704 }
705 
706 static boolean_t
brt_vdev_lookup(spa_t * spa,brt_vdev_t * brtvd,uint64_t offset)707 brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
708 {
709 	uint64_t idx = offset / spa->spa_brt_rangesize;
710 	if (idx < brtvd->bv_size) {
711 		/* VDEV wasn't expanded. */
712 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
713 	}
714 	return (FALSE);
715 }
716 
717 static void
brt_vdev_addref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize,uint64_t count)718 brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
719     uint64_t dsize, uint64_t count)
720 {
721 	uint64_t idx;
722 
723 	ASSERT(brtvd->bv_initiated);
724 
725 	brtvd->bv_savedspace += dsize * count;
726 	brtvd->bv_meta_dirty = TRUE;
727 
728 	if (bre->bre_count > 0)
729 		return;
730 
731 	brtvd->bv_usedspace += dsize;
732 
733 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
734 	if (idx >= brtvd->bv_size) {
735 		/* VDEV has been expanded. */
736 		rw_enter(&brtvd->bv_lock, RW_WRITER);
737 		brt_vdev_realloc(spa, brtvd);
738 		rw_exit(&brtvd->bv_lock);
739 	}
740 
741 	ASSERT3U(idx, <, brtvd->bv_size);
742 
743 	brtvd->bv_totalcount++;
744 	brt_vdev_entcount_inc(brtvd, idx);
745 	brtvd->bv_entcount_dirty = TRUE;
746 	idx = idx / BRT_BLOCKSIZE / 8;
747 	BT_SET(brtvd->bv_bitmap, idx);
748 }
749 
750 static void
brt_vdev_decref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize)751 brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
752     uint64_t dsize)
753 {
754 	uint64_t idx;
755 
756 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
757 	ASSERT(brtvd->bv_initiated);
758 
759 	brtvd->bv_savedspace -= dsize;
760 	brtvd->bv_meta_dirty = TRUE;
761 
762 	if (bre->bre_count > 0)
763 		return;
764 
765 	brtvd->bv_usedspace -= dsize;
766 
767 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
768 	ASSERT3U(idx, <, brtvd->bv_size);
769 
770 	ASSERT(brtvd->bv_totalcount > 0);
771 	brtvd->bv_totalcount--;
772 	brt_vdev_entcount_dec(brtvd, idx);
773 	brtvd->bv_entcount_dirty = TRUE;
774 	idx = idx / BRT_BLOCKSIZE / 8;
775 	BT_SET(brtvd->bv_bitmap, idx);
776 }
777 
778 static void
brt_vdev_sync(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)779 brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
780 {
781 	dmu_buf_t *db;
782 	brt_vdev_phys_t *bvphys;
783 
784 	ASSERT(brtvd->bv_meta_dirty);
785 	ASSERT(brtvd->bv_mos_brtvdev != 0);
786 	ASSERT(dmu_tx_is_syncing(tx));
787 
788 	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
789 	    FTAG, &db));
790 
791 	if (brtvd->bv_entcount_dirty) {
792 		/*
793 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
794 		 */
795 		dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
796 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
797 		    brtvd->bv_entcount, tx);
798 		uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
799 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
800 		brtvd->bv_entcount_dirty = FALSE;
801 	}
802 
803 	dmu_buf_will_dirty(db, tx);
804 	bvphys = db->db_data;
805 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
806 	bvphys->bvp_size = brtvd->bv_size;
807 	if (brtvd->bv_need_byteswap) {
808 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
809 	} else {
810 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
811 	}
812 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
813 	bvphys->bvp_rangesize = spa->spa_brt_rangesize;
814 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
815 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
816 	dmu_buf_rele(db, FTAG);
817 
818 	brtvd->bv_meta_dirty = FALSE;
819 }
820 
821 static void
brt_vdevs_free(spa_t * spa)822 brt_vdevs_free(spa_t *spa)
823 {
824 	if (spa->spa_brt_vdevs == 0)
825 		return;
826 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
827 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
828 		rw_enter(&brtvd->bv_lock, RW_WRITER);
829 		if (brtvd->bv_initiated)
830 			brt_vdev_dealloc(brtvd);
831 		rw_exit(&brtvd->bv_lock);
832 		rw_destroy(&brtvd->bv_lock);
833 		if (brtvd->bv_mos_entries != 0)
834 			dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
835 		rw_destroy(&brtvd->bv_mos_entries_lock);
836 		avl_destroy(&brtvd->bv_tree);
837 		for (int i = 0; i < TXG_SIZE; i++)
838 			avl_destroy(&brtvd->bv_pending_tree[i]);
839 		mutex_destroy(&brtvd->bv_pending_lock);
840 		kmem_free(brtvd, sizeof (*brtvd));
841 	}
842 	kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
843 	    spa->spa_brt_nvdevs);
844 }
845 
846 static void
brt_entry_fill(const blkptr_t * bp,brt_entry_t * bre,uint64_t * vdevidp)847 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
848 {
849 
850 	bre->bre_bp = *bp;
851 	bre->bre_count = 0;
852 	bre->bre_pcount = 0;
853 
854 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
855 }
856 
857 static int
brt_entry_lookup(brt_vdev_t * brtvd,brt_entry_t * bre)858 brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
859 {
860 	uint64_t off = BRE_OFFSET(bre);
861 
862 	if (brtvd->bv_mos_entries == 0)
863 		return (SET_ERROR(ENOENT));
864 
865 	return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
866 	    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
867 }
868 
869 /*
870  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
871  * positive, but gives us quick answer if we should look into BRT, which
872  * may require reads and thus will be more expensive.
873  */
874 boolean_t
brt_maybe_exists(spa_t * spa,const blkptr_t * bp)875 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
876 {
877 
878 	if (spa->spa_brt_nvdevs == 0)
879 		return (B_FALSE);
880 
881 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
882 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
883 	if (brtvd == NULL || !brtvd->bv_initiated)
884 		return (FALSE);
885 
886 	/*
887 	 * We don't need locks here, since bv_entcount pointer must be
888 	 * stable at this point, and we don't care about false positive
889 	 * races here, while false negative should be impossible, since
890 	 * all brt_vdev_addref() have already completed by this point.
891 	 */
892 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
893 	return (brt_vdev_lookup(spa, brtvd, off));
894 }
895 
896 uint64_t
brt_get_dspace(spa_t * spa)897 brt_get_dspace(spa_t *spa)
898 {
899 	if (spa->spa_brt_nvdevs == 0)
900 		return (0);
901 
902 	brt_rlock(spa);
903 	uint64_t s = 0;
904 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
905 		s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
906 	brt_unlock(spa);
907 	return (s);
908 }
909 
910 uint64_t
brt_get_used(spa_t * spa)911 brt_get_used(spa_t *spa)
912 {
913 	if (spa->spa_brt_nvdevs == 0)
914 		return (0);
915 
916 	brt_rlock(spa);
917 	uint64_t s = 0;
918 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
919 		s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
920 	brt_unlock(spa);
921 	return (s);
922 }
923 
924 uint64_t
brt_get_saved(spa_t * spa)925 brt_get_saved(spa_t *spa)
926 {
927 	return (brt_get_dspace(spa));
928 }
929 
930 uint64_t
brt_get_ratio(spa_t * spa)931 brt_get_ratio(spa_t *spa)
932 {
933 	uint64_t used = brt_get_used(spa);
934 	if (used == 0)
935 		return (100);
936 	return ((used + brt_get_saved(spa)) * 100 / used);
937 }
938 
939 static int
brt_kstats_update(kstat_t * ksp,int rw)940 brt_kstats_update(kstat_t *ksp, int rw)
941 {
942 	brt_stats_t *bs = ksp->ks_data;
943 
944 	if (rw == KSTAT_WRITE)
945 		return (EACCES);
946 
947 	bs->brt_addref_entry_not_on_disk.value.ui64 =
948 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
949 	bs->brt_addref_entry_on_disk.value.ui64 =
950 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
951 	bs->brt_decref_entry_in_memory.value.ui64 =
952 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
953 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
954 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
955 	bs->brt_decref_entry_not_in_memory.value.ui64 =
956 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
957 	bs->brt_decref_entry_read_lost_race.value.ui64 =
958 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
959 	bs->brt_decref_entry_still_referenced.value.ui64 =
960 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
961 	bs->brt_decref_free_data_later.value.ui64 =
962 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
963 	bs->brt_decref_free_data_now.value.ui64 =
964 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
965 	bs->brt_decref_no_entry.value.ui64 =
966 	    wmsum_value(&brt_sums.brt_decref_no_entry);
967 
968 	return (0);
969 }
970 
971 static void
brt_stat_init(void)972 brt_stat_init(void)
973 {
974 
975 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
976 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
977 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
978 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
979 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
980 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
981 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
982 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
983 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
984 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
985 
986 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
987 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
988 	if (brt_ksp != NULL) {
989 		brt_ksp->ks_data = &brt_stats;
990 		brt_ksp->ks_update = brt_kstats_update;
991 		kstat_install(brt_ksp);
992 	}
993 }
994 
995 static void
brt_stat_fini(void)996 brt_stat_fini(void)
997 {
998 	if (brt_ksp != NULL) {
999 		kstat_delete(brt_ksp);
1000 		brt_ksp = NULL;
1001 	}
1002 
1003 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1004 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1005 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1006 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1007 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1008 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1009 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1010 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
1011 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
1012 	wmsum_fini(&brt_sums.brt_decref_no_entry);
1013 }
1014 
1015 void
brt_init(void)1016 brt_init(void)
1017 {
1018 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
1019 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1020 
1021 	brt_stat_init();
1022 }
1023 
1024 void
brt_fini(void)1025 brt_fini(void)
1026 {
1027 	brt_stat_fini();
1028 
1029 	kmem_cache_destroy(brt_entry_cache);
1030 }
1031 
1032 /* Return TRUE if block should be freed immediately. */
1033 boolean_t
brt_entry_decref(spa_t * spa,const blkptr_t * bp)1034 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1035 {
1036 	brt_entry_t *bre, *racebre;
1037 	brt_entry_t bre_search;
1038 	avl_index_t where;
1039 	uint64_t vdevid;
1040 	int error;
1041 
1042 	brt_entry_fill(bp, &bre_search, &vdevid);
1043 
1044 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1045 	ASSERT(brtvd != NULL);
1046 
1047 	rw_enter(&brtvd->bv_lock, RW_WRITER);
1048 	ASSERT(brtvd->bv_initiated);
1049 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1050 	if (bre != NULL) {
1051 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
1052 		goto out;
1053 	} else {
1054 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1055 	}
1056 	rw_exit(&brtvd->bv_lock);
1057 
1058 	error = brt_entry_lookup(brtvd, &bre_search);
1059 	/* bre_search now contains correct bre_count */
1060 	if (error == ENOENT) {
1061 		BRTSTAT_BUMP(brt_decref_no_entry);
1062 		return (B_TRUE);
1063 	}
1064 	ASSERT0(error);
1065 
1066 	rw_enter(&brtvd->bv_lock, RW_WRITER);
1067 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1068 	if (racebre != NULL) {
1069 		/* The entry was added when the lock was dropped. */
1070 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1071 		bre = racebre;
1072 		goto out;
1073 	}
1074 
1075 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1076 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1077 	bre->bre_bp = bre_search.bre_bp;
1078 	bre->bre_count = bre_search.bre_count;
1079 	bre->bre_pcount = 0;
1080 	avl_insert(&brtvd->bv_tree, bre, where);
1081 
1082 out:
1083 	if (bre->bre_count == 0) {
1084 		rw_exit(&brtvd->bv_lock);
1085 		BRTSTAT_BUMP(brt_decref_free_data_now);
1086 		return (B_TRUE);
1087 	}
1088 
1089 	bre->bre_pcount--;
1090 	ASSERT(bre->bre_count > 0);
1091 	bre->bre_count--;
1092 	if (bre->bre_count == 0)
1093 		BRTSTAT_BUMP(brt_decref_free_data_later);
1094 	else
1095 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1096 	brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
1097 
1098 	rw_exit(&brtvd->bv_lock);
1099 
1100 	return (B_FALSE);
1101 }
1102 
1103 uint64_t
brt_entry_get_refcount(spa_t * spa,const blkptr_t * bp)1104 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1105 {
1106 	brt_entry_t bre_search, *bre;
1107 	uint64_t vdevid, refcnt;
1108 	int error;
1109 
1110 	brt_entry_fill(bp, &bre_search, &vdevid);
1111 
1112 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1113 	ASSERT(brtvd != NULL);
1114 
1115 	rw_enter(&brtvd->bv_lock, RW_READER);
1116 	ASSERT(brtvd->bv_initiated);
1117 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1118 	if (bre == NULL) {
1119 		rw_exit(&brtvd->bv_lock);
1120 		error = brt_entry_lookup(brtvd, &bre_search);
1121 		if (error == ENOENT) {
1122 			refcnt = 0;
1123 		} else {
1124 			ASSERT0(error);
1125 			refcnt = bre_search.bre_count;
1126 		}
1127 	} else {
1128 		refcnt = bre->bre_count;
1129 		rw_exit(&brtvd->bv_lock);
1130 	}
1131 
1132 	return (refcnt);
1133 }
1134 
1135 static void
brt_prefetch(brt_vdev_t * brtvd,const blkptr_t * bp)1136 brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
1137 {
1138 	if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
1139 		return;
1140 
1141 	uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1142 	rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1143 	if (brtvd->bv_mos_entries != 0) {
1144 		(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1145 		    &off, BRT_KEY_WORDS);
1146 	}
1147 	rw_exit(&brtvd->bv_mos_entries_lock);
1148 }
1149 
1150 static int
brt_entry_compare(const void * x1,const void * x2)1151 brt_entry_compare(const void *x1, const void *x2)
1152 {
1153 	const brt_entry_t *bre1 = x1, *bre2 = x2;
1154 	const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
1155 
1156 	return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1157 	    DVA_GET_OFFSET(&bp2->blk_dva[0])));
1158 }
1159 
1160 void
brt_pending_add(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1161 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1162 {
1163 	brt_entry_t *bre, *newbre;
1164 	avl_index_t where;
1165 	uint64_t txg;
1166 
1167 	txg = dmu_tx_get_txg(tx);
1168 	ASSERT3U(txg, !=, 0);
1169 
1170 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1171 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1172 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1173 
1174 	newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1175 	newbre->bre_bp = *bp;
1176 	newbre->bre_count = 0;
1177 	newbre->bre_pcount = 1;
1178 
1179 	mutex_enter(&brtvd->bv_pending_lock);
1180 	bre = avl_find(pending_tree, newbre, &where);
1181 	if (bre == NULL) {
1182 		avl_insert(pending_tree, newbre, where);
1183 		newbre = NULL;
1184 	} else {
1185 		bre->bre_pcount++;
1186 	}
1187 	mutex_exit(&brtvd->bv_pending_lock);
1188 
1189 	if (newbre != NULL) {
1190 		ASSERT(bre != NULL);
1191 		ASSERT(bre != newbre);
1192 		kmem_cache_free(brt_entry_cache, newbre);
1193 	} else {
1194 		ASSERT0P(bre);
1195 
1196 		/* Prefetch BRT entry for the syncing context. */
1197 		brt_prefetch(brtvd, bp);
1198 	}
1199 }
1200 
1201 void
brt_pending_remove(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1202 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1203 {
1204 	brt_entry_t *bre, bre_search;
1205 	uint64_t txg;
1206 
1207 	txg = dmu_tx_get_txg(tx);
1208 	ASSERT3U(txg, !=, 0);
1209 
1210 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1211 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1212 	ASSERT(brtvd != NULL);
1213 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1214 
1215 	bre_search.bre_bp = *bp;
1216 
1217 	mutex_enter(&brtvd->bv_pending_lock);
1218 	bre = avl_find(pending_tree, &bre_search, NULL);
1219 	ASSERT(bre != NULL);
1220 	ASSERT(bre->bre_pcount > 0);
1221 	bre->bre_pcount--;
1222 	if (bre->bre_pcount == 0)
1223 		avl_remove(pending_tree, bre);
1224 	else
1225 		bre = NULL;
1226 	mutex_exit(&brtvd->bv_pending_lock);
1227 
1228 	if (bre)
1229 		kmem_cache_free(brt_entry_cache, bre);
1230 }
1231 
1232 static void
brt_pending_apply_vdev(spa_t * spa,brt_vdev_t * brtvd,uint64_t txg)1233 brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1234 {
1235 	brt_entry_t *bre, *nbre;
1236 
1237 	/*
1238 	 * We are in syncing context, so no other bv_pending_tree accesses
1239 	 * are possible for the TXG.  So we don't need bv_pending_lock.
1240 	 */
1241 	ASSERT(avl_is_empty(&brtvd->bv_tree));
1242 	avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1243 
1244 	for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1245 		nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1246 
1247 		/*
1248 		 * If the block has DEDUP bit set, it means that it
1249 		 * already exists in the DEDUP table, so we can just
1250 		 * use that instead of creating new entry in the BRT.
1251 		 */
1252 		if (BP_GET_DEDUP(&bre->bre_bp)) {
1253 			while (bre->bre_pcount > 0) {
1254 				if (!ddt_addref(spa, &bre->bre_bp))
1255 					break;
1256 				bre->bre_pcount--;
1257 			}
1258 			if (bre->bre_pcount == 0) {
1259 				avl_remove(&brtvd->bv_tree, bre);
1260 				kmem_cache_free(brt_entry_cache, bre);
1261 				continue;
1262 			}
1263 		}
1264 
1265 		/*
1266 		 * Unless we know that the block is definitely not in ZAP,
1267 		 * try to get its reference count from there.
1268 		 */
1269 		uint64_t off = BRE_OFFSET(bre);
1270 		if (brtvd->bv_mos_entries != 0 &&
1271 		    brt_vdev_lookup(spa, brtvd, off)) {
1272 			int error = zap_lookup_uint64_by_dnode(
1273 			    brtvd->bv_mos_entries_dnode, &off,
1274 			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1275 			    &bre->bre_count);
1276 			if (error == 0) {
1277 				BRTSTAT_BUMP(brt_addref_entry_on_disk);
1278 			} else {
1279 				ASSERT3U(error, ==, ENOENT);
1280 				BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1281 			}
1282 		}
1283 	}
1284 
1285 	/*
1286 	 * If all the cloned blocks we had were handled by DDT, we don't need
1287 	 * to initiate the vdev.
1288 	 */
1289 	if (avl_is_empty(&brtvd->bv_tree))
1290 		return;
1291 
1292 	if (!brtvd->bv_initiated) {
1293 		rw_enter(&brtvd->bv_lock, RW_WRITER);
1294 		brt_vdev_realloc(spa, brtvd);
1295 		rw_exit(&brtvd->bv_lock);
1296 	}
1297 
1298 	/*
1299 	 * Convert pending references into proper ones.  This has to be a
1300 	 * separate loop, since entcount modifications would cause false
1301 	 * positives for brt_vdev_lookup() on following iterations.
1302 	 */
1303 	for (bre = avl_first(&brtvd->bv_tree); bre;
1304 	    bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1305 		brt_vdev_addref(spa, brtvd, bre,
1306 		    bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1307 		bre->bre_count += bre->bre_pcount;
1308 	}
1309 }
1310 
1311 void
brt_pending_apply(spa_t * spa,uint64_t txg)1312 brt_pending_apply(spa_t *spa, uint64_t txg)
1313 {
1314 
1315 	brt_rlock(spa);
1316 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1317 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1318 		brt_unlock(spa);
1319 
1320 		brt_pending_apply_vdev(spa, brtvd, txg);
1321 
1322 		brt_rlock(spa);
1323 	}
1324 	brt_unlock(spa);
1325 }
1326 
1327 static void
brt_sync_entry(dnode_t * dn,brt_entry_t * bre,dmu_tx_t * tx)1328 brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1329 {
1330 	uint64_t off = BRE_OFFSET(bre);
1331 
1332 	if (bre->bre_pcount == 0) {
1333 		/* The net change is zero, nothing to do in ZAP. */
1334 	} else if (bre->bre_count == 0) {
1335 		int error = zap_remove_uint64_by_dnode(dn, &off,
1336 		    BRT_KEY_WORDS, tx);
1337 		VERIFY(error == 0 || error == ENOENT);
1338 	} else {
1339 		VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1340 		    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1341 		    &bre->bre_count, tx));
1342 	}
1343 }
1344 
1345 static void
brt_sync_table(spa_t * spa,dmu_tx_t * tx)1346 brt_sync_table(spa_t *spa, dmu_tx_t *tx)
1347 {
1348 	brt_entry_t *bre;
1349 
1350 	brt_rlock(spa);
1351 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1352 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1353 		brt_unlock(spa);
1354 
1355 		if (!brtvd->bv_meta_dirty) {
1356 			ASSERT(!brtvd->bv_entcount_dirty);
1357 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
1358 			brt_rlock(spa);
1359 			continue;
1360 		}
1361 
1362 		ASSERT(!brtvd->bv_entcount_dirty ||
1363 		    avl_numnodes(&brtvd->bv_tree) != 0);
1364 
1365 		if (brtvd->bv_mos_brtvdev == 0)
1366 			brt_vdev_create(spa, brtvd, tx);
1367 
1368 		void *c = NULL;
1369 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1370 			brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
1371 			kmem_cache_free(brt_entry_cache, bre);
1372 		}
1373 
1374 #ifdef ZFS_DEBUG
1375 		if (zfs_flags & ZFS_DEBUG_BRT)
1376 			brt_vdev_dump(brtvd);
1377 #endif
1378 		if (brtvd->bv_totalcount == 0)
1379 			brt_vdev_destroy(spa, brtvd, tx);
1380 		else
1381 			brt_vdev_sync(spa, brtvd, tx);
1382 		brt_rlock(spa);
1383 	}
1384 	brt_unlock(spa);
1385 }
1386 
1387 void
brt_sync(spa_t * spa,uint64_t txg)1388 brt_sync(spa_t *spa, uint64_t txg)
1389 {
1390 	dmu_tx_t *tx;
1391 	uint64_t vdevid;
1392 
1393 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
1394 
1395 	brt_rlock(spa);
1396 	for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1397 		if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1398 			break;
1399 	}
1400 	if (vdevid >= spa->spa_brt_nvdevs) {
1401 		brt_unlock(spa);
1402 		return;
1403 	}
1404 	brt_unlock(spa);
1405 
1406 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1407 	brt_sync_table(spa, tx);
1408 	dmu_tx_commit(tx);
1409 }
1410 
1411 static void
brt_alloc(spa_t * spa)1412 brt_alloc(spa_t *spa)
1413 {
1414 	rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1415 	spa->spa_brt_vdevs = NULL;
1416 	spa->spa_brt_nvdevs = 0;
1417 	spa->spa_brt_rangesize = 0;
1418 }
1419 
1420 void
brt_create(spa_t * spa)1421 brt_create(spa_t *spa)
1422 {
1423 	brt_alloc(spa);
1424 	spa->spa_brt_rangesize = BRT_RANGESIZE;
1425 }
1426 
1427 int
brt_load(spa_t * spa)1428 brt_load(spa_t *spa)
1429 {
1430 	int error = 0;
1431 
1432 	brt_alloc(spa);
1433 	brt_wlock(spa);
1434 	for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1435 	    vdevid++) {
1436 		char name[64];
1437 		uint64_t mos_brtvdev;
1438 
1439 		/* Look if this vdev had active block cloning. */
1440 		snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1441 		    (u_longlong_t)vdevid);
1442 		error = zap_lookup(spa->spa_meta_objset,
1443 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1444 		    &mos_brtvdev);
1445 		if (error == ENOENT) {
1446 			error = 0;
1447 			continue;
1448 		}
1449 		if (error != 0)
1450 			break;
1451 
1452 		/* If it did, then allocate them all and load this one. */
1453 		brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1454 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1455 		rw_enter(&brtvd->bv_lock, RW_WRITER);
1456 		brtvd->bv_mos_brtvdev = mos_brtvdev;
1457 		error = brt_vdev_load(spa, brtvd);
1458 		rw_exit(&brtvd->bv_lock);
1459 		if (error != 0)
1460 			break;
1461 	}
1462 
1463 	if (spa->spa_brt_rangesize == 0)
1464 		spa->spa_brt_rangesize = BRT_RANGESIZE;
1465 	brt_unlock(spa);
1466 	return (error);
1467 }
1468 
1469 void
brt_unload(spa_t * spa)1470 brt_unload(spa_t *spa)
1471 {
1472 	if (spa->spa_brt_rangesize == 0)
1473 		return;
1474 	brt_vdevs_free(spa);
1475 	rw_destroy(&spa->spa_brt_lock);
1476 	spa->spa_brt_rangesize = 0;
1477 }
1478 
1479 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1480 	"Enable prefetching of BRT ZAP entries");
1481 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1482 	"BRT ZAP leaf blockshift");
1483 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1484 	"BRT ZAP indirect blockshift");
1485