1*61145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
22a58b312SMartin Matuska /*
32a58b312SMartin Matuska * CDDL HEADER START
42a58b312SMartin Matuska *
52a58b312SMartin Matuska * The contents of this file are subject to the terms of the
62a58b312SMartin Matuska * Common Development and Distribution License (the "License").
72a58b312SMartin Matuska * You may not use this file except in compliance with the License.
82a58b312SMartin Matuska *
92a58b312SMartin Matuska * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
102a58b312SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
112a58b312SMartin Matuska * See the License for the specific language governing permissions
122a58b312SMartin Matuska * and limitations under the License.
132a58b312SMartin Matuska *
142a58b312SMartin Matuska * When distributing Covered Code, include this CDDL HEADER in each
152a58b312SMartin Matuska * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
162a58b312SMartin Matuska * If applicable, add the following below this CDDL HEADER, with the
172a58b312SMartin Matuska * fields enclosed by brackets "[]" replaced with your own identifying
182a58b312SMartin Matuska * information: Portions Copyright [yyyy] [name of copyright owner]
192a58b312SMartin Matuska *
202a58b312SMartin Matuska * CDDL HEADER END
212a58b312SMartin Matuska */
222a58b312SMartin Matuska
232a58b312SMartin Matuska /*
242a58b312SMartin Matuska * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
252a58b312SMartin Matuska */
262a58b312SMartin Matuska
272a58b312SMartin Matuska #include <sys/zfs_context.h>
282a58b312SMartin Matuska #include <sys/spa.h>
292a58b312SMartin Matuska #include <sys/spa_impl.h>
302a58b312SMartin Matuska #include <sys/zio.h>
312a58b312SMartin Matuska #include <sys/brt.h>
322276e539SMartin Matuska #include <sys/brt_impl.h>
332a58b312SMartin Matuska #include <sys/ddt.h>
342a58b312SMartin Matuska #include <sys/bitmap.h>
352a58b312SMartin Matuska #include <sys/zap.h>
362a58b312SMartin Matuska #include <sys/dmu_tx.h>
372a58b312SMartin Matuska #include <sys/arc.h>
382a58b312SMartin Matuska #include <sys/dsl_pool.h>
392a58b312SMartin Matuska #include <sys/dsl_scan.h>
402a58b312SMartin Matuska #include <sys/vdev_impl.h>
412a58b312SMartin Matuska #include <sys/kstat.h>
422a58b312SMartin Matuska #include <sys/wmsum.h>
432a58b312SMartin Matuska
442a58b312SMartin Matuska /*
452a58b312SMartin Matuska * Block Cloning design.
462a58b312SMartin Matuska *
 * Block Cloning allows manually cloning a file (or a subset of its blocks)
482a58b312SMartin Matuska * into another (or the same) file by just creating additional references to
492a58b312SMartin Matuska * the data blocks without copying the data itself. Those references are kept
502a58b312SMartin Matuska * in the Block Reference Tables (BRTs).
512a58b312SMartin Matuska *
522a58b312SMartin Matuska * In many ways this is similar to the existing deduplication, but there are
532a58b312SMartin Matuska * some important differences:
542a58b312SMartin Matuska *
552a58b312SMartin Matuska * - Deduplication is automatic and Block Cloning is not - one has to use a
562a58b312SMartin Matuska * dedicated system call(s) to clone the given file/blocks.
572a58b312SMartin Matuska * - Deduplication keeps all data blocks in its table, even those referenced
582a58b312SMartin Matuska * just once. Block Cloning creates an entry in its tables only when there
592a58b312SMartin Matuska * are at least two references to the given data block. If the block was
602a58b312SMartin Matuska * never explicitly cloned or the second to last reference was dropped,
612a58b312SMartin Matuska * there will be neither space nor performance overhead.
622a58b312SMartin Matuska * - Deduplication needs data to work - one needs to pass real data to the
632a58b312SMartin Matuska * write(2) syscall, so hash can be calculated. Block Cloning doesn't require
642a58b312SMartin Matuska * data, just block pointers to the data, so it is extremely fast, as we pay
652a58b312SMartin Matuska * neither the cost of reading the data, nor the cost of writing the data -
662a58b312SMartin Matuska * we operate exclusively on metadata.
672a58b312SMartin Matuska * - If the D (dedup) bit is not set in the block pointer, it means that
682a58b312SMartin Matuska * the block is not in the dedup table (DDT) and we won't consult the DDT
692a58b312SMartin Matuska * when we need to free the block. Block Cloning must be consulted on every
702a58b312SMartin Matuska * free, because we cannot modify the source BP (eg. by setting something
712a58b312SMartin Matuska * similar to the D bit), thus we have no hint if the block is in the
722a58b312SMartin Matuska * Block Reference Table (BRT), so we need to look into the BRT. There is
732a58b312SMartin Matuska * an optimization in place that allows us to eliminate the majority of BRT
742a58b312SMartin Matuska * lookups which is described below in the "Minimizing free penalty" section.
752a58b312SMartin Matuska * - The BRT entry is much smaller than the DDT entry - for BRT we only store
762a58b312SMartin Matuska * 64bit offset and 64bit reference counter.
772a58b312SMartin Matuska * - Dedup keys are cryptographic hashes, so two blocks that are close to each
782a58b312SMartin Matuska * other on disk are most likely in totally different parts of the DDT.
792a58b312SMartin Matuska * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
802a58b312SMartin Matuska * from one file should have BRT entries close to each other.
812a58b312SMartin Matuska * - Scrub will only do a single pass over a block that is referenced multiple
822a58b312SMartin Matuska * times in the DDT. Unfortunately it is not currently (if at all) possible
 * with Block Cloning and a block referenced multiple times will be scrubbed
842a58b312SMartin Matuska * multiple times. The new, sorted scrub should be able to eliminate
852a58b312SMartin Matuska * duplicated reads given enough memory.
862a58b312SMartin Matuska * - Deduplication requires cryptographically strong hash as a checksum or
872a58b312SMartin Matuska * additional data verification. Block Cloning works with any checksum
882a58b312SMartin Matuska * algorithm or even with checksumming disabled.
892a58b312SMartin Matuska *
902a58b312SMartin Matuska * As mentioned above, the BRT entries are much smaller than the DDT entries.
912a58b312SMartin Matuska * To uniquely identify a block we just need its vdev id and offset. We also
922a58b312SMartin Matuska * need to maintain a reference counter. The vdev id will often repeat, as there
932a58b312SMartin Matuska * is a small number of top-level VDEVs and a large number of blocks stored in
942a58b312SMartin Matuska * each VDEV. We take advantage of that to reduce the BRT entry size further by
952a58b312SMartin Matuska * maintaining one BRT for each top-level VDEV, so we can then have only offset
962a58b312SMartin Matuska * and counter as the BRT entry.
972a58b312SMartin Matuska *
982a58b312SMartin Matuska * Minimizing free penalty.
992a58b312SMartin Matuska *
1002a58b312SMartin Matuska * Block Cloning allows creating additional references to any existing block.
1012a58b312SMartin Matuska * When we free a block there is no hint in the block pointer whether the block
1022a58b312SMartin Matuska * was cloned or not, so on each free we have to check if there is a
1032a58b312SMartin Matuska * corresponding entry in the BRT or not. If there is, we need to decrease
1042a58b312SMartin Matuska * the reference counter. Doing BRT lookup on every free can potentially be
1052a58b312SMartin Matuska * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
1062a58b312SMartin Matuska * This is the main problem with deduplication, so we've learned our lesson and
1072a58b312SMartin Matuska * try not to repeat the same mistake here. How do we do that? We divide each
1082a58b312SMartin Matuska * top-level VDEV into 16MB regions. For each region we maintain a counter that
1092a58b312SMartin Matuska * is a sum of all the BRT entries that have offsets within the region. This
1102a58b312SMartin Matuska * creates the entries count array of 16bit numbers for each top-level VDEV.
1112a58b312SMartin Matuska * The entries count array is always kept in memory and updated on disk in the
1122a58b312SMartin Matuska * same transaction group as the BRT updates to keep everything in-sync. We can
1132a58b312SMartin Matuska * keep the array in memory, because it is very small. With 16MB regions and
1142a58b312SMartin Matuska * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
1152a58b312SMartin Matuska * the region size even further in the future). Now, when we want to free
1162a58b312SMartin Matuska * a block, we first consult the array. If the counter for the whole region is
1172a58b312SMartin Matuska * zero, there is no need to look for the BRT entry, as there isn't one for
1182a58b312SMartin Matuska * sure. If the counter for the region is greater than zero, only then we will
1192a58b312SMartin Matuska * do a BRT lookup and if an entry is found we will decrease the reference
1202a58b312SMartin Matuska * counter in the BRT entry and in the entry counters array.
1212a58b312SMartin Matuska *
1222a58b312SMartin Matuska * The entry counters array is small, but can potentially be larger for very
1232a58b312SMartin Matuska * large VDEVs or smaller regions. In this case we don't want to rewrite entire
 * array on every change. We then divide the array into 32kB blocks and keep
1252a58b312SMartin Matuska * a bitmap of dirty blocks within a transaction group. When we sync the
1262a58b312SMartin Matuska * transaction group we can only update the parts of the entry counters array
1272a58b312SMartin Matuska * that were modified. Note: Keeping track of the dirty parts of the entry
1282a58b312SMartin Matuska * counters array is implemented, but updating only parts of the array on disk
1292a58b312SMartin Matuska * is not yet implemented - for now we will update entire array if there was
1302a58b312SMartin Matuska * any change.
1312a58b312SMartin Matuska *
1322a58b312SMartin Matuska * The implementation tries to be economic: if BRT is not used, or no longer
1332a58b312SMartin Matuska * used, there will be no entries in the MOS and no additional memory used (eg.
1342a58b312SMartin Matuska * the entry counters array is only allocated if needed).
1352a58b312SMartin Matuska *
1362a58b312SMartin Matuska * Interaction between Deduplication and Block Cloning.
1372a58b312SMartin Matuska *
1382a58b312SMartin Matuska * If both functionalities are in use, we could end up with a block that is
1392a58b312SMartin Matuska * referenced multiple times in both DDT and BRT. When we free one of the
1402a58b312SMartin Matuska * references we couldn't tell where it belongs, so we would have to decide
1412a58b312SMartin Matuska * what table takes the precedence: do we first clear DDT references or BRT
1422a58b312SMartin Matuska * references? To avoid this dilemma BRT cooperates with DDT - if a given block
1432a58b312SMartin Matuska * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
1442a58b312SMartin Matuska * lookup DDT entry instead and increase the counter there. No BRT entry
1452a58b312SMartin Matuska * will be created for a block which has the D (dedup) bit set.
1462a58b312SMartin Matuska * BRT may be more efficient for manual deduplication, but if the block is
1472a58b312SMartin Matuska * already in the DDT, then creating additional BRT entry would be less
1482a58b312SMartin Matuska * efficient. This clever idea was proposed by Allan Jude.
1492a58b312SMartin Matuska *
1502a58b312SMartin Matuska * Block Cloning across datasets.
1512a58b312SMartin Matuska *
1522a58b312SMartin Matuska * Block Cloning is not limited to cloning blocks within the same dataset.
1532a58b312SMartin Matuska * It is possible (and very useful) to clone blocks between different datasets.
1542a58b312SMartin Matuska * One use case is recovering files from snapshots. By cloning the files into
1552a58b312SMartin Matuska * dataset we need no additional storage. Without Block Cloning we would need
1562a58b312SMartin Matuska * additional space for those files.
1572a58b312SMartin Matuska * Another interesting use case is moving the files between datasets
1582a58b312SMartin Matuska * (copying the file content to the new dataset and removing the source file).
1592a58b312SMartin Matuska * In that case Block Cloning will only be used briefly, because the BRT entries
1602a58b312SMartin Matuska * will be removed when the source is removed.
1613494f7c0SMartin Matuska * Block Cloning across encrypted datasets is supported as long as both
 * datasets share the same master key (e.g. snapshots and clones).
1632a58b312SMartin Matuska *
1642a58b312SMartin Matuska * Block Cloning flow through ZFS layers.
1652a58b312SMartin Matuska *
1662a58b312SMartin Matuska * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
1672a58b312SMartin Matuska * blocks. As of this writing no interface is implemented that allows for block
1682a58b312SMartin Matuska * cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call and we will use
 * it for block cloning.
1712a58b312SMartin Matuska *
1722a58b312SMartin Matuska * ssize_t
1732a58b312SMartin Matuska * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
1742a58b312SMartin Matuska * size_t len, unsigned int flags);
1752a58b312SMartin Matuska *
1762a58b312SMartin Matuska * Even though offsets and length represent bytes, they have to be
177315ee00fSMartin Matuska * block-aligned or we will return an error so the upper layer can
 * fall back to the generic mechanism that will just copy the data.
1792a58b312SMartin Matuska * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
1802a58b312SMartin Matuska * This function was implemented based on zfs_write(), but instead of writing
1812a58b312SMartin Matuska * the given data we first read block pointers using the new dmu_read_l0_bps()
1822a58b312SMartin Matuska * function from the source file. Once we have BPs from the source file we call
1832a58b312SMartin Matuska * the dmu_brt_clone() function on the destination file. This function
1842a58b312SMartin Matuska * allocates BPs for us. We iterate over all source BPs. If the given BP is
1852a58b312SMartin Matuska * a hole or an embedded block, we just copy BP as-is. If it points to a real
1862a58b312SMartin Matuska * data we place this BP on a BRT pending list using the brt_pending_add()
1872a58b312SMartin Matuska * function.
1882a58b312SMartin Matuska *
1892a58b312SMartin Matuska * We use this pending list to keep track of all BPs that got new references
1902a58b312SMartin Matuska * within this transaction group.
1912a58b312SMartin Matuska *
1922a58b312SMartin Matuska * Some special cases to consider and how we address them:
1932a58b312SMartin Matuska * - The block we want to clone may have been created within the same
1942a58b312SMartin Matuska * transaction group that we are trying to clone. Such block has no BP
195315ee00fSMartin Matuska * allocated yet, so cannot be immediately cloned. We return EAGAIN.
1962a58b312SMartin Matuska * - The block we want to clone may have been modified within the same
197315ee00fSMartin Matuska * transaction group. We return EAGAIN.
1982a58b312SMartin Matuska * - A block may be cloned multiple times during one transaction group (that's
1992a58b312SMartin Matuska * why pending list is actually a tree and not an append-only list - this
2002a58b312SMartin Matuska * way we can figure out faster if this block is cloned for the first time
2012a58b312SMartin Matuska * in this txg or consecutive time).
2022a58b312SMartin Matuska * - A block may be cloned and freed within the same transaction group
2032a58b312SMartin Matuska * (see dbuf_undirty()).
2042a58b312SMartin Matuska * - A block may be cloned and within the same transaction group the clone
2052a58b312SMartin Matuska * can be cloned again (see dmu_read_l0_bps()).
2062a58b312SMartin Matuska * - A file might have been deleted, but the caller still has a file descriptor
2072a58b312SMartin Matuska * open to this file and clones it.
2082a58b312SMartin Matuska *
2092a58b312SMartin Matuska * When we free a block we have an additional step in the ZIO pipeline where we
2102a58b312SMartin Matuska * call the zio_brt_free() function. We then call the brt_entry_decref()
2112a58b312SMartin Matuska * that loads the corresponding BRT entry (if one exists) and decreases
2122a58b312SMartin Matuska * reference counter. If this is not the last reference we will stop ZIO
2132a58b312SMartin Matuska * pipeline here. If this is the last reference or the block is not in the
2142a58b312SMartin Matuska * BRT, we continue the pipeline and free the block as usual.
2152a58b312SMartin Matuska *
2162a58b312SMartin Matuska * At the beginning of spa_sync() where there can be no more block cloning,
2172a58b312SMartin Matuska * but before issuing frees we call brt_pending_apply(). This function applies
2182a58b312SMartin Matuska * all the new clones to the BRT table - we load BRT entries and update
2192a58b312SMartin Matuska * reference counters. To sync new BRT entries to disk, we use brt_sync()
2202a58b312SMartin Matuska * function. This function will sync all dirty per-top-level-vdev BRTs,
2212a58b312SMartin Matuska * the entry counters arrays, etc.
2222a58b312SMartin Matuska *
2232a58b312SMartin Matuska * Block Cloning and ZIL.
2242a58b312SMartin Matuska *
2252a58b312SMartin Matuska * Every clone operation is divided into chunks (similar to write) and each
2262a58b312SMartin Matuska * chunk is cloned in a separate transaction. The chunk size is determined by
2272a58b312SMartin Matuska * how many BPs we can fit into a single ZIL entry.
2282a58b312SMartin Matuska * Replaying clone operation is different from the regular clone operation,
2292a58b312SMartin Matuska * as when we log clone operations we cannot use the source object - it may
2302a58b312SMartin Matuska * reside on a different dataset, so we log BPs we want to clone.
2312a58b312SMartin Matuska * The ZIL is replayed when we mount the given dataset, not when the pool is
2322a58b312SMartin Matuska * imported. Taking this into account it is possible that the pool is imported
2332a58b312SMartin Matuska * without mounting datasets and the source dataset is destroyed before the
2342a58b312SMartin Matuska * destination dataset is mounted and its ZIL replayed.
2352a58b312SMartin Matuska * To address this situation we leverage zil_claim() mechanism where ZFS will
2362a58b312SMartin Matuska * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
237525fe93dSMartin Matuska * entries, we will bump reference counters for their BPs in the BRT. Then
238525fe93dSMartin Matuska * on mount and ZIL replay we bump the reference counters once more, while the
239525fe93dSMartin Matuska * first references are dropped during ZIL destroy by zil_free_clone_range().
240525fe93dSMartin Matuska * It is possible that after zil_claim() we never mount the destination, so
241525fe93dSMartin Matuska * we never replay its ZIL and just destroy it. In this case the only taken
242525fe93dSMartin Matuska * references will be dropped by zil_free_clone_range(), since the cloning is
243525fe93dSMartin Matuska * not going to ever take place.
2442a58b312SMartin Matuska */
2452a58b312SMartin Matuska
2462a58b312SMartin Matuska static kmem_cache_t *brt_entry_cache;
2472a58b312SMartin Matuska
2482a58b312SMartin Matuska /*
2492a58b312SMartin Matuska * Enable/disable prefetching of BRT entries that we are going to modify.
2502a58b312SMartin Matuska */
251783d3ff6SMartin Matuska static int brt_zap_prefetch = 1;
2522a58b312SMartin Matuska
/*
 * BRT_DEBUG(fmt, ...) - emit a BRT debug message.
 *
 * Without ZFS_DEBUG the macro expands to an empty statement and its
 * arguments are not evaluated.  With ZFS_DEBUG the arguments are handed to
 * __dprintf() together with the current file, function and line, but only
 * while the ZFS_DEBUG_BRT bit is set in zfs_flags.
 */
#ifndef ZFS_DEBUG
#define	BRT_DEBUG(...)	do { } while (0)
#else
#define	BRT_DEBUG(...)	do { \
	if (zfs_flags & ZFS_DEBUG_BRT) { \
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	} \
} while (0)
#endif
2622a58b312SMartin Matuska
263783d3ff6SMartin Matuska static int brt_zap_default_bs = 12;
264783d3ff6SMartin Matuska static int brt_zap_default_ibs = 12;
2652a58b312SMartin Matuska
2662a58b312SMartin Matuska static kstat_t *brt_ksp;
2672a58b312SMartin Matuska
2682a58b312SMartin Matuska typedef struct brt_stats {
2692a58b312SMartin Matuska kstat_named_t brt_addref_entry_not_on_disk;
2702a58b312SMartin Matuska kstat_named_t brt_addref_entry_on_disk;
2712a58b312SMartin Matuska kstat_named_t brt_decref_entry_in_memory;
2722a58b312SMartin Matuska kstat_named_t brt_decref_entry_loaded_from_disk;
2732a58b312SMartin Matuska kstat_named_t brt_decref_entry_not_in_memory;
2742a58b312SMartin Matuska kstat_named_t brt_decref_entry_read_lost_race;
2752a58b312SMartin Matuska kstat_named_t brt_decref_entry_still_referenced;
2762a58b312SMartin Matuska kstat_named_t brt_decref_free_data_later;
2772a58b312SMartin Matuska kstat_named_t brt_decref_free_data_now;
2782a58b312SMartin Matuska kstat_named_t brt_decref_no_entry;
2792a58b312SMartin Matuska } brt_stats_t;
2802a58b312SMartin Matuska
2812a58b312SMartin Matuska static brt_stats_t brt_stats = {
2822a58b312SMartin Matuska { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
2832a58b312SMartin Matuska { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
2842a58b312SMartin Matuska { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
2852a58b312SMartin Matuska { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
2862a58b312SMartin Matuska { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
2872a58b312SMartin Matuska { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
2882a58b312SMartin Matuska { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
2892a58b312SMartin Matuska { "decref_free_data_later", KSTAT_DATA_UINT64 },
2902a58b312SMartin Matuska { "decref_free_data_now", KSTAT_DATA_UINT64 },
2912a58b312SMartin Matuska { "decref_no_entry", KSTAT_DATA_UINT64 }
2922a58b312SMartin Matuska };
2932a58b312SMartin Matuska
2942a58b312SMartin Matuska struct {
2952a58b312SMartin Matuska wmsum_t brt_addref_entry_not_on_disk;
2962a58b312SMartin Matuska wmsum_t brt_addref_entry_on_disk;
2972a58b312SMartin Matuska wmsum_t brt_decref_entry_in_memory;
2982a58b312SMartin Matuska wmsum_t brt_decref_entry_loaded_from_disk;
2992a58b312SMartin Matuska wmsum_t brt_decref_entry_not_in_memory;
3002a58b312SMartin Matuska wmsum_t brt_decref_entry_read_lost_race;
3012a58b312SMartin Matuska wmsum_t brt_decref_entry_still_referenced;
3022a58b312SMartin Matuska wmsum_t brt_decref_free_data_later;
3032a58b312SMartin Matuska wmsum_t brt_decref_free_data_now;
3042a58b312SMartin Matuska wmsum_t brt_decref_no_entry;
3052a58b312SMartin Matuska } brt_sums;
3062a58b312SMartin Matuska
3072a58b312SMartin Matuska #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
3082a58b312SMartin Matuska
3092a58b312SMartin Matuska static int brt_entry_compare(const void *x1, const void *x2);
310718519f4SMartin Matuska static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
3112a58b312SMartin Matuska
3122a58b312SMartin Matuska static void
brt_rlock(spa_t * spa)313718519f4SMartin Matuska brt_rlock(spa_t *spa)
3142a58b312SMartin Matuska {
315718519f4SMartin Matuska rw_enter(&spa->spa_brt_lock, RW_READER);
3162a58b312SMartin Matuska }
3172a58b312SMartin Matuska
3182a58b312SMartin Matuska static void
brt_wlock(spa_t * spa)319718519f4SMartin Matuska brt_wlock(spa_t *spa)
3202a58b312SMartin Matuska {
321718519f4SMartin Matuska rw_enter(&spa->spa_brt_lock, RW_WRITER);
3222a58b312SMartin Matuska }
3232a58b312SMartin Matuska
3242a58b312SMartin Matuska static void
brt_unlock(spa_t * spa)325718519f4SMartin Matuska brt_unlock(spa_t *spa)
3262a58b312SMartin Matuska {
327718519f4SMartin Matuska rw_exit(&spa->spa_brt_lock);
3282a58b312SMartin Matuska }
3292a58b312SMartin Matuska
3302a58b312SMartin Matuska static uint16_t
brt_vdev_entcount_get(const brt_vdev_t * brtvd,uint64_t idx)3312a58b312SMartin Matuska brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
3322a58b312SMartin Matuska {
3332a58b312SMartin Matuska
3342a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size);
3352a58b312SMartin Matuska
3363494f7c0SMartin Matuska if (unlikely(brtvd->bv_need_byteswap)) {
3372a58b312SMartin Matuska return (BSWAP_16(brtvd->bv_entcount[idx]));
3382a58b312SMartin Matuska } else {
3392a58b312SMartin Matuska return (brtvd->bv_entcount[idx]);
3402a58b312SMartin Matuska }
3412a58b312SMartin Matuska }
3422a58b312SMartin Matuska
3432a58b312SMartin Matuska static void
brt_vdev_entcount_set(brt_vdev_t * brtvd,uint64_t idx,uint16_t entcnt)3442a58b312SMartin Matuska brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
3452a58b312SMartin Matuska {
3462a58b312SMartin Matuska
3472a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size);
3482a58b312SMartin Matuska
3493494f7c0SMartin Matuska if (unlikely(brtvd->bv_need_byteswap)) {
3502a58b312SMartin Matuska brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
3512a58b312SMartin Matuska } else {
3522a58b312SMartin Matuska brtvd->bv_entcount[idx] = entcnt;
3532a58b312SMartin Matuska }
3542a58b312SMartin Matuska }
3552a58b312SMartin Matuska
3562a58b312SMartin Matuska static void
brt_vdev_entcount_inc(brt_vdev_t * brtvd,uint64_t idx)3572a58b312SMartin Matuska brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
3582a58b312SMartin Matuska {
3592a58b312SMartin Matuska uint16_t entcnt;
3602a58b312SMartin Matuska
3612a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size);
3622a58b312SMartin Matuska
3632a58b312SMartin Matuska entcnt = brt_vdev_entcount_get(brtvd, idx);
3642a58b312SMartin Matuska ASSERT(entcnt < UINT16_MAX);
3652a58b312SMartin Matuska
3662a58b312SMartin Matuska brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
3672a58b312SMartin Matuska }
3682a58b312SMartin Matuska
3692a58b312SMartin Matuska static void
brt_vdev_entcount_dec(brt_vdev_t * brtvd,uint64_t idx)3702a58b312SMartin Matuska brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
3712a58b312SMartin Matuska {
3722a58b312SMartin Matuska uint16_t entcnt;
3732a58b312SMartin Matuska
3742a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size);
3752a58b312SMartin Matuska
3762a58b312SMartin Matuska entcnt = brt_vdev_entcount_get(brtvd, idx);
3772a58b312SMartin Matuska ASSERT(entcnt > 0);
3782a58b312SMartin Matuska
3792a58b312SMartin Matuska brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
3802a58b312SMartin Matuska }
3812a58b312SMartin Matuska
#ifdef ZFS_DEBUG
/*
 * Debug-only helper that writes the state of one per-vdev BRT to the ZFS
 * debug log via zfs_dbgmsg():
 *  - a summary line with the vdev id, the dirty flags, bv_size,
 *    bv_totalcount, the number of bitmap blocks and the bitmap size;
 *  - when bv_totalcount is non-zero, every non-zero entry counter;
 *  - when bv_entcount_dirty is set, the dirty bitmap rendered as a string
 *    with 'x' for a set bit and '.' for a clear one.
 */
static void
brt_vdev_dump(brt_vdev_t *brtvd)
{
	uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);

	zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
	    (u_longlong_t)brtvd->bv_vdevid,
	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
	    (u_longlong_t)brtvd->bv_size,
	    (u_longlong_t)brtvd->bv_totalcount,
	    (u_longlong_t)nblocks,
	    (size_t)BT_SIZEOFMAP(nblocks));

	if (brtvd->bv_totalcount > 0) {
		zfs_dbgmsg(" entcounts:");
		for (uint64_t i = 0; i < brtvd->bv_size; i++) {
			uint16_t cnt = brt_vdev_entcount_get(brtvd, i);

			/* Only regions that hold entries are reported. */
			if (cnt == 0)
				continue;
			zfs_dbgmsg(" [%04llu] %hu", (u_longlong_t)i, cnt);
		}
	}

	if (!brtvd->bv_entcount_dirty)
		return;

	/* One character per bitmap block plus the terminating NUL. */
	char *text = kmem_alloc(nblocks + 1, KM_SLEEP);

	for (uint64_t i = 0; i < nblocks; i++)
		text[i] = BT_TEST(brtvd->bv_bitmap, i) ? 'x' : '.';
	text[nblocks] = '\0';

	zfs_dbgmsg(" dirty: %s", text);
	kmem_free(text, nblocks + 1);
}
#endif
4212a58b312SMartin Matuska
4222a58b312SMartin Matuska static brt_vdev_t *
brt_vdev(spa_t * spa,uint64_t vdevid,boolean_t alloc)423718519f4SMartin Matuska brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
4242a58b312SMartin Matuska {
425718519f4SMartin Matuska brt_vdev_t *brtvd = NULL;
4262a58b312SMartin Matuska
427718519f4SMartin Matuska brt_rlock(spa);
428718519f4SMartin Matuska if (vdevid < spa->spa_brt_nvdevs) {
429718519f4SMartin Matuska brtvd = spa->spa_brt_vdevs[vdevid];
430718519f4SMartin Matuska } else if (alloc) {
431718519f4SMartin Matuska /* New VDEV was added. */
432718519f4SMartin Matuska brt_unlock(spa);
433718519f4SMartin Matuska brt_wlock(spa);
434718519f4SMartin Matuska if (vdevid >= spa->spa_brt_nvdevs)
435718519f4SMartin Matuska brt_vdevs_expand(spa, vdevid + 1);
436718519f4SMartin Matuska brtvd = spa->spa_brt_vdevs[vdevid];
4372a58b312SMartin Matuska }
438718519f4SMartin Matuska brt_unlock(spa);
4392a58b312SMartin Matuska return (brtvd);
4402a58b312SMartin Matuska }
4412a58b312SMartin Matuska
4422a58b312SMartin Matuska static void
brt_vdev_create(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)443718519f4SMartin Matuska brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
4442a58b312SMartin Matuska {
4452a58b312SMartin Matuska char name[64];
4462a58b312SMartin Matuska
447718519f4SMartin Matuska ASSERT(brtvd->bv_initiated);
4482a58b312SMartin Matuska ASSERT0(brtvd->bv_mos_brtvdev);
4492a58b312SMartin Matuska ASSERT0(brtvd->bv_mos_entries);
4502a58b312SMartin Matuska
451718519f4SMartin Matuska uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
4522a58b312SMartin Matuska ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
453783d3ff6SMartin Matuska brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
454718519f4SMartin Matuska VERIFY(mos_entries != 0);
455718519f4SMartin Matuska VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
456718519f4SMartin Matuska &brtvd->bv_mos_entries_dnode));
457718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
458718519f4SMartin Matuska brtvd->bv_mos_entries = mos_entries;
459718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock);
4602a58b312SMartin Matuska BRT_DEBUG("MOS entries created, object=%llu",
4612a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_entries);
4622a58b312SMartin Matuska
4632a58b312SMartin Matuska /*
4642a58b312SMartin Matuska * We allocate DMU buffer to store the bv_entcount[] array.
4652a58b312SMartin Matuska * We will keep array size (bv_size) and cummulative count for all
4662a58b312SMartin Matuska * bv_entcount[]s (bv_totalcount) in the bonus buffer.
4672a58b312SMartin Matuska */
468718519f4SMartin Matuska brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
4692a58b312SMartin Matuska DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
4702a58b312SMartin Matuska DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
4712a58b312SMartin Matuska VERIFY(brtvd->bv_mos_brtvdev != 0);
4722a58b312SMartin Matuska BRT_DEBUG("MOS BRT VDEV created, object=%llu",
4732a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_brtvdev);
4742a58b312SMartin Matuska
4752a58b312SMartin Matuska snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
4762a58b312SMartin Matuska (u_longlong_t)brtvd->bv_vdevid);
477718519f4SMartin Matuska VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
4782a58b312SMartin Matuska sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
4792a58b312SMartin Matuska BRT_DEBUG("Pool directory object created, object=%s", name);
4802a58b312SMartin Matuska
481718519f4SMartin Matuska spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
4822a58b312SMartin Matuska }
4832a58b312SMartin Matuska
4842a58b312SMartin Matuska static void
brt_vdev_realloc(spa_t * spa,brt_vdev_t * brtvd)485718519f4SMartin Matuska brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
4862a58b312SMartin Matuska {
4872a58b312SMartin Matuska vdev_t *vd;
4882a58b312SMartin Matuska uint16_t *entcount;
4892a58b312SMartin Matuska ulong_t *bitmap;
490718519f4SMartin Matuska uint64_t nblocks, onblocks, size;
4912a58b312SMartin Matuska
492718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
4932a58b312SMartin Matuska
494718519f4SMartin Matuska spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
495718519f4SMartin Matuska vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
496718519f4SMartin Matuska size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
497718519f4SMartin Matuska spa_config_exit(spa, SCL_VDEV, FTAG);
4982a58b312SMartin Matuska
499315ee00fSMartin Matuska entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
5002a58b312SMartin Matuska nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
5012a58b312SMartin Matuska bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
5022a58b312SMartin Matuska
5032a58b312SMartin Matuska if (!brtvd->bv_initiated) {
5042a58b312SMartin Matuska ASSERT0(brtvd->bv_size);
505718519f4SMartin Matuska ASSERT0P(brtvd->bv_entcount);
506718519f4SMartin Matuska ASSERT0P(brtvd->bv_bitmap);
5072a58b312SMartin Matuska } else {
5082a58b312SMartin Matuska ASSERT(brtvd->bv_size > 0);
5092a58b312SMartin Matuska ASSERT(brtvd->bv_entcount != NULL);
5102a58b312SMartin Matuska ASSERT(brtvd->bv_bitmap != NULL);
5112a58b312SMartin Matuska /*
5122a58b312SMartin Matuska * TODO: Allow vdev shrinking. We only need to implement
5132a58b312SMartin Matuska * shrinking the on-disk BRT VDEV object.
514718519f4SMartin Matuska * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
515718519f4SMartin Matuska * offset, size, tx);
5162a58b312SMartin Matuska */
5172a58b312SMartin Matuska ASSERT3U(brtvd->bv_size, <=, size);
5182a58b312SMartin Matuska
5192a58b312SMartin Matuska memcpy(entcount, brtvd->bv_entcount,
5202a58b312SMartin Matuska sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
521315ee00fSMartin Matuska vmem_free(brtvd->bv_entcount,
5222a58b312SMartin Matuska sizeof (entcount[0]) * brtvd->bv_size);
523718519f4SMartin Matuska onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
524718519f4SMartin Matuska memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
525718519f4SMartin Matuska BT_SIZEOFMAP(onblocks)));
526718519f4SMartin Matuska kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
5272a58b312SMartin Matuska }
5282a58b312SMartin Matuska
5292a58b312SMartin Matuska brtvd->bv_size = size;
5302a58b312SMartin Matuska brtvd->bv_entcount = entcount;
5312a58b312SMartin Matuska brtvd->bv_bitmap = bitmap;
5322a58b312SMartin Matuska if (!brtvd->bv_initiated) {
5332a58b312SMartin Matuska brtvd->bv_need_byteswap = FALSE;
5342a58b312SMartin Matuska brtvd->bv_initiated = TRUE;
5352a58b312SMartin Matuska BRT_DEBUG("BRT VDEV %llu initiated.",
5362a58b312SMartin Matuska (u_longlong_t)brtvd->bv_vdevid);
5372a58b312SMartin Matuska }
5382a58b312SMartin Matuska }
5392a58b312SMartin Matuska
540718519f4SMartin Matuska static int
brt_vdev_load(spa_t * spa,brt_vdev_t * brtvd)541718519f4SMartin Matuska brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
5422a58b312SMartin Matuska {
5432a58b312SMartin Matuska dmu_buf_t *db;
5442a58b312SMartin Matuska brt_vdev_phys_t *bvphys;
5452a58b312SMartin Matuska int error;
5462a58b312SMartin Matuska
547718519f4SMartin Matuska ASSERT(!brtvd->bv_initiated);
5482a58b312SMartin Matuska ASSERT(brtvd->bv_mos_brtvdev != 0);
5492a58b312SMartin Matuska
550718519f4SMartin Matuska error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
551718519f4SMartin Matuska FTAG, &db);
5522a58b312SMartin Matuska if (error != 0)
553718519f4SMartin Matuska return (error);
5542a58b312SMartin Matuska
5552a58b312SMartin Matuska bvphys = db->db_data;
556718519f4SMartin Matuska if (spa->spa_brt_rangesize == 0) {
557718519f4SMartin Matuska spa->spa_brt_rangesize = bvphys->bvp_rangesize;
5582a58b312SMartin Matuska } else {
559718519f4SMartin Matuska ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
5602a58b312SMartin Matuska }
5612a58b312SMartin Matuska
562718519f4SMartin Matuska brt_vdev_realloc(spa, brtvd);
5632a58b312SMartin Matuska
5642a58b312SMartin Matuska /* TODO: We don't support VDEV shrinking. */
5652a58b312SMartin Matuska ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
5662a58b312SMartin Matuska
5672a58b312SMartin Matuska /*
5682a58b312SMartin Matuska * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
5692a58b312SMartin Matuska */
570718519f4SMartin Matuska error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
5712a58b312SMartin Matuska MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
5722a58b312SMartin Matuska brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
573718519f4SMartin Matuska if (error != 0)
574718519f4SMartin Matuska return (error);
5752a58b312SMartin Matuska
576718519f4SMartin Matuska ASSERT(bvphys->bvp_mos_entries != 0);
577718519f4SMartin Matuska VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
578718519f4SMartin Matuska &brtvd->bv_mos_entries_dnode));
579718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
5802a58b312SMartin Matuska brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
581718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock);
5822a58b312SMartin Matuska brtvd->bv_need_byteswap =
5832a58b312SMartin Matuska (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
5842a58b312SMartin Matuska brtvd->bv_totalcount = bvphys->bvp_totalcount;
5852a58b312SMartin Matuska brtvd->bv_usedspace = bvphys->bvp_usedspace;
5862a58b312SMartin Matuska brtvd->bv_savedspace = bvphys->bvp_savedspace;
5872a58b312SMartin Matuska
5882a58b312SMartin Matuska dmu_buf_rele(db, FTAG);
5892a58b312SMartin Matuska
590718519f4SMartin Matuska BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
591718519f4SMartin Matuska (u_longlong_t)brtvd->bv_vdevid,
592718519f4SMartin Matuska (u_longlong_t)brtvd->bv_mos_brtvdev,
5932a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_entries);
594718519f4SMartin Matuska return (0);
5952a58b312SMartin Matuska }
5962a58b312SMartin Matuska
5972a58b312SMartin Matuska static void
brt_vdev_dealloc(brt_vdev_t * brtvd)598718519f4SMartin Matuska brt_vdev_dealloc(brt_vdev_t *brtvd)
5992a58b312SMartin Matuska {
600718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
6012a58b312SMartin Matuska ASSERT(brtvd->bv_initiated);
602718519f4SMartin Matuska ASSERT0(avl_numnodes(&brtvd->bv_tree));
6032a58b312SMartin Matuska
604315ee00fSMartin Matuska vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
6052a58b312SMartin Matuska brtvd->bv_entcount = NULL;
606718519f4SMartin Matuska uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
607718519f4SMartin Matuska kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
6082a58b312SMartin Matuska brtvd->bv_bitmap = NULL;
6092a58b312SMartin Matuska
6102a58b312SMartin Matuska brtvd->bv_size = 0;
6112a58b312SMartin Matuska
6122a58b312SMartin Matuska brtvd->bv_initiated = FALSE;
6132a58b312SMartin Matuska BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
6142a58b312SMartin Matuska }
6152a58b312SMartin Matuska
6162a58b312SMartin Matuska static void
brt_vdev_destroy(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)617718519f4SMartin Matuska brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
6182a58b312SMartin Matuska {
6192a58b312SMartin Matuska char name[64];
6202a58b312SMartin Matuska uint64_t count;
6212a58b312SMartin Matuska
622718519f4SMartin Matuska ASSERT(brtvd->bv_initiated);
6232a58b312SMartin Matuska ASSERT(brtvd->bv_mos_brtvdev != 0);
6242a58b312SMartin Matuska ASSERT(brtvd->bv_mos_entries != 0);
625718519f4SMartin Matuska ASSERT0(brtvd->bv_totalcount);
626718519f4SMartin Matuska ASSERT0(brtvd->bv_usedspace);
627718519f4SMartin Matuska ASSERT0(brtvd->bv_savedspace);
6282a58b312SMartin Matuska
629718519f4SMartin Matuska uint64_t mos_entries = brtvd->bv_mos_entries;
630718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
6312a58b312SMartin Matuska brtvd->bv_mos_entries = 0;
632718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock);
633718519f4SMartin Matuska dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
634718519f4SMartin Matuska brtvd->bv_mos_entries_dnode = NULL;
635718519f4SMartin Matuska ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
636718519f4SMartin Matuska ASSERT0(count);
637718519f4SMartin Matuska VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
638718519f4SMartin Matuska BRT_DEBUG("MOS entries destroyed, object=%llu",
639718519f4SMartin Matuska (u_longlong_t)mos_entries);
6402a58b312SMartin Matuska
641718519f4SMartin Matuska VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
642718519f4SMartin Matuska tx));
6432a58b312SMartin Matuska BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
6442a58b312SMartin Matuska (u_longlong_t)brtvd->bv_mos_brtvdev);
6452a58b312SMartin Matuska brtvd->bv_mos_brtvdev = 0;
646718519f4SMartin Matuska brtvd->bv_entcount_dirty = FALSE;
6472a58b312SMartin Matuska
6482a58b312SMartin Matuska snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
6492a58b312SMartin Matuska (u_longlong_t)brtvd->bv_vdevid);
650718519f4SMartin Matuska VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
651718519f4SMartin Matuska name, tx));
6522a58b312SMartin Matuska BRT_DEBUG("Pool directory object removed, object=%s", name);
6532a58b312SMartin Matuska
654718519f4SMartin Matuska brtvd->bv_meta_dirty = FALSE;
6552a58b312SMartin Matuska
656718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER);
657718519f4SMartin Matuska brt_vdev_dealloc(brtvd);
658718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
659718519f4SMartin Matuska
660718519f4SMartin Matuska spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
6612a58b312SMartin Matuska }
6622a58b312SMartin Matuska
6632a58b312SMartin Matuska static void
brt_vdevs_expand(spa_t * spa,uint64_t nvdevs)664718519f4SMartin Matuska brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
6652a58b312SMartin Matuska {
666718519f4SMartin Matuska brt_vdev_t **vdevs;
6672a58b312SMartin Matuska
668718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
669718519f4SMartin Matuska ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
6702a58b312SMartin Matuska
671718519f4SMartin Matuska if (nvdevs == spa->spa_brt_nvdevs)
672718519f4SMartin Matuska return;
6732a58b312SMartin Matuska
674718519f4SMartin Matuska vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
675718519f4SMartin Matuska if (spa->spa_brt_nvdevs > 0) {
676718519f4SMartin Matuska ASSERT(spa->spa_brt_vdevs != NULL);
677718519f4SMartin Matuska
678718519f4SMartin Matuska memcpy(vdevs, spa->spa_brt_vdevs,
679718519f4SMartin Matuska sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
680718519f4SMartin Matuska kmem_free(spa->spa_brt_vdevs,
681718519f4SMartin Matuska sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
6822a58b312SMartin Matuska }
683718519f4SMartin Matuska spa->spa_brt_vdevs = vdevs;
6842a58b312SMartin Matuska
685718519f4SMartin Matuska for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
686718519f4SMartin Matuska brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
687718519f4SMartin Matuska rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
6882a58b312SMartin Matuska brtvd->bv_vdevid = vdevid;
6892a58b312SMartin Matuska brtvd->bv_initiated = FALSE;
690718519f4SMartin Matuska rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
691718519f4SMartin Matuska avl_create(&brtvd->bv_tree, brt_entry_compare,
692718519f4SMartin Matuska sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
693718519f4SMartin Matuska for (int i = 0; i < TXG_SIZE; i++) {
694718519f4SMartin Matuska avl_create(&brtvd->bv_pending_tree[i],
695718519f4SMartin Matuska brt_entry_compare, sizeof (brt_entry_t),
696718519f4SMartin Matuska offsetof(brt_entry_t, bre_node));
697718519f4SMartin Matuska }
698718519f4SMartin Matuska mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
699718519f4SMartin Matuska spa->spa_brt_vdevs[vdevid] = brtvd;
7002a58b312SMartin Matuska }
7012a58b312SMartin Matuska
7022a58b312SMartin Matuska BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
703718519f4SMartin Matuska (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
704718519f4SMartin Matuska spa->spa_brt_nvdevs = nvdevs;
7052a58b312SMartin Matuska }
7062a58b312SMartin Matuska
7072a58b312SMartin Matuska static boolean_t
brt_vdev_lookup(spa_t * spa,brt_vdev_t * brtvd,uint64_t offset)708718519f4SMartin Matuska brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
7092a58b312SMartin Matuska {
710718519f4SMartin Matuska uint64_t idx = offset / spa->spa_brt_rangesize;
711718519f4SMartin Matuska if (idx < brtvd->bv_size) {
7122a58b312SMartin Matuska /* VDEV wasn't expanded. */
7132a58b312SMartin Matuska return (brt_vdev_entcount_get(brtvd, idx) > 0);
7142a58b312SMartin Matuska }
7152a58b312SMartin Matuska return (FALSE);
7162a58b312SMartin Matuska }
7172a58b312SMartin Matuska
7182a58b312SMartin Matuska static void
brt_vdev_addref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize,uint64_t count)719718519f4SMartin Matuska brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
720718519f4SMartin Matuska uint64_t dsize, uint64_t count)
7212a58b312SMartin Matuska {
7222a58b312SMartin Matuska uint64_t idx;
7232a58b312SMartin Matuska
724718519f4SMartin Matuska ASSERT(brtvd->bv_initiated);
7252a58b312SMartin Matuska
726718519f4SMartin Matuska brtvd->bv_savedspace += dsize * count;
7272a58b312SMartin Matuska brtvd->bv_meta_dirty = TRUE;
7282a58b312SMartin Matuska
729718519f4SMartin Matuska if (bre->bre_count > 0)
7302a58b312SMartin Matuska return;
7312a58b312SMartin Matuska
7322a58b312SMartin Matuska brtvd->bv_usedspace += dsize;
7332a58b312SMartin Matuska
734718519f4SMartin Matuska idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
7352a58b312SMartin Matuska if (idx >= brtvd->bv_size) {
7362a58b312SMartin Matuska /* VDEV has been expanded. */
737718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER);
738718519f4SMartin Matuska brt_vdev_realloc(spa, brtvd);
739718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
7402a58b312SMartin Matuska }
7412a58b312SMartin Matuska
7422a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size);
7432a58b312SMartin Matuska
7442a58b312SMartin Matuska brtvd->bv_totalcount++;
7452a58b312SMartin Matuska brt_vdev_entcount_inc(brtvd, idx);
7462a58b312SMartin Matuska brtvd->bv_entcount_dirty = TRUE;
7472a58b312SMartin Matuska idx = idx / BRT_BLOCKSIZE / 8;
7482a58b312SMartin Matuska BT_SET(brtvd->bv_bitmap, idx);
7492a58b312SMartin Matuska }
7502a58b312SMartin Matuska
7512a58b312SMartin Matuska static void
brt_vdev_decref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize)752718519f4SMartin Matuska brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
7532a58b312SMartin Matuska uint64_t dsize)
7542a58b312SMartin Matuska {
7552a58b312SMartin Matuska uint64_t idx;
7562a58b312SMartin Matuska
757718519f4SMartin Matuska ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
758718519f4SMartin Matuska ASSERT(brtvd->bv_initiated);
7592a58b312SMartin Matuska
7602a58b312SMartin Matuska brtvd->bv_savedspace -= dsize;
7612a58b312SMartin Matuska brtvd->bv_meta_dirty = TRUE;
7622a58b312SMartin Matuska
763718519f4SMartin Matuska if (bre->bre_count > 0)
7642a58b312SMartin Matuska return;
7652a58b312SMartin Matuska
7662a58b312SMartin Matuska brtvd->bv_usedspace -= dsize;
7672a58b312SMartin Matuska
768718519f4SMartin Matuska idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
7692a58b312SMartin Matuska ASSERT3U(idx, <, brtvd->bv_size);
7702a58b312SMartin Matuska
7712a58b312SMartin Matuska ASSERT(brtvd->bv_totalcount > 0);
7722a58b312SMartin Matuska brtvd->bv_totalcount--;
7732a58b312SMartin Matuska brt_vdev_entcount_dec(brtvd, idx);
7742a58b312SMartin Matuska brtvd->bv_entcount_dirty = TRUE;
7752a58b312SMartin Matuska idx = idx / BRT_BLOCKSIZE / 8;
7762a58b312SMartin Matuska BT_SET(brtvd->bv_bitmap, idx);
7772a58b312SMartin Matuska }
7782a58b312SMartin Matuska
7792a58b312SMartin Matuska static void
brt_vdev_sync(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)780718519f4SMartin Matuska brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
7812a58b312SMartin Matuska {
7822a58b312SMartin Matuska dmu_buf_t *db;
7832a58b312SMartin Matuska brt_vdev_phys_t *bvphys;
7842a58b312SMartin Matuska
7852a58b312SMartin Matuska ASSERT(brtvd->bv_meta_dirty);
7862a58b312SMartin Matuska ASSERT(brtvd->bv_mos_brtvdev != 0);
7872a58b312SMartin Matuska ASSERT(dmu_tx_is_syncing(tx));
7882a58b312SMartin Matuska
789718519f4SMartin Matuska VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
790718519f4SMartin Matuska FTAG, &db));
7912a58b312SMartin Matuska
7922a58b312SMartin Matuska if (brtvd->bv_entcount_dirty) {
7932a58b312SMartin Matuska /*
7942a58b312SMartin Matuska * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
7952a58b312SMartin Matuska */
796718519f4SMartin Matuska dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
7972a58b312SMartin Matuska brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
7982a58b312SMartin Matuska brtvd->bv_entcount, tx);
799718519f4SMartin Matuska uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
800718519f4SMartin Matuska memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
8012a58b312SMartin Matuska brtvd->bv_entcount_dirty = FALSE;
8022a58b312SMartin Matuska }
8032a58b312SMartin Matuska
8042a58b312SMartin Matuska dmu_buf_will_dirty(db, tx);
8052a58b312SMartin Matuska bvphys = db->db_data;
8062a58b312SMartin Matuska bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
8072a58b312SMartin Matuska bvphys->bvp_size = brtvd->bv_size;
8082a58b312SMartin Matuska if (brtvd->bv_need_byteswap) {
8092a58b312SMartin Matuska bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
8102a58b312SMartin Matuska } else {
8112a58b312SMartin Matuska bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
8122a58b312SMartin Matuska }
8132a58b312SMartin Matuska bvphys->bvp_totalcount = brtvd->bv_totalcount;
814718519f4SMartin Matuska bvphys->bvp_rangesize = spa->spa_brt_rangesize;
8152a58b312SMartin Matuska bvphys->bvp_usedspace = brtvd->bv_usedspace;
8162a58b312SMartin Matuska bvphys->bvp_savedspace = brtvd->bv_savedspace;
8172a58b312SMartin Matuska dmu_buf_rele(db, FTAG);
8182a58b312SMartin Matuska
8192a58b312SMartin Matuska brtvd->bv_meta_dirty = FALSE;
8202a58b312SMartin Matuska }
8212a58b312SMartin Matuska
8222a58b312SMartin Matuska static void
brt_vdevs_free(spa_t * spa)823718519f4SMartin Matuska brt_vdevs_free(spa_t *spa)
8242a58b312SMartin Matuska {
825718519f4SMartin Matuska if (spa->spa_brt_vdevs == 0)
826718519f4SMartin Matuska return;
827718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
828718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
829718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER);
8302a58b312SMartin Matuska if (brtvd->bv_initiated)
831718519f4SMartin Matuska brt_vdev_dealloc(brtvd);
832718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
833718519f4SMartin Matuska rw_destroy(&brtvd->bv_lock);
834718519f4SMartin Matuska if (brtvd->bv_mos_entries != 0)
835718519f4SMartin Matuska dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
836718519f4SMartin Matuska rw_destroy(&brtvd->bv_mos_entries_lock);
837718519f4SMartin Matuska avl_destroy(&brtvd->bv_tree);
838718519f4SMartin Matuska for (int i = 0; i < TXG_SIZE; i++)
839718519f4SMartin Matuska avl_destroy(&brtvd->bv_pending_tree[i]);
840718519f4SMartin Matuska mutex_destroy(&brtvd->bv_pending_lock);
841718519f4SMartin Matuska kmem_free(brtvd, sizeof (*brtvd));
8422a58b312SMartin Matuska }
843718519f4SMartin Matuska kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
844718519f4SMartin Matuska spa->spa_brt_nvdevs);
8452a58b312SMartin Matuska }
8462a58b312SMartin Matuska
8472a58b312SMartin Matuska static void
brt_entry_fill(const blkptr_t * bp,brt_entry_t * bre,uint64_t * vdevidp)8482a58b312SMartin Matuska brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
8492a58b312SMartin Matuska {
8502a58b312SMartin Matuska
851718519f4SMartin Matuska bre->bre_bp = *bp;
852718519f4SMartin Matuska bre->bre_count = 0;
853718519f4SMartin Matuska bre->bre_pcount = 0;
8542a58b312SMartin Matuska
8552a58b312SMartin Matuska *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
8562a58b312SMartin Matuska }
8572a58b312SMartin Matuska
8582a58b312SMartin Matuska static int
brt_entry_lookup(brt_vdev_t * brtvd,brt_entry_t * bre)859718519f4SMartin Matuska brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
8602a58b312SMartin Matuska {
861718519f4SMartin Matuska uint64_t off = BRE_OFFSET(bre);
8622a58b312SMartin Matuska
863dd215568SMartin Matuska if (brtvd->bv_mos_entries == 0)
864dd215568SMartin Matuska return (SET_ERROR(ENOENT));
865dd215568SMartin Matuska
866718519f4SMartin Matuska return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
867718519f4SMartin Matuska &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
8682a58b312SMartin Matuska }
8692a58b312SMartin Matuska
8702a58b312SMartin Matuska /*
8712a58b312SMartin Matuska * Return TRUE if we _can_ have BRT entry for this bp. It might be false
8722a58b312SMartin Matuska * positive, but gives us quick answer if we should look into BRT, which
8732a58b312SMartin Matuska * may require reads and thus will be more expensive.
8742a58b312SMartin Matuska */
8752a58b312SMartin Matuska boolean_t
brt_maybe_exists(spa_t * spa,const blkptr_t * bp)8762a58b312SMartin Matuska brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
8772a58b312SMartin Matuska {
8782a58b312SMartin Matuska
879718519f4SMartin Matuska if (spa->spa_brt_nvdevs == 0)
880718519f4SMartin Matuska return (B_FALSE);
8812a58b312SMartin Matuska
882718519f4SMartin Matuska uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
883718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
884718519f4SMartin Matuska if (brtvd == NULL || !brtvd->bv_initiated)
885718519f4SMartin Matuska return (FALSE);
8862a58b312SMartin Matuska
887718519f4SMartin Matuska /*
888718519f4SMartin Matuska * We don't need locks here, since bv_entcount pointer must be
889718519f4SMartin Matuska * stable at this point, and we don't care about false positive
890718519f4SMartin Matuska * races here, while false negative should be impossible, since
891718519f4SMartin Matuska * all brt_vdev_addref() have already completed by this point.
892718519f4SMartin Matuska */
893718519f4SMartin Matuska uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
894718519f4SMartin Matuska return (brt_vdev_lookup(spa, brtvd, off));
8952a58b312SMartin Matuska }
8962a58b312SMartin Matuska
8972a58b312SMartin Matuska uint64_t
brt_get_dspace(spa_t * spa)8982a58b312SMartin Matuska brt_get_dspace(spa_t *spa)
8992a58b312SMartin Matuska {
900718519f4SMartin Matuska if (spa->spa_brt_nvdevs == 0)
9012a58b312SMartin Matuska return (0);
9022a58b312SMartin Matuska
903718519f4SMartin Matuska brt_rlock(spa);
904718519f4SMartin Matuska uint64_t s = 0;
905718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
906718519f4SMartin Matuska s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
907718519f4SMartin Matuska brt_unlock(spa);
908718519f4SMartin Matuska return (s);
9092a58b312SMartin Matuska }
9102a58b312SMartin Matuska
9112a58b312SMartin Matuska uint64_t
brt_get_used(spa_t * spa)9122a58b312SMartin Matuska brt_get_used(spa_t *spa)
9132a58b312SMartin Matuska {
914718519f4SMartin Matuska if (spa->spa_brt_nvdevs == 0)
9152a58b312SMartin Matuska return (0);
9162a58b312SMartin Matuska
917718519f4SMartin Matuska brt_rlock(spa);
918718519f4SMartin Matuska uint64_t s = 0;
919718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
920718519f4SMartin Matuska s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
921718519f4SMartin Matuska brt_unlock(spa);
922718519f4SMartin Matuska return (s);
9232a58b312SMartin Matuska }
9242a58b312SMartin Matuska
9252a58b312SMartin Matuska uint64_t
brt_get_saved(spa_t * spa)9262a58b312SMartin Matuska brt_get_saved(spa_t *spa)
9272a58b312SMartin Matuska {
928718519f4SMartin Matuska return (brt_get_dspace(spa));
9292a58b312SMartin Matuska }
9302a58b312SMartin Matuska
9312a58b312SMartin Matuska uint64_t
brt_get_ratio(spa_t * spa)9322a58b312SMartin Matuska brt_get_ratio(spa_t *spa)
9332a58b312SMartin Matuska {
934718519f4SMartin Matuska uint64_t used = brt_get_used(spa);
935718519f4SMartin Matuska if (used == 0)
9362a58b312SMartin Matuska return (100);
937718519f4SMartin Matuska return ((used + brt_get_saved(spa)) * 100 / used);
9382a58b312SMartin Matuska }
9392a58b312SMartin Matuska
9402a58b312SMartin Matuska static int
brt_kstats_update(kstat_t * ksp,int rw)9412a58b312SMartin Matuska brt_kstats_update(kstat_t *ksp, int rw)
9422a58b312SMartin Matuska {
9432a58b312SMartin Matuska brt_stats_t *bs = ksp->ks_data;
9442a58b312SMartin Matuska
9452a58b312SMartin Matuska if (rw == KSTAT_WRITE)
9462a58b312SMartin Matuska return (EACCES);
9472a58b312SMartin Matuska
9482a58b312SMartin Matuska bs->brt_addref_entry_not_on_disk.value.ui64 =
9492a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
9502a58b312SMartin Matuska bs->brt_addref_entry_on_disk.value.ui64 =
9512a58b312SMartin Matuska wmsum_value(&brt_sums.brt_addref_entry_on_disk);
9522a58b312SMartin Matuska bs->brt_decref_entry_in_memory.value.ui64 =
9532a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_in_memory);
9542a58b312SMartin Matuska bs->brt_decref_entry_loaded_from_disk.value.ui64 =
9552a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
9562a58b312SMartin Matuska bs->brt_decref_entry_not_in_memory.value.ui64 =
9572a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
9582a58b312SMartin Matuska bs->brt_decref_entry_read_lost_race.value.ui64 =
9592a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
9602a58b312SMartin Matuska bs->brt_decref_entry_still_referenced.value.ui64 =
9612a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
9622a58b312SMartin Matuska bs->brt_decref_free_data_later.value.ui64 =
9632a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_free_data_later);
9642a58b312SMartin Matuska bs->brt_decref_free_data_now.value.ui64 =
9652a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_free_data_now);
9662a58b312SMartin Matuska bs->brt_decref_no_entry.value.ui64 =
9672a58b312SMartin Matuska wmsum_value(&brt_sums.brt_decref_no_entry);
9682a58b312SMartin Matuska
9692a58b312SMartin Matuska return (0);
9702a58b312SMartin Matuska }
9712a58b312SMartin Matuska
9722a58b312SMartin Matuska static void
brt_stat_init(void)9732a58b312SMartin Matuska brt_stat_init(void)
9742a58b312SMartin Matuska {
9752a58b312SMartin Matuska
9762a58b312SMartin Matuska wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
9772a58b312SMartin Matuska wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
9782a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
9792a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
9802a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
9812a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
9822a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
9832a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
9842a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
9852a58b312SMartin Matuska wmsum_init(&brt_sums.brt_decref_no_entry, 0);
9862a58b312SMartin Matuska
9872a58b312SMartin Matuska brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
9882a58b312SMartin Matuska sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
9892a58b312SMartin Matuska if (brt_ksp != NULL) {
9902a58b312SMartin Matuska brt_ksp->ks_data = &brt_stats;
9912a58b312SMartin Matuska brt_ksp->ks_update = brt_kstats_update;
9922a58b312SMartin Matuska kstat_install(brt_ksp);
9932a58b312SMartin Matuska }
9942a58b312SMartin Matuska }
9952a58b312SMartin Matuska
9962a58b312SMartin Matuska static void
brt_stat_fini(void)9972a58b312SMartin Matuska brt_stat_fini(void)
9982a58b312SMartin Matuska {
9992a58b312SMartin Matuska if (brt_ksp != NULL) {
10002a58b312SMartin Matuska kstat_delete(brt_ksp);
10012a58b312SMartin Matuska brt_ksp = NULL;
10022a58b312SMartin Matuska }
10032a58b312SMartin Matuska
10042a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
10052a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
10062a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
10072a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
10082a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
10092a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
10102a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
10112a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_free_data_later);
10122a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_free_data_now);
10132a58b312SMartin Matuska wmsum_fini(&brt_sums.brt_decref_no_entry);
10142a58b312SMartin Matuska }
10152a58b312SMartin Matuska
10162a58b312SMartin Matuska void
brt_init(void)10172a58b312SMartin Matuska brt_init(void)
10182a58b312SMartin Matuska {
10192a58b312SMartin Matuska brt_entry_cache = kmem_cache_create("brt_entry_cache",
10202a58b312SMartin Matuska sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
10212a58b312SMartin Matuska
10222a58b312SMartin Matuska brt_stat_init();
10232a58b312SMartin Matuska }
10242a58b312SMartin Matuska
10252a58b312SMartin Matuska void
brt_fini(void)10262a58b312SMartin Matuska brt_fini(void)
10272a58b312SMartin Matuska {
10282a58b312SMartin Matuska brt_stat_fini();
10292a58b312SMartin Matuska
10302a58b312SMartin Matuska kmem_cache_destroy(brt_entry_cache);
10312a58b312SMartin Matuska }
10322a58b312SMartin Matuska
10332a58b312SMartin Matuska /* Return TRUE if block should be freed immediately. */
10342a58b312SMartin Matuska boolean_t
brt_entry_decref(spa_t * spa,const blkptr_t * bp)10352a58b312SMartin Matuska brt_entry_decref(spa_t *spa, const blkptr_t *bp)
10362a58b312SMartin Matuska {
10372a58b312SMartin Matuska brt_entry_t *bre, *racebre;
10382a58b312SMartin Matuska brt_entry_t bre_search;
10392a58b312SMartin Matuska avl_index_t where;
10402a58b312SMartin Matuska uint64_t vdevid;
10412a58b312SMartin Matuska int error;
10422a58b312SMartin Matuska
10432a58b312SMartin Matuska brt_entry_fill(bp, &bre_search, &vdevid);
10442a58b312SMartin Matuska
1045718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
10462a58b312SMartin Matuska ASSERT(brtvd != NULL);
10472a58b312SMartin Matuska
1048718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER);
1049718519f4SMartin Matuska ASSERT(brtvd->bv_initiated);
10502a58b312SMartin Matuska bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
10512a58b312SMartin Matuska if (bre != NULL) {
10522a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_in_memory);
10532a58b312SMartin Matuska goto out;
10542a58b312SMartin Matuska } else {
10552a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
10562a58b312SMartin Matuska }
1057718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
10582a58b312SMartin Matuska
1059718519f4SMartin Matuska error = brt_entry_lookup(brtvd, &bre_search);
1060718519f4SMartin Matuska /* bre_search now contains correct bre_count */
10612a58b312SMartin Matuska if (error == ENOENT) {
1062718519f4SMartin Matuska BRTSTAT_BUMP(brt_decref_no_entry);
1063718519f4SMartin Matuska return (B_TRUE);
10642a58b312SMartin Matuska }
1065718519f4SMartin Matuska ASSERT0(error);
10662a58b312SMartin Matuska
1067718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER);
10682a58b312SMartin Matuska racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
10692a58b312SMartin Matuska if (racebre != NULL) {
1070718519f4SMartin Matuska /* The entry was added when the lock was dropped. */
10712a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
10722a58b312SMartin Matuska bre = racebre;
10732a58b312SMartin Matuska goto out;
10742a58b312SMartin Matuska }
10752a58b312SMartin Matuska
10762a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1077718519f4SMartin Matuska bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1078718519f4SMartin Matuska bre->bre_bp = bre_search.bre_bp;
1079718519f4SMartin Matuska bre->bre_count = bre_search.bre_count;
1080718519f4SMartin Matuska bre->bre_pcount = 0;
10812a58b312SMartin Matuska avl_insert(&brtvd->bv_tree, bre, where);
10822a58b312SMartin Matuska
10832a58b312SMartin Matuska out:
1084718519f4SMartin Matuska if (bre->bre_count == 0) {
1085718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
10862a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_free_data_now);
10872a58b312SMartin Matuska return (B_TRUE);
10882a58b312SMartin Matuska }
10892a58b312SMartin Matuska
1090718519f4SMartin Matuska bre->bre_pcount--;
1091718519f4SMartin Matuska ASSERT(bre->bre_count > 0);
1092718519f4SMartin Matuska bre->bre_count--;
1093718519f4SMartin Matuska if (bre->bre_count == 0)
10942a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_free_data_later);
10952a58b312SMartin Matuska else
10962a58b312SMartin Matuska BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1097718519f4SMartin Matuska brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
10982a58b312SMartin Matuska
1099718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
11002a58b312SMartin Matuska
11012a58b312SMartin Matuska return (B_FALSE);
11022a58b312SMartin Matuska }
11032a58b312SMartin Matuska
1104315ee00fSMartin Matuska uint64_t
brt_entry_get_refcount(spa_t * spa,const blkptr_t * bp)1105315ee00fSMartin Matuska brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1106315ee00fSMartin Matuska {
1107315ee00fSMartin Matuska brt_entry_t bre_search, *bre;
1108315ee00fSMartin Matuska uint64_t vdevid, refcnt;
1109315ee00fSMartin Matuska int error;
1110315ee00fSMartin Matuska
1111315ee00fSMartin Matuska brt_entry_fill(bp, &bre_search, &vdevid);
1112315ee00fSMartin Matuska
1113718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1114315ee00fSMartin Matuska ASSERT(brtvd != NULL);
1115315ee00fSMartin Matuska
1116718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_READER);
1117718519f4SMartin Matuska ASSERT(brtvd->bv_initiated);
1118315ee00fSMartin Matuska bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1119315ee00fSMartin Matuska if (bre == NULL) {
1120718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
1121718519f4SMartin Matuska error = brt_entry_lookup(brtvd, &bre_search);
1122718519f4SMartin Matuska if (error == ENOENT) {
1123315ee00fSMartin Matuska refcnt = 0;
1124718519f4SMartin Matuska } else {
1125718519f4SMartin Matuska ASSERT0(error);
1126718519f4SMartin Matuska refcnt = bre_search.bre_count;
1127718519f4SMartin Matuska }
1128718519f4SMartin Matuska } else {
1129718519f4SMartin Matuska refcnt = bre->bre_count;
1130718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
1131718519f4SMartin Matuska }
1132315ee00fSMartin Matuska
1133315ee00fSMartin Matuska return (refcnt);
1134315ee00fSMartin Matuska }
1135315ee00fSMartin Matuska
11362a58b312SMartin Matuska static void
brt_prefetch(brt_vdev_t * brtvd,const blkptr_t * bp)1137718519f4SMartin Matuska brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
11382a58b312SMartin Matuska {
1139718519f4SMartin Matuska if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
11402a58b312SMartin Matuska return;
11412a58b312SMartin Matuska
1142718519f4SMartin Matuska uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1143718519f4SMartin Matuska rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1144718519f4SMartin Matuska if (brtvd->bv_mos_entries != 0) {
1145718519f4SMartin Matuska (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1146718519f4SMartin Matuska &off, BRT_KEY_WORDS);
1147718519f4SMartin Matuska }
1148718519f4SMartin Matuska rw_exit(&brtvd->bv_mos_entries_lock);
11492a58b312SMartin Matuska }
11502a58b312SMartin Matuska
11512a58b312SMartin Matuska static int
brt_entry_compare(const void * x1,const void * x2)1152718519f4SMartin Matuska brt_entry_compare(const void *x1, const void *x2)
11532a58b312SMartin Matuska {
1154718519f4SMartin Matuska const brt_entry_t *bre1 = x1, *bre2 = x2;
1155718519f4SMartin Matuska const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
11562a58b312SMartin Matuska
1157718519f4SMartin Matuska return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1158718519f4SMartin Matuska DVA_GET_OFFSET(&bp2->blk_dva[0])));
11592a58b312SMartin Matuska }
11602a58b312SMartin Matuska
11612a58b312SMartin Matuska void
brt_pending_add(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)11622a58b312SMartin Matuska brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
11632a58b312SMartin Matuska {
1164718519f4SMartin Matuska brt_entry_t *bre, *newbre;
11652a58b312SMartin Matuska avl_index_t where;
11662a58b312SMartin Matuska uint64_t txg;
11672a58b312SMartin Matuska
11682a58b312SMartin Matuska txg = dmu_tx_get_txg(tx);
11692a58b312SMartin Matuska ASSERT3U(txg, !=, 0);
11702a58b312SMartin Matuska
1171718519f4SMartin Matuska uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1172718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1173718519f4SMartin Matuska avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
11742a58b312SMartin Matuska
1175718519f4SMartin Matuska newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1176718519f4SMartin Matuska newbre->bre_bp = *bp;
1177718519f4SMartin Matuska newbre->bre_count = 0;
1178718519f4SMartin Matuska newbre->bre_pcount = 1;
11792a58b312SMartin Matuska
1180718519f4SMartin Matuska mutex_enter(&brtvd->bv_pending_lock);
1181718519f4SMartin Matuska bre = avl_find(pending_tree, newbre, &where);
1182718519f4SMartin Matuska if (bre == NULL) {
1183718519f4SMartin Matuska avl_insert(pending_tree, newbre, where);
1184718519f4SMartin Matuska newbre = NULL;
11852a58b312SMartin Matuska } else {
1186718519f4SMartin Matuska bre->bre_pcount++;
11872a58b312SMartin Matuska }
1188718519f4SMartin Matuska mutex_exit(&brtvd->bv_pending_lock);
11892a58b312SMartin Matuska
1190718519f4SMartin Matuska if (newbre != NULL) {
1191718519f4SMartin Matuska ASSERT(bre != NULL);
1192718519f4SMartin Matuska ASSERT(bre != newbre);
1193718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, newbre);
11942a58b312SMartin Matuska } else {
1195718519f4SMartin Matuska ASSERT0P(bre);
11962a58b312SMartin Matuska
1197783d3ff6SMartin Matuska /* Prefetch BRT entry for the syncing context. */
1198718519f4SMartin Matuska brt_prefetch(brtvd, bp);
11992a58b312SMartin Matuska }
1200783d3ff6SMartin Matuska }
12012a58b312SMartin Matuska
12022a58b312SMartin Matuska void
brt_pending_remove(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)12032a58b312SMartin Matuska brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
12042a58b312SMartin Matuska {
1205718519f4SMartin Matuska brt_entry_t *bre, bre_search;
12062a58b312SMartin Matuska uint64_t txg;
12072a58b312SMartin Matuska
12082a58b312SMartin Matuska txg = dmu_tx_get_txg(tx);
12092a58b312SMartin Matuska ASSERT3U(txg, !=, 0);
12102a58b312SMartin Matuska
1211718519f4SMartin Matuska uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1212718519f4SMartin Matuska brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1213718519f4SMartin Matuska ASSERT(brtvd != NULL);
1214718519f4SMartin Matuska avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
12152a58b312SMartin Matuska
1216718519f4SMartin Matuska bre_search.bre_bp = *bp;
12172a58b312SMartin Matuska
1218718519f4SMartin Matuska mutex_enter(&brtvd->bv_pending_lock);
1219718519f4SMartin Matuska bre = avl_find(pending_tree, &bre_search, NULL);
1220718519f4SMartin Matuska ASSERT(bre != NULL);
1221718519f4SMartin Matuska ASSERT(bre->bre_pcount > 0);
1222718519f4SMartin Matuska bre->bre_pcount--;
1223718519f4SMartin Matuska if (bre->bre_pcount == 0)
1224718519f4SMartin Matuska avl_remove(pending_tree, bre);
1225718519f4SMartin Matuska else
1226718519f4SMartin Matuska bre = NULL;
1227718519f4SMartin Matuska mutex_exit(&brtvd->bv_pending_lock);
12282a58b312SMartin Matuska
1229718519f4SMartin Matuska if (bre)
1230718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, bre);
1231718519f4SMartin Matuska }
1232718519f4SMartin Matuska
1233718519f4SMartin Matuska static void
brt_pending_apply_vdev(spa_t * spa,brt_vdev_t * brtvd,uint64_t txg)1234718519f4SMartin Matuska brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1235718519f4SMartin Matuska {
1236718519f4SMartin Matuska brt_entry_t *bre, *nbre;
1237718519f4SMartin Matuska
1238718519f4SMartin Matuska /*
1239718519f4SMartin Matuska * We are in syncing context, so no other bv_pending_tree accesses
1240718519f4SMartin Matuska * are possible for the TXG. So we don't need bv_pending_lock.
1241718519f4SMartin Matuska */
1242718519f4SMartin Matuska ASSERT(avl_is_empty(&brtvd->bv_tree));
1243718519f4SMartin Matuska avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1244718519f4SMartin Matuska
1245718519f4SMartin Matuska for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1246718519f4SMartin Matuska nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1247718519f4SMartin Matuska
1248718519f4SMartin Matuska /*
1249718519f4SMartin Matuska * If the block has DEDUP bit set, it means that it
1250718519f4SMartin Matuska * already exists in the DEDUP table, so we can just
1251718519f4SMartin Matuska * use that instead of creating new entry in the BRT.
1252718519f4SMartin Matuska */
1253718519f4SMartin Matuska if (BP_GET_DEDUP(&bre->bre_bp)) {
1254718519f4SMartin Matuska while (bre->bre_pcount > 0) {
1255718519f4SMartin Matuska if (!ddt_addref(spa, &bre->bre_bp))
1256718519f4SMartin Matuska break;
1257718519f4SMartin Matuska bre->bre_pcount--;
1258718519f4SMartin Matuska }
1259718519f4SMartin Matuska if (bre->bre_pcount == 0) {
1260718519f4SMartin Matuska avl_remove(&brtvd->bv_tree, bre);
1261718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, bre);
1262718519f4SMartin Matuska continue;
12632a58b312SMartin Matuska }
12642a58b312SMartin Matuska }
12652a58b312SMartin Matuska
1266718519f4SMartin Matuska /*
1267718519f4SMartin Matuska * Unless we know that the block is definitely not in ZAP,
1268718519f4SMartin Matuska * try to get its reference count from there.
1269718519f4SMartin Matuska */
1270718519f4SMartin Matuska uint64_t off = BRE_OFFSET(bre);
1271718519f4SMartin Matuska if (brtvd->bv_mos_entries != 0 &&
1272718519f4SMartin Matuska brt_vdev_lookup(spa, brtvd, off)) {
1273718519f4SMartin Matuska int error = zap_lookup_uint64_by_dnode(
1274718519f4SMartin Matuska brtvd->bv_mos_entries_dnode, &off,
1275718519f4SMartin Matuska BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1276718519f4SMartin Matuska &bre->bre_count);
1277718519f4SMartin Matuska if (error == 0) {
1278718519f4SMartin Matuska BRTSTAT_BUMP(brt_addref_entry_on_disk);
1279718519f4SMartin Matuska } else {
1280718519f4SMartin Matuska ASSERT3U(error, ==, ENOENT);
1281718519f4SMartin Matuska BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1282718519f4SMartin Matuska }
1283718519f4SMartin Matuska }
1284718519f4SMartin Matuska }
1285718519f4SMartin Matuska
1286718519f4SMartin Matuska /*
1287718519f4SMartin Matuska * If all the cloned blocks we had were handled by DDT, we don't need
1288718519f4SMartin Matuska * to initiate the vdev.
1289718519f4SMartin Matuska */
1290718519f4SMartin Matuska if (avl_is_empty(&brtvd->bv_tree))
1291718519f4SMartin Matuska return;
1292718519f4SMartin Matuska
1293718519f4SMartin Matuska if (!brtvd->bv_initiated) {
1294718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER);
1295718519f4SMartin Matuska brt_vdev_realloc(spa, brtvd);
1296718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
1297718519f4SMartin Matuska }
1298718519f4SMartin Matuska
1299718519f4SMartin Matuska /*
1300718519f4SMartin Matuska * Convert pending references into proper ones. This has to be a
1301718519f4SMartin Matuska * separate loop, since entcount modifications would cause false
1302718519f4SMartin Matuska * positives for brt_vdev_lookup() on following iterations.
1303718519f4SMartin Matuska */
1304718519f4SMartin Matuska for (bre = avl_first(&brtvd->bv_tree); bre;
1305718519f4SMartin Matuska bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1306718519f4SMartin Matuska brt_vdev_addref(spa, brtvd, bre,
1307718519f4SMartin Matuska bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1308718519f4SMartin Matuska bre->bre_count += bre->bre_pcount;
1309718519f4SMartin Matuska }
13102a58b312SMartin Matuska }
13112a58b312SMartin Matuska
13122a58b312SMartin Matuska void
brt_pending_apply(spa_t * spa,uint64_t txg)13132a58b312SMartin Matuska brt_pending_apply(spa_t *spa, uint64_t txg)
13142a58b312SMartin Matuska {
13152a58b312SMartin Matuska
1316718519f4SMartin Matuska brt_rlock(spa);
1317718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1318718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1319718519f4SMartin Matuska brt_unlock(spa);
13202a58b312SMartin Matuska
1321718519f4SMartin Matuska brt_pending_apply_vdev(spa, brtvd, txg);
13222a58b312SMartin Matuska
1323718519f4SMartin Matuska brt_rlock(spa);
13242a58b312SMartin Matuska }
1325718519f4SMartin Matuska brt_unlock(spa);
13262a58b312SMartin Matuska }
13272a58b312SMartin Matuska
13282a58b312SMartin Matuska static void
brt_sync_entry(dnode_t * dn,brt_entry_t * bre,dmu_tx_t * tx)1329783d3ff6SMartin Matuska brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
13302a58b312SMartin Matuska {
1331718519f4SMartin Matuska uint64_t off = BRE_OFFSET(bre);
1332718519f4SMartin Matuska
1333718519f4SMartin Matuska if (bre->bre_pcount == 0) {
1334718519f4SMartin Matuska /* The net change is zero, nothing to do in ZAP. */
1335718519f4SMartin Matuska } else if (bre->bre_count == 0) {
1336718519f4SMartin Matuska int error = zap_remove_uint64_by_dnode(dn, &off,
1337783d3ff6SMartin Matuska BRT_KEY_WORDS, tx);
1338783d3ff6SMartin Matuska VERIFY(error == 0 || error == ENOENT);
13392a58b312SMartin Matuska } else {
1340718519f4SMartin Matuska VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1341718519f4SMartin Matuska BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1342718519f4SMartin Matuska &bre->bre_count, tx));
13432a58b312SMartin Matuska }
13442a58b312SMartin Matuska }
13452a58b312SMartin Matuska
13462a58b312SMartin Matuska static void
brt_sync_table(spa_t * spa,dmu_tx_t * tx)1347718519f4SMartin Matuska brt_sync_table(spa_t *spa, dmu_tx_t *tx)
13482a58b312SMartin Matuska {
13492a58b312SMartin Matuska brt_entry_t *bre;
13502a58b312SMartin Matuska
1351718519f4SMartin Matuska brt_rlock(spa);
1352718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1353718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1354718519f4SMartin Matuska brt_unlock(spa);
13552a58b312SMartin Matuska
13562a58b312SMartin Matuska if (!brtvd->bv_meta_dirty) {
13572a58b312SMartin Matuska ASSERT(!brtvd->bv_entcount_dirty);
13582a58b312SMartin Matuska ASSERT0(avl_numnodes(&brtvd->bv_tree));
1359718519f4SMartin Matuska brt_rlock(spa);
13602a58b312SMartin Matuska continue;
13612a58b312SMartin Matuska }
13622a58b312SMartin Matuska
13632a58b312SMartin Matuska ASSERT(!brtvd->bv_entcount_dirty ||
13642a58b312SMartin Matuska avl_numnodes(&brtvd->bv_tree) != 0);
13652a58b312SMartin Matuska
13662a58b312SMartin Matuska if (brtvd->bv_mos_brtvdev == 0)
1367718519f4SMartin Matuska brt_vdev_create(spa, brtvd, tx);
13682a58b312SMartin Matuska
1369718519f4SMartin Matuska void *c = NULL;
13702a58b312SMartin Matuska while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1371718519f4SMartin Matuska brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
1372718519f4SMartin Matuska kmem_cache_free(brt_entry_cache, bre);
13732a58b312SMartin Matuska }
13742a58b312SMartin Matuska
1375718519f4SMartin Matuska #ifdef ZFS_DEBUG
1376718519f4SMartin Matuska if (zfs_flags & ZFS_DEBUG_BRT)
1377718519f4SMartin Matuska brt_vdev_dump(brtvd);
1378718519f4SMartin Matuska #endif
13792a58b312SMartin Matuska if (brtvd->bv_totalcount == 0)
1380718519f4SMartin Matuska brt_vdev_destroy(spa, brtvd, tx);
1381718519f4SMartin Matuska else
1382718519f4SMartin Matuska brt_vdev_sync(spa, brtvd, tx);
1383718519f4SMartin Matuska brt_rlock(spa);
13842a58b312SMartin Matuska }
1385718519f4SMartin Matuska brt_unlock(spa);
13862a58b312SMartin Matuska }
13872a58b312SMartin Matuska
13882a58b312SMartin Matuska void
brt_sync(spa_t * spa,uint64_t txg)13892a58b312SMartin Matuska brt_sync(spa_t *spa, uint64_t txg)
13902a58b312SMartin Matuska {
13912a58b312SMartin Matuska dmu_tx_t *tx;
1392718519f4SMartin Matuska uint64_t vdevid;
13932a58b312SMartin Matuska
1394718519f4SMartin Matuska ASSERT3U(spa_syncing_txg(spa), ==, txg);
13952a58b312SMartin Matuska
1396718519f4SMartin Matuska brt_rlock(spa);
1397718519f4SMartin Matuska for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1398718519f4SMartin Matuska if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1399718519f4SMartin Matuska break;
1400718519f4SMartin Matuska }
1401718519f4SMartin Matuska if (vdevid >= spa->spa_brt_nvdevs) {
1402718519f4SMartin Matuska brt_unlock(spa);
14032a58b312SMartin Matuska return;
14042a58b312SMartin Matuska }
1405718519f4SMartin Matuska brt_unlock(spa);
14062a58b312SMartin Matuska
14072a58b312SMartin Matuska tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1408718519f4SMartin Matuska brt_sync_table(spa, tx);
14092a58b312SMartin Matuska dmu_tx_commit(tx);
14102a58b312SMartin Matuska }
14112a58b312SMartin Matuska
14122a58b312SMartin Matuska static void
brt_alloc(spa_t * spa)14132a58b312SMartin Matuska brt_alloc(spa_t *spa)
14142a58b312SMartin Matuska {
1415718519f4SMartin Matuska rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1416718519f4SMartin Matuska spa->spa_brt_vdevs = NULL;
1417718519f4SMartin Matuska spa->spa_brt_nvdevs = 0;
1418718519f4SMartin Matuska spa->spa_brt_rangesize = 0;
14192a58b312SMartin Matuska }
14202a58b312SMartin Matuska
14212a58b312SMartin Matuska void
brt_create(spa_t * spa)14222a58b312SMartin Matuska brt_create(spa_t *spa)
14232a58b312SMartin Matuska {
14242a58b312SMartin Matuska brt_alloc(spa);
1425718519f4SMartin Matuska spa->spa_brt_rangesize = BRT_RANGESIZE;
14262a58b312SMartin Matuska }
14272a58b312SMartin Matuska
14282a58b312SMartin Matuska int
brt_load(spa_t * spa)14292a58b312SMartin Matuska brt_load(spa_t *spa)
14302a58b312SMartin Matuska {
1431718519f4SMartin Matuska int error = 0;
14322a58b312SMartin Matuska
14332a58b312SMartin Matuska brt_alloc(spa);
1434718519f4SMartin Matuska brt_wlock(spa);
1435718519f4SMartin Matuska for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1436718519f4SMartin Matuska vdevid++) {
1437718519f4SMartin Matuska char name[64];
1438718519f4SMartin Matuska uint64_t mos_brtvdev;
14392a58b312SMartin Matuska
1440718519f4SMartin Matuska /* Look if this vdev had active block cloning. */
1441718519f4SMartin Matuska snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1442718519f4SMartin Matuska (u_longlong_t)vdevid);
1443718519f4SMartin Matuska error = zap_lookup(spa->spa_meta_objset,
1444718519f4SMartin Matuska DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1445718519f4SMartin Matuska &mos_brtvdev);
1446718519f4SMartin Matuska if (error == ENOENT) {
1447718519f4SMartin Matuska error = 0;
1448718519f4SMartin Matuska continue;
1449718519f4SMartin Matuska }
1450718519f4SMartin Matuska if (error != 0)
1451718519f4SMartin Matuska break;
1452718519f4SMartin Matuska
1453718519f4SMartin Matuska /* If it did, then allocate them all and load this one. */
1454718519f4SMartin Matuska brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1455718519f4SMartin Matuska brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1456718519f4SMartin Matuska rw_enter(&brtvd->bv_lock, RW_WRITER);
1457718519f4SMartin Matuska brtvd->bv_mos_brtvdev = mos_brtvdev;
1458718519f4SMartin Matuska error = brt_vdev_load(spa, brtvd);
1459718519f4SMartin Matuska rw_exit(&brtvd->bv_lock);
1460718519f4SMartin Matuska if (error != 0)
1461718519f4SMartin Matuska break;
1462718519f4SMartin Matuska }
1463718519f4SMartin Matuska
1464718519f4SMartin Matuska if (spa->spa_brt_rangesize == 0)
1465718519f4SMartin Matuska spa->spa_brt_rangesize = BRT_RANGESIZE;
1466718519f4SMartin Matuska brt_unlock(spa);
1467718519f4SMartin Matuska return (error);
14682a58b312SMartin Matuska }
14692a58b312SMartin Matuska
14702a58b312SMartin Matuska void
brt_unload(spa_t * spa)14712a58b312SMartin Matuska brt_unload(spa_t *spa)
14722a58b312SMartin Matuska {
1473718519f4SMartin Matuska if (spa->spa_brt_rangesize == 0)
14742a58b312SMartin Matuska return;
1475718519f4SMartin Matuska brt_vdevs_free(spa);
1476718519f4SMartin Matuska rw_destroy(&spa->spa_brt_lock);
1477718519f4SMartin Matuska spa->spa_brt_rangesize = 0;
14782a58b312SMartin Matuska }
14792a58b312SMartin Matuska
1480783d3ff6SMartin Matuska ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1481783d3ff6SMartin Matuska "Enable prefetching of BRT ZAP entries");
1482783d3ff6SMartin Matuska ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1483783d3ff6SMartin Matuska "BRT ZAP leaf blockshift");
1484783d3ff6SMartin Matuska ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1485783d3ff6SMartin Matuska "BRT ZAP indirect blockshift");
1486