1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/spa_impl.h>
30 #include <sys/zio.h>
31 #include <sys/brt.h>
32 #include <sys/brt_impl.h>
33 #include <sys/ddt.h>
34 #include <sys/bitmap.h>
35 #include <sys/zap.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/arc.h>
38 #include <sys/dsl_pool.h>
39 #include <sys/dsl_scan.h>
40 #include <sys/vdev_impl.h>
41 #include <sys/kstat.h>
42 #include <sys/wmsum.h>
43
44 /*
45 * Block Cloning design.
46 *
47 * Block Cloning allows to manually clone a file (or a subset of its blocks)
48 * into another (or the same) file by just creating additional references to
49 * the data blocks without copying the data itself. Those references are kept
50 * in the Block Reference Tables (BRTs).
51 *
52 * In many ways this is similar to the existing deduplication, but there are
53 * some important differences:
54 *
55 * - Deduplication is automatic and Block Cloning is not - one has to use a
56 * dedicated system call(s) to clone the given file/blocks.
57 * - Deduplication keeps all data blocks in its table, even those referenced
58 * just once. Block Cloning creates an entry in its tables only when there
59 * are at least two references to the given data block. If the block was
60 * never explicitly cloned or the second to last reference was dropped,
61 * there will be neither space nor performance overhead.
62 * - Deduplication needs data to work - one needs to pass real data to the
63 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require
64 * data, just block pointers to the data, so it is extremely fast, as we pay
65 * neither the cost of reading the data, nor the cost of writing the data -
66 * we operate exclusively on metadata.
67 * - If the D (dedup) bit is not set in the block pointer, it means that
68 * the block is not in the dedup table (DDT) and we won't consult the DDT
69 * when we need to free the block. Block Cloning must be consulted on every
70 * free, because we cannot modify the source BP (eg. by setting something
71 * similar to the D bit), thus we have no hint if the block is in the
72 * Block Reference Table (BRT), so we need to look into the BRT. There is
73 * an optimization in place that allows us to eliminate the majority of BRT
74 * lookups which is described below in the "Minimizing free penalty" section.
75 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
76 * 64bit offset and 64bit reference counter.
77 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
78 * other on disk are most likely in totally different parts of the DDT.
79 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
80 * from one file should have BRT entries close to each other.
81 * - Scrub will only do a single pass over a block that is referenced multiple
82 * times in the DDT. Unfortunately it is not currently (if at all) possible
83 * with Block Cloning and block referenced multiple times will be scrubbed
84 * multiple times. The new, sorted scrub should be able to eliminate
85 * duplicated reads given enough memory.
86 * - Deduplication requires cryptographically strong hash as a checksum or
87 * additional data verification. Block Cloning works with any checksum
88 * algorithm or even with checksumming disabled.
89 *
90 * As mentioned above, the BRT entries are much smaller than the DDT entries.
91 * To uniquely identify a block we just need its vdev id and offset. We also
92 * need to maintain a reference counter. The vdev id will often repeat, as there
93 * is a small number of top-level VDEVs and a large number of blocks stored in
94 * each VDEV. We take advantage of that to reduce the BRT entry size further by
95 * maintaining one BRT for each top-level VDEV, so we can then have only offset
96 * and counter as the BRT entry.
97 *
98 * Minimizing free penalty.
99 *
100 * Block Cloning allows creating additional references to any existing block.
101 * When we free a block there is no hint in the block pointer whether the block
102 * was cloned or not, so on each free we have to check if there is a
103 * corresponding entry in the BRT or not. If there is, we need to decrease
104 * the reference counter. Doing BRT lookup on every free can potentially be
105 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
106 * This is the main problem with deduplication, so we've learned our lesson and
107 * try not to repeat the same mistake here. How do we do that? We divide each
108 * top-level VDEV into 16MB regions. For each region we maintain a counter that
109 * is a sum of all the BRT entries that have offsets within the region. This
110 * creates the entries count array of 16bit numbers for each top-level VDEV.
111 * The entries count array is always kept in memory and updated on disk in the
112 * same transaction group as the BRT updates to keep everything in-sync. We can
113 * keep the array in memory, because it is very small. With 16MB regions and
114 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
115 * the region size even further in the future). Now, when we want to free
116 * a block, we first consult the array. If the counter for the whole region is
117 * zero, there is no need to look for the BRT entry, as there isn't one for
118 * sure. If the counter for the region is greater than zero, only then we will
119 * do a BRT lookup and if an entry is found we will decrease the reference
120 * counter in the BRT entry and in the entry counters array.
121 *
122 * The entry counters array is small, but can potentially be larger for very
123 * large VDEVs or smaller regions. In this case we don't want to rewrite entire
124 * array on every change. We then divide the array into 32kB block and keep
125 * a bitmap of dirty blocks within a transaction group. When we sync the
126 * transaction group we can only update the parts of the entry counters array
127 * that were modified. Note: Keeping track of the dirty parts of the entry
128 * counters array is implemented, but updating only parts of the array on disk
129 * is not yet implemented - for now we will update entire array if there was
130 * any change.
131 *
132 * The implementation tries to be economic: if BRT is not used, or no longer
133 * used, there will be no entries in the MOS and no additional memory used (eg.
134 * the entry counters array is only allocated if needed).
135 *
136 * Interaction between Deduplication and Block Cloning.
137 *
138 * If both functionalities are in use, we could end up with a block that is
139 * referenced multiple times in both DDT and BRT. When we free one of the
140 * references we couldn't tell where it belongs, so we would have to decide
141 * what table takes the precedence: do we first clear DDT references or BRT
142 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
143 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
144 * lookup DDT entry instead and increase the counter there. No BRT entry
145 * will be created for a block which has the D (dedup) bit set.
146 * BRT may be more efficient for manual deduplication, but if the block is
147 * already in the DDT, then creating additional BRT entry would be less
148 * efficient. This clever idea was proposed by Allan Jude.
149 *
150 * Block Cloning across datasets.
151 *
152 * Block Cloning is not limited to cloning blocks within the same dataset.
153 * It is possible (and very useful) to clone blocks between different datasets.
154 * One use case is recovering files from snapshots. By cloning the files into
155 * dataset we need no additional storage. Without Block Cloning we would need
156 * additional space for those files.
157 * Another interesting use case is moving the files between datasets
158 * (copying the file content to the new dataset and removing the source file).
159 * In that case Block Cloning will only be used briefly, because the BRT entries
160 * will be removed when the source is removed.
161 * Block Cloning across encrypted datasets is supported as long as both
162 * datasets share the same master key (e.g. snapshots and clones)
163 *
164 * Block Cloning flow through ZFS layers.
165 *
166 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
167 * blocks. As of this writing no interface is implemented that allows for block
168 * cloning within a ZVOL.
169 * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
170 * for blocking cloning.
171 *
172 * ssize_t
173 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
174 * size_t len, unsigned int flags);
175 *
176 * Even though offsets and length represent bytes, they have to be
177 * block-aligned or we will return an error so the upper layer can
178 * fallback to the generic mechanism that will just copy the data.
179 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
180 * This function was implemented based on zfs_write(), but instead of writing
181 * the given data we first read block pointers using the new dmu_read_l0_bps()
182 * function from the source file. Once we have BPs from the source file we call
183 * the dmu_brt_clone() function on the destination file. This function
184 * allocates BPs for us. We iterate over all source BPs. If the given BP is
185 * a hole or an embedded block, we just copy BP as-is. If it points to a real
186 * data we place this BP on a BRT pending list using the brt_pending_add()
187 * function.
188 *
189 * We use this pending list to keep track of all BPs that got new references
190 * within this transaction group.
191 *
192 * Some special cases to consider and how we address them:
193 * - The block we want to clone may have been created within the same
194 * transaction group that we are trying to clone. Such block has no BP
195 * allocated yet, so cannot be immediately cloned. We return EAGAIN.
196 * - The block we want to clone may have been modified within the same
197 * transaction group. We return EAGAIN.
198 * - A block may be cloned multiple times during one transaction group (that's
199 * why pending list is actually a tree and not an append-only list - this
200 * way we can figure out faster if this block is cloned for the first time
201 * in this txg or consecutive time).
202 * - A block may be cloned and freed within the same transaction group
203 * (see dbuf_undirty()).
204 * - A block may be cloned and within the same transaction group the clone
205 * can be cloned again (see dmu_read_l0_bps()).
206 * - A file might have been deleted, but the caller still has a file descriptor
207 * open to this file and clones it.
208 *
209 * When we free a block we have an additional step in the ZIO pipeline where we
210 * call the zio_brt_free() function. We then call the brt_entry_decref()
211 * that loads the corresponding BRT entry (if one exists) and decreases
212 * reference counter. If this is not the last reference we will stop ZIO
213 * pipeline here. If this is the last reference or the block is not in the
214 * BRT, we continue the pipeline and free the block as usual.
215 *
216 * At the beginning of spa_sync() where there can be no more block cloning,
217 * but before issuing frees we call brt_pending_apply(). This function applies
218 * all the new clones to the BRT table - we load BRT entries and update
219 * reference counters. To sync new BRT entries to disk, we use brt_sync()
220 * function. This function will sync all dirty per-top-level-vdev BRTs,
221 * the entry counters arrays, etc.
222 *
223 * Block Cloning and ZIL.
224 *
225 * Every clone operation is divided into chunks (similar to write) and each
226 * chunk is cloned in a separate transaction. The chunk size is determined by
227 * how many BPs we can fit into a single ZIL entry.
228 * Replaying clone operation is different from the regular clone operation,
229 * as when we log clone operations we cannot use the source object - it may
230 * reside on a different dataset, so we log BPs we want to clone.
231 * The ZIL is replayed when we mount the given dataset, not when the pool is
232 * imported. Taking this into account it is possible that the pool is imported
233 * without mounting datasets and the source dataset is destroyed before the
234 * destination dataset is mounted and its ZIL replayed.
235 * To address this situation we leverage zil_claim() mechanism where ZFS will
236 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
237 * entries, we will bump reference counters for their BPs in the BRT. Then
238 * on mount and ZIL replay we bump the reference counters once more, while the
239 * first references are dropped during ZIL destroy by zil_free_clone_range().
240 * It is possible that after zil_claim() we never mount the destination, so
241 * we never replay its ZIL and just destroy it. In this case the only taken
242 * references will be dropped by zil_free_clone_range(), since the cloning is
243 * not going to ever take place.
244 */
245
246 static kmem_cache_t *brt_entry_cache;
247
248 /*
249 * Enable/disable prefetching of BRT entries that we are going to modify.
250 */
251 static int brt_zap_prefetch = 1;
252
253 #ifdef ZFS_DEBUG
254 #define BRT_DEBUG(...) do { \
255 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
256 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
257 } \
258 } while (0)
259 #else
260 #define BRT_DEBUG(...) do { } while (0)
261 #endif
262
263 static int brt_zap_default_bs = 12;
264 static int brt_zap_default_ibs = 12;
265
266 static kstat_t *brt_ksp;
267
268 typedef struct brt_stats {
269 kstat_named_t brt_addref_entry_not_on_disk;
270 kstat_named_t brt_addref_entry_on_disk;
271 kstat_named_t brt_decref_entry_in_memory;
272 kstat_named_t brt_decref_entry_loaded_from_disk;
273 kstat_named_t brt_decref_entry_not_in_memory;
274 kstat_named_t brt_decref_entry_read_lost_race;
275 kstat_named_t brt_decref_entry_still_referenced;
276 kstat_named_t brt_decref_free_data_later;
277 kstat_named_t brt_decref_free_data_now;
278 kstat_named_t brt_decref_no_entry;
279 } brt_stats_t;
280
281 static brt_stats_t brt_stats = {
282 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
283 { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
284 { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
285 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
286 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
287 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
288 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
289 { "decref_free_data_later", KSTAT_DATA_UINT64 },
290 { "decref_free_data_now", KSTAT_DATA_UINT64 },
291 { "decref_no_entry", KSTAT_DATA_UINT64 }
292 };
293
294 struct {
295 wmsum_t brt_addref_entry_not_on_disk;
296 wmsum_t brt_addref_entry_on_disk;
297 wmsum_t brt_decref_entry_in_memory;
298 wmsum_t brt_decref_entry_loaded_from_disk;
299 wmsum_t brt_decref_entry_not_in_memory;
300 wmsum_t brt_decref_entry_read_lost_race;
301 wmsum_t brt_decref_entry_still_referenced;
302 wmsum_t brt_decref_free_data_later;
303 wmsum_t brt_decref_free_data_now;
304 wmsum_t brt_decref_no_entry;
305 } brt_sums;
306
307 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
308
309 static int brt_entry_compare(const void *x1, const void *x2);
310 static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
311
312 static void
brt_rlock(spa_t * spa)313 brt_rlock(spa_t *spa)
314 {
315 rw_enter(&spa->spa_brt_lock, RW_READER);
316 }
317
318 static void
brt_wlock(spa_t * spa)319 brt_wlock(spa_t *spa)
320 {
321 rw_enter(&spa->spa_brt_lock, RW_WRITER);
322 }
323
324 static void
brt_unlock(spa_t * spa)325 brt_unlock(spa_t *spa)
326 {
327 rw_exit(&spa->spa_brt_lock);
328 }
329
330 static uint16_t
brt_vdev_entcount_get(const brt_vdev_t * brtvd,uint64_t idx)331 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
332 {
333
334 ASSERT3U(idx, <, brtvd->bv_size);
335
336 if (unlikely(brtvd->bv_need_byteswap)) {
337 return (BSWAP_16(brtvd->bv_entcount[idx]));
338 } else {
339 return (brtvd->bv_entcount[idx]);
340 }
341 }
342
343 static void
brt_vdev_entcount_set(brt_vdev_t * brtvd,uint64_t idx,uint16_t entcnt)344 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
345 {
346
347 ASSERT3U(idx, <, brtvd->bv_size);
348
349 if (unlikely(brtvd->bv_need_byteswap)) {
350 brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
351 } else {
352 brtvd->bv_entcount[idx] = entcnt;
353 }
354 }
355
356 static void
brt_vdev_entcount_inc(brt_vdev_t * brtvd,uint64_t idx)357 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
358 {
359 uint16_t entcnt;
360
361 ASSERT3U(idx, <, brtvd->bv_size);
362
363 entcnt = brt_vdev_entcount_get(brtvd, idx);
364 ASSERT(entcnt < UINT16_MAX);
365
366 brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
367 }
368
369 static void
brt_vdev_entcount_dec(brt_vdev_t * brtvd,uint64_t idx)370 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
371 {
372 uint16_t entcnt;
373
374 ASSERT3U(idx, <, brtvd->bv_size);
375
376 entcnt = brt_vdev_entcount_get(brtvd, idx);
377 ASSERT(entcnt > 0);
378
379 brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
380 }
381
382 #ifdef ZFS_DEBUG
383 static void
brt_vdev_dump(brt_vdev_t * brtvd)384 brt_vdev_dump(brt_vdev_t *brtvd)
385 {
386 uint64_t idx;
387
388 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
389 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
390 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
391 (u_longlong_t)brtvd->bv_vdevid,
392 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
393 (u_longlong_t)brtvd->bv_size,
394 (u_longlong_t)brtvd->bv_totalcount,
395 (u_longlong_t)nblocks,
396 (size_t)BT_SIZEOFMAP(nblocks));
397 if (brtvd->bv_totalcount > 0) {
398 zfs_dbgmsg(" entcounts:");
399 for (idx = 0; idx < brtvd->bv_size; idx++) {
400 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
401 if (entcnt > 0) {
402 zfs_dbgmsg(" [%04llu] %hu",
403 (u_longlong_t)idx, entcnt);
404 }
405 }
406 }
407 if (brtvd->bv_entcount_dirty) {
408 char *bitmap;
409
410 bitmap = kmem_alloc(nblocks + 1, KM_SLEEP);
411 for (idx = 0; idx < nblocks; idx++) {
412 bitmap[idx] =
413 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
414 }
415 bitmap[idx] = '\0';
416 zfs_dbgmsg(" dirty: %s", bitmap);
417 kmem_free(bitmap, nblocks + 1);
418 }
419 }
420 #endif
421
422 static brt_vdev_t *
brt_vdev(spa_t * spa,uint64_t vdevid,boolean_t alloc)423 brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
424 {
425 brt_vdev_t *brtvd = NULL;
426
427 brt_rlock(spa);
428 if (vdevid < spa->spa_brt_nvdevs) {
429 brtvd = spa->spa_brt_vdevs[vdevid];
430 } else if (alloc) {
431 /* New VDEV was added. */
432 brt_unlock(spa);
433 brt_wlock(spa);
434 if (vdevid >= spa->spa_brt_nvdevs)
435 brt_vdevs_expand(spa, vdevid + 1);
436 brtvd = spa->spa_brt_vdevs[vdevid];
437 }
438 brt_unlock(spa);
439 return (brtvd);
440 }
441
442 static void
brt_vdev_create(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)443 brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
444 {
445 char name[64];
446
447 ASSERT(brtvd->bv_initiated);
448 ASSERT0(brtvd->bv_mos_brtvdev);
449 ASSERT0(brtvd->bv_mos_entries);
450
451 uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
452 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
453 brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
454 VERIFY(mos_entries != 0);
455 VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
456 &brtvd->bv_mos_entries_dnode));
457 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
458 brtvd->bv_mos_entries = mos_entries;
459 rw_exit(&brtvd->bv_mos_entries_lock);
460 BRT_DEBUG("MOS entries created, object=%llu",
461 (u_longlong_t)brtvd->bv_mos_entries);
462
463 /*
464 * We allocate DMU buffer to store the bv_entcount[] array.
465 * We will keep array size (bv_size) and cummulative count for all
466 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
467 */
468 brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
469 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
470 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
471 VERIFY(brtvd->bv_mos_brtvdev != 0);
472 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
473 (u_longlong_t)brtvd->bv_mos_brtvdev);
474
475 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
476 (u_longlong_t)brtvd->bv_vdevid);
477 VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
478 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
479 BRT_DEBUG("Pool directory object created, object=%s", name);
480
481 /*
482 * Activate the endian-fixed feature if this is the first BRT ZAP
483 * (i.e., BLOCK_CLONING is not yet active) and the feature is enabled.
484 */
485 if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) &&
486 !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
487 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
488 } else if (spa_feature_is_active(spa,
489 SPA_FEATURE_BLOCK_CLONING_ENDIAN)) {
490 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
491 }
492
493 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
494 }
495
496 static void
brt_vdev_realloc(spa_t * spa,brt_vdev_t * brtvd)497 brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
498 {
499 vdev_t *vd;
500 uint16_t *entcount;
501 ulong_t *bitmap;
502 uint64_t nblocks, onblocks, size;
503
504 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
505
506 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
507 vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
508 size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
509 spa_config_exit(spa, SCL_VDEV, FTAG);
510
511 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
512 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
513 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
514
515 if (!brtvd->bv_initiated) {
516 ASSERT0(brtvd->bv_size);
517 ASSERT0P(brtvd->bv_entcount);
518 ASSERT0P(brtvd->bv_bitmap);
519 } else {
520 ASSERT(brtvd->bv_size > 0);
521 ASSERT(brtvd->bv_entcount != NULL);
522 ASSERT(brtvd->bv_bitmap != NULL);
523 /*
524 * TODO: Allow vdev shrinking. We only need to implement
525 * shrinking the on-disk BRT VDEV object.
526 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
527 * offset, size, tx);
528 */
529 ASSERT3U(brtvd->bv_size, <=, size);
530
531 memcpy(entcount, brtvd->bv_entcount,
532 sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
533 vmem_free(brtvd->bv_entcount,
534 sizeof (entcount[0]) * brtvd->bv_size);
535 onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
536 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
537 BT_SIZEOFMAP(onblocks)));
538 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
539 }
540
541 brtvd->bv_size = size;
542 brtvd->bv_entcount = entcount;
543 brtvd->bv_bitmap = bitmap;
544 if (!brtvd->bv_initiated) {
545 brtvd->bv_need_byteswap = FALSE;
546 brtvd->bv_initiated = TRUE;
547 BRT_DEBUG("BRT VDEV %llu initiated.",
548 (u_longlong_t)brtvd->bv_vdevid);
549 }
550 }
551
552 static int
brt_vdev_load(spa_t * spa,brt_vdev_t * brtvd)553 brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
554 {
555 dmu_buf_t *db;
556 brt_vdev_phys_t *bvphys;
557 int error;
558
559 ASSERT(!brtvd->bv_initiated);
560 ASSERT(brtvd->bv_mos_brtvdev != 0);
561
562 error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
563 FTAG, &db);
564 if (error != 0)
565 return (error);
566
567 bvphys = db->db_data;
568 if (spa->spa_brt_rangesize == 0) {
569 spa->spa_brt_rangesize = bvphys->bvp_rangesize;
570 } else {
571 ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
572 }
573
574 brt_vdev_realloc(spa, brtvd);
575
576 /* TODO: We don't support VDEV shrinking. */
577 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
578
579 /*
580 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
581 */
582 error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
583 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
584 brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
585 if (error != 0)
586 return (error);
587
588 ASSERT(bvphys->bvp_mos_entries != 0);
589 VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
590 &brtvd->bv_mos_entries_dnode));
591 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
592 brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
593 rw_exit(&brtvd->bv_mos_entries_lock);
594 brtvd->bv_need_byteswap =
595 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
596 brtvd->bv_totalcount = bvphys->bvp_totalcount;
597 brtvd->bv_usedspace = bvphys->bvp_usedspace;
598 brtvd->bv_savedspace = bvphys->bvp_savedspace;
599
600 dmu_buf_rele(db, FTAG);
601
602 BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
603 (u_longlong_t)brtvd->bv_vdevid,
604 (u_longlong_t)brtvd->bv_mos_brtvdev,
605 (u_longlong_t)brtvd->bv_mos_entries);
606 return (0);
607 }
608
609 static void
brt_vdev_dealloc(brt_vdev_t * brtvd)610 brt_vdev_dealloc(brt_vdev_t *brtvd)
611 {
612 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
613 ASSERT(brtvd->bv_initiated);
614 ASSERT0(avl_numnodes(&brtvd->bv_tree));
615
616 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
617 brtvd->bv_entcount = NULL;
618 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
619 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
620 brtvd->bv_bitmap = NULL;
621
622 brtvd->bv_size = 0;
623
624 brtvd->bv_initiated = FALSE;
625 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
626 }
627
628 static void
brt_vdev_destroy(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)629 brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
630 {
631 char name[64];
632 uint64_t count;
633
634 ASSERT(brtvd->bv_initiated);
635 ASSERT(brtvd->bv_mos_brtvdev != 0);
636 ASSERT(brtvd->bv_mos_entries != 0);
637 ASSERT0(brtvd->bv_totalcount);
638 ASSERT0(brtvd->bv_usedspace);
639 ASSERT0(brtvd->bv_savedspace);
640
641 uint64_t mos_entries = brtvd->bv_mos_entries;
642 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
643 brtvd->bv_mos_entries = 0;
644 rw_exit(&brtvd->bv_mos_entries_lock);
645 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
646 brtvd->bv_mos_entries_dnode = NULL;
647 ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
648 ASSERT0(count);
649 VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
650 BRT_DEBUG("MOS entries destroyed, object=%llu",
651 (u_longlong_t)mos_entries);
652
653 VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
654 tx));
655 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
656 (u_longlong_t)brtvd->bv_mos_brtvdev);
657 brtvd->bv_mos_brtvdev = 0;
658 brtvd->bv_entcount_dirty = FALSE;
659
660 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
661 (u_longlong_t)brtvd->bv_vdevid);
662 VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
663 name, tx));
664 BRT_DEBUG("Pool directory object removed, object=%s", name);
665
666 brtvd->bv_meta_dirty = FALSE;
667
668 rw_enter(&brtvd->bv_lock, RW_WRITER);
669 brt_vdev_dealloc(brtvd);
670 rw_exit(&brtvd->bv_lock);
671
672 spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
673 if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN))
674 spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
675 }
676
677 static void
brt_vdevs_expand(spa_t * spa,uint64_t nvdevs)678 brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
679 {
680 brt_vdev_t **vdevs;
681
682 ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
683 ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
684
685 if (nvdevs == spa->spa_brt_nvdevs)
686 return;
687
688 vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
689 if (spa->spa_brt_nvdevs > 0) {
690 ASSERT(spa->spa_brt_vdevs != NULL);
691
692 memcpy(vdevs, spa->spa_brt_vdevs,
693 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
694 kmem_free(spa->spa_brt_vdevs,
695 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
696 }
697 spa->spa_brt_vdevs = vdevs;
698
699 for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
700 brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
701 rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
702 brtvd->bv_vdevid = vdevid;
703 brtvd->bv_initiated = FALSE;
704 rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
705 avl_create(&brtvd->bv_tree, brt_entry_compare,
706 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
707 for (int i = 0; i < TXG_SIZE; i++) {
708 avl_create(&brtvd->bv_pending_tree[i],
709 brt_entry_compare, sizeof (brt_entry_t),
710 offsetof(brt_entry_t, bre_node));
711 }
712 mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
713 spa->spa_brt_vdevs[vdevid] = brtvd;
714 }
715
716 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
717 (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
718 spa->spa_brt_nvdevs = nvdevs;
719 }
720
721 static boolean_t
brt_vdev_lookup(spa_t * spa,brt_vdev_t * brtvd,uint64_t offset)722 brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
723 {
724 uint64_t idx = offset / spa->spa_brt_rangesize;
725 if (idx < brtvd->bv_size) {
726 /* VDEV wasn't expanded. */
727 return (brt_vdev_entcount_get(brtvd, idx) > 0);
728 }
729 return (FALSE);
730 }
731
732 static void
brt_vdev_addref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize,uint64_t count)733 brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
734 uint64_t dsize, uint64_t count)
735 {
736 uint64_t idx;
737
738 ASSERT(brtvd->bv_initiated);
739
740 brtvd->bv_savedspace += dsize * count;
741 brtvd->bv_meta_dirty = TRUE;
742
743 if (bre->bre_count > 0)
744 return;
745
746 brtvd->bv_usedspace += dsize;
747
748 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
749 if (idx >= brtvd->bv_size) {
750 /* VDEV has been expanded. */
751 rw_enter(&brtvd->bv_lock, RW_WRITER);
752 brt_vdev_realloc(spa, brtvd);
753 rw_exit(&brtvd->bv_lock);
754 }
755
756 ASSERT3U(idx, <, brtvd->bv_size);
757
758 brtvd->bv_totalcount++;
759 brt_vdev_entcount_inc(brtvd, idx);
760 brtvd->bv_entcount_dirty = TRUE;
761 idx = idx / BRT_BLOCKSIZE / 8;
762 BT_SET(brtvd->bv_bitmap, idx);
763 }
764
765 static void
brt_vdev_decref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize)766 brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
767 uint64_t dsize)
768 {
769 uint64_t idx;
770
771 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
772 ASSERT(brtvd->bv_initiated);
773
774 brtvd->bv_savedspace -= dsize;
775 brtvd->bv_meta_dirty = TRUE;
776
777 if (bre->bre_count > 0)
778 return;
779
780 brtvd->bv_usedspace -= dsize;
781
782 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
783 ASSERT3U(idx, <, brtvd->bv_size);
784
785 ASSERT(brtvd->bv_totalcount > 0);
786 brtvd->bv_totalcount--;
787 brt_vdev_entcount_dec(brtvd, idx);
788 brtvd->bv_entcount_dirty = TRUE;
789 idx = idx / BRT_BLOCKSIZE / 8;
790 BT_SET(brtvd->bv_bitmap, idx);
791 }
792
793 static void
brt_vdev_sync(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)794 brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
795 {
796 dmu_buf_t *db;
797 brt_vdev_phys_t *bvphys;
798
799 ASSERT(brtvd->bv_meta_dirty);
800 ASSERT(brtvd->bv_mos_brtvdev != 0);
801 ASSERT(dmu_tx_is_syncing(tx));
802
803 VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
804 FTAG, &db));
805
806 if (brtvd->bv_entcount_dirty) {
807 /*
808 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
809 */
810 dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
811 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
812 brtvd->bv_entcount, tx);
813 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
814 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
815 brtvd->bv_entcount_dirty = FALSE;
816 }
817
818 dmu_buf_will_dirty(db, tx);
819 bvphys = db->db_data;
820 bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
821 bvphys->bvp_size = brtvd->bv_size;
822 if (brtvd->bv_need_byteswap) {
823 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
824 } else {
825 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
826 }
827 bvphys->bvp_totalcount = brtvd->bv_totalcount;
828 bvphys->bvp_rangesize = spa->spa_brt_rangesize;
829 bvphys->bvp_usedspace = brtvd->bv_usedspace;
830 bvphys->bvp_savedspace = brtvd->bv_savedspace;
831 dmu_buf_rele(db, FTAG);
832
833 brtvd->bv_meta_dirty = FALSE;
834 }
835
836 static void
brt_vdevs_free(spa_t * spa)837 brt_vdevs_free(spa_t *spa)
838 {
839 if (spa->spa_brt_vdevs == 0)
840 return;
841 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
842 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
843 rw_enter(&brtvd->bv_lock, RW_WRITER);
844 if (brtvd->bv_initiated)
845 brt_vdev_dealloc(brtvd);
846 rw_exit(&brtvd->bv_lock);
847 rw_destroy(&brtvd->bv_lock);
848 if (brtvd->bv_mos_entries != 0)
849 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
850 rw_destroy(&brtvd->bv_mos_entries_lock);
851 avl_destroy(&brtvd->bv_tree);
852 for (int i = 0; i < TXG_SIZE; i++)
853 avl_destroy(&brtvd->bv_pending_tree[i]);
854 mutex_destroy(&brtvd->bv_pending_lock);
855 kmem_free(brtvd, sizeof (*brtvd));
856 }
857 kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
858 spa->spa_brt_nvdevs);
859 }
860
861 static void
brt_entry_fill(const blkptr_t * bp,brt_entry_t * bre,uint64_t * vdevidp)862 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
863 {
864
865 bre->bre_bp = *bp;
866 bre->bre_count = 0;
867 bre->bre_pcount = 0;
868
869 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
870 }
871
872 static boolean_t
brt_has_endian_fixed(spa_t * spa)873 brt_has_endian_fixed(spa_t *spa)
874 {
875 return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN));
876 }
877
878 static int
brt_entry_lookup(spa_t * spa,brt_vdev_t * brtvd,brt_entry_t * bre)879 brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre)
880 {
881 uint64_t off = BRE_OFFSET(bre);
882
883 if (brtvd->bv_mos_entries == 0)
884 return (SET_ERROR(ENOENT));
885
886 if (brt_has_endian_fixed(spa)) {
887 return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
888 &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
889 &bre->bre_count));
890 } else {
891 return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
892 &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
893 &bre->bre_count));
894 }
895 }
896
897 /*
898 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
899 * positive, but gives us quick answer if we should look into BRT, which
900 * may require reads and thus will be more expensive.
901 */
902 boolean_t
brt_maybe_exists(spa_t * spa,const blkptr_t * bp)903 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
904 {
905
906 if (spa->spa_brt_nvdevs == 0)
907 return (B_FALSE);
908
909 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
910 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
911 if (brtvd == NULL || !brtvd->bv_initiated)
912 return (FALSE);
913
914 /*
915 * We don't need locks here, since bv_entcount pointer must be
916 * stable at this point, and we don't care about false positive
917 * races here, while false negative should be impossible, since
918 * all brt_vdev_addref() have already completed by this point.
919 */
920 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
921 return (brt_vdev_lookup(spa, brtvd, off));
922 }
923
924 uint64_t
brt_get_dspace(spa_t * spa)925 brt_get_dspace(spa_t *spa)
926 {
927 if (spa->spa_brt_nvdevs == 0)
928 return (0);
929
930 brt_rlock(spa);
931 uint64_t s = 0;
932 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
933 s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
934 brt_unlock(spa);
935 return (s);
936 }
937
938 uint64_t
brt_get_used(spa_t * spa)939 brt_get_used(spa_t *spa)
940 {
941 if (spa->spa_brt_nvdevs == 0)
942 return (0);
943
944 brt_rlock(spa);
945 uint64_t s = 0;
946 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
947 s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
948 brt_unlock(spa);
949 return (s);
950 }
951
952 uint64_t
brt_get_saved(spa_t * spa)953 brt_get_saved(spa_t *spa)
954 {
955 return (brt_get_dspace(spa));
956 }
957
958 uint64_t
brt_get_ratio(spa_t * spa)959 brt_get_ratio(spa_t *spa)
960 {
961 uint64_t used = brt_get_used(spa);
962 if (used == 0)
963 return (100);
964 return ((used + brt_get_saved(spa)) * 100 / used);
965 }
966
967 static int
brt_kstats_update(kstat_t * ksp,int rw)968 brt_kstats_update(kstat_t *ksp, int rw)
969 {
970 brt_stats_t *bs = ksp->ks_data;
971
972 if (rw == KSTAT_WRITE)
973 return (EACCES);
974
975 bs->brt_addref_entry_not_on_disk.value.ui64 =
976 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
977 bs->brt_addref_entry_on_disk.value.ui64 =
978 wmsum_value(&brt_sums.brt_addref_entry_on_disk);
979 bs->brt_decref_entry_in_memory.value.ui64 =
980 wmsum_value(&brt_sums.brt_decref_entry_in_memory);
981 bs->brt_decref_entry_loaded_from_disk.value.ui64 =
982 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
983 bs->brt_decref_entry_not_in_memory.value.ui64 =
984 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
985 bs->brt_decref_entry_read_lost_race.value.ui64 =
986 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
987 bs->brt_decref_entry_still_referenced.value.ui64 =
988 wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
989 bs->brt_decref_free_data_later.value.ui64 =
990 wmsum_value(&brt_sums.brt_decref_free_data_later);
991 bs->brt_decref_free_data_now.value.ui64 =
992 wmsum_value(&brt_sums.brt_decref_free_data_now);
993 bs->brt_decref_no_entry.value.ui64 =
994 wmsum_value(&brt_sums.brt_decref_no_entry);
995
996 return (0);
997 }
998
999 static void
brt_stat_init(void)1000 brt_stat_init(void)
1001 {
1002
1003 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
1004 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
1005 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
1006 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
1007 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
1008 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
1009 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
1010 wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
1011 wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
1012 wmsum_init(&brt_sums.brt_decref_no_entry, 0);
1013
1014 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
1015 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1016 if (brt_ksp != NULL) {
1017 brt_ksp->ks_data = &brt_stats;
1018 brt_ksp->ks_update = brt_kstats_update;
1019 kstat_install(brt_ksp);
1020 }
1021 }
1022
1023 static void
brt_stat_fini(void)1024 brt_stat_fini(void)
1025 {
1026 if (brt_ksp != NULL) {
1027 kstat_delete(brt_ksp);
1028 brt_ksp = NULL;
1029 }
1030
1031 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1032 wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1033 wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1034 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1035 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1036 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1037 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1038 wmsum_fini(&brt_sums.brt_decref_free_data_later);
1039 wmsum_fini(&brt_sums.brt_decref_free_data_now);
1040 wmsum_fini(&brt_sums.brt_decref_no_entry);
1041 }
1042
1043 void
brt_init(void)1044 brt_init(void)
1045 {
1046 brt_entry_cache = kmem_cache_create("brt_entry_cache",
1047 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1048
1049 brt_stat_init();
1050 }
1051
1052 void
brt_fini(void)1053 brt_fini(void)
1054 {
1055 brt_stat_fini();
1056
1057 kmem_cache_destroy(brt_entry_cache);
1058 }
1059
1060 /* Return TRUE if block should be freed immediately. */
1061 boolean_t
brt_entry_decref(spa_t * spa,const blkptr_t * bp)1062 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1063 {
1064 brt_entry_t *bre, *racebre;
1065 brt_entry_t bre_search;
1066 avl_index_t where;
1067 uint64_t vdevid;
1068 int error;
1069
1070 brt_entry_fill(bp, &bre_search, &vdevid);
1071
1072 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1073 ASSERT(brtvd != NULL);
1074
1075 rw_enter(&brtvd->bv_lock, RW_WRITER);
1076 ASSERT(brtvd->bv_initiated);
1077 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1078 if (bre != NULL) {
1079 BRTSTAT_BUMP(brt_decref_entry_in_memory);
1080 goto out;
1081 } else {
1082 BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1083 }
1084 rw_exit(&brtvd->bv_lock);
1085
1086 error = brt_entry_lookup(spa, brtvd, &bre_search);
1087 /* bre_search now contains correct bre_count */
1088 if (error == ENOENT) {
1089 BRTSTAT_BUMP(brt_decref_no_entry);
1090 return (B_TRUE);
1091 }
1092 ASSERT0(error);
1093
1094 rw_enter(&brtvd->bv_lock, RW_WRITER);
1095 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1096 if (racebre != NULL) {
1097 /* The entry was added when the lock was dropped. */
1098 BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1099 bre = racebre;
1100 goto out;
1101 }
1102
1103 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1104 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1105 bre->bre_bp = bre_search.bre_bp;
1106 bre->bre_count = bre_search.bre_count;
1107 bre->bre_pcount = 0;
1108 avl_insert(&brtvd->bv_tree, bre, where);
1109
1110 out:
1111 if (bre->bre_count == 0) {
1112 rw_exit(&brtvd->bv_lock);
1113 BRTSTAT_BUMP(brt_decref_free_data_now);
1114 return (B_TRUE);
1115 }
1116
1117 bre->bre_pcount--;
1118 ASSERT(bre->bre_count > 0);
1119 bre->bre_count--;
1120 if (bre->bre_count == 0)
1121 BRTSTAT_BUMP(brt_decref_free_data_later);
1122 else
1123 BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1124 brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
1125
1126 rw_exit(&brtvd->bv_lock);
1127
1128 return (B_FALSE);
1129 }
1130
1131 uint64_t
brt_entry_get_refcount(spa_t * spa,const blkptr_t * bp)1132 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1133 {
1134 brt_entry_t bre_search, *bre;
1135 uint64_t vdevid, refcnt;
1136 int error;
1137
1138 brt_entry_fill(bp, &bre_search, &vdevid);
1139
1140 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1141 ASSERT(brtvd != NULL);
1142
1143 rw_enter(&brtvd->bv_lock, RW_READER);
1144 ASSERT(brtvd->bv_initiated);
1145 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1146 if (bre == NULL) {
1147 rw_exit(&brtvd->bv_lock);
1148 error = brt_entry_lookup(spa, brtvd, &bre_search);
1149 if (error == ENOENT) {
1150 refcnt = 0;
1151 } else {
1152 ASSERT0(error);
1153 refcnt = bre_search.bre_count;
1154 }
1155 } else {
1156 refcnt = bre->bre_count;
1157 rw_exit(&brtvd->bv_lock);
1158 }
1159
1160 return (refcnt);
1161 }
1162
1163 static void
brt_prefetch(brt_vdev_t * brtvd,const blkptr_t * bp)1164 brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
1165 {
1166 if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
1167 return;
1168
1169 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1170 rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1171 if (brtvd->bv_mos_entries != 0) {
1172 (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1173 &off, BRT_KEY_WORDS);
1174 }
1175 rw_exit(&brtvd->bv_mos_entries_lock);
1176 }
1177
1178 static int
brt_entry_compare(const void * x1,const void * x2)1179 brt_entry_compare(const void *x1, const void *x2)
1180 {
1181 const brt_entry_t *bre1 = x1, *bre2 = x2;
1182 const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
1183
1184 return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1185 DVA_GET_OFFSET(&bp2->blk_dva[0])));
1186 }
1187
1188 void
brt_pending_add(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1189 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1190 {
1191 brt_entry_t *bre, *newbre;
1192 avl_index_t where;
1193 uint64_t txg;
1194
1195 txg = dmu_tx_get_txg(tx);
1196 ASSERT3U(txg, !=, 0);
1197
1198 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1199 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1200 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1201
1202 newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1203 newbre->bre_bp = *bp;
1204 newbre->bre_count = 0;
1205 newbre->bre_pcount = 1;
1206
1207 mutex_enter(&brtvd->bv_pending_lock);
1208 bre = avl_find(pending_tree, newbre, &where);
1209 if (bre == NULL) {
1210 avl_insert(pending_tree, newbre, where);
1211 newbre = NULL;
1212 } else {
1213 bre->bre_pcount++;
1214 }
1215 mutex_exit(&brtvd->bv_pending_lock);
1216
1217 if (newbre != NULL) {
1218 ASSERT(bre != NULL);
1219 ASSERT(bre != newbre);
1220 kmem_cache_free(brt_entry_cache, newbre);
1221 } else {
1222 ASSERT0P(bre);
1223
1224 /* Prefetch BRT entry for the syncing context. */
1225 brt_prefetch(brtvd, bp);
1226 }
1227 }
1228
1229 void
brt_pending_remove(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1230 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1231 {
1232 brt_entry_t *bre, bre_search;
1233 uint64_t txg;
1234
1235 txg = dmu_tx_get_txg(tx);
1236 ASSERT3U(txg, !=, 0);
1237
1238 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1239 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1240 ASSERT(brtvd != NULL);
1241 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1242
1243 bre_search.bre_bp = *bp;
1244
1245 mutex_enter(&brtvd->bv_pending_lock);
1246 bre = avl_find(pending_tree, &bre_search, NULL);
1247 ASSERT(bre != NULL);
1248 ASSERT(bre->bre_pcount > 0);
1249 bre->bre_pcount--;
1250 if (bre->bre_pcount == 0)
1251 avl_remove(pending_tree, bre);
1252 else
1253 bre = NULL;
1254 mutex_exit(&brtvd->bv_pending_lock);
1255
1256 if (bre)
1257 kmem_cache_free(brt_entry_cache, bre);
1258 }
1259
1260 static void
brt_pending_apply_vdev(spa_t * spa,brt_vdev_t * brtvd,uint64_t txg)1261 brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1262 {
1263 brt_entry_t *bre, *nbre;
1264
1265 /*
1266 * We are in syncing context, so no other bv_pending_tree accesses
1267 * are possible for the TXG. So we don't need bv_pending_lock.
1268 */
1269 ASSERT(avl_is_empty(&brtvd->bv_tree));
1270 avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1271
1272 for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1273 nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1274
1275 /*
1276 * If the block has DEDUP bit set, it means that it
1277 * already exists in the DEDUP table, so we can just
1278 * use that instead of creating new entry in the BRT.
1279 */
1280 if (BP_GET_DEDUP(&bre->bre_bp)) {
1281 while (bre->bre_pcount > 0) {
1282 if (!ddt_addref(spa, &bre->bre_bp))
1283 break;
1284 bre->bre_pcount--;
1285 }
1286 if (bre->bre_pcount == 0) {
1287 avl_remove(&brtvd->bv_tree, bre);
1288 kmem_cache_free(brt_entry_cache, bre);
1289 continue;
1290 }
1291 }
1292
1293 /*
1294 * Unless we know that the block is definitely not in ZAP,
1295 * try to get its reference count from there.
1296 */
1297 uint64_t off = BRE_OFFSET(bre);
1298 if (brtvd->bv_mos_entries != 0 &&
1299 brt_vdev_lookup(spa, brtvd, off)) {
1300 int error;
1301 if (brt_has_endian_fixed(spa)) {
1302 error = zap_lookup_uint64_by_dnode(
1303 brtvd->bv_mos_entries_dnode, &off,
1304 BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
1305 &bre->bre_count);
1306 } else {
1307 error = zap_lookup_uint64_by_dnode(
1308 brtvd->bv_mos_entries_dnode, &off,
1309 BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1310 &bre->bre_count);
1311 }
1312 if (error == 0) {
1313 BRTSTAT_BUMP(brt_addref_entry_on_disk);
1314 } else {
1315 ASSERT3U(error, ==, ENOENT);
1316 BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1317 }
1318 }
1319 }
1320
1321 /*
1322 * If all the cloned blocks we had were handled by DDT, we don't need
1323 * to initiate the vdev.
1324 */
1325 if (avl_is_empty(&brtvd->bv_tree))
1326 return;
1327
1328 if (!brtvd->bv_initiated) {
1329 rw_enter(&brtvd->bv_lock, RW_WRITER);
1330 brt_vdev_realloc(spa, brtvd);
1331 rw_exit(&brtvd->bv_lock);
1332 }
1333
1334 /*
1335 * Convert pending references into proper ones. This has to be a
1336 * separate loop, since entcount modifications would cause false
1337 * positives for brt_vdev_lookup() on following iterations.
1338 */
1339 for (bre = avl_first(&brtvd->bv_tree); bre;
1340 bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1341 brt_vdev_addref(spa, brtvd, bre,
1342 bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1343 bre->bre_count += bre->bre_pcount;
1344 }
1345 }
1346
1347 void
brt_pending_apply(spa_t * spa,uint64_t txg)1348 brt_pending_apply(spa_t *spa, uint64_t txg)
1349 {
1350
1351 brt_rlock(spa);
1352 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1353 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1354 brt_unlock(spa);
1355
1356 brt_pending_apply_vdev(spa, brtvd, txg);
1357
1358 brt_rlock(spa);
1359 }
1360 brt_unlock(spa);
1361 }
1362
1363 static void
brt_sync_entry(spa_t * spa,dnode_t * dn,brt_entry_t * bre,dmu_tx_t * tx)1364 brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1365 {
1366 uint64_t off = BRE_OFFSET(bre);
1367
1368 if (bre->bre_pcount == 0) {
1369 /* The net change is zero, nothing to do in ZAP. */
1370 } else if (bre->bre_count == 0) {
1371 int error = zap_remove_uint64_by_dnode(dn, &off,
1372 BRT_KEY_WORDS, tx);
1373 VERIFY(error == 0 || error == ENOENT);
1374 } else {
1375 if (brt_has_endian_fixed(spa)) {
1376 VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1377 BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
1378 &bre->bre_count, tx));
1379 } else {
1380 VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1381 BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1382 &bre->bre_count, tx));
1383 }
1384 }
1385 }
1386
1387 static void
brt_sync_table(spa_t * spa,dmu_tx_t * tx)1388 brt_sync_table(spa_t *spa, dmu_tx_t *tx)
1389 {
1390 brt_entry_t *bre;
1391
1392 brt_rlock(spa);
1393 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1394 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1395 brt_unlock(spa);
1396
1397 if (!brtvd->bv_meta_dirty) {
1398 ASSERT(!brtvd->bv_entcount_dirty);
1399 ASSERT0(avl_numnodes(&brtvd->bv_tree));
1400 brt_rlock(spa);
1401 continue;
1402 }
1403
1404 ASSERT(!brtvd->bv_entcount_dirty ||
1405 avl_numnodes(&brtvd->bv_tree) != 0);
1406
1407 if (brtvd->bv_mos_brtvdev == 0)
1408 brt_vdev_create(spa, brtvd, tx);
1409
1410 void *c = NULL;
1411 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1412 brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre,
1413 tx);
1414 kmem_cache_free(brt_entry_cache, bre);
1415 }
1416
1417 #ifdef ZFS_DEBUG
1418 if (zfs_flags & ZFS_DEBUG_BRT)
1419 brt_vdev_dump(brtvd);
1420 #endif
1421 if (brtvd->bv_totalcount == 0)
1422 brt_vdev_destroy(spa, brtvd, tx);
1423 else
1424 brt_vdev_sync(spa, brtvd, tx);
1425 brt_rlock(spa);
1426 }
1427 brt_unlock(spa);
1428 }
1429
1430 void
brt_sync(spa_t * spa,uint64_t txg)1431 brt_sync(spa_t *spa, uint64_t txg)
1432 {
1433 dmu_tx_t *tx;
1434 uint64_t vdevid;
1435
1436 ASSERT3U(spa_syncing_txg(spa), ==, txg);
1437
1438 brt_rlock(spa);
1439 for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1440 if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1441 break;
1442 }
1443 if (vdevid >= spa->spa_brt_nvdevs) {
1444 brt_unlock(spa);
1445 return;
1446 }
1447 brt_unlock(spa);
1448
1449 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1450 brt_sync_table(spa, tx);
1451 dmu_tx_commit(tx);
1452 }
1453
1454 static void
brt_alloc(spa_t * spa)1455 brt_alloc(spa_t *spa)
1456 {
1457 rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1458 spa->spa_brt_vdevs = NULL;
1459 spa->spa_brt_nvdevs = 0;
1460 spa->spa_brt_rangesize = 0;
1461 }
1462
1463 void
brt_create(spa_t * spa)1464 brt_create(spa_t *spa)
1465 {
1466 brt_alloc(spa);
1467 spa->spa_brt_rangesize = BRT_RANGESIZE;
1468 }
1469
1470 int
brt_load(spa_t * spa)1471 brt_load(spa_t *spa)
1472 {
1473 int error = 0;
1474
1475 brt_alloc(spa);
1476 brt_wlock(spa);
1477 for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1478 vdevid++) {
1479 char name[64];
1480 uint64_t mos_brtvdev;
1481
1482 /* Look if this vdev had active block cloning. */
1483 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1484 (u_longlong_t)vdevid);
1485 error = zap_lookup(spa->spa_meta_objset,
1486 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1487 &mos_brtvdev);
1488 if (error == ENOENT) {
1489 error = 0;
1490 continue;
1491 }
1492 if (error != 0)
1493 break;
1494
1495 /* If it did, then allocate them all and load this one. */
1496 brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1497 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1498 rw_enter(&brtvd->bv_lock, RW_WRITER);
1499 brtvd->bv_mos_brtvdev = mos_brtvdev;
1500 error = brt_vdev_load(spa, brtvd);
1501 rw_exit(&brtvd->bv_lock);
1502 if (error != 0)
1503 break;
1504 }
1505
1506 if (spa->spa_brt_rangesize == 0)
1507 spa->spa_brt_rangesize = BRT_RANGESIZE;
1508 brt_unlock(spa);
1509 return (error);
1510 }
1511
1512 void
brt_unload(spa_t * spa)1513 brt_unload(spa_t *spa)
1514 {
1515 if (spa->spa_brt_rangesize == 0)
1516 return;
1517 brt_vdevs_free(spa);
1518 rw_destroy(&spa->spa_brt_lock);
1519 spa->spa_brt_rangesize = 0;
1520 }
1521
1522 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1523 "Enable prefetching of BRT ZAP entries");
1524 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1525 "BRT ZAP leaf blockshift");
1526 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1527 "BRT ZAP indirect blockshift");
1528