1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/spa_impl.h>
29 #include <sys/zio.h>
30 #include <sys/brt.h>
31 #include <sys/brt_impl.h>
32 #include <sys/ddt.h>
33 #include <sys/bitmap.h>
34 #include <sys/zap.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/arc.h>
37 #include <sys/dsl_pool.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/kstat.h>
41 #include <sys/wmsum.h>
42
43 /*
44 * Block Cloning design.
45 *
46 * Block Cloning allows to manually clone a file (or a subset of its blocks)
47 * into another (or the same) file by just creating additional references to
48 * the data blocks without copying the data itself. Those references are kept
49 * in the Block Reference Tables (BRTs).
50 *
51 * In many ways this is similar to the existing deduplication, but there are
52 * some important differences:
53 *
54 * - Deduplication is automatic and Block Cloning is not - one has to use a
55 * dedicated system call(s) to clone the given file/blocks.
56 * - Deduplication keeps all data blocks in its table, even those referenced
57 * just once. Block Cloning creates an entry in its tables only when there
58 * are at least two references to the given data block. If the block was
59 * never explicitly cloned or the second to last reference was dropped,
60 * there will be neither space nor performance overhead.
61 * - Deduplication needs data to work - one needs to pass real data to the
62 * write(2) syscall, so hash can be calculated. Block Cloning doesn't require
63 * data, just block pointers to the data, so it is extremely fast, as we pay
64 * neither the cost of reading the data, nor the cost of writing the data -
65 * we operate exclusively on metadata.
66 * - If the D (dedup) bit is not set in the block pointer, it means that
67 * the block is not in the dedup table (DDT) and we won't consult the DDT
68 * when we need to free the block. Block Cloning must be consulted on every
69 * free, because we cannot modify the source BP (eg. by setting something
70 * similar to the D bit), thus we have no hint if the block is in the
71 * Block Reference Table (BRT), so we need to look into the BRT. There is
72 * an optimization in place that allows us to eliminate the majority of BRT
73 * lookups which is described below in the "Minimizing free penalty" section.
74 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
75 * 64bit offset and 64bit reference counter.
76 * - Dedup keys are cryptographic hashes, so two blocks that are close to each
77 * other on disk are most likely in totally different parts of the DDT.
78 * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
79 * from one file should have BRT entries close to each other.
80 * - Scrub will only do a single pass over a block that is referenced multiple
81 * times in the DDT. Unfortunately it is not currently (if at all) possible
82 * with Block Cloning and block referenced multiple times will be scrubbed
83 * multiple times. The new, sorted scrub should be able to eliminate
84 * duplicated reads given enough memory.
85 * - Deduplication requires cryptographically strong hash as a checksum or
86 * additional data verification. Block Cloning works with any checksum
87 * algorithm or even with checksumming disabled.
88 *
89 * As mentioned above, the BRT entries are much smaller than the DDT entries.
90 * To uniquely identify a block we just need its vdev id and offset. We also
91 * need to maintain a reference counter. The vdev id will often repeat, as there
92 * is a small number of top-level VDEVs and a large number of blocks stored in
93 * each VDEV. We take advantage of that to reduce the BRT entry size further by
94 * maintaining one BRT for each top-level VDEV, so we can then have only offset
95 * and counter as the BRT entry.
96 *
97 * Minimizing free penalty.
98 *
99 * Block Cloning allows creating additional references to any existing block.
100 * When we free a block there is no hint in the block pointer whether the block
101 * was cloned or not, so on each free we have to check if there is a
102 * corresponding entry in the BRT or not. If there is, we need to decrease
103 * the reference counter. Doing BRT lookup on every free can potentially be
104 * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
105 * This is the main problem with deduplication, so we've learned our lesson and
106 * try not to repeat the same mistake here. How do we do that? We divide each
107 * top-level VDEV into 16MB regions. For each region we maintain a counter that
108 * is a sum of all the BRT entries that have offsets within the region. This
109 * creates the entries count array of 16bit numbers for each top-level VDEV.
110 * The entries count array is always kept in memory and updated on disk in the
111 * same transaction group as the BRT updates to keep everything in-sync. We can
112 * keep the array in memory, because it is very small. With 16MB regions and
113 * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
114 * the region size even further in the future). Now, when we want to free
115 * a block, we first consult the array. If the counter for the whole region is
116 * zero, there is no need to look for the BRT entry, as there isn't one for
117 * sure. If the counter for the region is greater than zero, only then we will
118 * do a BRT lookup and if an entry is found we will decrease the reference
119 * counter in the BRT entry and in the entry counters array.
120 *
121 * The entry counters array is small, but can potentially be larger for very
122 * large VDEVs or smaller regions. In this case we don't want to rewrite entire
123 * array on every change. We then divide the array into 32kB block and keep
124 * a bitmap of dirty blocks within a transaction group. When we sync the
125 * transaction group we can only update the parts of the entry counters array
126 * that were modified. Note: Keeping track of the dirty parts of the entry
127 * counters array is implemented, but updating only parts of the array on disk
128 * is not yet implemented - for now we will update entire array if there was
129 * any change.
130 *
131 * The implementation tries to be economic: if BRT is not used, or no longer
132 * used, there will be no entries in the MOS and no additional memory used (eg.
133 * the entry counters array is only allocated if needed).
134 *
135 * Interaction between Deduplication and Block Cloning.
136 *
137 * If both functionalities are in use, we could end up with a block that is
138 * referenced multiple times in both DDT and BRT. When we free one of the
139 * references we couldn't tell where it belongs, so we would have to decide
140 * what table takes the precedence: do we first clear DDT references or BRT
141 * references? To avoid this dilemma BRT cooperates with DDT - if a given block
142 * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
143 * lookup DDT entry instead and increase the counter there. No BRT entry
144 * will be created for a block which has the D (dedup) bit set.
145 * BRT may be more efficient for manual deduplication, but if the block is
146 * already in the DDT, then creating additional BRT entry would be less
147 * efficient. This clever idea was proposed by Allan Jude.
148 *
149 * Block Cloning across datasets.
150 *
151 * Block Cloning is not limited to cloning blocks within the same dataset.
152 * It is possible (and very useful) to clone blocks between different datasets.
153 * One use case is recovering files from snapshots. By cloning the files into
154 * dataset we need no additional storage. Without Block Cloning we would need
155 * additional space for those files.
156 * Another interesting use case is moving the files between datasets
157 * (copying the file content to the new dataset and removing the source file).
158 * In that case Block Cloning will only be used briefly, because the BRT entries
159 * will be removed when the source is removed.
160 * Block Cloning across encrypted datasets is supported as long as both
161 * datasets share the same master key (e.g. snapshots and clones)
162 *
163 * Block Cloning flow through ZFS layers.
164 *
165 * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
166 * blocks. As of this writing no interface is implemented that allows for block
167 * cloning within a ZVOL.
168 * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
169 * for blocking cloning.
170 *
171 * ssize_t
172 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
173 * size_t len, unsigned int flags);
174 *
175 * Even though offsets and length represent bytes, they have to be
176 * block-aligned or we will return an error so the upper layer can
177 * fallback to the generic mechanism that will just copy the data.
178 * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
179 * This function was implemented based on zfs_write(), but instead of writing
180 * the given data we first read block pointers using the new dmu_read_l0_bps()
181 * function from the source file. Once we have BPs from the source file we call
182 * the dmu_brt_clone() function on the destination file. This function
183 * allocates BPs for us. We iterate over all source BPs. If the given BP is
184 * a hole or an embedded block, we just copy BP as-is. If it points to a real
185 * data we place this BP on a BRT pending list using the brt_pending_add()
186 * function.
187 *
188 * We use this pending list to keep track of all BPs that got new references
189 * within this transaction group.
190 *
191 * Some special cases to consider and how we address them:
192 * - The block we want to clone may have been created within the same
193 * transaction group that we are trying to clone. Such block has no BP
194 * allocated yet, so cannot be immediately cloned. We return EAGAIN.
195 * - The block we want to clone may have been modified within the same
196 * transaction group. We return EAGAIN.
197 * - A block may be cloned multiple times during one transaction group (that's
198 * why pending list is actually a tree and not an append-only list - this
199 * way we can figure out faster if this block is cloned for the first time
200 * in this txg or consecutive time).
201 * - A block may be cloned and freed within the same transaction group
202 * (see dbuf_undirty()).
203 * - A block may be cloned and within the same transaction group the clone
204 * can be cloned again (see dmu_read_l0_bps()).
205 * - A file might have been deleted, but the caller still has a file descriptor
206 * open to this file and clones it.
207 *
208 * When we free a block we have an additional step in the ZIO pipeline where we
209 * call the zio_brt_free() function. We then call the brt_entry_decref()
210 * that loads the corresponding BRT entry (if one exists) and decreases
211 * reference counter. If this is not the last reference we will stop ZIO
212 * pipeline here. If this is the last reference or the block is not in the
213 * BRT, we continue the pipeline and free the block as usual.
214 *
215 * At the beginning of spa_sync() where there can be no more block cloning,
216 * but before issuing frees we call brt_pending_apply(). This function applies
217 * all the new clones to the BRT table - we load BRT entries and update
218 * reference counters. To sync new BRT entries to disk, we use brt_sync()
219 * function. This function will sync all dirty per-top-level-vdev BRTs,
220 * the entry counters arrays, etc.
221 *
222 * Block Cloning and ZIL.
223 *
224 * Every clone operation is divided into chunks (similar to write) and each
225 * chunk is cloned in a separate transaction. The chunk size is determined by
226 * how many BPs we can fit into a single ZIL entry.
227 * Replaying clone operation is different from the regular clone operation,
228 * as when we log clone operations we cannot use the source object - it may
229 * reside on a different dataset, so we log BPs we want to clone.
230 * The ZIL is replayed when we mount the given dataset, not when the pool is
231 * imported. Taking this into account it is possible that the pool is imported
232 * without mounting datasets and the source dataset is destroyed before the
233 * destination dataset is mounted and its ZIL replayed.
234 * To address this situation we leverage zil_claim() mechanism where ZFS will
235 * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
236 * entries, we will bump reference counters for their BPs in the BRT. Then
237 * on mount and ZIL replay we bump the reference counters once more, while the
238 * first references are dropped during ZIL destroy by zil_free_clone_range().
239 * It is possible that after zil_claim() we never mount the destination, so
240 * we never replay its ZIL and just destroy it. In this case the only taken
241 * references will be dropped by zil_free_clone_range(), since the cloning is
242 * not going to ever take place.
243 */
244
245 static kmem_cache_t *brt_entry_cache;
246
247 /*
248 * Enable/disable prefetching of BRT entries that we are going to modify.
249 */
250 static int brt_zap_prefetch = 1;
251
252 #ifdef ZFS_DEBUG
253 #define BRT_DEBUG(...) do { \
254 if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
255 __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
256 } \
257 } while (0)
258 #else
259 #define BRT_DEBUG(...) do { } while (0)
260 #endif
261
262 static int brt_zap_default_bs = 12;
263 static int brt_zap_default_ibs = 12;
264
265 static kstat_t *brt_ksp;
266
267 typedef struct brt_stats {
268 kstat_named_t brt_addref_entry_not_on_disk;
269 kstat_named_t brt_addref_entry_on_disk;
270 kstat_named_t brt_decref_entry_in_memory;
271 kstat_named_t brt_decref_entry_loaded_from_disk;
272 kstat_named_t brt_decref_entry_not_in_memory;
273 kstat_named_t brt_decref_entry_read_lost_race;
274 kstat_named_t brt_decref_entry_still_referenced;
275 kstat_named_t brt_decref_free_data_later;
276 kstat_named_t brt_decref_free_data_now;
277 kstat_named_t brt_decref_no_entry;
278 } brt_stats_t;
279
280 static brt_stats_t brt_stats = {
281 { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
282 { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
283 { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
284 { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
285 { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
286 { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
287 { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
288 { "decref_free_data_later", KSTAT_DATA_UINT64 },
289 { "decref_free_data_now", KSTAT_DATA_UINT64 },
290 { "decref_no_entry", KSTAT_DATA_UINT64 }
291 };
292
293 struct {
294 wmsum_t brt_addref_entry_not_on_disk;
295 wmsum_t brt_addref_entry_on_disk;
296 wmsum_t brt_decref_entry_in_memory;
297 wmsum_t brt_decref_entry_loaded_from_disk;
298 wmsum_t brt_decref_entry_not_in_memory;
299 wmsum_t brt_decref_entry_read_lost_race;
300 wmsum_t brt_decref_entry_still_referenced;
301 wmsum_t brt_decref_free_data_later;
302 wmsum_t brt_decref_free_data_now;
303 wmsum_t brt_decref_no_entry;
304 } brt_sums;
305
306 #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
307
308 static int brt_entry_compare(const void *x1, const void *x2);
309 static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
310
311 static void
brt_rlock(spa_t * spa)312 brt_rlock(spa_t *spa)
313 {
314 rw_enter(&spa->spa_brt_lock, RW_READER);
315 }
316
317 static void
brt_wlock(spa_t * spa)318 brt_wlock(spa_t *spa)
319 {
320 rw_enter(&spa->spa_brt_lock, RW_WRITER);
321 }
322
323 static void
brt_unlock(spa_t * spa)324 brt_unlock(spa_t *spa)
325 {
326 rw_exit(&spa->spa_brt_lock);
327 }
328
329 static uint16_t
brt_vdev_entcount_get(const brt_vdev_t * brtvd,uint64_t idx)330 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
331 {
332
333 ASSERT3U(idx, <, brtvd->bv_size);
334
335 if (unlikely(brtvd->bv_need_byteswap)) {
336 return (BSWAP_16(brtvd->bv_entcount[idx]));
337 } else {
338 return (brtvd->bv_entcount[idx]);
339 }
340 }
341
342 static void
brt_vdev_entcount_set(brt_vdev_t * brtvd,uint64_t idx,uint16_t entcnt)343 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
344 {
345
346 ASSERT3U(idx, <, brtvd->bv_size);
347
348 if (unlikely(brtvd->bv_need_byteswap)) {
349 brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
350 } else {
351 brtvd->bv_entcount[idx] = entcnt;
352 }
353 }
354
355 static void
brt_vdev_entcount_inc(brt_vdev_t * brtvd,uint64_t idx)356 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
357 {
358 uint16_t entcnt;
359
360 ASSERT3U(idx, <, brtvd->bv_size);
361
362 entcnt = brt_vdev_entcount_get(brtvd, idx);
363 ASSERT(entcnt < UINT16_MAX);
364
365 brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
366 }
367
368 static void
brt_vdev_entcount_dec(brt_vdev_t * brtvd,uint64_t idx)369 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
370 {
371 uint16_t entcnt;
372
373 ASSERT3U(idx, <, brtvd->bv_size);
374
375 entcnt = brt_vdev_entcount_get(brtvd, idx);
376 ASSERT(entcnt > 0);
377
378 brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
379 }
380
381 #ifdef ZFS_DEBUG
382 static void
brt_vdev_dump(brt_vdev_t * brtvd)383 brt_vdev_dump(brt_vdev_t *brtvd)
384 {
385 uint64_t idx;
386
387 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
388 zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
389 "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
390 (u_longlong_t)brtvd->bv_vdevid,
391 brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
392 (u_longlong_t)brtvd->bv_size,
393 (u_longlong_t)brtvd->bv_totalcount,
394 (u_longlong_t)nblocks,
395 (size_t)BT_SIZEOFMAP(nblocks));
396 if (brtvd->bv_totalcount > 0) {
397 zfs_dbgmsg(" entcounts:");
398 for (idx = 0; idx < brtvd->bv_size; idx++) {
399 uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
400 if (entcnt > 0) {
401 zfs_dbgmsg(" [%04llu] %hu",
402 (u_longlong_t)idx, entcnt);
403 }
404 }
405 }
406 if (brtvd->bv_entcount_dirty) {
407 char *bitmap;
408
409 bitmap = kmem_alloc(nblocks + 1, KM_SLEEP);
410 for (idx = 0; idx < nblocks; idx++) {
411 bitmap[idx] =
412 BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
413 }
414 bitmap[idx] = '\0';
415 zfs_dbgmsg(" dirty: %s", bitmap);
416 kmem_free(bitmap, nblocks + 1);
417 }
418 }
419 #endif
420
421 static brt_vdev_t *
brt_vdev(spa_t * spa,uint64_t vdevid,boolean_t alloc)422 brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
423 {
424 brt_vdev_t *brtvd = NULL;
425
426 brt_rlock(spa);
427 if (vdevid < spa->spa_brt_nvdevs) {
428 brtvd = spa->spa_brt_vdevs[vdevid];
429 } else if (alloc) {
430 /* New VDEV was added. */
431 brt_unlock(spa);
432 brt_wlock(spa);
433 if (vdevid >= spa->spa_brt_nvdevs)
434 brt_vdevs_expand(spa, vdevid + 1);
435 brtvd = spa->spa_brt_vdevs[vdevid];
436 }
437 brt_unlock(spa);
438 return (brtvd);
439 }
440
441 static void
brt_vdev_create(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)442 brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
443 {
444 char name[64];
445
446 ASSERT(brtvd->bv_initiated);
447 ASSERT0(brtvd->bv_mos_brtvdev);
448 ASSERT0(brtvd->bv_mos_entries);
449
450 uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
451 ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
452 brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
453 VERIFY(mos_entries != 0);
454 VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
455 &brtvd->bv_mos_entries_dnode));
456 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
457 brtvd->bv_mos_entries = mos_entries;
458 rw_exit(&brtvd->bv_mos_entries_lock);
459 BRT_DEBUG("MOS entries created, object=%llu",
460 (u_longlong_t)brtvd->bv_mos_entries);
461
462 /*
463 * We allocate DMU buffer to store the bv_entcount[] array.
464 * We will keep array size (bv_size) and cummulative count for all
465 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
466 */
467 brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
468 DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
469 DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
470 VERIFY(brtvd->bv_mos_brtvdev != 0);
471 BRT_DEBUG("MOS BRT VDEV created, object=%llu",
472 (u_longlong_t)brtvd->bv_mos_brtvdev);
473
474 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
475 (u_longlong_t)brtvd->bv_vdevid);
476 VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
477 sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
478 BRT_DEBUG("Pool directory object created, object=%s", name);
479
480 spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
481 }
482
483 static void
brt_vdev_realloc(spa_t * spa,brt_vdev_t * brtvd)484 brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
485 {
486 vdev_t *vd;
487 uint16_t *entcount;
488 ulong_t *bitmap;
489 uint64_t nblocks, onblocks, size;
490
491 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
492
493 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
494 vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
495 size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
496 spa_config_exit(spa, SCL_VDEV, FTAG);
497
498 entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
499 nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
500 bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
501
502 if (!brtvd->bv_initiated) {
503 ASSERT0(brtvd->bv_size);
504 ASSERT0P(brtvd->bv_entcount);
505 ASSERT0P(brtvd->bv_bitmap);
506 } else {
507 ASSERT(brtvd->bv_size > 0);
508 ASSERT(brtvd->bv_entcount != NULL);
509 ASSERT(brtvd->bv_bitmap != NULL);
510 /*
511 * TODO: Allow vdev shrinking. We only need to implement
512 * shrinking the on-disk BRT VDEV object.
513 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
514 * offset, size, tx);
515 */
516 ASSERT3U(brtvd->bv_size, <=, size);
517
518 memcpy(entcount, brtvd->bv_entcount,
519 sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
520 vmem_free(brtvd->bv_entcount,
521 sizeof (entcount[0]) * brtvd->bv_size);
522 onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
523 memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
524 BT_SIZEOFMAP(onblocks)));
525 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
526 }
527
528 brtvd->bv_size = size;
529 brtvd->bv_entcount = entcount;
530 brtvd->bv_bitmap = bitmap;
531 if (!brtvd->bv_initiated) {
532 brtvd->bv_need_byteswap = FALSE;
533 brtvd->bv_initiated = TRUE;
534 BRT_DEBUG("BRT VDEV %llu initiated.",
535 (u_longlong_t)brtvd->bv_vdevid);
536 }
537 }
538
539 static int
brt_vdev_load(spa_t * spa,brt_vdev_t * brtvd)540 brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
541 {
542 dmu_buf_t *db;
543 brt_vdev_phys_t *bvphys;
544 int error;
545
546 ASSERT(!brtvd->bv_initiated);
547 ASSERT(brtvd->bv_mos_brtvdev != 0);
548
549 error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
550 FTAG, &db);
551 if (error != 0)
552 return (error);
553
554 bvphys = db->db_data;
555 if (spa->spa_brt_rangesize == 0) {
556 spa->spa_brt_rangesize = bvphys->bvp_rangesize;
557 } else {
558 ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
559 }
560
561 brt_vdev_realloc(spa, brtvd);
562
563 /* TODO: We don't support VDEV shrinking. */
564 ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
565
566 /*
567 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
568 */
569 error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
570 MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
571 brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
572 if (error != 0)
573 return (error);
574
575 ASSERT(bvphys->bvp_mos_entries != 0);
576 VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
577 &brtvd->bv_mos_entries_dnode));
578 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
579 brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
580 rw_exit(&brtvd->bv_mos_entries_lock);
581 brtvd->bv_need_byteswap =
582 (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
583 brtvd->bv_totalcount = bvphys->bvp_totalcount;
584 brtvd->bv_usedspace = bvphys->bvp_usedspace;
585 brtvd->bv_savedspace = bvphys->bvp_savedspace;
586
587 dmu_buf_rele(db, FTAG);
588
589 BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
590 (u_longlong_t)brtvd->bv_vdevid,
591 (u_longlong_t)brtvd->bv_mos_brtvdev,
592 (u_longlong_t)brtvd->bv_mos_entries);
593 return (0);
594 }
595
596 static void
brt_vdev_dealloc(brt_vdev_t * brtvd)597 brt_vdev_dealloc(brt_vdev_t *brtvd)
598 {
599 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
600 ASSERT(brtvd->bv_initiated);
601 ASSERT0(avl_numnodes(&brtvd->bv_tree));
602
603 vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
604 brtvd->bv_entcount = NULL;
605 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
606 kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
607 brtvd->bv_bitmap = NULL;
608
609 brtvd->bv_size = 0;
610
611 brtvd->bv_initiated = FALSE;
612 BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
613 }
614
615 static void
brt_vdev_destroy(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)616 brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
617 {
618 char name[64];
619 uint64_t count;
620
621 ASSERT(brtvd->bv_initiated);
622 ASSERT(brtvd->bv_mos_brtvdev != 0);
623 ASSERT(brtvd->bv_mos_entries != 0);
624 ASSERT0(brtvd->bv_totalcount);
625 ASSERT0(brtvd->bv_usedspace);
626 ASSERT0(brtvd->bv_savedspace);
627
628 uint64_t mos_entries = brtvd->bv_mos_entries;
629 rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
630 brtvd->bv_mos_entries = 0;
631 rw_exit(&brtvd->bv_mos_entries_lock);
632 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
633 brtvd->bv_mos_entries_dnode = NULL;
634 ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
635 ASSERT0(count);
636 VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
637 BRT_DEBUG("MOS entries destroyed, object=%llu",
638 (u_longlong_t)mos_entries);
639
640 VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
641 tx));
642 BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
643 (u_longlong_t)brtvd->bv_mos_brtvdev);
644 brtvd->bv_mos_brtvdev = 0;
645 brtvd->bv_entcount_dirty = FALSE;
646
647 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
648 (u_longlong_t)brtvd->bv_vdevid);
649 VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
650 name, tx));
651 BRT_DEBUG("Pool directory object removed, object=%s", name);
652
653 brtvd->bv_meta_dirty = FALSE;
654
655 rw_enter(&brtvd->bv_lock, RW_WRITER);
656 brt_vdev_dealloc(brtvd);
657 rw_exit(&brtvd->bv_lock);
658
659 spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
660 }
661
662 static void
brt_vdevs_expand(spa_t * spa,uint64_t nvdevs)663 brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
664 {
665 brt_vdev_t **vdevs;
666
667 ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
668 ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
669
670 if (nvdevs == spa->spa_brt_nvdevs)
671 return;
672
673 vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
674 if (spa->spa_brt_nvdevs > 0) {
675 ASSERT(spa->spa_brt_vdevs != NULL);
676
677 memcpy(vdevs, spa->spa_brt_vdevs,
678 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
679 kmem_free(spa->spa_brt_vdevs,
680 sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
681 }
682 spa->spa_brt_vdevs = vdevs;
683
684 for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
685 brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
686 rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
687 brtvd->bv_vdevid = vdevid;
688 brtvd->bv_initiated = FALSE;
689 rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
690 avl_create(&brtvd->bv_tree, brt_entry_compare,
691 sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
692 for (int i = 0; i < TXG_SIZE; i++) {
693 avl_create(&brtvd->bv_pending_tree[i],
694 brt_entry_compare, sizeof (brt_entry_t),
695 offsetof(brt_entry_t, bre_node));
696 }
697 mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
698 spa->spa_brt_vdevs[vdevid] = brtvd;
699 }
700
701 BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
702 (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
703 spa->spa_brt_nvdevs = nvdevs;
704 }
705
706 static boolean_t
brt_vdev_lookup(spa_t * spa,brt_vdev_t * brtvd,uint64_t offset)707 brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
708 {
709 uint64_t idx = offset / spa->spa_brt_rangesize;
710 if (idx < brtvd->bv_size) {
711 /* VDEV wasn't expanded. */
712 return (brt_vdev_entcount_get(brtvd, idx) > 0);
713 }
714 return (FALSE);
715 }
716
717 static void
brt_vdev_addref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize,uint64_t count)718 brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
719 uint64_t dsize, uint64_t count)
720 {
721 uint64_t idx;
722
723 ASSERT(brtvd->bv_initiated);
724
725 brtvd->bv_savedspace += dsize * count;
726 brtvd->bv_meta_dirty = TRUE;
727
728 if (bre->bre_count > 0)
729 return;
730
731 brtvd->bv_usedspace += dsize;
732
733 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
734 if (idx >= brtvd->bv_size) {
735 /* VDEV has been expanded. */
736 rw_enter(&brtvd->bv_lock, RW_WRITER);
737 brt_vdev_realloc(spa, brtvd);
738 rw_exit(&brtvd->bv_lock);
739 }
740
741 ASSERT3U(idx, <, brtvd->bv_size);
742
743 brtvd->bv_totalcount++;
744 brt_vdev_entcount_inc(brtvd, idx);
745 brtvd->bv_entcount_dirty = TRUE;
746 idx = idx / BRT_BLOCKSIZE / 8;
747 BT_SET(brtvd->bv_bitmap, idx);
748 }
749
750 static void
brt_vdev_decref(spa_t * spa,brt_vdev_t * brtvd,const brt_entry_t * bre,uint64_t dsize)751 brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
752 uint64_t dsize)
753 {
754 uint64_t idx;
755
756 ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
757 ASSERT(brtvd->bv_initiated);
758
759 brtvd->bv_savedspace -= dsize;
760 brtvd->bv_meta_dirty = TRUE;
761
762 if (bre->bre_count > 0)
763 return;
764
765 brtvd->bv_usedspace -= dsize;
766
767 idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
768 ASSERT3U(idx, <, brtvd->bv_size);
769
770 ASSERT(brtvd->bv_totalcount > 0);
771 brtvd->bv_totalcount--;
772 brt_vdev_entcount_dec(brtvd, idx);
773 brtvd->bv_entcount_dirty = TRUE;
774 idx = idx / BRT_BLOCKSIZE / 8;
775 BT_SET(brtvd->bv_bitmap, idx);
776 }
777
778 static void
brt_vdev_sync(spa_t * spa,brt_vdev_t * brtvd,dmu_tx_t * tx)779 brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
780 {
781 dmu_buf_t *db;
782 brt_vdev_phys_t *bvphys;
783
784 ASSERT(brtvd->bv_meta_dirty);
785 ASSERT(brtvd->bv_mos_brtvdev != 0);
786 ASSERT(dmu_tx_is_syncing(tx));
787
788 VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
789 FTAG, &db));
790
791 if (brtvd->bv_entcount_dirty) {
792 /*
793 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
794 */
795 dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
796 brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
797 brtvd->bv_entcount, tx);
798 uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
799 memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
800 brtvd->bv_entcount_dirty = FALSE;
801 }
802
803 dmu_buf_will_dirty(db, tx);
804 bvphys = db->db_data;
805 bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
806 bvphys->bvp_size = brtvd->bv_size;
807 if (brtvd->bv_need_byteswap) {
808 bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
809 } else {
810 bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
811 }
812 bvphys->bvp_totalcount = brtvd->bv_totalcount;
813 bvphys->bvp_rangesize = spa->spa_brt_rangesize;
814 bvphys->bvp_usedspace = brtvd->bv_usedspace;
815 bvphys->bvp_savedspace = brtvd->bv_savedspace;
816 dmu_buf_rele(db, FTAG);
817
818 brtvd->bv_meta_dirty = FALSE;
819 }
820
821 static void
brt_vdevs_free(spa_t * spa)822 brt_vdevs_free(spa_t *spa)
823 {
824 if (spa->spa_brt_vdevs == 0)
825 return;
826 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
827 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
828 rw_enter(&brtvd->bv_lock, RW_WRITER);
829 if (brtvd->bv_initiated)
830 brt_vdev_dealloc(brtvd);
831 rw_exit(&brtvd->bv_lock);
832 rw_destroy(&brtvd->bv_lock);
833 if (brtvd->bv_mos_entries != 0)
834 dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
835 rw_destroy(&brtvd->bv_mos_entries_lock);
836 avl_destroy(&brtvd->bv_tree);
837 for (int i = 0; i < TXG_SIZE; i++)
838 avl_destroy(&brtvd->bv_pending_tree[i]);
839 mutex_destroy(&brtvd->bv_pending_lock);
840 kmem_free(brtvd, sizeof (*brtvd));
841 }
842 kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
843 spa->spa_brt_nvdevs);
844 }
845
846 static void
brt_entry_fill(const blkptr_t * bp,brt_entry_t * bre,uint64_t * vdevidp)847 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
848 {
849
850 bre->bre_bp = *bp;
851 bre->bre_count = 0;
852 bre->bre_pcount = 0;
853
854 *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
855 }
856
857 static int
brt_entry_lookup(brt_vdev_t * brtvd,brt_entry_t * bre)858 brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
859 {
860 uint64_t off = BRE_OFFSET(bre);
861
862 if (brtvd->bv_mos_entries == 0)
863 return (SET_ERROR(ENOENT));
864
865 return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
866 &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
867 }
868
869 /*
870 * Return TRUE if we _can_ have BRT entry for this bp. It might be false
871 * positive, but gives us quick answer if we should look into BRT, which
872 * may require reads and thus will be more expensive.
873 */
874 boolean_t
brt_maybe_exists(spa_t * spa,const blkptr_t * bp)875 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
876 {
877
878 if (spa->spa_brt_nvdevs == 0)
879 return (B_FALSE);
880
881 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
882 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
883 if (brtvd == NULL || !brtvd->bv_initiated)
884 return (FALSE);
885
886 /*
887 * We don't need locks here, since bv_entcount pointer must be
888 * stable at this point, and we don't care about false positive
889 * races here, while false negative should be impossible, since
890 * all brt_vdev_addref() have already completed by this point.
891 */
892 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
893 return (brt_vdev_lookup(spa, brtvd, off));
894 }
895
896 uint64_t
brt_get_dspace(spa_t * spa)897 brt_get_dspace(spa_t *spa)
898 {
899 if (spa->spa_brt_nvdevs == 0)
900 return (0);
901
902 brt_rlock(spa);
903 uint64_t s = 0;
904 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
905 s += spa->spa_brt_vdevs[vdevid]->bv_savedspace;
906 brt_unlock(spa);
907 return (s);
908 }
909
910 uint64_t
brt_get_used(spa_t * spa)911 brt_get_used(spa_t *spa)
912 {
913 if (spa->spa_brt_nvdevs == 0)
914 return (0);
915
916 brt_rlock(spa);
917 uint64_t s = 0;
918 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++)
919 s += spa->spa_brt_vdevs[vdevid]->bv_usedspace;
920 brt_unlock(spa);
921 return (s);
922 }
923
924 uint64_t
brt_get_saved(spa_t * spa)925 brt_get_saved(spa_t *spa)
926 {
927 return (brt_get_dspace(spa));
928 }
929
930 uint64_t
brt_get_ratio(spa_t * spa)931 brt_get_ratio(spa_t *spa)
932 {
933 uint64_t used = brt_get_used(spa);
934 if (used == 0)
935 return (100);
936 return ((used + brt_get_saved(spa)) * 100 / used);
937 }
938
939 static int
brt_kstats_update(kstat_t * ksp,int rw)940 brt_kstats_update(kstat_t *ksp, int rw)
941 {
942 brt_stats_t *bs = ksp->ks_data;
943
944 if (rw == KSTAT_WRITE)
945 return (EACCES);
946
947 bs->brt_addref_entry_not_on_disk.value.ui64 =
948 wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
949 bs->brt_addref_entry_on_disk.value.ui64 =
950 wmsum_value(&brt_sums.brt_addref_entry_on_disk);
951 bs->brt_decref_entry_in_memory.value.ui64 =
952 wmsum_value(&brt_sums.brt_decref_entry_in_memory);
953 bs->brt_decref_entry_loaded_from_disk.value.ui64 =
954 wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
955 bs->brt_decref_entry_not_in_memory.value.ui64 =
956 wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
957 bs->brt_decref_entry_read_lost_race.value.ui64 =
958 wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
959 bs->brt_decref_entry_still_referenced.value.ui64 =
960 wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
961 bs->brt_decref_free_data_later.value.ui64 =
962 wmsum_value(&brt_sums.brt_decref_free_data_later);
963 bs->brt_decref_free_data_now.value.ui64 =
964 wmsum_value(&brt_sums.brt_decref_free_data_now);
965 bs->brt_decref_no_entry.value.ui64 =
966 wmsum_value(&brt_sums.brt_decref_no_entry);
967
968 return (0);
969 }
970
971 static void
brt_stat_init(void)972 brt_stat_init(void)
973 {
974
975 wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
976 wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
977 wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
978 wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
979 wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
980 wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
981 wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
982 wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
983 wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
984 wmsum_init(&brt_sums.brt_decref_no_entry, 0);
985
986 brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
987 sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
988 if (brt_ksp != NULL) {
989 brt_ksp->ks_data = &brt_stats;
990 brt_ksp->ks_update = brt_kstats_update;
991 kstat_install(brt_ksp);
992 }
993 }
994
995 static void
brt_stat_fini(void)996 brt_stat_fini(void)
997 {
998 if (brt_ksp != NULL) {
999 kstat_delete(brt_ksp);
1000 brt_ksp = NULL;
1001 }
1002
1003 wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
1004 wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
1005 wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
1006 wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
1007 wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
1008 wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
1009 wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
1010 wmsum_fini(&brt_sums.brt_decref_free_data_later);
1011 wmsum_fini(&brt_sums.brt_decref_free_data_now);
1012 wmsum_fini(&brt_sums.brt_decref_no_entry);
1013 }
1014
1015 void
brt_init(void)1016 brt_init(void)
1017 {
1018 brt_entry_cache = kmem_cache_create("brt_entry_cache",
1019 sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1020
1021 brt_stat_init();
1022 }
1023
1024 void
brt_fini(void)1025 brt_fini(void)
1026 {
1027 brt_stat_fini();
1028
1029 kmem_cache_destroy(brt_entry_cache);
1030 }
1031
1032 /* Return TRUE if block should be freed immediately. */
1033 boolean_t
brt_entry_decref(spa_t * spa,const blkptr_t * bp)1034 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
1035 {
1036 brt_entry_t *bre, *racebre;
1037 brt_entry_t bre_search;
1038 avl_index_t where;
1039 uint64_t vdevid;
1040 int error;
1041
1042 brt_entry_fill(bp, &bre_search, &vdevid);
1043
1044 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1045 ASSERT(brtvd != NULL);
1046
1047 rw_enter(&brtvd->bv_lock, RW_WRITER);
1048 ASSERT(brtvd->bv_initiated);
1049 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1050 if (bre != NULL) {
1051 BRTSTAT_BUMP(brt_decref_entry_in_memory);
1052 goto out;
1053 } else {
1054 BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
1055 }
1056 rw_exit(&brtvd->bv_lock);
1057
1058 error = brt_entry_lookup(brtvd, &bre_search);
1059 /* bre_search now contains correct bre_count */
1060 if (error == ENOENT) {
1061 BRTSTAT_BUMP(brt_decref_no_entry);
1062 return (B_TRUE);
1063 }
1064 ASSERT0(error);
1065
1066 rw_enter(&brtvd->bv_lock, RW_WRITER);
1067 racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
1068 if (racebre != NULL) {
1069 /* The entry was added when the lock was dropped. */
1070 BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
1071 bre = racebre;
1072 goto out;
1073 }
1074
1075 BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
1076 bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1077 bre->bre_bp = bre_search.bre_bp;
1078 bre->bre_count = bre_search.bre_count;
1079 bre->bre_pcount = 0;
1080 avl_insert(&brtvd->bv_tree, bre, where);
1081
1082 out:
1083 if (bre->bre_count == 0) {
1084 rw_exit(&brtvd->bv_lock);
1085 BRTSTAT_BUMP(brt_decref_free_data_now);
1086 return (B_TRUE);
1087 }
1088
1089 bre->bre_pcount--;
1090 ASSERT(bre->bre_count > 0);
1091 bre->bre_count--;
1092 if (bre->bre_count == 0)
1093 BRTSTAT_BUMP(brt_decref_free_data_later);
1094 else
1095 BRTSTAT_BUMP(brt_decref_entry_still_referenced);
1096 brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
1097
1098 rw_exit(&brtvd->bv_lock);
1099
1100 return (B_FALSE);
1101 }
1102
1103 uint64_t
brt_entry_get_refcount(spa_t * spa,const blkptr_t * bp)1104 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
1105 {
1106 brt_entry_t bre_search, *bre;
1107 uint64_t vdevid, refcnt;
1108 int error;
1109
1110 brt_entry_fill(bp, &bre_search, &vdevid);
1111
1112 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1113 ASSERT(brtvd != NULL);
1114
1115 rw_enter(&brtvd->bv_lock, RW_READER);
1116 ASSERT(brtvd->bv_initiated);
1117 bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
1118 if (bre == NULL) {
1119 rw_exit(&brtvd->bv_lock);
1120 error = brt_entry_lookup(brtvd, &bre_search);
1121 if (error == ENOENT) {
1122 refcnt = 0;
1123 } else {
1124 ASSERT0(error);
1125 refcnt = bre_search.bre_count;
1126 }
1127 } else {
1128 refcnt = bre->bre_count;
1129 rw_exit(&brtvd->bv_lock);
1130 }
1131
1132 return (refcnt);
1133 }
1134
1135 static void
brt_prefetch(brt_vdev_t * brtvd,const blkptr_t * bp)1136 brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
1137 {
1138 if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0)
1139 return;
1140
1141 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]);
1142 rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
1143 if (brtvd->bv_mos_entries != 0) {
1144 (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
1145 &off, BRT_KEY_WORDS);
1146 }
1147 rw_exit(&brtvd->bv_mos_entries_lock);
1148 }
1149
1150 static int
brt_entry_compare(const void * x1,const void * x2)1151 brt_entry_compare(const void *x1, const void *x2)
1152 {
1153 const brt_entry_t *bre1 = x1, *bre2 = x2;
1154 const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
1155
1156 return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
1157 DVA_GET_OFFSET(&bp2->blk_dva[0])));
1158 }
1159
1160 void
brt_pending_add(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1161 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1162 {
1163 brt_entry_t *bre, *newbre;
1164 avl_index_t where;
1165 uint64_t txg;
1166
1167 txg = dmu_tx_get_txg(tx);
1168 ASSERT3U(txg, !=, 0);
1169
1170 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1171 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
1172 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1173
1174 newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
1175 newbre->bre_bp = *bp;
1176 newbre->bre_count = 0;
1177 newbre->bre_pcount = 1;
1178
1179 mutex_enter(&brtvd->bv_pending_lock);
1180 bre = avl_find(pending_tree, newbre, &where);
1181 if (bre == NULL) {
1182 avl_insert(pending_tree, newbre, where);
1183 newbre = NULL;
1184 } else {
1185 bre->bre_pcount++;
1186 }
1187 mutex_exit(&brtvd->bv_pending_lock);
1188
1189 if (newbre != NULL) {
1190 ASSERT(bre != NULL);
1191 ASSERT(bre != newbre);
1192 kmem_cache_free(brt_entry_cache, newbre);
1193 } else {
1194 ASSERT0P(bre);
1195
1196 /* Prefetch BRT entry for the syncing context. */
1197 brt_prefetch(brtvd, bp);
1198 }
1199 }
1200
1201 void
brt_pending_remove(spa_t * spa,const blkptr_t * bp,dmu_tx_t * tx)1202 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
1203 {
1204 brt_entry_t *bre, bre_search;
1205 uint64_t txg;
1206
1207 txg = dmu_tx_get_txg(tx);
1208 ASSERT3U(txg, !=, 0);
1209
1210 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
1211 brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
1212 ASSERT(brtvd != NULL);
1213 avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
1214
1215 bre_search.bre_bp = *bp;
1216
1217 mutex_enter(&brtvd->bv_pending_lock);
1218 bre = avl_find(pending_tree, &bre_search, NULL);
1219 ASSERT(bre != NULL);
1220 ASSERT(bre->bre_pcount > 0);
1221 bre->bre_pcount--;
1222 if (bre->bre_pcount == 0)
1223 avl_remove(pending_tree, bre);
1224 else
1225 bre = NULL;
1226 mutex_exit(&brtvd->bv_pending_lock);
1227
1228 if (bre)
1229 kmem_cache_free(brt_entry_cache, bre);
1230 }
1231
1232 static void
brt_pending_apply_vdev(spa_t * spa,brt_vdev_t * brtvd,uint64_t txg)1233 brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
1234 {
1235 brt_entry_t *bre, *nbre;
1236
1237 /*
1238 * We are in syncing context, so no other bv_pending_tree accesses
1239 * are possible for the TXG. So we don't need bv_pending_lock.
1240 */
1241 ASSERT(avl_is_empty(&brtvd->bv_tree));
1242 avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
1243
1244 for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
1245 nbre = AVL_NEXT(&brtvd->bv_tree, bre);
1246
1247 /*
1248 * If the block has DEDUP bit set, it means that it
1249 * already exists in the DEDUP table, so we can just
1250 * use that instead of creating new entry in the BRT.
1251 */
1252 if (BP_GET_DEDUP(&bre->bre_bp)) {
1253 while (bre->bre_pcount > 0) {
1254 if (!ddt_addref(spa, &bre->bre_bp))
1255 break;
1256 bre->bre_pcount--;
1257 }
1258 if (bre->bre_pcount == 0) {
1259 avl_remove(&brtvd->bv_tree, bre);
1260 kmem_cache_free(brt_entry_cache, bre);
1261 continue;
1262 }
1263 }
1264
1265 /*
1266 * Unless we know that the block is definitely not in ZAP,
1267 * try to get its reference count from there.
1268 */
1269 uint64_t off = BRE_OFFSET(bre);
1270 if (brtvd->bv_mos_entries != 0 &&
1271 brt_vdev_lookup(spa, brtvd, off)) {
1272 int error = zap_lookup_uint64_by_dnode(
1273 brtvd->bv_mos_entries_dnode, &off,
1274 BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1275 &bre->bre_count);
1276 if (error == 0) {
1277 BRTSTAT_BUMP(brt_addref_entry_on_disk);
1278 } else {
1279 ASSERT3U(error, ==, ENOENT);
1280 BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
1281 }
1282 }
1283 }
1284
1285 /*
1286 * If all the cloned blocks we had were handled by DDT, we don't need
1287 * to initiate the vdev.
1288 */
1289 if (avl_is_empty(&brtvd->bv_tree))
1290 return;
1291
1292 if (!brtvd->bv_initiated) {
1293 rw_enter(&brtvd->bv_lock, RW_WRITER);
1294 brt_vdev_realloc(spa, brtvd);
1295 rw_exit(&brtvd->bv_lock);
1296 }
1297
1298 /*
1299 * Convert pending references into proper ones. This has to be a
1300 * separate loop, since entcount modifications would cause false
1301 * positives for brt_vdev_lookup() on following iterations.
1302 */
1303 for (bre = avl_first(&brtvd->bv_tree); bre;
1304 bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
1305 brt_vdev_addref(spa, brtvd, bre,
1306 bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
1307 bre->bre_count += bre->bre_pcount;
1308 }
1309 }
1310
1311 void
brt_pending_apply(spa_t * spa,uint64_t txg)1312 brt_pending_apply(spa_t *spa, uint64_t txg)
1313 {
1314
1315 brt_rlock(spa);
1316 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1317 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1318 brt_unlock(spa);
1319
1320 brt_pending_apply_vdev(spa, brtvd, txg);
1321
1322 brt_rlock(spa);
1323 }
1324 brt_unlock(spa);
1325 }
1326
1327 static void
brt_sync_entry(dnode_t * dn,brt_entry_t * bre,dmu_tx_t * tx)1328 brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
1329 {
1330 uint64_t off = BRE_OFFSET(bre);
1331
1332 if (bre->bre_pcount == 0) {
1333 /* The net change is zero, nothing to do in ZAP. */
1334 } else if (bre->bre_count == 0) {
1335 int error = zap_remove_uint64_by_dnode(dn, &off,
1336 BRT_KEY_WORDS, tx);
1337 VERIFY(error == 0 || error == ENOENT);
1338 } else {
1339 VERIFY0(zap_update_uint64_by_dnode(dn, &off,
1340 BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
1341 &bre->bre_count, tx));
1342 }
1343 }
1344
1345 static void
brt_sync_table(spa_t * spa,dmu_tx_t * tx)1346 brt_sync_table(spa_t *spa, dmu_tx_t *tx)
1347 {
1348 brt_entry_t *bre;
1349
1350 brt_rlock(spa);
1351 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1352 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1353 brt_unlock(spa);
1354
1355 if (!brtvd->bv_meta_dirty) {
1356 ASSERT(!brtvd->bv_entcount_dirty);
1357 ASSERT0(avl_numnodes(&brtvd->bv_tree));
1358 brt_rlock(spa);
1359 continue;
1360 }
1361
1362 ASSERT(!brtvd->bv_entcount_dirty ||
1363 avl_numnodes(&brtvd->bv_tree) != 0);
1364
1365 if (brtvd->bv_mos_brtvdev == 0)
1366 brt_vdev_create(spa, brtvd, tx);
1367
1368 void *c = NULL;
1369 while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1370 brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
1371 kmem_cache_free(brt_entry_cache, bre);
1372 }
1373
1374 #ifdef ZFS_DEBUG
1375 if (zfs_flags & ZFS_DEBUG_BRT)
1376 brt_vdev_dump(brtvd);
1377 #endif
1378 if (brtvd->bv_totalcount == 0)
1379 brt_vdev_destroy(spa, brtvd, tx);
1380 else
1381 brt_vdev_sync(spa, brtvd, tx);
1382 brt_rlock(spa);
1383 }
1384 brt_unlock(spa);
1385 }
1386
1387 void
brt_sync(spa_t * spa,uint64_t txg)1388 brt_sync(spa_t *spa, uint64_t txg)
1389 {
1390 dmu_tx_t *tx;
1391 uint64_t vdevid;
1392
1393 ASSERT3U(spa_syncing_txg(spa), ==, txg);
1394
1395 brt_rlock(spa);
1396 for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
1397 if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
1398 break;
1399 }
1400 if (vdevid >= spa->spa_brt_nvdevs) {
1401 brt_unlock(spa);
1402 return;
1403 }
1404 brt_unlock(spa);
1405
1406 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1407 brt_sync_table(spa, tx);
1408 dmu_tx_commit(tx);
1409 }
1410
1411 static void
brt_alloc(spa_t * spa)1412 brt_alloc(spa_t *spa)
1413 {
1414 rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
1415 spa->spa_brt_vdevs = NULL;
1416 spa->spa_brt_nvdevs = 0;
1417 spa->spa_brt_rangesize = 0;
1418 }
1419
1420 void
brt_create(spa_t * spa)1421 brt_create(spa_t *spa)
1422 {
1423 brt_alloc(spa);
1424 spa->spa_brt_rangesize = BRT_RANGESIZE;
1425 }
1426
1427 int
brt_load(spa_t * spa)1428 brt_load(spa_t *spa)
1429 {
1430 int error = 0;
1431
1432 brt_alloc(spa);
1433 brt_wlock(spa);
1434 for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
1435 vdevid++) {
1436 char name[64];
1437 uint64_t mos_brtvdev;
1438
1439 /* Look if this vdev had active block cloning. */
1440 snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
1441 (u_longlong_t)vdevid);
1442 error = zap_lookup(spa->spa_meta_objset,
1443 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1444 &mos_brtvdev);
1445 if (error == ENOENT) {
1446 error = 0;
1447 continue;
1448 }
1449 if (error != 0)
1450 break;
1451
1452 /* If it did, then allocate them all and load this one. */
1453 brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
1454 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
1455 rw_enter(&brtvd->bv_lock, RW_WRITER);
1456 brtvd->bv_mos_brtvdev = mos_brtvdev;
1457 error = brt_vdev_load(spa, brtvd);
1458 rw_exit(&brtvd->bv_lock);
1459 if (error != 0)
1460 break;
1461 }
1462
1463 if (spa->spa_brt_rangesize == 0)
1464 spa->spa_brt_rangesize = BRT_RANGESIZE;
1465 brt_unlock(spa);
1466 return (error);
1467 }
1468
1469 void
brt_unload(spa_t * spa)1470 brt_unload(spa_t *spa)
1471 {
1472 if (spa->spa_brt_rangesize == 0)
1473 return;
1474 brt_vdevs_free(spa);
1475 rw_destroy(&spa->spa_brt_lock);
1476 spa->spa_brt_rangesize = 0;
1477 }
1478
1479 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
1480 "Enable prefetching of BRT ZAP entries");
1481 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
1482 "BRT ZAP leaf blockshift");
1483 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
1484 "BRT ZAP indirect blockshift");
1485