xref: /freebsd/sys/contrib/openzfs/module/zfs/ddt.c (revision 546d3d08e5993cbe2d6141b256e8c2ebad5aa102)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
26  * Copyright (c) 2022 by Pawel Jakub Dawidek
27  * Copyright (c) 2019, 2023, Klara Inc.
28  */
29 
30 #include <sys/zfs_context.h>
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/zio.h>
34 #include <sys/ddt.h>
35 #include <sys/ddt_impl.h>
36 #include <sys/zap.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/arc.h>
39 #include <sys/dsl_pool.h>
40 #include <sys/zio_checksum.h>
41 #include <sys/dsl_scan.h>
42 #include <sys/abd.h>
43 #include <sys/zfeature.h>
44 
45 /*
46  * # DDT: Deduplication tables
47  *
48  * The dedup subsystem provides block-level deduplication. When enabled, blocks
49  * to be written will have the dedup (D) bit set, which causes them to be
50  * tracked in a "dedup table", or DDT. If a block has been seen before (exists
51  * in the DDT), then instead of being written, it will be made to reference
52  * the existing on-disk data, and its refcount in the DDT will be bumped.
53  *
54  * ## Dedup tables and entries
55  *
56  * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
57  * (ddt_key_t) made up of a block's checksum and certain properties, and a "value"
58  * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
59  * time and refcount. Together these are enough to track references to a
60  * specific block, to build a valid block pointer to reference that block (for
61  * freeing, scrubbing, etc), and to fill a new block pointer with the missing
62  * pieces to make it seem like it was written.
63  *
64  * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
65  * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
66  * object data formats, each with their own implementations) and "classes"
67  * (ddt_class_t, an instance of a storage type object, for entries with a specific
68  * characteristic). An entry (key) will only ever exist on one of these objects
69  * at any given time, but may be moved from one to another if its type or
70  * class changes.
71  *
72  * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
73  * is to be written, before DVAs have been allocated, ddt_lookup() is called to
74  * see if the block has been seen before. If it's not found, the write proceeds
75  * as normal, and after it succeeds, a new entry is created. If it is found, we
76  * fill the BP with the DVAs from the entry, increment the refcount and cause
77  * the write IO to return immediately.
78  *
79  * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
80  * block for the same content/checksum. The slot is selected based on the
81  * zp_copies parameter the block is written with, that is, the number of DVAs
82  * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for the
83  * now-removed "dedupditto" feature. These are no longer written, and will be
84  * freed if encountered on old pools.
85  *
86  * If the "fast_dedup" feature is enabled, new dedup tables will be created
87  * with the "flat phys" option. In this mode, there is only one ddt_phys_t
88  * slot. If a write is issued for an entry that exists, but has fewer DVAs,
89  * then only as many new DVAs are allocated and written to make up the
90  * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
91  * new DVAs.
92  *
93  * ## Lifetime of an entry
94  *
95  * A DDT can be enormous, and typically is not held in memory all at once.
96  * Instead, the changes to an entry are tracked in memory, and written down to
97  * disk at the end of each txg.
98  *
99  * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
100  * (ddt_tree).  At the start of a txg, ddt_tree is empty. When an entry is
101  * required for IO, ddt_lookup() is called. If an entry already exists on
102  * ddt_tree, it is returned. Otherwise, a new one is created, and the
103  * type/class objects for the DDT are searched for that key. If it's found, its
104  * value is copied into the live entry. If not, an empty entry is created.
105  *
106  * The live entry will be modified during the txg, usually by modifying the
107  * refcount, but sometimes by adding or updating DVAs. At the end of the txg
108  * (during spa_sync()), type and class are recalculated for the entry (see
109  * ddt_sync_entry()), and the entry is written to the appropriate storage
110  * object and (if necessary) removed from an old one. ddt_tree is cleared and
111  * the next txg can start.
112  *
113  * ## Dedup quota
114  *
115  * A maximum size for all DDTs on the pool can be set with the
116  * dedup_table_quota property. This is determined in ddt_over_quota() and
117  * enforced during ddt_lookup(). If the pool is at or over its quota limit,
118  * ddt_lookup() will only return entries for existing blocks, as updates are
119  * still possible. New entries will not be created; instead, ddt_lookup() will
120  * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
121  * the D bit on the block and reissue the IO as a regular write. The block will
122  * not be deduplicated.
123  *
124  * Note that this is based on the on-disk size of the dedup store. Reclaiming
125  * this space after deleting entries relies on the ZAP "shrinking" behaviour,
126  * without which, no space would be recovered and the DDT would continue to be
127  * considered "over quota". See zap_shrink_enabled.
128  *
129  * ## Dedup table pruning
130  *
131  * As a complement to the dedup quota feature, ddtprune allows removal of older
132  * non-duplicate entries to make room for newer duplicate entries. The amount
133  * to prune can be based on a target percentage of the unique entries or based
134  * on the age (i.e., prune unique entries older than N days).
135  *
136  * ## Dedup log
137  *
138  * Historically, all entries modified on a txg were written back to dedup
139  * storage objects at the end of every txg. This could cause significant
140  * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
141  * and so required reading the whole node, updating the entry, and writing it
142  * back. On busy pools, this could add serious IO and memory overheads.
143  *
144  * To address this, the dedup log was added. If the "fast_dedup" feature is
145  * enabled, at the end of each txg, modified entries will be copied to an
146  * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
147  * same block is requested again, the in-memory object will be checked first,
148  * and if it's there, the entry is inflated back onto the live tree without going
149  * to storage. The on-disk log is only read at pool import time, to reload the
150  * in-memory log.
151  *
152  * Each txg, some amount of the in-memory log will be flushed out to a DDT
153  * storage object (i.e. a ZAP) as normal. OpenZFS will try hard to flush enough
154  * to keep up with the rate of change on dedup entries, but not so much that it
155  * would impact overall throughput, and without using too much memory. See the
156  * zfs_dedup_log_* tunables in zfs(4) for more details.
157  *
158  * ## Repair IO
159  *
160  * If a read on a dedup block fails, but there are other copies of the block in
161  * the other ddt_phys_t slots, reads will be issued for those instead
162  * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
163  * the caller, and a copy is stashed on the entry's dde_repair_abd.
164  *
165  * During the end-of-txg sync, any entries with a dde_repair_abd get a
166  * "rewrite" write issued for the original block pointer, with the data read
167  * from the alternate block. If the block is actually damaged, this will invoke
168  * the pool's "self-healing" mechanism, and repair the block.
169  *
170  * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
171  * use, so there is only ever one ddt_phys_t slot. The repair process will
172  * still happen in this case, though it is unlikely to succeed as there will
173  * usually be no other equivalent blocks to fall back on (though there might
174  * be, if this was an early version of a dedup'd block that has since been
175  * extended).
176  *
177  * Note that this repair mechanism is in addition to and separate from the
178  * regular OpenZFS scrub and self-healing mechanisms.
179  *
180  * ## Scanning (scrub/resilver)
181  *
182  * If dedup is active, the scrub machinery will walk the dedup table first, and
183  * scrub all blocks with refcnt > 1 first. After that it will move on to the
184  * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
185  * In this way, heavily deduplicated blocks are only scrubbed once. See the
186  * commentary on dsl_scan_ddt() for more details.
187  *
188  * Walking the DDT is done via ddt_walk(). The current position is stored in a
189  * ddt_bookmark_t, which represents a stable position in the storage object.
190  * This bookmark is stored by the scan machinery, and must reference the same
191  * position on the object even if the object changes, the pool is exported, or
192  * OpenZFS is upgraded.
193  *
194  * If the "fast_dedup" feature is enabled and the table has a log, the scan
195  * cannot begin until entries on the log are flushed, as the on-disk log has no
196  * concept of a "stable position". Instead, the log flushing process will enter
197  * a more aggressive mode, to flush out as much as is necessary as soon as
198  * possible, in order to begin the scan as soon as possible.
199  *
200  * ## Interaction with block cloning
201  *
202  * If block cloning and dedup are both enabled on a pool, BRT will look for the
203  * dedup bit on an incoming block pointer. If set, it will call into the DDT
204  * (ddt_addref()) to add a reference to the block, instead of adding a
205  * reference to the BRT. See brt_pending_apply().
206  */
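
/*
 * Illustrative sketch only (assumed usage, not code from this file; config
 * locking, error handling and the over-quota path are omitted): a consumer
 * that wants another reference on an already-deduplicated block finds the
 * phys variant matching the BP and bumps its refcount, roughly as the block
 * cloning path described above does:
 *
 *	ddt_t *ddt = ddt_select(spa, bp);
 *	ddt_enter(ddt);
 *	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
 *	if (dde != NULL) {
 *		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 *		if (v != DDT_PHYS_NONE)
 *			ddt_phys_addref(dde->dde_phys, v);
 *	}
 *	ddt_exit(ddt);
 */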
207 
208 /*
209  * These are the only checksums valid for dedup. They must match the list
210  * from dedup_table in zfs_prop.c
211  */
212 #define	DDT_CHECKSUM_VALID(c)	\
213 	(c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
214 	c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
215 	c == ZIO_CHECKSUM_BLAKE3)
216 
217 static kmem_cache_t *ddt_cache;
218 
219 static kmem_cache_t *ddt_entry_flat_cache;
220 static kmem_cache_t *ddt_entry_trad_cache;
221 
222 #define	DDT_ENTRY_FLAT_SIZE	(sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
223 #define	DDT_ENTRY_TRAD_SIZE	(sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
224 
225 #define	DDT_ENTRY_SIZE(ddt)	\
226 	_DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
227 
228 /*
229  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
230  */
231 int zfs_dedup_prefetch = 0;
232 
233 /*
234  * If the dedup class cannot satisfy a DDT allocation, treat as over quota
235  * for this many TXGs.
236  */
237 uint_t dedup_class_wait_txgs = 5;
238 
239 /*
240  * How many DDT prune entries to add to the DDT sync AVL tree.
241  * Note these additional entries have a memory footprint of a
242  * ddt_entry_t (216 bytes).
243  */
244 static uint32_t zfs_ddt_prunes_per_txg = 50000;
245 
246 /*
247  * For testing, synthesize aged DDT entries
248  * (in global scope for ztest)
249  */
250 boolean_t ddt_prune_artificial_age = B_FALSE;
251 boolean_t ddt_dump_prune_histogram = B_FALSE;
252 
253 /*
254  * Minimum time to flush per txg.
255  */
256 uint_t zfs_dedup_log_flush_min_time_ms = 1000;
257 
258 /*
259  * Minimum entries to flush per txg.
260  */
261 uint_t zfs_dedup_log_flush_entries_min = 200;
262 
263 /*
264  * Target number of TXGs until the whole dedup log has been flushed.
265  * The log size will float around this value times the ingest rate.
266  */
267 uint_t zfs_dedup_log_flush_txgs = 100;
268 
269 /*
270  * Maximum entries to flush per txg. Used for testing the dedup log.
271  */
272 uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
273 
274 /*
275  * Soft cap for the size of the current dedup log. If the log is larger
276  * than this size, we slightly increase the aggressiveness of the flushing to
277  * try to bring it back down to the soft cap.
278  */
279 uint_t zfs_dedup_log_cap = UINT_MAX;
280 
281 /*
282  * If this is set to B_TRUE, the cap above acts more like a hard cap:
283  * flushing is significantly more aggressive, increasing the minimum amount we
284  * flush per txg, as well as the maximum.
285  */
286 boolean_t zfs_dedup_log_hard_cap = B_FALSE;
287 
288 /*
289  * Number of txgs to average flow rates across.
290  */
291 uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
292 
293 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
294 	&ddt_zap_ops,
295 };
296 
297 static const char *const ddt_class_name[DDT_CLASSES] = {
298 	"ditto",
299 	"duplicate",
300 	"unique",
301 };
302 
303 /*
304  * DDT feature flags automatically enabled for each on-disk version. Note that
305  * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
306  */
307 static const uint64_t ddt_version_flags[] = {
308 	[DDT_VERSION_LEGACY] = 0,
309 	[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
310 };
311 
312 /* per-DDT kstats */
313 typedef struct {
314 	/* total lookups and whether they returned new or existing entries */
315 	kstat_named_t dds_lookup;
316 	kstat_named_t dds_lookup_new;
317 	kstat_named_t dds_lookup_existing;
318 
319 	/* entries found on live tree, and if we had to wait for load */
320 	kstat_named_t dds_lookup_live_hit;
321 	kstat_named_t dds_lookup_live_wait;
322 	kstat_named_t dds_lookup_live_miss;
323 
324 	/* entries found on log trees */
325 	kstat_named_t dds_lookup_log_hit;
326 	kstat_named_t dds_lookup_log_active_hit;
327 	kstat_named_t dds_lookup_log_flushing_hit;
328 	kstat_named_t dds_lookup_log_miss;
329 
330 	/* entries found on store objects */
331 	kstat_named_t dds_lookup_stored_hit;
332 	kstat_named_t dds_lookup_stored_miss;
333 
334 	/* number of entries on log trees */
335 	kstat_named_t dds_log_active_entries;
336 	kstat_named_t dds_log_flushing_entries;
337 
338 	/* avg updated/flushed entries per txg */
339 	kstat_named_t dds_log_ingest_rate;
340 	kstat_named_t dds_log_flush_rate;
341 	kstat_named_t dds_log_flush_time_rate;
342 } ddt_kstats_t;
343 
344 static const ddt_kstats_t ddt_kstats_template = {
345 	{ "lookup",			KSTAT_DATA_UINT64 },
346 	{ "lookup_new",			KSTAT_DATA_UINT64 },
347 	{ "lookup_existing",		KSTAT_DATA_UINT64 },
348 	{ "lookup_live_hit",		KSTAT_DATA_UINT64 },
349 	{ "lookup_live_wait",		KSTAT_DATA_UINT64 },
350 	{ "lookup_live_miss",		KSTAT_DATA_UINT64 },
351 	{ "lookup_log_hit",		KSTAT_DATA_UINT64 },
352 	{ "lookup_log_active_hit",	KSTAT_DATA_UINT64 },
353 	{ "lookup_log_flushing_hit",	KSTAT_DATA_UINT64 },
354 	{ "lookup_log_miss",		KSTAT_DATA_UINT64 },
355 	{ "lookup_stored_hit",		KSTAT_DATA_UINT64 },
356 	{ "lookup_stored_miss",		KSTAT_DATA_UINT64 },
357 	{ "log_active_entries",		KSTAT_DATA_UINT64 },
358 	{ "log_flushing_entries",	KSTAT_DATA_UINT64 },
359 	{ "log_ingest_rate",		KSTAT_DATA_UINT32 },
360 	{ "log_flush_rate",		KSTAT_DATA_UINT32 },
361 	{ "log_flush_time_rate",	KSTAT_DATA_UINT32 },
362 };
363 
364 #ifdef _KERNEL
365 /*
366  * Hot-path lookup counters use wmsums to avoid cache line bouncing.
367  * DDT_KSTAT_BUMP: Increment a wmsum counter (lookup stats).
368  *
369  * Sync-only counters use direct kstat assignment (no atomics needed).
370  * DDT_KSTAT_SET: Set a value (log entry counts, rates).
371  * DDT_KSTAT_SUB: Subtract from a value (decrement log entry counts).
372  * DDT_KSTAT_ZERO: Zero a value (clear log entry counts).
373  */
374 #define	_DDT_KSTAT_STAT(ddt, stat) \
375 	&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
376 #define	DDT_KSTAT_BUMP(ddt, stat) \
377 	wmsum_add(&(ddt)->ddt_kstat_##stat, 1)
378 #define	DDT_KSTAT_SUB(ddt, stat, val) \
379 	do { *_DDT_KSTAT_STAT(ddt, stat) -= (val); } while (0)
380 #define	DDT_KSTAT_SET(ddt, stat, val) \
381 	do { *_DDT_KSTAT_STAT(ddt, stat) = (val); } while (0)
382 #define	DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0)
383 #else
384 #define	DDT_KSTAT_BUMP(ddt, stat) do {} while (0)
385 #define	DDT_KSTAT_SUB(ddt, stat, val) do {} while (0)
386 #define	DDT_KSTAT_SET(ddt, stat, val) do {} while (0)
387 #define	DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
388 #endif /* _KERNEL */
389 
390 
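/*
 * Create the backing store object for this type/class pair, hold its dnode,
 * and record it in the DDT directory ZAP and the pool-wide DDT stats ZAP.
 */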
391 static void
392 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
393     dmu_tx_t *tx)
394 {
395 	spa_t *spa = ddt->ddt_spa;
396 	objset_t *os = ddt->ddt_os;
397 	uint64_t *objectp = &ddt->ddt_object[type][class];
398 	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
399 	    ZCHECKSUM_FLAG_DEDUP;
400 	char name[DDT_NAMELEN];
401 
402 	ASSERT3U(ddt->ddt_dir_object, >, 0);
403 
404 	ddt_object_name(ddt, type, class, name);
405 
406 	ASSERT0(*objectp);
407 	VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
408 	ASSERT3U(*objectp, !=, 0);
409 
410 	VERIFY0(dnode_hold(os, *objectp, ddt,
411 	    &ddt->ddt_object_dnode[type][class]));
412 
413 	ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
414 
415 	VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
416 	    objectp, tx));
417 
418 	VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
419 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
420 	    &ddt->ddt_histogram[type][class], tx));
421 }
422 
423 static void
424 ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
425     dmu_tx_t *tx)
426 {
427 	spa_t *spa = ddt->ddt_spa;
428 	objset_t *os = ddt->ddt_os;
429 	uint64_t *objectp = &ddt->ddt_object[type][class];
430 	uint64_t count;
431 	char name[DDT_NAMELEN];
432 
433 	ASSERT3U(ddt->ddt_dir_object, >, 0);
434 
435 	ddt_object_name(ddt, type, class, name);
436 
437 	ASSERT3U(*objectp, !=, 0);
438 	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
439 	VERIFY0(ddt_object_count(ddt, type, class, &count));
440 	VERIFY0(count);
441 	VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
442 	VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
443 	if (ddt->ddt_object_dnode[type][class] != NULL) {
444 		dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
445 		ddt->ddt_object_dnode[type][class] = NULL;
446 	}
447 	VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
448 	memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
449 
450 	*objectp = 0;
451 }
452 
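/*
 * Open an existing store object: look up its object number in the DDT
 * directory, hold its dnode, load its histogram from the stats ZAP and
 * seed the cached object statistics (count, disk and memory footprint).
 */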
453 static int
454 ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
455 {
456 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
457 	dmu_object_info_t doi;
458 	uint64_t count;
459 	char name[DDT_NAMELEN];
460 	int error;
461 
462 	if (ddt->ddt_dir_object == 0) {
463 		/*
464 		 * If we're configured but the containing dir doesn't exist
465 		 * yet, then this object can't possibly exist either.
466 		 */
467 		ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
468 		return (SET_ERROR(ENOENT));
469 	}
470 
471 	ddt_object_name(ddt, type, class, name);
472 
473 	error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
474 	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
475 	if (error != 0)
476 		return (error);
477 
478 	error = dnode_hold(ddt->ddt_os, ddt->ddt_object[type][class], ddt,
479 	    &ddt->ddt_object_dnode[type][class]);
480 	if (error != 0)
481 		return (error);
482 
483 	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
484 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
485 	    &ddt->ddt_histogram[type][class]);
486 	if (error != 0)
487 		goto error;
488 
489 	/*
490 	 * Seed the cached statistics.
491 	 */
492 	error = ddt_object_info(ddt, type, class, &doi);
493 	if (error)
494 		goto error;
495 
496 	error = ddt_object_count(ddt, type, class, &count);
497 	if (error)
498 		goto error;
499 
500 	ddo->ddo_count = count;
501 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
502 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
503 
504 	return (0);
505 
506 error:
507 	dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
508 	ddt->ddt_object_dnode[type][class] = NULL;
509 	return (error);
510 }
511 
512 static void
513 ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
514     dmu_tx_t *tx)
515 {
516 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
517 	dmu_object_info_t doi;
518 	uint64_t count;
519 	char name[DDT_NAMELEN];
520 
521 	ddt_object_name(ddt, type, class, name);
522 
523 	VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
524 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
525 	    &ddt->ddt_histogram[type][class], tx));
526 
527 	/*
528 	 * Cache DDT statistics; this is the only time they'll change.
529 	 */
530 	VERIFY0(ddt_object_info(ddt, type, class, &doi));
531 	VERIFY0(ddt_object_count(ddt, type, class, &count));
532 
533 	ddo->ddo_count = count;
534 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
535 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
536 }
537 
538 static boolean_t
539 ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
540 {
541 	return (!!ddt->ddt_object[type][class]);
542 }
543 
544 static int
545 ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
546     ddt_entry_t *dde)
547 {
548 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
549 	if (dn == NULL)
550 		return (SET_ERROR(ENOENT));
551 
552 	return (ddt_ops[type]->ddt_op_lookup(dn, &dde->dde_key,
553 	    dde->dde_phys, DDT_PHYS_SIZE(ddt)));
554 }
555 
556 static int
557 ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
558     const ddt_key_t *ddk)
559 {
560 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
561 	if (dn == NULL)
562 		return (SET_ERROR(ENOENT));
563 
564 	return (ddt_ops[type]->ddt_op_contains(dn, ddk));
565 }
566 
567 static void
568 ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
569     const ddt_key_t *ddk)
570 {
571 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
572 	if (dn == NULL)
573 		return;
574 
575 	ddt_ops[type]->ddt_op_prefetch(dn, ddk);
576 }
577 
578 static void
579 ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
580 {
581 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
582 	if (dn == NULL)
583 		return;
584 
585 	ddt_ops[type]->ddt_op_prefetch_all(dn);
586 }
587 
588 static int
589 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
590     const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
591 {
592 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
593 	ASSERT(dn != NULL);
594 
595 	return (ddt_ops[type]->ddt_op_update(dn, &ddlwe->ddlwe_key,
596 	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
597 }
598 
599 static int
600 ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
601     const ddt_key_t *ddk, dmu_tx_t *tx)
602 {
603 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
604 	ASSERT(dn != NULL);
605 
606 	return (ddt_ops[type]->ddt_op_remove(dn, ddk, tx));
607 }
608 
609 int
610 ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
611     uint64_t *walk, ddt_lightweight_entry_t *ddlwe)
612 {
613 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
614 	ASSERT(dn != NULL);
615 
616 	int error = ddt_ops[type]->ddt_op_walk(dn, walk, &ddlwe->ddlwe_key,
617 	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
618 	if (error == 0) {
619 		ddlwe->ddlwe_type = type;
620 		ddlwe->ddlwe_class = class;
621 		return (0);
622 	}
623 	return (error);
624 }
625 
626 int
627 ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
628     uint64_t *count)
629 {
630 	dnode_t *dn = ddt->ddt_object_dnode[type][class];
631 	ASSERT(dn != NULL);
632 
633 	return (ddt_ops[type]->ddt_op_count(dn, count));
634 }
635 
636 int
637 ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
638     dmu_object_info_t *doi)
639 {
640 	if (!ddt_object_exists(ddt, type, class))
641 		return (SET_ERROR(ENOENT));
642 
643 	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
644 	    doi));
645 }
646 
647 void
648 ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
649     char *name)
650 {
651 	(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
652 	    zio_checksum_table[ddt->ddt_checksum].ci_name,
653 	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
654 }
655 
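/*
 * Copy the DVAs from the given phys variant into bp and set its birth
 * times (logical birth from the caller-supplied txg, physical birth from
 * the phys).
 */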
656 void
657 ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
658     blkptr_t *bp, uint64_t txg)
659 {
660 	ASSERT3U(txg, !=, 0);
661 	ASSERT3U(v, <, DDT_PHYS_NONE);
662 	uint64_t phys_birth;
663 	const dva_t *dvap;
664 
665 	if (v == DDT_PHYS_FLAT) {
666 		phys_birth = ddp->ddp_flat.ddp_phys_birth;
667 		dvap = ddp->ddp_flat.ddp_dva;
668 	} else {
669 		phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
670 		dvap = ddp->ddp_trad[v].ddp_dva;
671 	}
672 
673 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
674 		bp->blk_dva[d] = dvap[d];
675 	BP_SET_BIRTH(bp, txg, phys_birth);
676 }
677 
678 /*
679  * The bp created via this function may be used for repairs and scrub, but it
680  * will be missing the salt / IV required to do a full decrypting read.
681  */
682 void
683 ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
684     const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
685 {
686 	BP_ZERO(bp);
687 
688 	if (ddp != NULL)
689 		ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
690 
691 	bp->blk_cksum = ddk->ddk_cksum;
692 
693 	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
694 	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
695 	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
696 	BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
697 	BP_SET_FILL(bp, 1);
698 	BP_SET_CHECKSUM(bp, checksum);
699 	BP_SET_TYPE(bp, DMU_OT_DEDUP);
700 	BP_SET_LEVEL(bp, 0);
701 	BP_SET_DEDUP(bp, 1);
702 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
703 }
704 
705 void
706 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
707 {
708 	ddk->ddk_cksum = bp->blk_cksum;
709 	ddk->ddk_prop = 0;
710 
711 	ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
712 
713 	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
714 	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
715 	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
716 	DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
717 }
718 
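/*
 * Merge the DVAs from bp into any empty DVA slots of the phys variant,
 * and record the physical birth time if the variant didn't have one yet.
 * This is how a flat entry picks up extra copies written later.
 */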
719 void
720 ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
721 {
722 	ASSERT3U(v, <, DDT_PHYS_NONE);
723 	int bp_ndvas = BP_GET_NDVAS(bp);
724 	int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
725 	    SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
726 	dva_t *dvas = (v == DDT_PHYS_FLAT) ?
727 	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
728 
729 	int s = 0, d = 0;
730 	while (s < bp_ndvas && d < ddp_max_dvas) {
731 		if (DVA_IS_VALID(&dvas[d])) {
732 			d++;
733 			continue;
734 		}
735 		dvas[d] = bp->blk_dva[s];
736 		s++; d++;
737 	}
738 
739 	/*
740 	 * If the caller offered us more DVAs than we can fit, something has
741 	 * gone wrong in their accounting. zio_ddt_write() should never ask for
742 	 * more than we need.
743 	 */
744 	ASSERT3U(s, ==, bp_ndvas);
745 
746 	if (BP_IS_ENCRYPTED(bp))
747 		dvas[2] = bp->blk_dva[2];
748 
749 	if (ddt_phys_birth(ddp, v) == 0) {
750 		if (v == DDT_PHYS_FLAT) {
751 			ddp->ddp_flat.ddp_phys_birth =
752 			    BP_GET_PHYSICAL_BIRTH(bp);
753 		} else {
754 			ddp->ddp_trad[v].ddp_phys_birth =
755 			    BP_GET_PHYSICAL_BIRTH(bp);
756 		}
757 	}
758 }
759 
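/*
 * Undo a ddt_phys_extend(): restore the variant's DVA array from the
 * original copy, and clear the birth time if the original had none.
 */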
760 void
761 ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig,
762     ddt_phys_variant_t v)
763 {
764 	ASSERT3U(v, <, DDT_PHYS_NONE);
765 	dva_t *cur_dvas = (v == DDT_PHYS_FLAT) ?
766 	    cur->ddp_flat.ddp_dva : cur->ddp_trad[v].ddp_dva;
767 	dva_t *orig_dvas = (v == DDT_PHYS_FLAT) ?
768 	    orig->ddp_flat.ddp_dva : orig->ddp_trad[v].ddp_dva;
769 
770 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
771 		cur_dvas[d] = orig_dvas[d];
772 
773 	if (ddt_phys_birth(orig, v) == 0) {
774 		if (v == DDT_PHYS_FLAT)
775 			cur->ddp_flat.ddp_phys_birth = 0;
776 		else
777 			cur->ddp_trad[v].ddp_phys_birth = 0;
778 	}
779 }
780 
781 void
782 ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
783     ddt_phys_variant_t v)
784 {
785 	ASSERT3U(v, <, DDT_PHYS_NONE);
786 
787 	if (v == DDT_PHYS_FLAT)
788 		dst->ddp_flat = src->ddp_flat;
789 	else
790 		dst->ddp_trad[v] = src->ddp_trad[v];
791 }
792 
793 void
794 ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
795 {
796 	ASSERT3U(v, <, DDT_PHYS_NONE);
797 
798 	if (v == DDT_PHYS_FLAT)
799 		memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
800 	else
801 		memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
802 }
803 
804 static uint64_t
805 ddt_class_start(void)
806 {
807 	uint64_t start = gethrestime_sec();
808 
809 	if (unlikely(ddt_prune_artificial_age)) {
810 		/*
811 		 * debug aide -- simulate a wider distribution
812 		 * so we don't have to wait for an aged DDT
813 		 * to test prune.
814 		 */
815 		int range = 1 << 21;
816 		int percent = random_in_range(100);
817 		if (percent < 50) {
818 			range = range >> 4;
819 		} else if (percent > 75) {
820 			range /= 2;
821 		}
822 		start -= random_in_range(range);
823 	}
824 
825 	return (start);
826 }
827 
828 void
829 ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
830 {
831 	ASSERT3U(v, <, DDT_PHYS_NONE);
832 
833 	if (v == DDT_PHYS_FLAT)
834 		ddp->ddp_flat.ddp_refcnt++;
835 	else
836 		ddp->ddp_trad[v].ddp_refcnt++;
837 }
838 
839 uint64_t
840 ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
841 {
842 	ASSERT3U(v, <, DDT_PHYS_NONE);
843 
844 	uint64_t *refcntp;
845 
846 	if (v == DDT_PHYS_FLAT)
847 		refcntp = &ddp->ddp_flat.ddp_refcnt;
848 	else
849 		refcntp = &ddp->ddp_trad[v].ddp_refcnt;
850 
851 	ASSERT3U(*refcntp, >, 0);
852 	(*refcntp)--;
853 	return (*refcntp);
854 }
855 
856 static void
857 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
858     ddt_phys_variant_t v, uint64_t txg)
859 {
860 	blkptr_t blk;
861 
862 	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
863 
864 	/*
865 	 * We clear the dedup bit so that zio_free() will actually free the
866 	 * space, rather than just decrementing the refcount in the DDT.
867 	 */
868 	BP_SET_DEDUP(&blk, 0);
869 
870 	ddt_phys_clear(ddp, v);
871 	zio_free(ddt->ddt_spa, txg, &blk);
872 }
873 
874 uint64_t
875 ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
876 {
877 	ASSERT3U(v, <, DDT_PHYS_NONE);
878 
879 	if (v == DDT_PHYS_FLAT)
880 		return (ddp->ddp_flat.ddp_phys_birth);
881 	else
882 		return (ddp->ddp_trad[v].ddp_phys_birth);
883 }
884 
885 int
886 ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
887 {
888 	ASSERT3U(v, <, DDT_PHYS_NONE);
889 
890 	const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
891 	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
892 
893 	return (DVA_GET_GANG(&dvas[0]));
894 }
895 
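/*
 * Count the valid DVAs in the variant. For encrypted blocks the third
 * slot carries the salt/IV rather than a real DVA, so it is not counted.
 */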
896 int
897 ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
898     boolean_t encrypted)
899 {
900 	ASSERT3U(v, <, DDT_PHYS_NONE);
901 
902 	const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
903 	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
904 
905 	return (DVA_IS_VALID(&dvas[0]) +
906 	    DVA_IS_VALID(&dvas[1]) +
907 	    DVA_IS_VALID(&dvas[2]) * !encrypted);
908 }
909 
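/*
 * Find the phys variant whose first DVA and physical birth time match bp,
 * or DDT_PHYS_NONE if no variant does.
 */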
910 ddt_phys_variant_t
911 ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
912 {
913 	if (dde == NULL)
914 		return (DDT_PHYS_NONE);
915 
916 	const ddt_univ_phys_t *ddp = dde->dde_phys;
917 
918 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
919 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
920 		    BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
921 			return (DDT_PHYS_FLAT);
922 		}
923 	} else /* traditional phys */ {
924 		for (int p = 0; p < DDT_PHYS_MAX; p++) {
925 			if (DVA_EQUAL(BP_IDENTITY(bp),
926 			    &ddp->ddp_trad[p].ddp_dva[0]) &&
927 			    BP_GET_PHYSICAL_BIRTH(bp) ==
928 			    ddp->ddp_trad[p].ddp_phys_birth) {
929 				return (p);
930 			}
931 		}
932 	}
933 	return (DDT_PHYS_NONE);
934 }
935 
936 uint64_t
937 ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
938 {
939 	ASSERT3U(v, <, DDT_PHYS_NONE);
940 
941 	if (v == DDT_PHYS_FLAT)
942 		return (ddp->ddp_flat.ddp_refcnt);
943 	else
944 		return (ddp->ddp_trad[v].ddp_refcnt);
945 }
946 
947 uint64_t
948 ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
949 {
950 	uint64_t refcnt = 0;
951 
952 	if (ddt->ddt_flags & DDT_FLAG_FLAT)
953 		refcnt = ddp->ddp_flat.ddp_refcnt;
954 	else
955 		for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
956 			refcnt += ddp->ddp_trad[v].ddp_refcnt;
957 
958 	return (refcnt);
959 }
960 
961 ddt_t *
962 ddt_select(spa_t *spa, const blkptr_t *bp)
963 {
964 	ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
965 	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
966 }
967 
968 void
969 ddt_enter(ddt_t *ddt)
970 {
971 	mutex_enter(&ddt->ddt_lock);
972 }
973 
974 void
975 ddt_exit(ddt_t *ddt)
976 {
977 	mutex_exit(&ddt->ddt_lock);
978 }
979 
980 void
981 ddt_init(void)
982 {
983 	ddt_cache = kmem_cache_create("ddt_cache",
984 	    sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
985 	ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
986 	    DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
987 	ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
988 	    DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
989 
990 	ddt_log_init();
991 }
992 
993 void
994 ddt_fini(void)
995 {
996 	ddt_log_fini();
997 
998 	kmem_cache_destroy(ddt_entry_trad_cache);
999 	kmem_cache_destroy(ddt_entry_flat_cache);
1000 	kmem_cache_destroy(ddt_cache);
1001 }
1002 
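/*
 * Allocate a zeroed live entry of the right size for this table's phys
 * layout (flat or traditional) and set its key.
 */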
1003 static ddt_entry_t *
1004 ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
1005 {
1006 	ddt_entry_t *dde;
1007 
1008 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
1009 		dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
1010 		memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
1011 	} else {
1012 		dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
1013 		memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
1014 	}
1015 
1016 	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
1017 
1018 	dde->dde_key = *ddk;
1019 
1020 	return (dde);
1021 }
1022 
1023 void
1024 ddt_alloc_entry_io(ddt_entry_t *dde)
1025 {
1026 	if (dde->dde_io != NULL)
1027 		return;
1028 
1029 	dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
1030 	mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
1031 }
1032 
1033 static void
1034 ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
1035 {
1036 	if (dde->dde_io != NULL) {
1037 		for (int p = 0; p < DDT_NPHYS(ddt); p++)
1038 			ASSERT0P(dde->dde_io->dde_lead_zio[p]);
1039 
1040 		if (dde->dde_io->dde_repair_abd != NULL)
1041 			abd_free(dde->dde_io->dde_repair_abd);
1042 
1043 		mutex_destroy(&dde->dde_io->dde_io_lock);
1044 		kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
1045 	}
1046 
1047 	cv_destroy(&dde->dde_cv);
1048 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
1049 	    ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
1050 }
1051 
1052 void
1053 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
1054 {
1055 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1056 
1057 	avl_remove(&ddt->ddt_tree, dde);
1058 	ddt_free(ddt, dde);
1059 }
1060 
1061 /*
1062  * We're considered over quota when we hit 85% full, or for larger drives,
1063  * when there is less than 8GB free.
1064  */
1065 static boolean_t
1066 ddt_special_over_quota(metaslab_class_t *mc)
1067 {
1068 	uint64_t allocated = metaslab_class_get_alloc(mc);
1069 	uint64_t capacity = metaslab_class_get_space(mc);
1070 	uint64_t limit = MAX(capacity * 85 / 100,
1071 	    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
1072 	return (allocated >= limit);
1073 }
1074 
1075 /*
1076  * Check if the DDT is over its quota.  This can be due to a few conditions:
1077  *   1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
1078  *       exceeds this limit
1079  *
1080  *   2. 'dedup_table_quota' property is set to automatic and
1081  *      a. the dedup or special allocation class could not satisfy a DDT
1082  *         allocation in a recent transaction
1083  *      b. the dedup or special allocation class has exceeded its 85% limit
1084  */
1085 static boolean_t
1086 ddt_over_quota(spa_t *spa)
1087 {
1088 	if (spa->spa_dedup_table_quota == 0)
1089 		return (B_FALSE);
1090 
1091 	if (spa->spa_dedup_table_quota != UINT64_MAX)
1092 		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
1093 
1094 	/*
1095 	 * Over quota if we have to allocate outside of the dedup/special class.
1096 	 */
1097 	if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
1098 	    dedup_class_wait_txgs) {
1099 		/* Waiting for some deferred frees to be processed */
1100 		return (B_TRUE);
1101 	}
1102 
1103 	/*
1104 	 * For automatic quota, table size is limited by dedup or special class
1105 	 */
1106 	if (spa_has_dedup(spa))
1107 		return (ddt_special_over_quota(spa_dedup_class(spa)));
1108 	else if (spa_special_has_ddt(spa))
1109 		return (ddt_special_over_quota(spa_special_class(spa)));
1110 
1111 	return (B_FALSE);
1112 }
1113 
1114 void
1115 ddt_prefetch_all(spa_t *spa)
1116 {
1117 	/*
1118 	 * Load all DDT entries for each type/class combination. This is
1119 	 * intended to perform a prefetch on all such blocks. For the same
1120 	 * reason that ddt_prefetch isn't locked, this is also not locked.
1121 	 */
1122 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1123 		ddt_t *ddt = spa->spa_ddt[c];
1124 		if (!ddt)
1125 			continue;
1126 
1127 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1128 			for (ddt_class_t class = 0; class < DDT_CLASSES;
1129 			    class++) {
1130 				ddt_object_prefetch_all(ddt, type, class);
1131 			}
1132 		}
1133 	}
1134 }
1135 
1136 static int ddt_configure(ddt_t *ddt, boolean_t new);
1137 
1138 /*
1139  * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
1140  * to the ones in the entry. If they're different, then the passed-in BP is
1141  * from a previous generation of this entry (i.e. was previously pruned) and we
1142  * have to act like the entry doesn't exist at all.
1143  *
1144  * This should only happen during a lookup to free the block (zio_ddt_free()).
1145  *
1146  * XXX this is similar in spirit to ddt_phys_select(), maybe can combine
1147  *       -- robn, 2024-02-09
1148  */
1149 static boolean_t
1150 ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
1151 {
1152 	/* If the BP has no DVAs, then this entry is good */
1153 	uint_t ndvas = BP_GET_NDVAS(bp);
1154 	if (ndvas == 0)
1155 		return (B_TRUE);
1156 
1157 	/*
1158 	 * Only checking the phys for the copies. For flat, there's only one;
1159 	 * for trad it'll be the one that has the matching set of DVAs.
1160 	 */
1161 	const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
1162 	    dde->dde_phys->ddp_flat.ddp_dva :
1163 	    dde->dde_phys->ddp_trad[ndvas].ddp_dva;
1164 
1165 	/*
1166 	 * Compare entry DVAs with the BP. They should all be there, but
1167  * there's not really anything we can do if it's only partial anyway,
1168 	 * that's an error somewhere else, maybe long ago.
1169 	 */
1170 	uint_t d;
1171 	for (d = 0; d < ndvas; d++)
1172 		if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d]))
1173 			return (B_FALSE);
1174 	ASSERT3U(d, ==, ndvas);
1175 
1176 	return (B_TRUE);
1177 }
1178 
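/*
 * Find or create the live (in-memory) entry for bp, loading it from the log
 * or the store objects if necessary. Returns NULL if the entry can't be
 * used: the pool is over the dedup quota and this would be a new entry, or
 * (when verify is set) the stored DVAs don't match the BP. Called with the
 * DDT lock held; the lock may be dropped and retaken while loading.
 */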
1179 ddt_entry_t *
1180 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
1181 {
1182 	spa_t *spa = ddt->ddt_spa;
1183 	ddt_key_t search;
1184 	ddt_entry_t *dde;
1185 	ddt_type_t type;
1186 	ddt_class_t class;
1187 	avl_index_t where;
1188 	int error;
1189 
1190 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1191 
1192 	if (unlikely(ddt->ddt_version == DDT_VERSION_UNCONFIGURED)) {
1193 		/*
1194 		 * This is the first use of this DDT since the pool was
1195 		 * created; finish getting it ready for use.
1196 		 */
1197 		VERIFY0(ddt_configure(ddt, B_TRUE));
1198 		ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
1199 	}
1200 
1201 	DDT_KSTAT_BUMP(ddt, dds_lookup);
1202 
1203 	ddt_key_fill(&search, bp);
1204 
1205 	/* Find an existing live entry */
1206 	dde = avl_find(&ddt->ddt_tree, &search, &where);
1207 	if (dde != NULL) {
1208 		/* If we went over quota, act like we didn't find it */
1209 		if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
1210 			return (NULL);
1211 
1212 		/* If it's already loaded, we can just return it. */
1213 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
1214 		if (dde->dde_flags & DDE_FLAG_LOADED) {
1215 			if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1216 				return (dde);
1217 			return (NULL);
1218 		}
1219 
1220 		/* Someone else is loading it, wait for it. */
1221 		dde->dde_waiters++;
1222 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait);
1223 		while (!(dde->dde_flags & DDE_FLAG_LOADED))
1224 			cv_wait(&dde->dde_cv, &ddt->ddt_lock);
1225 		dde->dde_waiters--;
1226 
1227 		/* Loaded but over quota, forget we were ever here */
1228 		if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
1229 			if (dde->dde_waiters == 0) {
1230 				avl_remove(&ddt->ddt_tree, dde);
1231 				ddt_free(ddt, dde);
1232 			}
1233 			return (NULL);
1234 		}
1235 
1236 		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1237 
1238 		/* Make sure the loaded entry matches the BP */
1239 		if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1240 			return (dde);
1241 		return (NULL);
1242 	} else
1243 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
1244 
1245 	/* Time to make a new entry. */
1246 	dde = ddt_alloc(ddt, &search);
1247 	avl_insert(&ddt->ddt_tree, dde, where);
1248 
1249 	/*
1250 	 * The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible
1251 	 * threads will wait even while we drop the lock.
1252 	 */
1253 	ddt_exit(ddt);
1254 
1255 	/*
1256 	 * If there is a log, we should try to "load" from there first.
1257 	 */
1258 	if (ddt->ddt_flags & DDT_FLAG_LOG) {
1259 		ddt_lightweight_entry_t ddlwe;
1260 		boolean_t from_flushing;
1261 
1262 		/* Read-only search, no locks needed (logs stable during I/O) */
1263 		if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) {
1264 			dde->dde_type = ddlwe.ddlwe_type;
1265 			dde->dde_class = ddlwe.ddlwe_class;
1266 			memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
1267 			    DDT_PHYS_SIZE(ddt));
1268 
1269 			/*
1270 			 * Check validity. If invalid and no waiters, clean up
1271 			 * immediately. Otherwise continue setup for waiters.
1272 			 */
1273 			boolean_t valid = !verify ||
1274 			    ddt_entry_lookup_is_valid(ddt, bp, dde);
1275 			ddt_enter(ddt);
1276 			if (!valid && dde->dde_waiters == 0) {
1277 				avl_remove(&ddt->ddt_tree, dde);
1278 				ddt_free(ddt, dde);
1279 				return (NULL);
1280 			}
1281 
1282 			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
1283 			if (from_flushing) {
1284 				dde->dde_flags |= DDE_FLAG_FROM_FLUSHING;
1285 				DDT_KSTAT_BUMP(ddt,
1286 				    dds_lookup_log_flushing_hit);
1287 			} else {
1288 				DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
1289 			}
1290 
1291 			DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
1292 			DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1293 
1294 			cv_broadcast(&dde->dde_cv);
1295 
1296 			return (valid ? dde : NULL);
1297 		}
1298 
1299 		DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
1300 	}
1301 
1302 	/* Search all store objects for the entry. */
1303 	error = ENOENT;
1304 	for (type = 0; type < DDT_TYPES; type++) {
1305 		for (class = 0; class < DDT_CLASSES; class++) {
1306 			error = ddt_object_lookup(ddt, type, class, dde);
1307 			if (error != ENOENT) {
1308 				ASSERT0(error);
1309 				break;
1310 			}
1311 		}
1312 		if (error != ENOENT)
1313 			break;
1314 	}
1315 
1316 	ddt_enter(ddt);
1317 
1318 	ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));
1319 
1320 	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
1321 	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
1322 
1323 	boolean_t valid = B_TRUE;
1324 
1325 	if (dde->dde_type == DDT_TYPES &&
1326 	    dde->dde_class == DDT_CLASSES &&
1327 	    ddt_over_quota(spa)) {
1328 		/* Over quota. If no one is waiting, clean up right now. */
1329 		if (dde->dde_waiters == 0) {
1330 			avl_remove(&ddt->ddt_tree, dde);
1331 			ddt_free(ddt, dde);
1332 			return (NULL);
1333 		}
1334 
1335 		/* Flag cleanup required */
1336 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
1337 	} else if (error == 0) {
1338 		/*
1339 		 * If what we loaded is no good for this BP and there's no one
1340 		 * waiting for it, we can just remove it and get out. If it's no
1341 		 * good but there are waiters, we have to leave it, because we
1342 		 * don't know what they want. If it's not needed we'll end up
1343 		 * taking an entry log/sync, but it can only happen if more
1344 		 * than one previous version of this block is being deleted at
1345 		 * the same time. This is extremely unlikely to happen and not
1346 		 * worth the effort to deal with without taking an entry
1347 		 * update.
1348 		 */
1349 		valid = !verify || ddt_entry_lookup_is_valid(ddt, bp, dde);
1350 		if (!valid && dde->dde_waiters == 0) {
1351 			avl_remove(&ddt->ddt_tree, dde);
1352 			ddt_free(ddt, dde);
1353 			return (NULL);
1354 		}
1355 
1356 		DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
1357 		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1358 
1359 		/*
1360 		 * The histograms only track inactive (stored or logged) blocks.
1361 		 * We've just put an entry onto the live list, so we need to
1362 		 * remove its counts. When its synced back, it'll be re-added
1363 		 * to the right one.
1364 		 *
1365 		 * We only do this when we successfully found it in the store.
1366 		 * error == ENOENT means this is a new entry, and so it's already
1367 		 * not counted.
1368 		 */
1369 		ddt_histogram_t *ddh =
1370 		    &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
1371 
1372 		ddt_lightweight_entry_t ddlwe;
1373 		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
1374 		ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
1375 	} else {
1376 		DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss);
1377 		DDT_KSTAT_BUMP(ddt, dds_lookup_new);
1378 	}
1379 
1380 	/* Entry loaded, everyone can proceed now */
1381 	dde->dde_flags |= DDE_FLAG_LOADED;
1382 	cv_broadcast(&dde->dde_cv);
1383 
1384 	if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
1385 		return (NULL);
1386 
1387 	return (dde);
1388 }
1389 
1390 void
1391 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
1392 {
1393 	ddt_t *ddt;
1394 	ddt_key_t ddk;
1395 
1396 	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
1397 		return;
1398 
1399 	/*
1400 	 * We only remove the DDT once all tables are empty and only
1401 	 * prefetch dedup blocks when there are entries in the DDT.
1402 	 * Thus no locking is required as the DDT can't disappear on us.
1403 	 */
1404 	ddt = ddt_select(spa, bp);
1405 	ddt_key_fill(&ddk, bp);
1406 
1407 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1408 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1409 			ddt_object_prefetch(ddt, type, class, &ddk);
1410 		}
1411 	}
1412 }
1413 
1414 /*
1415  * ddt_key_t comparison. Any struct wanting to make use of this function must
1416  * have the key as the first element. Casts it to N uint64_ts, and checks until
1417  * we find there's a difference. This is intended to match how ddt_zap.c drives
1418  * the ZAPs (first uint64_t as the key prehash), which will minimise the number
1419  * of ZAP blocks touched when flushing logged entries from an AVL walk. This is
1420  * not an invariant for this function though, should you wish to change it.
1421  */
1422 int
1423 ddt_key_compare(const void *x1, const void *x2)
1424 {
1425 	const uint64_t *k1 = (const uint64_t *)x1;
1426 	const uint64_t *k2 = (const uint64_t *)x2;
1427 
1428 	int cmp;
1429 	for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++)
1430 		if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0))
1431 			return (cmp);
1432 
1433 	return (0);
1434 }
1435 
1436 /* Create the containing dir for this DDT and bump the feature count */
1437 static void
1438 ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx)
1439 {
1440 	ASSERT0(ddt->ddt_dir_object);
1441 	ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1442 
1443 	char name[DDT_NAMELEN];
1444 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1445 	    zio_checksum_table[ddt->ddt_checksum].ci_name);
1446 
1447 	ddt->ddt_dir_object = zap_create_link(ddt->ddt_os,
1448 	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx);
1449 
1450 	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION,
1451 	    sizeof (uint64_t), 1, &ddt->ddt_version, tx));
1452 	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS,
1453 	    sizeof (uint64_t), 1, &ddt->ddt_flags, tx));
1454 
1455 	spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1456 }
1457 
1458 /* Destroy the containing dir and deactivate the feature */
1459 static void
1460 ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
1461 {
1462 	ASSERT3U(ddt->ddt_dir_object, !=, 0);
1463 	ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT);
1464 	ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1465 
1466 	char name[DDT_NAMELEN];
1467 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1468 	    zio_checksum_table[ddt->ddt_checksum].ci_name);
1469 
1470 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1471 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1472 			ASSERT(!ddt_object_exists(ddt, type, class));
1473 		}
1474 	}
1475 
1476 	ddt_log_destroy(ddt, tx);
1477 
1478 	uint64_t count;
1479 	ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
1480 	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
1481 	    DDT_DIR_VERSION));
1482 	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS));
1483 	ASSERT3U(count, ==, 2);
1484 
1485 	VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
1486 	VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx));
1487 
1488 	ddt->ddt_dir_object = 0;
1489 
1490 	spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1491 }
1492 
1493 /*
1494  * Determine version, flags and on-disk layout from what's already stored. If
1494  * there's
1495  * nothing stored, then if new is false, returns ENOENT, and if true, selects
1496  * based on pool config.
1497  */
1498 static int
1499 ddt_configure(ddt_t *ddt, boolean_t new)
1500 {
1501 	spa_t *spa = ddt->ddt_spa;
1502 	char name[DDT_NAMELEN];
1503 	int error;
1504 
1505 	ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE);
1506 
1507 	boolean_t fdt_enabled =
1508 	    spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP);
1509 	boolean_t fdt_active =
1510 	    spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP);
1511 
1512 	/*
1513 	 * First, look for the global DDT stats object. If it's not there, then
1514 	 * there has never been a DDT written before, and we know we're
1515 	 * starting from scratch.
1516 	 */
1517 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1518 	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1519 	    &spa->spa_ddt_stat_object);
1520 	if (error != 0) {
1521 		if (error != ENOENT)
1522 			return (error);
1523 		goto not_found;
1524 	}
1525 
1526 	if (fdt_active) {
1527 		/*
1528 		 * Now look for a DDT directory. If it exists, then it has
1529 		 * everything we need.
1530 		 */
1531 		snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1532 		    zio_checksum_table[ddt->ddt_checksum].ci_name);
1533 
1534 		error = zap_lookup(spa->spa_meta_objset,
1535 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1536 		    &ddt->ddt_dir_object);
1537 		if (error == 0) {
1538 			ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os);
1539 
1540 			error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1541 			    DDT_DIR_VERSION, sizeof (uint64_t), 1,
1542 			    &ddt->ddt_version);
1543 			if (error != 0)
1544 				return (error);
1545 
1546 			error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1547 			    DDT_DIR_FLAGS, sizeof (uint64_t), 1,
1548 			    &ddt->ddt_flags);
1549 			if (error != 0)
1550 				return (error);
1551 
1552 			if (ddt->ddt_version != DDT_VERSION_FDT) {
1553 				zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1554 				    "unknown version %llu", spa_name(spa),
1555 				    name, (u_longlong_t)ddt->ddt_version);
1556 				return (SET_ERROR(EINVAL));
1557 			}
1558 
1559 			if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) {
1560 				zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1561 				    "version=%llu unknown flags %llx",
1562 				    spa_name(spa), name,
1563 				    (u_longlong_t)ddt->ddt_flags,
1564 				    (u_longlong_t)ddt->ddt_version);
1565 				return (SET_ERROR(EINVAL));
1566 			}
1567 
1568 			return (0);
1569 		}
1570 		if (error != ENOENT)
1571 			return (error);
1572 	}
1573 
1574 	/* Any object in the root indicates a traditional setup. */
1575 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1576 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1577 			ddt_object_name(ddt, type, class, name);
1578 			uint64_t obj;
1579 			error = zap_lookup(spa->spa_meta_objset,
1580 			    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t),
1581 			    1, &obj);
1582 			if (error == ENOENT)
1583 				continue;
1584 			if (error != 0)
1585 				return (error);
1586 
1587 			ddt->ddt_version = DDT_VERSION_LEGACY;
1588 			ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1589 			ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1590 
1591 			return (0);
1592 		}
1593 	}
1594 
1595 not_found:
1596 	if (!new)
1597 		return (SET_ERROR(ENOENT));
1598 
1599 	/* Nothing on disk, so set up for the best version we can */
1600 	if (fdt_enabled) {
1601 		ddt->ddt_version = DDT_VERSION_FDT;
1602 		ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1603 		ddt->ddt_dir_object = 0; /* create on first use */
1604 	} else {
1605 		ddt->ddt_version = DDT_VERSION_LEGACY;
1606 		ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1607 		ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1608 	}
1609 
1610 	return (0);
1611 }
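
/*
 * To summarize the possible outcomes above: with fast_dedup active and a
 * per-checksum DDT directory present, the stored version and flags are used
 * (and must be DDT_VERSION_FDT with only known flags); any legacy store
 * object found in the MOS root selects DDT_VERSION_LEGACY; with nothing on
 * disk, new=B_FALSE returns ENOENT, while new=B_TRUE picks DDT_VERSION_FDT
 * if the feature is enabled and DDT_VERSION_LEGACY otherwise.
 */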
1612 
1613 static int
1614 ddt_kstat_update(kstat_t *ksp, int rw)
1615 {
1616 	ddt_t *ddt = ksp->ks_private;
1617 	ddt_kstats_t *dds = ksp->ks_data;
1618 
1619 	if (rw == KSTAT_WRITE)
1620 		return (SET_ERROR(EACCES));
1621 
1622 	/* Aggregate wmsum counters for lookup stats */
1623 	dds->dds_lookup.value.ui64 =
1624 	    wmsum_value(&ddt->ddt_kstat_dds_lookup);
1625 	dds->dds_lookup_live_hit.value.ui64 =
1626 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_live_hit);
1627 	dds->dds_lookup_live_wait.value.ui64 =
1628 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_live_wait);
1629 	dds->dds_lookup_live_miss.value.ui64 =
1630 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_live_miss);
1631 	dds->dds_lookup_existing.value.ui64 =
1632 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_existing);
1633 	dds->dds_lookup_new.value.ui64 =
1634 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_new);
1635 	dds->dds_lookup_log_hit.value.ui64 =
1636 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_hit);
1637 	dds->dds_lookup_log_active_hit.value.ui64 =
1638 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1639 	dds->dds_lookup_log_flushing_hit.value.ui64 =
1640 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1641 	dds->dds_lookup_log_miss.value.ui64 =
1642 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_miss);
1643 	dds->dds_lookup_stored_hit.value.ui64 =
1644 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_hit);
1645 	dds->dds_lookup_stored_miss.value.ui64 =
1646 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_miss);
1647 
1648 	/* Sync-only counters are already set directly in kstats */
1649 
1650 	return (0);
1651 }
1652 
1653 static void
1654 ddt_table_alloc_kstats(ddt_t *ddt)
1655 {
1656 	char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
1657 	char *name = kmem_asprintf("ddt_stats_%s",
1658 	    zio_checksum_table[ddt->ddt_checksum].ci_name);
1659 
1660 	/* Initialize wmsums for lookup counters */
1661 	wmsum_init(&ddt->ddt_kstat_dds_lookup, 0);
1662 	wmsum_init(&ddt->ddt_kstat_dds_lookup_live_hit, 0);
1663 	wmsum_init(&ddt->ddt_kstat_dds_lookup_live_wait, 0);
1664 	wmsum_init(&ddt->ddt_kstat_dds_lookup_live_miss, 0);
1665 	wmsum_init(&ddt->ddt_kstat_dds_lookup_existing, 0);
1666 	wmsum_init(&ddt->ddt_kstat_dds_lookup_new, 0);
1667 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_hit, 0);
1668 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_active_hit, 0);
1669 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_flushing_hit, 0);
1670 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_miss, 0);
1671 	wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_hit, 0);
1672 	wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_miss, 0);
1673 
1674 	ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED,
1675 	    sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1676 	if (ddt->ddt_ksp != NULL) {
1677 		ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP);
1678 		memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t));
1679 		ddt->ddt_ksp->ks_data = dds;
1680 		ddt->ddt_ksp->ks_update = ddt_kstat_update;
1681 		ddt->ddt_ksp->ks_private = ddt;
1682 		kstat_install(ddt->ddt_ksp);
1683 	}
1684 
1685 	kmem_strfree(name);
1686 	kmem_strfree(mod);
1687 }
1688 
1689 static ddt_t *
1690 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
1691 {
1692 	ddt_t *ddt;
1693 
1694 	ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
1695 	memset(ddt, 0, sizeof (ddt_t));
1696 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
1697 	avl_create(&ddt->ddt_tree, ddt_key_compare,
1698 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1699 	avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
1700 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1701 
1702 	ddt->ddt_checksum = c;
1703 	ddt->ddt_spa = spa;
1704 	ddt->ddt_os = spa->spa_meta_objset;
1705 	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
1706 	ddt->ddt_log_flush_pressure = 10;
1707 
1708 	ddt_log_alloc(ddt);
1709 	ddt_table_alloc_kstats(ddt);
1710 
1711 	return (ddt);
1712 }
1713 
1714 static void
1715 ddt_table_free(ddt_t *ddt)
1716 {
1717 	if (ddt->ddt_ksp != NULL) {
1718 		kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
1719 		ddt->ddt_ksp->ks_data = NULL;
1720 		kstat_delete(ddt->ddt_ksp);
1721 	}
1722 
1723 	/* Cleanup wmsums for lookup counters */
1724 	wmsum_fini(&ddt->ddt_kstat_dds_lookup);
1725 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_hit);
1726 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_wait);
1727 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_miss);
1728 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_existing);
1729 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_new);
1730 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_hit);
1731 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1732 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1733 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_miss);
1734 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_hit);
1735 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_miss);
1736 
1737 	ddt_log_free(ddt);
1738 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1739 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1740 			if (ddt->ddt_object_dnode[type][class] != NULL) {
1741 				dnode_rele(ddt->ddt_object_dnode[type][class],
1742 				    ddt);
1743 				ddt->ddt_object_dnode[type][class] = NULL;
1744 			}
1745 		}
1746 	}
1747 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
1748 	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
1749 	avl_destroy(&ddt->ddt_tree);
1750 	avl_destroy(&ddt->ddt_repair_tree);
1751 	mutex_destroy(&ddt->ddt_lock);
1752 	kmem_cache_free(ddt_cache, ddt);
1753 }
1754 
1755 void
1756 ddt_create(spa_t *spa)
1757 {
1758 	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
1759 
1760 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1761 		if (DDT_CHECKSUM_VALID(c))
1762 			spa->spa_ddt[c] = ddt_table_alloc(spa, c);
1763 	}
1764 }
1765 
1766 int
1767 ddt_load(spa_t *spa)
1768 {
1769 	int error;
1770 
1771 	ddt_create(spa);
1772 
1773 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1774 	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1775 	    &spa->spa_ddt_stat_object);
1776 	if (error)
1777 		return (error == ENOENT ? 0 : error);
1778 
1779 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1780 		if (!DDT_CHECKSUM_VALID(c))
1781 			continue;
1782 
1783 		ddt_t *ddt = spa->spa_ddt[c];
1784 		error = ddt_configure(ddt, B_FALSE);
1785 		if (error == ENOENT)
1786 			continue;
1787 		if (error != 0)
1788 			return (error);
1789 
1790 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1791 			for (ddt_class_t class = 0; class < DDT_CLASSES;
1792 			    class++) {
1793 				error = ddt_object_load(ddt, type, class);
1794 				if (error != 0 && error != ENOENT)
1795 					return (error);
1796 			}
1797 		}
1798 
1799 		if (ddt->ddt_flags & DDT_FLAG_LOG) {
1800 			error = ddt_log_load(ddt);
1801 			if (error != 0 && error != ENOENT)
1802 				return (error);
1803 		}
1804 
1805 		DDT_KSTAT_SET(ddt, dds_log_active_entries,
1806 		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));
1807 		DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
1808 		    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
1809 
1810 		/*
1811 		 * Seed the cached histograms.
1812 		 */
1813 		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
1814 		    sizeof (ddt->ddt_histogram));
1815 	}
1816 
1817 	spa->spa_dedup_dspace = ~0ULL;
1818 	spa->spa_dedup_dsize = ~0ULL;
1819 
1820 	return (0);
1821 }
1822 
1823 void
1824 ddt_unload(spa_t *spa)
1825 {
1826 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1827 		if (spa->spa_ddt[c]) {
1828 			ddt_table_free(spa->spa_ddt[c]);
1829 			spa->spa_ddt[c] = NULL;
1830 		}
1831 	}
1832 }
1833 
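/*
 * Check whether a dedup'd block could be stored in any class up to and
 * including max_class. DDT_CLASS_UNIQUE is the last (largest) class value,
 * so a max_class of UNIQUE trivially covers every dedup'd block and no
 * object lookup is needed.
 */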
1834 boolean_t
1835 ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
1836 {
1837 	ddt_t *ddt;
1838 	ddt_key_t ddk;
1839 
1840 	if (!BP_GET_DEDUP(bp))
1841 		return (B_FALSE);
1842 
1843 	if (max_class == DDT_CLASS_UNIQUE)
1844 		return (B_TRUE);
1845 
1846 	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
1847 
1848 	ddt_key_fill(&ddk, bp);
1849 
1850 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1851 		for (ddt_class_t class = 0; class <= max_class; class++) {
1852 			if (ddt_object_contains(ddt, type, class, &ddk) == 0)
1853 				return (B_TRUE);
1854 		}
1855 	}
1856 
1857 	return (B_FALSE);
1858 }
1859 
1860 ddt_entry_t *
1861 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
1862 {
1863 	ddt_key_t ddk;
1864 	ddt_entry_t *dde;
1865 
1866 	ddt_key_fill(&ddk, bp);
1867 
1868 	dde = ddt_alloc(ddt, &ddk);
1869 	ddt_alloc_entry_io(dde);
1870 
1871 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1872 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1873 			/*
1874 			 * We can only do repair if there are multiple copies
1875 			 * of the block.  For anything in the UNIQUE class,
1876 			 * there's definitely only one copy, so don't even try.
1877 			 */
1878 			if (class != DDT_CLASS_UNIQUE &&
1879 			    ddt_object_lookup(ddt, type, class, dde) == 0)
1880 				return (dde);
1881 		}
1882 	}
1883 
1884 	memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));
1885 
1886 	return (dde);
1887 }
1888 
1889 void
1890 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
1891 {
1892 	avl_index_t where;
1893 
1894 	ddt_enter(ddt);
1895 
1896 	if (dde->dde_io->dde_repair_abd != NULL &&
1897 	    spa_writeable(ddt->ddt_spa) &&
1898 	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
1899 		avl_insert(&ddt->ddt_repair_tree, dde, where);
1900 	else
1901 		ddt_free(ddt, dde);
1902 
1903 	ddt_exit(ddt);
1904 }
1905 
1906 static void
1907 ddt_repair_entry_done(zio_t *zio)
1908 {
1909 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
1910 	ddt_entry_t *rdde = zio->io_private;
1911 
1912 	ddt_free(ddt, rdde);
1913 }
1914 
1915 static void
1916 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
1917 {
1918 	ddt_key_t *ddk = &dde->dde_key;
1919 	ddt_key_t *rddk = &rdde->dde_key;
1920 	zio_t *zio;
1921 	blkptr_t blk;
1922 
1923 	zio = zio_null(rio, rio->io_spa, NULL,
1924 	    ddt_repair_entry_done, rdde, rio->io_flags);
1925 
1926 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
1927 		ddt_univ_phys_t *ddp = dde->dde_phys;
1928 		ddt_univ_phys_t *rddp = rdde->dde_phys;
1929 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
1930 		uint64_t phys_birth = ddt_phys_birth(ddp, v);
1931 		const dva_t *dvas, *rdvas;
1932 
1933 		if (ddt->ddt_flags & DDT_FLAG_FLAT) {
1934 			dvas = ddp->ddp_flat.ddp_dva;
1935 			rdvas = rddp->ddp_flat.ddp_dva;
1936 		} else {
1937 			dvas = ddp->ddp_trad[p].ddp_dva;
1938 			rdvas = rddp->ddp_trad[p].ddp_dva;
1939 		}
1940 
1941 		if (phys_birth == 0 ||
1942 		    phys_birth != ddt_phys_birth(rddp, v) ||
1943 		    memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
1944 			continue;
1945 
1946 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
1947 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
1948 		    rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
1949 		    NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
1950 		    ZIO_DDT_CHILD_FLAGS(zio), NULL));
1951 	}
1952 
1953 	zio_nowait(zio);
1954 }
1955 
1956 static void
1957 ddt_repair_table(ddt_t *ddt, zio_t *rio)
1958 {
1959 	spa_t *spa = ddt->ddt_spa;
1960 	ddt_entry_t *dde, *rdde_next, *rdde;
1961 	avl_tree_t *t = &ddt->ddt_repair_tree;
1962 	blkptr_t blk;
1963 
1964 	if (spa_sync_pass(spa) > 1)
1965 		return;
1966 
1967 	ddt_enter(ddt);
1968 	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
1969 		rdde_next = AVL_NEXT(t, rdde);
1970 		avl_remove(&ddt->ddt_repair_tree, rdde);
1971 		ddt_exit(ddt);
1972 		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
1973 		    DDT_PHYS_NONE, &blk);
1974 		dde = ddt_repair_start(ddt, &blk);
1975 		ddt_repair_entry(ddt, dde, rdde, rio);
1976 		ddt_repair_done(ddt, dde);
1977 		ddt_enter(ddt);
1978 	}
1979 	ddt_exit(ddt);
1980 }
1981 
1982 static void
1983 ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
1984 {
1985 	/*
1986 	 * Count all the entries stored for each type/class, and update the
1987 	 * stats within (ddt_object_sync()). If there are no entries for the
1988 	 * type/class, the whole object is removed. If all objects for the DDT
1989 	 * are removed, its containing dir is removed, effectively resetting
1990 	 * the entire DDT to an empty slate.
1991 	 */
1992 	uint64_t count = 0;
1993 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1994 		uint64_t add, tcount = 0;
1995 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1996 			if (ddt_object_exists(ddt, type, class)) {
1997 				ddt_object_sync(ddt, type, class, tx);
1998 				VERIFY0(ddt_object_count(ddt, type, class,
1999 				    &add));
2000 				tcount += add;
2001 			}
2002 		}
2003 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
2004 			if (tcount == 0 && ddt_object_exists(ddt, type, class))
2005 				ddt_object_destroy(ddt, type, class, tx);
2006 		}
2007 		count += tcount;
2008 	}
2009 
2010 	if (ddt->ddt_flags & DDT_FLAG_LOG) {
2011 		/* Include logged entries in the total count */
2012 		count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
2013 		count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
2014 	}
2015 
2016 	if (count == 0) {
2017 		/*
2018 		 * No entries left on the DDT, so reset the version for next
2019 		 * time. This allows us to handle the feature being changed
2020 		 * since the DDT was originally created. New entries should get
2021 		 * whatever the feature currently demands.
2022 		 */
2023 		if (ddt->ddt_version == DDT_VERSION_FDT)
2024 			ddt_destroy_dir(ddt, tx);
2025 
2026 		ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
2027 		ddt->ddt_flags = 0;
2028 	}
2029 
2030 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2031 	    sizeof (ddt->ddt_histogram));
2032 	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2033 	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2034 }
2035 
2036 static void
2037 ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
2038 {
2039 	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
2040 
2041 	/*
2042 	 * Compute the target class, so we can decide whether or not to inform
2043 	 * the scrub traversal (below). Note that we don't store this in the
2044 	 * entry, as it might change multiple times before finally being
2045 	 * committed (if we're logging). Instead, we recompute it in
2046 	 * ddt_sync_flush_entry().
2047 	 */
2048 	uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
2049 	ddt_class_t nclass =
2050 	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2051 
2052 	/*
2053 	 * If the class changes, the order that we scan this bp changes. If it
2054 	 * decreases, we could miss it, so scan it right now. (This covers both
2055 	 * class changing while we are doing ddt_walk(), and when we are
2056 	 * traversing.)
2057 	 *
2058 	 * We also do this when the refcnt goes to zero, because that change is
2059 	 * only in the log so far; the blocks on disk won't be freed until
2060 	 * the log is flushed, and the refcnt might increase before that. If it
2061 	 * does, then we could miss it in the same way.
2062 	 */
2063 	if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
2064 		dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
2065 		    ddlwe, tx);
2066 }
2067 
2068 static void
2069 ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
2070     ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
2071 {
2072 	ddt_key_t *ddk = &ddlwe->ddlwe_key;
2073 	ddt_type_t ntype = DDT_TYPE_DEFAULT;
2074 	uint64_t refcnt = 0;
2075 
2076 	/*
2077 	 * Compute the total refcnt. Along the way, issue frees for any DVAs
2078 	 * we no longer want.
2079 	 */
2080 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
2081 		ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
2082 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2083 		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
2084 
2085 		if (ddt_phys_birth(ddp, v) == 0) {
2086 			ASSERT0(phys_refcnt);
2087 			continue;
2088 		}
2089 		if (DDT_PHYS_IS_DITTO(ddt, p)) {
2090 			/*
2091 			 * We don't want to keep any obsolete slots (e.g. ditto),
2092 			 * regardless of their refcount, but we don't want to
2093 			 * leak them either. So, free them.
2094 			 */
2095 			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2096 			continue;
2097 		}
2098 		if (phys_refcnt == 0)
2099 			/* No remaining references, free it! */
2100 			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2101 		refcnt += phys_refcnt;
2102 	}
2103 
2104 	/* Select the best class for the entry. */
2105 	ddt_class_t nclass =
2106 	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2107 
2108 	/*
2109 	 * If an existing entry changed type or class, or its refcount reached
2110 	 * zero, delete it from the DDT object
2111 	 */
2112 	if (otype != DDT_TYPES &&
2113 	    (otype != ntype || oclass != nclass || refcnt == 0)) {
2114 		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
2115 		ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
2116 	}
2117 
2118 	/*
2119 	 * Add or update the entry
2120 	 */
2121 	if (refcnt != 0) {
2122 		ddt_histogram_t *ddh =
2123 		    &ddt->ddt_histogram[ntype][nclass];
2124 
2125 		ddt_histogram_add_entry(ddt, ddh, ddlwe);
2126 
2127 		if (!ddt_object_exists(ddt, ntype, nclass))
2128 			ddt_object_create(ddt, ntype, nclass, tx);
2129 		VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
2130 	}
2131 }
2132 
2133 /* Calculate an exponential weighted moving average, lower limited to zero */
2134 static inline int32_t
2135 _ewma(int32_t val, int32_t prev, uint32_t weight)
2136 {
2137 	ASSERT3U(val, >=, 0);
2138 	ASSERT3U(prev, >=, 0);
2139 	const int32_t new =
2140 	    MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
2141 	ASSERT3U(new, >=, 0);
2142 	return (new);
2143 }
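
/*
 * For example, with weight 4, _ewma(200, 100, 4) moves a quarter of the way
 * toward the new sample: 100 + (200 - 100) / 4 = 125. A weight of 1 (or 0,
 * which is clamped to 1) simply returns the new sample.
 */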
2144 
2145 static inline void
2146 ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
2147 {
2148 	/*
2149 	 * If we're not forcing flush, and not being asked to start, then
2150 	 * there's nothing more to do.
2151 	 */
2152 	if (txg == 0) {
2153 		/* Update requested, are we currently forcing flush? */
2154 		if (ddt->ddt_flush_force_txg == 0)
2155 			return;
2156 		txg = ddt->ddt_flush_force_txg;
2157 	}
2158 
2159 	/*
2160 	 * If either of the logs has unflushed entries from before the wanted
2161 	 * txg, set the force txg; otherwise clear it.
2162 	 */
2163 
2164 	if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
2165 	    ddt->ddt_log_active->ddl_first_txg <= txg) ||
2166 	    (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2167 	    ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
2168 		ddt->ddt_flush_force_txg = txg;
2169 		return;
2170 	}
2171 
2172 	/*
2173 	 * Nothing to flush behind the given txg, so we can clear force flush
2174 	 * state.
2175 	 */
2176 	ddt->ddt_flush_force_txg = 0;
2177 }
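
/*
 * For example, ddt_walk_init(spa, 100) calls this with txg=100: if either
 * log still holds entries first written at or before txg 100,
 * ddt_flush_force_txg is set to 100 and ddt_walk() returns EAGAIN until the
 * flusher catches up. The per-txg call with txg=0 re-evaluates the same
 * condition and clears the force state once nothing that old remains.
 */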
2178 
2179 static void
2180 ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
2181 {
2182 	spa_t *spa = ddt->ddt_spa;
2183 	ASSERT(avl_is_empty(&ddt->ddt_tree));
2184 
2185 	/*
2186 	 * Don't do any flushing when the pool is ready to shut down, or in
2187 	 * passes beyond the first.
2188 	 */
2189 	if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa))
2190 		return;
2191 
2192 	hrtime_t flush_start = gethrtime();
2193 	uint32_t count = 0;
2194 
2195 	/*
2196 	 * How many entries we need to flush. We need to at
2197 	 * least match the ingest rate, and also consider the
2198 	 * current backlog of entries.
2199 	 */
2200 	uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) +
2201 	    avl_numnodes(&ddt->ddt_log_active->ddl_tree);
2202 
2203 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
2204 		goto housekeeping;
2205 
2206 	uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs);
2207 	uint64_t cap = MAX(1, zfs_dedup_log_cap);
2208 	uint64_t flush_min = MAX(backlog / txgs,
2209 	    zfs_dedup_log_flush_entries_min);
2210 
2211 	/*
2212 	 * The theory for this block is that if we increase the pressure while
2213 	 * we're growing above the cap, and remove it when we're significantly
2214 	 * below the cap, we'll stay near cap while not bouncing around too
2215 	 * much.
2216 	 *
2217 	 * The factor of 10 is to smooth the pressure effect by expressing it
2218 	 * in tenths. The addition of the cap to the backlog in the second
2219 	 * block is to round up, instead of down. We never let the pressure go
2220 	 * below 1 (10 tenths).
2221 	 */
2222 	if (cap != UINT_MAX && backlog > cap &&
2223 	    backlog > ddt->ddt_log_flush_prev_backlog) {
2224 		ddt->ddt_log_flush_pressure += 10 * backlog / cap;
2225 	} else if (cap != UINT_MAX && backlog < cap) {
2226 		ddt->ddt_log_flush_pressure -=
2227 		    11 - (((10 * backlog) + cap - 1) / cap);
2228 		ddt->ddt_log_flush_pressure =
2229 		    MAX(ddt->ddt_log_flush_pressure, 10);
2230 	}
2231 
2232 	if (zfs_dedup_log_hard_cap && cap != UINT_MAX)
2233 		flush_min = MAX(flush_min, MIN(backlog - cap,
2234 		    (flush_min * ddt->ddt_log_flush_pressure) / 10));
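
	/*
	 * A worked example, assuming cap = 1000 entries: with a growing
	 * backlog of 1500, pressure increases by 10 * 1500 / 1000 = 15
	 * tenths. If pressure is now 25 (2.5x) and the hard cap is enabled
	 * with flush_min = 100, flush_min becomes
	 * MAX(100, MIN(1500 - 1000, 100 * 25 / 10)) = 250. Once the backlog
	 * drops to 400, pressure falls by 11 - ((10 * 400 + 999) / 1000) = 7
	 * per txg, never going below 10 (1.0x).
	 */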
2235 
2236 	uint64_t flush_max;
2237 
2238 	/*
2239 	 * If we've been asked to flush everything in a hurry,
2240 	 * try to dump as much as possible on this txg. In
2241 	 * this case we're only limited by time, not amount.
2242 	 *
2243 	 * Otherwise, if we are over the cap, try to get back down to it.
2244 	 *
2245 	 * Finally if there is no cap (or no pressure), just set the max a
2246 	 * little higher than the min to help smooth out variations in flush
2247 	 * times.
2248 	 */
2249 	if (ddt->ddt_flush_force_txg > 0)
2250 		flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
2251 	else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap)
2252 		flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap,
2253 		    (flush_min * ddt->ddt_log_flush_pressure) / 10));
2254 	else
2255 		flush_max = flush_min * 5 / 4;
2256 	flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max);
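
	/*
	 * For example, with no cap (or no pressure) and flush_min = 200,
	 * flush_max is just 200 * 5 / 4 = 250, clamped to
	 * zfs_dedup_log_flush_entries_max. Under a soft (non-hard) cap with
	 * backlog = 1500, cap = 1000, pressure = 25 and flush_min = 200,
	 * flush_max = MAX(250, MIN(500, 500)) = 500. A forced flush instead
	 * targets everything left in the flushing log.
	 */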
2257 
2258 	/*
2259 	 * When the pool is busy or someone is explicitly waiting for this txg
2260 	 * to complete, use the zfs_dedup_log_flush_min_time_ms.  Otherwise use
2261 	 * half of the time in the txg timeout.
2262 	 */
2263 	uint64_t target_time;
2264 
2265 	if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ||
2266 	    vdev_queue_pool_busy(spa)) {
2267 		target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
2268 		    SEC2NSEC(zfs_txg_timeout) / 2);
2269 	} else {
2270 		target_time = SEC2NSEC(zfs_txg_timeout) / 2;
2271 	}
2272 
2273 	ddt_lightweight_entry_t ddlwe;
2274 	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
2275 		ddt_sync_flush_entry(ddt, &ddlwe,
2276 		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
2277 
2278 		/* End if we've synced as much as we needed to. */
2279 		if (++count >= flush_max)
2280 			break;
2281 
2282 		/*
2283 		 * As long as we've flushed the absolute minimum,
2284 		 * stop if we're way over our target time.
2285 		 */
2286 		uint64_t diff = gethrtime() - flush_start;
2287 		if (count > zfs_dedup_log_flush_entries_min &&
2288 		    diff >= target_time * 2)
2289 			break;
2290 
2291 		/*
2292 		 * End if we've passed the minimum flush and we're out of time.
2293 		 */
2294 		if (count > flush_min && diff >= target_time)
2295 			break;
2296 	}
2297 
2298 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
2299 		/* We emptied it, so truncate on-disk */
2300 		DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
2301 		ddt_log_truncate(ddt, tx);
2302 	} else {
2303 		/* More to do next time, save checkpoint */
2304 		DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
2305 		ddt_log_checkpoint(ddt, &ddlwe, tx);
2306 	}
2307 
2308 	ddt_sync_update_stats(ddt, tx);
2309 
2310 housekeeping:
2311 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2312 	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
2313 		/*
2314 		 * No more to flush, and the active list has stuff, so
2315 		 * try to swap the logs for next time.
2316 		 */
2317 		if (ddt_log_swap(ddt, tx)) {
2318 			DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
2319 			DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
2320 			    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
2321 		}
2322 	}
2323 
2324 	/* If force flush is no longer necessary, turn it off. */
2325 	ddt_flush_force_update_txg(ddt, 0);
2326 
2327 	ddt->ddt_log_flush_prev_backlog = backlog;
2328 
2329 	/*
2330 	 * Update flush rate. This is an exponential weighted moving
2331 	 * average of the number of entries flushed over recent txgs.
2332 	 */
2333 	ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate,
2334 	    zfs_dedup_log_flush_flow_rate_txgs);
2335 	DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);
2336 
2337 	/*
2338 	 * Update flush time rate. This is an exponential weighted moving
2339 	 * average of the total time taken to flush over recent txgs.
2340 	 */
2341 	ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate,
2342 	    (int32_t)NSEC2MSEC(gethrtime() - flush_start),
2343 	    zfs_dedup_log_flush_flow_rate_txgs);
2344 	DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
2345 	    ddt->ddt_log_flush_time_rate);
2346 	if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 &&
2347 	    zfs_flags & ZFS_DEBUG_DDT) {
2348 		zfs_dbgmsg("%lu entries remain (%lu in active), flushed %u @ "
2349 		    "txg %llu, in %llu ms, flush rate %d, time rate %d",
2350 		    (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree),
2351 		    (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree),
2352 		    count, (u_longlong_t)tx->tx_txg,
2353 		    (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start),
2354 		    ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate);
2355 	}
2356 }
2357 
2358 static void
2359 ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
2360 {
2361 	uint64_t count = avl_numnodes(&ddt->ddt_tree);
2362 
2363 	if (count > 0) {
2364 		ddt_log_update_t dlu = {0};
2365 		ddt_log_begin(ddt, count, tx, &dlu);
2366 
2367 		ddt_entry_t *dde;
2368 		void *cookie = NULL;
2369 		ddt_lightweight_entry_t ddlwe;
2370 		while ((dde =
2371 		    avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
2372 			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2373 			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2374 
2375 			/* If from flushing log, remove it. */
2376 			if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) {
2377 				VERIFY(ddt_log_remove_key(ddt,
2378 				    ddt->ddt_log_flushing, &ddlwe.ddlwe_key));
2379 			}
2380 
2381 			/* Update class_start to track last modification time */
2382 			if (ddt->ddt_flags & DDT_FLAG_FLAT) {
2383 				ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
2384 				    ddt_class_start();
2385 			}
2386 
2387 			ddt_log_entry(ddt, &ddlwe, &dlu);
2388 			ddt_sync_scan_entry(ddt, &ddlwe, tx);
2389 			ddt_free(ddt, dde);
2390 		}
2391 
2392 		ddt_log_commit(ddt, &dlu);
2393 
2394 		DDT_KSTAT_SET(ddt, dds_log_active_entries,
2395 		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));
2396 
2397 		/*
2398 		 * Sync the stats for the store objects. Even though we haven't
2399 		 * modified anything on those objects, they're no longer the
2400 		 * source of truth for entries that are now in the log, and we
2401 		 * need the on-disk counts to reflect that, otherwise we'll
2402 		 * miscount later when importing.
2403 		 */
2404 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
2405 			for (ddt_class_t class = 0;
2406 			    class < DDT_CLASSES; class++) {
2407 				if (ddt_object_exists(ddt, type, class))
2408 					ddt_object_sync(ddt, type, class, tx);
2409 			}
2410 		}
2411 
2412 		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2413 		    sizeof (ddt->ddt_histogram));
2414 		ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2415 		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2416 	}
2417 
2418 	if (spa_sync_pass(ddt->ddt_spa) == 1) {
2419 		/*
2420 		 * Update ingest rate. This is an exponential weighted moving
2421 		 * average of the number of entries changed over recent txgs.
2422 		 * The ramp-up cost shouldn't matter too much because the
2423 		 * flusher will be trying to take at least the minimum anyway.
2424 		 */
2425 		ddt->ddt_log_ingest_rate = _ewma(
2426 		    count, ddt->ddt_log_ingest_rate,
2427 		    zfs_dedup_log_flush_flow_rate_txgs);
2428 		DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
2429 		    ddt->ddt_log_ingest_rate);
2430 	}
2431 }
2432 
2433 static void
2434 ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
2435 {
2436 	if (avl_numnodes(&ddt->ddt_tree) == 0)
2437 		return;
2438 
2439 	ddt_entry_t *dde;
2440 	void *cookie = NULL;
2441 	while ((dde = avl_destroy_nodes(
2442 	    &ddt->ddt_tree, &cookie)) != NULL) {
2443 		ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2444 
2445 		ddt_lightweight_entry_t ddlwe;
2446 		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2447 
2448 		/* Update class_start to track last modification time */
2449 		if (ddt->ddt_flags & DDT_FLAG_FLAT) {
2450 			ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
2451 			    ddt_class_start();
2452 		}
2453 
2454 		ddt_sync_flush_entry(ddt, &ddlwe,
2455 		    dde->dde_type, dde->dde_class, tx);
2456 		ddt_sync_scan_entry(ddt, &ddlwe, tx);
2457 		ddt_free(ddt, dde);
2458 	}
2459 
2460 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2461 	    sizeof (ddt->ddt_histogram));
2462 	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2463 	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2464 	ddt_sync_update_stats(ddt, tx);
2465 }
2466 
2467 static void
2468 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
2469 {
2470 	spa_t *spa = ddt->ddt_spa;
2471 
2472 	if (ddt->ddt_version == UINT64_MAX)
2473 		return;
2474 
2475 	if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
2476 		ASSERT0(avl_numnodes(&ddt->ddt_tree));
2477 		return;
2478 	}
2479 
2480 	if (spa->spa_ddt_stat_object == 0) {
2481 		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
2482 		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
2483 		    DMU_POOL_DDT_STATS, tx);
2484 	}
2485 
2486 	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
2487 		ddt_create_dir(ddt, tx);
2488 
2489 	if (ddt->ddt_flags & DDT_FLAG_LOG)
2490 		ddt_sync_table_log(ddt, tx);
2491 	else
2492 		ddt_sync_table_flush(ddt, tx);
2493 }
2494 
2495 void
2496 ddt_sync(spa_t *spa, uint64_t txg)
2497 {
2498 	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
2499 	dmu_tx_t *tx;
2500 	zio_t *rio;
2501 
2502 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
2503 
2504 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2505 
2506 	rio = zio_root(spa, NULL, NULL,
2507 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
2508 
2509 	/*
2510 	 * This function may cause an immediate scan of ddt blocks (see
2511 	 * the comment above dsl_scan_ddt() for details). We set the
2512 	 * scan's root zio here so that we can wait for any scan IOs in
2513 	 * addition to the regular ddt IOs.
2514 	 */
2515 	ASSERT0P(scn->scn_zio_root);
2516 	scn->scn_zio_root = rio;
2517 
2518 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2519 		ddt_t *ddt = spa->spa_ddt[c];
2520 		if (ddt == NULL)
2521 			continue;
2522 		ddt_sync_table(ddt, tx);
2523 		if (ddt->ddt_flags & DDT_FLAG_LOG)
2524 			ddt_sync_flush_log(ddt, tx);
2525 		ddt_repair_table(ddt, rio);
2526 	}
2527 
2528 	(void) zio_wait(rio);
2529 	scn->scn_zio_root = NULL;
2530 
2531 	dmu_tx_commit(tx);
2532 }
2533 
2534 void
2535 ddt_walk_init(spa_t *spa, uint64_t txg)
2536 {
2537 	if (txg == 0)
2538 		txg = spa_syncing_txg(spa);
2539 
2540 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2541 		ddt_t *ddt = spa->spa_ddt[c];
2542 		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2543 			continue;
2544 
2545 		ddt_enter(ddt);
2546 		ddt_flush_force_update_txg(ddt, txg);
2547 		ddt_exit(ddt);
2548 	}
2549 }
2550 
2551 boolean_t
2552 ddt_walk_ready(spa_t *spa)
2553 {
2554 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2555 		ddt_t *ddt = spa->spa_ddt[c];
2556 		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2557 			continue;
2558 
2559 		if (ddt->ddt_flush_force_txg > 0)
2560 			return (B_FALSE);
2561 	}
2562 
2563 	return (B_TRUE);
2564 }
2565 
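/*
 * Walk every stored entry across all DDTs, resuming from the bookmark in
 * ddb. The iteration order is class (outermost), then type, then checksum,
 * with the per-object cursor innermost, so a bookmark can be saved and the
 * walk resumed across calls. If wait is set, EAGAIN is returned while any
 * log-backed DDT still has a forced flush outstanding (see ddt_walk_init()).
 */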
2566 static int
2567 ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
2568     uint64_t flags, boolean_t wait)
2569 {
2570 	do {
2571 		do {
2572 			do {
2573 				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
2574 				if (ddt == NULL)
2575 					continue;
2576 
2577 				if (flags != 0 &&
2578 				    (ddt->ddt_flags & flags) != flags)
2579 					continue;
2580 
2581 				if (wait && ddt->ddt_flush_force_txg > 0)
2582 					return (EAGAIN);
2583 
2584 				int error = ENOENT;
2585 				if (ddt_object_exists(ddt, ddb->ddb_type,
2586 				    ddb->ddb_class)) {
2587 					error = ddt_object_walk(ddt,
2588 					    ddb->ddb_type, ddb->ddb_class,
2589 					    &ddb->ddb_cursor, ddlwe);
2590 				}
2591 				if (error == 0)
2592 					return (0);
2593 				if (error != ENOENT)
2594 					return (error);
2595 				ddb->ddb_cursor = 0;
2596 			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
2597 			ddb->ddb_checksum = 0;
2598 		} while (++ddb->ddb_type < DDT_TYPES);
2599 		ddb->ddb_type = 0;
2600 	} while (++ddb->ddb_class < DDT_CLASSES);
2601 
2602 	return (SET_ERROR(ENOENT));
2603 }
2604 
2605 int
2606 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
2607 {
2608 	return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE));
2609 }
2610 
2611 /*
2612  * This function is used by Block Cloning (brt.c) to increase the reference
2613  * counter for a DDT entry if the block is already in the DDT.
2614  *
2615  * Return false if the block, despite having the D bit set, is not present
2616  * in the DDT. This is possible when the DDT has been pruned by an admin
2617  * or by the DDT quota mechanism.
2618  */
2619 boolean_t
2620 ddt_addref(spa_t *spa, const blkptr_t *bp)
2621 {
2622 	ddt_t *ddt;
2623 	ddt_entry_t *dde;
2624 	boolean_t result;
2625 
2626 	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2627 	ddt = ddt_select(spa, bp);
2628 	ddt_enter(ddt);
2629 
2630 	dde = ddt_lookup(ddt, bp, B_TRUE);
2631 
2632 	/* Can be NULL if the entry for this block was pruned. */
2633 	if (dde == NULL) {
2634 		ddt_exit(ddt);
2635 		spa_config_exit(spa, SCL_ZIO, FTAG);
2636 		return (B_FALSE);
2637 	}
2638 
2639 	if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
2640 		/*
2641 		 * This entry was either synced to a store object (dde_type is
2642 		 * real) or was logged. It must be properly on disk at this
2643 		 * point, so we can just bump its refcount.
2644 		 */
2645 		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
2646 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2647 
2648 		ddt_phys_addref(dde->dde_phys, v);
2649 		result = B_TRUE;
2650 	} else {
2651 		/*
2652 		 * If the block has the DEDUP flag set it still might not
2653 		 * exist in the DEDUP table due to DDT pruning of entries
2654 		 * where refcnt=1.
2655 		 */
2656 		ddt_remove(ddt, dde);
2657 		result = B_FALSE;
2658 	}
2659 
2660 	ddt_exit(ddt);
2661 	spa_config_exit(spa, SCL_ZIO, FTAG);
2662 
2663 	return (result);
2664 }
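
/*
 * An illustrative (hypothetical) caller sketch, roughly what a block-cloning
 * path might do; the real logic lives in brt.c and may differ:
 *
 *	if (BP_GET_DEDUP(bp) && ddt_addref(spa, bp)) {
 *		// reference is now held by the DDT; nothing more to do
 *	} else {
 *		// not in the DDT (e.g. pruned); take a BRT reference instead
 *	}
 */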
2665 
2666 typedef struct ddt_prune_entry {
2667 	ddt_t		*dpe_ddt;
2668 	ddt_key_t	dpe_key;
2669 	list_node_t	dpe_node;
2670 	ddt_univ_phys_t	dpe_phys[];
2671 } ddt_prune_entry_t;
2672 
2673 typedef struct ddt_prune_info {
2674 	spa_t		*dpi_spa;
2675 	uint64_t	dpi_txg_syncs;
2676 	uint64_t	dpi_pruned;
2677 	list_t		dpi_candidates;
2678 } ddt_prune_info_t;
2679 
2680 /*
2681  * Add prune candidates for ddt_sync during spa_sync
2682  */
2683 static void
2684 prune_candidates_sync(void *arg, dmu_tx_t *tx)
2685 {
2686 	(void) tx;
2687 	ddt_prune_info_t *dpi = arg;
2688 	ddt_prune_entry_t *dpe;
2689 
2690 	spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);
2691 
2692 	/* Process the prune candidates collected so far */
2693 	while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
2694 		blkptr_t blk;
2695 		ddt_t *ddt = dpe->dpe_ddt;
2696 
2697 		ddt_enter(ddt);
2698 
2699 		/*
2700 		 * If it's on the live list, then it was loaded for update
2701 		 * this txg and is no longer stale; skip it.
2702 		 */
2703 		if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
2704 			ddt_exit(ddt);
2705 			kmem_free(dpe, sizeof (*dpe));
2706 			continue;
2707 		}
2708 
2709 		ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
2710 		    dpe->dpe_phys, DDT_PHYS_FLAT, &blk);
2711 
2712 		ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE);
2713 		if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
2714 			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2715 			/*
2716 			 * Zero the physical, so we don't try to free DVAs
2717 			 * at flush nor try to reuse this entry.
2718 			 */
2719 			ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);
2720 
2721 			dpi->dpi_pruned++;
2722 		}
2723 
2724 		ddt_exit(ddt);
2725 		kmem_free(dpe, sizeof (*dpe));
2726 	}
2727 
2728 	spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
2729 	dpi->dpi_txg_syncs++;
2730 }
2731 
2732 /*
2733  * Prune candidates are collected in open context and processed
2734  * in sync context as part of ddt_sync_table().
2735  */
2736 static void
2737 ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
2738     const ddt_univ_phys_t *ddp)
2739 {
2740 	ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2741 
2742 	size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
2743 	ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP);
2744 
2745 	dpe->dpe_ddt = ddt;
2746 	dpe->dpe_key = *ddk;
2747 	memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
2748 	list_insert_head(list, dpe);
2749 }
2750 
2751 /*
2752  * Iterate over all the entries in the DDT unique class.
2753  * The walk will perform one of the following operations:
2754  *  (a) build a histogram that can be used when pruning
2755  *  (b) prune entries older than the cutoff
2756  *
2757  *  Also called by zdb(8) to dump the age histogram
2758  */
2759 void
2760 ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
2761 {
2762 	ddt_bookmark_t ddb = {
2763 		.ddb_class = DDT_CLASS_UNIQUE,
2764 		.ddb_type = 0,
2765 		.ddb_checksum = 0,
2766 		.ddb_cursor = 0
2767 	};
2768 	ddt_lightweight_entry_t ddlwe = {0};
2769 	int error;
2770 	int valid = 0;
2771 	int candidates = 0;
2772 	uint64_t now = gethrestime_sec();
2773 	ddt_prune_info_t dpi;
2774 	boolean_t pruning = (cutoff != 0);
2775 
2776 	if (pruning) {
2777 		dpi.dpi_txg_syncs = 0;
2778 		dpi.dpi_pruned = 0;
2779 		dpi.dpi_spa = spa;
2780 		list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
2781 		    offsetof(ddt_prune_entry_t, dpe_node));
2782 	}
2783 
2784 	if (histogram != NULL)
2785 		memset(histogram, 0, sizeof (ddt_age_histo_t));
2786 
2787 	while ((error =
2788 	    ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
2789 		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
2790 		VERIFY(ddt);
2791 
2792 		if (spa_shutting_down(spa) || issig())
2793 			break;
2794 
2795 		ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2796 		ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);
2797 
2798 		uint64_t class_start =
2799 		    ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;
2800 
2801 		/*
2802 		 * If this entry is on the log, then the stored entry is stale
2803 		 * and we should skip it.
2804 		 */
2805 		if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL))
2806 			continue;
2807 
2808 		/* prune older entries */
2809 		if (pruning && class_start < cutoff) {
2810 			if (candidates++ >= zfs_ddt_prunes_per_txg) {
2811 				/* sync prune candidates in batches */
2812 				VERIFY0(dsl_sync_task(spa_name(spa),
2813 				    NULL, prune_candidates_sync,
2814 				    &dpi, 0, ZFS_SPACE_CHECK_NONE));
2815 				candidates = 1;
2816 			}
2817 			ddt_prune_entry(&dpi.dpi_candidates, ddt,
2818 			    &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
2819 		}
2820 
2821 		/* build a histogram */
2822 		if (histogram != NULL) {
2823 			uint64_t age = MAX(1, (now - class_start) / 3600);
2824 			int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
2825 			histogram->dah_entries++;
2826 			histogram->dah_age_histo[bin]++;
2827 		}
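
		/*
		 * For example, an entry whose class_start is 10 days old has
		 * age = 240 hours; highbit64(240) is 8, so it lands in bin 7
		 * (each bin covers a power-of-two range of hours).
		 */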
2828 
2829 		valid++;
2830 	}
2831 
2832 	if (pruning && valid > 0) {
2833 		if (!list_is_empty(&dpi.dpi_candidates)) {
2834 			/* sync out final batch of prune candidates */
2835 			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
2836 			    prune_candidates_sync, &dpi, 0,
2837 			    ZFS_SPACE_CHECK_NONE));
2838 		}
2839 		list_destroy(&dpi.dpi_candidates);
2840 
2841 		zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs",
2842 		    (u_longlong_t)dpi.dpi_pruned,
2843 		    (int)((dpi.dpi_pruned * 100) / valid),
2844 		    (u_longlong_t)dpi.dpi_txg_syncs);
2845 	}
2846 }
2847 
2848 static uint64_t
2849 ddt_total_entries(spa_t *spa)
2850 {
2851 	ddt_object_t ddo;
2852 	ddt_get_dedup_object_stats(spa, &ddo);
2853 
2854 	return (ddo.ddo_count);
2855 }
2856 
2857 int
2858 ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
2859     uint64_t amount)
2860 {
2861 	uint64_t cutoff;
2862 	uint64_t start_time = gethrtime();
2863 
2864 	if (spa->spa_active_ddt_prune)
2865 		return (SET_ERROR(EALREADY));
2866 	if (ddt_total_entries(spa) == 0)
2867 		return (0);
2868 
2869 	spa->spa_active_ddt_prune = B_TRUE;
2870 
2871 	zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
2872 	    unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");
2873 
2874 	if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
2875 		ddt_age_histo_t histogram;
2876 		uint64_t oldest = 0;
2877 
2878 		/* Make a pass over DDT to build a histogram */
2879 		ddt_prune_walk(spa, 0, &histogram);
2880 
2881 		int target = (histogram.dah_entries * amount) / 100;
2882 
2883 		/*
2884 		 * Figure out our cutoff date
2885 		 * (i.e., which bins to prune from)
2886 		 */
2887 		for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
2888 			if (histogram.dah_age_histo[i] != 0) {
2889 				/* less than this bucket remaining */
2890 				if (target < histogram.dah_age_histo[i]) {
2891 					oldest = MAX(1, (1<<i) * 3600);
2892 					target = 0;
2893 				} else {
2894 					target -= histogram.dah_age_histo[i];
2895 				}
2896 			}
2897 		}
2898 		cutoff = gethrestime_sec() - oldest;
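
		/*
		 * For example, with 1,000 entries and amount = 25, target is
		 * 250: if the oldest non-empty bin holds 150 entries and the
		 * next one 300, the walk stops in that second bin, oldest
		 * becomes (1 << i) * 3600 seconds for it, and cutoff is that
		 * far before now.
		 */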
2899 
2900 		if (ddt_dump_prune_histogram)
2901 			ddt_dump_age_histogram(&histogram, cutoff);
2902 	} else if (unit == ZPOOL_DDT_PRUNE_AGE) {
2903 		cutoff = gethrestime_sec() - amount;
2904 	} else {
2905 		return (EINVAL);
2906 	}
2907 
2908 	if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
2909 		/* Traverse DDT to prune entries older than our cutoff */
2910 		ddt_prune_walk(spa, cutoff, NULL);
2911 	}
2912 
2913 	zfs_dbgmsg("%s: prune completed in %llu ms",
2914 	    spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
2915 
2916 	spa->spa_active_ddt_prune = B_FALSE;
2917 	return (0);
2918 }
2919 
2920 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
2921 	"Enable prefetching dedup-ed blks");
2922 
2923 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
2924 	"Min time to spend on incremental dedup log flush each transaction");
2925 
2926 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
2927 	"Min number of log entries to flush each transaction");
2928 
2929 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW,
2930 	"Max number of log entries to flush each transaction");
2931 
2932 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW,
2933 	"Number of TXGs to try to rotate the log in");
2934 
2935 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW,
2936 	"Soft cap for the size of the current dedup log");
2937 
2938 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW,
2939 	"Whether to use the soft cap as a hard cap");
2940 
2941 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
2942 	"Number of txgs to average flow rates across");
2943