xref: /freebsd/sys/contrib/openzfs/module/zfs/ddt.c (revision 66e85755595a451db490d2fe24267d85db4b09c2)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
26  * Copyright (c) 2022 by Pawel Jakub Dawidek
27  * Copyright (c) 2019, 2023, Klara Inc.
28  */
29 
30 #include <sys/zfs_context.h>
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/zio.h>
34 #include <sys/ddt.h>
35 #include <sys/ddt_impl.h>
36 #include <sys/zap.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/arc.h>
39 #include <sys/dsl_pool.h>
40 #include <sys/zio_checksum.h>
41 #include <sys/dsl_scan.h>
42 #include <sys/abd.h>
43 #include <sys/zfeature.h>
44 
45 /*
46  * # DDT: Deduplication tables
47  *
48  * The dedup subsystem provides block-level deduplication. When enabled, blocks
49  * to be written will have the dedup (D) bit set, which causes them to be
50  * tracked in a "dedup table", or DDT. If a block has been seen before (exists
51  * in the DDT), then instead of being written, it will be made to reference
52  * the existing on-disk data, and its refcount in the DDT will be bumped.
53  *
54  * ## Dedup tables and entries
55  *
56  * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
57  * (ddt_key_t) made up of a block's checksum and certain properties, and a "value"
58  * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
59  * time and refcount. Together these are enough to track references to a
60  * specific block, to build a valid block pointer to reference that block (for
61  * freeing, scrubbing, etc), and to fill a new block pointer with the missing
62  * pieces to make it seem like it was written.
63  *
64  * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
65  * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
66  * object data formats, each with their own implementations) and "classes"
67  * (ddt_class_t, instance of a storage type object, for entries with a specific
68  * characteristic). An entry (key) will only ever exist on one of these objects
69  * at any given time, but may be moved from one to another if its type or
70  * class changes.
71  *
72  * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
73  * is to be written, before DVAs have been allocated, ddt_lookup() is called to
74  * see if the block has been seen before. If it's not found, the write proceeds
75  * as normal, and after it succeeds, a new entry is created. If it is found, we
76  * fill the BP with the DVAs from the entry, increment the refcount and cause
77  * the write IO to return immediately.
78  *
79  * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
80  * block for the same content/checksum. The slot is selected based on the
81  * zp_copies parameter the block is written with, that is, the number of DVAs
82  * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for
83  * now-removed "dedupditto" feature. These are no longer written, and will be
84  * freed if encountered on old pools.
85  *
86  * If the "fast_dedup" feature is enabled, new dedup tables will be created
87  * with the "flat phys" option. In this mode, there is only one ddt_phys_t
88  * slot. If a write is issued for an entry that exists, but has fewer DVAs,
89  * then only as many new DVAs are allocated and written to make up the
90  * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
91  * new DVAs.
92  *
93  * ## Lifetime of an entry
94  *
95  * A DDT can be enormous, and typically is not held in memory all at once.
96  * Instead, the changes to an entry are tracked in memory, and written down to
97  * disk at the end of each txg.
98  *
99  * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
100  * (ddt_tree).  At the start of a txg, ddt_tree is empty. When an entry is
101  * required for IO, ddt_lookup() is called. If an entry already exists on
102  * ddt_tree, it is returned. Otherwise, a new one is created, and the
103  * type/class objects for the DDT are searched for that key. If it's found, its
104  * value is copied into the live entry. If not, an empty entry is created.
105  *
106  * The live entry will be modified during the txg, usually by modifying the
107  * refcount, but sometimes by adding or updating DVAs. At the end of the txg
108  * (during spa_sync()), type and class are recalculated for the entry (see
109  * ddt_sync_entry()), and the entry is written to the appropriate storage
110  * object and (if necessary), removed from an old one. ddt_tree is cleared and
111  * the next txg can start.
112  *
113  * ## Dedup quota
114  *
115  * A maximum size for all DDTs on the pool can be set with the
116  * dedup_table_quota property. This is determined in ddt_over_quota() and
117  * enforced during ddt_lookup(). If the pool is at or over its quota limit,
118  * ddt_lookup() will only return entries for existing blocks, as updates are
119  * still possible. New entries will not be created; instead, ddt_lookup() will
120  * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
121  * the D bit on the block and reissue the IO as a regular write. The block will
122  * not be deduplicated.
123  *
124  * Note that this is based on the on-disk size of the dedup store. Reclaiming
125  * this space after deleting entries relies on the ZAP "shrinking" behaviour,
126  * without which no space would be recovered and the DDT would continue to be
127  * considered "over quota". See zap_shrink_enabled.
128  *
129  * ## Dedup table pruning
130  *
131  * As a complement to the dedup quota feature, ddtprune allows removal of older
132  * non-duplicate entries to make room for newer duplicate entries. The amount
133  * to prune can be based on a target percentage of the unique entries or based
134  * on the age (i.e., prune unique entries older than N days).
135  *
136  * ## Dedup log
137  *
138  * Historically, all entries modified on a txg were written back to dedup
139  * storage objects at the end of every txg. This could cause significant
140  * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
141  * and so required reading the whole node, updating the entry, and writing it
142  * back. On busy pools, this could add serious IO and memory overheads.
143  *
144  * To address this, the dedup log was added. If the "fast_dedup" feature is
145  * enabled, at the end of each txg, modified entries will be copied to an
146  * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
147  * same block is requested again, the in-memory object will be checked first,
148  * and if it's there, the entry is inflated back onto the live tree without going
149  * to storage. The on-disk log is only read at pool import time, to reload the
150  * in-memory log.
151  *
152  * Each txg, some amount of the in-memory log will be flushed out to a DDT
153  * storage object (i.e. a ZAP) as normal. OpenZFS will try hard to flush enough
154  * to keep up with the rate of change on dedup entries, but not so much that it
155  * would impact overall throughput, while not using too much memory. See the
156  * zfs_dedup_log_* tunables in zfs(4) for more details.
157  *
158  * ## Repair IO
159  *
160  * If a read on a dedup block fails, but there are other copies of the block in
161  * the other ddt_phys_t slots, reads will be issued for those instead
162  * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
163  * the caller, and a copy is stashed on the entry's dde_repair_abd.
164  *
165  * During the end-of-txg sync, any entries with a dde_repair_abd get a
166  * "rewrite" write issued for the original block pointer, with the data read
167  * from the alternate block. If the block is actually damaged, this will invoke
168  * the pool's "self-healing" mechanism, and repair the block.
169  *
170  * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
171  * use, so there is only ever one ddt_phys_t slot. The repair process will
172  * still happen in this case, though it is unlikely to succeed as there will
173  * usually be no other equivalent blocks to fall back on (though there might
174  * be, if this was an early version of a dedup'd block that has since been
175  * extended).
176  *
177  * Note that this repair mechanism is in addition to and separate from the
178  * regular OpenZFS scrub and self-healing mechanisms.
179  *
180  * ## Scanning (scrub/resilver)
181  *
182  * If dedup is active, the scrub machinery will walk the dedup table first, and
183  * scrub all blocks with refcnt > 1. After that it will move on to the
184  * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
185  * In this way, heavily deduplicated blocks are only scrubbed once. See the
186  * commentary on dsl_scan_ddt() for more details.
187  *
188  * Walking the DDT is done via ddt_walk(). The current position is stored in a
189  * ddt_bookmark_t, which represents a stable position in the storage object.
190  * This bookmark is stored by the scan machinery, and must reference the same
191  * position on the object even if the object changes, the pool is exported, or
192  * OpenZFS is upgraded.
193  *
194  * If the "fast_dedup" feature is enabled and the table has a log, the scan
195  * cannot begin until entries on the log are flushed, as the on-disk log has no
196  * concept of a "stable position". Instead, the log flushing process will enter
197  * a more aggressive mode, flushing out as much as is necessary as quickly as
198  * possible so that the scan can begin promptly.
199  *
200  * ## Interaction with block cloning
201  *
202  * If block cloning and dedup are both enabled on a pool, BRT will look for the
203  * dedup bit on an incoming block pointer. If set, it will call into the DDT
204  * (ddt_addref()) to add a reference to the block, instead of adding a
205  * reference to the BRT. See brt_pending_apply().
206  */
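
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the core decision the write pipeline makes, expressed with the lookup API
 * defined below. The real driver is zio_ddt_write() in zio.c; IO plumbing
 * and error handling are omitted, and spa, bp, txg and the phys variant v
 * (derived from zp_copies) are assumed to be in hand.
 *
 *	ddt_t *ddt = ddt_select(spa, bp);
 *	ddt_enter(ddt);
 *	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_FALSE);
 *	if (dde == NULL) {
 *		// Over quota: clear the D bit; reissue as a regular write.
 *		BP_SET_DEDUP(bp, 0);
 *	} else if (ddt_phys_birth(dde->dde_phys, v) != 0) {
 *		// Seen before: point the BP at the existing data and take
 *		// another reference instead of writing.
 *		ddt_bp_fill(dde->dde_phys, v, bp, txg);
 *		ddt_phys_addref(dde->dde_phys, v);
 *	} else {
 *		// New entry: proceed with a normal write; the entry is
 *		// filled in once the write succeeds.
 *	}
 *	ddt_exit(ddt);
 */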
207 
208 /*
209  * These are the only checksums valid for dedup. They must match the list
210  * from dedup_table in zfs_prop.c
211  */
212 #define	DDT_CHECKSUM_VALID(c)	\
213 	(c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
214 	c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
215 	c == ZIO_CHECKSUM_BLAKE3)
216 
217 static kmem_cache_t *ddt_cache;
218 
219 static kmem_cache_t *ddt_entry_flat_cache;
220 static kmem_cache_t *ddt_entry_trad_cache;
221 
222 #define	DDT_ENTRY_FLAT_SIZE	(sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
223 #define	DDT_ENTRY_TRAD_SIZE	(sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
224 
225 #define	DDT_ENTRY_SIZE(ddt)	\
226 	_DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
227 
228 /*
229  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
230  */
231 int zfs_dedup_prefetch = 0;
232 
233 /*
234  * If the dedup class cannot satisfy a DDT allocation, treat as over quota
235  * for this many TXGs.
236  */
237 uint_t dedup_class_wait_txgs = 5;
238 
239 /*
240  * How many DDT prune entries to add to the DDT sync AVL tree.
241  * Note that these additional entries have a memory footprint of a
242  * ddt_entry_t (216 bytes).
243  */
244 static uint32_t zfs_ddt_prunes_per_txg = 50000;
245 
246 /*
247  * For testing, synthesize aged DDT entries
248  * (in global scope for ztest)
249  */
250 boolean_t ddt_prune_artificial_age = B_FALSE;
251 boolean_t ddt_dump_prune_histogram = B_FALSE;
252 
253 /*
254  * Minimum time to flush per txg.
255  */
256 uint_t zfs_dedup_log_flush_min_time_ms = 1000;
257 
258 /*
259  * Minimum entries to flush per txg.
260  */
261 uint_t zfs_dedup_log_flush_entries_min = 200;
262 
263 /*
264  * Target number of TXGs until the whole dedup log has been flushed.
265  * The log size will float around this value times the ingest rate.
266  */
267 uint_t zfs_dedup_log_flush_txgs = 100;
268 
269 /*
270  * Maximum entries to flush per txg. Used for testing the dedup log.
271  */
272 uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
273 
274 /*
275  * Soft cap for the size of the current dedup log. If the log is larger
276  * than this size, we slightly increase the aggressiveness of the flushing to
277  * try to bring it back down to the soft cap.
278  */
279 uint_t zfs_dedup_log_cap = UINT_MAX;
280 
281 /*
282  * If this is set to B_TRUE, the cap above acts more like a hard cap:
283  * flushing is significantly more aggressive, increasing the minimum amount we
284  * flush per txg, as well as the maximum.
285  */
286 boolean_t zfs_dedup_log_hard_cap = B_FALSE;
287 
288 /*
289  * Number of txgs to average flow rates across.
290  */
291 uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
292 
293 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
294 	&ddt_zap_ops,
295 };
296 
297 static const char *const ddt_class_name[DDT_CLASSES] = {
298 	"ditto",
299 	"duplicate",
300 	"unique",
301 };
302 
303 /*
304  * DDT feature flags automatically enabled for each on-disk version. Note that
305  * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
306  */
307 static const uint64_t ddt_version_flags[] = {
308 	[DDT_VERSION_LEGACY] = 0,
309 	[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
310 };
311 
312 /* per-DDT kstats */
313 typedef struct {
314 	/* total lookups and whether they returned new or existing entries */
315 	kstat_named_t dds_lookup;
316 	kstat_named_t dds_lookup_new;
317 	kstat_named_t dds_lookup_existing;
318 
319 	/* entries found on live tree, and if we had to wait for load */
320 	kstat_named_t dds_lookup_live_hit;
321 	kstat_named_t dds_lookup_live_wait;
322 	kstat_named_t dds_lookup_live_miss;
323 
324 	/* entries found on log trees */
325 	kstat_named_t dds_lookup_log_hit;
326 	kstat_named_t dds_lookup_log_active_hit;
327 	kstat_named_t dds_lookup_log_flushing_hit;
328 	kstat_named_t dds_lookup_log_miss;
329 
330 	/* entries found on store objects */
331 	kstat_named_t dds_lookup_stored_hit;
332 	kstat_named_t dds_lookup_stored_miss;
333 
334 	/* number of entries on log trees */
335 	kstat_named_t dds_log_active_entries;
336 	kstat_named_t dds_log_flushing_entries;
337 
338 	/* avg updated/flushed entries per txg */
339 	kstat_named_t dds_log_ingest_rate;
340 	kstat_named_t dds_log_flush_rate;
341 	kstat_named_t dds_log_flush_time_rate;
342 } ddt_kstats_t;
343 
344 static const ddt_kstats_t ddt_kstats_template = {
345 	{ "lookup",			KSTAT_DATA_UINT64 },
346 	{ "lookup_new",			KSTAT_DATA_UINT64 },
347 	{ "lookup_existing",		KSTAT_DATA_UINT64 },
348 	{ "lookup_live_hit",		KSTAT_DATA_UINT64 },
349 	{ "lookup_live_wait",		KSTAT_DATA_UINT64 },
350 	{ "lookup_live_miss",		KSTAT_DATA_UINT64 },
351 	{ "lookup_log_hit",		KSTAT_DATA_UINT64 },
352 	{ "lookup_log_active_hit",	KSTAT_DATA_UINT64 },
353 	{ "lookup_log_flushing_hit",	KSTAT_DATA_UINT64 },
354 	{ "lookup_log_miss",		KSTAT_DATA_UINT64 },
355 	{ "lookup_stored_hit",		KSTAT_DATA_UINT64 },
356 	{ "lookup_stored_miss",		KSTAT_DATA_UINT64 },
357 	{ "log_active_entries",		KSTAT_DATA_UINT64 },
358 	{ "log_flushing_entries",	KSTAT_DATA_UINT64 },
359 	{ "log_ingest_rate",		KSTAT_DATA_UINT32 },
360 	{ "log_flush_rate",		KSTAT_DATA_UINT32 },
361 	{ "log_flush_time_rate",	KSTAT_DATA_UINT32 },
362 };
363 
364 #ifdef _KERNEL
365 /*
366  * Hot-path lookup counters use wmsums to avoid cache line bouncing.
367  * DDT_KSTAT_BUMP: Increment a wmsum counter (lookup stats).
368  *
369  * Sync-only counters use direct kstat assignment (no atomics needed).
370  * DDT_KSTAT_SET: Set a value (log entry counts, rates).
371  * DDT_KSTAT_SUB: Subtract from a value (decrement log entry counts).
372  * DDT_KSTAT_ZERO: Zero a value (clear log entry counts).
373  */
374 #define	_DDT_KSTAT_STAT(ddt, stat) \
375 	&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
376 #define	DDT_KSTAT_BUMP(ddt, stat) \
377 	wmsum_add(&(ddt)->ddt_kstat_##stat, 1)
378 #define	DDT_KSTAT_SUB(ddt, stat, val) \
379 	do { *_DDT_KSTAT_STAT(ddt, stat) -= (val); } while (0)
380 #define	DDT_KSTAT_SET(ddt, stat, val) \
381 	do { *_DDT_KSTAT_STAT(ddt, stat) = (val); } while (0)
382 #define	DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0)
383 #else
384 #define	DDT_KSTAT_BUMP(ddt, stat) do {} while (0)
385 #define	DDT_KSTAT_SUB(ddt, stat, val) do {} while (0)
386 #define	DDT_KSTAT_SET(ddt, stat, val) do {} while (0)
387 #define	DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
388 #endif /* _KERNEL */
389 
390 
391 static void
392 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
393     dmu_tx_t *tx)
394 {
395 	spa_t *spa = ddt->ddt_spa;
396 	objset_t *os = ddt->ddt_os;
397 	uint64_t *objectp = &ddt->ddt_object[type][class];
398 	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
399 	    ZCHECKSUM_FLAG_DEDUP;
400 	char name[DDT_NAMELEN];
401 
402 	ASSERT3U(ddt->ddt_dir_object, >, 0);
403 
404 	ddt_object_name(ddt, type, class, name);
405 
406 	ASSERT0(*objectp);
407 	VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
408 	ASSERT3U(*objectp, !=, 0);
409 
410 	ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
411 
412 	VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
413 	    objectp, tx));
414 
415 	VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
416 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
417 	    &ddt->ddt_histogram[type][class], tx));
418 }
419 
420 static void
421 ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
422     dmu_tx_t *tx)
423 {
424 	spa_t *spa = ddt->ddt_spa;
425 	objset_t *os = ddt->ddt_os;
426 	uint64_t *objectp = &ddt->ddt_object[type][class];
427 	uint64_t count;
428 	char name[DDT_NAMELEN];
429 
430 	ASSERT3U(ddt->ddt_dir_object, >, 0);
431 
432 	ddt_object_name(ddt, type, class, name);
433 
434 	ASSERT3U(*objectp, !=, 0);
435 	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
436 	VERIFY0(ddt_object_count(ddt, type, class, &count));
437 	VERIFY0(count);
438 	VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
439 	VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
440 	VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
441 	memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
442 
443 	*objectp = 0;
444 }
445 
446 static int
447 ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
448 {
449 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
450 	dmu_object_info_t doi;
451 	uint64_t count;
452 	char name[DDT_NAMELEN];
453 	int error;
454 
455 	if (ddt->ddt_dir_object == 0) {
456 		/*
457 		 * If we're configured but the containing dir doesn't exist
458 		 * yet, then this object can't possibly exist either.
459 		 */
460 		ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
461 		return (SET_ERROR(ENOENT));
462 	}
463 
464 	ddt_object_name(ddt, type, class, name);
465 
466 	error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
467 	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
468 	if (error != 0)
469 		return (error);
470 
471 	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
472 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
473 	    &ddt->ddt_histogram[type][class]);
474 	if (error != 0)
475 		return (error);
476 
477 	/*
478 	 * Seed the cached statistics.
479 	 */
480 	error = ddt_object_info(ddt, type, class, &doi);
481 	if (error)
482 		return (error);
483 
484 	error = ddt_object_count(ddt, type, class, &count);
485 	if (error)
486 		return (error);
487 
488 	ddo->ddo_count = count;
489 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
490 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
491 
492 	return (0);
493 }
494 
495 static void
496 ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
497     dmu_tx_t *tx)
498 {
499 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
500 	dmu_object_info_t doi;
501 	uint64_t count;
502 	char name[DDT_NAMELEN];
503 
504 	ddt_object_name(ddt, type, class, name);
505 
506 	VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
507 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
508 	    &ddt->ddt_histogram[type][class], tx));
509 
510 	/*
511 	 * Cache DDT statistics; this is the only time they'll change.
512 	 */
513 	VERIFY0(ddt_object_info(ddt, type, class, &doi));
514 	VERIFY0(ddt_object_count(ddt, type, class, &count));
515 
516 	ddo->ddo_count = count;
517 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
518 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
519 }
520 
521 static boolean_t
522 ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
523 {
524 	return (!!ddt->ddt_object[type][class]);
525 }
526 
527 static int
528 ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
529     ddt_entry_t *dde)
530 {
531 	if (!ddt_object_exists(ddt, type, class))
532 		return (SET_ERROR(ENOENT));
533 
534 	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
535 	    ddt->ddt_object[type][class], &dde->dde_key,
536 	    dde->dde_phys, DDT_PHYS_SIZE(ddt)));
537 }
538 
539 static int
540 ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
541     const ddt_key_t *ddk)
542 {
543 	if (!ddt_object_exists(ddt, type, class))
544 		return (SET_ERROR(ENOENT));
545 
546 	return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os,
547 	    ddt->ddt_object[type][class], ddk));
548 }
549 
550 static void
551 ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
552     const ddt_key_t *ddk)
553 {
554 	if (!ddt_object_exists(ddt, type, class))
555 		return;
556 
557 	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
558 	    ddt->ddt_object[type][class], ddk);
559 }
560 
561 static void
562 ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
563 {
564 	if (!ddt_object_exists(ddt, type, class))
565 		return;
566 
567 	ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os,
568 	    ddt->ddt_object[type][class]);
569 }
570 
571 static int
572 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
573     const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
574 {
575 	ASSERT(ddt_object_exists(ddt, type, class));
576 
577 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
578 	    ddt->ddt_object[type][class], &ddlwe->ddlwe_key,
579 	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
580 }
581 
582 static int
583 ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
584     const ddt_key_t *ddk, dmu_tx_t *tx)
585 {
586 	ASSERT(ddt_object_exists(ddt, type, class));
587 
588 	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
589 	    ddt->ddt_object[type][class], ddk, tx));
590 }
591 
592 int
593 ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
594     uint64_t *walk, ddt_lightweight_entry_t *ddlwe)
595 {
596 	ASSERT(ddt_object_exists(ddt, type, class));
597 
598 	int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
599 	    ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
600 	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
601 	if (error == 0) {
602 		ddlwe->ddlwe_type = type;
603 		ddlwe->ddlwe_class = class;
604 		return (0);
605 	}
606 	return (error);
607 }
608 
609 int
610 ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
611     uint64_t *count)
612 {
613 	ASSERT(ddt_object_exists(ddt, type, class));
614 
615 	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
616 	    ddt->ddt_object[type][class], count));
617 }
618 
619 int
620 ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
621     dmu_object_info_t *doi)
622 {
623 	if (!ddt_object_exists(ddt, type, class))
624 		return (SET_ERROR(ENOENT));
625 
626 	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
627 	    doi));
628 }
629 
630 void
631 ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
632     char *name)
633 {
634 	(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
635 	    zio_checksum_table[ddt->ddt_checksum].ci_name,
636 	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
637 }
638 
639 void
640 ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
641     blkptr_t *bp, uint64_t txg)
642 {
643 	ASSERT3U(txg, !=, 0);
644 	ASSERT3U(v, <, DDT_PHYS_NONE);
645 	uint64_t phys_birth;
646 	const dva_t *dvap;
647 
648 	if (v == DDT_PHYS_FLAT) {
649 		phys_birth = ddp->ddp_flat.ddp_phys_birth;
650 		dvap = ddp->ddp_flat.ddp_dva;
651 	} else {
652 		phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
653 		dvap = ddp->ddp_trad[v].ddp_dva;
654 	}
655 
656 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
657 		bp->blk_dva[d] = dvap[d];
658 	BP_SET_BIRTH(bp, txg, phys_birth);
659 }
660 
661 /*
662  * The bp created via this function may be used for repairs and scrub, but it
663  * will be missing the salt / IV required to do a full decrypting read.
664  */
665 void
666 ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
667     const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
668 {
669 	BP_ZERO(bp);
670 
671 	if (ddp != NULL)
672 		ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
673 
674 	bp->blk_cksum = ddk->ddk_cksum;
675 
676 	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
677 	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
678 	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
679 	BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
680 	BP_SET_FILL(bp, 1);
681 	BP_SET_CHECKSUM(bp, checksum);
682 	BP_SET_TYPE(bp, DMU_OT_DEDUP);
683 	BP_SET_LEVEL(bp, 0);
684 	BP_SET_DEDUP(bp, 1);
685 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
686 }
687 
688 void
689 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
690 {
691 	ddk->ddk_cksum = bp->blk_cksum;
692 	ddk->ddk_prop = 0;
693 
694 	ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
695 
696 	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
697 	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
698 	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
699 	DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
700 }
701 
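/*
 * Fill any empty DVA slots in the phys from the given bp. This is the
 * "extend" step described in the header comment: under the flat phys
 * layout, a write carrying more copies than the entry currently holds only
 * allocates the shortfall, and the new DVAs are merged in here. For
 * encrypted blocks, the last DVA slot carries the salt/IV rather than a
 * location, so it is copied over separately instead of being matched.
 */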
702 void
703 ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
704 {
705 	ASSERT3U(v, <, DDT_PHYS_NONE);
706 	int bp_ndvas = BP_GET_NDVAS(bp);
707 	int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
708 	    SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
709 	dva_t *dvas = (v == DDT_PHYS_FLAT) ?
710 	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
711 
712 	int s = 0, d = 0;
713 	while (s < bp_ndvas && d < ddp_max_dvas) {
714 		if (DVA_IS_VALID(&dvas[d])) {
715 			d++;
716 			continue;
717 		}
718 		dvas[d] = bp->blk_dva[s];
719 		s++; d++;
720 	}
721 
722 	/*
723 	 * If the caller offered us more DVAs than we can fit, something has
724 	 * gone wrong in their accounting. zio_ddt_write() should never ask for
725 	 * more than we need.
726 	 */
727 	ASSERT3U(s, ==, bp_ndvas);
728 
729 	if (BP_IS_ENCRYPTED(bp))
730 		dvas[2] = bp->blk_dva[2];
731 
732 	if (ddt_phys_birth(ddp, v) == 0) {
733 		if (v == DDT_PHYS_FLAT) {
734 			ddp->ddp_flat.ddp_phys_birth =
735 			    BP_GET_PHYSICAL_BIRTH(bp);
736 		} else {
737 			ddp->ddp_trad[v].ddp_phys_birth =
738 			    BP_GET_PHYSICAL_BIRTH(bp);
739 		}
740 	}
741 }
742 
743 void
744 ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig,
745     ddt_phys_variant_t v)
746 {
747 	ASSERT3U(v, <, DDT_PHYS_NONE);
748 	dva_t *cur_dvas = (v == DDT_PHYS_FLAT) ?
749 	    cur->ddp_flat.ddp_dva : cur->ddp_trad[v].ddp_dva;
750 	dva_t *orig_dvas = (v == DDT_PHYS_FLAT) ?
751 	    orig->ddp_flat.ddp_dva : orig->ddp_trad[v].ddp_dva;
752 
753 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
754 		cur_dvas[d] = orig_dvas[d];
755 
756 	if (ddt_phys_birth(orig, v) == 0) {
757 		if (v == DDT_PHYS_FLAT)
758 			cur->ddp_flat.ddp_phys_birth = 0;
759 		else
760 			cur->ddp_trad[v].ddp_phys_birth = 0;
761 	}
762 }
763 
764 void
765 ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
766     ddt_phys_variant_t v)
767 {
768 	ASSERT3U(v, <, DDT_PHYS_NONE);
769 
770 	if (v == DDT_PHYS_FLAT)
771 		dst->ddp_flat = src->ddp_flat;
772 	else
773 		dst->ddp_trad[v] = src->ddp_trad[v];
774 }
775 
776 void
777 ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
778 {
779 	ASSERT3U(v, <, DDT_PHYS_NONE);
780 
781 	if (v == DDT_PHYS_FLAT)
782 		memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
783 	else
784 		memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
785 }
786 
787 static uint64_t
788 ddt_class_start(void)
789 {
790 	uint64_t start = gethrestime_sec();
791 
792 	if (unlikely(ddt_prune_artificial_age)) {
793 		/*
794 		 * debug aid -- simulate a wider distribution
795 		 * so we don't have to wait for an aged DDT
796 		 * to test prune.
797 		 */
798 		int range = 1 << 21;
799 		int percent = random_in_range(100);
800 		if (percent < 50) {
801 			range = range >> 4;
802 		} else if (percent > 75) {
803 			range /= 2;
804 		}
805 		start -= random_in_range(range);
806 	}
807 
808 	return (start);
809 }
810 
811 void
812 ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
813 {
814 	ASSERT3U(v, <, DDT_PHYS_NONE);
815 
816 	if (v == DDT_PHYS_FLAT)
817 		ddp->ddp_flat.ddp_refcnt++;
818 	else
819 		ddp->ddp_trad[v].ddp_refcnt++;
820 }
821 
822 uint64_t
823 ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
824 {
825 	ASSERT3U(v, <, DDT_PHYS_NONE);
826 
827 	uint64_t *refcntp;
828 
829 	if (v == DDT_PHYS_FLAT)
830 		refcntp = &ddp->ddp_flat.ddp_refcnt;
831 	else
832 		refcntp = &ddp->ddp_trad[v].ddp_refcnt;
833 
834 	ASSERT3U(*refcntp, >, 0);
835 	(*refcntp)--;
836 	return (*refcntp);
837 }
838 
839 static void
840 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
841     ddt_phys_variant_t v, uint64_t txg)
842 {
843 	blkptr_t blk;
844 
845 	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
846 
847 	/*
848 	 * We clear the dedup bit so that zio_free() will actually free the
849 	 * space, rather than just decrementing the refcount in the DDT.
850 	 */
851 	BP_SET_DEDUP(&blk, 0);
852 
853 	ddt_phys_clear(ddp, v);
854 	zio_free(ddt->ddt_spa, txg, &blk);
855 }
856 
857 uint64_t
858 ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
859 {
860 	ASSERT3U(v, <, DDT_PHYS_NONE);
861 
862 	if (v == DDT_PHYS_FLAT)
863 		return (ddp->ddp_flat.ddp_phys_birth);
864 	else
865 		return (ddp->ddp_trad[v].ddp_phys_birth);
866 }
867 
868 int
869 ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
870 {
871 	ASSERT3U(v, <, DDT_PHYS_NONE);
872 
873 	const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
874 	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
875 
876 	return (DVA_GET_GANG(&dvas[0]));
877 }
878 
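/*
 * Count the allocated DVAs in the phys. For encrypted blocks, the third
 * DVA slot holds the salt/IV rather than a real location, so it is
 * excluded from the count.
 */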
879 int
880 ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
881     boolean_t encrypted)
882 {
883 	ASSERT3U(v, <, DDT_PHYS_NONE);
884 
885 	const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
886 	    ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
887 
888 	return (DVA_IS_VALID(&dvas[0]) +
889 	    DVA_IS_VALID(&dvas[1]) +
890 	    DVA_IS_VALID(&dvas[2]) * !encrypted);
891 }
892 
893 ddt_phys_variant_t
894 ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
895 {
896 	if (dde == NULL)
897 		return (DDT_PHYS_NONE);
898 
899 	const ddt_univ_phys_t *ddp = dde->dde_phys;
900 
901 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
902 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
903 		    BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
904 			return (DDT_PHYS_FLAT);
905 		}
906 	} else /* traditional phys */ {
907 		for (int p = 0; p < DDT_PHYS_MAX; p++) {
908 			if (DVA_EQUAL(BP_IDENTITY(bp),
909 			    &ddp->ddp_trad[p].ddp_dva[0]) &&
910 			    BP_GET_PHYSICAL_BIRTH(bp) ==
911 			    ddp->ddp_trad[p].ddp_phys_birth) {
912 				return (p);
913 			}
914 		}
915 	}
916 	return (DDT_PHYS_NONE);
917 }
918 
919 uint64_t
920 ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
921 {
922 	ASSERT3U(v, <, DDT_PHYS_NONE);
923 
924 	if (v == DDT_PHYS_FLAT)
925 		return (ddp->ddp_flat.ddp_refcnt);
926 	else
927 		return (ddp->ddp_trad[v].ddp_refcnt);
928 }
929 
930 uint64_t
931 ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
932 {
933 	uint64_t refcnt = 0;
934 
935 	if (ddt->ddt_flags & DDT_FLAG_FLAT)
936 		refcnt = ddp->ddp_flat.ddp_refcnt;
937 	else
938 		for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
939 			refcnt += ddp->ddp_trad[v].ddp_refcnt;
940 
941 	return (refcnt);
942 }
943 
944 ddt_t *
945 ddt_select(spa_t *spa, const blkptr_t *bp)
946 {
947 	ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
948 	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
949 }
950 
951 void
952 ddt_enter(ddt_t *ddt)
953 {
954 	mutex_enter(&ddt->ddt_lock);
955 }
956 
957 void
958 ddt_exit(ddt_t *ddt)
959 {
960 	mutex_exit(&ddt->ddt_lock);
961 }
962 
963 void
964 ddt_init(void)
965 {
966 	ddt_cache = kmem_cache_create("ddt_cache",
967 	    sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
968 	ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
969 	    DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
970 	ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
971 	    DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
972 
973 	ddt_log_init();
974 }
975 
976 void
977 ddt_fini(void)
978 {
979 	ddt_log_fini();
980 
981 	kmem_cache_destroy(ddt_entry_trad_cache);
982 	kmem_cache_destroy(ddt_entry_flat_cache);
983 	kmem_cache_destroy(ddt_cache);
984 }
985 
986 static ddt_entry_t *
987 ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
988 {
989 	ddt_entry_t *dde;
990 
991 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
992 		dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
993 		memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
994 	} else {
995 		dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
996 		memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
997 	}
998 
999 	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
1000 
1001 	dde->dde_key = *ddk;
1002 
1003 	return (dde);
1004 }
1005 
1006 void
1007 ddt_alloc_entry_io(ddt_entry_t *dde)
1008 {
1009 	if (dde->dde_io != NULL)
1010 		return;
1011 
1012 	dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
1013 	mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
1014 }
1015 
1016 static void
1017 ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
1018 {
1019 	if (dde->dde_io != NULL) {
1020 		for (int p = 0; p < DDT_NPHYS(ddt); p++)
1021 			ASSERT0P(dde->dde_io->dde_lead_zio[p]);
1022 
1023 		if (dde->dde_io->dde_repair_abd != NULL)
1024 			abd_free(dde->dde_io->dde_repair_abd);
1025 
1026 		mutex_destroy(&dde->dde_io->dde_io_lock);
1027 		kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
1028 	}
1029 
1030 	cv_destroy(&dde->dde_cv);
1031 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
1032 	    ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
1033 }
1034 
1035 void
1036 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
1037 {
1038 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1039 
1040 	/* Entry is still in the log, so charge the entry back to it */
1041 	if (dde->dde_flags & DDE_FLAG_LOGGED) {
1042 		ddt_lightweight_entry_t ddlwe;
1043 		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
1044 		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
1045 	}
1046 
1047 	avl_remove(&ddt->ddt_tree, dde);
1048 	ddt_free(ddt, dde);
1049 }
1050 
1051 /*
1052  * We're considered over quota when we hit 85% full, or for larger drives,
1053  * when there is less than 8GB free.
1054  */
1055 static boolean_t
1056 ddt_special_over_quota(metaslab_class_t *mc)
1057 {
1058 	uint64_t allocated = metaslab_class_get_alloc(mc);
1059 	uint64_t capacity = metaslab_class_get_space(mc);
1060 	uint64_t limit = MAX(capacity * 85 / 100,
1061 	    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
1062 	return (allocated >= limit);
1063 }
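
/*
 * Worked example for the limit above (illustrative): the 8GB headroom rule
 * dominates once 15% of capacity exceeds 8GB, i.e. for classes larger than
 * ~53GB. A 40GB class is over quota at 34GB allocated (the 85% rule); a 1TB
 * class is over quota when less than 8GB remains free.
 */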
1064 
1065 /*
1066  * Check if the DDT is over its quota.  This can be due to a few conditions:
1067  *   1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
1068  *       exceeds this limit
1069  *
1070  *   2. 'dedup_table_quota' property is set to automatic and
1071  *      a. the dedup or special allocation class could not satisfy a DDT
1072  *         allocation in a recent transaction
1073  *      b. the dedup or special allocation class has exceeded its 85% limit
1074  */
1075 static boolean_t
1076 ddt_over_quota(spa_t *spa)
1077 {
1078 	if (spa->spa_dedup_table_quota == 0)
1079 		return (B_FALSE);
1080 
1081 	if (spa->spa_dedup_table_quota != UINT64_MAX)
1082 		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
1083 
1084 	/*
1085 	 * Over quota if we have to allocate outside of the dedup/special class.
1086 	 */
1087 	if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
1088 	    dedup_class_wait_txgs) {
1089 		/* Waiting for some deferred frees to be processed */
1090 		return (B_TRUE);
1091 	}
1092 
1093 	/*
1094 	 * For automatic quota, the table size is limited by the dedup or special class.
1095 	 */
1096 	if (spa_has_dedup(spa))
1097 		return (ddt_special_over_quota(spa_dedup_class(spa)));
1098 	else if (spa_special_has_ddt(spa))
1099 		return (ddt_special_over_quota(spa_special_class(spa)));
1100 
1101 	return (B_FALSE);
1102 }
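
/*
 * Usage note (illustrative; see zpoolprops(7) for the authoritative list of
 * values): the quota checked above comes from the dedup_table_quota pool
 * property, e.g.
 *
 *	zpool set dedup_table_quota=10G tank	(fixed size limit)
 *	zpool set dedup_table_quota=auto tank	(limit by dedup/special class)
 *	zpool set dedup_table_quota=none tank	(no limit)
 */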
1103 
1104 void
1105 ddt_prefetch_all(spa_t *spa)
1106 {
1107 	/*
1108 	 * Load all DDT entries for each type/class combination. This is
1109 	 * intended to perform a prefetch on all such blocks. For the same
1110 	 * reason that ddt_prefetch isn't locked, this is also not locked.
1111 	 */
1112 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1113 		ddt_t *ddt = spa->spa_ddt[c];
1114 		if (!ddt)
1115 			continue;
1116 
1117 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1118 			for (ddt_class_t class = 0; class < DDT_CLASSES;
1119 			    class++) {
1120 				ddt_object_prefetch_all(ddt, type, class);
1121 			}
1122 		}
1123 	}
1124 }
1125 
1126 static int ddt_configure(ddt_t *ddt, boolean_t new);
1127 
1128 /*
1129  * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
1130  * to the ones in the entry. If they're different, then the passed-in BP is
1131  * from a previous generation of this entry (i.e., was previously pruned) and we
1132  * have to act like the entry doesn't exist at all.
1133  *
1134  * This should only happen during a lookup to free the block (zio_ddt_free()).
1135  *
1136  * XXX this is similar in spirit to ddt_phys_select(), maybe can combine
1137  *       -- robn, 2024-02-09
1138  */
1139 static boolean_t
1140 ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
1141 {
1142 	/* If the BP has no DVAs, then this entry is good */
1143 	uint_t ndvas = BP_GET_NDVAS(bp);
1144 	if (ndvas == 0)
1145 		return (B_TRUE);
1146 
1147 	/*
1148 	 * Only checking the phys for the copies. For flat, there's only one;
1149 	 * for trad it'll be the one that has the matching set of DVAs.
1150 	 */
1151 	const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
1152 	    dde->dde_phys->ddp_flat.ddp_dva :
1153 	    dde->dde_phys->ddp_trad[ndvas].ddp_dva;
1154 
1155 	/*
1156 	 * Compare entry DVAs with the BP. They should all be there, but
1157 	 * there's not really anything we can do if it's only partial anyway;
1158 	 * that's an error somewhere else, maybe long ago.
1159 	 */
1160 	uint_t d;
1161 	for (d = 0; d < ndvas; d++)
1162 		if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d]))
1163 			return (B_FALSE);
1164 	ASSERT3U(d, ==, ndvas);
1165 
1166 	return (B_TRUE);
1167 }
1168 
1169 ddt_entry_t *
1170 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
1171 {
1172 	spa_t *spa = ddt->ddt_spa;
1173 	ddt_key_t search;
1174 	ddt_entry_t *dde;
1175 	ddt_type_t type;
1176 	ddt_class_t class;
1177 	avl_index_t where;
1178 	int error;
1179 
1180 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1181 
1182 	if (unlikely(ddt->ddt_version == DDT_VERSION_UNCONFIGURED)) {
1183 		/*
1184 		 * This is the first use of this DDT since the pool was
1185 		 * created; finish getting it ready for use.
1186 		 */
1187 		VERIFY0(ddt_configure(ddt, B_TRUE));
1188 		ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
1189 	}
1190 
1191 	DDT_KSTAT_BUMP(ddt, dds_lookup);
1192 
1193 	ddt_key_fill(&search, bp);
1194 
1195 	/* Find an existing live entry */
1196 	dde = avl_find(&ddt->ddt_tree, &search, &where);
1197 	if (dde != NULL) {
1198 		/* If we went over quota, act like we didn't find it */
1199 		if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
1200 			return (NULL);
1201 
1202 		/* If it's already loaded, we can just return it. */
1203 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
1204 		if (dde->dde_flags & DDE_FLAG_LOADED) {
1205 			if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1206 				return (dde);
1207 			return (NULL);
1208 		}
1209 
1210 		/* Someone else is loading it, wait for it. */
1211 		dde->dde_waiters++;
1212 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait);
1213 		while (!(dde->dde_flags & DDE_FLAG_LOADED))
1214 			cv_wait(&dde->dde_cv, &ddt->ddt_lock);
1215 		dde->dde_waiters--;
1216 
1217 		/* Loaded but over quota, forget we were ever here */
1218 		if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
1219 			if (dde->dde_waiters == 0) {
1220 				avl_remove(&ddt->ddt_tree, dde);
1221 				ddt_free(ddt, dde);
1222 			}
1223 			return (NULL);
1224 		}
1225 
1226 		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1227 
1228 		/* Make sure the loaded entry matches the BP */
1229 		if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1230 			return (dde);
1231 		return (NULL);
1232 	} else
1233 		DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
1234 
1235 	/* Time to make a new entry. */
1236 	dde = ddt_alloc(ddt, &search);
1237 
1238 	/* Record the time this class was created (used by ddt prune) */
1239 	if (ddt->ddt_flags & DDT_FLAG_FLAT)
1240 		dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
1241 
1242 	avl_insert(&ddt->ddt_tree, dde, where);
1243 
1244 	/* If it's in the log tree, we can "load" it from there */
1245 	if (ddt->ddt_flags & DDT_FLAG_LOG) {
1246 		ddt_lightweight_entry_t ddlwe;
1247 
1248 		if (ddt_log_find_key(ddt, &search, &ddlwe)) {
1249 			/*
1250 			 * See if we have the key first, and if so, set up
1251 			 * the entry.
1252 			 */
1253 			dde->dde_type = ddlwe.ddlwe_type;
1254 			dde->dde_class = ddlwe.ddlwe_class;
1255 			memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
1256 			    DDT_PHYS_SIZE(ddt));
1257 			/* Whatever we found isn't valid for this BP, eject */
1258 			if (verify &&
1259 			    !ddt_entry_lookup_is_valid(ddt, bp, dde)) {
1260 				avl_remove(&ddt->ddt_tree, dde);
1261 				ddt_free(ddt, dde);
1262 				return (NULL);
1263 			}
1264 
1265 			/* Remove it and count it */
1266 			if (ddt_log_remove_key(ddt,
1267 			    ddt->ddt_log_active, &search)) {
1268 				DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
1269 			} else {
1270 				VERIFY(ddt_log_remove_key(ddt,
1271 				    ddt->ddt_log_flushing, &search));
1272 				DDT_KSTAT_BUMP(ddt,
1273 				    dds_lookup_log_flushing_hit);
1274 			}
1275 
1276 			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
1277 
1278 			DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
1279 			DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1280 
1281 			return (dde);
1282 		}
1283 
1284 		DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
1285 	}
1286 
1287 	/*
1288 	 * ddt_tree is now stable, so unlock and let everyone else keep moving.
1289 	 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
1290 	 * and go to sleep waiting for it above.
1291 	 */
1292 	ddt_exit(ddt);
1293 
1294 	/* Search all store objects for the entry. */
1295 	error = ENOENT;
1296 	for (type = 0; type < DDT_TYPES; type++) {
1297 		for (class = 0; class < DDT_CLASSES; class++) {
1298 			error = ddt_object_lookup(ddt, type, class, dde);
1299 			if (error != ENOENT) {
1300 				ASSERT0(error);
1301 				break;
1302 			}
1303 		}
1304 		if (error != ENOENT)
1305 			break;
1306 	}
1307 
1308 	ddt_enter(ddt);
1309 
1310 	ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));
1311 
1312 	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
1313 	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
1314 
1315 	boolean_t valid = B_TRUE;
1316 
1317 	if (dde->dde_type == DDT_TYPES &&
1318 	    dde->dde_class == DDT_CLASSES &&
1319 	    ddt_over_quota(spa)) {
1320 		/* Over quota. If no one is waiting, clean up right now. */
1321 		if (dde->dde_waiters == 0) {
1322 			avl_remove(&ddt->ddt_tree, dde);
1323 			ddt_free(ddt, dde);
1324 			return (NULL);
1325 		}
1326 
1327 		/* Flag cleanup required */
1328 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
1329 	} else if (error == 0) {
1330 		/*
1331 		 * If what we loaded is no good for this BP and there's no one
1332 		 * waiting for it, we can just remove it and get out. If it's no
1333 		 * good but there are waiters, we have to leave it, because we
1334 		 * don't know what they want. If it's not needed we'll end up
1335 		 * taking an entry log/sync, but it can only happen if more
1336 		 * than one previous version of this block is being deleted at
1337 		 * the same time. This is extremely unlikely to happen and not
1338 		 * worth the effort to deal with without taking an entry
1339 		 * update.
1340 		 */
1341 		valid = !verify || ddt_entry_lookup_is_valid(ddt, bp, dde);
1342 		if (!valid && dde->dde_waiters == 0) {
1343 			avl_remove(&ddt->ddt_tree, dde);
1344 			ddt_free(ddt, dde);
1345 			return (NULL);
1346 		}
1347 
1348 		DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
1349 		DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1350 
1351 		/*
1352 		 * The histograms only track inactive (stored or logged) blocks.
1353 		 * We've just put an entry onto the live list, so we need to
1354 		 * remove its counts. When it's synced back, it'll be re-added
1355 		 * to the right one.
1356 		 *
1357 		 * We only do this when we successfully found it in the store.
1358 		 * error == ENOENT means this is a new entry, and so its already
1359 		 * error == ENOENT means this is a new entry, and so it's already
1360 		 */
1361 		ddt_histogram_t *ddh =
1362 		    &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
1363 
1364 		ddt_lightweight_entry_t ddlwe;
1365 		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
1366 		ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
1367 	} else {
1368 		DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss);
1369 		DDT_KSTAT_BUMP(ddt, dds_lookup_new);
1370 	}
1371 
1372 	/* Entry loaded, everyone can proceed now */
1373 	dde->dde_flags |= DDE_FLAG_LOADED;
1374 	cv_broadcast(&dde->dde_cv);
1375 
1376 	if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
1377 		return (NULL);
1378 
1379 	return (dde);
1380 }
1381 
1382 void
1383 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
1384 {
1385 	ddt_t *ddt;
1386 	ddt_key_t ddk;
1387 
1388 	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
1389 		return;
1390 
1391 	/*
1392 	 * We only remove the DDT once all tables are empty and only
1393 	 * prefetch dedup blocks when there are entries in the DDT.
1394 	 * Thus no locking is required as the DDT can't disappear on us.
1395 	 */
1396 	ddt = ddt_select(spa, bp);
1397 	ddt_key_fill(&ddk, bp);
1398 
1399 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1400 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1401 			ddt_object_prefetch(ddt, type, class, &ddk);
1402 		}
1403 	}
1404 }
1405 
1406 /*
1407  * ddt_key_t comparison. Any struct wanting to make use of this function must
1408  * have the key as the first element. Casts it to N uint64_ts, and checks until
1409  * we find there's a difference. This is intended to match how ddt_zap.c drives
1410  * the ZAPs (first uint64_t as the key prehash), which will minimise the number
1411  * of ZAP blocks touched when flushing logged entries from an AVL walk. This is
1412  * not an invariant for this function though, should you wish to change it.
1413  */
1414 int
1415 ddt_key_compare(const void *x1, const void *x2)
1416 {
1417 	const uint64_t *k1 = (const uint64_t *)x1;
1418 	const uint64_t *k2 = (const uint64_t *)x2;
1419 
1420 	int cmp;
1421 	for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++)
1422 		if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0))
1423 			return (cmp);
1424 
1425 	return (0);
1426 }
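
/*
 * Illustrative use (an assumption about setup elsewhere, not a quote from
 * this file): the live tree is an AVL tree keyed by ddt_key_t, so it is
 * created with this comparator, along the lines of:
 *
 *	avl_create(&ddt->ddt_tree, ddt_key_compare,
 *	    DDT_ENTRY_SIZE(ddt), offsetof(ddt_entry_t, dde_node));
 */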
1427 
1428 /* Create the containing dir for this DDT and bump the feature count */
1429 static void
1430 ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx)
1431 {
1432 	ASSERT0(ddt->ddt_dir_object);
1433 	ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1434 
1435 	char name[DDT_NAMELEN];
1436 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1437 	    zio_checksum_table[ddt->ddt_checksum].ci_name);
1438 
1439 	ddt->ddt_dir_object = zap_create_link(ddt->ddt_os,
1440 	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx);
1441 
1442 	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION,
1443 	    sizeof (uint64_t), 1, &ddt->ddt_version, tx));
1444 	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS,
1445 	    sizeof (uint64_t), 1, &ddt->ddt_flags, tx));
1446 
1447 	spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1448 }
1449 
1450 /* Destroy the containing dir and deactivate the feature */
1451 static void
1452 ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
1453 {
1454 	ASSERT3U(ddt->ddt_dir_object, !=, 0);
1455 	ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT);
1456 	ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1457 
1458 	char name[DDT_NAMELEN];
1459 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1460 	    zio_checksum_table[ddt->ddt_checksum].ci_name);
1461 
1462 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1463 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1464 			ASSERT(!ddt_object_exists(ddt, type, class));
1465 		}
1466 	}
1467 
1468 	ddt_log_destroy(ddt, tx);
1469 
1470 	uint64_t count;
1471 	ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
1472 	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
1473 	    DDT_DIR_VERSION));
1474 	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS));
1475 	ASSERT3U(count, ==, 2);
1476 
1477 	VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
1478 	VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx));
1479 
1480 	ddt->ddt_dir_object = 0;
1481 
1482 	spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1483 }
1484 
1485 /*
1486  * Determine version, flags and on-disk layout from what's already stored. If there's
1487  * nothing stored, then if new is false, returns ENOENT, and if true, selects
1488  * based on pool config.
1489  */
1490 static int
1491 ddt_configure(ddt_t *ddt, boolean_t new)
1492 {
1493 	spa_t *spa = ddt->ddt_spa;
1494 	char name[DDT_NAMELEN];
1495 	int error;
1496 
1497 	ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE);
1498 
1499 	boolean_t fdt_enabled =
1500 	    spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP);
1501 	boolean_t fdt_active =
1502 	    spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP);
1503 
1504 	/*
1505 	 * First, look for the global DDT stats object. If it's not there, then
1506 	 * there's never been a DDT written before ever, and we know we're
1507 	 * starting from scratch.
1508 	 */
1509 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1510 	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1511 	    &spa->spa_ddt_stat_object);
1512 	if (error != 0) {
1513 		if (error != ENOENT)
1514 			return (error);
1515 		goto not_found;
1516 	}
1517 
1518 	if (fdt_active) {
1519 		/*
1520 		 * Now look for a DDT directory. If it exists, then it has
1521 		 * everything we need.
1522 		 */
1523 		snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1524 		    zio_checksum_table[ddt->ddt_checksum].ci_name);
1525 
1526 		error = zap_lookup(spa->spa_meta_objset,
1527 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1528 		    &ddt->ddt_dir_object);
1529 		if (error == 0) {
1530 			ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os);
1531 
1532 			error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1533 			    DDT_DIR_VERSION, sizeof (uint64_t), 1,
1534 			    &ddt->ddt_version);
1535 			if (error != 0)
1536 				return (error);
1537 
1538 			error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1539 			    DDT_DIR_FLAGS, sizeof (uint64_t), 1,
1540 			    &ddt->ddt_flags);
1541 			if (error != 0)
1542 				return (error);
1543 
1544 			if (ddt->ddt_version != DDT_VERSION_FDT) {
1545 				zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1546 				    "unknown version %llu", spa_name(spa),
1547 				    name, (u_longlong_t)ddt->ddt_version);
1548 				return (SET_ERROR(EINVAL));
1549 			}
1550 
1551 			if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) {
1552 				zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1553 				    "version=%llu unknown flags %llx",
1554 				    spa_name(spa), name,
1555 				    (u_longlong_t)ddt->ddt_flags,
1556 				    (u_longlong_t)ddt->ddt_version);
1557 				return (SET_ERROR(EINVAL));
1558 			}
1559 
1560 			return (0);
1561 		}
1562 		if (error != ENOENT)
1563 			return (error);
1564 	}
1565 
1566 	/* Any object in the root indicates a traditional setup. */
1567 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1568 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1569 			ddt_object_name(ddt, type, class, name);
1570 			uint64_t obj;
1571 			error = zap_lookup(spa->spa_meta_objset,
1572 			    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t),
1573 			    1, &obj);
1574 			if (error == ENOENT)
1575 				continue;
1576 			if (error != 0)
1577 				return (error);
1578 
1579 			ddt->ddt_version = DDT_VERSION_LEGACY;
1580 			ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1581 			ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1582 
1583 			return (0);
1584 		}
1585 	}
1586 
1587 not_found:
1588 	if (!new)
1589 		return (SET_ERROR(ENOENT));
1590 
1591 	/* Nothing on disk, so set up for the best version we can */
1592 	if (fdt_enabled) {
1593 		ddt->ddt_version = DDT_VERSION_FDT;
1594 		ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1595 		ddt->ddt_dir_object = 0; /* create on first use */
1596 	} else {
1597 		ddt->ddt_version = DDT_VERSION_LEGACY;
1598 		ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1599 		ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1600 	}
1601 
1602 	return (0);
1603 }
1604 
1605 static int
1606 ddt_kstat_update(kstat_t *ksp, int rw)
1607 {
1608 	ddt_t *ddt = ksp->ks_private;
1609 	ddt_kstats_t *dds = ksp->ks_data;
1610 
1611 	if (rw == KSTAT_WRITE)
1612 		return (SET_ERROR(EACCES));
1613 
1614 	/* Aggregate wmsum counters for lookup stats */
1615 	dds->dds_lookup.value.ui64 =
1616 	    wmsum_value(&ddt->ddt_kstat_dds_lookup);
1617 	dds->dds_lookup_live_hit.value.ui64 =
1618 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_live_hit);
1619 	dds->dds_lookup_live_wait.value.ui64 =
1620 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_live_wait);
1621 	dds->dds_lookup_live_miss.value.ui64 =
1622 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_live_miss);
1623 	dds->dds_lookup_existing.value.ui64 =
1624 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_existing);
1625 	dds->dds_lookup_new.value.ui64 =
1626 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_new);
1627 	dds->dds_lookup_log_hit.value.ui64 =
1628 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_hit);
1629 	dds->dds_lookup_log_active_hit.value.ui64 =
1630 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1631 	dds->dds_lookup_log_flushing_hit.value.ui64 =
1632 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1633 	dds->dds_lookup_log_miss.value.ui64 =
1634 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_log_miss);
1635 	dds->dds_lookup_stored_hit.value.ui64 =
1636 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_hit);
1637 	dds->dds_lookup_stored_miss.value.ui64 =
1638 	    wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_miss);
1639 
1640 	/* Sync-only counters are already set directly in kstats */
1641 
1642 	return (0);
1643 }
1644 
1645 static void
1646 ddt_table_alloc_kstats(ddt_t *ddt)
1647 {
1648 	char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
1649 	char *name = kmem_asprintf("ddt_stats_%s",
1650 	    zio_checksum_table[ddt->ddt_checksum].ci_name);
1651 
1652 	/* Initialize wmsums for lookup counters */
1653 	wmsum_init(&ddt->ddt_kstat_dds_lookup, 0);
1654 	wmsum_init(&ddt->ddt_kstat_dds_lookup_live_hit, 0);
1655 	wmsum_init(&ddt->ddt_kstat_dds_lookup_live_wait, 0);
1656 	wmsum_init(&ddt->ddt_kstat_dds_lookup_live_miss, 0);
1657 	wmsum_init(&ddt->ddt_kstat_dds_lookup_existing, 0);
1658 	wmsum_init(&ddt->ddt_kstat_dds_lookup_new, 0);
1659 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_hit, 0);
1660 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_active_hit, 0);
1661 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_flushing_hit, 0);
1662 	wmsum_init(&ddt->ddt_kstat_dds_lookup_log_miss, 0);
1663 	wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_hit, 0);
1664 	wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_miss, 0);
1665 
1666 	ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED,
1667 	    sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1668 	if (ddt->ddt_ksp != NULL) {
1669 		ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP);
1670 		memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t));
1671 		ddt->ddt_ksp->ks_data = dds;
1672 		ddt->ddt_ksp->ks_update = ddt_kstat_update;
1673 		ddt->ddt_ksp->ks_private = ddt;
1674 		kstat_install(ddt->ddt_ksp);
1675 	}
1676 
1677 	kmem_strfree(name);
1678 	kmem_strfree(mod);
1679 }
1680 
1681 static ddt_t *
1682 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
1683 {
1684 	ddt_t *ddt;
1685 
1686 	ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
1687 	memset(ddt, 0, sizeof (ddt_t));
1688 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
1689 	avl_create(&ddt->ddt_tree, ddt_key_compare,
1690 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1691 	avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
1692 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1693 
1694 	ddt->ddt_checksum = c;
1695 	ddt->ddt_spa = spa;
1696 	ddt->ddt_os = spa->spa_meta_objset;
1697 	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
1698 	ddt->ddt_log_flush_pressure = 10;
1699 
1700 	ddt_log_alloc(ddt);
1701 	ddt_table_alloc_kstats(ddt);
1702 
1703 	return (ddt);
1704 }
1705 
1706 static void
1707 ddt_table_free(ddt_t *ddt)
1708 {
1709 	if (ddt->ddt_ksp != NULL) {
1710 		kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
1711 		ddt->ddt_ksp->ks_data = NULL;
1712 		kstat_delete(ddt->ddt_ksp);
1713 	}
1714 
1715 	/* Cleanup wmsums for lookup counters */
1716 	wmsum_fini(&ddt->ddt_kstat_dds_lookup);
1717 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_hit);
1718 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_wait);
1719 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_miss);
1720 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_existing);
1721 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_new);
1722 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_hit);
1723 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1724 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1725 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_miss);
1726 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_hit);
1727 	wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_miss);
1728 
1729 	ddt_log_free(ddt);
1730 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
1731 	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
1732 	avl_destroy(&ddt->ddt_tree);
1733 	avl_destroy(&ddt->ddt_repair_tree);
1734 	mutex_destroy(&ddt->ddt_lock);
1735 	kmem_cache_free(ddt_cache, ddt);
1736 }
1737 
1738 void
1739 ddt_create(spa_t *spa)
1740 {
1741 	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
1742 
1743 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1744 		if (DDT_CHECKSUM_VALID(c))
1745 			spa->spa_ddt[c] = ddt_table_alloc(spa, c);
1746 	}
1747 }
1748 
1749 int
1750 ddt_load(spa_t *spa)
1751 {
1752 	int error;
1753 
1754 	ddt_create(spa);
1755 
1756 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1757 	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1758 	    &spa->spa_ddt_stat_object);
1759 	if (error)
1760 		return (error == ENOENT ? 0 : error);
1761 
1762 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1763 		if (!DDT_CHECKSUM_VALID(c))
1764 			continue;
1765 
1766 		ddt_t *ddt = spa->spa_ddt[c];
1767 		error = ddt_configure(ddt, B_FALSE);
1768 		if (error == ENOENT)
1769 			continue;
1770 		if (error != 0)
1771 			return (error);
1772 
1773 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1774 			for (ddt_class_t class = 0; class < DDT_CLASSES;
1775 			    class++) {
1776 				error = ddt_object_load(ddt, type, class);
1777 				if (error != 0 && error != ENOENT)
1778 					return (error);
1779 			}
1780 		}
1781 
1782 		if (ddt->ddt_flags & DDT_FLAG_LOG) {
1783 			error = ddt_log_load(ddt);
1784 			if (error != 0 && error != ENOENT)
1785 				return (error);
1786 		}
1787 
1788 		DDT_KSTAT_SET(ddt, dds_log_active_entries,
1789 		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));
1790 		DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
1791 		    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
1792 
1793 		/*
1794 		 * Seed the cached histograms.
1795 		 */
1796 		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
1797 		    sizeof (ddt->ddt_histogram));
1798 	}
1799 
1800 	spa->spa_dedup_dspace = ~0ULL;
1801 	spa->spa_dedup_dsize = ~0ULL;
1802 
1803 	return (0);
1804 }
1805 
1806 void
1807 ddt_unload(spa_t *spa)
1808 {
1809 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1810 		if (spa->spa_ddt[c]) {
1811 			ddt_table_free(spa->spa_ddt[c]);
1812 			spa->spa_ddt[c] = NULL;
1813 		}
1814 	}
1815 }
1816 
1817 boolean_t
1818 ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
1819 {
1820 	ddt_t *ddt;
1821 	ddt_key_t ddk;
1822 
1823 	if (!BP_GET_DEDUP(bp))
1824 		return (B_FALSE);
1825 
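	/*
	 * DDT_CLASS_UNIQUE is the last (lowest-priority) class, so asking
	 * for everything up to UNIQUE covers every class; any dedup block
	 * trivially qualifies without a lookup.
	 */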
1826 	if (max_class == DDT_CLASS_UNIQUE)
1827 		return (B_TRUE);
1828 
1829 	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
1830 
1831 	ddt_key_fill(&ddk, bp);
1832 
1833 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1834 		for (ddt_class_t class = 0; class <= max_class; class++) {
1835 			if (ddt_object_contains(ddt, type, class, &ddk) == 0)
1836 				return (B_TRUE);
1837 		}
1838 	}
1839 
1840 	return (B_FALSE);
1841 }
1842 
1843 ddt_entry_t *
1844 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
1845 {
1846 	ddt_key_t ddk;
1847 	ddt_entry_t *dde;
1848 
1849 	ddt_key_fill(&ddk, bp);
1850 
1851 	dde = ddt_alloc(ddt, &ddk);
1852 	ddt_alloc_entry_io(dde);
1853 
1854 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1855 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1856 			/*
1857 			 * We can only do repair if there are multiple copies
1858 			 * of the block.  For anything in the UNIQUE class,
1859 			 * there's definitely only one copy, so don't even try.
1860 			 */
1861 			if (class != DDT_CLASS_UNIQUE &&
1862 			    ddt_object_lookup(ddt, type, class, dde) == 0)
1863 				return (dde);
1864 		}
1865 	}
1866 
1867 	memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));
1868 
1869 	return (dde);
1870 }
1871 
1872 void
1873 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
1874 {
1875 	avl_index_t where;
1876 
1877 	ddt_enter(ddt);
1878 
1879 	if (dde->dde_io->dde_repair_abd != NULL &&
1880 	    spa_writeable(ddt->ddt_spa) &&
1881 	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
1882 		avl_insert(&ddt->ddt_repair_tree, dde, where);
1883 	else
1884 		ddt_free(ddt, dde);
1885 
1886 	ddt_exit(ddt);
1887 }
1888 
1889 static void
1890 ddt_repair_entry_done(zio_t *zio)
1891 {
1892 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
1893 	ddt_entry_t *rdde = zio->io_private;
1894 
1895 	ddt_free(ddt, rdde);
1896 }
1897 
1898 static void
1899 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
1900 {
1901 	ddt_key_t *ddk = &dde->dde_key;
1902 	ddt_key_t *rddk = &rdde->dde_key;
1903 	zio_t *zio;
1904 	blkptr_t blk;
1905 
1906 	zio = zio_null(rio, rio->io_spa, NULL,
1907 	    ddt_repair_entry_done, rdde, rio->io_flags);
1908 
1909 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
1910 		ddt_univ_phys_t *ddp = dde->dde_phys;
1911 		ddt_univ_phys_t *rddp = rdde->dde_phys;
1912 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
1913 		uint64_t phys_birth = ddt_phys_birth(ddp, v);
1914 		const dva_t *dvas, *rdvas;
1915 
1916 		if (ddt->ddt_flags & DDT_FLAG_FLAT) {
1917 			dvas = ddp->ddp_flat.ddp_dva;
1918 			rdvas = rddp->ddp_flat.ddp_dva;
1919 		} else {
1920 			dvas = ddp->ddp_trad[p].ddp_dva;
1921 			rdvas = rddp->ddp_trad[p].ddp_dva;
1922 		}
1923 
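		/*
		 * Only rewrite a phys variant whose birth txg and DVAs
		 * exactly match the repair entry's; anything else refers
		 * to different on-disk data and must be left alone.
		 */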
1924 		if (phys_birth == 0 ||
1925 		    phys_birth != ddt_phys_birth(rddp, v) ||
1926 		    memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
1927 			continue;
1928 
1929 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
1930 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
1931 		    rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
1932 		    NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
1933 		    ZIO_DDT_CHILD_FLAGS(zio), NULL));
1934 	}
1935 
1936 	zio_nowait(zio);
1937 }
1938 
1939 static void
1940 ddt_repair_table(ddt_t *ddt, zio_t *rio)
1941 {
1942 	spa_t *spa = ddt->ddt_spa;
1943 	ddt_entry_t *dde, *rdde_next, *rdde;
1944 	avl_tree_t *t = &ddt->ddt_repair_tree;
1945 	blkptr_t blk;
1946 
1947 	if (spa_sync_pass(spa) > 1)
1948 		return;
1949 
1950 	ddt_enter(ddt);
1951 	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
1952 		rdde_next = AVL_NEXT(t, rdde);
1953 		avl_remove(&ddt->ddt_repair_tree, rdde);
1954 		ddt_exit(ddt);
1955 		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
1956 		    DDT_PHYS_NONE, &blk);
1957 		dde = ddt_repair_start(ddt, &blk);
1958 		ddt_repair_entry(ddt, dde, rdde, rio);
1959 		ddt_repair_done(ddt, dde);
1960 		ddt_enter(ddt);
1961 	}
1962 	ddt_exit(ddt);
1963 }
1964 
1965 static void
1966 ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
1967 {
1968 	/*
1969 	 * Count all the entries stored for each type/class, and update the
1970 	 * stats within (ddt_object_sync()). If there are no entries for the
1971 	 * type/class, the whole object is removed. If all objects for the DDT
1972 	 * are removed, its containing dir is removed, effectively resetting
1973 	 * the entire DDT to an empty slate.
1974 	 */
1975 	uint64_t count = 0;
1976 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1977 		uint64_t add, tcount = 0;
1978 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1979 			if (ddt_object_exists(ddt, type, class)) {
1980 				ddt_object_sync(ddt, type, class, tx);
1981 				VERIFY0(ddt_object_count(ddt, type, class,
1982 				    &add));
1983 				tcount += add;
1984 			}
1985 		}
1986 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1987 			if (tcount == 0 && ddt_object_exists(ddt, type, class))
1988 				ddt_object_destroy(ddt, type, class, tx);
1989 		}
1990 		count += tcount;
1991 	}
1992 
1993 	if (ddt->ddt_flags & DDT_FLAG_LOG) {
1994 		/* Include logged entries in the total count */
1995 		count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
1996 		count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
1997 	}
1998 
1999 	if (count == 0) {
2000 		/*
2001 		 * No entries left on the DDT, so reset the version for next
2002 		 * time. This allows us to handle the feature being changed
2003 		 * since the DDT was originally created. New entries should get
2004 		 * whatever the feature currently demands.
2005 		 */
2006 		if (ddt->ddt_version == DDT_VERSION_FDT)
2007 			ddt_destroy_dir(ddt, tx);
2008 
2009 		ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
2010 		ddt->ddt_flags = 0;
2011 	}
2012 
2013 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2014 	    sizeof (ddt->ddt_histogram));
2015 	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2016 	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2017 }
2018 
2019 static void
2020 ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
2021 {
2022 	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
2023 
2024 	/*
2025 	 * Compute the target class, so we can decide whether or not to inform
2026 	 * the scrub traversal (below). Note that we don't store this in the
2027 	 * entry, as it might change multiple times before finally being
2028 	 * committed (if we're logging). Instead, we recompute it in
2029 	 * ddt_sync_entry().
2030 	 */
2031 	uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
2032 	ddt_class_t nclass =
2033 	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2034 
2035 	/*
2036 	 * If the class changes, the order that we scan this bp changes. If it
2037 	 * decreases, we could miss it, so scan it right now. (This covers both
2038 	 * class changing while we are doing ddt_walk(), and when we are
2039 	 * traversing.)
2040 	 *
2041 	 * We also do this when the refcnt goes to zero, because that change is
2042 	 * only in the log so far; the blocks on disk won't be freed until
2043 	 * the log is flushed, and the refcnt might increase before that. If it
2044 	 * does, then we could miss it in the same way.
2045 	 */
2046 	if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
2047 		dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
2048 		    ddlwe, tx);
2049 }
2050 
2051 static void
2052 ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
2053     ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
2054 {
2055 	ddt_key_t *ddk = &ddlwe->ddlwe_key;
2056 	ddt_type_t ntype = DDT_TYPE_DEFAULT;
2057 	uint64_t refcnt = 0;
2058 
2059 	/*
2060 	 * Compute the total refcnt. Along the way, issue frees for any DVAs
2061 	 * we no longer want.
2062 	 */
2063 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
2064 		ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
2065 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2066 		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
2067 
2068 		if (ddt_phys_birth(ddp, v) == 0) {
2069 			ASSERT0(phys_refcnt);
2070 			continue;
2071 		}
2072 		if (DDT_PHYS_IS_DITTO(ddt, p)) {
2073 			/*
2074 			 * We don't want to keep any obsolete slots (e.g. ditto),
2075 			 * regardless of their refcount, but we don't want to
2076 			 * leak them either. So, free them.
2077 			 */
2078 			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2079 			continue;
2080 		}
2081 		if (phys_refcnt == 0)
2082 			/* No remaining references, free it! */
2083 			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2084 		refcnt += phys_refcnt;
2085 	}
2086 
2087 	/* Select the best class for the entry. */
2088 	ddt_class_t nclass =
2089 	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2090 
2091 	/*
2092 	 * If an existing entry changed type or class, or its refcount reached
2093 	 * zero, delete it from the DDT object
2094 	 */
2095 	if (otype != DDT_TYPES &&
2096 	    (otype != ntype || oclass != nclass || refcnt == 0)) {
2097 		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
2098 		ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
2099 	}
2100 
2101 	/*
2102 	 * Add or update the entry
2103 	 */
2104 	if (refcnt != 0) {
2105 		ddt_histogram_t *ddh =
2106 		    &ddt->ddt_histogram[ntype][nclass];
2107 
2108 		ddt_histogram_add_entry(ddt, ddh, ddlwe);
2109 
2110 		if (!ddt_object_exists(ddt, ntype, nclass))
2111 			ddt_object_create(ddt, ntype, nclass, tx);
2112 		VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
2113 	}
2114 }
2115 
2116 /* Calculate an exponential weighted moving average, lower limited to zero */
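/*
 * For example (values are illustrative only): with prev = 100, val = 140
 * and weight = 10, the average moves a tenth of the way toward the new
 * sample: 100 + (140 - 100) / 10 = 104. A weight of 0 or 1 tracks the
 * input exactly; larger weights respond more slowly.
 */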
2117 static inline int32_t
2118 _ewma(int32_t val, int32_t prev, uint32_t weight)
2119 {
2120 	ASSERT3U(val, >=, 0);
2121 	ASSERT3U(prev, >=, 0);
2122 	const int32_t new =
2123 	    MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
2124 	ASSERT3U(new, >=, 0);
2125 	return (new);
2126 }
2127 
2128 static inline void
2129 ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
2130 {
2131 	/*
2132 	 * If we're not forcing flush, and not being asked to start, then
2133 	 * there's nothing more to do.
2134 	 */
2135 	if (txg == 0) {
2136 		/* Update requested, are we currently forcing flush? */
2137 		if (ddt->ddt_flush_force_txg == 0)
2138 			return;
2139 		txg = ddt->ddt_flush_force_txg;
2140 	}
2141 
2142 	/*
2143 	 * If either of the logs has unflushed entries before the wanted txg,
2144 	 * set the force txg; otherwise clear it.
2145 	 */
2146 
2147 	if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
2148 	    ddt->ddt_log_active->ddl_first_txg <= txg) ||
2149 	    (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2150 	    ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
2151 		ddt->ddt_flush_force_txg = txg;
2152 		return;
2153 	}
2154 
2155 	/*
2156 	 * Nothing to flush behind the given txg, so we can clear force flush
2157 	 * state.
2158 	 */
2159 	ddt->ddt_flush_force_txg = 0;
2160 }
2161 
2162 static void
2163 ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
2164 {
2165 	spa_t *spa = ddt->ddt_spa;
2166 	ASSERT(avl_is_empty(&ddt->ddt_tree));
2167 
2168 	/*
2169 	 * Don't do any flushing when the pool is ready to shut down, or in
2170 	 * passes beyond the first.
2171 	 */
2172 	if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa))
2173 		return;
2174 
2175 	hrtime_t flush_start = gethrtime();
2176 	uint32_t count = 0;
2177 
2178 	/*
2179 	 * How many entries we need to flush. We need to at
2180 	 * least match the ingest rate, and also consider the
2181 	 * current backlog of entries.
2182 	 */
2183 	uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) +
2184 	    avl_numnodes(&ddt->ddt_log_active->ddl_tree);
2185 
2186 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
2187 		goto housekeeping;
2188 
2189 	uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs);
2190 	uint64_t cap = MAX(1, zfs_dedup_log_cap);
2191 	uint64_t flush_min = MAX(backlog / txgs,
2192 	    zfs_dedup_log_flush_entries_min);
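	/*
	 * For example (illustrative numbers): a backlog of 50000 entries
	 * with zfs_dedup_log_flush_txgs at 100 asks for at least 500
	 * entries per txg, unless zfs_dedup_log_flush_entries_min is
	 * larger.
	 */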
2193 
2194 	/*
2195 	 * The theory for this block is that if we increase the pressure while
2196 	 * we're growing above the cap, and remove it when we're significantly
2197 	 * below the cap, we'll stay near cap while not bouncing around too
2198 	 * much.
2199 	 *
2200 	 * The factor of 10 is to smooth the pressure effect by expressing it
2201 	 * in tenths. The addition of the cap to the backlog in the second
2202 	 * block is to round up, instead of down. We never let the pressure go
2203 	 * below 1 (10 tenths).
2204 	 */
2205 	if (cap != UINT_MAX && backlog > cap &&
2206 	    backlog > ddt->ddt_log_flush_prev_backlog) {
2207 		ddt->ddt_log_flush_pressure += 10 * backlog / cap;
2208 	} else if (cap != UINT_MAX && backlog < cap) {
2209 		ddt->ddt_log_flush_pressure -=
2210 		    11 - (((10 * backlog) + cap - 1) / cap);
2211 		ddt->ddt_log_flush_pressure =
2212 		    MAX(ddt->ddt_log_flush_pressure, 10);
2213 	}
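	/*
	 * Worked example (illustrative numbers): with cap = 1000 and the
	 * backlog growing through 1500, pressure rises by
	 * 10 * 1500 / 1000 = 15 tenths. Once the backlog falls to 400 it
	 * drops by 11 - (4999 / 1000) = 7 tenths per txg, but by only 1
	 * tenth at 999, easing off as the backlog approaches the cap from
	 * below.
	 */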
2214 
2215 	if (zfs_dedup_log_hard_cap && cap != UINT_MAX)
2216 		flush_min = MAX(flush_min, MIN(backlog - cap,
2217 		    (flush_min * ddt->ddt_log_flush_pressure) / 10));
2218 
2219 	uint64_t flush_max;
2220 
2221 	/*
2222 	 * If we've been asked to flush everything in a hurry,
2223 	 * try to dump as much as possible on this txg. In
2224 	 * this case we're only limited by time, not amount.
2225 	 *
2226 	 * Otherwise, if we are over the cap, try to get back down to it.
2227 	 *
2228 	 * Finally if there is no cap (or no pressure), just set the max a
2229 	 * little higher than the min to help smooth out variations in flush
2230 	 * times.
2231 	 */
2232 	if (ddt->ddt_flush_force_txg > 0)
2233 		flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
2234 	else if (cap != UINT_MAX && !zfs_dedup_log_hard_cap)
2235 		flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap,
2236 		    (flush_min * ddt->ddt_log_flush_pressure) / 10));
2237 	else
2238 		flush_max = flush_min * 5 / 4;
2239 	flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max);
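	/*
	 * For example (illustrative): in the uncapped case a flush_min of
	 * 500 yields a flush_max of 625, 25% of headroom before
	 * zfs_dedup_log_flush_entries_max clamps it.
	 */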
2240 
2241 	/*
2242 	 * When the pool is busy or someone is explicitly waiting for this txg
2243 	 * to complete, use the zfs_dedup_log_flush_min_time_ms.  Otherwise use
2244 	 * half of the time in the txg timeout.
2245 	 */
2246 	uint64_t target_time;
2247 
2248 	if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ||
2249 	    vdev_queue_pool_busy(spa)) {
2250 		target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
2251 		    SEC2NSEC(zfs_txg_timeout) / 2);
2252 	} else {
2253 		target_time = SEC2NSEC(zfs_txg_timeout) / 2;
2254 	}
2255 
2256 	ddt_lightweight_entry_t ddlwe;
2257 	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
2258 		ddt_sync_flush_entry(ddt, &ddlwe,
2259 		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
2260 
2261 		/* End if we've synced as much as we needed to. */
2262 		if (++count >= flush_max)
2263 			break;
2264 
2265 		/*
2266 		 * As long as we've flushed the absolute minimum,
2267 		 * stop if we're way over our target time.
2268 		 */
2269 		uint64_t diff = gethrtime() - flush_start;
2270 		if (count > zfs_dedup_log_flush_entries_min &&
2271 		    diff >= target_time * 2)
2272 			break;
2273 
2274 		/*
2275 		 * End if we've passed the minimum flush and we're out of time.
2276 		 */
2277 		if (count > flush_min && diff >= target_time)
2278 			break;
2279 	}
2280 
2281 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
2282 		/* We emptied it, so truncate on-disk */
2283 		DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
2284 		ddt_log_truncate(ddt, tx);
2285 	} else {
2286 		/* More to do next time, save checkpoint */
2287 		DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
2288 		ddt_log_checkpoint(ddt, &ddlwe, tx);
2289 	}
2290 
2291 	ddt_sync_update_stats(ddt, tx);
2292 
2293 housekeeping:
2294 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2295 	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
2296 		/*
2297 		 * No more to flush, and the active list has stuff, so
2298 		 * try to swap the logs for next time.
2299 		 */
2300 		if (ddt_log_swap(ddt, tx)) {
2301 			DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
2302 			DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
2303 			    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
2304 		}
2305 	}
2306 
2307 	/* If force flush is no longer necessary, turn it off. */
2308 	ddt_flush_force_update_txg(ddt, 0);
2309 
2310 	ddt->ddt_log_flush_prev_backlog = backlog;
2311 
2312 	/*
2313 	 * Update flush rate. This is an exponential weighted moving
2314 	 * average of the number of entries flushed over recent txgs.
2315 	 */
2316 	ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate,
2317 	    zfs_dedup_log_flush_flow_rate_txgs);
2318 	DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);
2319 
2320 	/*
2321 	 * Update flush time rate. This is an exponential weighted moving
2322 	 * average of the total time taken to flush over recent txgs.
2323 	 */
2324 	ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate,
2325 	    (int32_t)NSEC2MSEC(gethrtime() - flush_start),
2326 	    zfs_dedup_log_flush_flow_rate_txgs);
2327 	DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
2328 	    ddt->ddt_log_flush_time_rate);
2329 	if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 &&
2330 	    zfs_flags & ZFS_DEBUG_DDT) {
2331 		zfs_dbgmsg("%lu entries remain (%lu in active), flushed %u @ "
2332 		    "txg %llu, in %llu ms, flush rate %d, time rate %d",
2333 		    (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree),
2334 		    (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree),
2335 		    count, (u_longlong_t)tx->tx_txg,
2336 		    (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start),
2337 		    ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate);
2338 	}
2339 }
2340 
2341 static void
2342 ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
2343 {
2344 	uint64_t count = avl_numnodes(&ddt->ddt_tree);
2345 
2346 	if (count > 0) {
2347 		ddt_log_update_t dlu = {0};
2348 		ddt_log_begin(ddt, count, tx, &dlu);
2349 
2350 		ddt_entry_t *dde;
2351 		void *cookie = NULL;
2352 		ddt_lightweight_entry_t ddlwe;
2353 		while ((dde =
2354 		    avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
2355 			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2356 			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2357 			ddt_log_entry(ddt, &ddlwe, &dlu);
2358 			ddt_sync_scan_entry(ddt, &ddlwe, tx);
2359 			ddt_free(ddt, dde);
2360 		}
2361 
2362 		ddt_log_commit(ddt, &dlu);
2363 
2364 		DDT_KSTAT_SET(ddt, dds_log_active_entries,
2365 		    avl_numnodes(&ddt->ddt_log_active->ddl_tree));
2366 
2367 		/*
2368 		 * Sync the stats for the store objects. Even though we haven't
2369 		 * modified anything on those objects, they're no longer the
2370 		 * source of truth for entries that are now in the log, and we
2371 		 * need the on-disk counts to reflect that, otherwise we'll
2372 		 * miscount later when importing.
2373 		 */
2374 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
2375 			for (ddt_class_t class = 0;
2376 			    class < DDT_CLASSES; class++) {
2377 				if (ddt_object_exists(ddt, type, class))
2378 					ddt_object_sync(ddt, type, class, tx);
2379 			}
2380 		}
2381 
2382 		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2383 		    sizeof (ddt->ddt_histogram));
2384 		ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2385 		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2386 	}
2387 
2388 	if (spa_sync_pass(ddt->ddt_spa) == 1) {
2389 		/*
2390 		 * Update ingest rate. This is an exponential weighted moving
2391 		 * average of the number of entries changed over recent txgs.
2392 		 * The ramp-up cost shouldn't matter too much because the
2393 		 * flusher will be trying to take at least the minimum anyway.
2394 		 */
2395 		ddt->ddt_log_ingest_rate = _ewma(
2396 		    count, ddt->ddt_log_ingest_rate,
2397 		    zfs_dedup_log_flush_flow_rate_txgs);
2398 		DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
2399 		    ddt->ddt_log_ingest_rate);
2400 	}
2401 }
2402 
2403 static void
2404 ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
2405 {
2406 	if (avl_numnodes(&ddt->ddt_tree) == 0)
2407 		return;
2408 
2409 	ddt_entry_t *dde;
2410 	void *cookie = NULL;
2411 	while ((dde = avl_destroy_nodes(
2412 	    &ddt->ddt_tree, &cookie)) != NULL) {
2413 		ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2414 
2415 		ddt_lightweight_entry_t ddlwe;
2416 		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2417 		ddt_sync_flush_entry(ddt, &ddlwe,
2418 		    dde->dde_type, dde->dde_class, tx);
2419 		ddt_sync_scan_entry(ddt, &ddlwe, tx);
2420 		ddt_free(ddt, dde);
2421 	}
2422 
2423 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2424 	    sizeof (ddt->ddt_histogram));
2425 	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2426 	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2427 	ddt_sync_update_stats(ddt, tx);
2428 }
2429 
2430 static void
2431 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
2432 {
2433 	spa_t *spa = ddt->ddt_spa;
2434 
2435 	if (ddt->ddt_version == UINT64_MAX)
2436 		return;
2437 
2438 	if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
2439 		ASSERT0(avl_numnodes(&ddt->ddt_tree));
2440 		return;
2441 	}
2442 
2443 	if (spa->spa_ddt_stat_object == 0) {
2444 		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
2445 		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
2446 		    DMU_POOL_DDT_STATS, tx);
2447 	}
2448 
2449 	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
2450 		ddt_create_dir(ddt, tx);
2451 
2452 	if (ddt->ddt_flags & DDT_FLAG_LOG)
2453 		ddt_sync_table_log(ddt, tx);
2454 	else
2455 		ddt_sync_table_flush(ddt, tx);
2456 }
2457 
2458 void
2459 ddt_sync(spa_t *spa, uint64_t txg)
2460 {
2461 	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
2462 	dmu_tx_t *tx;
2463 	zio_t *rio;
2464 
2465 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
2466 
2467 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2468 
2469 	rio = zio_root(spa, NULL, NULL,
2470 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
2471 
2472 	/*
2473 	 * This function may cause an immediate scan of ddt blocks (see
2474 	 * the comment above dsl_scan_ddt() for details). We set the
2475 	 * scan's root zio here so that we can wait for any scan IOs in
2476 	 * addition to the regular ddt IOs.
2477 	 */
2478 	ASSERT0P(scn->scn_zio_root);
2479 	scn->scn_zio_root = rio;
2480 
2481 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2482 		ddt_t *ddt = spa->spa_ddt[c];
2483 		if (ddt == NULL)
2484 			continue;
2485 		ddt_sync_table(ddt, tx);
2486 		if (ddt->ddt_flags & DDT_FLAG_LOG)
2487 			ddt_sync_flush_log(ddt, tx);
2488 		ddt_repair_table(ddt, rio);
2489 	}
2490 
2491 	(void) zio_wait(rio);
2492 	scn->scn_zio_root = NULL;
2493 
2494 	dmu_tx_commit(tx);
2495 }
2496 
2497 void
2498 ddt_walk_init(spa_t *spa, uint64_t txg)
2499 {
2500 	if (txg == 0)
2501 		txg = spa_syncing_txg(spa);
2502 
2503 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2504 		ddt_t *ddt = spa->spa_ddt[c];
2505 		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2506 			continue;
2507 
2508 		ddt_enter(ddt);
2509 		ddt_flush_force_update_txg(ddt, txg);
2510 		ddt_exit(ddt);
2511 	}
2512 }
2513 
2514 boolean_t
2515 ddt_walk_ready(spa_t *spa)
2516 {
2517 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2518 		ddt_t *ddt = spa->spa_ddt[c];
2519 		if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2520 			continue;
2521 
2522 		if (ddt->ddt_flush_force_txg > 0)
2523 			return (B_FALSE);
2524 	}
2525 
2526 	return (B_TRUE);
2527 }
2528 
2529 static int
2530 ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
2531     uint64_t flags, boolean_t wait)
2532 {
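	/*
	 * Walk in bookmark order: the checksum advances in the innermost
	 * loop, then the type, then the class, so every entry of one class
	 * is visited before the next class begins. ddb_cursor tracks the
	 * position within a single store object and is reset each time we
	 * move past one.
	 */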
2533 	do {
2534 		do {
2535 			do {
2536 				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
2537 				if (ddt == NULL)
2538 					continue;
2539 
2540 				if (flags != 0 &&
2541 				    (ddt->ddt_flags & flags) != flags)
2542 					continue;
2543 
2544 				if (wait && ddt->ddt_flush_force_txg > 0)
2545 					return (EAGAIN);
2546 
2547 				int error = ENOENT;
2548 				if (ddt_object_exists(ddt, ddb->ddb_type,
2549 				    ddb->ddb_class)) {
2550 					error = ddt_object_walk(ddt,
2551 					    ddb->ddb_type, ddb->ddb_class,
2552 					    &ddb->ddb_cursor, ddlwe);
2553 				}
2554 				if (error == 0)
2555 					return (0);
2556 				if (error != ENOENT)
2557 					return (error);
2558 				ddb->ddb_cursor = 0;
2559 			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
2560 			ddb->ddb_checksum = 0;
2561 		} while (++ddb->ddb_type < DDT_TYPES);
2562 		ddb->ddb_type = 0;
2563 	} while (++ddb->ddb_class < DDT_CLASSES);
2564 
2565 	return (SET_ERROR(ENOENT));
2566 }
2567 
2568 int
2569 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
2570 {
2571 	return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE));
2572 }
2573 
2574 /*
2575  * This function is used by Block Cloning (brt.c) to increase the reference
2576  * counter for the DDT entry if the block is already in DDT.
2577  *
2578  * Return false if the block, despite having the D bit set, is not present
2579  * in the DDT. This is possible when the DDT has been pruned by an admin
2580  * or by the DDT quota mechanism.
2581  */
2582 boolean_t
2583 ddt_addref(spa_t *spa, const blkptr_t *bp)
2584 {
2585 	ddt_t *ddt;
2586 	ddt_entry_t *dde;
2587 	boolean_t result;
2588 
2589 	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2590 	ddt = ddt_select(spa, bp);
2591 	ddt_enter(ddt);
2592 
2593 	dde = ddt_lookup(ddt, bp, B_TRUE);
2594 
2595 	/* Can be NULL if the entry for this block was pruned. */
2596 	if (dde == NULL) {
2597 		ddt_exit(ddt);
2598 		spa_config_exit(spa, SCL_ZIO, FTAG);
2599 		return (B_FALSE);
2600 	}
2601 
2602 	if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
2603 		/*
2604 		 * This entry was either synced to a store object (dde_type is
2605 		 * real) or was logged. It must be properly on disk at this
2606 		 * point, so we can just bump its refcount.
2607 		 */
2608 		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
2609 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2610 
2611 		ddt_phys_addref(dde->dde_phys, v);
2612 		result = B_TRUE;
2613 	} else {
2614 		/*
2615 		 * If the block has the DEDUP flag set, it still might not
2616 		 * exist in the DEDUP table due to DDT pruning of entries
2617 		 * where refcnt=1.
2618 		 */
2619 		ddt_remove(ddt, dde);
2620 		result = B_FALSE;
2621 	}
2622 
2623 	ddt_exit(ddt);
2624 	spa_config_exit(spa, SCL_ZIO, FTAG);
2625 
2626 	return (result);
2627 }
2628 
2629 typedef struct ddt_prune_entry {
2630 	ddt_t		*dpe_ddt;
2631 	ddt_key_t	dpe_key;
2632 	list_node_t	dpe_node;
2633 	ddt_univ_phys_t	dpe_phys[];
2634 } ddt_prune_entry_t;
2635 
2636 typedef struct ddt_prune_info {
2637 	spa_t		*dpi_spa;
2638 	uint64_t	dpi_txg_syncs;
2639 	uint64_t	dpi_pruned;
2640 	list_t		dpi_candidates;
2641 } ddt_prune_info_t;
2642 
2643 /*
2644  * Add prune candidates for ddt_sync during spa_sync
2645  */
2646 static void
2647 prune_candidates_sync(void *arg, dmu_tx_t *tx)
2648 {
2649 	(void) tx;
2650 	ddt_prune_info_t *dpi = arg;
2651 	ddt_prune_entry_t *dpe;
2652 
2653 	spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);
2654 
2655 	/* Process the prune candidates collected so far */
2656 	while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
2657 		blkptr_t blk;
2658 		ddt_t *ddt = dpe->dpe_ddt;
2659 
2660 		ddt_enter(ddt);
2661 
2662 		/*
2663 		 * If it's on the live list, then it was loaded for update
2664 		 * this txg and is no longer stale; skip it.
2665 		 */
2666 		if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
2667 			ddt_exit(ddt);
2668 			kmem_free(dpe, sizeof (*dpe));
2669 			continue;
2670 		}
2671 
2672 		ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
2673 		    dpe->dpe_phys, DDT_PHYS_FLAT, &blk);
2674 
2675 		ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE);
2676 		if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
2677 			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2678 			/*
2679 			 * Zero the physical, so we don't try to free DVAs
2680 			 * at flush nor try to reuse this entry.
2681 			 */
2682 			ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);
2683 
2684 			dpi->dpi_pruned++;
2685 		}
2686 
2687 		ddt_exit(ddt);
2688 		kmem_free(dpe, sizeof (*dpe));
2689 	}
2690 
2691 	spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
2692 	dpi->dpi_txg_syncs++;
2693 }
2694 
2695 /*
2696  * Prune candidates are collected in open context and processed
2697  * in sync context as part of ddt_sync_table().
2698  */
2699 static void
2700 ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
2701     const ddt_univ_phys_t *ddp)
2702 {
2703 	ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2704 
2705 	size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
2706 	ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP);
2707 
2708 	dpe->dpe_ddt = ddt;
2709 	dpe->dpe_key = *ddk;
2710 	memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
2711 	list_insert_head(list, dpe);
2712 }
2713 
2714 /*
2715  * Iterate over all the entries in the DDT unique class.
2716  * The walk will perform one of the following operations:
2717  *  (a) build a histogram that can be used when pruning
2718  *  (b) prune entries older than the cutoff
2719  *
2720  *  Also called by zdb(8) to dump the age histogram
2721  */
2722 void
2723 ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
2724 {
2725 	ddt_bookmark_t ddb = {
2726 		.ddb_class = DDT_CLASS_UNIQUE,
2727 		.ddb_type = 0,
2728 		.ddb_checksum = 0,
2729 		.ddb_cursor = 0
2730 	};
2731 	ddt_lightweight_entry_t ddlwe = {0};
2732 	int error;
2733 	int valid = 0;
2734 	int candidates = 0;
2735 	uint64_t now = gethrestime_sec();
2736 	ddt_prune_info_t dpi;
2737 	boolean_t pruning = (cutoff != 0);
2738 
2739 	if (pruning) {
2740 		dpi.dpi_txg_syncs = 0;
2741 		dpi.dpi_pruned = 0;
2742 		dpi.dpi_spa = spa;
2743 		list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
2744 		    offsetof(ddt_prune_entry_t, dpe_node));
2745 	}
2746 
2747 	if (histogram != NULL)
2748 		memset(histogram, 0, sizeof (ddt_age_histo_t));
2749 
2750 	while ((error =
2751 	    ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
2752 		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
2753 		VERIFY(ddt);
2754 
2755 		if (spa_shutting_down(spa) || issig())
2756 			break;
2757 
2758 		ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2759 		ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);
2760 
2761 		uint64_t class_start =
2762 		    ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;
2763 
2764 		/*
2765 		 * If this entry is on the log, then the stored entry is stale
2766 		 * and we should skip it.
2767 		 */
2768 		if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
2769 			continue;
2770 
2771 		/* prune older entries */
2772 		if (pruning && class_start < cutoff) {
2773 			if (candidates++ >= zfs_ddt_prunes_per_txg) {
2774 				/* sync prune candidates in batches */
2775 				VERIFY0(dsl_sync_task(spa_name(spa),
2776 				    NULL, prune_candidates_sync,
2777 				    &dpi, 0, ZFS_SPACE_CHECK_NONE));
2778 				candidates = 1;
2779 			}
2780 			ddt_prune_entry(&dpi.dpi_candidates, ddt,
2781 			    &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
2782 		}
2783 
2784 		/* build a histogram */
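		/*
		 * Bins are power-of-two hours; e.g. an entry written ten
		 * hours ago has age = 10 and highbit64(10) = 4, so it
		 * lands in bin 3, while anything under two hours old
		 * lands in bin 0.
		 */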
2785 		if (histogram != NULL) {
2786 			uint64_t age = MAX(1, (now - class_start) / 3600);
2787 			int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
2788 			histogram->dah_entries++;
2789 			histogram->dah_age_histo[bin]++;
2790 		}
2791 
2792 		valid++;
2793 	}
2794 
2795 	if (pruning && valid > 0) {
2796 		if (!list_is_empty(&dpi.dpi_candidates)) {
2797 			/* sync out final batch of prune candidates */
2798 			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
2799 			    prune_candidates_sync, &dpi, 0,
2800 			    ZFS_SPACE_CHECK_NONE));
2801 		}
2802 		list_destroy(&dpi.dpi_candidates);
2803 
2804 		zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs",
2805 		    (u_longlong_t)dpi.dpi_pruned,
2806 		    (int)((dpi.dpi_pruned * 100) / valid),
2807 		    (u_longlong_t)dpi.dpi_txg_syncs);
2808 	}
2809 }
2810 
2811 static uint64_t
2812 ddt_total_entries(spa_t *spa)
2813 {
2814 	ddt_object_t ddo;
2815 	ddt_get_dedup_object_stats(spa, &ddo);
2816 
2817 	return (ddo.ddo_count);
2818 }
2819 
2820 int
2821 ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
2822     uint64_t amount)
2823 {
2824 	uint64_t cutoff;
2825 	uint64_t start_time = gethrtime();
2826 
2827 	if (spa->spa_active_ddt_prune)
2828 		return (SET_ERROR(EALREADY));
2829 	if (ddt_total_entries(spa) == 0)
2830 		return (0);
2831 
2832 	spa->spa_active_ddt_prune = B_TRUE;
2833 
2834 	zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
2835 	    unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");
2836 
2837 	if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
2838 		ddt_age_histo_t histogram;
2839 		uint64_t oldest = 0;
2840 
2841 		/* Make a pass over DDT to build a histogram */
2842 		ddt_prune_walk(spa, 0, &histogram);
2843 
2844 		int target = (histogram.dah_entries * amount) / 100;
2845 
2846 		/*
2847 		 * Figure out our cutoff date
2848 		 * (i.e., which bins to prune from)
2849 		 */
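		/*
		 * Walking from the oldest bin down, whole bins are
		 * consumed until less than a bin's worth of the target
		 * remains; the cutoff then falls on that bin's age
		 * boundary, (1 << i) hours, so pruning happens at bin
		 * granularity rather than exact entry counts.
		 */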
2850 		for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
2851 			if (histogram.dah_age_histo[i] != 0) {
2852 				/* less than this bucket remaining */
2853 				if (target < histogram.dah_age_histo[i]) {
2854 					oldest = MAX(1, (1<<i) * 3600);
2855 					target = 0;
2856 				} else {
2857 					target -= histogram.dah_age_histo[i];
2858 				}
2859 			}
2860 		}
2861 		cutoff = gethrestime_sec() - oldest;
2862 
2863 		if (ddt_dump_prune_histogram)
2864 			ddt_dump_age_histogram(&histogram, cutoff);
2865 	} else if (unit == ZPOOL_DDT_PRUNE_AGE) {
2866 		cutoff = gethrestime_sec() - amount;
2867 	} else {
		spa->spa_active_ddt_prune = B_FALSE;
2868 		return (SET_ERROR(EINVAL));
2869 	}
2870 
2871 	if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
2872 		/* Traverse DDT to prune entries older than our cutoff */
2873 		ddt_prune_walk(spa, cutoff, NULL);
2874 	}
2875 
2876 	zfs_dbgmsg("%s: prune completed in %llu ms",
2877 	    spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
2878 
2879 	spa->spa_active_ddt_prune = B_FALSE;
2880 	return (0);
2881 }
2882 
2883 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
2884 	"Enable prefetching of dedup-ed blocks");
2885 
2886 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
2887 	"Min time to spend on incremental dedup log flush each transaction");
2888 
2889 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
2890 	"Min number of log entries to flush each transaction");
2891 
2892 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW,
2893 	"Max number of log entries to flush each transaction");
2894 
2895 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW,
2896 	"Number of TXGs to try to rotate the log in");
2897 
2898 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW,
2899 	"Soft cap for the size of the current dedup log");
2900 
2901 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW,
2902 	"Whether to use the soft cap as a hard cap");
2903 
2904 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
2905 	"Number of txgs to average flow rates across");
2906