1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
26 * Copyright (c) 2022 by Pawel Jakub Dawidek
27 * Copyright (c) 2019, 2023, Klara Inc.
28 */
29
30 #include <sys/zfs_context.h>
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/zio.h>
34 #include <sys/ddt.h>
35 #include <sys/ddt_impl.h>
36 #include <sys/zap.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/arc.h>
39 #include <sys/dsl_pool.h>
40 #include <sys/zio_checksum.h>
41 #include <sys/dsl_scan.h>
42 #include <sys/abd.h>
43 #include <sys/zfeature.h>
44
45 /*
46 * # DDT: Deduplication tables
47 *
48 * The dedup subsystem provides block-level deduplication. When enabled, blocks
49 * to be written will have the dedup (D) bit set, which causes them to be
50 * tracked in a "dedup table", or DDT. If a block has been seen before (exists
51 * in the DDT), then instead of being written, it will be made to reference
52 * the existing on-disk data, and its refcount in the DDT bumped.
53 *
54 * ## Dedup tables and entries
55 *
56 * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
57 * (ddt_key_t) made up of a block's checksum and certain properties, and a "value"
58 * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
59 * time and refcount. Together these are enough to track references to a
60 * specific block, to build a valid block pointer to reference that block (for
61 * freeing, scrubbing, etc), and to fill a new block pointer with the missing
62 * pieces to make it seem like it was written.
63 *
64 * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
65 * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
66 * object data formats, each with their own implementations) and "classes"
67 * (ddt_class_t, an instance of a storage type object, for entries with a specific
68 * characteristic). An entry (key) will only ever exist on one of these objects
69 * at any given time, but may be moved from one to another if their type or
70 * class changes.
71 *
72 * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
73 * is to be written, before DVAs have been allocated, ddt_lookup() is called to
74 * see if the block has been seen before. If it's not found, the write proceeds
75 * as normal, and after it succeeds, a new entry is created. If it is found, we
76 * fill the BP with the DVAs from the entry, increment the refcount and cause
77 * the write IO to return immediately.
78 *
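 * A condensed, illustrative sketch of that flow (not the literal
 * zio_ddt_write() body; locking, logging and error handling are omitted,
 * and "v" and "txg" stand in for the selected phys slot and the open txg):
 *
 *	ddt_t *ddt = ddt_select(spa, bp);
 *	ddt_enter(ddt);
 *	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_FALSE);
 *	if (dde != NULL && ddt_phys_birth(dde->dde_phys, v) != 0) {
 *		ddt_bp_fill(dde->dde_phys, v, bp, txg);	// reuse existing DVAs
 *		ddt_phys_addref(dde->dde_phys, v);	// just bump the refcount
 *	} else {
 *		// allocate DVAs and write as normal; the entry is created or
 *		// extended once the write completes
 *	}
 *	ddt_exit(ddt);
 *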
79 * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
80 * block for the same content/checksum. The slot is selected based on the
81 * zp_copies parameter the block is written with, that is, the number of DVAs
82 * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for
83 * now-removed "dedupditto" feature. These are no longer written, and will be
84 * freed if encountered on old pools.
85 *
86 * If the "fast_dedup" feature is enabled, new dedup tables will be created
87 * with the "flat phys" option. In this mode, there is only one ddt_phys_t
88 * slot. If a write is issued for an entry that exists, but has fewer DVAs,
89 * then only as many new DVAs are allocated and written to make up the
90 * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
91 * new DVAs.
92 *
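 * For example (illustrative only, using names from this file), if a flat
 * entry currently holds two DVAs and a write arrives with zp_copies == 3,
 * only the one missing copy is allocated and written, and then:
 *
 *	int have = ddt_phys_dva_count(ddp, DDT_PHYS_FLAT, encrypted);
 *	int need = zp_copies - have;			// here: 3 - 2 = 1
 *	// ... write a block pointer carrying the "need" new DVA(s) ...
 *	ddt_phys_extend(ddp, DDT_PHYS_FLAT, bp);	// merge them into the entry
 *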
93 * ## Lifetime of an entry
94 *
95 * A DDT can be enormous, and typically is not held in memory all at once.
96 * Instead, the changes to an entry are tracked in memory, and written down to
97 * disk at the end of each txg.
98 *
99 * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
100 * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
101 * required for IO, ddt_lookup() is called. If an entry already exists on
102 * ddt_tree, it is returned. Otherwise, a new one is created, and the
103 * type/class objects for the DDT are searched for that key. If it's found, its
104 * value is copied into the live entry. If not, an empty entry is created.
105 *
106 * The live entry will be modified during the txg, usually by modifying the
107 * refcount, but sometimes by adding or updating DVAs. At the end of the txg
108 * (during spa_sync()), type and class are recalculated for the entry (see
109 * ddt_sync_entry()), and the entry is written to the appropriate storage
110 * object and (if necessary) removed from an old one. ddt_tree is cleared and
111 * the next txg can start.
112 *
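 * Schematically (simplified), one txg in the life of an entry looks like:
 *
 *	open txg:	ddt_lookup()		entry placed on ddt_tree
 *	during txg:	ddt_phys_addref()/ddt_phys_decref(), ddt_phys_extend(), ...
 *	spa_sync():	ddt_sync_entry()	entry written to (or moved between)
 *						storage objects; ddt_tree emptied
 *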
113 * ## Dedup quota
114 *
115 * A maximum size for all DDTs on the pool can be set with the
116 * dedup_table_quota property. This is determined in ddt_over_quota() and
117 * enforced during ddt_lookup(). If the pool is at or over its quota limit,
118 * ddt_lookup() will only return entries for existing blocks, as updates are
119 * still possible. New entries will not be created; instead, ddt_lookup() will
120 * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
121 * the D bit on the block and reissue the IO as a regular write. The block will
122 * not be deduplicated.
123 *
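 * Schematically, the caller's fallback looks like this (illustrative, not
 * the literal zio_ddt_write() code):
 *
 *	if ((dde = ddt_lookup(ddt, bp, B_FALSE)) == NULL) {
 *		BP_SET_DEDUP(bp, 0);	// give up on dedup for this block
 *		// ... reissue the IO as an ordinary, non-dedup write ...
 *	}
 *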
124 * Note that this is based on the on-disk size of the dedup store. Reclaiming
125 * this space after deleting entries relies on the ZAP "shrinking" behaviour,
126 * without which, no space would be recovered and the DDT would continue to be
127 * considered "over quota". See zap_shrink_enabled.
128 *
129 * ## Dedup table pruning
130 *
131 * As a complement to the dedup quota feature, ddtprune allows removal of older
132 * non-duplicate entries to make room for newer duplicate entries. The amount
133 * to prune can be based on a target percentage of the unique entries or based
134 * on the age (i.e., prune unique entries older than N days).
135 *
136 * ## Dedup log
137 *
138 * Historically, all entries modified on a txg were written back to dedup
139 * storage objects at the end of every txg. This could cause significant
140 * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
141 * and so required reading the whole node, updating the entry, and writing it
142 * back. On busy pools, this could add serious IO and memory overheads.
143 *
144 * To address this, the dedup log was added. If the "fast_dedup" feature is
145 * enabled, at the end of each txg, modified entries will be copied to an
146 * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
147 * same block is requested again, the in-memory object will be checked first,
148 * and if it's there, the entry is inflated back onto the live tree without going
149 * to storage. The on-disk log is only read at pool import time, to reload the
150 * in-memory log.
151 *
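 * The corresponding part of ddt_lookup(), greatly simplified:
 *
 *	ddt_lightweight_entry_t ddlwe;
 *	if (ddt_log_find_key(ddt, &search, &ddlwe)) {
 *		// copy ddlwe into the new live entry, then remove the key
 *		// from whichever log tree (active or flushing) held it
 *	}
 *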
152 * Each txg, some amount of the in-memory log will be flushed out to a DDT
153 * storage object (i.e. a ZAP) as normal. OpenZFS will try hard to flush enough
154 * to keep up with the rate of change on dedup entries, but not so much that it
155 * would impact overall throughput, and without using too much memory. See the
156 * zfs_dedup_log_* tunables in zfs(4) for more details.
157 *
158 * ## Repair IO
159 *
160 * If a read on a dedup block fails, but there are other copies of the block in
161 * the other ddt_phys_t slots, reads will be issued for those instead
162 * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
163 * the caller, and a copy is stashed on the entry's dde_repair_abd.
164 *
165 * During the end-of-txg sync, any entries with a dde_repair_abd get a
166 * "rewrite" write issued for the original block pointer, with the data read
167 * from the alternate block. If the block is actually damaged, this will invoke
168 * the pool's "self-healing" mechanism, and repair the block.
169 *
170 * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
171 * use, so there is only ever one ddt_phys_t slot. The repair process will
172 * still happen in this case, though it is unlikely to succeed as there will
173 * usually be no other equivalent blocks to fall back on (though there might
174 * be, if this was an early version of a dedup'd block that has since been
175 * extended).
176 *
177 * Note that this repair mechanism is in addition to and separate from the
178 * regular OpenZFS scrub and self-healing mechanisms.
179 *
180 * ## Scanning (scrub/resilver)
181 *
182 * If dedup is active, the scrub machinery will walk the dedup table first, and
183 * scrub all blocks with refcnt > 1 first. After that it will move on to the
184 * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
185 * In this way, heavily deduplicated blocks are only scrubbed once. See the
186 * commentary on dsl_scan_ddt() for more details.
187 *
188 * Walking the DDT is done via ddt_walk(). The current position is stored in a
189 * ddt_bookmark_t, which represents a stable position in the storage object.
190 * This bookmark is stored by the scan machinery, and must reference the same
191 * position on the object even if the object changes, the pool is exported, or
192 * OpenZFS is upgraded.
193 *
194 * If the "fast_dedup" feature is enabled and the table has a log, the scan
195 * cannot begin until entries on the log are flushed, as the on-disk log has no
196 * concept of a "stable position". Instead, the log flushing process will enter
197 * a more aggressive mode, to flush out as much as is necessary as soon as
198 * possible, in order to begin the scan as soon as possible.
199 *
200 * ## Interaction with block cloning
201 *
202 * If block cloning and dedup are both enabled on a pool, BRT will look for the
203 * dedup bit on an incoming block pointer. If set, it will call into the DDT
204 * (ddt_addref()) to add a reference to the block, instead of adding a
205 * reference to the BRT. See brt_pending_apply().
206 */
207
208 /*
209 * These are the only checksums valid for dedup. They must match the list
210 * from dedup_table in zfs_prop.c
211 */
212 #define DDT_CHECKSUM_VALID(c) \
213 (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
214 c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
215 c == ZIO_CHECKSUM_BLAKE3)
216
217 static kmem_cache_t *ddt_cache;
218
219 static kmem_cache_t *ddt_entry_flat_cache;
220 static kmem_cache_t *ddt_entry_trad_cache;
221
222 #define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
223 #define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
224
225 #define DDT_ENTRY_SIZE(ddt) \
226 _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
227
228 /*
229 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
230 */
231 int zfs_dedup_prefetch = 0;
232
233 /*
234 * If the dedup class cannot satisfy a DDT allocation, treat it as over quota
235 * for this many TXGs.
236 */
237 uint_t dedup_class_wait_txgs = 5;
238
239 /*
240 * How many DDT prune entries to add to the DDT sync AVL tree.
241 * Note these additional entries have a memory footprint of a
242 * ddt_entry_t (216 bytes).
243 */
244 static uint32_t zfs_ddt_prunes_per_txg = 50000;
245
246 /*
247 * For testing, synthesize aged DDT entries
248 * (in global scope for ztest)
249 */
250 boolean_t ddt_prune_artificial_age = B_FALSE;
251 boolean_t ddt_dump_prune_histogram = B_FALSE;
252
253 /*
254 * Minimum time to flush per txg.
255 */
256 uint_t zfs_dedup_log_flush_min_time_ms = 1000;
257
258 /*
259 * Minimum entries to flush per txg.
260 */
261 uint_t zfs_dedup_log_flush_entries_min = 200;
262
263 /*
264 * Target number of TXGs until the whole dedup log has been flushed.
265 * The log size will float around this value times the ingest rate.
266 */
267 uint_t zfs_dedup_log_flush_txgs = 100;
268
269 /*
270 * Maximum entries to flush per txg. Used for testing the dedup log.
271 */
272 uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
273
274 /*
275 * Soft cap for the size of the current dedup log. If the log is larger
276 * than this size, we slightly increase the aggressiveness of the flushing to
277 * try to bring it back down to the soft cap.
278 */
279 uint_t zfs_dedup_log_cap = UINT_MAX;
280
281 /*
282 * If this is set to B_TRUE, the cap above acts more like a hard cap:
283 * flushing is significantly more aggressive, increasing the minimum amount we
284 * flush per txg, as well as the maximum.
285 */
286 boolean_t zfs_dedup_log_hard_cap = B_FALSE;
287
288 /*
289 * Number of txgs to average flow rates across.
290 */
291 uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
292
293 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
294 &ddt_zap_ops,
295 };
296
297 static const char *const ddt_class_name[DDT_CLASSES] = {
298 "ditto",
299 "duplicate",
300 "unique",
301 };
302
303 /*
304 * DDT feature flags automatically enabled for each on-disk version. Note that
305 * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
306 */
307 static const uint64_t ddt_version_flags[] = {
308 [DDT_VERSION_LEGACY] = 0,
309 [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
310 };
311
312 /* per-DDT kstats */
313 typedef struct {
314 /* total lookups and whether they returned new or existing entries */
315 kstat_named_t dds_lookup;
316 kstat_named_t dds_lookup_new;
317 kstat_named_t dds_lookup_existing;
318
319 /* entries found on live tree, and if we had to wait for load */
320 kstat_named_t dds_lookup_live_hit;
321 kstat_named_t dds_lookup_live_wait;
322 kstat_named_t dds_lookup_live_miss;
323
324 /* entries found on log trees */
325 kstat_named_t dds_lookup_log_hit;
326 kstat_named_t dds_lookup_log_active_hit;
327 kstat_named_t dds_lookup_log_flushing_hit;
328 kstat_named_t dds_lookup_log_miss;
329
330 /* entries found on store objects */
331 kstat_named_t dds_lookup_stored_hit;
332 kstat_named_t dds_lookup_stored_miss;
333
334 /* number of entries on log trees */
335 kstat_named_t dds_log_active_entries;
336 kstat_named_t dds_log_flushing_entries;
337
338 /* avg updated/flushed entries per txg */
339 kstat_named_t dds_log_ingest_rate;
340 kstat_named_t dds_log_flush_rate;
341 kstat_named_t dds_log_flush_time_rate;
342 } ddt_kstats_t;
343
344 static const ddt_kstats_t ddt_kstats_template = {
345 { "lookup", KSTAT_DATA_UINT64 },
346 { "lookup_new", KSTAT_DATA_UINT64 },
347 { "lookup_existing", KSTAT_DATA_UINT64 },
348 { "lookup_live_hit", KSTAT_DATA_UINT64 },
349 { "lookup_live_wait", KSTAT_DATA_UINT64 },
350 { "lookup_live_miss", KSTAT_DATA_UINT64 },
351 { "lookup_log_hit", KSTAT_DATA_UINT64 },
352 { "lookup_log_active_hit", KSTAT_DATA_UINT64 },
353 { "lookup_log_flushing_hit", KSTAT_DATA_UINT64 },
354 { "lookup_log_miss", KSTAT_DATA_UINT64 },
355 { "lookup_stored_hit", KSTAT_DATA_UINT64 },
356 { "lookup_stored_miss", KSTAT_DATA_UINT64 },
357 { "log_active_entries", KSTAT_DATA_UINT64 },
358 { "log_flushing_entries", KSTAT_DATA_UINT64 },
359 { "log_ingest_rate", KSTAT_DATA_UINT32 },
360 { "log_flush_rate", KSTAT_DATA_UINT32 },
361 { "log_flush_time_rate", KSTAT_DATA_UINT32 },
362 };
363
364 #ifdef _KERNEL
365 /*
366 * Hot-path lookup counters use wmsums to avoid cache line bouncing.
367 * DDT_KSTAT_BUMP: Increment a wmsum counter (lookup stats).
368 *
369 * Sync-only counters use direct kstat assignment (no atomics needed).
370 * DDT_KSTAT_SET: Set a value (log entry counts, rates).
371 * DDT_KSTAT_SUB: Subtract from a value (decrement log entry counts).
372 * DDT_KSTAT_ZERO: Zero a value (clear log entry counts).
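 *
 * For example, DDT_KSTAT_BUMP(ddt, dds_lookup) expands to
 * wmsum_add(&ddt->ddt_kstat_dds_lookup, 1), while
 * DDT_KSTAT_SET(ddt, dds_log_ingest_rate, r) writes r straight into the
 * corresponding field of the kstat's ks_data.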
373 */
374 #define _DDT_KSTAT_STAT(ddt, stat) \
375 &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
376 #define DDT_KSTAT_BUMP(ddt, stat) \
377 wmsum_add(&(ddt)->ddt_kstat_##stat, 1)
378 #define DDT_KSTAT_SUB(ddt, stat, val) \
379 do { *_DDT_KSTAT_STAT(ddt, stat) -= (val); } while (0)
380 #define DDT_KSTAT_SET(ddt, stat, val) \
381 do { *_DDT_KSTAT_STAT(ddt, stat) = (val); } while (0)
382 #define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0)
383 #else
384 #define DDT_KSTAT_BUMP(ddt, stat) do {} while (0)
385 #define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0)
386 #define DDT_KSTAT_SET(ddt, stat, val) do {} while (0)
387 #define DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
388 #endif /* _KERNEL */
389
390
391 static void
392 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
393 dmu_tx_t *tx)
394 {
395 spa_t *spa = ddt->ddt_spa;
396 objset_t *os = ddt->ddt_os;
397 uint64_t *objectp = &ddt->ddt_object[type][class];
398 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
399 ZCHECKSUM_FLAG_DEDUP;
400 char name[DDT_NAMELEN];
401
402 ASSERT3U(ddt->ddt_dir_object, >, 0);
403
404 ddt_object_name(ddt, type, class, name);
405
406 ASSERT0(*objectp);
407 VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
408 ASSERT3U(*objectp, !=, 0);
409
410 ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
411
412 VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
413 objectp, tx));
414
415 VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
416 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
417 &ddt->ddt_histogram[type][class], tx));
418 }
419
420 static void
421 ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
422 dmu_tx_t *tx)
423 {
424 spa_t *spa = ddt->ddt_spa;
425 objset_t *os = ddt->ddt_os;
426 uint64_t *objectp = &ddt->ddt_object[type][class];
427 uint64_t count;
428 char name[DDT_NAMELEN];
429
430 ASSERT3U(ddt->ddt_dir_object, >, 0);
431
432 ddt_object_name(ddt, type, class, name);
433
434 ASSERT3U(*objectp, !=, 0);
435 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
436 VERIFY0(ddt_object_count(ddt, type, class, &count));
437 VERIFY0(count);
438 VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
439 VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
440 VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
441 memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
442
443 *objectp = 0;
444 }
445
446 static int
447 ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
448 {
449 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
450 dmu_object_info_t doi;
451 uint64_t count;
452 char name[DDT_NAMELEN];
453 int error;
454
455 if (ddt->ddt_dir_object == 0) {
456 /*
457 * If we're configured but the containing dir doesn't exist
458 * yet, then this object can't possibly exist either.
459 */
460 ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
461 return (SET_ERROR(ENOENT));
462 }
463
464 ddt_object_name(ddt, type, class, name);
465
466 error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
467 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
468 if (error != 0)
469 return (error);
470
471 error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
472 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
473 &ddt->ddt_histogram[type][class]);
474 if (error != 0)
475 return (error);
476
477 /*
478 * Seed the cached statistics.
479 */
480 error = ddt_object_info(ddt, type, class, &doi);
481 if (error)
482 return (error);
483
484 error = ddt_object_count(ddt, type, class, &count);
485 if (error)
486 return (error);
487
488 ddo->ddo_count = count;
489 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
490 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
491
492 return (0);
493 }
494
495 static void
496 ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
497 dmu_tx_t *tx)
498 {
499 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
500 dmu_object_info_t doi;
501 uint64_t count;
502 char name[DDT_NAMELEN];
503
504 ddt_object_name(ddt, type, class, name);
505
506 VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
507 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
508 &ddt->ddt_histogram[type][class], tx));
509
510 /*
511 * Cache DDT statistics; this is the only time they'll change.
512 */
513 VERIFY0(ddt_object_info(ddt, type, class, &doi));
514 VERIFY0(ddt_object_count(ddt, type, class, &count));
515
516 ddo->ddo_count = count;
517 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
518 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
519 }
520
521 static boolean_t
522 ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
523 {
524 return (!!ddt->ddt_object[type][class]);
525 }
526
527 static int
528 ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
529 ddt_entry_t *dde)
530 {
531 if (!ddt_object_exists(ddt, type, class))
532 return (SET_ERROR(ENOENT));
533
534 return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
535 ddt->ddt_object[type][class], &dde->dde_key,
536 dde->dde_phys, DDT_PHYS_SIZE(ddt)));
537 }
538
539 static int
540 ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
541 const ddt_key_t *ddk)
542 {
543 if (!ddt_object_exists(ddt, type, class))
544 return (SET_ERROR(ENOENT));
545
546 return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os,
547 ddt->ddt_object[type][class], ddk));
548 }
549
550 static void
551 ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
552 const ddt_key_t *ddk)
553 {
554 if (!ddt_object_exists(ddt, type, class))
555 return;
556
557 ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
558 ddt->ddt_object[type][class], ddk);
559 }
560
561 static void
562 ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
563 {
564 if (!ddt_object_exists(ddt, type, class))
565 return;
566
567 ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os,
568 ddt->ddt_object[type][class]);
569 }
570
571 static int
572 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
573 const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
574 {
575 ASSERT(ddt_object_exists(ddt, type, class));
576
577 return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
578 ddt->ddt_object[type][class], &ddlwe->ddlwe_key,
579 &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
580 }
581
582 static int
583 ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
584 const ddt_key_t *ddk, dmu_tx_t *tx)
585 {
586 ASSERT(ddt_object_exists(ddt, type, class));
587
588 return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
589 ddt->ddt_object[type][class], ddk, tx));
590 }
591
592 int
593 ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
594 uint64_t *walk, ddt_lightweight_entry_t *ddlwe)
595 {
596 ASSERT(ddt_object_exists(ddt, type, class));
597
598 int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
599 ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
600 &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
601 if (error == 0) {
602 ddlwe->ddlwe_type = type;
603 ddlwe->ddlwe_class = class;
604 return (0);
605 }
606 return (error);
607 }
608
609 int
610 ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
611 uint64_t *count)
612 {
613 ASSERT(ddt_object_exists(ddt, type, class));
614
615 return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
616 ddt->ddt_object[type][class], count));
617 }
618
619 int
620 ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
621 dmu_object_info_t *doi)
622 {
623 if (!ddt_object_exists(ddt, type, class))
624 return (SET_ERROR(ENOENT));
625
626 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
627 doi));
628 }
629
630 void
631 ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
632 char *name)
633 {
634 (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
635 zio_checksum_table[ddt->ddt_checksum].ci_name,
636 ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
637 }
638
639 void
640 ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
641 blkptr_t *bp, uint64_t txg)
642 {
643 ASSERT3U(txg, !=, 0);
644 ASSERT3U(v, <, DDT_PHYS_NONE);
645 uint64_t phys_birth;
646 const dva_t *dvap;
647
648 if (v == DDT_PHYS_FLAT) {
649 phys_birth = ddp->ddp_flat.ddp_phys_birth;
650 dvap = ddp->ddp_flat.ddp_dva;
651 } else {
652 phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
653 dvap = ddp->ddp_trad[v].ddp_dva;
654 }
655
656 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
657 bp->blk_dva[d] = dvap[d];
658 BP_SET_BIRTH(bp, txg, phys_birth);
659 }
660
661 /*
662 * The bp created via this function may be used for repairs and scrub, but it
663 * will be missing the salt / IV required to do a full decrypting read.
664 */
665 void
666 ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
667 const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
668 {
669 BP_ZERO(bp);
670
671 if (ddp != NULL)
672 ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
673
674 bp->blk_cksum = ddk->ddk_cksum;
675
676 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
677 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
678 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
679 BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
680 BP_SET_FILL(bp, 1);
681 BP_SET_CHECKSUM(bp, checksum);
682 BP_SET_TYPE(bp, DMU_OT_DEDUP);
683 BP_SET_LEVEL(bp, 0);
684 BP_SET_DEDUP(bp, 1);
685 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
686 }
687
688 void
689 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
690 {
691 ddk->ddk_cksum = bp->blk_cksum;
692 ddk->ddk_prop = 0;
693
694 ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
695
696 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
697 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
698 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
699 DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
700 }
701
702 void
703 ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
704 {
705 ASSERT3U(v, <, DDT_PHYS_NONE);
706 int bp_ndvas = BP_GET_NDVAS(bp);
707 int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
708 SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
709 dva_t *dvas = (v == DDT_PHYS_FLAT) ?
710 ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
711
712 int s = 0, d = 0;
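	/*
	 * Copy the BP's DVAs into the first empty (invalid) slots of the
	 * entry, leaving any DVAs already present in place.
	 */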
713 while (s < bp_ndvas && d < ddp_max_dvas) {
714 if (DVA_IS_VALID(&dvas[d])) {
715 d++;
716 continue;
717 }
718 dvas[d] = bp->blk_dva[s];
719 s++; d++;
720 }
721
722 /*
723 * If the caller offered us more DVAs than we can fit, something has
724 * gone wrong in their accounting. zio_ddt_write() should never ask for
725 * more than we need.
726 */
727 ASSERT3U(s, ==, bp_ndvas);
728
729 if (BP_IS_ENCRYPTED(bp))
730 dvas[2] = bp->blk_dva[2];
731
732 if (ddt_phys_birth(ddp, v) == 0) {
733 if (v == DDT_PHYS_FLAT) {
734 ddp->ddp_flat.ddp_phys_birth =
735 BP_GET_PHYSICAL_BIRTH(bp);
736 } else {
737 ddp->ddp_trad[v].ddp_phys_birth =
738 BP_GET_PHYSICAL_BIRTH(bp);
739 }
740 }
741 }
742
743 void
744 ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig,
745 ddt_phys_variant_t v)
746 {
747 ASSERT3U(v, <, DDT_PHYS_NONE);
748 dva_t *cur_dvas = (v == DDT_PHYS_FLAT) ?
749 cur->ddp_flat.ddp_dva : cur->ddp_trad[v].ddp_dva;
750 dva_t *orig_dvas = (v == DDT_PHYS_FLAT) ?
751 orig->ddp_flat.ddp_dva : orig->ddp_trad[v].ddp_dva;
752
753 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
754 cur_dvas[d] = orig_dvas[d];
755
756 if (ddt_phys_birth(orig, v) == 0) {
757 if (v == DDT_PHYS_FLAT)
758 cur->ddp_flat.ddp_phys_birth = 0;
759 else
760 cur->ddp_trad[v].ddp_phys_birth = 0;
761 }
762 }
763
764 void
765 ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
766 ddt_phys_variant_t v)
767 {
768 ASSERT3U(v, <, DDT_PHYS_NONE);
769
770 if (v == DDT_PHYS_FLAT)
771 dst->ddp_flat = src->ddp_flat;
772 else
773 dst->ddp_trad[v] = src->ddp_trad[v];
774 }
775
776 void
777 ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
778 {
779 ASSERT3U(v, <, DDT_PHYS_NONE);
780
781 if (v == DDT_PHYS_FLAT)
782 memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
783 else
784 memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
785 }
786
787 static uint64_t
788 ddt_class_start(void)
789 {
790 uint64_t start = gethrestime_sec();
791
792 if (unlikely(ddt_prune_artificial_age)) {
793 /*
794 * debug aid -- simulate a wider distribution
795 * so we don't have to wait for an aged DDT
796 * to test prune.
797 */
798 int range = 1 << 21;
799 int percent = random_in_range(100);
800 if (percent < 50) {
801 range = range >> 4;
802 } else if (percent > 75) {
803 range /= 2;
804 }
805 start -= random_in_range(range);
806 }
807
808 return (start);
809 }
810
811 void
812 ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
813 {
814 ASSERT3U(v, <, DDT_PHYS_NONE);
815
816 if (v == DDT_PHYS_FLAT)
817 ddp->ddp_flat.ddp_refcnt++;
818 else
819 ddp->ddp_trad[v].ddp_refcnt++;
820 }
821
822 uint64_t
823 ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
824 {
825 ASSERT3U(v, <, DDT_PHYS_NONE);
826
827 uint64_t *refcntp;
828
829 if (v == DDT_PHYS_FLAT)
830 refcntp = &ddp->ddp_flat.ddp_refcnt;
831 else
832 refcntp = &ddp->ddp_trad[v].ddp_refcnt;
833
834 ASSERT3U(*refcntp, >, 0);
835 (*refcntp)--;
836 return (*refcntp);
837 }
838
839 static void
840 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
841 ddt_phys_variant_t v, uint64_t txg)
842 {
843 blkptr_t blk;
844
845 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
846
847 /*
848 * We clear the dedup bit so that zio_free() will actually free the
849 * space, rather than just decrementing the refcount in the DDT.
850 */
851 BP_SET_DEDUP(&blk, 0);
852
853 ddt_phys_clear(ddp, v);
854 zio_free(ddt->ddt_spa, txg, &blk);
855 }
856
857 uint64_t
858 ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
859 {
860 ASSERT3U(v, <, DDT_PHYS_NONE);
861
862 if (v == DDT_PHYS_FLAT)
863 return (ddp->ddp_flat.ddp_phys_birth);
864 else
865 return (ddp->ddp_trad[v].ddp_phys_birth);
866 }
867
868 int
869 ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
870 {
871 ASSERT3U(v, <, DDT_PHYS_NONE);
872
873 const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
874 ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
875
876 return (DVA_GET_GANG(&dvas[0]));
877 }
878
879 int
880 ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
881 boolean_t encrypted)
882 {
883 ASSERT3U(v, <, DDT_PHYS_NONE);
884
885 const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
886 ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
887
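	/*
	 * For encrypted blocks the third DVA slot carries the salt/IV rather
	 * than a real copy of the data, so it is not counted here.
	 */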
888 return (DVA_IS_VALID(&dvas[0]) +
889 DVA_IS_VALID(&dvas[1]) +
890 DVA_IS_VALID(&dvas[2]) * !encrypted);
891 }
892
893 ddt_phys_variant_t
894 ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
895 {
896 if (dde == NULL)
897 return (DDT_PHYS_NONE);
898
899 const ddt_univ_phys_t *ddp = dde->dde_phys;
900
901 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
902 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
903 BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
904 return (DDT_PHYS_FLAT);
905 }
906 } else /* traditional phys */ {
907 for (int p = 0; p < DDT_PHYS_MAX; p++) {
908 if (DVA_EQUAL(BP_IDENTITY(bp),
909 &ddp->ddp_trad[p].ddp_dva[0]) &&
910 BP_GET_PHYSICAL_BIRTH(bp) ==
911 ddp->ddp_trad[p].ddp_phys_birth) {
912 return (p);
913 }
914 }
915 }
916 return (DDT_PHYS_NONE);
917 }
918
919 uint64_t
920 ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
921 {
922 ASSERT3U(v, <, DDT_PHYS_NONE);
923
924 if (v == DDT_PHYS_FLAT)
925 return (ddp->ddp_flat.ddp_refcnt);
926 else
927 return (ddp->ddp_trad[v].ddp_refcnt);
928 }
929
930 uint64_t
931 ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
932 {
933 uint64_t refcnt = 0;
934
935 if (ddt->ddt_flags & DDT_FLAG_FLAT)
936 refcnt = ddp->ddp_flat.ddp_refcnt;
937 else
938 for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
939 refcnt += ddp->ddp_trad[v].ddp_refcnt;
940
941 return (refcnt);
942 }
943
944 ddt_t *
945 ddt_select(spa_t *spa, const blkptr_t *bp)
946 {
947 ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
948 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
949 }
950
951 void
952 ddt_enter(ddt_t *ddt)
953 {
954 mutex_enter(&ddt->ddt_lock);
955 }
956
957 void
958 ddt_exit(ddt_t *ddt)
959 {
960 mutex_exit(&ddt->ddt_lock);
961 }
962
963 void
964 ddt_init(void)
965 {
966 ddt_cache = kmem_cache_create("ddt_cache",
967 sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
968 ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
969 DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
970 ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
971 DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
972
973 ddt_log_init();
974 }
975
976 void
977 ddt_fini(void)
978 {
979 ddt_log_fini();
980
981 kmem_cache_destroy(ddt_entry_trad_cache);
982 kmem_cache_destroy(ddt_entry_flat_cache);
983 kmem_cache_destroy(ddt_cache);
984 }
985
986 static ddt_entry_t *
987 ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
988 {
989 ddt_entry_t *dde;
990
991 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
992 dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
993 memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
994 } else {
995 dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
996 memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
997 }
998
999 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
1000
1001 dde->dde_key = *ddk;
1002
1003 return (dde);
1004 }
1005
1006 void
1007 ddt_alloc_entry_io(ddt_entry_t *dde)
1008 {
1009 if (dde->dde_io != NULL)
1010 return;
1011
1012 dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
1013 mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
1014 }
1015
1016 static void
1017 ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
1018 {
1019 if (dde->dde_io != NULL) {
1020 for (int p = 0; p < DDT_NPHYS(ddt); p++)
1021 ASSERT0P(dde->dde_io->dde_lead_zio[p]);
1022
1023 if (dde->dde_io->dde_repair_abd != NULL)
1024 abd_free(dde->dde_io->dde_repair_abd);
1025
1026 mutex_destroy(&dde->dde_io->dde_io_lock);
1027 kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
1028 }
1029
1030 cv_destroy(&dde->dde_cv);
1031 kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
1032 ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
1033 }
1034
1035 void
1036 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
1037 {
1038 ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1039
1040 /* Entry is still in the log, so charge the entry back to it */
1041 if (dde->dde_flags & DDE_FLAG_LOGGED) {
1042 ddt_lightweight_entry_t ddlwe;
1043 DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
1044 ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
1045 }
1046
1047 avl_remove(&ddt->ddt_tree, dde);
1048 ddt_free(ddt, dde);
1049 }
1050
1051 /*
1052 * We're considered over quota when we hit 85% full, or for larger drives,
1053 * when there is less than 8GB free.
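 * For example, a 10GiB class hits the limit once 8.5GiB is allocated,
 * while a 1TiB class hits it at 1TiB - 8GiB.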
1054 */
1055 static boolean_t
1056 ddt_special_over_quota(metaslab_class_t *mc)
1057 {
1058 uint64_t allocated = metaslab_class_get_alloc(mc);
1059 uint64_t capacity = metaslab_class_get_space(mc);
1060 uint64_t limit = MAX(capacity * 85 / 100,
1061 (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
1062 return (allocated >= limit);
1063 }
1064
1065 /*
1066 * Check if the DDT is over its quota. This can be due to a few conditions:
1067 * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
1068 * exceeds this limit
1069 *
1070 * 2. 'dedup_table_quota' property is set to automatic and
1071 * a. the dedup or special allocation class could not satisfy a DDT
1072 * allocation in a recent transaction
1073 * b. the dedup or special allocation class has exceeded its 85% limit
1074 */
1075 static boolean_t
1076 ddt_over_quota(spa_t *spa)
1077 {
1078 if (spa->spa_dedup_table_quota == 0)
1079 return (B_FALSE);
1080
1081 if (spa->spa_dedup_table_quota != UINT64_MAX)
1082 return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
1083
1084 /*
1085 * Over quota if we have to allocate outside of the dedup/special class.
1086 */
1087 if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
1088 dedup_class_wait_txgs) {
1089 /* Waiting for some deferred frees to be processed */
1090 return (B_TRUE);
1091 }
1092
1093 /*
1094 * For automatic quota, table size is limited by dedup or special class
1095 */
1096 if (spa_has_dedup(spa))
1097 return (ddt_special_over_quota(spa_dedup_class(spa)));
1098 else if (spa_special_has_ddt(spa))
1099 return (ddt_special_over_quota(spa_special_class(spa)));
1100
1101 return (B_FALSE);
1102 }
1103
1104 void
1105 ddt_prefetch_all(spa_t *spa)
1106 {
1107 /*
1108 * Load all DDT entries for each type/class combination. This is
1109 * intended to perform a prefetch on all such blocks. For the same
1110 * reason that ddt_prefetch isn't locked, this is also not locked.
1111 */
1112 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1113 ddt_t *ddt = spa->spa_ddt[c];
1114 if (!ddt)
1115 continue;
1116
1117 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1118 for (ddt_class_t class = 0; class < DDT_CLASSES;
1119 class++) {
1120 ddt_object_prefetch_all(ddt, type, class);
1121 }
1122 }
1123 }
1124 }
1125
1126 static int ddt_configure(ddt_t *ddt, boolean_t new);
1127
1128 /*
1129 * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
1130 * to the ones in the entry. If they're different, then the passed-in BP is
1131 * from a previous generation of this entry (i.e. was previously pruned) and we
1132 * have to act like the entry doesn't exist at all.
1133 *
1134 * This should only happen during a lookup to free the block (zio_ddt_free()).
1135 *
1136 * XXX this is similar in spirit to ddt_phys_select(), maybe can combine
1137 * -- robn, 2024-02-09
1138 */
1139 static boolean_t
1140 ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
1141 {
1142 /* If the BP has no DVAs, then this entry is good */
1143 uint_t ndvas = BP_GET_NDVAS(bp);
1144 if (ndvas == 0)
1145 return (B_TRUE);
1146
1147 /*
1148 * Only checking the phys for the copies. For flat, there's only one;
1149 * for trad it'll be the one that has the matching set of DVAs.
1150 */
1151 const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
1152 dde->dde_phys->ddp_flat.ddp_dva :
1153 dde->dde_phys->ddp_trad[ndvas].ddp_dva;
1154
1155 /*
1156 * Compare entry DVAs with the BP. They should all be there, but
1157 * there's not really anything we can do if it's only partial anyway;
1158 * that's an error somewhere else, maybe long ago.
1159 */
1160 uint_t d;
1161 for (d = 0; d < ndvas; d++)
1162 if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d]))
1163 return (B_FALSE);
1164 ASSERT3U(d, ==, ndvas);
1165
1166 return (B_TRUE);
1167 }
1168
1169 ddt_entry_t *
1170 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
1171 {
1172 spa_t *spa = ddt->ddt_spa;
1173 ddt_key_t search;
1174 ddt_entry_t *dde;
1175 ddt_type_t type;
1176 ddt_class_t class;
1177 avl_index_t where;
1178 int error;
1179
1180 ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1181
1182 if (unlikely(ddt->ddt_version == DDT_VERSION_UNCONFIGURED)) {
1183 /*
1184 * This is the first use of this DDT since the pool was
1185 * created; finish getting it ready for use.
1186 */
1187 VERIFY0(ddt_configure(ddt, B_TRUE));
1188 ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
1189 }
1190
1191 DDT_KSTAT_BUMP(ddt, dds_lookup);
1192
1193 ddt_key_fill(&search, bp);
1194
1195 /* Find an existing live entry */
1196 dde = avl_find(&ddt->ddt_tree, &search, &where);
1197 if (dde != NULL) {
1198 /* If we went over quota, act like we didn't find it */
1199 if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
1200 return (NULL);
1201
1202 /* If it's already loaded, we can just return it. */
1203 DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
1204 if (dde->dde_flags & DDE_FLAG_LOADED) {
1205 if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1206 return (dde);
1207 return (NULL);
1208 }
1209
1210 /* Someone else is loading it, wait for it. */
1211 dde->dde_waiters++;
1212 DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait);
1213 while (!(dde->dde_flags & DDE_FLAG_LOADED))
1214 cv_wait(&dde->dde_cv, &ddt->ddt_lock);
1215 dde->dde_waiters--;
1216
1217 /* Loaded but over quota, forget we were ever here */
1218 if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
1219 if (dde->dde_waiters == 0) {
1220 avl_remove(&ddt->ddt_tree, dde);
1221 ddt_free(ddt, dde);
1222 }
1223 return (NULL);
1224 }
1225
1226 DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1227
1228 /* Make sure the loaded entry matches the BP */
1229 if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1230 return (dde);
1231 return (NULL);
1232 } else
1233 DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
1234
1235 /* Time to make a new entry. */
1236 dde = ddt_alloc(ddt, &search);
1237
1238 /* Record the time this class was created (used by ddt prune) */
1239 if (ddt->ddt_flags & DDT_FLAG_FLAT)
1240 dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
1241
1242 avl_insert(&ddt->ddt_tree, dde, where);
1243
1244 /* If it's in the log tree, we can "load" it from there */
1245 if (ddt->ddt_flags & DDT_FLAG_LOG) {
1246 ddt_lightweight_entry_t ddlwe;
1247
1248 if (ddt_log_find_key(ddt, &search, &ddlwe)) {
1249 /*
1250 * See if we have the key first, and if so, set up
1251 * the entry.
1252 */
1253 dde->dde_type = ddlwe.ddlwe_type;
1254 dde->dde_class = ddlwe.ddlwe_class;
1255 memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
1256 DDT_PHYS_SIZE(ddt));
1257 /* Whatever we found isn't valid for this BP, eject */
1258 if (verify &&
1259 !ddt_entry_lookup_is_valid(ddt, bp, dde)) {
1260 avl_remove(&ddt->ddt_tree, dde);
1261 ddt_free(ddt, dde);
1262 return (NULL);
1263 }
1264
1265 /* Remove it and count it */
1266 if (ddt_log_remove_key(ddt,
1267 ddt->ddt_log_active, &search)) {
1268 DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
1269 } else {
1270 VERIFY(ddt_log_remove_key(ddt,
1271 ddt->ddt_log_flushing, &search));
1272 DDT_KSTAT_BUMP(ddt,
1273 dds_lookup_log_flushing_hit);
1274 }
1275
1276 dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
1277
1278 DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
1279 DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1280
1281 return (dde);
1282 }
1283
1284 DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
1285 }
1286
1287 /*
1288 * ddt_tree is now stable, so unlock and let everyone else keep moving.
1289 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
1290 * and go to sleep waiting for it above.
1291 */
1292 ddt_exit(ddt);
1293
1294 /* Search all store objects for the entry. */
1295 error = ENOENT;
1296 for (type = 0; type < DDT_TYPES; type++) {
1297 for (class = 0; class < DDT_CLASSES; class++) {
1298 error = ddt_object_lookup(ddt, type, class, dde);
1299 if (error != ENOENT) {
1300 ASSERT0(error);
1301 break;
1302 }
1303 }
1304 if (error != ENOENT)
1305 break;
1306 }
1307
1308 ddt_enter(ddt);
1309
1310 ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));
1311
1312 dde->dde_type = type; /* will be DDT_TYPES if no entry found */
1313 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
1314
1315 boolean_t valid = B_TRUE;
1316
1317 if (dde->dde_type == DDT_TYPES &&
1318 dde->dde_class == DDT_CLASSES &&
1319 ddt_over_quota(spa)) {
1320 /* Over quota. If no one is waiting, clean up right now. */
1321 if (dde->dde_waiters == 0) {
1322 avl_remove(&ddt->ddt_tree, dde);
1323 ddt_free(ddt, dde);
1324 return (NULL);
1325 }
1326
1327 /* Flag cleanup required */
1328 dde->dde_flags |= DDE_FLAG_OVERQUOTA;
1329 } else if (error == 0) {
1330 /*
1331 * If what we loaded is no good for this BP and there's no one
1332 * waiting for it, we can just remove it and get out. If it's no
1333 * good but there are waiters, we have to leave it, because we
1334 * don't know what they want. If it's not needed we'll end up
1335 * taking an entry log/sync, but it can only happen if more
1336 * than one previous version of this block is being deleted at
1337 * the same time. This is extremely unlikely to happen and not
1338 * worth the effort to deal with without taking an entry
1339 * update.
1340 */
1341 valid = !verify || ddt_entry_lookup_is_valid(ddt, bp, dde);
1342 if (!valid && dde->dde_waiters == 0) {
1343 avl_remove(&ddt->ddt_tree, dde);
1344 ddt_free(ddt, dde);
1345 return (NULL);
1346 }
1347
1348 DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
1349 DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1350
1351 /*
1352 * The histograms only track inactive (stored or logged) blocks.
1353 * We've just put an entry onto the live list, so we need to
1354 * remove its counts. When it's synced back, it'll be re-added
1355 * to the right one.
1356 *
1357 * We only do this when we successfully found it in the store.
1358 * error == ENOENT means this is a new entry, and so it's already
1359 * not counted.
1360 */
1361 ddt_histogram_t *ddh =
1362 &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
1363
1364 ddt_lightweight_entry_t ddlwe;
1365 DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
1366 ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
1367 } else {
1368 DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss);
1369 DDT_KSTAT_BUMP(ddt, dds_lookup_new);
1370 }
1371
1372 /* Entry loaded, everyone can proceed now */
1373 dde->dde_flags |= DDE_FLAG_LOADED;
1374 cv_broadcast(&dde->dde_cv);
1375
1376 if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
1377 return (NULL);
1378
1379 return (dde);
1380 }
1381
1382 void
1383 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
1384 {
1385 ddt_t *ddt;
1386 ddt_key_t ddk;
1387
1388 if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
1389 return;
1390
1391 /*
1392 * We only remove the DDT once all tables are empty and only
1393 * prefetch dedup blocks when there are entries in the DDT.
1394 * Thus no locking is required as the DDT can't disappear on us.
1395 */
1396 ddt = ddt_select(spa, bp);
1397 ddt_key_fill(&ddk, bp);
1398
1399 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1400 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1401 ddt_object_prefetch(ddt, type, class, &ddk);
1402 }
1403 }
1404 }
1405
1406 /*
1407 * ddt_key_t comparison. Any struct wanting to make use of this function must
1408 * have the key as the first element. Casts it to N uint64_ts, and checks until
1409 * we find there's a difference. This is intended to match how ddt_zap.c drives
1410 * the ZAPs (first uint64_t as the key prehash), which will minimise the number
1411 * of ZAP blocks touched when flushing logged entries from an AVL walk. This is
1412 * not an invariant for this function though, should you wish to change it.
1413 */
1414 int
1415 ddt_key_compare(const void *x1, const void *x2)
1416 {
1417 const uint64_t *k1 = (const uint64_t *)x1;
1418 const uint64_t *k2 = (const uint64_t *)x2;
1419
1420 int cmp;
1421 for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++)
1422 if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0))
1423 return (cmp);
1424
1425 return (0);
1426 }
1427
1428 /* Create the containing dir for this DDT and bump the feature count */
1429 static void
1430 ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx)
1431 {
1432 ASSERT0(ddt->ddt_dir_object);
1433 ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1434
1435 char name[DDT_NAMELEN];
1436 snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1437 zio_checksum_table[ddt->ddt_checksum].ci_name);
1438
1439 ddt->ddt_dir_object = zap_create_link(ddt->ddt_os,
1440 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx);
1441
1442 VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION,
1443 sizeof (uint64_t), 1, &ddt->ddt_version, tx));
1444 VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS,
1445 sizeof (uint64_t), 1, &ddt->ddt_flags, tx));
1446
1447 spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1448 }
1449
1450 /* Destroy the containing dir and deactivate the feature */
1451 static void
1452 ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
1453 {
1454 ASSERT3U(ddt->ddt_dir_object, !=, 0);
1455 ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT);
1456 ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1457
1458 char name[DDT_NAMELEN];
1459 snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1460 zio_checksum_table[ddt->ddt_checksum].ci_name);
1461
1462 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1463 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1464 ASSERT(!ddt_object_exists(ddt, type, class));
1465 }
1466 }
1467
1468 ddt_log_destroy(ddt, tx);
1469
1470 uint64_t count;
1471 ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
1472 ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
1473 DDT_DIR_VERSION));
1474 ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS));
1475 ASSERT3U(count, ==, 2);
1476
1477 VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
1478 VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx));
1479
1480 ddt->ddt_dir_object = 0;
1481
1482 spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1483 }
1484
1485 /*
1486 * Determine the version, flags and on-disk layout from what's already stored. If there's
1487 * nothing stored, then if new is false, returns ENOENT, and if true, selects
1488 * based on pool config.
1489 */
1490 static int
1491 ddt_configure(ddt_t *ddt, boolean_t new)
1492 {
1493 spa_t *spa = ddt->ddt_spa;
1494 char name[DDT_NAMELEN];
1495 int error;
1496
1497 ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE);
1498
1499 boolean_t fdt_enabled =
1500 spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP);
1501 boolean_t fdt_active =
1502 spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP);
1503
1504 /*
1505 * First, look for the global DDT stats object. If it's not there, then
1506 * there has never been a DDT written before, and we know we're
1507 * starting from scratch.
1508 */
1509 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1510 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1511 &spa->spa_ddt_stat_object);
1512 if (error != 0) {
1513 if (error != ENOENT)
1514 return (error);
1515 goto not_found;
1516 }
1517
1518 if (fdt_active) {
1519 /*
1520 * Now look for a DDT directory. If it exists, then it has
1521 * everything we need.
1522 */
1523 snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1524 zio_checksum_table[ddt->ddt_checksum].ci_name);
1525
1526 error = zap_lookup(spa->spa_meta_objset,
1527 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1528 &ddt->ddt_dir_object);
1529 if (error == 0) {
1530 ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os);
1531
1532 error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1533 DDT_DIR_VERSION, sizeof (uint64_t), 1,
1534 &ddt->ddt_version);
1535 if (error != 0)
1536 return (error);
1537
1538 error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1539 DDT_DIR_FLAGS, sizeof (uint64_t), 1,
1540 &ddt->ddt_flags);
1541 if (error != 0)
1542 return (error);
1543
1544 if (ddt->ddt_version != DDT_VERSION_FDT) {
1545 zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1546 "unknown version %llu", spa_name(spa),
1547 name, (u_longlong_t)ddt->ddt_version);
1548 return (SET_ERROR(EINVAL));
1549 }
1550
1551 if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) {
1552 zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1553 "version=%llu unknown flags %llx",
1554 spa_name(spa), name,
1555 (u_longlong_t)ddt->ddt_flags,
1556 (u_longlong_t)ddt->ddt_version);
1557 return (SET_ERROR(EINVAL));
1558 }
1559
1560 return (0);
1561 }
1562 if (error != ENOENT)
1563 return (error);
1564 }
1565
1566 /* Any object in the root indicates a traditional setup. */
1567 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1568 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1569 ddt_object_name(ddt, type, class, name);
1570 uint64_t obj;
1571 error = zap_lookup(spa->spa_meta_objset,
1572 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t),
1573 1, &obj);
1574 if (error == ENOENT)
1575 continue;
1576 if (error != 0)
1577 return (error);
1578
1579 ddt->ddt_version = DDT_VERSION_LEGACY;
1580 ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1581 ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1582
1583 return (0);
1584 }
1585 }
1586
1587 not_found:
1588 if (!new)
1589 return (SET_ERROR(ENOENT));
1590
1591 /* Nothing on disk, so set up for the best version we can */
1592 if (fdt_enabled) {
1593 ddt->ddt_version = DDT_VERSION_FDT;
1594 ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1595 ddt->ddt_dir_object = 0; /* create on first use */
1596 } else {
1597 ddt->ddt_version = DDT_VERSION_LEGACY;
1598 ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1599 ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1600 }
1601
1602 return (0);
1603 }
1604
1605 static int
1606 ddt_kstat_update(kstat_t *ksp, int rw)
1607 {
1608 ddt_t *ddt = ksp->ks_private;
1609 ddt_kstats_t *dds = ksp->ks_data;
1610
1611 if (rw == KSTAT_WRITE)
1612 return (SET_ERROR(EACCES));
1613
1614 /* Aggregate wmsum counters for lookup stats */
1615 dds->dds_lookup.value.ui64 =
1616 wmsum_value(&ddt->ddt_kstat_dds_lookup);
1617 dds->dds_lookup_live_hit.value.ui64 =
1618 wmsum_value(&ddt->ddt_kstat_dds_lookup_live_hit);
1619 dds->dds_lookup_live_wait.value.ui64 =
1620 wmsum_value(&ddt->ddt_kstat_dds_lookup_live_wait);
1621 dds->dds_lookup_live_miss.value.ui64 =
1622 wmsum_value(&ddt->ddt_kstat_dds_lookup_live_miss);
1623 dds->dds_lookup_existing.value.ui64 =
1624 wmsum_value(&ddt->ddt_kstat_dds_lookup_existing);
1625 dds->dds_lookup_new.value.ui64 =
1626 wmsum_value(&ddt->ddt_kstat_dds_lookup_new);
1627 dds->dds_lookup_log_hit.value.ui64 =
1628 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_hit);
1629 dds->dds_lookup_log_active_hit.value.ui64 =
1630 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1631 dds->dds_lookup_log_flushing_hit.value.ui64 =
1632 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1633 dds->dds_lookup_log_miss.value.ui64 =
1634 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_miss);
1635 dds->dds_lookup_stored_hit.value.ui64 =
1636 wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_hit);
1637 dds->dds_lookup_stored_miss.value.ui64 =
1638 wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_miss);
1639
1640 /* Sync-only counters are already set directly in kstats */
1641
1642 return (0);
1643 }
1644
1645 static void
1646 ddt_table_alloc_kstats(ddt_t *ddt)
1647 {
1648 char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
1649 char *name = kmem_asprintf("ddt_stats_%s",
1650 zio_checksum_table[ddt->ddt_checksum].ci_name);
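	/*
	 * Note (platform-dependent, for illustration only): on Linux the
	 * resulting kstat typically shows up as
	 * /proc/spl/kstat/zfs/<pool>/ddt_stats_<checksum>, so for a
	 * hypothetical pool "tank" using sha256 it could be read with
	 * "cat /proc/spl/kstat/zfs/tank/ddt_stats_sha256".
	 */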
1651
1652 /* Initialize wmsums for lookup counters */
1653 wmsum_init(&ddt->ddt_kstat_dds_lookup, 0);
1654 wmsum_init(&ddt->ddt_kstat_dds_lookup_live_hit, 0);
1655 wmsum_init(&ddt->ddt_kstat_dds_lookup_live_wait, 0);
1656 wmsum_init(&ddt->ddt_kstat_dds_lookup_live_miss, 0);
1657 wmsum_init(&ddt->ddt_kstat_dds_lookup_existing, 0);
1658 wmsum_init(&ddt->ddt_kstat_dds_lookup_new, 0);
1659 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_hit, 0);
1660 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_active_hit, 0);
1661 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_flushing_hit, 0);
1662 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_miss, 0);
1663 wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_hit, 0);
1664 wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_miss, 0);
1665
1666 ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED,
1667 sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1668 if (ddt->ddt_ksp != NULL) {
1669 ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP);
1670 memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t));
1671 ddt->ddt_ksp->ks_data = dds;
1672 ddt->ddt_ksp->ks_update = ddt_kstat_update;
1673 ddt->ddt_ksp->ks_private = ddt;
1674 kstat_install(ddt->ddt_ksp);
1675 }
1676
1677 kmem_strfree(name);
1678 kmem_strfree(mod);
1679 }
1680
1681 static ddt_t *
1682 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
1683 {
1684 ddt_t *ddt;
1685
1686 ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
1687 memset(ddt, 0, sizeof (ddt_t));
1688 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
1689 avl_create(&ddt->ddt_tree, ddt_key_compare,
1690 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1691 avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
1692 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1693
1694 ddt->ddt_checksum = c;
1695 ddt->ddt_spa = spa;
1696 ddt->ddt_os = spa->spa_meta_objset;
1697 ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
1698 ddt->ddt_log_flush_pressure = 10;
1699
1700 ddt_log_alloc(ddt);
1701 ddt_table_alloc_kstats(ddt);
1702
1703 return (ddt);
1704 }
1705
1706 static void
1707 ddt_table_free(ddt_t *ddt)
1708 {
1709 if (ddt->ddt_ksp != NULL) {
1710 kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
1711 ddt->ddt_ksp->ks_data = NULL;
1712 kstat_delete(ddt->ddt_ksp);
1713 }
1714
1715 /* Cleanup wmsums for lookup counters */
1716 wmsum_fini(&ddt->ddt_kstat_dds_lookup);
1717 wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_hit);
1718 wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_wait);
1719 wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_miss);
1720 wmsum_fini(&ddt->ddt_kstat_dds_lookup_existing);
1721 wmsum_fini(&ddt->ddt_kstat_dds_lookup_new);
1722 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_hit);
1723 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1724 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1725 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_miss);
1726 wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_hit);
1727 wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_miss);
1728
1729 ddt_log_free(ddt);
1730 ASSERT0(avl_numnodes(&ddt->ddt_tree));
1731 ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
1732 avl_destroy(&ddt->ddt_tree);
1733 avl_destroy(&ddt->ddt_repair_tree);
1734 mutex_destroy(&ddt->ddt_lock);
1735 kmem_cache_free(ddt_cache, ddt);
1736 }
1737
1738 void
1739 ddt_create(spa_t *spa)
1740 {
1741 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
1742
1743 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1744 if (DDT_CHECKSUM_VALID(c))
1745 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
1746 }
1747 }
1748
1749 int
1750 ddt_load(spa_t *spa)
1751 {
1752 int error;
1753
1754 ddt_create(spa);
1755
1756 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1757 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1758 &spa->spa_ddt_stat_object);
1759 if (error)
1760 return (error == ENOENT ? 0 : error);
1761
1762 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1763 if (!DDT_CHECKSUM_VALID(c))
1764 continue;
1765
1766 ddt_t *ddt = spa->spa_ddt[c];
1767 error = ddt_configure(ddt, B_FALSE);
1768 if (error == ENOENT)
1769 continue;
1770 if (error != 0)
1771 return (error);
1772
1773 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1774 for (ddt_class_t class = 0; class < DDT_CLASSES;
1775 class++) {
1776 error = ddt_object_load(ddt, type, class);
1777 if (error != 0 && error != ENOENT)
1778 return (error);
1779 }
1780 }
1781
1782 if (ddt->ddt_flags & DDT_FLAG_LOG) {
1783 error = ddt_log_load(ddt);
1784 if (error != 0 && error != ENOENT)
1785 return (error);
1786 }
1787
1788 DDT_KSTAT_SET(ddt, dds_log_active_entries,
1789 avl_numnodes(&ddt->ddt_log_active->ddl_tree));
1790 DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
1791 avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
1792
1793 /*
1794 * Seed the cached histograms.
1795 */
1796 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
1797 sizeof (ddt->ddt_histogram));
1798 }
1799
1800 spa->spa_dedup_dspace = ~0ULL;
1801 spa->spa_dedup_dsize = ~0ULL;
1802
1803 return (0);
1804 }
1805
1806 void
1807 ddt_unload(spa_t *spa)
1808 {
1809 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1810 if (spa->spa_ddt[c]) {
1811 ddt_table_free(spa->spa_ddt[c]);
1812 spa->spa_ddt[c] = NULL;
1813 }
1814 }
1815 }
1816
1817 boolean_t
1818 ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
1819 {
1820 ddt_t *ddt;
1821 ddt_key_t ddk;
1822
1823 if (!BP_GET_DEDUP(bp))
1824 return (B_FALSE);
1825
1826 if (max_class == DDT_CLASS_UNIQUE)
1827 return (B_TRUE);
1828
1829 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
1830
1831 ddt_key_fill(&ddk, bp);
1832
1833 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1834 for (ddt_class_t class = 0; class <= max_class; class++) {
1835 if (ddt_object_contains(ddt, type, class, &ddk) == 0)
1836 return (B_TRUE);
1837 }
1838 }
1839
1840 return (B_FALSE);
1841 }
1842
1843 ddt_entry_t *
1844 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
1845 {
1846 ddt_key_t ddk;
1847 ddt_entry_t *dde;
1848
1849 ddt_key_fill(&ddk, bp);
1850
1851 dde = ddt_alloc(ddt, &ddk);
1852 ddt_alloc_entry_io(dde);
1853
1854 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1855 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1856 /*
1857 * We can only do repair if there are multiple copies
1858 * of the block. For anything in the UNIQUE class,
1859 * there's definitely only one copy, so don't even try.
1860 */
1861 if (class != DDT_CLASS_UNIQUE &&
1862 ddt_object_lookup(ddt, type, class, dde) == 0)
1863 return (dde);
1864 }
1865 }
1866
1867 memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));
1868
1869 return (dde);
1870 }
1871
1872 void
1873 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
1874 {
1875 avl_index_t where;
1876
1877 ddt_enter(ddt);
1878
1879 if (dde->dde_io->dde_repair_abd != NULL &&
1880 spa_writeable(ddt->ddt_spa) &&
1881 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
1882 avl_insert(&ddt->ddt_repair_tree, dde, where);
1883 else
1884 ddt_free(ddt, dde);
1885
1886 ddt_exit(ddt);
1887 }
1888
1889 static void
1890 ddt_repair_entry_done(zio_t *zio)
1891 {
1892 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
1893 ddt_entry_t *rdde = zio->io_private;
1894
1895 ddt_free(ddt, rdde);
1896 }
1897
1898 static void
1899 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
1900 {
1901 ddt_key_t *ddk = &dde->dde_key;
1902 ddt_key_t *rddk = &rdde->dde_key;
1903 zio_t *zio;
1904 blkptr_t blk;
1905
1906 zio = zio_null(rio, rio->io_spa, NULL,
1907 ddt_repair_entry_done, rdde, rio->io_flags);
1908
1909 for (int p = 0; p < DDT_NPHYS(ddt); p++) {
1910 ddt_univ_phys_t *ddp = dde->dde_phys;
1911 ddt_univ_phys_t *rddp = rdde->dde_phys;
1912 ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
1913 uint64_t phys_birth = ddt_phys_birth(ddp, v);
1914 const dva_t *dvas, *rdvas;
1915
1916 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
1917 dvas = ddp->ddp_flat.ddp_dva;
1918 rdvas = rddp->ddp_flat.ddp_dva;
1919 } else {
1920 dvas = ddp->ddp_trad[p].ddp_dva;
1921 rdvas = rddp->ddp_trad[p].ddp_dva;
1922 }
1923
1924 if (phys_birth == 0 ||
1925 phys_birth != ddt_phys_birth(rddp, v) ||
1926 memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
1927 continue;
1928
1929 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
1930 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
1931 rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
1932 NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
1933 ZIO_DDT_CHILD_FLAGS(zio), NULL));
1934 }
1935
1936 zio_nowait(zio);
1937 }
1938
1939 static void
1940 ddt_repair_table(ddt_t *ddt, zio_t *rio)
1941 {
1942 spa_t *spa = ddt->ddt_spa;
1943 ddt_entry_t *dde, *rdde_next, *rdde;
1944 avl_tree_t *t = &ddt->ddt_repair_tree;
1945 blkptr_t blk;
1946
1947 if (spa_sync_pass(spa) > 1)
1948 return;
1949
1950 ddt_enter(ddt);
1951 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
1952 rdde_next = AVL_NEXT(t, rdde);
1953 avl_remove(&ddt->ddt_repair_tree, rdde);
1954 ddt_exit(ddt);
1955 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
1956 DDT_PHYS_NONE, &blk);
1957 dde = ddt_repair_start(ddt, &blk);
1958 ddt_repair_entry(ddt, dde, rdde, rio);
1959 ddt_repair_done(ddt, dde);
1960 ddt_enter(ddt);
1961 }
1962 ddt_exit(ddt);
1963 }
1964
1965 static void
1966 ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
1967 {
1968 /*
1969 * Count all the entries stored for each type/class, and update the
1970 * stats within (ddt_object_sync()). If there are no entries for the
1971 * type/class, the whole object is removed. If all objects for the DDT
1972 * are removed, its containing dir is removed, effectively resetting
1973 * the entire DDT to an empty slate.
1974 */
1975 uint64_t count = 0;
1976 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1977 uint64_t add, tcount = 0;
1978 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1979 if (ddt_object_exists(ddt, type, class)) {
1980 ddt_object_sync(ddt, type, class, tx);
1981 VERIFY0(ddt_object_count(ddt, type, class,
1982 &add));
1983 tcount += add;
1984 }
1985 }
1986 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1987 if (tcount == 0 && ddt_object_exists(ddt, type, class))
1988 ddt_object_destroy(ddt, type, class, tx);
1989 }
1990 count += tcount;
1991 }
1992
1993 if (ddt->ddt_flags & DDT_FLAG_LOG) {
1994 /* Include logged entries in the total count */
1995 count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
1996 count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
1997 }
1998
1999 if (count == 0) {
2000 /*
2001 * No entries left on the DDT, so reset the version for next
2002 * time. This allows us to handle the feature being changed
2003 * since the DDT was originally created. New entries should get
2004 * whatever the feature currently demands.
2005 */
2006 if (ddt->ddt_version == DDT_VERSION_FDT)
2007 ddt_destroy_dir(ddt, tx);
2008
2009 ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
2010 ddt->ddt_flags = 0;
2011 }
2012
2013 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2014 sizeof (ddt->ddt_histogram));
2015 ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2016 ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2017 }
2018
2019 static void
2020 ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
2021 {
2022 dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
2023
2024 /*
2025 * Compute the target class, so we can decide whether or not to inform
2026 * the scrub traversal (below). Note that we don't store this in the
2027 * entry, as it might change multiple times before finally being
2028 * committed (if we're logging). Instead, we recompute it in
2029 * ddt_sync_flush_entry().
2030 */
2031 uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
2032 ddt_class_t nclass =
2033 (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2034
2035 /*
2036 * If the class changes, the order that we scan this bp changes. If it
2037 * decreases, we could miss it, so scan it right now. (This covers both
2038 * class changing while we are doing ddt_walk(), and when we are
2039 * traversing.)
2040 *
2041 * We also do this when the refcnt goes to zero, because that change is
2042 * only in the log so far; the blocks on disk won't be freed until
2043 * the log is flushed, and the refcnt might increase before that. If it
2044 * does, then we could miss it in the same way.
2045 */
2046 if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
2047 dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
2048 ddlwe, tx);
2049 }
2050
2051 static void
2052 ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
2053 ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
2054 {
2055 ddt_key_t *ddk = &ddlwe->ddlwe_key;
2056 ddt_type_t ntype = DDT_TYPE_DEFAULT;
2057 uint64_t refcnt = 0;
2058
2059 /*
2060 * Compute the total refcnt. Along the way, issue frees for any DVAs
2061 * we no longer want.
2062 */
2063 for (int p = 0; p < DDT_NPHYS(ddt); p++) {
2064 ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
2065 ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2066 uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
2067
2068 if (ddt_phys_birth(ddp, v) == 0) {
2069 ASSERT0(phys_refcnt);
2070 continue;
2071 }
2072 if (DDT_PHYS_IS_DITTO(ddt, p)) {
2073 /*
2074 * We don't want to keep any obsolete slots (eg ditto),
2075 * regardless of their refcount, but we don't want to
2076 * leak them either. So, free them.
2077 */
2078 ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2079 continue;
2080 }
2081 if (phys_refcnt == 0)
2082 /* No remaining references, free it! */
2083 ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2084 refcnt += phys_refcnt;
2085 }
2086
2087 /* Select the best class for the entry. */
2088 ddt_class_t nclass =
2089 (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2090
2091 /*
2092 * If an existing entry changed type or class, or its refcount reached
2093 * zero, delete it from the DDT object
2094 */
2095 if (otype != DDT_TYPES &&
2096 (otype != ntype || oclass != nclass || refcnt == 0)) {
2097 VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
2098 ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
2099 }
2100
2101 /*
2102 * Add or update the entry
2103 */
2104 if (refcnt != 0) {
2105 ddt_histogram_t *ddh =
2106 &ddt->ddt_histogram[ntype][nclass];
2107
2108 ddt_histogram_add_entry(ddt, ddh, ddlwe);
2109
2110 if (!ddt_object_exists(ddt, ntype, nclass))
2111 ddt_object_create(ddt, ntype, nclass, tx);
2112 VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
2113 }
2114 }
2115
2116 /* Calculate an exponential weighted moving average, lower limited to zero */
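/*
 * A rough worked example (illustrative numbers only): with prev = 100,
 * val = 220 and weight = 4, the result is 100 + (220 - 100) / 4 = 130; with
 * val = 0 it would instead be 100 + (0 - 100) / 4 = 75. Larger weights make
 * the average respond more slowly to each new sample.
 */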
2117 static inline int32_t
2118 _ewma(int32_t val, int32_t prev, uint32_t weight)
2119 {
2120 ASSERT3U(val, >=, 0);
2121 ASSERT3U(prev, >=, 0);
2122 const int32_t new =
2123 MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
2124 ASSERT3U(new, >=, 0);
2125 return (new);
2126 }
2127
2128 static inline void
2129 ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
2130 {
2131 /*
2132 * If we're not forcing flush, and not being asked to start, then
2133 * there's nothing more to do.
2134 */
2135 if (txg == 0) {
2136 /* Update requested, are we currently forcing flush? */
2137 if (ddt->ddt_flush_force_txg == 0)
2138 return;
2139 txg = ddt->ddt_flush_force_txg;
2140 }
2141
2142 /*
2143 * If either of the logs has unflushed entries before
2144 * the wanted txg, set the force txg, otherwise clear it.
2145 */
2146
2147 if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
2148 ddt->ddt_log_active->ddl_first_txg <= txg) ||
2149 (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2150 ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
2151 ddt->ddt_flush_force_txg = txg;
2152 return;
2153 }
2154
2155 /*
2156 * Nothing to flush behind the given txg, so we can clear force flush
2157 * state.
2158 */
2159 ddt->ddt_flush_force_txg = 0;
2160 }
2161
2162 static void
2163 ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
2164 {
2165 spa_t *spa = ddt->ddt_spa;
2166 ASSERT(avl_is_empty(&ddt->ddt_tree));
2167
2168 /*
2169 * Don't do any flushing when the pool is ready to shut down, or in
2170 * passes beyond the first.
2171 */
2172 if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa))
2173 return;
2174
2175 hrtime_t flush_start = gethrtime();
2176 uint32_t count = 0;
2177
2178 /*
2179 * How many entries we need to flush. We need to at
2180 * least match the ingest rate, and also consider the
2181 * current backlog of entries.
2182 */
2183 uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) +
2184 avl_numnodes(&ddt->ddt_log_active->ddl_tree);
2185
2186 if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
2187 goto housekeeping;
2188
2189 uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs);
2190 uint64_t cap = MAX(1, zfs_dedup_log_cap);
2191 uint64_t flush_min = MAX(backlog / txgs,
2192 zfs_dedup_log_flush_entries_min);
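	/*
	 * Illustrative example (not tuning guidance): with a combined
	 * backlog of 50000 entries and zfs_dedup_log_flush_txgs at 100, the
	 * backlog term is 500, so flush_min becomes
	 * MAX(500, zfs_dedup_log_flush_entries_min).
	 */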
2193
2194 /*
2195 * The theory for this block is that if we increase the pressure while
2196 * we're growing above the cap, and remove it when we're significantly
2197 * below the cap, we'll stay near cap while not bouncing around too
2198 * much.
2199 *
2200 * The factor of 10 is to smooth the pressure effect by expressing it
2201 * in tenths. The addition of the cap to the backlog in the second
2202 * block is to round up, instead of down. We never let the pressure go
2203 * below 1 (10 tenths).
2204 */
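	/*
	 * Rough example of the arithmetic below (illustrative numbers only):
	 * with cap = 1000 and a growing backlog of 2500, pressure rises by
	 * 10 * 2500 / 1000 = 25 tenths this txg. Once the backlog has fallen
	 * to 600, pressure drops by 11 - ceil(10 * 600 / 1000) = 5 tenths
	 * per txg, but never below the floor of 10 (a 1.0x multiplier).
	 */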
2205 if (cap != UINT_MAX && backlog > cap &&
2206 backlog > ddt->ddt_log_flush_prev_backlog) {
2207 ddt->ddt_log_flush_pressure += 10 * backlog / cap;
2208 } else if (cap != UINT_MAX && backlog < cap) {
2209 ddt->ddt_log_flush_pressure -=
2210 11 - (((10 * backlog) + cap - 1) / cap);
2211 ddt->ddt_log_flush_pressure =
2212 MAX(ddt->ddt_log_flush_pressure, 10);
2213 }
2214
2215 if (zfs_dedup_log_hard_cap && cap != UINT_MAX)
2216 flush_min = MAX(flush_min, MIN(backlog - cap,
2217 (flush_min * ddt->ddt_log_flush_pressure) / 10));
2218
2219 uint64_t flush_max;
2220
2221 /*
2222 * If we've been asked to flush everything in a hurry,
2223 * try to dump as much as possible on this txg. In
2224 * this case we're only limited by time, not amount.
2225 *
2226 * Otherwise, if we are over the cap, try to get back down to it.
2227 *
2228 * Finally if there is no cap (or no pressure), just set the max a
2229 * little higher than the min to help smooth out variations in flush
2230 * times.
2231 */
2232 if (ddt->ddt_flush_force_txg > 0)
2233 flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
2234 else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap)
2235 flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap,
2236 (flush_min * ddt->ddt_log_flush_pressure) / 10));
2237 else
2238 flush_max = flush_min * 5 / 4;
2239 flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max);
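	/*
	 * For example (illustrative only): with flush_min = 800 and the
	 * pressure at its floor of 10, the soft-cap branch works out to
	 * MAX(1000, MIN(backlog - cap, 800)) = 1000, the same 5/4 headroom
	 * as the no-cap case; higher pressure pushes flush_max up toward
	 * backlog - cap, still bounded by zfs_dedup_log_flush_entries_max.
	 */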
2240
2241 /*
2242 * When the pool is busy or someone is explicitly waiting for this txg
2243 * to complete, use zfs_dedup_log_flush_min_time_ms (capped at half the
2244 * txg timeout). Otherwise use half of the time in the txg timeout.
2245 */
2246 uint64_t target_time;
2247
2248 if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ||
2249 vdev_queue_pool_busy(spa)) {
2250 target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
2251 SEC2NSEC(zfs_txg_timeout) / 2);
2252 } else {
2253 target_time = SEC2NSEC(zfs_txg_timeout) / 2;
2254 }
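	/*
	 * For example (assuming the default zfs_txg_timeout of 5 seconds,
	 * which is tunable): an idle pool gets a 2.5 second flush budget,
	 * while a busy pool is limited to zfs_dedup_log_flush_min_time_ms,
	 * itself capped at that same 2.5 seconds.
	 */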
2255
2256 ddt_lightweight_entry_t ddlwe;
2257 while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
2258 ddt_sync_flush_entry(ddt, &ddlwe,
2259 ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
2260
2261 /* End if we've synced as much as we needed to. */
2262 if (++count >= flush_max)
2263 break;
2264
2265 /*
2266 * As long as we've flushed the absolute minimum,
2267 * stop if we're way over our target time.
2268 */
2269 uint64_t diff = gethrtime() - flush_start;
2270 if (count > zfs_dedup_log_flush_entries_min &&
2271 diff >= target_time * 2)
2272 break;
2273
2274 /*
2275 * End if we've passed the minimum flush and we're out of time.
2276 */
2277 if (count > flush_min && diff >= target_time)
2278 break;
2279 }
2280
2281 if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
2282 /* We emptied it, so truncate on-disk */
2283 DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
2284 ddt_log_truncate(ddt, tx);
2285 } else {
2286 /* More to do next time, save checkpoint */
2287 DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
2288 ddt_log_checkpoint(ddt, &ddlwe, tx);
2289 }
2290
2291 ddt_sync_update_stats(ddt, tx);
2292
2293 housekeeping:
2294 if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2295 !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
2296 /*
2297 * No more to flush, and the active list has stuff, so
2298 * try to swap the logs for next time.
2299 */
2300 if (ddt_log_swap(ddt, tx)) {
2301 DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
2302 DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
2303 avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
2304 }
2305 }
2306
2307 /* If force flush is no longer necessary, turn it off. */
2308 ddt_flush_force_update_txg(ddt, 0);
2309
2310 ddt->ddt_log_flush_prev_backlog = backlog;
2311
2312 /*
2313 * Update flush rate. This is an exponential weighted moving
2314 * average of the number of entries flushed over recent txgs.
2315 */
2316 ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate,
2317 zfs_dedup_log_flush_flow_rate_txgs);
2318 DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);
2319
2320 /*
2321 * Update flush time rate. This is an exponential weighted moving
2322 * average of the total time taken to flush over recent txgs.
2323 */
2324 ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate,
2325 (int32_t)NSEC2MSEC(gethrtime() - flush_start),
2326 zfs_dedup_log_flush_flow_rate_txgs);
2327 DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
2328 ddt->ddt_log_flush_time_rate);
2329 if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 &&
2330 zfs_flags & ZFS_DEBUG_DDT) {
2331 zfs_dbgmsg("%lu entries remain (%lu in active), flushed %u @ "
2332 "txg %llu, in %llu ms, flush rate %d, time rate %d",
2333 (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree),
2334 (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree),
2335 count, (u_longlong_t)tx->tx_txg,
2336 (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start),
2337 ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate);
2338 }
2339 }
2340
2341 static void
2342 ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
2343 {
2344 uint64_t count = avl_numnodes(&ddt->ddt_tree);
2345
2346 if (count > 0) {
2347 ddt_log_update_t dlu = {0};
2348 ddt_log_begin(ddt, count, tx, &dlu);
2349
2350 ddt_entry_t *dde;
2351 void *cookie = NULL;
2352 ddt_lightweight_entry_t ddlwe;
2353 while ((dde =
2354 avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
2355 ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2356 DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2357 ddt_log_entry(ddt, &ddlwe, &dlu);
2358 ddt_sync_scan_entry(ddt, &ddlwe, tx);
2359 ddt_free(ddt, dde);
2360 }
2361
2362 ddt_log_commit(ddt, &dlu);
2363
2364 DDT_KSTAT_SET(ddt, dds_log_active_entries,
2365 avl_numnodes(&ddt->ddt_log_active->ddl_tree));
2366
2367 /*
2368 * Sync the stats for the store objects. Even though we haven't
2369 * modified anything on those objects, they're no longer the
2370 * source of truth for entries that are now in the log, and we
2371 * need the on-disk counts to reflect that, otherwise we'll
2372 * miscount later when importing.
2373 */
2374 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
2375 for (ddt_class_t class = 0;
2376 class < DDT_CLASSES; class++) {
2377 if (ddt_object_exists(ddt, type, class))
2378 ddt_object_sync(ddt, type, class, tx);
2379 }
2380 }
2381
2382 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2383 sizeof (ddt->ddt_histogram));
2384 ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2385 ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2386 }
2387
2388 if (spa_sync_pass(ddt->ddt_spa) == 1) {
2389 /*
2390 * Update ingest rate. This is an exponential weighted moving
2391 * average of the number of entries changed over recent txgs.
2392 * The ramp-up cost shouldn't matter too much because the
2393 * flusher will be trying to take at least the minimum anyway.
2394 */
2395 ddt->ddt_log_ingest_rate = _ewma(
2396 count, ddt->ddt_log_ingest_rate,
2397 zfs_dedup_log_flush_flow_rate_txgs);
2398 DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
2399 ddt->ddt_log_ingest_rate);
2400 }
2401 }
2402
2403 static void
2404 ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
2405 {
2406 if (avl_numnodes(&ddt->ddt_tree) == 0)
2407 return;
2408
2409 ddt_entry_t *dde;
2410 void *cookie = NULL;
2411 while ((dde = avl_destroy_nodes(
2412 &ddt->ddt_tree, &cookie)) != NULL) {
2413 ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2414
2415 ddt_lightweight_entry_t ddlwe;
2416 DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2417 ddt_sync_flush_entry(ddt, &ddlwe,
2418 dde->dde_type, dde->dde_class, tx);
2419 ddt_sync_scan_entry(ddt, &ddlwe, tx);
2420 ddt_free(ddt, dde);
2421 }
2422
2423 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2424 sizeof (ddt->ddt_histogram));
2425 ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2426 ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2427 ddt_sync_update_stats(ddt, tx);
2428 }
2429
2430 static void
2431 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
2432 {
2433 spa_t *spa = ddt->ddt_spa;
2434
2435 if (ddt->ddt_version == UINT64_MAX)
2436 return;
2437
2438 if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
2439 ASSERT0(avl_numnodes(&ddt->ddt_tree));
2440 return;
2441 }
2442
2443 if (spa->spa_ddt_stat_object == 0) {
2444 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
2445 DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
2446 DMU_POOL_DDT_STATS, tx);
2447 }
2448
2449 if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
2450 ddt_create_dir(ddt, tx);
2451
2452 if (ddt->ddt_flags & DDT_FLAG_LOG)
2453 ddt_sync_table_log(ddt, tx);
2454 else
2455 ddt_sync_table_flush(ddt, tx);
2456 }
2457
2458 void
2459 ddt_sync(spa_t *spa, uint64_t txg)
2460 {
2461 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
2462 dmu_tx_t *tx;
2463 zio_t *rio;
2464
2465 ASSERT3U(spa_syncing_txg(spa), ==, txg);
2466
2467 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2468
2469 rio = zio_root(spa, NULL, NULL,
2470 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
2471
2472 /*
2473 * This function may cause an immediate scan of ddt blocks (see
2474 * the comment above dsl_scan_ddt() for details). We set the
2475 * scan's root zio here so that we can wait for any scan IOs in
2476 * addition to the regular ddt IOs.
2477 */
2478 ASSERT0P(scn->scn_zio_root);
2479 scn->scn_zio_root = rio;
2480
2481 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2482 ddt_t *ddt = spa->spa_ddt[c];
2483 if (ddt == NULL)
2484 continue;
2485 ddt_sync_table(ddt, tx);
2486 if (ddt->ddt_flags & DDT_FLAG_LOG)
2487 ddt_sync_flush_log(ddt, tx);
2488 ddt_repair_table(ddt, rio);
2489 }
2490
2491 (void) zio_wait(rio);
2492 scn->scn_zio_root = NULL;
2493
2494 dmu_tx_commit(tx);
2495 }
2496
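/*
 * The walk entry points below cooperate with the log flusher:
 * ddt_walk_init() asks each logging DDT to force-flush everything up to the
 * given txg, ddt_walk_ready() reports whether those forced flushes have
 * completed, and ddt_walk_impl() returns EAGAIN for a logging DDT that is
 * still flushing, so callers (eg the scan code) can retry once the stored
 * objects are the complete source of truth.
 */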
2497 void
2498 ddt_walk_init(spa_t *spa, uint64_t txg)
2499 {
2500 if (txg == 0)
2501 txg = spa_syncing_txg(spa);
2502
2503 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2504 ddt_t *ddt = spa->spa_ddt[c];
2505 if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2506 continue;
2507
2508 ddt_enter(ddt);
2509 ddt_flush_force_update_txg(ddt, txg);
2510 ddt_exit(ddt);
2511 }
2512 }
2513
2514 boolean_t
2515 ddt_walk_ready(spa_t *spa)
2516 {
2517 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2518 ddt_t *ddt = spa->spa_ddt[c];
2519 if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2520 continue;
2521
2522 if (ddt->ddt_flush_force_txg > 0)
2523 return (B_FALSE);
2524 }
2525
2526 return (B_TRUE);
2527 }
2528
2529 static int
2530 ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
2531 uint64_t flags, boolean_t wait)
2532 {
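	/*
	 * The bookmark advances checksum-fastest: for each class, every type
	 * is visited, and within each type every checksum's object is walked
	 * to completion before moving on. Classes are visited in increasing
	 * order (ditto, duplicate, then unique); this is the ordering that
	 * ddt_sync_scan_entry() assumes when it decides whether a class
	 * change could cause the scan to miss an entry.
	 */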
2533 do {
2534 do {
2535 do {
2536 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
2537 if (ddt == NULL)
2538 continue;
2539
2540 if (flags != 0 &&
2541 (ddt->ddt_flags & flags) != flags)
2542 continue;
2543
2544 if (wait && ddt->ddt_flush_force_txg > 0)
2545 return (EAGAIN);
2546
2547 int error = ENOENT;
2548 if (ddt_object_exists(ddt, ddb->ddb_type,
2549 ddb->ddb_class)) {
2550 error = ddt_object_walk(ddt,
2551 ddb->ddb_type, ddb->ddb_class,
2552 &ddb->ddb_cursor, ddlwe);
2553 }
2554 if (error == 0)
2555 return (0);
2556 if (error != ENOENT)
2557 return (error);
2558 ddb->ddb_cursor = 0;
2559 } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
2560 ddb->ddb_checksum = 0;
2561 } while (++ddb->ddb_type < DDT_TYPES);
2562 ddb->ddb_type = 0;
2563 } while (++ddb->ddb_class < DDT_CLASSES);
2564
2565 return (SET_ERROR(ENOENT));
2566 }
2567
2568 int
2569 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
2570 {
2571 return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE));
2572 }
2573
2574 /*
2575 * This function is used by Block Cloning (brt.c) to increase reference
2576 * counter for the DDT entry if the block is already in DDT.
2577 *
2578 * Return false if the block, despite having the D bit set, is not present
2579 * in the DDT. This is possible when the DDT has been pruned by an admin
2580 * or by the DDT quota mechanism.
2581 */
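/*
 * A minimal usage sketch based on the description above (the exact call
 * site in brt.c may differ):
 *
 *	if (BP_GET_DEDUP(bp) && ddt_addref(spa, bp)) {
 *		// reference now held by the DDT entry
 *	} else {
 *		// not (or no longer) in the DDT; take a BRT reference instead
 *	}
 */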
2582 boolean_t
2583 ddt_addref(spa_t *spa, const blkptr_t *bp)
2584 {
2585 ddt_t *ddt;
2586 ddt_entry_t *dde;
2587 boolean_t result;
2588
2589 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2590 ddt = ddt_select(spa, bp);
2591 ddt_enter(ddt);
2592
2593 dde = ddt_lookup(ddt, bp, B_TRUE);
2594
2595 /* Can be NULL if the entry for this block was pruned. */
2596 if (dde == NULL) {
2597 ddt_exit(ddt);
2598 spa_config_exit(spa, SCL_ZIO, FTAG);
2599 return (B_FALSE);
2600 }
2601
2602 if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
2603 /*
2604 * This entry was either synced to a store object (dde_type is
2605 * real) or was logged. It must be properly on disk at this
2606 * point, so we can just bump its refcount.
2607 */
2608 int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
2609 ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2610
2611 ddt_phys_addref(dde->dde_phys, v);
2612 result = B_TRUE;
2613 } else {
2614 /*
2615 * If the block has the DEDUP flag set, it still might not
2616 * exist in the DEDUP table due to DDT pruning of entries
2617 * where refcnt=1.
2618 */
2619 ddt_remove(ddt, dde);
2620 result = B_FALSE;
2621 }
2622
2623 ddt_exit(ddt);
2624 spa_config_exit(spa, SCL_ZIO, FTAG);
2625
2626 return (result);
2627 }
2628
2629 typedef struct ddt_prune_entry {
2630 ddt_t *dpe_ddt;
2631 ddt_key_t dpe_key;
2632 list_node_t dpe_node;
2633 ddt_univ_phys_t dpe_phys[];
2634 } ddt_prune_entry_t;
2635
2636 typedef struct ddt_prune_info {
2637 spa_t *dpi_spa;
2638 uint64_t dpi_txg_syncs;
2639 uint64_t dpi_pruned;
2640 list_t dpi_candidates;
2641 } ddt_prune_info_t;
2642
2643 /*
2644 * Add prune candidates for ddt_sync during spa_sync
2645 */
2646 static void
2647 prune_candidates_sync(void *arg, dmu_tx_t *tx)
2648 {
2649 (void) tx;
2650 ddt_prune_info_t *dpi = arg;
2651 ddt_prune_entry_t *dpe;
2652
2653 spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);
2654
2655 /* Process the prune candidates collected so far */
2656 while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
2657 blkptr_t blk;
2658 ddt_t *ddt = dpe->dpe_ddt;
2659
2660 ddt_enter(ddt);
2661
2662 /*
2663 * If it's on the live list, then it was loaded for update
2664 * this txg and is no longer stale; skip it.
2665 */
2666 if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
2667 ddt_exit(ddt);
2668 kmem_free(dpe, sizeof (*dpe));
2669 continue;
2670 }
2671
2672 ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
2673 dpe->dpe_phys, DDT_PHYS_FLAT, &blk);
2674
2675 ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE);
2676 if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
2677 ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2678 /*
2679 * Zero the physical, so we don't try to free DVAs
2680 * at flush nor try to reuse this entry.
2681 */
2682 ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);
2683
2684 dpi->dpi_pruned++;
2685 }
2686
2687 ddt_exit(ddt);
2688 kmem_free(dpe, sizeof (*dpe));
2689 }
2690
2691 spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
2692 dpi->dpi_txg_syncs++;
2693 }
2694
2695 /*
2696 * Prune candidates are collected in open context and processed
2697 * in sync context as part of ddt_sync_table().
2698 */
2699 static void
2700 ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
2701 const ddt_univ_phys_t *ddp)
2702 {
2703 ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2704
2705 size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
2706 ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP);
2707
2708 dpe->dpe_ddt = ddt;
2709 dpe->dpe_key = *ddk;
2710 memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
2711 list_insert_head(list, dpe);
2712 }
2713
2714 /*
2715 * Iterate over all the entries in the DDT unique class.
2716 * The walk will perform one of the following operations:
2717 * (a) build a histogram that can be used when pruning
2718 * (b) prune entries older than the cutoff
2719 *
2720 * Also called by zdb(8) to dump the age histogram
2721 */
2722 void
2723 ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
2724 {
2725 ddt_bookmark_t ddb = {
2726 .ddb_class = DDT_CLASS_UNIQUE,
2727 .ddb_type = 0,
2728 .ddb_checksum = 0,
2729 .ddb_cursor = 0
2730 };
2731 ddt_lightweight_entry_t ddlwe = {0};
2732 int error;
2733 int valid = 0;
2734 int candidates = 0;
2735 uint64_t now = gethrestime_sec();
2736 ddt_prune_info_t dpi;
2737 boolean_t pruning = (cutoff != 0);
2738
2739 if (pruning) {
2740 dpi.dpi_txg_syncs = 0;
2741 dpi.dpi_pruned = 0;
2742 dpi.dpi_spa = spa;
2743 list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
2744 offsetof(ddt_prune_entry_t, dpe_node));
2745 }
2746
2747 if (histogram != NULL)
2748 memset(histogram, 0, sizeof (ddt_age_histo_t));
2749
2750 while ((error =
2751 ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
2752 ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
2753 VERIFY(ddt);
2754
2755 if (spa_shutting_down(spa) || issig())
2756 break;
2757
2758 ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2759 ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);
2760
2761 uint64_t class_start =
2762 ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;
2763
2764 /*
2765 * If this entry is on the log, then the stored entry is stale
2766 * and we should skip it.
2767 */
2768 if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
2769 continue;
2770
2771 /* prune older entries */
2772 if (pruning && class_start < cutoff) {
2773 if (candidates++ >= zfs_ddt_prunes_per_txg) {
2774 /* sync prune candidates in batches */
2775 VERIFY0(dsl_sync_task(spa_name(spa),
2776 NULL, prune_candidates_sync,
2777 &dpi, 0, ZFS_SPACE_CHECK_NONE));
2778 candidates = 1;
2779 }
2780 ddt_prune_entry(&dpi.dpi_candidates, ddt,
2781 &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
2782 }
2783
2784 /* build a histogram */
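		/*
		 * Worked example (illustrative): an entry whose class_start
		 * is 240 hours in the past has age = 240, highbit64(240) = 8,
		 * so it lands in bin 7 along with everything 128-255 hours
		 * old; each bin covers a power-of-two range of hours.
		 */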
2785 if (histogram != NULL) {
2786 uint64_t age = MAX(1, (now - class_start) / 3600);
2787 int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
2788 histogram->dah_entries++;
2789 histogram->dah_age_histo[bin]++;
2790 }
2791
2792 valid++;
2793 }
2794
2795 if (pruning && valid > 0) {
2796 if (!list_is_empty(&dpi.dpi_candidates)) {
2797 /* sync out final batch of prune candidates */
2798 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
2799 prune_candidates_sync, &dpi, 0,
2800 ZFS_SPACE_CHECK_NONE));
2801 }
2802 list_destroy(&dpi.dpi_candidates);
2803
2804 zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs",
2805 (u_longlong_t)dpi.dpi_pruned,
2806 (int)((dpi.dpi_pruned * 100) / valid),
2807 (u_longlong_t)dpi.dpi_txg_syncs);
2808 }
2809 }
2810
2811 static uint64_t
2812 ddt_total_entries(spa_t *spa)
2813 {
2814 ddt_object_t ddo;
2815 ddt_get_dedup_object_stats(spa, &ddo);
2816
2817 return (ddo.ddo_count);
2818 }
2819
2820 int
2821 ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
2822 uint64_t amount)
2823 {
2824 uint64_t cutoff;
2825 uint64_t start_time = gethrtime();
2826
2827 if (spa->spa_active_ddt_prune)
2828 return (SET_ERROR(EALREADY));
2829 if (ddt_total_entries(spa) == 0)
2830 return (0);
2831
2832 spa->spa_active_ddt_prune = B_TRUE;
2833
2834 zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
2835 unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");
2836
2837 if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
2838 ddt_age_histo_t histogram;
2839 uint64_t oldest = 0;
2840
2841 /* Make a pass over DDT to build a histogram */
2842 ddt_prune_walk(spa, 0, &histogram);
2843
2844 int target = (histogram.dah_entries * amount) / 100;
2845
2846 /*
2847 * Figure out our cutoff date
2848 * (i.e., which bins to prune from)
2849 */
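		/*
		 * For example (illustrative counts): with target = 1000, 600
		 * entries in the oldest populated bin and 700 in the next,
		 * the oldest bin is consumed whole (target drops to 400);
		 * since 400 < 700, oldest is set to that second bin's lower
		 * edge, (1 << i) * 3600 seconds, and the cutoff follows.
		 */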
2850 for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
2851 if (histogram.dah_age_histo[i] != 0) {
2852 /* less than this bucket remaining */
2853 if (target < histogram.dah_age_histo[i]) {
2854 oldest = MAX(1, (1<<i) * 3600);
2855 target = 0;
2856 } else {
2857 target -= histogram.dah_age_histo[i];
2858 }
2859 }
2860 }
2861 cutoff = gethrestime_sec() - oldest;
2862
2863 if (ddt_dump_prune_histogram)
2864 ddt_dump_age_histogram(&histogram, cutoff);
2865 } else if (unit == ZPOOL_DDT_PRUNE_AGE) {
2866 cutoff = gethrestime_sec() - amount;
2867 } else {
2868 return (EINVAL);
2869 }
2870
2871 if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
2872 /* Traverse DDT to prune entries older than our cutoff */
2873 ddt_prune_walk(spa, cutoff, NULL);
2874 }
2875
2876 zfs_dbgmsg("%s: prune completed in %llu ms",
2877 spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
2878
2879 spa->spa_active_ddt_prune = B_FALSE;
2880 return (0);
2881 }
2882
2883 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
2884 "Enable prefetching dedup-ed blks");
2885
2886 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
2887 "Min time to spend on incremental dedup log flush each transaction");
2888
2889 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
2890 "Min number of log entries to flush each transaction");
2891
2892 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW,
2893 "Max number of log entries to flush each transaction");
2894
2895 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW,
2896 "Number of TXGs to try to rotate the log in");
2897
2898 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW,
2899 "Soft cap for the size of the current dedup log");
2900
2901 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW,
2902 "Whether to use the soft cap as a hard cap");
2903
2904 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
2905 "Number of txgs to average flow rates across");
2906