1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
26 * Copyright (c) 2022 by Pawel Jakub Dawidek
27 * Copyright (c) 2019, 2023, Klara Inc.
28 */
29
30 #include <sys/zfs_context.h>
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/zio.h>
34 #include <sys/ddt.h>
35 #include <sys/ddt_impl.h>
36 #include <sys/zap.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/arc.h>
39 #include <sys/dsl_pool.h>
40 #include <sys/zio_checksum.h>
41 #include <sys/dsl_scan.h>
42 #include <sys/abd.h>
43 #include <sys/zfeature.h>
44
45 /*
46 * # DDT: Deduplication tables
47 *
48 * The dedup subsystem provides block-level deduplication. When enabled, blocks
49 * to be written will have the dedup (D) bit set, which causes them to be
50 * tracked in a "dedup table", or DDT. If a block has been seen before (exists
51  * in the DDT), then instead of being written, it will be made to reference
52  * the existing on-disk data, and its refcount in the DDT will be bumped.
53 *
54 * ## Dedup tables and entries
55 *
56 * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
57  * (ddt_key_t) made up of a block's checksum and certain properties, and a "value"
58 * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
59 * time and refcount. Together these are enough to track references to a
60 * specific block, to build a valid block pointer to reference that block (for
61 * freeing, scrubbing, etc), and to fill a new block pointer with the missing
62 * pieces to make it seem like it was written.
63 *
64 * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
65 * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
66 * object data formats, each with their own implementations) and "classes"
67  * (ddt_class_t, an instance of a storage type object, for entries with a specific
68 * characteristic). An entry (key) will only ever exist on one of these objects
69  * at any given time, but may be moved from one to another if its type or
70 * class changes.
71 *
72 * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
73 * is to be written, before DVAs have been allocated, ddt_lookup() is called to
74  * see if the block has been seen before. If it's not found, the write proceeds
75 * as normal, and after it succeeds, a new entry is created. If it is found, we
76 * fill the BP with the DVAs from the entry, increment the refcount and cause
77 * the write IO to return immediately.
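 *
 * A rough sketch of that decision, using only functions from this file and
 * glossing over the real error handling in zio_ddt_write():
 *
 *     ddt_t *ddt = ddt_select(spa, bp);
 *     ddt_enter(ddt);
 *     ddt_entry_t *dde = ddt_lookup(ddt, bp, B_FALSE);
 *     if (dde != NULL && ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0) {
 *         // existing block: fill the BP from the entry, bump the refcount
 *     } else {
 *         // new block (or no entry available): continue with a normal write
 *     }
 *     ddt_exit(ddt);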
78 *
79 * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
80 * block for the same content/checksum. The slot is selected based on the
81 * zp_copies parameter the block is written with, that is, the number of DVAs
82 * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for
83 * now-removed "dedupditto" feature. These are no longer written, and will be
84 * freed if encountered on old pools.
85 *
86 * If the "fast_dedup" feature is enabled, new dedup tables will be created
87 * with the "flat phys" option. In this mode, there is only one ddt_phys_t
88  * slot. If a write is issued for an entry that exists, but has fewer DVAs than
89  * requested, then only enough new DVAs are allocated and written to make up the
90 * shortfall. The existing entry is then extended (ddt_phys_extend()) with the
91 * new DVAs.
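 *
 * As a hedged illustration of the shortfall calculation (variable names here
 * are made up; the real bookkeeping is in zio_ddt_write()): if the existing
 * flat entry holds one DVA and a new write requests zp_copies = 3, then
 *
 *     int have = ddt_phys_dva_count(dde->dde_phys, DDT_PHYS_FLAT, encrypted);
 *     int need = zp_copies - have;    // 3 - 1 = 2 new DVAs to allocate
 *
 * and only those two DVAs are written before ddt_phys_extend() merges them in.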
92 *
93 * ## Lifetime of an entry
94 *
95 * A DDT can be enormous, and typically is not held in memory all at once.
96 * Instead, the changes to an entry are tracked in memory, and written down to
97 * disk at the end of each txg.
98 *
99 * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
100 * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
101 * required for IO, ddt_lookup() is called. If an entry already exists on
102 * ddt_tree, it is returned. Otherwise, a new one is created, and the
103  * type/class objects for the DDT are searched for that key. If it's found, its
104 * value is copied into the live entry. If not, an empty entry is created.
105 *
106 * The live entry will be modified during the txg, usually by modifying the
107 * refcount, but sometimes by adding or updating DVAs. At the end of the txg
108  * (during spa_sync()), type and class are recalculated for the entry (see
109  * ddt_sync_entry()), and the entry is written to the appropriate storage
110  * object and (if necessary) removed from an old one. ddt_tree is cleared and
111 * the next txg can start.
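 *
 * In outline (an informal summary of the above, not literal code):
 *
 *     txg open:    ddt_tree is empty
 *     during txg:  ddt_lookup() loads or creates the live entry;
 *                  refcounts/DVAs are updated on it
 *     spa_sync():  ddt_sync_entry() recomputes type/class, stores the entry,
 *                  removes any stale copy; ddt_tree is emptied for the next txg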
112 *
113 * ## Dedup quota
114 *
115 * A maximum size for all DDTs on the pool can be set with the
116 * dedup_table_quota property. This is determined in ddt_over_quota() and
117 * enforced during ddt_lookup(). If the pool is at or over its quota limit,
118 * ddt_lookup() will only return entries for existing blocks, as updates are
119 * still possible. New entries will not be created; instead, ddt_lookup() will
120 * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
121 * the D bit on the block and reissue the IO as a regular write. The block will
122 * not be deduplicated.
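 *
 * The write path's side of this looks roughly like (a simplified sketch of
 * zio_ddt_write(), not the actual code):
 *
 *     ddt_entry_t *dde = ddt_lookup(ddt, bp, B_FALSE);
 *     if (dde == NULL) {
 *         BP_SET_DEDUP(bp, 0);    // over quota: stop tracking this block
 *         // ...reissue as an ordinary, non-dedup write...
 *     }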
123 *
124 * Note that this is based on the on-disk size of the dedup store. Reclaiming
125 * this space after deleting entries relies on the ZAP "shrinking" behaviour,
126 * without which, no space would be recovered and the DDT would continue to be
127 * considered "over quota". See zap_shrink_enabled.
128 *
129 * ## Dedup table pruning
130 *
131 * As a complement to the dedup quota feature, ddtprune allows removal of older
132 * non-duplicate entries to make room for newer duplicate entries. The amount
133 * to prune can be based on a target percentage of the unique entries or based
134  * on the age (i.e., prune unique entries older than N days).
135 *
136 * ## Dedup log
137 *
138  * Historically, all entries modified in a txg were written back to dedup
139 * storage objects at the end of every txg. This could cause significant
140 * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
141 * and so required reading the whole node, updating the entry, and writing it
142 * back. On busy pools, this could add serious IO and memory overheads.
143 *
144 * To address this, the dedup log was added. If the "fast_dedup" feature is
145 * enabled, at the end of each txg, modified entries will be copied to an
146 * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
147 * same block is requested again, the in-memory object will be checked first,
148  * and if it's there, the entry is inflated back onto the live tree without going
149 * to storage. The on-disk log is only read at pool import time, to reload the
150 * in-memory log.
151 *
152 * Each txg, some amount of the in-memory log will be flushed out to a DDT
153  * storage object (i.e. a ZAP) as normal. OpenZFS will try hard to flush enough
154  * to keep up with the rate of change on dedup entries, but not so much that it
155  * would impact overall throughput or use too much memory. See the
156 * zfs_dedup_log_* tunables in zfs(4) for more details.
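 *
 * As a rough worked example of the sizing relationship described for
 * zfs_dedup_log_flush_txgs: with the default of 100 txgs and a steady ingest
 * of ~1000 modified entries per txg, the in-memory log will tend to float
 * around ~100,000 entries.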
157 *
158 * ## Repair IO
159 *
160 * If a read on a dedup block fails, but there are other copies of the block in
161 * the other ddt_phys_t slots, reads will be issued for those instead
162 * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
163 * the caller, and a copy is stashed on the entry's dde_repair_abd.
164 *
165 * During the end-of-txg sync, any entries with a dde_repair_abd get a
166 * "rewrite" write issued for the original block pointer, with the data read
167 * from the alternate block. If the block is actually damaged, this will invoke
168 * the pool's "self-healing" mechanism, and repair the block.
169 *
170 * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
171 * use, so there is only ever one ddt_phys_t slot. The repair process will
172 * still happen in this case, though it is unlikely to succeed as there will
173 * usually be no other equivalent blocks to fall back on (though there might
174 * be, if this was an early version of a dedup'd block that has since been
175 * extended).
176 *
177 * Note that this repair mechanism is in addition to and separate from the
178 * regular OpenZFS scrub and self-healing mechanisms.
179 *
180 * ## Scanning (scrub/resilver)
181 *
182 * If dedup is active, the scrub machinery will walk the dedup table first, and
183 * scrub all blocks with refcnt > 1 first. After that it will move on to the
184 * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
185 * In this way, heavily deduplicated blocks are only scrubbed once. See the
186 * commentary on dsl_scan_ddt() for more details.
187 *
188 * Walking the DDT is done via ddt_walk(). The current position is stored in a
189 * ddt_bookmark_t, which represents a stable position in the storage object.
190 * This bookmark is stored by the scan machinery, and must reference the same
191 * position on the object even if the object changes, the pool is exported, or
192 * OpenZFS is upgraded.
193 *
194 * If the "fast_dedup" feature is enabled and the table has a log, the scan
195 * cannot begin until entries on the log are flushed, as the on-disk log has no
196 * concept of a "stable position". Instead, the log flushing process will enter
197  * a more aggressive mode, to flush out as much as is necessary as soon as
198 * possible, in order to begin the scan as soon as possible.
199 *
200 * ## Interaction with block cloning
201 *
202 * If block cloning and dedup are both enabled on a pool, BRT will look for the
203 * dedup bit on an incoming block pointer. If set, it will call into the DDT
204 * (ddt_addref()) to add a reference to the block, instead of adding a
205 * reference to the BRT. See brt_pending_apply().
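 *
 * Schematically (an illustrative sketch only; brt_pending_apply() has the
 * authoritative logic):
 *
 *     if (BP_GET_DEDUP(bp)) {
 *         // dedup'd block: the DDT takes the new reference
 *         (void) ddt_addref(spa, bp);
 *     } else {
 *         // otherwise the BRT accounts for the reference as usual
 *     }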
206 */
207
208 /*
209 * These are the only checksums valid for dedup. They must match the list
210 * from dedup_table in zfs_prop.c
211 */
212 #define DDT_CHECKSUM_VALID(c) \
213 (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
214 c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
215 c == ZIO_CHECKSUM_BLAKE3)
216
217 static kmem_cache_t *ddt_cache;
218
219 static kmem_cache_t *ddt_entry_flat_cache;
220 static kmem_cache_t *ddt_entry_trad_cache;
221
222 #define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
223 #define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
224
225 #define DDT_ENTRY_SIZE(ddt) \
226 _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
227
228 /*
229 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
230 */
231 int zfs_dedup_prefetch = 0;
232
233 /*
234 * If the dedup class cannot satisfy a DDT allocation, treat as over quota
235 * for this many TXGs.
236 */
237 uint_t dedup_class_wait_txgs = 5;
238
239 /*
240 * How many DDT prune entries to add to the DDT sync AVL tree.
241  * Note these additional entries have a memory footprint of a
242 * ddt_entry_t (216 bytes).
243 */
244 static uint32_t zfs_ddt_prunes_per_txg = 50000;
245
246 /*
247 * For testing, synthesize aged DDT entries
248 * (in global scope for ztest)
249 */
250 boolean_t ddt_prune_artificial_age = B_FALSE;
251 boolean_t ddt_dump_prune_histogram = B_FALSE;
252
253 /*
254 * Minimum time to flush per txg.
255 */
256 uint_t zfs_dedup_log_flush_min_time_ms = 1000;
257
258 /*
259 * Minimum entries to flush per txg.
260 */
261 uint_t zfs_dedup_log_flush_entries_min = 200;
262
263 /*
264 * Target number of TXGs until the whole dedup log has been flushed.
265 * The log size will float around this value times the ingest rate.
266 */
267 uint_t zfs_dedup_log_flush_txgs = 100;
268
269 /*
270 * Maximum entries to flush per txg. Used for testing the dedup log.
271 */
272 uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
273
274 /*
275 * Soft cap for the size of the current dedup log. If the log is larger
276 * than this size, we slightly increase the aggressiveness of the flushing to
277 * try to bring it back down to the soft cap.
278 */
279 uint_t zfs_dedup_log_cap = UINT_MAX;
280
281 /*
282 * If this is set to B_TRUE, the cap above acts more like a hard cap:
283 * flushing is significantly more aggressive, increasing the minimum amount we
284 * flush per txg, as well as the maximum.
285 */
286 boolean_t zfs_dedup_log_hard_cap = B_FALSE;
287
288 /*
289 * Number of txgs to average flow rates across.
290 */
291 uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
292
293 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
294 &ddt_zap_ops,
295 };
296
297 static const char *const ddt_class_name[DDT_CLASSES] = {
298 "ditto",
299 "duplicate",
300 "unique",
301 };
302
303 /*
304 * DDT feature flags automatically enabled for each on-disk version. Note that
305 * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
306 */
307 static const uint64_t ddt_version_flags[] = {
308 [DDT_VERSION_LEGACY] = 0,
309 [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
310 };
311
312 /* per-DDT kstats */
313 typedef struct {
314 /* total lookups and whether they returned new or existing entries */
315 kstat_named_t dds_lookup;
316 kstat_named_t dds_lookup_new;
317 kstat_named_t dds_lookup_existing;
318
319 /* entries found on live tree, and if we had to wait for load */
320 kstat_named_t dds_lookup_live_hit;
321 kstat_named_t dds_lookup_live_wait;
322 kstat_named_t dds_lookup_live_miss;
323
324 /* entries found on log trees */
325 kstat_named_t dds_lookup_log_hit;
326 kstat_named_t dds_lookup_log_active_hit;
327 kstat_named_t dds_lookup_log_flushing_hit;
328 kstat_named_t dds_lookup_log_miss;
329
330 /* entries found on store objects */
331 kstat_named_t dds_lookup_stored_hit;
332 kstat_named_t dds_lookup_stored_miss;
333
334 /* number of entries on log trees */
335 kstat_named_t dds_log_active_entries;
336 kstat_named_t dds_log_flushing_entries;
337
338 /* avg updated/flushed entries per txg */
339 kstat_named_t dds_log_ingest_rate;
340 kstat_named_t dds_log_flush_rate;
341 kstat_named_t dds_log_flush_time_rate;
342 } ddt_kstats_t;
343
344 static const ddt_kstats_t ddt_kstats_template = {
345 { "lookup", KSTAT_DATA_UINT64 },
346 { "lookup_new", KSTAT_DATA_UINT64 },
347 { "lookup_existing", KSTAT_DATA_UINT64 },
348 { "lookup_live_hit", KSTAT_DATA_UINT64 },
349 { "lookup_live_wait", KSTAT_DATA_UINT64 },
350 { "lookup_live_miss", KSTAT_DATA_UINT64 },
351 { "lookup_log_hit", KSTAT_DATA_UINT64 },
352 { "lookup_log_active_hit", KSTAT_DATA_UINT64 },
353 { "lookup_log_flushing_hit", KSTAT_DATA_UINT64 },
354 { "lookup_log_miss", KSTAT_DATA_UINT64 },
355 { "lookup_stored_hit", KSTAT_DATA_UINT64 },
356 { "lookup_stored_miss", KSTAT_DATA_UINT64 },
357 { "log_active_entries", KSTAT_DATA_UINT64 },
358 { "log_flushing_entries", KSTAT_DATA_UINT64 },
359 { "log_ingest_rate", KSTAT_DATA_UINT32 },
360 { "log_flush_rate", KSTAT_DATA_UINT32 },
361 { "log_flush_time_rate", KSTAT_DATA_UINT32 },
362 };
363
364 #ifdef _KERNEL
365 /*
366 * Hot-path lookup counters use wmsums to avoid cache line bouncing.
367 * DDT_KSTAT_BUMP: Increment a wmsum counter (lookup stats).
368 *
369 * Sync-only counters use direct kstat assignment (no atomics needed).
370 * DDT_KSTAT_SET: Set a value (log entry counts, rates).
371 * DDT_KSTAT_SUB: Subtract from a value (decrement log entry counts).
372 * DDT_KSTAT_ZERO: Zero a value (clear log entry counts).
373 */
374 #define _DDT_KSTAT_STAT(ddt, stat) \
375 &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
376 #define DDT_KSTAT_BUMP(ddt, stat) \
377 wmsum_add(&(ddt)->ddt_kstat_##stat, 1)
378 #define DDT_KSTAT_SUB(ddt, stat, val) \
379 do { *_DDT_KSTAT_STAT(ddt, stat) -= (val); } while (0)
380 #define DDT_KSTAT_SET(ddt, stat, val) \
381 do { *_DDT_KSTAT_STAT(ddt, stat) = (val); } while (0)
382 #define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0)
383 #else
384 #define DDT_KSTAT_BUMP(ddt, stat) do {} while (0)
385 #define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0)
386 #define DDT_KSTAT_SET(ddt, stat, val) do {} while (0)
387 #define DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
388 #endif /* _KERNEL */
389
390
391 static void
392 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
393 dmu_tx_t *tx)
394 {
395 spa_t *spa = ddt->ddt_spa;
396 objset_t *os = ddt->ddt_os;
397 uint64_t *objectp = &ddt->ddt_object[type][class];
398 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
399 ZCHECKSUM_FLAG_DEDUP;
400 char name[DDT_NAMELEN];
401
402 ASSERT3U(ddt->ddt_dir_object, >, 0);
403
404 ddt_object_name(ddt, type, class, name);
405
406 ASSERT0(*objectp);
407 VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
408 ASSERT3U(*objectp, !=, 0);
409
410 VERIFY0(dnode_hold(os, *objectp, ddt,
411 &ddt->ddt_object_dnode[type][class]));
412
413 ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
414
415 VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
416 objectp, tx));
417
418 VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
419 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
420 &ddt->ddt_histogram[type][class], tx));
421 }
422
423 static void
424 ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
425 dmu_tx_t *tx)
426 {
427 spa_t *spa = ddt->ddt_spa;
428 objset_t *os = ddt->ddt_os;
429 uint64_t *objectp = &ddt->ddt_object[type][class];
430 uint64_t count;
431 char name[DDT_NAMELEN];
432
433 ASSERT3U(ddt->ddt_dir_object, >, 0);
434
435 ddt_object_name(ddt, type, class, name);
436
437 ASSERT3U(*objectp, !=, 0);
438 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
439 VERIFY0(ddt_object_count(ddt, type, class, &count));
440 VERIFY0(count);
441 VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
442 VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
443 if (ddt->ddt_object_dnode[type][class] != NULL) {
444 dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
445 ddt->ddt_object_dnode[type][class] = NULL;
446 }
447 VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
448 memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
449
450 *objectp = 0;
451 }
452
453 static int
454 ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
455 {
456 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
457 dmu_object_info_t doi;
458 uint64_t count;
459 char name[DDT_NAMELEN];
460 int error;
461
462 if (ddt->ddt_dir_object == 0) {
463 /*
464 * If we're configured but the containing dir doesn't exist
465 * yet, then this object can't possibly exist either.
466 */
467 ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
468 return (SET_ERROR(ENOENT));
469 }
470
471 ddt_object_name(ddt, type, class, name);
472
473 error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
474 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
475 if (error != 0)
476 return (error);
477
478 error = dnode_hold(ddt->ddt_os, ddt->ddt_object[type][class], ddt,
479 &ddt->ddt_object_dnode[type][class]);
480 if (error != 0)
481 return (error);
482
483 error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
484 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
485 &ddt->ddt_histogram[type][class]);
486 if (error != 0)
487 goto error;
488
489 /*
490 * Seed the cached statistics.
491 */
492 error = ddt_object_info(ddt, type, class, &doi);
493 if (error)
494 goto error;
495
496 error = ddt_object_count(ddt, type, class, &count);
497 if (error)
498 goto error;
499
500 ddo->ddo_count = count;
501 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
502 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
503
504 return (0);
505
506 error:
507 dnode_rele(ddt->ddt_object_dnode[type][class], ddt);
508 ddt->ddt_object_dnode[type][class] = NULL;
509 return (error);
510 }
511
512 static void
513 ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
514 dmu_tx_t *tx)
515 {
516 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
517 dmu_object_info_t doi;
518 uint64_t count;
519 char name[DDT_NAMELEN];
520
521 ddt_object_name(ddt, type, class, name);
522
523 VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
524 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
525 &ddt->ddt_histogram[type][class], tx));
526
527 /*
528 * Cache DDT statistics; this is the only time they'll change.
529 */
530 VERIFY0(ddt_object_info(ddt, type, class, &doi));
531 VERIFY0(ddt_object_count(ddt, type, class, &count));
532
533 ddo->ddo_count = count;
534 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
535 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
536 }
537
538 static boolean_t
539 ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
540 {
541 return (!!ddt->ddt_object[type][class]);
542 }
543
544 static int
545 ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
546 ddt_entry_t *dde)
547 {
548 dnode_t *dn = ddt->ddt_object_dnode[type][class];
549 if (dn == NULL)
550 return (SET_ERROR(ENOENT));
551
552 return (ddt_ops[type]->ddt_op_lookup(dn, &dde->dde_key,
553 dde->dde_phys, DDT_PHYS_SIZE(ddt)));
554 }
555
556 static int
557 ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
558 const ddt_key_t *ddk)
559 {
560 dnode_t *dn = ddt->ddt_object_dnode[type][class];
561 if (dn == NULL)
562 return (SET_ERROR(ENOENT));
563
564 return (ddt_ops[type]->ddt_op_contains(dn, ddk));
565 }
566
567 static void
568 ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
569 const ddt_key_t *ddk)
570 {
571 dnode_t *dn = ddt->ddt_object_dnode[type][class];
572 if (dn == NULL)
573 return;
574
575 ddt_ops[type]->ddt_op_prefetch(dn, ddk);
576 }
577
578 static void
579 ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
580 {
581 dnode_t *dn = ddt->ddt_object_dnode[type][class];
582 if (dn == NULL)
583 return;
584
585 ddt_ops[type]->ddt_op_prefetch_all(dn);
586 }
587
588 static int
589 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
590 const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
591 {
592 dnode_t *dn = ddt->ddt_object_dnode[type][class];
593 ASSERT(dn != NULL);
594
595 return (ddt_ops[type]->ddt_op_update(dn, &ddlwe->ddlwe_key,
596 &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
597 }
598
599 static int
600 ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
601 const ddt_key_t *ddk, dmu_tx_t *tx)
602 {
603 dnode_t *dn = ddt->ddt_object_dnode[type][class];
604 ASSERT(dn != NULL);
605
606 return (ddt_ops[type]->ddt_op_remove(dn, ddk, tx));
607 }
608
609 int
610 ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
611 uint64_t *walk, ddt_lightweight_entry_t *ddlwe)
612 {
613 dnode_t *dn = ddt->ddt_object_dnode[type][class];
614 ASSERT(dn != NULL);
615
616 int error = ddt_ops[type]->ddt_op_walk(dn, walk, &ddlwe->ddlwe_key,
617 &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
618 if (error == 0) {
619 ddlwe->ddlwe_type = type;
620 ddlwe->ddlwe_class = class;
621 return (0);
622 }
623 return (error);
624 }
625
626 int
627 ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
628 uint64_t *count)
629 {
630 dnode_t *dn = ddt->ddt_object_dnode[type][class];
631 ASSERT(dn != NULL);
632
633 return (ddt_ops[type]->ddt_op_count(dn, count));
634 }
635
636 int
637 ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
638 dmu_object_info_t *doi)
639 {
640 if (!ddt_object_exists(ddt, type, class))
641 return (SET_ERROR(ENOENT));
642
643 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
644 doi));
645 }
646
647 void
648 ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
649 char *name)
650 {
651 (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
652 zio_checksum_table[ddt->ddt_checksum].ci_name,
653 ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
654 }
655
656 void
657 ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
658 blkptr_t *bp, uint64_t txg)
659 {
660 ASSERT3U(txg, !=, 0);
661 ASSERT3U(v, <, DDT_PHYS_NONE);
662 uint64_t phys_birth;
663 const dva_t *dvap;
664
665 if (v == DDT_PHYS_FLAT) {
666 phys_birth = ddp->ddp_flat.ddp_phys_birth;
667 dvap = ddp->ddp_flat.ddp_dva;
668 } else {
669 phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
670 dvap = ddp->ddp_trad[v].ddp_dva;
671 }
672
673 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
674 bp->blk_dva[d] = dvap[d];
675 BP_SET_BIRTH(bp, txg, phys_birth);
676 }
677
678 /*
679 * The bp created via this function may be used for repairs and scrub, but it
680 * will be missing the salt / IV required to do a full decrypting read.
681 */
682 void
683 ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
684 const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
685 {
686 BP_ZERO(bp);
687
688 if (ddp != NULL)
689 ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
690
691 bp->blk_cksum = ddk->ddk_cksum;
692
693 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
694 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
695 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
696 BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
697 BP_SET_FILL(bp, 1);
698 BP_SET_CHECKSUM(bp, checksum);
699 BP_SET_TYPE(bp, DMU_OT_DEDUP);
700 BP_SET_LEVEL(bp, 0);
701 BP_SET_DEDUP(bp, 1);
702 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
703 }
704
705 void
706 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
707 {
708 ddk->ddk_cksum = bp->blk_cksum;
709 ddk->ddk_prop = 0;
710
711 ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
712
713 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
714 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
715 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
716 DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
717 }
718
719 void
720 ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
721 {
722 ASSERT3U(v, <, DDT_PHYS_NONE);
723 int bp_ndvas = BP_GET_NDVAS(bp);
724 int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
725 SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
726 dva_t *dvas = (v == DDT_PHYS_FLAT) ?
727 ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
728
729 int s = 0, d = 0;
730 while (s < bp_ndvas && d < ddp_max_dvas) {
731 if (DVA_IS_VALID(&dvas[d])) {
732 d++;
733 continue;
734 }
735 dvas[d] = bp->blk_dva[s];
736 s++; d++;
737 }
738
739 /*
740 * If the caller offered us more DVAs than we can fit, something has
741 * gone wrong in their accounting. zio_ddt_write() should never ask for
742 * more than we need.
743 */
744 ASSERT3U(s, ==, bp_ndvas);
745
746 if (BP_IS_ENCRYPTED(bp))
747 dvas[2] = bp->blk_dva[2];
748
749 if (ddt_phys_birth(ddp, v) == 0) {
750 if (v == DDT_PHYS_FLAT) {
751 ddp->ddp_flat.ddp_phys_birth =
752 BP_GET_PHYSICAL_BIRTH(bp);
753 } else {
754 ddp->ddp_trad[v].ddp_phys_birth =
755 BP_GET_PHYSICAL_BIRTH(bp);
756 }
757 }
758 }
759
760 void
761 ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig,
762 ddt_phys_variant_t v)
763 {
764 ASSERT3U(v, <, DDT_PHYS_NONE);
765 dva_t *cur_dvas = (v == DDT_PHYS_FLAT) ?
766 cur->ddp_flat.ddp_dva : cur->ddp_trad[v].ddp_dva;
767 dva_t *orig_dvas = (v == DDT_PHYS_FLAT) ?
768 orig->ddp_flat.ddp_dva : orig->ddp_trad[v].ddp_dva;
769
770 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
771 cur_dvas[d] = orig_dvas[d];
772
773 if (ddt_phys_birth(orig, v) == 0) {
774 if (v == DDT_PHYS_FLAT)
775 cur->ddp_flat.ddp_phys_birth = 0;
776 else
777 cur->ddp_trad[v].ddp_phys_birth = 0;
778 }
779 }
780
781 void
782 ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
783 ddt_phys_variant_t v)
784 {
785 ASSERT3U(v, <, DDT_PHYS_NONE);
786
787 if (v == DDT_PHYS_FLAT)
788 dst->ddp_flat = src->ddp_flat;
789 else
790 dst->ddp_trad[v] = src->ddp_trad[v];
791 }
792
793 void
794 ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
795 {
796 ASSERT3U(v, <, DDT_PHYS_NONE);
797
798 if (v == DDT_PHYS_FLAT)
799 memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
800 else
801 memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
802 }
803
804 static uint64_t
805 ddt_class_start(void)
806 {
807 uint64_t start = gethrestime_sec();
808
809 if (unlikely(ddt_prune_artificial_age)) {
810 /*
811  * debug aid -- simulate a wider distribution
812 * so we don't have to wait for an aged DDT
813 * to test prune.
814 */
815 int range = 1 << 21;
816 int percent = random_in_range(100);
817 if (percent < 50) {
818 range = range >> 4;
819 } else if (percent > 75) {
820 range /= 2;
821 }
822 start -= random_in_range(range);
823 }
824
825 return (start);
826 }
827
828 void
829 ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
830 {
831 ASSERT3U(v, <, DDT_PHYS_NONE);
832
833 if (v == DDT_PHYS_FLAT)
834 ddp->ddp_flat.ddp_refcnt++;
835 else
836 ddp->ddp_trad[v].ddp_refcnt++;
837 }
838
839 uint64_t
840 ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
841 {
842 ASSERT3U(v, <, DDT_PHYS_NONE);
843
844 uint64_t *refcntp;
845
846 if (v == DDT_PHYS_FLAT)
847 refcntp = &ddp->ddp_flat.ddp_refcnt;
848 else
849 refcntp = &ddp->ddp_trad[v].ddp_refcnt;
850
851 ASSERT3U(*refcntp, >, 0);
852 (*refcntp)--;
853 return (*refcntp);
854 }
855
856 static void
857 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
858 ddt_phys_variant_t v, uint64_t txg)
859 {
860 blkptr_t blk;
861
862 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
863
864 /*
865 * We clear the dedup bit so that zio_free() will actually free the
866 * space, rather than just decrementing the refcount in the DDT.
867 */
868 BP_SET_DEDUP(&blk, 0);
869
870 ddt_phys_clear(ddp, v);
871 zio_free(ddt->ddt_spa, txg, &blk);
872 }
873
874 uint64_t
875 ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
876 {
877 ASSERT3U(v, <, DDT_PHYS_NONE);
878
879 if (v == DDT_PHYS_FLAT)
880 return (ddp->ddp_flat.ddp_phys_birth);
881 else
882 return (ddp->ddp_trad[v].ddp_phys_birth);
883 }
884
885 int
886 ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
887 {
888 ASSERT3U(v, <, DDT_PHYS_NONE);
889
890 const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
891 ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
892
893 return (DVA_GET_GANG(&dvas[0]));
894 }
895
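/*
 * Count the valid DVAs held in this phys. For encrypted blocks the third DVA
 * slot carries the salt/IV rather than a real copy of the data, so it is
 * excluded from the count.
 */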
896 int
897 ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
898 boolean_t encrypted)
899 {
900 ASSERT3U(v, <, DDT_PHYS_NONE);
901
902 const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
903 ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
904
905 return (DVA_IS_VALID(&dvas[0]) +
906 DVA_IS_VALID(&dvas[1]) +
907 DVA_IS_VALID(&dvas[2]) * !encrypted);
908 }
909
910 ddt_phys_variant_t
911 ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
912 {
913 if (dde == NULL)
914 return (DDT_PHYS_NONE);
915
916 const ddt_univ_phys_t *ddp = dde->dde_phys;
917
918 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
919 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
920 BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
921 return (DDT_PHYS_FLAT);
922 }
923 } else /* traditional phys */ {
924 for (int p = 0; p < DDT_PHYS_MAX; p++) {
925 if (DVA_EQUAL(BP_IDENTITY(bp),
926 &ddp->ddp_trad[p].ddp_dva[0]) &&
927 BP_GET_PHYSICAL_BIRTH(bp) ==
928 ddp->ddp_trad[p].ddp_phys_birth) {
929 return (p);
930 }
931 }
932 }
933 return (DDT_PHYS_NONE);
934 }
935
936 uint64_t
937 ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
938 {
939 ASSERT3U(v, <, DDT_PHYS_NONE);
940
941 if (v == DDT_PHYS_FLAT)
942 return (ddp->ddp_flat.ddp_refcnt);
943 else
944 return (ddp->ddp_trad[v].ddp_refcnt);
945 }
946
947 uint64_t
948 ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
949 {
950 uint64_t refcnt = 0;
951
952 if (ddt->ddt_flags & DDT_FLAG_FLAT)
953 refcnt = ddp->ddp_flat.ddp_refcnt;
954 else
955 for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
956 refcnt += ddp->ddp_trad[v].ddp_refcnt;
957
958 return (refcnt);
959 }
960
961 ddt_t *
962 ddt_select(spa_t *spa, const blkptr_t *bp)
963 {
964 ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
965 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
966 }
967
968 void
969 ddt_enter(ddt_t *ddt)
970 {
971 mutex_enter(&ddt->ddt_lock);
972 }
973
974 void
975 ddt_exit(ddt_t *ddt)
976 {
977 mutex_exit(&ddt->ddt_lock);
978 }
979
980 void
981 ddt_init(void)
982 {
983 ddt_cache = kmem_cache_create("ddt_cache",
984 sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
985 ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
986 DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
987 ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
988 DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
989
990 ddt_log_init();
991 }
992
993 void
994 ddt_fini(void)
995 {
996 ddt_log_fini();
997
998 kmem_cache_destroy(ddt_entry_trad_cache);
999 kmem_cache_destroy(ddt_entry_flat_cache);
1000 kmem_cache_destroy(ddt_cache);
1001 }
1002
1003 static ddt_entry_t *
1004 ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
1005 {
1006 ddt_entry_t *dde;
1007
1008 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
1009 dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
1010 memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
1011 } else {
1012 dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
1013 memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
1014 }
1015
1016 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
1017
1018 dde->dde_key = *ddk;
1019
1020 return (dde);
1021 }
1022
1023 void
1024 ddt_alloc_entry_io(ddt_entry_t *dde)
1025 {
1026 if (dde->dde_io != NULL)
1027 return;
1028
1029 dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
1030 mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
1031 }
1032
1033 static void
1034 ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
1035 {
1036 if (dde->dde_io != NULL) {
1037 for (int p = 0; p < DDT_NPHYS(ddt); p++)
1038 ASSERT0P(dde->dde_io->dde_lead_zio[p]);
1039
1040 if (dde->dde_io->dde_repair_abd != NULL)
1041 abd_free(dde->dde_io->dde_repair_abd);
1042
1043 mutex_destroy(&dde->dde_io->dde_io_lock);
1044 kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
1045 }
1046
1047 cv_destroy(&dde->dde_cv);
1048 kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
1049 ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
1050 }
1051
1052 void
1053 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
1054 {
1055 ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1056
1057 avl_remove(&ddt->ddt_tree, dde);
1058 ddt_free(ddt, dde);
1059 }
1060
1061 /*
1062 * We're considered over quota when we hit 85% full, or for larger drives,
1063 * when there is less than 8GB free.
1064 */
1065 static boolean_t
1066 ddt_special_over_quota(metaslab_class_t *mc)
1067 {
1068 uint64_t allocated = metaslab_class_get_alloc(mc);
1069 uint64_t capacity = metaslab_class_get_space(mc);
1070 uint64_t limit = MAX(capacity * 85 / 100,
1071 (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
1072 return (allocated >= limit);
1073 }
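
/*
 * Worked example of the limit above (illustrative numbers only): for a 40 GiB
 * class, 85% full is 34 GiB while "8 GiB free" leaves 32 GiB, so the 85%
 * bound applies; for a 1 TiB class, 85% is ~870 GiB while "8 GiB free" leaves
 * 1016 GiB, so the larger "less than 8 GiB free" bound applies instead.
 */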
1074
1075 /*
1076 * Check if the DDT is over its quota. This can be due to a few conditions:
1077 * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
1078 * exceeds this limit
1079 *
1080 * 2. 'dedup_table_quota' property is set to automatic and
1081 * a. the dedup or special allocation class could not satisfy a DDT
1082 * allocation in a recent transaction
1083 * b. the dedup or special allocation class has exceeded its 85% limit
1084 */
1085 static boolean_t
1086 ddt_over_quota(spa_t *spa)
1087 {
1088 if (spa->spa_dedup_table_quota == 0)
1089 return (B_FALSE);
1090
1091 if (spa->spa_dedup_table_quota != UINT64_MAX)
1092 return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
1093
1094 /*
1095  * Over quota if we have to allocate outside of the dedup/special class.
1096 */
1097 if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
1098 dedup_class_wait_txgs) {
1099 /* Waiting for some deferred frees to be processed */
1100 return (B_TRUE);
1101 }
1102
1103 /*
1104 * For automatic quota, table size is limited by dedup or special class
1105 */
1106 if (spa_has_dedup(spa))
1107 return (ddt_special_over_quota(spa_dedup_class(spa)));
1108 else if (spa_special_has_ddt(spa))
1109 return (ddt_special_over_quota(spa_special_class(spa)));
1110
1111 return (B_FALSE);
1112 }
1113
1114 void
1115 ddt_prefetch_all(spa_t *spa)
1116 {
1117 /*
1118 * Load all DDT entries for each type/class combination. This is
1119  * intended to perform a prefetch on all such blocks. For the same
1120 * reason that ddt_prefetch isn't locked, this is also not locked.
1121 */
1122 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1123 ddt_t *ddt = spa->spa_ddt[c];
1124 if (!ddt)
1125 continue;
1126
1127 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1128 for (ddt_class_t class = 0; class < DDT_CLASSES;
1129 class++) {
1130 ddt_object_prefetch_all(ddt, type, class);
1131 }
1132 }
1133 }
1134 }
1135
1136 static int ddt_configure(ddt_t *ddt, boolean_t new);
1137
1138 /*
1139 * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
1140 * to the ones in the entry. If they're different, then the passed-in BP is
1141  * from a previous generation of this entry (i.e. it was previously pruned) and we
1142 * have to act like the entry doesn't exist at all.
1143 *
1144 * This should only happen during a lookup to free the block (zio_ddt_free()).
1145 *
1146 * XXX this is similar in spirit to ddt_phys_select(), maybe can combine
1147 * -- robn, 2024-02-09
1148 */
1149 static boolean_t
1150 ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
1151 {
1152 /* If the BP has no DVAs, then this entry is good */
1153 uint_t ndvas = BP_GET_NDVAS(bp);
1154 if (ndvas == 0)
1155 return (B_TRUE);
1156
1157 /*
1158 * Only checking the phys for the copies. For flat, there's only one;
1159 * for trad it'll be the one that has the matching set of DVAs.
1160 */
1161 const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
1162 dde->dde_phys->ddp_flat.ddp_dva :
1163 dde->dde_phys->ddp_trad[ndvas].ddp_dva;
1164
1165 /*
1166 * Compare entry DVAs with the BP. They should all be there, but
1167  * there's not really anything we can do if it's only partial anyway;
1168 * that's an error somewhere else, maybe long ago.
1169 */
1170 uint_t d;
1171 for (d = 0; d < ndvas; d++)
1172 if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d]))
1173 return (B_FALSE);
1174 ASSERT3U(d, ==, ndvas);
1175
1176 return (B_TRUE);
1177 }
1178
1179 ddt_entry_t *
1180 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
1181 {
1182 spa_t *spa = ddt->ddt_spa;
1183 ddt_key_t search;
1184 ddt_entry_t *dde;
1185 ddt_type_t type;
1186 ddt_class_t class;
1187 avl_index_t where;
1188 int error;
1189
1190 ASSERT(MUTEX_HELD(&ddt->ddt_lock));
1191
1192 if (unlikely(ddt->ddt_version == DDT_VERSION_UNCONFIGURED)) {
1193 /*
1194 * This is the first use of this DDT since the pool was
1195 * created; finish getting it ready for use.
1196 */
1197 VERIFY0(ddt_configure(ddt, B_TRUE));
1198 ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
1199 }
1200
1201 DDT_KSTAT_BUMP(ddt, dds_lookup);
1202
1203 ddt_key_fill(&search, bp);
1204
1205 /* Find an existing live entry */
1206 dde = avl_find(&ddt->ddt_tree, &search, &where);
1207 if (dde != NULL) {
1208 /* If we went over quota, act like we didn't find it */
1209 if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
1210 return (NULL);
1211
1212 /* If it's already loaded, we can just return it. */
1213 DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
1214 if (dde->dde_flags & DDE_FLAG_LOADED) {
1215 if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1216 return (dde);
1217 return (NULL);
1218 }
1219
1220 /* Someone else is loading it, wait for it. */
1221 dde->dde_waiters++;
1222 DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait);
1223 while (!(dde->dde_flags & DDE_FLAG_LOADED))
1224 cv_wait(&dde->dde_cv, &ddt->ddt_lock);
1225 dde->dde_waiters--;
1226
1227 /* Loaded but over quota, forget we were ever here */
1228 if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
1229 if (dde->dde_waiters == 0) {
1230 avl_remove(&ddt->ddt_tree, dde);
1231 ddt_free(ddt, dde);
1232 }
1233 return (NULL);
1234 }
1235
1236 DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1237
1238 /* Make sure the loaded entry matches the BP */
1239 if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde))
1240 return (dde);
1241 return (NULL);
1242 } else
1243 DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
1244
1245 /* Time to make a new entry. */
1246 dde = ddt_alloc(ddt, &search);
1247 avl_insert(&ddt->ddt_tree, dde, where);
1248
1249 /*
1250 * The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible
1251 * threads will wait even while we drop the lock.
1252 */
1253 ddt_exit(ddt);
1254
1255 /*
1256 * If there is a log, we should try to "load" from there first.
1257 */
1258 if (ddt->ddt_flags & DDT_FLAG_LOG) {
1259 ddt_lightweight_entry_t ddlwe;
1260 boolean_t from_flushing;
1261
1262 /* Read-only search, no locks needed (logs stable during I/O) */
1263 if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) {
1264 dde->dde_type = ddlwe.ddlwe_type;
1265 dde->dde_class = ddlwe.ddlwe_class;
1266 memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
1267 DDT_PHYS_SIZE(ddt));
1268
1269 /*
1270 * Check validity. If invalid and no waiters, clean up
1271 * immediately. Otherwise continue setup for waiters.
1272 */
1273 boolean_t valid = !verify ||
1274 ddt_entry_lookup_is_valid(ddt, bp, dde);
1275 ddt_enter(ddt);
1276 if (!valid && dde->dde_waiters == 0) {
1277 avl_remove(&ddt->ddt_tree, dde);
1278 ddt_free(ddt, dde);
1279 return (NULL);
1280 }
1281
1282 dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
1283 if (from_flushing) {
1284 dde->dde_flags |= DDE_FLAG_FROM_FLUSHING;
1285 DDT_KSTAT_BUMP(ddt,
1286 dds_lookup_log_flushing_hit);
1287 } else {
1288 DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
1289 }
1290
1291 DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
1292 DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1293
1294 cv_broadcast(&dde->dde_cv);
1295
1296 return (valid ? dde : NULL);
1297 }
1298
1299 DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
1300 }
1301
1302 /* Search all store objects for the entry. */
1303 error = ENOENT;
1304 for (type = 0; type < DDT_TYPES; type++) {
1305 for (class = 0; class < DDT_CLASSES; class++) {
1306 error = ddt_object_lookup(ddt, type, class, dde);
1307 if (error != ENOENT) {
1308 ASSERT0(error);
1309 break;
1310 }
1311 }
1312 if (error != ENOENT)
1313 break;
1314 }
1315
1316 ddt_enter(ddt);
1317
1318 ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));
1319
1320 dde->dde_type = type; /* will be DDT_TYPES if no entry found */
1321 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
1322
1323 boolean_t valid = B_TRUE;
1324
1325 if (dde->dde_type == DDT_TYPES &&
1326 dde->dde_class == DDT_CLASSES &&
1327 ddt_over_quota(spa)) {
1328 /* Over quota. If no one is waiting, clean up right now. */
1329 if (dde->dde_waiters == 0) {
1330 avl_remove(&ddt->ddt_tree, dde);
1331 ddt_free(ddt, dde);
1332 return (NULL);
1333 }
1334
1335 /* Flag cleanup required */
1336 dde->dde_flags |= DDE_FLAG_OVERQUOTA;
1337 } else if (error == 0) {
1338 /*
1339 * If what we loaded is no good for this BP and there's no one
1340  * waiting for it, we can just remove it and get out. If it's no
1341  * good but there are waiters, we have to leave it, because we
1342  * don't know what they want. If it's not needed we'll end up
1343 * taking an entry log/sync, but it can only happen if more
1344 * than one previous version of this block is being deleted at
1345 * the same time. This is extremely unlikely to happen and not
1346 * worth the effort to deal with without taking an entry
1347 * update.
1348 */
1349 valid = !verify || ddt_entry_lookup_is_valid(ddt, bp, dde);
1350 if (!valid && dde->dde_waiters == 0) {
1351 avl_remove(&ddt->ddt_tree, dde);
1352 ddt_free(ddt, dde);
1353 return (NULL);
1354 }
1355
1356 DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
1357 DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
1358
1359 /*
1360 * The histograms only track inactive (stored or logged) blocks.
1361 * We've just put an entry onto the live list, so we need to
1362 * remove its counts. When its synced back, it'll be re-added
1363 * to the right one.
1364 *
1365 * We only do this when we successfully found it in the store.
1366  * error == ENOENT means this is a new entry, and so it's already
1367 * not counted.
1368 */
1369 ddt_histogram_t *ddh =
1370 &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
1371
1372 ddt_lightweight_entry_t ddlwe;
1373 DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
1374 ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
1375 } else {
1376 DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss);
1377 DDT_KSTAT_BUMP(ddt, dds_lookup_new);
1378 }
1379
1380 /* Entry loaded, everyone can proceed now */
1381 dde->dde_flags |= DDE_FLAG_LOADED;
1382 cv_broadcast(&dde->dde_cv);
1383
1384 if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
1385 return (NULL);
1386
1387 return (dde);
1388 }
1389
1390 void
1391 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
1392 {
1393 ddt_t *ddt;
1394 ddt_key_t ddk;
1395
1396 if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
1397 return;
1398
1399 /*
1400 * We only remove the DDT once all tables are empty and only
1401 * prefetch dedup blocks when there are entries in the DDT.
1402 * Thus no locking is required as the DDT can't disappear on us.
1403 */
1404 ddt = ddt_select(spa, bp);
1405 ddt_key_fill(&ddk, bp);
1406
1407 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1408 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1409 ddt_object_prefetch(ddt, type, class, &ddk);
1410 }
1411 }
1412 }
1413
1414 /*
1415 * ddt_key_t comparison. Any struct wanting to make use of this function must
1416 * have the key as the first element. Casts it to N uint64_ts, and checks until
1417 * we find there's a difference. This is intended to match how ddt_zap.c drives
1418 * the ZAPs (first uint64_t as the key prehash), which will minimise the number
1419 * of ZAP blocks touched when flushing logged entries from an AVL walk. This is
1420 * not an invariant for this function though, should you wish to change it.
1421 */
1422 int
1423 ddt_key_compare(const void *x1, const void *x2)
1424 {
1425 const uint64_t *k1 = (const uint64_t *)x1;
1426 const uint64_t *k2 = (const uint64_t *)x2;
1427
1428 int cmp;
1429 for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++)
1430 if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0))
1431 return (cmp);
1432
1433 return (0);
1434 }
1435
1436 /* Create the containing dir for this DDT and bump the feature count */
1437 static void
1438 ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx)
1439 {
1440 ASSERT0(ddt->ddt_dir_object);
1441 ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1442
1443 char name[DDT_NAMELEN];
1444 snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1445 zio_checksum_table[ddt->ddt_checksum].ci_name);
1446
1447 ddt->ddt_dir_object = zap_create_link(ddt->ddt_os,
1448 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx);
1449
1450 VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION,
1451 sizeof (uint64_t), 1, &ddt->ddt_version, tx));
1452 VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS,
1453 sizeof (uint64_t), 1, &ddt->ddt_flags, tx));
1454
1455 spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1456 }
1457
1458 /* Destroy the containing dir and deactivate the feature */
1459 static void
1460 ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
1461 {
1462 ASSERT3U(ddt->ddt_dir_object, !=, 0);
1463 ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT);
1464 ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
1465
1466 char name[DDT_NAMELEN];
1467 snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1468 zio_checksum_table[ddt->ddt_checksum].ci_name);
1469
1470 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1471 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1472 ASSERT(!ddt_object_exists(ddt, type, class));
1473 }
1474 }
1475
1476 ddt_log_destroy(ddt, tx);
1477
1478 uint64_t count;
1479 ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
1480 ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
1481 DDT_DIR_VERSION));
1482 ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS));
1483 ASSERT3U(count, ==, 2);
1484
1485 VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
1486 VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx));
1487
1488 ddt->ddt_dir_object = 0;
1489
1490 spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
1491 }
1492
1493 /*
1494  * Determine the version, flags and on-disk layout from what's already stored.
1495  * If there's nothing stored, then if new is false, returns ENOENT, and if true,
1496  * selects based on the pool config.
1497 */
1498 static int
1499 ddt_configure(ddt_t *ddt, boolean_t new)
1500 {
1501 spa_t *spa = ddt->ddt_spa;
1502 char name[DDT_NAMELEN];
1503 int error;
1504
1505 ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE);
1506
1507 boolean_t fdt_enabled =
1508 spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP);
1509 boolean_t fdt_active =
1510 spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP);
1511
1512 /*
1513  * First, look for the global DDT stats object. If it's not there, then
1514  * no DDT has ever been written to this pool, and we know we're
1515 * starting from scratch.
1516 */
1517 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1518 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1519 &spa->spa_ddt_stat_object);
1520 if (error != 0) {
1521 if (error != ENOENT)
1522 return (error);
1523 goto not_found;
1524 }
1525
1526 if (fdt_active) {
1527 /*
1528 * Now look for a DDT directory. If it exists, then it has
1529 * everything we need.
1530 */
1531 snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
1532 zio_checksum_table[ddt->ddt_checksum].ci_name);
1533
1534 error = zap_lookup(spa->spa_meta_objset,
1535 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
1536 &ddt->ddt_dir_object);
1537 if (error == 0) {
1538 ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os);
1539
1540 error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1541 DDT_DIR_VERSION, sizeof (uint64_t), 1,
1542 &ddt->ddt_version);
1543 if (error != 0)
1544 return (error);
1545
1546 error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
1547 DDT_DIR_FLAGS, sizeof (uint64_t), 1,
1548 &ddt->ddt_flags);
1549 if (error != 0)
1550 return (error);
1551
1552 if (ddt->ddt_version != DDT_VERSION_FDT) {
1553 zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1554 "unknown version %llu", spa_name(spa),
1555 name, (u_longlong_t)ddt->ddt_version);
1556 return (SET_ERROR(EINVAL));
1557 }
1558
1559 if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) {
1560 zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
1561 "version=%llu unknown flags %llx",
1562 spa_name(spa), name,
1563 (u_longlong_t)ddt->ddt_version,
1564 (u_longlong_t)ddt->ddt_flags);
1565 return (SET_ERROR(EINVAL));
1566 }
1567
1568 return (0);
1569 }
1570 if (error != ENOENT)
1571 return (error);
1572 }
1573
1574 /* Any object in the root indicates a traditional setup. */
1575 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1576 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1577 ddt_object_name(ddt, type, class, name);
1578 uint64_t obj;
1579 error = zap_lookup(spa->spa_meta_objset,
1580 DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t),
1581 1, &obj);
1582 if (error == ENOENT)
1583 continue;
1584 if (error != 0)
1585 return (error);
1586
1587 ddt->ddt_version = DDT_VERSION_LEGACY;
1588 ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1589 ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1590
1591 return (0);
1592 }
1593 }
1594
1595 not_found:
1596 if (!new)
1597 return (SET_ERROR(ENOENT));
1598
1599 /* Nothing on disk, so set up for the best version we can */
1600 if (fdt_enabled) {
1601 ddt->ddt_version = DDT_VERSION_FDT;
1602 ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1603 ddt->ddt_dir_object = 0; /* create on first use */
1604 } else {
1605 ddt->ddt_version = DDT_VERSION_LEGACY;
1606 ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
1607 ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
1608 }
1609
1610 return (0);
1611 }
1612
1613 static int
1614 ddt_kstat_update(kstat_t *ksp, int rw)
1615 {
1616 ddt_t *ddt = ksp->ks_private;
1617 ddt_kstats_t *dds = ksp->ks_data;
1618
1619 if (rw == KSTAT_WRITE)
1620 return (SET_ERROR(EACCES));
1621
1622 /* Aggregate wmsum counters for lookup stats */
1623 dds->dds_lookup.value.ui64 =
1624 wmsum_value(&ddt->ddt_kstat_dds_lookup);
1625 dds->dds_lookup_live_hit.value.ui64 =
1626 wmsum_value(&ddt->ddt_kstat_dds_lookup_live_hit);
1627 dds->dds_lookup_live_wait.value.ui64 =
1628 wmsum_value(&ddt->ddt_kstat_dds_lookup_live_wait);
1629 dds->dds_lookup_live_miss.value.ui64 =
1630 wmsum_value(&ddt->ddt_kstat_dds_lookup_live_miss);
1631 dds->dds_lookup_existing.value.ui64 =
1632 wmsum_value(&ddt->ddt_kstat_dds_lookup_existing);
1633 dds->dds_lookup_new.value.ui64 =
1634 wmsum_value(&ddt->ddt_kstat_dds_lookup_new);
1635 dds->dds_lookup_log_hit.value.ui64 =
1636 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_hit);
1637 dds->dds_lookup_log_active_hit.value.ui64 =
1638 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1639 dds->dds_lookup_log_flushing_hit.value.ui64 =
1640 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1641 dds->dds_lookup_log_miss.value.ui64 =
1642 wmsum_value(&ddt->ddt_kstat_dds_lookup_log_miss);
1643 dds->dds_lookup_stored_hit.value.ui64 =
1644 wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_hit);
1645 dds->dds_lookup_stored_miss.value.ui64 =
1646 wmsum_value(&ddt->ddt_kstat_dds_lookup_stored_miss);
1647
1648 /* Sync-only counters are already set directly in kstats */
1649
1650 return (0);
1651 }
1652
1653 static void
1654 ddt_table_alloc_kstats(ddt_t *ddt)
1655 {
1656 char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
1657 char *name = kmem_asprintf("ddt_stats_%s",
1658 zio_checksum_table[ddt->ddt_checksum].ci_name);
1659
1660 /* Initialize wmsums for lookup counters */
1661 wmsum_init(&ddt->ddt_kstat_dds_lookup, 0);
1662 wmsum_init(&ddt->ddt_kstat_dds_lookup_live_hit, 0);
1663 wmsum_init(&ddt->ddt_kstat_dds_lookup_live_wait, 0);
1664 wmsum_init(&ddt->ddt_kstat_dds_lookup_live_miss, 0);
1665 wmsum_init(&ddt->ddt_kstat_dds_lookup_existing, 0);
1666 wmsum_init(&ddt->ddt_kstat_dds_lookup_new, 0);
1667 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_hit, 0);
1668 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_active_hit, 0);
1669 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_flushing_hit, 0);
1670 wmsum_init(&ddt->ddt_kstat_dds_lookup_log_miss, 0);
1671 wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_hit, 0);
1672 wmsum_init(&ddt->ddt_kstat_dds_lookup_stored_miss, 0);
1673
1674 ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED,
1675 sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1676 if (ddt->ddt_ksp != NULL) {
1677 ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP);
1678 memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t));
1679 ddt->ddt_ksp->ks_data = dds;
1680 ddt->ddt_ksp->ks_update = ddt_kstat_update;
1681 ddt->ddt_ksp->ks_private = ddt;
1682 kstat_install(ddt->ddt_ksp);
1683 }
1684
1685 kmem_strfree(name);
1686 kmem_strfree(mod);
1687 }
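
/*
 * Illustrative note (assumption, not part of the original source): on Linux
 * builds the kstat created above is typically exposed by the SPL under
 * /proc/spl/kstat/zfs/<pool>/ddt_stats_<checksum>, e.g.
 * /proc/spl/kstat/zfs/tank/ddt_stats_sha256, with one named value per
 * ddt_kstats_t field. The exact path and layout depend on the platform's
 * kstat support, so treat this as a sketch rather than a guaranteed
 * interface.
 */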
1688
1689 static ddt_t *
1690 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
1691 {
1692 ddt_t *ddt;
1693
1694 ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
1695 memset(ddt, 0, sizeof (ddt_t));
1696 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
1697 avl_create(&ddt->ddt_tree, ddt_key_compare,
1698 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1699 avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
1700 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
1701
1702 ddt->ddt_checksum = c;
1703 ddt->ddt_spa = spa;
1704 ddt->ddt_os = spa->spa_meta_objset;
1705 ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
1706 ddt->ddt_log_flush_pressure = 10;
1707
1708 ddt_log_alloc(ddt);
1709 ddt_table_alloc_kstats(ddt);
1710
1711 return (ddt);
1712 }
1713
1714 static void
1715 ddt_table_free(ddt_t *ddt)
1716 {
1717 if (ddt->ddt_ksp != NULL) {
1718 kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
1719 ddt->ddt_ksp->ks_data = NULL;
1720 kstat_delete(ddt->ddt_ksp);
1721 }
1722
1723 /* Cleanup wmsums for lookup counters */
1724 wmsum_fini(&ddt->ddt_kstat_dds_lookup);
1725 wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_hit);
1726 wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_wait);
1727 wmsum_fini(&ddt->ddt_kstat_dds_lookup_live_miss);
1728 wmsum_fini(&ddt->ddt_kstat_dds_lookup_existing);
1729 wmsum_fini(&ddt->ddt_kstat_dds_lookup_new);
1730 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_hit);
1731 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_active_hit);
1732 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_flushing_hit);
1733 wmsum_fini(&ddt->ddt_kstat_dds_lookup_log_miss);
1734 wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_hit);
1735 wmsum_fini(&ddt->ddt_kstat_dds_lookup_stored_miss);
1736
1737 ddt_log_free(ddt);
1738 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1739 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1740 if (ddt->ddt_object_dnode[type][class] != NULL) {
1741 dnode_rele(ddt->ddt_object_dnode[type][class],
1742 ddt);
1743 ddt->ddt_object_dnode[type][class] = NULL;
1744 }
1745 }
1746 }
1747 ASSERT0(avl_numnodes(&ddt->ddt_tree));
1748 ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
1749 avl_destroy(&ddt->ddt_tree);
1750 avl_destroy(&ddt->ddt_repair_tree);
1751 mutex_destroy(&ddt->ddt_lock);
1752 kmem_cache_free(ddt_cache, ddt);
1753 }
1754
1755 void
1756 ddt_create(spa_t *spa)
1757 {
1758 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
1759
1760 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1761 if (DDT_CHECKSUM_VALID(c))
1762 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
1763 }
1764 }
1765
1766 int
1767 ddt_load(spa_t *spa)
1768 {
1769 int error;
1770
1771 ddt_create(spa);
1772
1773 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1774 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1775 &spa->spa_ddt_stat_object);
1776 if (error)
1777 return (error == ENOENT ? 0 : error);
1778
1779 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1780 if (!DDT_CHECKSUM_VALID(c))
1781 continue;
1782
1783 ddt_t *ddt = spa->spa_ddt[c];
1784 error = ddt_configure(ddt, B_FALSE);
1785 if (error == ENOENT)
1786 continue;
1787 if (error != 0)
1788 return (error);
1789
1790 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1791 for (ddt_class_t class = 0; class < DDT_CLASSES;
1792 class++) {
1793 error = ddt_object_load(ddt, type, class);
1794 if (error != 0 && error != ENOENT)
1795 return (error);
1796 }
1797 }
1798
1799 if (ddt->ddt_flags & DDT_FLAG_LOG) {
1800 error = ddt_log_load(ddt);
1801 if (error != 0 && error != ENOENT)
1802 return (error);
1803 }
1804
1805 DDT_KSTAT_SET(ddt, dds_log_active_entries,
1806 avl_numnodes(&ddt->ddt_log_active->ddl_tree));
1807 DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
1808 avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
1809
1810 /*
1811 * Seed the cached histograms.
1812 */
1813 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
1814 sizeof (ddt->ddt_histogram));
1815 }
1816
1817 spa->spa_dedup_dspace = ~0ULL;
1818 spa->spa_dedup_dsize = ~0ULL;
1819
1820 return (0);
1821 }
1822
1823 void
1824 ddt_unload(spa_t *spa)
1825 {
1826 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1827 if (spa->spa_ddt[c]) {
1828 ddt_table_free(spa->spa_ddt[c]);
1829 spa->spa_ddt[c] = NULL;
1830 }
1831 }
1832 }
1833
1834 boolean_t
1835 ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
1836 {
1837 ddt_t *ddt;
1838 ddt_key_t ddk;
1839
1840 if (!BP_GET_DEDUP(bp))
1841 return (B_FALSE);
1842
1843 if (max_class == DDT_CLASS_UNIQUE)
1844 return (B_TRUE);
1845
1846 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
1847
1848 ddt_key_fill(&ddk, bp);
1849
1850 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1851 for (ddt_class_t class = 0; class <= max_class; class++) {
1852 if (ddt_object_contains(ddt, type, class, &ddk) == 0)
1853 return (B_TRUE);
1854 }
1855 }
1856
1857 return (B_FALSE);
1858 }
1859
1860 ddt_entry_t *
1861 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
1862 {
1863 ddt_key_t ddk;
1864 ddt_entry_t *dde;
1865
1866 ddt_key_fill(&ddk, bp);
1867
1868 dde = ddt_alloc(ddt, &ddk);
1869 ddt_alloc_entry_io(dde);
1870
1871 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1872 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1873 /*
1874 * We can only do repair if there are multiple copies
1875 * of the block. For anything in the UNIQUE class,
1876 * there's definitely only one copy, so don't even try.
1877 */
1878 if (class != DDT_CLASS_UNIQUE &&
1879 ddt_object_lookup(ddt, type, class, dde) == 0)
1880 return (dde);
1881 }
1882 }
1883
1884 memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));
1885
1886 return (dde);
1887 }
1888
1889 void
1890 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
1891 {
1892 avl_index_t where;
1893
1894 ddt_enter(ddt);
1895
1896 if (dde->dde_io->dde_repair_abd != NULL &&
1897 spa_writeable(ddt->ddt_spa) &&
1898 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
1899 avl_insert(&ddt->ddt_repair_tree, dde, where);
1900 else
1901 ddt_free(ddt, dde);
1902
1903 ddt_exit(ddt);
1904 }
1905
1906 static void
1907 ddt_repair_entry_done(zio_t *zio)
1908 {
1909 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
1910 ddt_entry_t *rdde = zio->io_private;
1911
1912 ddt_free(ddt, rdde);
1913 }
1914
1915 static void
1916 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
1917 {
1918 ddt_key_t *ddk = &dde->dde_key;
1919 ddt_key_t *rddk = &rdde->dde_key;
1920 zio_t *zio;
1921 blkptr_t blk;
1922
1923 zio = zio_null(rio, rio->io_spa, NULL,
1924 ddt_repair_entry_done, rdde, rio->io_flags);
1925
1926 for (int p = 0; p < DDT_NPHYS(ddt); p++) {
1927 ddt_univ_phys_t *ddp = dde->dde_phys;
1928 ddt_univ_phys_t *rddp = rdde->dde_phys;
1929 ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
1930 uint64_t phys_birth = ddt_phys_birth(ddp, v);
1931 const dva_t *dvas, *rdvas;
1932
1933 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
1934 dvas = ddp->ddp_flat.ddp_dva;
1935 rdvas = rddp->ddp_flat.ddp_dva;
1936 } else {
1937 dvas = ddp->ddp_trad[p].ddp_dva;
1938 rdvas = rddp->ddp_trad[p].ddp_dva;
1939 }
1940
1941 if (phys_birth == 0 ||
1942 phys_birth != ddt_phys_birth(rddp, v) ||
1943 memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
1944 continue;
1945
1946 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
1947 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
1948 rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
1949 NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
1950 ZIO_DDT_CHILD_FLAGS(zio), NULL));
1951 }
1952
1953 zio_nowait(zio);
1954 }
1955
1956 static void
1957 ddt_repair_table(ddt_t *ddt, zio_t *rio)
1958 {
1959 spa_t *spa = ddt->ddt_spa;
1960 ddt_entry_t *dde, *rdde_next, *rdde;
1961 avl_tree_t *t = &ddt->ddt_repair_tree;
1962 blkptr_t blk;
1963
1964 if (spa_sync_pass(spa) > 1)
1965 return;
1966
1967 ddt_enter(ddt);
1968 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
1969 rdde_next = AVL_NEXT(t, rdde);
1970 avl_remove(&ddt->ddt_repair_tree, rdde);
1971 ddt_exit(ddt);
1972 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
1973 DDT_PHYS_NONE, &blk);
1974 dde = ddt_repair_start(ddt, &blk);
1975 ddt_repair_entry(ddt, dde, rdde, rio);
1976 ddt_repair_done(ddt, dde);
1977 ddt_enter(ddt);
1978 }
1979 ddt_exit(ddt);
1980 }
1981
1982 static void
1983 ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
1984 {
1985 /*
1986 	 * Count all the entries stored for each type/class, and update the
1987 	 * stats within (ddt_object_sync()). If there are no entries for the
1988 * type/class, the whole object is removed. If all objects for the DDT
1989 * are removed, its containing dir is removed, effectively resetting
1990 * the entire DDT to an empty slate.
1991 */
1992 uint64_t count = 0;
1993 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1994 uint64_t add, tcount = 0;
1995 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
1996 if (ddt_object_exists(ddt, type, class)) {
1997 ddt_object_sync(ddt, type, class, tx);
1998 VERIFY0(ddt_object_count(ddt, type, class,
1999 &add));
2000 tcount += add;
2001 }
2002 }
2003 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
2004 if (tcount == 0 && ddt_object_exists(ddt, type, class))
2005 ddt_object_destroy(ddt, type, class, tx);
2006 }
2007 count += tcount;
2008 }
2009
2010 if (ddt->ddt_flags & DDT_FLAG_LOG) {
2011 /* Include logged entries in the total count */
2012 count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
2013 count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
2014 }
2015
2016 if (count == 0) {
2017 /*
2018 * No entries left on the DDT, so reset the version for next
2019 * time. This allows us to handle the feature being changed
2020 * since the DDT was originally created. New entries should get
2021 * whatever the feature currently demands.
2022 */
2023 if (ddt->ddt_version == DDT_VERSION_FDT)
2024 ddt_destroy_dir(ddt, tx);
2025
2026 ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
2027 ddt->ddt_flags = 0;
2028 }
2029
2030 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2031 sizeof (ddt->ddt_histogram));
2032 ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2033 ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2034 }
2035
2036 static void
2037 ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
2038 {
2039 dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
2040
2041 /*
2042 * Compute the target class, so we can decide whether or not to inform
2043 * the scrub traversal (below). Note that we don't store this in the
2044 * entry, as it might change multiple times before finally being
2045 * committed (if we're logging). Instead, we recompute it in
2046 	 * ddt_sync_flush_entry().
2047 */
2048 uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
2049 ddt_class_t nclass =
2050 (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2051
2052 /*
2053 * If the class changes, the order that we scan this bp changes. If it
2054 * decreases, we could miss it, so scan it right now. (This covers both
2055 * class changing while we are doing ddt_walk(), and when we are
2056 * traversing.)
2057 *
2058 * We also do this when the refcnt goes to zero, because that change is
2059 * only in the log so far; the blocks on disk won't be freed until
2060 * the log is flushed, and the refcnt might increase before that. If it
2061 * does, then we could miss it in the same way.
2062 */
2063 if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
2064 dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
2065 ddlwe, tx);
2066 }
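
/*
 * Worked example (illustrative only): an entry with refcnt=1 lives in
 * DDT_CLASS_UNIQUE. A new write bumps it to refcnt=2, so nclass becomes
 * DDT_CLASS_DUPLICATE, which sorts before UNIQUE in the walk order. A scan
 * that has already finished the DUPLICATE class would never revisit it, so
 * dsl_scan_ddt_entry() is called for it immediately above.
 */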
2067
2068 static void
2069 ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
2070 ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
2071 {
2072 ddt_key_t *ddk = &ddlwe->ddlwe_key;
2073 ddt_type_t ntype = DDT_TYPE_DEFAULT;
2074 uint64_t refcnt = 0;
2075
2076 /*
2077 * Compute the total refcnt. Along the way, issue frees for any DVAs
2078 * we no longer want.
2079 */
2080 for (int p = 0; p < DDT_NPHYS(ddt); p++) {
2081 ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
2082 ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2083 uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
2084
2085 if (ddt_phys_birth(ddp, v) == 0) {
2086 ASSERT0(phys_refcnt);
2087 continue;
2088 }
2089 if (DDT_PHYS_IS_DITTO(ddt, p)) {
2090 /*
2091 * We don't want to keep any obsolete slots (eg ditto),
2092 * regardless of their refcount, but we don't want to
2093 * leak them either. So, free them.
2094 */
2095 ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2096 continue;
2097 }
2098 if (phys_refcnt == 0)
2099 /* No remaining references, free it! */
2100 ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
2101 refcnt += phys_refcnt;
2102 }
2103
2104 /* Select the best class for the entry. */
2105 ddt_class_t nclass =
2106 (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
2107
2108 /*
2109 * If an existing entry changed type or class, or its refcount reached
2110 * zero, delete it from the DDT object
2111 */
2112 if (otype != DDT_TYPES &&
2113 (otype != ntype || oclass != nclass || refcnt == 0)) {
2114 VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
2115 ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
2116 }
2117
2118 /*
2119 * Add or update the entry
2120 */
2121 if (refcnt != 0) {
2122 ddt_histogram_t *ddh =
2123 &ddt->ddt_histogram[ntype][nclass];
2124
2125 ddt_histogram_add_entry(ddt, ddh, ddlwe);
2126
2127 if (!ddt_object_exists(ddt, ntype, nclass))
2128 ddt_object_create(ddt, ntype, nclass, tx);
2129 VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
2130 }
2131 }
2132
2133 /* Calculate an exponential weighted moving average, lower limited to zero */
2134 static inline int32_t
2135 _ewma(int32_t val, int32_t prev, uint32_t weight)
2136 {
2137 ASSERT3U(val, >=, 0);
2138 ASSERT3U(prev, >=, 0);
2139 const int32_t new =
2140 MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
2141 ASSERT3U(new, >=, 0);
2142 return (new);
2143 }
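
/*
 * Worked example (illustrative only): with prev = 1000, val = 3000 and
 * weight = 10, the result is 1000 + (3000 - 1000) / 10 = 1200; each call
 * moves the estimate 1/weight of the way toward the new sample. A weight
 * of 0 or 1 makes the result track the sample exactly.
 */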
2144
2145 static inline void
2146 ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
2147 {
2148 /*
2149 * If we're not forcing flush, and not being asked to start, then
2150 * there's nothing more to do.
2151 */
2152 if (txg == 0) {
2153 /* Update requested, are we currently forcing flush? */
2154 if (ddt->ddt_flush_force_txg == 0)
2155 return;
2156 txg = ddt->ddt_flush_force_txg;
2157 }
2158
2159 /*
2160 	 * If either of the logs has unflushed entries before the wanted
2161 	 * txg, set the force txg; otherwise clear it.
2162 */
2163
2164 if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
2165 ddt->ddt_log_active->ddl_first_txg <= txg) ||
2166 (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2167 ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
2168 ddt->ddt_flush_force_txg = txg;
2169 return;
2170 }
2171
2172 /*
2173 * Nothing to flush behind the given txg, so we can clear force flush
2174 * state.
2175 */
2176 ddt->ddt_flush_force_txg = 0;
2177 }
2178
2179 static void
2180 ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
2181 {
2182 spa_t *spa = ddt->ddt_spa;
2183 ASSERT(avl_is_empty(&ddt->ddt_tree));
2184
2185 /*
2186 * Don't do any flushing when the pool is ready to shut down, or in
2187 * passes beyond the first.
2188 */
2189 if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa))
2190 return;
2191
2192 hrtime_t flush_start = gethrtime();
2193 uint32_t count = 0;
2194
2195 /*
2196 * How many entries we need to flush. We need to at
2197 * least match the ingest rate, and also consider the
2198 * current backlog of entries.
2199 */
2200 uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) +
2201 avl_numnodes(&ddt->ddt_log_active->ddl_tree);
2202
2203 if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
2204 goto housekeeping;
2205
2206 uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs);
2207 uint64_t cap = MAX(1, zfs_dedup_log_cap);
2208 uint64_t flush_min = MAX(backlog / txgs,
2209 zfs_dedup_log_flush_entries_min);
2210
2211 /*
2212 * The theory for this block is that if we increase the pressure while
2213 * we're growing above the cap, and remove it when we're significantly
2214 * below the cap, we'll stay near cap while not bouncing around too
2215 * much.
2216 *
2217 * The factor of 10 is to smooth the pressure effect by expressing it
2218 * in tenths. The addition of the cap to the backlog in the second
2219 * block is to round up, instead of down. We never let the pressure go
2220 * below 1 (10 tenths).
2221 */
2222 if (cap != UINT_MAX && backlog > cap &&
2223 backlog > ddt->ddt_log_flush_prev_backlog) {
2224 ddt->ddt_log_flush_pressure += 10 * backlog / cap;
2225 } else if (cap != UINT_MAX && backlog < cap) {
2226 ddt->ddt_log_flush_pressure -=
2227 11 - (((10 * backlog) + cap - 1) / cap);
2228 ddt->ddt_log_flush_pressure =
2229 MAX(ddt->ddt_log_flush_pressure, 10);
2230 }
2231
2232 if (zfs_dedup_log_hard_cap && cap != UINT_MAX)
2233 flush_min = MAX(flush_min, MIN(backlog - cap,
2234 (flush_min * ddt->ddt_log_flush_pressure) / 10));
2235
2236 uint64_t flush_max;
2237
2238 /*
2239 * If we've been asked to flush everything in a hurry,
2240 * try to dump as much as possible on this txg. In
2241 * this case we're only limited by time, not amount.
2242 *
2243 * Otherwise, if we are over the cap, try to get back down to it.
2244 *
2245 * Finally if there is no cap (or no pressure), just set the max a
2246 * little higher than the min to help smooth out variations in flush
2247 * times.
2248 */
2249 if (ddt->ddt_flush_force_txg > 0)
2250 flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
2251 else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap)
2252 flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap,
2253 (flush_min * ddt->ddt_log_flush_pressure) / 10));
2254 else
2255 flush_max = flush_min * 5 / 4;
2256 flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max);
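
	/*
	 * Worked example (illustrative numbers, not defaults): with a
	 * backlog of 120000 entries, zfs_dedup_log_flush_txgs = 100 and
	 * zfs_dedup_log_flush_entries_min = 1000, flush_min starts at
	 * MAX(120000 / 100, 1000) = 1200. If zfs_dedup_log_cap = 100000 and
	 * the backlog is above the cap and still growing, the pressure rises
	 * by 10 * 120000 / 100000 = 12 tenths (from 10 to 22). With only the
	 * soft cap in effect, flush_max = MAX(1200 * 5 / 4,
	 * MIN(120000 - 100000, 1200 * 22 / 10)) = 2640 entries this txg,
	 * before the zfs_dedup_log_flush_entries_max clamp above.
	 */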
2257
2258 /*
2259 * When the pool is busy or someone is explicitly waiting for this txg
2260 * to complete, use the zfs_dedup_log_flush_min_time_ms. Otherwise use
2261 * half of the time in the txg timeout.
2262 */
2263 uint64_t target_time;
2264
2265 if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ||
2266 vdev_queue_pool_busy(spa)) {
2267 target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
2268 SEC2NSEC(zfs_txg_timeout) / 2);
2269 } else {
2270 target_time = SEC2NSEC(zfs_txg_timeout) / 2;
2271 }
2272
2273 ddt_lightweight_entry_t ddlwe;
2274 while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
2275 ddt_sync_flush_entry(ddt, &ddlwe,
2276 ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
2277
2278 /* End if we've synced as much as we needed to. */
2279 if (++count >= flush_max)
2280 break;
2281
2282 /*
2283 * As long as we've flushed the absolute minimum,
2284 * stop if we're way over our target time.
2285 */
2286 uint64_t diff = gethrtime() - flush_start;
2287 if (count > zfs_dedup_log_flush_entries_min &&
2288 diff >= target_time * 2)
2289 break;
2290
2291 /*
2292 * End if we've passed the minimum flush and we're out of time.
2293 */
2294 if (count > flush_min && diff >= target_time)
2295 break;
2296 }
2297
2298 if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
2299 /* We emptied it, so truncate on-disk */
2300 DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
2301 ddt_log_truncate(ddt, tx);
2302 } else {
2303 /* More to do next time, save checkpoint */
2304 DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
2305 ddt_log_checkpoint(ddt, &ddlwe, tx);
2306 }
2307
2308 ddt_sync_update_stats(ddt, tx);
2309
2310 housekeeping:
2311 if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
2312 !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
2313 /*
2314 * No more to flush, and the active list has stuff, so
2315 * try to swap the logs for next time.
2316 */
2317 if (ddt_log_swap(ddt, tx)) {
2318 DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
2319 DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
2320 avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
2321 }
2322 }
2323
2324 /* If force flush is no longer necessary, turn it off. */
2325 ddt_flush_force_update_txg(ddt, 0);
2326
2327 ddt->ddt_log_flush_prev_backlog = backlog;
2328
2329 /*
2330 * Update flush rate. This is an exponential weighted moving
2331 * average of the number of entries flushed over recent txgs.
2332 */
2333 ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate,
2334 zfs_dedup_log_flush_flow_rate_txgs);
2335 DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);
2336
2337 /*
2338 * Update flush time rate. This is an exponential weighted moving
2339 * average of the total time taken to flush over recent txgs.
2340 */
2341 ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate,
2342 (int32_t)NSEC2MSEC(gethrtime() - flush_start),
2343 zfs_dedup_log_flush_flow_rate_txgs);
2344 DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
2345 ddt->ddt_log_flush_time_rate);
2346 if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 &&
2347 zfs_flags & ZFS_DEBUG_DDT) {
2348 		zfs_dbgmsg("%lu entries remain (%lu in active), flushed %u @ "
2349 "txg %llu, in %llu ms, flush rate %d, time rate %d",
2350 (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree),
2351 (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree),
2352 count, (u_longlong_t)tx->tx_txg,
2353 (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start),
2354 ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate);
2355 }
2356 }
2357
2358 static void
2359 ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
2360 {
2361 uint64_t count = avl_numnodes(&ddt->ddt_tree);
2362
2363 if (count > 0) {
2364 ddt_log_update_t dlu = {0};
2365 ddt_log_begin(ddt, count, tx, &dlu);
2366
2367 ddt_entry_t *dde;
2368 void *cookie = NULL;
2369 ddt_lightweight_entry_t ddlwe;
2370 while ((dde =
2371 avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
2372 ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2373 DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2374
2375 /* If from flushing log, remove it. */
2376 if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) {
2377 VERIFY(ddt_log_remove_key(ddt,
2378 ddt->ddt_log_flushing, &ddlwe.ddlwe_key));
2379 }
2380
2381 /* Update class_start to track last modification time */
2382 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
2383 ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
2384 ddt_class_start();
2385 }
2386
2387 ddt_log_entry(ddt, &ddlwe, &dlu);
2388 ddt_sync_scan_entry(ddt, &ddlwe, tx);
2389 ddt_free(ddt, dde);
2390 }
2391
2392 ddt_log_commit(ddt, &dlu);
2393
2394 DDT_KSTAT_SET(ddt, dds_log_active_entries,
2395 avl_numnodes(&ddt->ddt_log_active->ddl_tree));
2396
2397 /*
2398 * Sync the stats for the store objects. Even though we haven't
2399 * modified anything on those objects, they're no longer the
2400 * source of truth for entries that are now in the log, and we
2401 * need the on-disk counts to reflect that, otherwise we'll
2402 * miscount later when importing.
2403 */
2404 for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
2405 for (ddt_class_t class = 0;
2406 class < DDT_CLASSES; class++) {
2407 if (ddt_object_exists(ddt, type, class))
2408 ddt_object_sync(ddt, type, class, tx);
2409 }
2410 }
2411
2412 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2413 sizeof (ddt->ddt_histogram));
2414 ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2415 ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2416 }
2417
2418 if (spa_sync_pass(ddt->ddt_spa) == 1) {
2419 /*
2420 * Update ingest rate. This is an exponential weighted moving
2421 * average of the number of entries changed over recent txgs.
2422 * The ramp-up cost shouldn't matter too much because the
2423 * flusher will be trying to take at least the minimum anyway.
2424 */
2425 ddt->ddt_log_ingest_rate = _ewma(
2426 count, ddt->ddt_log_ingest_rate,
2427 zfs_dedup_log_flush_flow_rate_txgs);
2428 DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
2429 ddt->ddt_log_ingest_rate);
2430 }
2431 }
2432
2433 static void
2434 ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
2435 {
2436 if (avl_numnodes(&ddt->ddt_tree) == 0)
2437 return;
2438
2439 ddt_entry_t *dde;
2440 void *cookie = NULL;
2441 while ((dde = avl_destroy_nodes(
2442 &ddt->ddt_tree, &cookie)) != NULL) {
2443 ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2444
2445 ddt_lightweight_entry_t ddlwe;
2446 DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
2447
2448 /* Update class_start to track last modification time */
2449 if (ddt->ddt_flags & DDT_FLAG_FLAT) {
2450 ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
2451 ddt_class_start();
2452 }
2453
2454 ddt_sync_flush_entry(ddt, &ddlwe,
2455 dde->dde_type, dde->dde_class, tx);
2456 ddt_sync_scan_entry(ddt, &ddlwe, tx);
2457 ddt_free(ddt, dde);
2458 }
2459
2460 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
2461 sizeof (ddt->ddt_histogram));
2462 ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
2463 ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
2464 ddt_sync_update_stats(ddt, tx);
2465 }
2466
2467 static void
2468 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
2469 {
2470 spa_t *spa = ddt->ddt_spa;
2471
2472 if (ddt->ddt_version == UINT64_MAX)
2473 return;
2474
2475 if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
2476 ASSERT0(avl_numnodes(&ddt->ddt_tree));
2477 return;
2478 }
2479
2480 if (spa->spa_ddt_stat_object == 0) {
2481 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
2482 DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
2483 DMU_POOL_DDT_STATS, tx);
2484 }
2485
2486 if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
2487 ddt_create_dir(ddt, tx);
2488
2489 if (ddt->ddt_flags & DDT_FLAG_LOG)
2490 ddt_sync_table_log(ddt, tx);
2491 else
2492 ddt_sync_table_flush(ddt, tx);
2493 }
2494
2495 void
2496 ddt_sync(spa_t *spa, uint64_t txg)
2497 {
2498 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
2499 dmu_tx_t *tx;
2500 zio_t *rio;
2501
2502 ASSERT3U(spa_syncing_txg(spa), ==, txg);
2503
2504 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2505
2506 rio = zio_root(spa, NULL, NULL,
2507 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
2508
2509 /*
2510 * This function may cause an immediate scan of ddt blocks (see
2511 * the comment above dsl_scan_ddt() for details). We set the
2512 * scan's root zio here so that we can wait for any scan IOs in
2513 * addition to the regular ddt IOs.
2514 */
2515 ASSERT0P(scn->scn_zio_root);
2516 scn->scn_zio_root = rio;
2517
2518 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2519 ddt_t *ddt = spa->spa_ddt[c];
2520 if (ddt == NULL)
2521 continue;
2522 ddt_sync_table(ddt, tx);
2523 if (ddt->ddt_flags & DDT_FLAG_LOG)
2524 ddt_sync_flush_log(ddt, tx);
2525 ddt_repair_table(ddt, rio);
2526 }
2527
2528 (void) zio_wait(rio);
2529 scn->scn_zio_root = NULL;
2530
2531 dmu_tx_commit(tx);
2532 }
2533
2534 void
2535 ddt_walk_init(spa_t *spa, uint64_t txg)
2536 {
2537 if (txg == 0)
2538 txg = spa_syncing_txg(spa);
2539
2540 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2541 ddt_t *ddt = spa->spa_ddt[c];
2542 if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2543 continue;
2544
2545 ddt_enter(ddt);
2546 ddt_flush_force_update_txg(ddt, txg);
2547 ddt_exit(ddt);
2548 }
2549 }
2550
2551 boolean_t
2552 ddt_walk_ready(spa_t *spa)
2553 {
2554 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2555 ddt_t *ddt = spa->spa_ddt[c];
2556 if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
2557 continue;
2558
2559 if (ddt->ddt_flush_force_txg > 0)
2560 return (B_FALSE);
2561 }
2562
2563 return (B_TRUE);
2564 }
2565
2566 static int
2567 ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
2568 uint64_t flags, boolean_t wait)
2569 {
2570 do {
2571 do {
2572 do {
2573 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
2574 if (ddt == NULL)
2575 continue;
2576
2577 if (flags != 0 &&
2578 (ddt->ddt_flags & flags) != flags)
2579 continue;
2580
2581 if (wait && ddt->ddt_flush_force_txg > 0)
2582 return (EAGAIN);
2583
2584 int error = ENOENT;
2585 if (ddt_object_exists(ddt, ddb->ddb_type,
2586 ddb->ddb_class)) {
2587 error = ddt_object_walk(ddt,
2588 ddb->ddb_type, ddb->ddb_class,
2589 &ddb->ddb_cursor, ddlwe);
2590 }
2591 if (error == 0)
2592 return (0);
2593 if (error != ENOENT)
2594 return (error);
2595 ddb->ddb_cursor = 0;
2596 } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
2597 ddb->ddb_checksum = 0;
2598 } while (++ddb->ddb_type < DDT_TYPES);
2599 ddb->ddb_type = 0;
2600 } while (++ddb->ddb_class < DDT_CLASSES);
2601
2602 return (SET_ERROR(ENOENT));
2603 }
2604
2605 int
2606 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
2607 {
2608 return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE));
2609 }
2610
2611 /*
2612  * This function is used by Block Cloning (brt.c) to increase the reference
2613  * counter for the DDT entry if the block is already in the DDT.
2614 *
2615 * Return false if the block, despite having the D bit set, is not present
2616 * in the DDT. This is possible when the DDT has been pruned by an admin
2617 * or by the DDT quota mechanism.
2618 */
2619 boolean_t
2620 ddt_addref(spa_t *spa, const blkptr_t *bp)
2621 {
2622 ddt_t *ddt;
2623 ddt_entry_t *dde;
2624 boolean_t result;
2625
2626 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2627 ddt = ddt_select(spa, bp);
2628 ddt_enter(ddt);
2629
2630 dde = ddt_lookup(ddt, bp, B_TRUE);
2631
2632 /* Can be NULL if the entry for this block was pruned. */
2633 if (dde == NULL) {
2634 ddt_exit(ddt);
2635 spa_config_exit(spa, SCL_ZIO, FTAG);
2636 return (B_FALSE);
2637 }
2638
2639 if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
2640 /*
2641 * This entry was either synced to a store object (dde_type is
2642 * real) or was logged. It must be properly on disk at this
2643 * point, so we can just bump its refcount.
2644 */
2645 int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
2646 ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
2647
2648 ddt_phys_addref(dde->dde_phys, v);
2649 result = B_TRUE;
2650 } else {
2651 /*
2652 		 * If the block has the DEDUP flag set, it still might not
2653 * exist in the DEDUP table due to DDT pruning of entries
2654 * where refcnt=1.
2655 */
2656 ddt_remove(ddt, dde);
2657 result = B_FALSE;
2658 }
2659
2660 ddt_exit(ddt);
2661 spa_config_exit(spa, SCL_ZIO, FTAG);
2662
2663 return (result);
2664 }
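
/*
 * Rough usage sketch (illustrative assumption, not the exact brt.c code):
 * when a clone covers a block with the D bit set, the cloning code can try
 * the DDT first and only fall back to a BRT reference if the entry is gone:
 *
 *	if (!BP_GET_DEDUP(bp) || !ddt_addref(spa, bp))
 *		brt_record_reference(spa, bp, tx);	// hypothetical fallback
 *
 * brt_record_reference() is a placeholder name; see brt.c for the real
 * call sequence.
 */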
2665
2666 typedef struct ddt_prune_entry {
2667 ddt_t *dpe_ddt;
2668 ddt_key_t dpe_key;
2669 list_node_t dpe_node;
2670 ddt_univ_phys_t dpe_phys[];
2671 } ddt_prune_entry_t;
2672
2673 typedef struct ddt_prune_info {
2674 spa_t *dpi_spa;
2675 uint64_t dpi_txg_syncs;
2676 uint64_t dpi_pruned;
2677 list_t dpi_candidates;
2678 } ddt_prune_info_t;
2679
2680 /*
2681 * Add prune candidates for ddt_sync during spa_sync
2682 */
2683 static void
2684 prune_candidates_sync(void *arg, dmu_tx_t *tx)
2685 {
2686 (void) tx;
2687 ddt_prune_info_t *dpi = arg;
2688 ddt_prune_entry_t *dpe;
2689
2690 spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);
2691
2692 /* Process the prune candidates collected so far */
2693 while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
2694 blkptr_t blk;
2695 ddt_t *ddt = dpe->dpe_ddt;
2696
2697 ddt_enter(ddt);
2698
2699 /*
2700 * If it's on the live list, then it was loaded for update
2701 * this txg and is no longer stale; skip it.
2702 */
2703 if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
2704 ddt_exit(ddt);
2705 kmem_free(dpe, sizeof (*dpe));
2706 continue;
2707 }
2708
2709 ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
2710 dpe->dpe_phys, DDT_PHYS_FLAT, &blk);
2711
2712 ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE);
2713 if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
2714 ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
2715 /*
2716 * Zero the physical, so we don't try to free DVAs
2717 * at flush nor try to reuse this entry.
2718 */
2719 ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);
2720
2721 dpi->dpi_pruned++;
2722 }
2723
2724 ddt_exit(ddt);
2725 kmem_free(dpe, sizeof (*dpe));
2726 }
2727
2728 spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
2729 dpi->dpi_txg_syncs++;
2730 }
2731
2732 /*
2733 * Prune candidates are collected in open context and processed
2734 * in sync context as part of ddt_sync_table().
2735 */
2736 static void
2737 ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
2738 const ddt_univ_phys_t *ddp)
2739 {
2740 ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2741
2742 size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;
2743 ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP);
2744
2745 dpe->dpe_ddt = ddt;
2746 dpe->dpe_key = *ddk;
2747 memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
2748 list_insert_head(list, dpe);
2749 }
2750
2751 /*
2752  * Iterate over all the entries in the DDT unique class.
2753  * The walk will perform one of the following operations:
2754  *  (a) build a histogram that can be used when pruning
2755 * (b) prune entries older than the cutoff
2756 *
2757 * Also called by zdb(8) to dump the age histogram
2758 */
2759 void
2760 ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
2761 {
2762 ddt_bookmark_t ddb = {
2763 .ddb_class = DDT_CLASS_UNIQUE,
2764 .ddb_type = 0,
2765 .ddb_checksum = 0,
2766 .ddb_cursor = 0
2767 };
2768 ddt_lightweight_entry_t ddlwe = {0};
2769 int error;
2770 int valid = 0;
2771 int candidates = 0;
2772 uint64_t now = gethrestime_sec();
2773 ddt_prune_info_t dpi;
2774 boolean_t pruning = (cutoff != 0);
2775
2776 if (pruning) {
2777 dpi.dpi_txg_syncs = 0;
2778 dpi.dpi_pruned = 0;
2779 dpi.dpi_spa = spa;
2780 list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
2781 offsetof(ddt_prune_entry_t, dpe_node));
2782 }
2783
2784 if (histogram != NULL)
2785 memset(histogram, 0, sizeof (ddt_age_histo_t));
2786
2787 while ((error =
2788 ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
2789 ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
2790 VERIFY(ddt);
2791
2792 if (spa_shutting_down(spa) || issig())
2793 break;
2794
2795 ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
2796 ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);
2797
2798 uint64_t class_start =
2799 ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;
2800
2801 /*
2802 * If this entry is on the log, then the stored entry is stale
2803 * and we should skip it.
2804 */
2805 if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL))
2806 continue;
2807
2808 /* prune older entries */
2809 if (pruning && class_start < cutoff) {
2810 if (candidates++ >= zfs_ddt_prunes_per_txg) {
2811 /* sync prune candidates in batches */
2812 VERIFY0(dsl_sync_task(spa_name(spa),
2813 NULL, prune_candidates_sync,
2814 &dpi, 0, ZFS_SPACE_CHECK_NONE));
2815 candidates = 1;
2816 }
2817 ddt_prune_entry(&dpi.dpi_candidates, ddt,
2818 &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
2819 }
2820
2821 /* build a histogram */
2822 if (histogram != NULL) {
2823 uint64_t age = MAX(1, (now - class_start) / 3600);
2824 int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
2825 histogram->dah_entries++;
2826 histogram->dah_age_histo[bin]++;
2827 }
2828
2829 valid++;
2830 }
2831
2832 if (pruning && valid > 0) {
2833 if (!list_is_empty(&dpi.dpi_candidates)) {
2834 /* sync out final batch of prune candidates */
2835 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
2836 prune_candidates_sync, &dpi, 0,
2837 ZFS_SPACE_CHECK_NONE));
2838 }
2839 list_destroy(&dpi.dpi_candidates);
2840
2841 zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs",
2842 (u_longlong_t)dpi.dpi_pruned,
2843 (int)((dpi.dpi_pruned * 100) / valid),
2844 (u_longlong_t)dpi.dpi_txg_syncs);
2845 }
2846 }
2847
2848 static uint64_t
2849 ddt_total_entries(spa_t *spa)
2850 {
2851 ddt_object_t ddo;
2852 ddt_get_dedup_object_stats(spa, &ddo);
2853
2854 return (ddo.ddo_count);
2855 }
2856
2857 int
2858 ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
2859 uint64_t amount)
2860 {
2861 uint64_t cutoff;
2862 uint64_t start_time = gethrtime();
2863
2864 if (spa->spa_active_ddt_prune)
2865 return (SET_ERROR(EALREADY));
2866 if (ddt_total_entries(spa) == 0)
2867 return (0);
2868
2869 spa->spa_active_ddt_prune = B_TRUE;
2870
2871 zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
2872 unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");
2873
2874 if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
2875 ddt_age_histo_t histogram;
2876 uint64_t oldest = 0;
2877
2878 /* Make a pass over DDT to build a histogram */
2879 ddt_prune_walk(spa, 0, &histogram);
2880
2881 int target = (histogram.dah_entries * amount) / 100;
2882
2883 /*
2884 * Figure out our cutoff date
2885 * (i.e., which bins to prune from)
2886 */
2887 for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
2888 if (histogram.dah_age_histo[i] != 0) {
2889 /* less than this bucket remaining */
2890 if (target < histogram.dah_age_histo[i]) {
2891 oldest = MAX(1, (1<<i) * 3600);
2892 target = 0;
2893 } else {
2894 target -= histogram.dah_age_histo[i];
2895 }
2896 }
2897 }
2898 cutoff = gethrestime_sec() - oldest;
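
		/*
		 * Worked example (illustrative only): with 1000 unique
		 * entries and amount = 25, target starts at 250. If bin 5
		 * (~32-63 hours old) holds 100 entries and bin 4 (~16-31
		 * hours) holds 200, bin 5 is consumed (target = 150) and the
		 * target lands in bin 4, so oldest = (1 << 4) * 3600 = 57600
		 * seconds and the cutoff prunes anything whose class_start
		 * is more than ~16 hours old.
		 */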
2899
2900 if (ddt_dump_prune_histogram)
2901 ddt_dump_age_histogram(&histogram, cutoff);
2902 } else if (unit == ZPOOL_DDT_PRUNE_AGE) {
2903 cutoff = gethrestime_sec() - amount;
2904 } else {
2905 		spa->spa_active_ddt_prune = B_FALSE;
		return (SET_ERROR(EINVAL));
2906 }
2907
2908 if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
2909 		/* Traverse DDT to prune entries older than our cutoff */
2910 ddt_prune_walk(spa, cutoff, NULL);
2911 }
2912
2913 zfs_dbgmsg("%s: prune completed in %llu ms",
2914 spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
2915
2916 spa->spa_active_ddt_prune = B_FALSE;
2917 return (0);
2918 }
2919
2920 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
2921 "Enable prefetching dedup-ed blks");
2922
2923 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
2924 "Min time to spend on incremental dedup log flush each transaction");
2925
2926 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
2927 "Min number of log entries to flush each transaction");
2928
2929 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW,
2930 "Max number of log entries to flush each transaction");
2931
2932 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW,
2933 "Number of TXGs to try to rotate the log in");
2934
2935 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW,
2936 "Soft cap for the size of the current dedup log");
2937
2938 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW,
2939 "Whether to use the soft cap as a hard cap");
2940
2941 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
2942 "Number of txgs to average flow rates across");
2943
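
/*
 * Tuning note (illustrative, platform assumption): on Linux these surface as
 * module options with the zfs_dedup_ prefix, e.g.
 *
 *	echo 2000 > /sys/module/zfs/parameters/zfs_dedup_log_flush_entries_min
 *
 * On FreeBSD the equivalents are expected under the vfs.zfs.dedup.* sysctl
 * tree. See zfs(4) for the authoritative names and defaults.
 */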