1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2016 by Delphix. All rights reserved. 25 * Copyright (c) 2023, Klara Inc. 26 */ 27 28 #ifndef _SYS_DDT_H 29 #define _SYS_DDT_H 30 31 #include <sys/sysmacros.h> 32 #include <sys/types.h> 33 #include <sys/fs/zfs.h> 34 #include <sys/zio.h> 35 #include <sys/dmu.h> 36 #include <sys/wmsum.h> 37 38 #ifdef __cplusplus 39 extern "C" { 40 #endif 41 42 struct abd; 43 44 /* 45 * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). 46 */ 47 #define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ 48 #define DDT_FLAG_LOG (1 << 1) /* dedup log (journal) */ 49 #define DDT_FLAG_MASK (DDT_FLAG_FLAT|DDT_FLAG_LOG) 50 51 /* 52 * DDT on-disk storage object types. Each one corresponds to specific 53 * implementation, see ddt_ops_t. The value itself is not stored on disk. 54 * 55 * When searching for an entry, objects types will be searched in this order. 56 * 57 * Note that DDT_TYPES is used as the "no type" for new entries that have not 58 * yet been written to a storage object. 59 */ 60 typedef enum { 61 DDT_TYPE_ZAP = 0, /* ZAP storage object, ddt_zap */ 62 DDT_TYPES 63 } ddt_type_t; 64 65 _Static_assert(DDT_TYPES <= UINT8_MAX, 66 "ddt_type_t must fit in a uint8_t"); 67 68 /* New and updated entries recieve this type, see ddt_sync_entry() */ 69 #define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP) 70 71 /* 72 * DDT storage classes. Each class has a separate storage object for each type. 73 * The value itself is not stored on disk. 74 * 75 * When search for an entry, object classes will be searched in this order. 76 * 77 * Note that DDT_CLASSES is used as the "no class" for new entries that have not 78 * yet been written to a storage object. 79 */ 80 typedef enum { 81 DDT_CLASS_DITTO = 0, /* entry has ditto blocks (obsolete) */ 82 DDT_CLASS_DUPLICATE, /* entry has multiple references */ 83 DDT_CLASS_UNIQUE, /* entry has a single reference */ 84 DDT_CLASSES 85 } ddt_class_t; 86 87 _Static_assert(DDT_CLASSES < UINT8_MAX, 88 "ddt_class_t must fit in a uint8_t"); 89 90 /* 91 * The "key" part of an on-disk entry. This is the unique "name" for a block, 92 * that is, that parts of the block pointer that will always be the same for 93 * the same data. 94 */ 95 typedef struct { 96 zio_cksum_t ddk_cksum; /* 256-bit block checksum */ 97 /* 98 * Encoded with logical & physical size, encryption, and compression, 99 * as follows: 100 * +-------+-------+-------+-------+-------+-------+-------+-------+ 101 * | 0 | 0 | 0 |X| comp| PSIZE | LSIZE | 102 * +-------+-------+-------+-------+-------+-------+-------+-------+ 103 */ 104 uint64_t ddk_prop; 105 } ddt_key_t; 106 107 /* 108 * Macros for accessing parts of a ddt_key_t. These are similar to their BP_* 109 * counterparts. 110 */ 111 #define DDK_GET_LSIZE(ddk) \ 112 BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) 113 #define DDK_SET_LSIZE(ddk, x) \ 114 BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) 115 116 #define DDK_GET_PSIZE(ddk) \ 117 BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) 118 #define DDK_SET_PSIZE(ddk, x) \ 119 BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) 120 121 #define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 7) 122 #define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 7, x) 123 124 #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) 125 #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) 126 127 /* 128 * The "value" part for an on-disk entry. These are the "physical" 129 * characteristics of the stored block, such as its location on disk (DVAs), 130 * birth txg and ref count. 131 * 132 * The "traditional" entry has an array of four, one for each number of DVAs 133 * (copies= property) and another for additional "ditto" copies. Users of the 134 * traditional struct will specify the variant (index) of the one they want. 135 * 136 * The newer "flat" entry has only a single form that is specified using the 137 * DDT_PHYS_FLAT variant. 138 * 139 * Since the value size varies, use one of the size macros when interfacing 140 * with the ddt zap. 141 */ 142 143 #define DDT_PHYS_MAX (4) 144 145 /* 146 * Note - this can be used in a flexible array and allocated for 147 * a specific size (ddp_trad or ddp_flat). So be careful not to 148 * copy using "=" assignment but instead use ddt_phys_copy(). 149 */ 150 typedef union { 151 /* 152 * Traditional physical payload value for DDT zap (256 bytes) 153 */ 154 struct { 155 dva_t ddp_dva[SPA_DVAS_PER_BP]; 156 uint64_t ddp_refcnt; 157 uint64_t ddp_phys_birth; 158 } ddp_trad[DDT_PHYS_MAX]; 159 160 /* 161 * Flat physical payload value for DDT zap (72 bytes) 162 */ 163 struct { 164 dva_t ddp_dva[SPA_DVAS_PER_BP]; 165 uint64_t ddp_refcnt; 166 uint64_t ddp_phys_birth; /* txg based from BP */ 167 uint64_t ddp_class_start; /* in realtime seconds */ 168 } ddp_flat; 169 } ddt_univ_phys_t; 170 171 /* 172 * This enum denotes which variant of a ddt_univ_phys_t to target. For 173 * a traditional DDT entry, it represents the indexes into the ddp_trad 174 * array. Any consumer of a ddt_univ_phys_t needs to know which variant 175 * is being targeted. 176 * 177 * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, 178 * we maintain the ability to free existing dedup-ditto blocks. 179 */ 180 181 typedef enum { 182 DDT_PHYS_DITTO = 0, 183 DDT_PHYS_SINGLE = 1, 184 DDT_PHYS_DOUBLE = 2, 185 DDT_PHYS_TRIPLE = 3, 186 DDT_PHYS_FLAT = 4, 187 DDT_PHYS_NONE = 5 188 } ddt_phys_variant_t; 189 190 #define DDT_PHYS_VARIANT(ddt, p) \ 191 (ASSERT((p) < DDT_PHYS_NONE), \ 192 ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p))) 193 194 #define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad) 195 #define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat) 196 197 #define _DDT_PHYS_SWITCH(ddt, flat, trad) \ 198 (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad)) 199 200 #define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \ 201 DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE) 202 203 #define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX) 204 #define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p) 205 #define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0)) 206 207 /* 208 * A "live" entry, holding changes to an entry made this txg, and other data to 209 * support loading, updating and repairing the entry. 210 */ 211 212 /* State flags for dde_flags */ 213 #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ 214 #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ 215 #define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ 216 #define DDE_FLAG_FROM_FLUSHING (1 << 3) /* loaded from flushing log */ 217 218 /* 219 * Additional data to support entry update or repair. This is fixed size 220 * because its relatively rarely used. 221 */ 222 typedef struct { 223 /* protects dde_phys, dde_orig_phys and dde_lead_zio during I/O */ 224 kmutex_t dde_io_lock; 225 226 /* copy of data after a repair read, to be rewritten */ 227 abd_t *dde_repair_abd; 228 229 /* original phys contents before update, for error handling */ 230 ddt_univ_phys_t dde_orig_phys; 231 232 /* in-flight update IOs */ 233 zio_t *dde_lead_zio[DDT_PHYS_MAX]; 234 } ddt_entry_io_t; 235 236 typedef struct { 237 /* key must be first for ddt_key_compare */ 238 ddt_key_t dde_key; /* ddt_tree key */ 239 avl_node_t dde_node; /* ddt_tree_node */ 240 241 /* storage type and class the entry was loaded from */ 242 ddt_type_t dde_type; 243 ddt_class_t dde_class; 244 245 uint8_t dde_flags; /* load state flags */ 246 kcondvar_t dde_cv; /* signaled when load completes */ 247 uint64_t dde_waiters; /* count of waiters on dde_cv */ 248 249 ddt_entry_io_t *dde_io; /* IO support, when required */ 250 251 ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */ 252 } ddt_entry_t; 253 254 /* 255 * A lightweight entry is for short-lived or transient uses, like iterating or 256 * inspecting, when you don't care where it came from. 257 */ 258 typedef struct { 259 ddt_key_t ddlwe_key; 260 ddt_type_t ddlwe_type; 261 ddt_class_t ddlwe_class; 262 ddt_univ_phys_t ddlwe_phys; 263 } ddt_lightweight_entry_t; 264 265 /* 266 * In-core DDT log. A separate struct to make it easier to switch between the 267 * appending and flushing logs. 268 */ 269 typedef struct { 270 avl_tree_t ddl_tree; /* logged entries */ 271 uint32_t ddl_flags; /* flags for this log */ 272 uint64_t ddl_object; /* log object id */ 273 uint64_t ddl_length; /* on-disk log size */ 274 uint64_t ddl_first_txg; /* txg log became active */ 275 ddt_key_t ddl_checkpoint; /* last checkpoint */ 276 } ddt_log_t; 277 278 /* 279 * In-core DDT object. This covers all entries and stats for a the whole pool 280 * for a given checksum type. 281 */ 282 typedef struct { 283 kmutex_t ddt_lock; /* protects changes to all fields */ 284 avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ 285 avl_tree_t ddt_repair_tree; /* entries being repaired */ 286 287 /* 288 * Log trees are stable during I/O, and only modified during sync 289 * with exclusive access. 290 */ 291 ddt_log_t ddt_log[2] ____cacheline_aligned; /* logged entries */ 292 ddt_log_t *ddt_log_active; /* pointers into ddt_log */ 293 ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ 294 295 int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ 296 int32_t ddt_log_flush_rate; /* rolling log flush rate */ 297 int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ 298 uint32_t ddt_log_flush_pressure; /* pressure to apply for cap */ 299 uint32_t ddt_log_flush_prev_backlog; /* prev backlog size */ 300 301 uint64_t ddt_flush_force_txg; /* flush hard before this txg */ 302 303 kstat_t *ddt_ksp; /* kstats context */ 304 305 /* wmsums for hot-path lookup counters */ 306 wmsum_t ddt_kstat_dds_lookup; 307 wmsum_t ddt_kstat_dds_lookup_live_hit; 308 wmsum_t ddt_kstat_dds_lookup_live_wait; 309 wmsum_t ddt_kstat_dds_lookup_live_miss; 310 wmsum_t ddt_kstat_dds_lookup_existing; 311 wmsum_t ddt_kstat_dds_lookup_new; 312 wmsum_t ddt_kstat_dds_lookup_log_hit; 313 wmsum_t ddt_kstat_dds_lookup_log_active_hit; 314 wmsum_t ddt_kstat_dds_lookup_log_flushing_hit; 315 wmsum_t ddt_kstat_dds_lookup_log_miss; 316 wmsum_t ddt_kstat_dds_lookup_stored_hit; 317 wmsum_t ddt_kstat_dds_lookup_stored_miss; 318 319 enum zio_checksum ddt_checksum; /* checksum algorithm in use */ 320 spa_t *ddt_spa; /* pool this ddt is on */ 321 objset_t *ddt_os; /* ddt objset (always MOS) */ 322 323 uint64_t ddt_dir_object; /* MOS dir holding ddt objects */ 324 uint64_t ddt_version; /* DDT version */ 325 uint64_t ddt_flags; /* FDT option flags */ 326 327 /* per-type/per-class entry store objects */ 328 uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; 329 dnode_t *ddt_object_dnode[DDT_TYPES][DDT_CLASSES]; 330 331 /* object ids for stored, logged and per-type/per-class stats */ 332 uint64_t ddt_stat_object; 333 ddt_object_t ddt_log_stats; 334 ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; 335 336 /* type/class stats by power-2-sized referenced blocks */ 337 ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; 338 ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; 339 340 /* log stats power-2-sized referenced blocks */ 341 ddt_histogram_t ddt_log_histogram; 342 } ddt_t; 343 344 /* 345 * In-core and on-disk bookmark for DDT walks. This is a cursor for ddt_walk(), 346 * and is stable across calls, even if the DDT is updated, the pool is 347 * restarted or loaded on another system, or OpenZFS is upgraded. 348 */ 349 typedef struct { 350 uint64_t ddb_class; 351 uint64_t ddb_type; 352 uint64_t ddb_checksum; 353 uint64_t ddb_cursor; 354 } ddt_bookmark_t; 355 356 extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, 357 blkptr_t *bp, uint64_t txg); 358 extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, 359 const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp); 360 361 extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, 362 const blkptr_t *bp); 363 extern void ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig, 364 ddt_phys_variant_t v); 365 extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, 366 ddt_phys_variant_t v); 367 extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); 368 extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); 369 extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); 370 extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, 371 ddt_phys_variant_t v); 372 extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, 373 const ddt_entry_t *dde, const blkptr_t *bp); 374 extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, 375 ddt_phys_variant_t v); 376 extern int ddt_phys_is_gang(const ddt_univ_phys_t *ddp, 377 ddt_phys_variant_t v); 378 extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, 379 boolean_t encrypted); 380 381 extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, 382 const ddt_lightweight_entry_t *ddlwe); 383 extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, 384 const ddt_lightweight_entry_t *ddlwe); 385 386 extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); 387 extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh); 388 extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); 389 390 extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); 391 extern uint64_t ddt_get_ddt_dsize(spa_t *spa); 392 extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); 393 extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); 394 395 extern uint64_t ddt_get_dedup_dspace(spa_t *spa); 396 extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); 397 extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize); 398 399 extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); 400 extern void ddt_enter(ddt_t *ddt); 401 extern void ddt_exit(ddt_t *ddt); 402 extern void ddt_init(void); 403 extern void ddt_fini(void); 404 extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, 405 boolean_t verify); 406 extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); 407 extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); 408 extern void ddt_prefetch_all(spa_t *spa); 409 410 extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, 411 const blkptr_t *bp); 412 413 extern void ddt_alloc_entry_io(ddt_entry_t *dde); 414 415 extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); 416 extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); 417 418 extern int ddt_key_compare(const void *x1, const void *x2); 419 420 extern void ddt_create(spa_t *spa); 421 extern int ddt_load(spa_t *spa); 422 extern void ddt_unload(spa_t *spa); 423 extern void ddt_sync(spa_t *spa, uint64_t txg); 424 425 extern void ddt_walk_init(spa_t *spa, uint64_t txg); 426 extern boolean_t ddt_walk_ready(spa_t *spa); 427 extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, 428 ddt_lightweight_entry_t *ddlwe); 429 430 extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); 431 432 extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, 433 uint64_t amount); 434 435 #ifdef __cplusplus 436 } 437 #endif 438 439 #endif /* _SYS_DDT_H */ 440