1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2016 by Delphix. All rights reserved.
25 * Copyright (c) 2023, Klara Inc.
26 */
27
28 #ifndef _SYS_DDT_IMPL_H
29 #define _SYS_DDT_IMPL_H
30
31 #include <sys/ddt.h>
32 #include <sys/bitops.h>
33
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37
/*
 * DDT version numbers.
 */
#define	DDT_VERSION_LEGACY		(0)
#define	DDT_VERSION_FDT			(1)

/*
 * Dummy version: signals that configure is still necessary.
 */
#define	DDT_VERSION_UNCONFIGURED	(UINT64_MAX)

/*
 * Names of interesting objects in the DDT root dir.
 */
#define	DDT_DIR_VERSION			"version"
#define	DDT_DIR_FLAGS			"flags"
48
/*
 * Fill a lightweight entry from a live entry.
 *
 * The lightweight entry is zeroed in full, then the key, type and class
 * are copied from the live entry, followed by DDT_PHYS_SIZE(ddt) bytes of
 * phys data.
 *
 * Every parameter is parenthesised at each use (including inside sizeof),
 * so pointer expressions such as `base + i` may be passed. The arguments
 * are expanded more than once, so they must not have side effects.
 */
#define	DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do {			\
	memset((ddlwe), 0, sizeof (*(ddlwe)));				\
	(ddlwe)->ddlwe_key = (dde)->dde_key;				\
	(ddlwe)->ddlwe_type = (dde)->dde_type;				\
	(ddlwe)->ddlwe_class = (dde)->dde_class;			\
	memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys,			\
	    DDT_PHYS_SIZE(ddt));					\
} while (0)
57
/*
 * Fill a lightweight entry from a log entry.
 *
 * The lightweight entry is zeroed in full, then the key, type and class
 * are copied from the log entry, followed by DDT_PHYS_SIZE(ddt) bytes of
 * phys data.
 *
 * Every parameter is parenthesised at each use (including inside sizeof),
 * so pointer expressions such as `base + i` may be passed. The arguments
 * are expanded more than once, so they must not have side effects.
 */
#define	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do {		\
	memset((ddlwe), 0, sizeof (*(ddlwe)));				\
	(ddlwe)->ddlwe_key = (ddle)->ddle_key;				\
	(ddlwe)->ddlwe_type = (ddle)->ddle_type;			\
	(ddlwe)->ddlwe_class = (ddle)->ddle_class;			\
	memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys,			\
	    DDT_PHYS_SIZE(ddt));					\
} while (0)
65
66 /*
67 * An entry on the log tree. These are "frozen", and a record of what's in
68 * the on-disk log. They can't be used in place, but can be "loaded" back into
69 * the live tree.
70 */
71 typedef struct {
72 ddt_key_t ddle_key; /* ddt_log_tree key */
73 avl_node_t ddle_node; /* ddt_log_tree node */
74
75 ddt_type_t ddle_type; /* storage type */
76 ddt_class_t ddle_class; /* storage class */
77
78 /* extra allocation for flat/trad phys */
79 ddt_univ_phys_t ddle_phys[];
80 } ddt_log_entry_t;
81
/*
 * Record types found in the on-disk log.
 */
typedef enum {
	/* end of block marker */
	DLR_INVALID	= 0,

	/* an entry to add or replace in the log tree */
	DLR_ENTRY	= 1,
} ddt_log_record_type_t;
87
/* On-disk log record header. */
typedef struct {
	/*
	 * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
	 * access it.
	 *
	 * The layout below follows those macros, assuming BF64_GET/BF64_SET
	 * take (value, low bit, length in bits):
	 *
	 * bits  0-7:  record type (ddt_log_record_type_t)
	 * bits  8-23: length of record header+payload (16-bit field, as
	 *             read and written by DLR_GET_RECLEN/DLR_SET_RECLEN)
	 * bits 24-47: reserved, all zero
	 * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type)
	 *             otherwise all zero
	 * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class)
	 *             otherwise all zero
	 */
	uint64_t	dlr_info;

	/* flexible array member; payload bytes start right after dlr_info */
	uint8_t		dlr_payload[];
} ddt_log_record_t;
105
/*
 * Accessors for the fields packed into dlr_info, built on
 * BF64_GET()/BF64_SET().
 */

/* record type */
#define	DLR_GET_TYPE(dlr)	\
	BF64_GET((dlr)->dlr_info, 0, 8)
#define	DLR_SET_TYPE(dlr, v)	\
	BF64_SET((dlr)->dlr_info, 0, 8, v)

/* record length */
#define	DLR_GET_RECLEN(dlr)	\
	BF64_GET((dlr)->dlr_info, 8, 16)
#define	DLR_SET_RECLEN(dlr, v)	\
	BF64_SET((dlr)->dlr_info, 8, 16, v)

/* entry storage type */
#define	DLR_GET_ENTRY_TYPE(dlr)	\
	BF64_GET((dlr)->dlr_info, 48, 8)
#define	DLR_SET_ENTRY_TYPE(dlr, v)	\
	BF64_SET((dlr)->dlr_info, 48, 8, v)

/* entry storage class */
#define	DLR_GET_ENTRY_CLASS(dlr)	\
	BF64_GET((dlr)->dlr_info, 56, 8)
#define	DLR_SET_ENTRY_CLASS(dlr, v)	\
	BF64_SET((dlr)->dlr_info, 56, 8, v)
114
115 /* Payload for DLR_ENTRY. */
116 typedef struct {
117 ddt_key_t dlre_key;
118 ddt_univ_phys_t dlre_phys[];
119 } ddt_log_record_entry_t;
120
/*
 * Log flags (ddl_flags, dlh_flags).
 */

/* this log is being flushed */
#define	DDL_FLAG_FLUSHING	(1 << 0)

/* header has a checkpoint */
#define	DDL_FLAG_CHECKPOINT	(1 << 1)
124
125 /* On-disk log header, stored in the bonus buffer. */
126 typedef struct {
127 /*
128 * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
129 * access it.
130 *
131 * bits 0-7: log version
132 * bits 8-15: log flags
133 * bits 16-63: reserved, all zero
134 */
135 uint64_t dlh_info;
136
137 uint64_t dlh_length; /* log size in bytes */
138 uint64_t dlh_first_txg; /* txg this log went active */
139 ddt_key_t dlh_checkpoint; /* last checkpoint */
140 } ddt_log_header_t;
141
/*
 * Accessors for the fields packed into dlh_info, built on
 * BF64_GET()/BF64_SET().
 */

/* log version */
#define	DLH_GET_VERSION(dlh)	\
	BF64_GET((dlh)->dlh_info, 0, 8)
#define	DLH_SET_VERSION(dlh, v)	\
	BF64_SET((dlh)->dlh_info, 0, 8, v)

/* log flags */
#define	DLH_GET_FLAGS(dlh)	\
	BF64_GET((dlh)->dlh_info, 8, 8)
#define	DLH_SET_FLAGS(dlh, v)	\
	BF64_SET((dlh)->dlh_info, 8, 8, v)
146
147 /* DDT log update state */
148 typedef struct {
149 dmu_tx_t *dlu_tx; /* tx the update is being applied to */
150 dnode_t *dlu_dn; /* log object dnode */
151 dmu_buf_t **dlu_dbp; /* array of block buffer pointers */
152 int dlu_ndbp; /* number of block buffer pointers */
153 uint16_t dlu_reclen; /* cached length of record */
154 uint64_t dlu_block; /* block for next entry */
155 uint64_t dlu_offset; /* offset for next entry */
156 } ddt_log_update_t;
157
158 /*
159 * Ops vector to access a specific DDT object type.
160 */
161 typedef struct {
162 char ddt_op_name[32];
163 int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
164 boolean_t prehash);
165 int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
166 int (*ddt_op_lookup)(objset_t *os, uint64_t object,
167 const ddt_key_t *ddk, void *phys, size_t psize);
168 int (*ddt_op_contains)(objset_t *os, uint64_t object,
169 const ddt_key_t *ddk);
170 void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
171 const ddt_key_t *ddk);
172 void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
173 int (*ddt_op_update)(objset_t *os, uint64_t object,
174 const ddt_key_t *ddk, const void *phys, size_t psize,
175 dmu_tx_t *tx);
176 int (*ddt_op_remove)(objset_t *os, uint64_t object,
177 const ddt_key_t *ddk, dmu_tx_t *tx);
178 int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
179 ddt_key_t *ddk, void *phys, size_t psize);
180 int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
181 } ddt_ops_t;
182
183 extern const ddt_ops_t ddt_zap_ops;
184
185 /* Dedup log API */
186 extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
187 ddt_log_update_t *dlu);
188 extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
189 ddt_log_update_t *dlu);
190 extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
191
192 extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
193 ddt_lightweight_entry_t *ddlwe);
194
195 extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
196 ddt_lightweight_entry_t *ddlwe);
197 extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
198 const ddt_key_t *ddk);
199
200 extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
201 dmu_tx_t *tx);
202 extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);
203
204 extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);
205
206 extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);
207
208 extern int ddt_log_load(ddt_t *ddt);
209 extern void ddt_log_alloc(ddt_t *ddt);
210 extern void ddt_log_free(ddt_t *ddt);
211
212 extern void ddt_log_init(void);
213 extern void ddt_log_fini(void);
214
215 /*
216 * These are only exposed so that zdb can access them. Try not to use them
217 * outside of the DDT implementation proper, and if you do, consider moving
218 * them up.
219 */
220
/*
 * A percentage request is converted into a cutoff value by means of a
 * histogram; entries older than the cutoff get pruned.
 *
 * Histogram bins represent hours in power-of-two increments, so 16 bins
 * covers up to four years.
 */
#define	HIST_BINS	16

typedef struct ddt_age_histo {
	/* entry count */
	uint64_t	dah_entries;

	/* one counter per bin */
	uint64_t	dah_age_histo[HIST_BINS];
} ddt_age_histo_t;
234
235 void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram);
236
/*
 * ddt_dump_age_histogram(histogram, cutoff): print an age histogram as a
 * table, one row per bin.
 *
 * When _KERNEL is defined or ZFS_DEBUG is not, this is a macro that
 * discards its arguments and expands to a no-op. Otherwise it is a real
 * function that prints a heading with the cutoff expressed in hours, then
 * for each bin its age (1 << bin index), its count, and that count as a
 * percentage of dah_entries.
 */
#if defined(_KERNEL) || !defined(ZFS_DEBUG)
#define	ddt_dump_age_histogram(histo, cutoff)	((void)0)
#else
static inline void
ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff)
{
	/* Nothing to print; this also keeps the division below non-zero. */
	if (histogram->dah_entries == 0)
		return;

	(void) printf("DDT prune unique class age, %llu hour cutoff\n",
	    (u_longlong_t)(gethrestime_sec() - cutoff)/3600);
	(void) printf("%5s %9s %4s\n", "age", "blocks", "amnt");
	(void) printf("%5s %9s %4s\n", "-----", "---------", "----");
	for (int i = 0; i < HIST_BINS; i++) {
		(void) printf("%5d %9llu %4d%%\n", 1<<i,
		    (u_longlong_t)histogram->dah_age_histo[i],
		    (int)((histogram->dah_age_histo[i] * 100) /
		    histogram->dah_entries));
	}
}
#endif
258
/*
 * Name buffer length: enough room to expand the DMU_POOL_DDT format for
 * all possible DDT checksum/class/type combinations.
 */
#define	DDT_NAMELEN	32
264
265 extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
266 const ddt_univ_phys_t *ddp);
267
268 extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
269
270 extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
271 char *name);
272 extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
273 uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
274 extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
275 uint64_t *count);
276 extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
277 dmu_object_info_t *);
278
279 #ifdef __cplusplus
280 }
281 #endif
282
#endif /* _SYS_DDT_IMPL_H */
284