// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;


static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define	DDT_LOG_ENTRY_FLAT_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_LOG_ENTRY_TRAD_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define	DDT_LOG_ENTRY_SIZE(ddt)	\
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

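/*
 * Set up the log entry kmem caches, and work out the memory cap for the
 * in-memory log trees.
 */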
void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}

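/*
 * Tear down the entry caches created by ddt_log_init().
 */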
void
ddt_log_fini(void)
{
	kmem_cache_destroy(ddt_log_entry_trad_cache);
	kmem_cache_destroy(ddt_log_entry_flat_cache);
}

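/*
 * Format the name under which log object n is stored in the DDT directory
 * ZAP for this table's checksum.
 */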
static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

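/*
 * Write the current in-memory log state (version, flags, length, first txg
 * and checkpoint key) into the header kept in the log object's bonus buffer.
 */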
static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}

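/*
 * Create log object n, record it in the DDT directory ZAP, and initialise
 * its header.
 */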
static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT0(ddl->ddl_object);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
	ddl->ddl_length = 0;
	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}

static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

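/*
 * Destroy log object n (if it exists) and remove its entry from the DDT
 * directory ZAP. The log must already be empty.
 */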
static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_update_stats(ddt_t *ddt)
{
	/*
	 * Log object stats. We count the number of live entries in the log
	 * tree, even if that is more than are on disk, and even if the same
	 * entry is on both the append and flush trees, because that's closer
	 * to what the user expects to see. This does mean the on-disk size is
	 * not really correlated with the number of entries, but I don't think
	 * that's reasonable to expect anyway.
	 */
	dmu_object_info_t doi;
	uint64_t nblocks = 0;
	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object,
	    &doi) == 0)
		nblocks += doi.doi_physical_blocks_512;
	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object,
	    &doi) == 0)
		nblocks += doi.doi_physical_blocks_512;

	ddt_object_t *ddo = &ddt->ddt_log_stats;
	ddo->ddo_count =
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
	ddo->ddo_dspace = nblocks << 9;
}

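/*
 * Begin an append of up to nentries entries to the active log. Creates the
 * log objects on first use, computes the fixed record length, and holds the
 * dnode and the range of buffers the records will be written into. The
 * caller then adds entries with ddt_log_entry() and finishes with
 * ddt_log_commit().
 */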
void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT0P(dlu->dlu_dbp);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}

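/*
 * Allocate a zeroed in-memory log entry from the cache matching this table's
 * phys variant (flat or traditional).
 */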
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
	ddt_log_entry_t *ddle;

	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
	} else {
		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
	}

	return (ddle);
}

static void
ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
{
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
}

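/*
 * Insert or update the in-memory log tree entry for this key.
 */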
static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	}
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

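/*
 * Add an entry to the append started by ddt_log_begin(): update the
 * in-memory tree and histogram, then write the on-disk record into the next
 * slot in the held buffers.
 */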
void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	/* Get our block */
	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill_flags(db, dlu->dlu_tx, B_FALSE,
		    DMU_UNCACHEDIO);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}

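/*
 * Finish an append: close out the last block, release the buffers and dnode,
 * extend the log length, and rewrite the header to match.
 */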
void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}

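/*
 * Remove the first (smallest-keyed) entry from the log tree and return it
 * via ddlwe. Returns B_FALSE if the tree is empty.
 */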
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	ddt_log_free_entry(ddt, ddle);

	return (B_TRUE);
}

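/*
 * Remove the entry for the given key from the log tree, if present. Returns
 * B_TRUE if an entry was found and removed. Note that this only affects the
 * in-memory tree; any record already written to disk remains there until the
 * log is truncated.
 */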
boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	ddt_lightweight_entry_t ddlwe;
	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	ddt_log_free_entry(ddt, ddle);

	return (B_TRUE);
}

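/*
 * Look the key up in the active log tree, then the flushing one. If found,
 * optionally copy the entry out via ddlwe.
 */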
boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle =
	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
	if (!ddle)
		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
	if (!ddle)
		return (B_FALSE);
	if (ddlwe)
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
	return (B_TRUE);
}

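/*
 * Record ddlwe as the flush checkpoint: everything on the flushing log up to
 * and including this key has been flushed, so a later reload can skip those
 * records.
 */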
void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

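/*
 * Discard the on-disk contents of the flushing log once all of its entries
 * have been flushed, clearing the checkpoint and resetting the header.
 */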
void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

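/*
 * Decide whether it is time to swap the active and flushing logs, and if so,
 * do it. Returns B_TRUE if the logs were swapped.
 */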
boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is over
	 * half of the memory limit. This effectively gives each log tree half
	 * the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}

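/*
 * Apply a single on-disk log record to the in-memory log tree during load,
 * skipping records at or before the flush checkpoint.
 */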
static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
    const ddt_key_t *checkpoint)
{
	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)dlr->dlr_payload;
	if (checkpoint != NULL &&
	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
		/* Skip pre-checkpoint entries; they're already flushed. */
		return;
	}

	ddt_lightweight_entry_t ddlwe;
	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);

	ddlwe.ddlwe_key = dlre->dlre_key;
	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));

	ddt_log_update_entry(ddt, ddl, &ddlwe);
}

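/*
 * Free every entry on the log's in-memory tree, leaving it empty.
 */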
static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
	void *cookie = NULL;
	ddt_log_entry_t *ddle;
	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
	while ((ddle =
	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
		ddt_log_free_entry(ddt, ddle);
	}
	ASSERT(avl_is_empty(&ddl->ddl_tree));
}

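/*
 * Load log object n from disk: look up the object, validate the header, and
 * replay all of its records into the in-memory tree. A missing log object is
 * not an error; it simply hasn't been created yet.
 */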
static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
	ASSERT3U(n, <, 2);

	ddt_log_t *ddl = &ddt->ddt_log[n];

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	uint64_t obj;
	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &obj);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	dnode_t *dn;
	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
	if (err != 0)
		return (err);

	ddt_log_header_t hdr;
	dmu_buf_t *db;
	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, FTAG);
		return (err);
	}
	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
	dmu_buf_rele(db, FTAG);

	if (DLH_GET_VERSION(&hdr) != 1) {
		dnode_rele(dn, FTAG);
		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
		    (u_longlong_t)DLH_GET_VERSION(&hdr));
		return (SET_ERROR(EINVAL));
	}

	ddt_key_t *checkpoint = NULL;
	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
		/*
		 * If the log has a checkpoint, then we can ignore any entries
		 * that have already been flushed.
		 */
		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
		checkpoint = &hdr.dlh_checkpoint;
	}

	if (hdr.dlh_length > 0) {
		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
		    ZIO_PRIORITY_SYNC_READ);

		for (uint64_t offset = 0; offset < hdr.dlh_length;
		    offset += dn->dn_datablksz) {
			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
			    DMU_READ_PREFETCH | DMU_UNCACHEDIO);
			if (err != 0) {
				dnode_rele(dn, FTAG);
				ddt_log_empty(ddt, ddl);
				return (err);
			}

			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}

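/*
 * Load both log objects when the DDT is loaded, then rebuild the histogram
 * and drop flushing-tree entries that also appear on the active tree.
 */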
int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries that
	 *   appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees. This
	 *   happens when ddt_lookup() rehydrates an entry from the flushing
	 *   tree, as ddt_log_remove_key() removes the entry from the in-memory
	 *   tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);
				ddt_log_free_entry(ddt, ddle);
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}

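/*
 * Set up the in-memory log state: create both AVL trees and designate one
 * log as active and the other as flushing.
 */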
void
ddt_log_alloc(ddt_t *ddt)
{
	ASSERT0P(ddt->ddt_log_active);
	ASSERT0P(ddt->ddt_log_flushing);

	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	ddt->ddt_log_active = &ddt->ddt_log[0];
	ddt->ddt_log_flushing = &ddt->ddt_log[1];
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

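/*
 * Tear down the in-memory log state: free all entries and destroy both
 * trees.
 */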
void
ddt_log_free(ddt_t *ddt)
{
	ddt_log_empty(ddt, &ddt->ddt_log[0]);
	ddt_log_empty(ddt, &ddt->ddt_log[1]);
	avl_destroy(&ddt->ddt_log[0].ddl_tree);
	avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");