xref: /freebsd/sys/contrib/openzfs/module/zfs/ddt_log.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2023, Klara Inc.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/ddt.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/dmu.h>
31 #include <sys/ddt_impl.h>
32 #include <sys/dnode.h>
33 #include <sys/dbuf.h>
34 #include <sys/zap.h>
35 #include <sys/zio_checksum.h>
36 
37 /*
38  * No more than this many txgs before swapping logs.
39  */
40 uint_t zfs_dedup_log_txg_max = 8;
41 
42 /*
43  * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
44  * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
45  */
46 uint64_t zfs_dedup_log_mem_max = 0;
47 uint_t zfs_dedup_log_mem_max_percent = 1;
48 
49 
50 static kmem_cache_t *ddt_log_entry_flat_cache;
51 static kmem_cache_t *ddt_log_entry_trad_cache;
52 
53 #define	DDT_LOG_ENTRY_FLAT_SIZE	\
54 	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
55 #define	DDT_LOG_ENTRY_TRAD_SIZE	\
56 	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
57 
58 #define	DDT_LOG_ENTRY_SIZE(ddt)	\
59 	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
60 
61 void
62 ddt_log_init(void)
63 {
64 	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
65 	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
66 	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
67 	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
68 
69 	/*
70 	 * Max memory for log AVL entries. At least 1M, because we need
71 	 * something (that's ~3800 entries per tree). They can say 100% if they
72 	 * want; it just means they're at the mercy of the the txg flush limit.
73 	 */
74 	if (zfs_dedup_log_mem_max == 0) {
75 		zfs_dedup_log_mem_max_percent =
76 		    MIN(zfs_dedup_log_mem_max_percent, 100);
77 		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
78 		    zfs_dedup_log_mem_max_percent / 100;
79 	}
80 	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
81 }
82 
83 void
84 ddt_log_fini(void)
85 {
86 	kmem_cache_destroy(ddt_log_entry_trad_cache);
87 	kmem_cache_destroy(ddt_log_entry_flat_cache);
88 }
89 
90 static void
91 ddt_log_name(ddt_t *ddt, char *name, uint_t n)
92 {
93 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
94 	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
95 }
96 
97 static void
98 ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
99 {
100 	dmu_buf_t *db;
101 	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
102 	dmu_buf_will_dirty(db, tx);
103 
104 	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
105 	DLH_SET_VERSION(hdr, 1);
106 	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
107 	hdr->dlh_length = ddl->ddl_length;
108 	hdr->dlh_first_txg = ddl->ddl_first_txg;
109 	hdr->dlh_checkpoint = ddl->ddl_checkpoint;
110 
111 	dmu_buf_rele(db, FTAG);
112 }
113 
114 static void
115 ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
116 {
117 	ASSERT3U(ddt->ddt_dir_object, >, 0);
118 	ASSERT3U(ddl->ddl_object, ==, 0);
119 
120 	char name[DDT_NAMELEN];
121 	ddt_log_name(ddt, name, n);
122 
123 	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
124 	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
125 	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
126 	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
127 	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
128 	ddl->ddl_length = 0;
129 	ddl->ddl_first_txg = tx->tx_txg;
130 	ddt_log_update_header(ddt, ddl, tx);
131 }
132 
133 static void
134 ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
135 {
136 	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
137 	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
138 }
139 
140 static void
141 ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
142 {
143 	ASSERT3U(ddt->ddt_dir_object, >, 0);
144 
145 	if (ddl->ddl_object == 0)
146 		return;
147 
148 	ASSERT0(ddl->ddl_length);
149 
150 	char name[DDT_NAMELEN];
151 	ddt_log_name(ddt, name, n);
152 
153 	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
154 	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
155 
156 	ddl->ddl_object = 0;
157 }
158 
159 void
160 ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
161 {
162 	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
163 	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
164 }
165 
166 static void
167 ddt_log_update_stats(ddt_t *ddt)
168 {
169 	/*
170 	 * Log object stats. We count the number of live entries in the log
171 	 * tree, even if there are more than on disk, and even if the same
172 	 * entry is on both append and flush trees, because that's more what
173 	 * the user expects to see. This does mean the on-disk size is not
174 	 * really correlated with the number of entries, but I don't think
175 	 * that's reasonable to expect anyway.
176 	 */
177 	dmu_object_info_t doi;
178 	uint64_t nblocks;
179 	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
180 	nblocks = doi.doi_physical_blocks_512;
181 	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
182 	nblocks += doi.doi_physical_blocks_512;
183 
184 	ddt_object_t *ddo = &ddt->ddt_log_stats;
185 	ddo->ddo_count =
186 	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
187 	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
188 	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
189 	ddo->ddo_dspace = nblocks << 9;
190 }
191 
192 void
193 ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
194 {
195 	ASSERT3U(nentries, >, 0);
196 	ASSERT3P(dlu->dlu_dbp, ==, NULL);
197 
198 	if (ddt->ddt_log_active->ddl_object == 0)
199 		ddt_log_create(ddt, tx);
200 
201 	/*
202 	 * We want to store as many entries as we can in a block, but never
203 	 * split an entry across block boundaries.
204 	 */
205 	size_t reclen = P2ALIGN_TYPED(
206 	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
207 	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
208 	ASSERT3U(reclen, <=, UINT16_MAX);
209 	dlu->dlu_reclen = reclen;
210 
211 	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
212 	    &dlu->dlu_dn));
213 	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
214 
215 	uint64_t nblocks = howmany(nentries,
216 	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
217 	uint64_t offset = ddt->ddt_log_active->ddl_length;
218 	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
219 
220 	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
221 	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
222 	    DMU_READ_NO_PREFETCH));
223 
224 	dlu->dlu_tx = tx;
225 	dlu->dlu_block = dlu->dlu_offset = 0;
226 }
227 
228 static ddt_log_entry_t *
229 ddt_log_alloc_entry(ddt_t *ddt)
230 {
231 	ddt_log_entry_t *ddle;
232 
233 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
234 		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
235 		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
236 	} else {
237 		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
238 		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
239 	}
240 
241 	return (ddle);
242 }
243 
244 static void
245 ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
246 {
247 	/* Create the log tree entry from a live or stored entry */
248 	avl_index_t where;
249 	ddt_log_entry_t *ddle =
250 	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
251 	if (ddle == NULL) {
252 		ddle = ddt_log_alloc_entry(ddt);
253 		ddle->ddle_key = ddlwe->ddlwe_key;
254 		avl_insert(&ddl->ddl_tree, ddle, where);
255 	}
256 	ddle->ddle_type = ddlwe->ddlwe_type;
257 	ddle->ddle_class = ddlwe->ddlwe_class;
258 	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
259 }
260 
261 void
262 ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
263 {
264 	ASSERT3U(dlu->dlu_dbp, !=, NULL);
265 
266 	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
267 	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
268 
269 	/* Get our block */
270 	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
271 	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
272 
273 	/*
274 	 * If this would take us past the end of the block, finish it and
275 	 * move to the next one.
276 	 */
277 	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
278 		ASSERT3U(dlu->dlu_offset, >, 0);
279 		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
280 		dlu->dlu_block++;
281 		dlu->dlu_offset = 0;
282 		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
283 		db = dlu->dlu_dbp[dlu->dlu_block];
284 	}
285 
286 	/*
287 	 * If this is the first time touching the block, inform the DMU that
288 	 * we will fill it, and zero it out.
289 	 */
290 	if (dlu->dlu_offset == 0) {
291 		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
292 		memset(db->db_data, 0, db->db_size);
293 	}
294 
295 	/* Create the log record directly in the buffer */
296 	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
297 	DLR_SET_TYPE(dlr, DLR_ENTRY);
298 	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
299 	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
300 	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
301 
302 	ddt_log_record_entry_t *dlre =
303 	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
304 	dlre->dlre_key = ddlwe->ddlwe_key;
305 	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
306 
307 	/* Advance offset for next record. */
308 	dlu->dlu_offset += dlu->dlu_reclen;
309 }
310 
311 void
312 ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
313 {
314 	ASSERT3U(dlu->dlu_dbp, !=, NULL);
315 	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
316 	ASSERT3U(dlu->dlu_offset, >, 0);
317 
318 	/*
319 	 * Close out the last block. Whatever we haven't used will be zeroed,
320 	 * which matches DLR_INVALID, so we can detect this during load.
321 	 */
322 	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
323 
324 	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
325 
326 	ddt->ddt_log_active->ddl_length +=
327 	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
328 	dnode_rele(dlu->dlu_dn, FTAG);
329 
330 	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
331 
332 	memset(dlu, 0, sizeof (ddt_log_update_t));
333 
334 	ddt_log_update_stats(ddt);
335 }
336 
337 boolean_t
338 ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
339 {
340 	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
341 	if (ddle == NULL)
342 		return (B_FALSE);
343 
344 	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
345 
346 	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
347 
348 	avl_remove(&ddl->ddl_tree, ddle);
349 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
350 	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
351 
352 	return (B_TRUE);
353 }
354 
355 boolean_t
356 ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
357 {
358 	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
359 	if (ddle == NULL)
360 		return (B_FALSE);
361 
362 	ddt_lightweight_entry_t ddlwe;
363 	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
364 	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
365 
366 	avl_remove(&ddl->ddl_tree, ddle);
367 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
368 	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
369 
370 	return (B_TRUE);
371 }
372 
373 boolean_t
374 ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
375     ddt_lightweight_entry_t *ddlwe)
376 {
377 	ddt_log_entry_t *ddle =
378 	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
379 	if (!ddle)
380 		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
381 	if (!ddle)
382 		return (B_FALSE);
383 	if (ddlwe)
384 		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
385 	return (B_TRUE);
386 }
387 
388 void
389 ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
390 {
391 	ddt_log_t *ddl = ddt->ddt_log_flushing;
392 
393 	ASSERT3U(ddl->ddl_object, !=, 0);
394 
395 #ifdef ZFS_DEBUG
396 	/*
397 	 * There should not be any entries on the log tree before the given
398 	 * checkpoint. Assert that this is the case.
399 	 */
400 	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
401 	if (ddle != NULL)
402 		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
403 		    >, 0);
404 #endif
405 
406 	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
407 	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
408 	ddt_log_update_header(ddt, ddl, tx);
409 
410 	ddt_log_update_stats(ddt);
411 }
412 
413 void
414 ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
415 {
416 	ddt_log_t *ddl = ddt->ddt_log_flushing;
417 
418 	if (ddl->ddl_object == 0)
419 		return;
420 
421 	ASSERT(avl_is_empty(&ddl->ddl_tree));
422 
423 	/* Eject the entire object */
424 	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
425 
426 	ddl->ddl_length = 0;
427 	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
428 	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
429 	ddt_log_update_header(ddt, ddl, tx);
430 
431 	ddt_log_update_stats(ddt);
432 }
433 
434 boolean_t
435 ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
436 {
437 	/* Swap the logs. The old flushing one must be empty */
438 	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
439 
440 	/*
441 	 * If there are still blocks on the flushing log, truncate it first.
442 	 * This can happen if there were entries on the flushing log that were
443 	 * removed in memory via ddt_lookup(); their vestigal remains are
444 	 * on disk.
445 	 */
446 	if (ddt->ddt_log_flushing->ddl_length > 0)
447 		ddt_log_truncate(ddt, tx);
448 
449 	/*
450 	 * Swap policy. We swap the logs (and so begin flushing) when the
451 	 * active tree grows too large, or when we haven't swapped it in
452 	 * some amount of time, or if something has requested the logs be
453 	 * flushed ASAP (see ddt_walk_init()).
454 	 */
455 
456 	/*
457 	 * The log tree is too large if the memory usage of its entries is over
458 	 * half of the memory limit. This effectively gives each log tree half
459 	 * the available memory.
460 	 */
461 	const boolean_t too_large =
462 	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
463 	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
464 
465 	const boolean_t too_old =
466 	    tx->tx_txg >=
467 	    (ddt->ddt_log_active->ddl_first_txg +
468 	    MAX(1, zfs_dedup_log_txg_max));
469 
470 	const boolean_t force =
471 	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
472 
473 	if (!(too_large || too_old || force))
474 		return (B_FALSE);
475 
476 	ddt_log_t *swap = ddt->ddt_log_active;
477 	ddt->ddt_log_active = ddt->ddt_log_flushing;
478 	ddt->ddt_log_flushing = swap;
479 
480 	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
481 	ddt->ddt_log_active->ddl_flags &=
482 	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
483 
484 	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
485 	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
486 
487 	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
488 
489 	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
490 	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
491 
492 	ddt_log_update_stats(ddt);
493 
494 	return (B_TRUE);
495 }
496 
497 static inline void
498 ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
499     const ddt_key_t *checkpoint)
500 {
501 	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
502 
503 	ddt_log_record_entry_t *dlre =
504 	    (ddt_log_record_entry_t *)dlr->dlr_payload;
505 	if (checkpoint != NULL &&
506 	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
507 		/* Skip pre-checkpoint entries; they're already flushed. */
508 		return;
509 	}
510 
511 	ddt_lightweight_entry_t ddlwe;
512 	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
513 	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
514 
515 	ddlwe.ddlwe_key = dlre->dlre_key;
516 	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
517 
518 	ddt_log_update_entry(ddt, ddl, &ddlwe);
519 }
520 
521 static void
522 ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
523 {
524 	void *cookie = NULL;
525 	ddt_log_entry_t *ddle;
526 	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
527 	while ((ddle =
528 	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
529 		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
530 		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
531 	}
532 	ASSERT(avl_is_empty(&ddl->ddl_tree));
533 }
534 
535 static int
536 ddt_log_load_one(ddt_t *ddt, uint_t n)
537 {
538 	ASSERT3U(n, <, 2);
539 
540 	ddt_log_t *ddl = &ddt->ddt_log[n];
541 
542 	char name[DDT_NAMELEN];
543 	ddt_log_name(ddt, name, n);
544 
545 	uint64_t obj;
546 	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
547 	    sizeof (uint64_t), 1, &obj);
548 	if (err == ENOENT)
549 		return (0);
550 	if (err != 0)
551 		return (err);
552 
553 	dnode_t *dn;
554 	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
555 	if (err != 0)
556 		return (err);
557 
558 	ddt_log_header_t hdr;
559 	dmu_buf_t *db;
560 	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
561 	if (err != 0) {
562 		dnode_rele(dn, FTAG);
563 		return (err);
564 	}
565 	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
566 	dmu_buf_rele(db, FTAG);
567 
568 	if (DLH_GET_VERSION(&hdr) != 1) {
569 		dnode_rele(dn, FTAG);
570 		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
571 		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
572 		    (u_longlong_t)DLH_GET_VERSION(&hdr));
573 		return (SET_ERROR(EINVAL));
574 	}
575 
576 	ddt_key_t *checkpoint = NULL;
577 	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
578 		/*
579 		 * If the log has a checkpoint, then we can ignore any entries
580 		 * that have already been flushed.
581 		 */
582 		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
583 		checkpoint = &hdr.dlh_checkpoint;
584 	}
585 
586 	if (hdr.dlh_length > 0) {
587 		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
588 		    ZIO_PRIORITY_SYNC_READ);
589 
590 		for (uint64_t offset = 0; offset < hdr.dlh_length;
591 		    offset += dn->dn_datablksz) {
592 			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
593 			    DMU_READ_PREFETCH);
594 			if (err != 0) {
595 				dnode_rele(dn, FTAG);
596 				ddt_log_empty(ddt, ddl);
597 				return (err);
598 			}
599 
600 			uint64_t boffset = 0;
601 			while (boffset < db->db_size) {
602 				ddt_log_record_t *dlr =
603 				    (ddt_log_record_t *)(db->db_data + boffset);
604 
605 				/* Partially-filled block, skip the rest */
606 				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
607 					break;
608 
609 				switch (DLR_GET_TYPE(dlr)) {
610 				case DLR_ENTRY:
611 					ddt_log_load_entry(ddt, ddl, dlr,
612 					    checkpoint);
613 					break;
614 
615 				default:
616 					dmu_buf_rele(db, FTAG);
617 					dnode_rele(dn, FTAG);
618 					ddt_log_empty(ddt, ddl);
619 					return (SET_ERROR(EINVAL));
620 				}
621 
622 				boffset += DLR_GET_RECLEN(dlr);
623 			}
624 
625 			dmu_buf_rele(db, FTAG);
626 		}
627 	}
628 
629 	dnode_rele(dn, FTAG);
630 
631 	ddl->ddl_object = obj;
632 	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
633 	ddl->ddl_length = hdr.dlh_length;
634 	ddl->ddl_first_txg = hdr.dlh_first_txg;
635 
636 	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
637 		ddt->ddt_log_flushing = ddl;
638 	else
639 		ddt->ddt_log_active = ddl;
640 
641 	return (0);
642 }
643 
644 int
645 ddt_log_load(ddt_t *ddt)
646 {
647 	int err;
648 
649 	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
650 		/*
651 		 * The DDT is going to be freed again in a moment, so there's
652 		 * no point loading the log; it'll just slow down import.
653 		 */
654 		return (0);
655 	}
656 
657 	ASSERT0(ddt->ddt_log[0].ddl_object);
658 	ASSERT0(ddt->ddt_log[1].ddl_object);
659 	if (ddt->ddt_dir_object == 0) {
660 		/*
661 		 * If we're configured but the containing dir doesn't exist
662 		 * yet, then the log object can't possibly exist either.
663 		 */
664 		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
665 		return (SET_ERROR(ENOENT));
666 	}
667 
668 	if ((err = ddt_log_load_one(ddt, 0)) != 0)
669 		return (err);
670 	if ((err = ddt_log_load_one(ddt, 1)) != 0)
671 		return (err);
672 
673 	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
674 	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
675 	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
676 	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
677 
678 	/*
679 	 * We have two finalisation tasks:
680 	 *
681 	 * - rebuild the histogram. We do this at the end rather than while
682 	 *   we're loading so we don't need to uncount and recount entries that
683 	 *   appear multiple times in the log.
684 	 *
685 	 * - remove entries from the flushing tree that are on both trees. This
686 	 *   happens when ddt_lookup() rehydrates an entry from the flushing
687 	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
688 	 *   tree but doesn't remove it from disk.
689 	 */
690 
691 	/*
692 	 * We don't technically need a config lock here, since there shouldn't
693 	 * be pool config changes during DDT load. dva_get_dsize_sync() via
694 	 * ddt_stat_generate() is expecting it though, and it won't hurt
695 	 * anything, so we take it.
696 	 */
697 	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
698 
699 	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
700 	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
701 	ddt_log_entry_t *ae = avl_first(al);
702 	ddt_log_entry_t *fe = avl_first(fl);
703 	while (ae != NULL || fe != NULL) {
704 		ddt_log_entry_t *ddle;
705 		if (ae == NULL) {
706 			/* active exhausted, take flushing */
707 			ddle = fe;
708 			fe = AVL_NEXT(fl, fe);
709 		} else if (fe == NULL) {
710 			/* flushing exuhausted, take active */
711 			ddle = ae;
712 			ae = AVL_NEXT(al, ae);
713 		} else {
714 			/* compare active and flushing */
715 			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
716 			if (c < 0) {
717 				/* active behind, take and advance */
718 				ddle = ae;
719 				ae = AVL_NEXT(al, ae);
720 			} else if (c > 0) {
721 				/* flushing behind, take and advance */
722 				ddle = fe;
723 				fe = AVL_NEXT(fl, fe);
724 			} else {
725 				/* match. remove from flushing, take active */
726 				ddle = fe;
727 				fe = AVL_NEXT(fl, fe);
728 				avl_remove(fl, ddle);
729 
730 				ddle = ae;
731 				ae = AVL_NEXT(al, ae);
732 			}
733 		}
734 
735 		ddt_lightweight_entry_t ddlwe;
736 		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
737 		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
738 	}
739 
740 	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
741 
742 	ddt_log_update_stats(ddt);
743 
744 	return (0);
745 }
746 
747 void
748 ddt_log_alloc(ddt_t *ddt)
749 {
750 	ASSERT3P(ddt->ddt_log_active, ==, NULL);
751 	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
752 
753 	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
754 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
755 	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
756 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
757 	ddt->ddt_log_active = &ddt->ddt_log[0];
758 	ddt->ddt_log_flushing = &ddt->ddt_log[1];
759 	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
760 }
761 
762 void
763 ddt_log_free(ddt_t *ddt)
764 {
765 	ddt_log_empty(ddt, &ddt->ddt_log[0]);
766 	ddt_log_empty(ddt, &ddt->ddt_log[1]);
767 	avl_destroy(&ddt->ddt_log[0].ddl_tree);
768 	avl_destroy(&ddt->ddt_log[1].ddl_tree);
769 }
770 
771 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
772 	"Max transactions before starting to flush dedup logs");
773 
774 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
775 	"Max memory for dedup logs");
776 
777 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
778 	"Max memory for dedup logs, as % of total memory");
779