xref: /freebsd/sys/contrib/openzfs/module/zfs/ddt_log.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1*61145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2e2df9bb4SMartin Matuska /*
3e2df9bb4SMartin Matuska  * CDDL HEADER START
4e2df9bb4SMartin Matuska  *
5e2df9bb4SMartin Matuska  * The contents of this file are subject to the terms of the
6e2df9bb4SMartin Matuska  * Common Development and Distribution License (the "License").
7e2df9bb4SMartin Matuska  * You may not use this file except in compliance with the License.
8e2df9bb4SMartin Matuska  *
9e2df9bb4SMartin Matuska  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10e2df9bb4SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
11e2df9bb4SMartin Matuska  * See the License for the specific language governing permissions
12e2df9bb4SMartin Matuska  * and limitations under the License.
13e2df9bb4SMartin Matuska  *
14e2df9bb4SMartin Matuska  * When distributing Covered Code, include this CDDL HEADER in each
15e2df9bb4SMartin Matuska  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16e2df9bb4SMartin Matuska  * If applicable, add the following below this CDDL HEADER, with the
17e2df9bb4SMartin Matuska  * fields enclosed by brackets "[]" replaced with your own identifying
18e2df9bb4SMartin Matuska  * information: Portions Copyright [yyyy] [name of copyright owner]
19e2df9bb4SMartin Matuska  *
20e2df9bb4SMartin Matuska  * CDDL HEADER END
21e2df9bb4SMartin Matuska  */
22e2df9bb4SMartin Matuska 
23e2df9bb4SMartin Matuska /*
24e2df9bb4SMartin Matuska  * Copyright (c) 2023, Klara Inc.
25e2df9bb4SMartin Matuska  */
26e2df9bb4SMartin Matuska 
27e2df9bb4SMartin Matuska #include <sys/zfs_context.h>
28e2df9bb4SMartin Matuska #include <sys/spa.h>
29e2df9bb4SMartin Matuska #include <sys/ddt.h>
30e2df9bb4SMartin Matuska #include <sys/dmu_tx.h>
31e2df9bb4SMartin Matuska #include <sys/dmu.h>
32e2df9bb4SMartin Matuska #include <sys/ddt_impl.h>
33e2df9bb4SMartin Matuska #include <sys/dnode.h>
34e2df9bb4SMartin Matuska #include <sys/dbuf.h>
35e2df9bb4SMartin Matuska #include <sys/zap.h>
36e2df9bb4SMartin Matuska #include <sys/zio_checksum.h>
37e2df9bb4SMartin Matuska 
/*
 * No more than this many txgs before swapping logs.
 *
 * ddt_log_swap() treats values below 1 as 1.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 *
 * ddt_log_init() caps the percentage at 100 and raises the final limit to at
 * least 1M. ddt_log_swap() compares the active tree against half of the limit.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;


/*
 * Allocation caches for in-memory log entries, one per entry size. Created in
 * ddt_log_init() and destroyed in ddt_log_fini(); the cache to use is chosen
 * by testing DDT_FLAG_FLAT in ddt_flags.
 */
static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

/*
 * Size of an in-memory log entry: the fixed ddt_log_entry_t plus the phys
 * data for the flat or traditional layout respectively.
 */
#define	DDT_LOG_ENTRY_FLAT_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_LOG_ENTRY_TRAD_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

/* In-memory log entry size for the layout used by the given table. */
#define	DDT_LOG_ENTRY_SIZE(ddt)	\
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
61e2df9bb4SMartin Matuska 
62e2df9bb4SMartin Matuska void
ddt_log_init(void)63e2df9bb4SMartin Matuska ddt_log_init(void)
64e2df9bb4SMartin Matuska {
65e2df9bb4SMartin Matuska 	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
66e2df9bb4SMartin Matuska 	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
67e2df9bb4SMartin Matuska 	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
68e2df9bb4SMartin Matuska 	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
69e2df9bb4SMartin Matuska 
70e2df9bb4SMartin Matuska 	/*
71e2df9bb4SMartin Matuska 	 * Max memory for log AVL entries. At least 1M, because we need
72e2df9bb4SMartin Matuska 	 * something (that's ~3800 entries per tree). They can say 100% if they
73e2df9bb4SMartin Matuska 	 * want; it just means they're at the mercy of the the txg flush limit.
74e2df9bb4SMartin Matuska 	 */
75e2df9bb4SMartin Matuska 	if (zfs_dedup_log_mem_max == 0) {
76e2df9bb4SMartin Matuska 		zfs_dedup_log_mem_max_percent =
77e2df9bb4SMartin Matuska 		    MIN(zfs_dedup_log_mem_max_percent, 100);
78e2df9bb4SMartin Matuska 		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
79e2df9bb4SMartin Matuska 		    zfs_dedup_log_mem_max_percent / 100;
80e2df9bb4SMartin Matuska 	}
81e2df9bb4SMartin Matuska 	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
82e2df9bb4SMartin Matuska }
83e2df9bb4SMartin Matuska 
84e2df9bb4SMartin Matuska void
ddt_log_fini(void)85e2df9bb4SMartin Matuska ddt_log_fini(void)
86e2df9bb4SMartin Matuska {
87e2df9bb4SMartin Matuska 	kmem_cache_destroy(ddt_log_entry_trad_cache);
88e2df9bb4SMartin Matuska 	kmem_cache_destroy(ddt_log_entry_flat_cache);
89e2df9bb4SMartin Matuska }
90e2df9bb4SMartin Matuska 
91e2df9bb4SMartin Matuska static void
ddt_log_name(ddt_t * ddt,char * name,uint_t n)92e2df9bb4SMartin Matuska ddt_log_name(ddt_t *ddt, char *name, uint_t n)
93e2df9bb4SMartin Matuska {
94e2df9bb4SMartin Matuska 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
95e2df9bb4SMartin Matuska 	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
96e2df9bb4SMartin Matuska }
97e2df9bb4SMartin Matuska 
98e2df9bb4SMartin Matuska static void
ddt_log_update_header(ddt_t * ddt,ddt_log_t * ddl,dmu_tx_t * tx)99e2df9bb4SMartin Matuska ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
100e2df9bb4SMartin Matuska {
101e2df9bb4SMartin Matuska 	dmu_buf_t *db;
102e2df9bb4SMartin Matuska 	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
103e2df9bb4SMartin Matuska 	dmu_buf_will_dirty(db, tx);
104e2df9bb4SMartin Matuska 
105e2df9bb4SMartin Matuska 	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
106e2df9bb4SMartin Matuska 	DLH_SET_VERSION(hdr, 1);
107e2df9bb4SMartin Matuska 	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
108e2df9bb4SMartin Matuska 	hdr->dlh_length = ddl->ddl_length;
109e2df9bb4SMartin Matuska 	hdr->dlh_first_txg = ddl->ddl_first_txg;
110e2df9bb4SMartin Matuska 	hdr->dlh_checkpoint = ddl->ddl_checkpoint;
111e2df9bb4SMartin Matuska 
112e2df9bb4SMartin Matuska 	dmu_buf_rele(db, FTAG);
113e2df9bb4SMartin Matuska }
114e2df9bb4SMartin Matuska 
115e2df9bb4SMartin Matuska static void
ddt_log_create_one(ddt_t * ddt,ddt_log_t * ddl,uint_t n,dmu_tx_t * tx)116e2df9bb4SMartin Matuska ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
117e2df9bb4SMartin Matuska {
118e2df9bb4SMartin Matuska 	ASSERT3U(ddt->ddt_dir_object, >, 0);
119e2df9bb4SMartin Matuska 	ASSERT3U(ddl->ddl_object, ==, 0);
120e2df9bb4SMartin Matuska 
121e2df9bb4SMartin Matuska 	char name[DDT_NAMELEN];
122e2df9bb4SMartin Matuska 	ddt_log_name(ddt, name, n);
123e2df9bb4SMartin Matuska 
124e2df9bb4SMartin Matuska 	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
125e2df9bb4SMartin Matuska 	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
126e2df9bb4SMartin Matuska 	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
127e2df9bb4SMartin Matuska 	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
128e2df9bb4SMartin Matuska 	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
129e2df9bb4SMartin Matuska 	ddl->ddl_length = 0;
130e2df9bb4SMartin Matuska 	ddl->ddl_first_txg = tx->tx_txg;
131e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddl, tx);
132e2df9bb4SMartin Matuska }
133e2df9bb4SMartin Matuska 
134e2df9bb4SMartin Matuska static void
ddt_log_create(ddt_t * ddt,dmu_tx_t * tx)135e2df9bb4SMartin Matuska ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
136e2df9bb4SMartin Matuska {
137e2df9bb4SMartin Matuska 	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
138e2df9bb4SMartin Matuska 	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
139e2df9bb4SMartin Matuska }
140e2df9bb4SMartin Matuska 
141e2df9bb4SMartin Matuska static void
ddt_log_destroy_one(ddt_t * ddt,ddt_log_t * ddl,uint_t n,dmu_tx_t * tx)142e2df9bb4SMartin Matuska ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
143e2df9bb4SMartin Matuska {
144e2df9bb4SMartin Matuska 	ASSERT3U(ddt->ddt_dir_object, >, 0);
145e2df9bb4SMartin Matuska 
146e2df9bb4SMartin Matuska 	if (ddl->ddl_object == 0)
147e2df9bb4SMartin Matuska 		return;
148e2df9bb4SMartin Matuska 
149e2df9bb4SMartin Matuska 	ASSERT0(ddl->ddl_length);
150e2df9bb4SMartin Matuska 
151e2df9bb4SMartin Matuska 	char name[DDT_NAMELEN];
152e2df9bb4SMartin Matuska 	ddt_log_name(ddt, name, n);
153e2df9bb4SMartin Matuska 
154e2df9bb4SMartin Matuska 	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
155e2df9bb4SMartin Matuska 	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
156e2df9bb4SMartin Matuska 
157e2df9bb4SMartin Matuska 	ddl->ddl_object = 0;
158e2df9bb4SMartin Matuska }
159e2df9bb4SMartin Matuska 
160e2df9bb4SMartin Matuska void
ddt_log_destroy(ddt_t * ddt,dmu_tx_t * tx)161e2df9bb4SMartin Matuska ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
162e2df9bb4SMartin Matuska {
163e2df9bb4SMartin Matuska 	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
164e2df9bb4SMartin Matuska 	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
165e2df9bb4SMartin Matuska }
166e2df9bb4SMartin Matuska 
167e2df9bb4SMartin Matuska static void
ddt_log_update_stats(ddt_t * ddt)168e2df9bb4SMartin Matuska ddt_log_update_stats(ddt_t *ddt)
169e2df9bb4SMartin Matuska {
170e2df9bb4SMartin Matuska 	/*
171e2df9bb4SMartin Matuska 	 * Log object stats. We count the number of live entries in the log
172e2df9bb4SMartin Matuska 	 * tree, even if there are more than on disk, and even if the same
173e2df9bb4SMartin Matuska 	 * entry is on both append and flush trees, because that's more what
174e2df9bb4SMartin Matuska 	 * the user expects to see. This does mean the on-disk size is not
175e2df9bb4SMartin Matuska 	 * really correlated with the number of entries, but I don't think
176e2df9bb4SMartin Matuska 	 * that's reasonable to expect anyway.
177e2df9bb4SMartin Matuska 	 */
178e2df9bb4SMartin Matuska 	dmu_object_info_t doi;
179e2df9bb4SMartin Matuska 	uint64_t nblocks;
180e2df9bb4SMartin Matuska 	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
181e2df9bb4SMartin Matuska 	nblocks = doi.doi_physical_blocks_512;
182e2df9bb4SMartin Matuska 	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
183e2df9bb4SMartin Matuska 	nblocks += doi.doi_physical_blocks_512;
184e2df9bb4SMartin Matuska 
185e2df9bb4SMartin Matuska 	ddt_object_t *ddo = &ddt->ddt_log_stats;
186e2df9bb4SMartin Matuska 	ddo->ddo_count =
187e2df9bb4SMartin Matuska 	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
188e2df9bb4SMartin Matuska 	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
189e2df9bb4SMartin Matuska 	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
190e2df9bb4SMartin Matuska 	ddo->ddo_dspace = nblocks << 9;
191e2df9bb4SMartin Matuska }
192e2df9bb4SMartin Matuska 
193e2df9bb4SMartin Matuska void
ddt_log_begin(ddt_t * ddt,size_t nentries,dmu_tx_t * tx,ddt_log_update_t * dlu)194e2df9bb4SMartin Matuska ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
195e2df9bb4SMartin Matuska {
196e2df9bb4SMartin Matuska 	ASSERT3U(nentries, >, 0);
197e2df9bb4SMartin Matuska 	ASSERT3P(dlu->dlu_dbp, ==, NULL);
198e2df9bb4SMartin Matuska 
199e2df9bb4SMartin Matuska 	if (ddt->ddt_log_active->ddl_object == 0)
200e2df9bb4SMartin Matuska 		ddt_log_create(ddt, tx);
201e2df9bb4SMartin Matuska 
202e2df9bb4SMartin Matuska 	/*
203e2df9bb4SMartin Matuska 	 * We want to store as many entries as we can in a block, but never
204e2df9bb4SMartin Matuska 	 * split an entry across block boundaries.
205e2df9bb4SMartin Matuska 	 */
206e2df9bb4SMartin Matuska 	size_t reclen = P2ALIGN_TYPED(
207e2df9bb4SMartin Matuska 	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
208e2df9bb4SMartin Matuska 	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
209e2df9bb4SMartin Matuska 	ASSERT3U(reclen, <=, UINT16_MAX);
210e2df9bb4SMartin Matuska 	dlu->dlu_reclen = reclen;
211e2df9bb4SMartin Matuska 
212e2df9bb4SMartin Matuska 	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
213e2df9bb4SMartin Matuska 	    &dlu->dlu_dn));
214e2df9bb4SMartin Matuska 	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
215e2df9bb4SMartin Matuska 
216e2df9bb4SMartin Matuska 	uint64_t nblocks = howmany(nentries,
217e2df9bb4SMartin Matuska 	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
218e2df9bb4SMartin Matuska 	uint64_t offset = ddt->ddt_log_active->ddl_length;
219e2df9bb4SMartin Matuska 	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
220e2df9bb4SMartin Matuska 
221e2df9bb4SMartin Matuska 	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
222e2df9bb4SMartin Matuska 	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
223e2df9bb4SMartin Matuska 	    DMU_READ_NO_PREFETCH));
224e2df9bb4SMartin Matuska 
225e2df9bb4SMartin Matuska 	dlu->dlu_tx = tx;
226e2df9bb4SMartin Matuska 	dlu->dlu_block = dlu->dlu_offset = 0;
227e2df9bb4SMartin Matuska }
228e2df9bb4SMartin Matuska 
229e2df9bb4SMartin Matuska static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t * ddt)230e2df9bb4SMartin Matuska ddt_log_alloc_entry(ddt_t *ddt)
231e2df9bb4SMartin Matuska {
232e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle;
233e2df9bb4SMartin Matuska 
234e2df9bb4SMartin Matuska 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
235e2df9bb4SMartin Matuska 		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
236e2df9bb4SMartin Matuska 		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
237e2df9bb4SMartin Matuska 	} else {
238e2df9bb4SMartin Matuska 		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
239e2df9bb4SMartin Matuska 		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
240e2df9bb4SMartin Matuska 	}
241e2df9bb4SMartin Matuska 
242e2df9bb4SMartin Matuska 	return (ddle);
243e2df9bb4SMartin Matuska }
244e2df9bb4SMartin Matuska 
245e2df9bb4SMartin Matuska static void
ddt_log_update_entry(ddt_t * ddt,ddt_log_t * ddl,ddt_lightweight_entry_t * ddlwe)246e2df9bb4SMartin Matuska ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
247e2df9bb4SMartin Matuska {
248e2df9bb4SMartin Matuska 	/* Create the log tree entry from a live or stored entry */
249e2df9bb4SMartin Matuska 	avl_index_t where;
250e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle =
251e2df9bb4SMartin Matuska 	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
252e2df9bb4SMartin Matuska 	if (ddle == NULL) {
253e2df9bb4SMartin Matuska 		ddle = ddt_log_alloc_entry(ddt);
254e2df9bb4SMartin Matuska 		ddle->ddle_key = ddlwe->ddlwe_key;
255e2df9bb4SMartin Matuska 		avl_insert(&ddl->ddl_tree, ddle, where);
256e2df9bb4SMartin Matuska 	}
257e2df9bb4SMartin Matuska 	ddle->ddle_type = ddlwe->ddlwe_type;
258e2df9bb4SMartin Matuska 	ddle->ddle_class = ddlwe->ddlwe_class;
259e2df9bb4SMartin Matuska 	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
260e2df9bb4SMartin Matuska }
261e2df9bb4SMartin Matuska 
262e2df9bb4SMartin Matuska void
ddt_log_entry(ddt_t * ddt,ddt_lightweight_entry_t * ddlwe,ddt_log_update_t * dlu)263e2df9bb4SMartin Matuska ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
264e2df9bb4SMartin Matuska {
265e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_dbp, !=, NULL);
266e2df9bb4SMartin Matuska 
267e2df9bb4SMartin Matuska 	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
268e2df9bb4SMartin Matuska 	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
269e2df9bb4SMartin Matuska 
270e2df9bb4SMartin Matuska 	/* Get our block */
271e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
272e2df9bb4SMartin Matuska 	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
273e2df9bb4SMartin Matuska 
274e2df9bb4SMartin Matuska 	/*
275e2df9bb4SMartin Matuska 	 * If this would take us past the end of the block, finish it and
276e2df9bb4SMartin Matuska 	 * move to the next one.
277e2df9bb4SMartin Matuska 	 */
278e2df9bb4SMartin Matuska 	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
279e2df9bb4SMartin Matuska 		ASSERT3U(dlu->dlu_offset, >, 0);
280e2df9bb4SMartin Matuska 		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
281e2df9bb4SMartin Matuska 		dlu->dlu_block++;
282e2df9bb4SMartin Matuska 		dlu->dlu_offset = 0;
283e2df9bb4SMartin Matuska 		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
284e2df9bb4SMartin Matuska 		db = dlu->dlu_dbp[dlu->dlu_block];
285e2df9bb4SMartin Matuska 	}
286e2df9bb4SMartin Matuska 
287e2df9bb4SMartin Matuska 	/*
288e2df9bb4SMartin Matuska 	 * If this is the first time touching the block, inform the DMU that
289e2df9bb4SMartin Matuska 	 * we will fill it, and zero it out.
290e2df9bb4SMartin Matuska 	 */
291e2df9bb4SMartin Matuska 	if (dlu->dlu_offset == 0) {
292e2df9bb4SMartin Matuska 		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
293e2df9bb4SMartin Matuska 		memset(db->db_data, 0, db->db_size);
294e2df9bb4SMartin Matuska 	}
295e2df9bb4SMartin Matuska 
296e2df9bb4SMartin Matuska 	/* Create the log record directly in the buffer */
297e2df9bb4SMartin Matuska 	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
298e2df9bb4SMartin Matuska 	DLR_SET_TYPE(dlr, DLR_ENTRY);
299e2df9bb4SMartin Matuska 	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
300e2df9bb4SMartin Matuska 	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
301e2df9bb4SMartin Matuska 	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
302e2df9bb4SMartin Matuska 
303e2df9bb4SMartin Matuska 	ddt_log_record_entry_t *dlre =
304e2df9bb4SMartin Matuska 	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
305e2df9bb4SMartin Matuska 	dlre->dlre_key = ddlwe->ddlwe_key;
306e2df9bb4SMartin Matuska 	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
307e2df9bb4SMartin Matuska 
308e2df9bb4SMartin Matuska 	/* Advance offset for next record. */
309e2df9bb4SMartin Matuska 	dlu->dlu_offset += dlu->dlu_reclen;
310e2df9bb4SMartin Matuska }
311e2df9bb4SMartin Matuska 
312e2df9bb4SMartin Matuska void
ddt_log_commit(ddt_t * ddt,ddt_log_update_t * dlu)313e2df9bb4SMartin Matuska ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
314e2df9bb4SMartin Matuska {
315e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_dbp, !=, NULL);
316e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
317e2df9bb4SMartin Matuska 	ASSERT3U(dlu->dlu_offset, >, 0);
318e2df9bb4SMartin Matuska 
319e2df9bb4SMartin Matuska 	/*
320e2df9bb4SMartin Matuska 	 * Close out the last block. Whatever we haven't used will be zeroed,
321e2df9bb4SMartin Matuska 	 * which matches DLR_INVALID, so we can detect this during load.
322e2df9bb4SMartin Matuska 	 */
323e2df9bb4SMartin Matuska 	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
324e2df9bb4SMartin Matuska 
325e2df9bb4SMartin Matuska 	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
326e2df9bb4SMartin Matuska 
327e2df9bb4SMartin Matuska 	ddt->ddt_log_active->ddl_length +=
328e2df9bb4SMartin Matuska 	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
329e2df9bb4SMartin Matuska 	dnode_rele(dlu->dlu_dn, FTAG);
330e2df9bb4SMartin Matuska 
331e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
332e2df9bb4SMartin Matuska 
333e2df9bb4SMartin Matuska 	memset(dlu, 0, sizeof (ddt_log_update_t));
334e2df9bb4SMartin Matuska 
335e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
336e2df9bb4SMartin Matuska }
337e2df9bb4SMartin Matuska 
338e2df9bb4SMartin Matuska boolean_t
ddt_log_take_first(ddt_t * ddt,ddt_log_t * ddl,ddt_lightweight_entry_t * ddlwe)339e2df9bb4SMartin Matuska ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
340e2df9bb4SMartin Matuska {
341e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
342e2df9bb4SMartin Matuska 	if (ddle == NULL)
343e2df9bb4SMartin Matuska 		return (B_FALSE);
344e2df9bb4SMartin Matuska 
345e2df9bb4SMartin Matuska 	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
346e2df9bb4SMartin Matuska 
347e2df9bb4SMartin Matuska 	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
348e2df9bb4SMartin Matuska 
349e2df9bb4SMartin Matuska 	avl_remove(&ddl->ddl_tree, ddle);
350e2df9bb4SMartin Matuska 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
351e2df9bb4SMartin Matuska 	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
352e2df9bb4SMartin Matuska 
353e2df9bb4SMartin Matuska 	return (B_TRUE);
354e2df9bb4SMartin Matuska }
355e2df9bb4SMartin Matuska 
356e2df9bb4SMartin Matuska boolean_t
ddt_log_remove_key(ddt_t * ddt,ddt_log_t * ddl,const ddt_key_t * ddk)357e2df9bb4SMartin Matuska ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
358e2df9bb4SMartin Matuska {
359e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
360e2df9bb4SMartin Matuska 	if (ddle == NULL)
361e2df9bb4SMartin Matuska 		return (B_FALSE);
362e2df9bb4SMartin Matuska 
363e2df9bb4SMartin Matuska 	ddt_lightweight_entry_t ddlwe;
364e2df9bb4SMartin Matuska 	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
365e2df9bb4SMartin Matuska 	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
366e2df9bb4SMartin Matuska 
367e2df9bb4SMartin Matuska 	avl_remove(&ddl->ddl_tree, ddle);
368e2df9bb4SMartin Matuska 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
369e2df9bb4SMartin Matuska 	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
370e2df9bb4SMartin Matuska 
371e2df9bb4SMartin Matuska 	return (B_TRUE);
372e2df9bb4SMartin Matuska }
373e2df9bb4SMartin Matuska 
374e2df9bb4SMartin Matuska boolean_t
ddt_log_find_key(ddt_t * ddt,const ddt_key_t * ddk,ddt_lightweight_entry_t * ddlwe)375e2df9bb4SMartin Matuska ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
376e2df9bb4SMartin Matuska     ddt_lightweight_entry_t *ddlwe)
377e2df9bb4SMartin Matuska {
378e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle =
379e2df9bb4SMartin Matuska 	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
380e2df9bb4SMartin Matuska 	if (!ddle)
381e2df9bb4SMartin Matuska 		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
382e2df9bb4SMartin Matuska 	if (!ddle)
383e2df9bb4SMartin Matuska 		return (B_FALSE);
384e2df9bb4SMartin Matuska 	if (ddlwe)
385e2df9bb4SMartin Matuska 		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
386e2df9bb4SMartin Matuska 	return (B_TRUE);
387e2df9bb4SMartin Matuska }
388e2df9bb4SMartin Matuska 
389e2df9bb4SMartin Matuska void
ddt_log_checkpoint(ddt_t * ddt,ddt_lightweight_entry_t * ddlwe,dmu_tx_t * tx)390e2df9bb4SMartin Matuska ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
391e2df9bb4SMartin Matuska {
392e2df9bb4SMartin Matuska 	ddt_log_t *ddl = ddt->ddt_log_flushing;
393e2df9bb4SMartin Matuska 
394e2df9bb4SMartin Matuska 	ASSERT3U(ddl->ddl_object, !=, 0);
395e2df9bb4SMartin Matuska 
396e2df9bb4SMartin Matuska #ifdef ZFS_DEBUG
397e2df9bb4SMartin Matuska 	/*
398e2df9bb4SMartin Matuska 	 * There should not be any entries on the log tree before the given
399e2df9bb4SMartin Matuska 	 * checkpoint. Assert that this is the case.
400e2df9bb4SMartin Matuska 	 */
401e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
402e2df9bb4SMartin Matuska 	if (ddle != NULL)
403e2df9bb4SMartin Matuska 		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
404e2df9bb4SMartin Matuska 		    >, 0);
405e2df9bb4SMartin Matuska #endif
406e2df9bb4SMartin Matuska 
407e2df9bb4SMartin Matuska 	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
408e2df9bb4SMartin Matuska 	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
409e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddl, tx);
410e2df9bb4SMartin Matuska 
411e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
412e2df9bb4SMartin Matuska }
413e2df9bb4SMartin Matuska 
414e2df9bb4SMartin Matuska void
ddt_log_truncate(ddt_t * ddt,dmu_tx_t * tx)415e2df9bb4SMartin Matuska ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
416e2df9bb4SMartin Matuska {
417e2df9bb4SMartin Matuska 	ddt_log_t *ddl = ddt->ddt_log_flushing;
418e2df9bb4SMartin Matuska 
419e2df9bb4SMartin Matuska 	if (ddl->ddl_object == 0)
420e2df9bb4SMartin Matuska 		return;
421e2df9bb4SMartin Matuska 
422e2df9bb4SMartin Matuska 	ASSERT(avl_is_empty(&ddl->ddl_tree));
423e2df9bb4SMartin Matuska 
424e2df9bb4SMartin Matuska 	/* Eject the entire object */
425e2df9bb4SMartin Matuska 	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
426e2df9bb4SMartin Matuska 
427e2df9bb4SMartin Matuska 	ddl->ddl_length = 0;
428e2df9bb4SMartin Matuska 	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
429e2df9bb4SMartin Matuska 	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
430e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddl, tx);
431e2df9bb4SMartin Matuska 
432e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
433e2df9bb4SMartin Matuska }
434e2df9bb4SMartin Matuska 
435e2df9bb4SMartin Matuska boolean_t
ddt_log_swap(ddt_t * ddt,dmu_tx_t * tx)436e2df9bb4SMartin Matuska ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
437e2df9bb4SMartin Matuska {
438e2df9bb4SMartin Matuska 	/* Swap the logs. The old flushing one must be empty */
439e2df9bb4SMartin Matuska 	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
440e2df9bb4SMartin Matuska 
441e2df9bb4SMartin Matuska 	/*
442e2df9bb4SMartin Matuska 	 * If there are still blocks on the flushing log, truncate it first.
443e2df9bb4SMartin Matuska 	 * This can happen if there were entries on the flushing log that were
444e2df9bb4SMartin Matuska 	 * removed in memory via ddt_lookup(); their vestigal remains are
445e2df9bb4SMartin Matuska 	 * on disk.
446e2df9bb4SMartin Matuska 	 */
447e2df9bb4SMartin Matuska 	if (ddt->ddt_log_flushing->ddl_length > 0)
448e2df9bb4SMartin Matuska 		ddt_log_truncate(ddt, tx);
449e2df9bb4SMartin Matuska 
450e2df9bb4SMartin Matuska 	/*
451e2df9bb4SMartin Matuska 	 * Swap policy. We swap the logs (and so begin flushing) when the
452e2df9bb4SMartin Matuska 	 * active tree grows too large, or when we haven't swapped it in
453e2df9bb4SMartin Matuska 	 * some amount of time, or if something has requested the logs be
454e2df9bb4SMartin Matuska 	 * flushed ASAP (see ddt_walk_init()).
455e2df9bb4SMartin Matuska 	 */
456e2df9bb4SMartin Matuska 
457e2df9bb4SMartin Matuska 	/*
458e2df9bb4SMartin Matuska 	 * The log tree is too large if the memory usage of its entries is over
459e2df9bb4SMartin Matuska 	 * half of the memory limit. This effectively gives each log tree half
460e2df9bb4SMartin Matuska 	 * the available memory.
461e2df9bb4SMartin Matuska 	 */
462e2df9bb4SMartin Matuska 	const boolean_t too_large =
463e2df9bb4SMartin Matuska 	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
464e2df9bb4SMartin Matuska 	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
465e2df9bb4SMartin Matuska 
466e2df9bb4SMartin Matuska 	const boolean_t too_old =
467e2df9bb4SMartin Matuska 	    tx->tx_txg >=
468e2df9bb4SMartin Matuska 	    (ddt->ddt_log_active->ddl_first_txg +
469e2df9bb4SMartin Matuska 	    MAX(1, zfs_dedup_log_txg_max));
470e2df9bb4SMartin Matuska 
471e2df9bb4SMartin Matuska 	const boolean_t force =
472e2df9bb4SMartin Matuska 	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
473e2df9bb4SMartin Matuska 
474e2df9bb4SMartin Matuska 	if (!(too_large || too_old || force))
475e2df9bb4SMartin Matuska 		return (B_FALSE);
476e2df9bb4SMartin Matuska 
477e2df9bb4SMartin Matuska 	ddt_log_t *swap = ddt->ddt_log_active;
478e2df9bb4SMartin Matuska 	ddt->ddt_log_active = ddt->ddt_log_flushing;
479e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing = swap;
480e2df9bb4SMartin Matuska 
481e2df9bb4SMartin Matuska 	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
482e2df9bb4SMartin Matuska 	ddt->ddt_log_active->ddl_flags &=
483e2df9bb4SMartin Matuska 	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
484e2df9bb4SMartin Matuska 
485e2df9bb4SMartin Matuska 	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
486e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
487e2df9bb4SMartin Matuska 
488e2df9bb4SMartin Matuska 	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
489e2df9bb4SMartin Matuska 
490e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
491e2df9bb4SMartin Matuska 	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
492e2df9bb4SMartin Matuska 
493e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
494e2df9bb4SMartin Matuska 
495e2df9bb4SMartin Matuska 	return (B_TRUE);
496e2df9bb4SMartin Matuska }
497e2df9bb4SMartin Matuska 
498e2df9bb4SMartin Matuska static inline void
ddt_log_load_entry(ddt_t * ddt,ddt_log_t * ddl,ddt_log_record_t * dlr,const ddt_key_t * checkpoint)499e2df9bb4SMartin Matuska ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
500e2df9bb4SMartin Matuska     const ddt_key_t *checkpoint)
501e2df9bb4SMartin Matuska {
502e2df9bb4SMartin Matuska 	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
503e2df9bb4SMartin Matuska 
504e2df9bb4SMartin Matuska 	ddt_log_record_entry_t *dlre =
505e2df9bb4SMartin Matuska 	    (ddt_log_record_entry_t *)dlr->dlr_payload;
506e2df9bb4SMartin Matuska 	if (checkpoint != NULL &&
507e2df9bb4SMartin Matuska 	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
508e2df9bb4SMartin Matuska 		/* Skip pre-checkpoint entries; they're already flushed. */
509e2df9bb4SMartin Matuska 		return;
510e2df9bb4SMartin Matuska 	}
511e2df9bb4SMartin Matuska 
512e2df9bb4SMartin Matuska 	ddt_lightweight_entry_t ddlwe;
513e2df9bb4SMartin Matuska 	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
514e2df9bb4SMartin Matuska 	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
515e2df9bb4SMartin Matuska 
516e2df9bb4SMartin Matuska 	ddlwe.ddlwe_key = dlre->dlre_key;
517e2df9bb4SMartin Matuska 	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
518e2df9bb4SMartin Matuska 
519e2df9bb4SMartin Matuska 	ddt_log_update_entry(ddt, ddl, &ddlwe);
520e2df9bb4SMartin Matuska }
521e2df9bb4SMartin Matuska 
522e2df9bb4SMartin Matuska static void
ddt_log_empty(ddt_t * ddt,ddt_log_t * ddl)523e2df9bb4SMartin Matuska ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
524e2df9bb4SMartin Matuska {
525e2df9bb4SMartin Matuska 	void *cookie = NULL;
526e2df9bb4SMartin Matuska 	ddt_log_entry_t *ddle;
527e2df9bb4SMartin Matuska 	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
528e2df9bb4SMartin Matuska 	while ((ddle =
529e2df9bb4SMartin Matuska 	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
530e2df9bb4SMartin Matuska 		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
531e2df9bb4SMartin Matuska 		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
532e2df9bb4SMartin Matuska 	}
533e2df9bb4SMartin Matuska 	ASSERT(avl_is_empty(&ddl->ddl_tree));
534e2df9bb4SMartin Matuska }
535e2df9bb4SMartin Matuska 
536e2df9bb4SMartin Matuska static int
ddt_log_load_one(ddt_t * ddt,uint_t n)537e2df9bb4SMartin Matuska ddt_log_load_one(ddt_t *ddt, uint_t n)
538e2df9bb4SMartin Matuska {
539e2df9bb4SMartin Matuska 	ASSERT3U(n, <, 2);
540e2df9bb4SMartin Matuska 
541e2df9bb4SMartin Matuska 	ddt_log_t *ddl = &ddt->ddt_log[n];
542e2df9bb4SMartin Matuska 
543e2df9bb4SMartin Matuska 	char name[DDT_NAMELEN];
544e2df9bb4SMartin Matuska 	ddt_log_name(ddt, name, n);
545e2df9bb4SMartin Matuska 
546e2df9bb4SMartin Matuska 	uint64_t obj;
547e2df9bb4SMartin Matuska 	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
548e2df9bb4SMartin Matuska 	    sizeof (uint64_t), 1, &obj);
549e2df9bb4SMartin Matuska 	if (err == ENOENT)
550e2df9bb4SMartin Matuska 		return (0);
551e2df9bb4SMartin Matuska 	if (err != 0)
552e2df9bb4SMartin Matuska 		return (err);
553e2df9bb4SMartin Matuska 
554e2df9bb4SMartin Matuska 	dnode_t *dn;
555e2df9bb4SMartin Matuska 	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
556e2df9bb4SMartin Matuska 	if (err != 0)
557e2df9bb4SMartin Matuska 		return (err);
558e2df9bb4SMartin Matuska 
559e2df9bb4SMartin Matuska 	ddt_log_header_t hdr;
560e2df9bb4SMartin Matuska 	dmu_buf_t *db;
561e2df9bb4SMartin Matuska 	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
562e2df9bb4SMartin Matuska 	if (err != 0) {
563e2df9bb4SMartin Matuska 		dnode_rele(dn, FTAG);
564e2df9bb4SMartin Matuska 		return (err);
565e2df9bb4SMartin Matuska 	}
566e2df9bb4SMartin Matuska 	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
567e2df9bb4SMartin Matuska 	dmu_buf_rele(db, FTAG);
568e2df9bb4SMartin Matuska 
569e2df9bb4SMartin Matuska 	if (DLH_GET_VERSION(&hdr) != 1) {
570e2df9bb4SMartin Matuska 		dnode_rele(dn, FTAG);
571e2df9bb4SMartin Matuska 		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
572e2df9bb4SMartin Matuska 		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
573e2df9bb4SMartin Matuska 		    (u_longlong_t)DLH_GET_VERSION(&hdr));
574e2df9bb4SMartin Matuska 		return (SET_ERROR(EINVAL));
575e2df9bb4SMartin Matuska 	}
576e2df9bb4SMartin Matuska 
577e2df9bb4SMartin Matuska 	ddt_key_t *checkpoint = NULL;
578e2df9bb4SMartin Matuska 	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
579e2df9bb4SMartin Matuska 		/*
580e2df9bb4SMartin Matuska 		 * If the log has a checkpoint, then we can ignore any entries
581e2df9bb4SMartin Matuska 		 * that have already been flushed.
582e2df9bb4SMartin Matuska 		 */
583e2df9bb4SMartin Matuska 		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
584e2df9bb4SMartin Matuska 		checkpoint = &hdr.dlh_checkpoint;
585e2df9bb4SMartin Matuska 	}
586e2df9bb4SMartin Matuska 
587e2df9bb4SMartin Matuska 	if (hdr.dlh_length > 0) {
588e2df9bb4SMartin Matuska 		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
589e2df9bb4SMartin Matuska 		    ZIO_PRIORITY_SYNC_READ);
590e2df9bb4SMartin Matuska 
591e2df9bb4SMartin Matuska 		for (uint64_t offset = 0; offset < hdr.dlh_length;
592e2df9bb4SMartin Matuska 		    offset += dn->dn_datablksz) {
593e2df9bb4SMartin Matuska 			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
594e2df9bb4SMartin Matuska 			    DMU_READ_PREFETCH);
595e2df9bb4SMartin Matuska 			if (err != 0) {
596e2df9bb4SMartin Matuska 				dnode_rele(dn, FTAG);
597e2df9bb4SMartin Matuska 				ddt_log_empty(ddt, ddl);
598e2df9bb4SMartin Matuska 				return (err);
599e2df9bb4SMartin Matuska 			}
600e2df9bb4SMartin Matuska 
601e2df9bb4SMartin Matuska 			uint64_t boffset = 0;
602e2df9bb4SMartin Matuska 			while (boffset < db->db_size) {
603e2df9bb4SMartin Matuska 				ddt_log_record_t *dlr =
604e2df9bb4SMartin Matuska 				    (ddt_log_record_t *)(db->db_data + boffset);
605e2df9bb4SMartin Matuska 
606e2df9bb4SMartin Matuska 				/* Partially-filled block, skip the rest */
607e2df9bb4SMartin Matuska 				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
608e2df9bb4SMartin Matuska 					break;
609e2df9bb4SMartin Matuska 
610e2df9bb4SMartin Matuska 				switch (DLR_GET_TYPE(dlr)) {
611e2df9bb4SMartin Matuska 				case DLR_ENTRY:
612e2df9bb4SMartin Matuska 					ddt_log_load_entry(ddt, ddl, dlr,
613e2df9bb4SMartin Matuska 					    checkpoint);
614e2df9bb4SMartin Matuska 					break;
615e2df9bb4SMartin Matuska 
616e2df9bb4SMartin Matuska 				default:
617e2df9bb4SMartin Matuska 					dmu_buf_rele(db, FTAG);
618e2df9bb4SMartin Matuska 					dnode_rele(dn, FTAG);
619e2df9bb4SMartin Matuska 					ddt_log_empty(ddt, ddl);
620e2df9bb4SMartin Matuska 					return (SET_ERROR(EINVAL));
621e2df9bb4SMartin Matuska 				}
622e2df9bb4SMartin Matuska 
623e2df9bb4SMartin Matuska 				boffset += DLR_GET_RECLEN(dlr);
624e2df9bb4SMartin Matuska 			}
625e2df9bb4SMartin Matuska 
626e2df9bb4SMartin Matuska 			dmu_buf_rele(db, FTAG);
627e2df9bb4SMartin Matuska 		}
628e2df9bb4SMartin Matuska 	}
629e2df9bb4SMartin Matuska 
630e2df9bb4SMartin Matuska 	dnode_rele(dn, FTAG);
631e2df9bb4SMartin Matuska 
632e2df9bb4SMartin Matuska 	ddl->ddl_object = obj;
633e2df9bb4SMartin Matuska 	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
634e2df9bb4SMartin Matuska 	ddl->ddl_length = hdr.dlh_length;
635e2df9bb4SMartin Matuska 	ddl->ddl_first_txg = hdr.dlh_first_txg;
636e2df9bb4SMartin Matuska 
637e2df9bb4SMartin Matuska 	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
638e2df9bb4SMartin Matuska 		ddt->ddt_log_flushing = ddl;
639e2df9bb4SMartin Matuska 	else
640e2df9bb4SMartin Matuska 		ddt->ddt_log_active = ddl;
641e2df9bb4SMartin Matuska 
642e2df9bb4SMartin Matuska 	return (0);
643e2df9bb4SMartin Matuska }
644e2df9bb4SMartin Matuska 
645e2df9bb4SMartin Matuska int
ddt_log_load(ddt_t * ddt)646e2df9bb4SMartin Matuska ddt_log_load(ddt_t *ddt)
647e2df9bb4SMartin Matuska {
648e2df9bb4SMartin Matuska 	int err;
649e2df9bb4SMartin Matuska 
650e2df9bb4SMartin Matuska 	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
651e2df9bb4SMartin Matuska 		/*
652e2df9bb4SMartin Matuska 		 * The DDT is going to be freed again in a moment, so there's
653e2df9bb4SMartin Matuska 		 * no point loading the log; it'll just slow down import.
654e2df9bb4SMartin Matuska 		 */
655e2df9bb4SMartin Matuska 		return (0);
656e2df9bb4SMartin Matuska 	}
657e2df9bb4SMartin Matuska 
658e2df9bb4SMartin Matuska 	ASSERT0(ddt->ddt_log[0].ddl_object);
659e2df9bb4SMartin Matuska 	ASSERT0(ddt->ddt_log[1].ddl_object);
660e2df9bb4SMartin Matuska 	if (ddt->ddt_dir_object == 0) {
661e2df9bb4SMartin Matuska 		/*
662e2df9bb4SMartin Matuska 		 * If we're configured but the containing dir doesn't exist
663e2df9bb4SMartin Matuska 		 * yet, then the log object can't possibly exist either.
664e2df9bb4SMartin Matuska 		 */
665e2df9bb4SMartin Matuska 		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
666e2df9bb4SMartin Matuska 		return (SET_ERROR(ENOENT));
667e2df9bb4SMartin Matuska 	}
668e2df9bb4SMartin Matuska 
669e2df9bb4SMartin Matuska 	if ((err = ddt_log_load_one(ddt, 0)) != 0)
670e2df9bb4SMartin Matuska 		return (err);
671e2df9bb4SMartin Matuska 	if ((err = ddt_log_load_one(ddt, 1)) != 0)
672e2df9bb4SMartin Matuska 		return (err);
673e2df9bb4SMartin Matuska 
674e2df9bb4SMartin Matuska 	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
675e2df9bb4SMartin Matuska 	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
676e2df9bb4SMartin Matuska 	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
677e2df9bb4SMartin Matuska 	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
678e2df9bb4SMartin Matuska 
679e2df9bb4SMartin Matuska 	/*
680e2df9bb4SMartin Matuska 	 * We have two finalisation tasks:
681e2df9bb4SMartin Matuska 	 *
682e2df9bb4SMartin Matuska 	 * - rebuild the histogram. We do this at the end rather than while
683e2df9bb4SMartin Matuska 	 *   we're loading so we don't need to uncount and recount entries that
684e2df9bb4SMartin Matuska 	 *   appear multiple times in the log.
685e2df9bb4SMartin Matuska 	 *
686e2df9bb4SMartin Matuska 	 * - remove entries from the flushing tree that are on both trees. This
687e2df9bb4SMartin Matuska 	 *   happens when ddt_lookup() rehydrates an entry from the flushing
688e2df9bb4SMartin Matuska 	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
689e2df9bb4SMartin Matuska 	 *   tree but doesn't remove it from disk.
690e2df9bb4SMartin Matuska 	 */
691e2df9bb4SMartin Matuska 
692e2df9bb4SMartin Matuska 	/*
693e2df9bb4SMartin Matuska 	 * We don't technically need a config lock here, since there shouldn't
694e2df9bb4SMartin Matuska 	 * be pool config changes during DDT load. dva_get_dsize_sync() via
695e2df9bb4SMartin Matuska 	 * ddt_stat_generate() is expecting it though, and it won't hurt
696e2df9bb4SMartin Matuska 	 * anything, so we take it.
697e2df9bb4SMartin Matuska 	 */
698e2df9bb4SMartin Matuska 	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
699e2df9bb4SMartin Matuska 
700e2df9bb4SMartin Matuska 	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
701e2df9bb4SMartin Matuska 	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
702e2df9bb4SMartin Matuska 	ddt_log_entry_t *ae = avl_first(al);
703e2df9bb4SMartin Matuska 	ddt_log_entry_t *fe = avl_first(fl);
704e2df9bb4SMartin Matuska 	while (ae != NULL || fe != NULL) {
705e2df9bb4SMartin Matuska 		ddt_log_entry_t *ddle;
706e2df9bb4SMartin Matuska 		if (ae == NULL) {
707e2df9bb4SMartin Matuska 			/* active exhausted, take flushing */
708e2df9bb4SMartin Matuska 			ddle = fe;
709e2df9bb4SMartin Matuska 			fe = AVL_NEXT(fl, fe);
710e2df9bb4SMartin Matuska 		} else if (fe == NULL) {
711e2df9bb4SMartin Matuska 			/* flushing exuhausted, take active */
712e2df9bb4SMartin Matuska 			ddle = ae;
713e2df9bb4SMartin Matuska 			ae = AVL_NEXT(al, ae);
714e2df9bb4SMartin Matuska 		} else {
715e2df9bb4SMartin Matuska 			/* compare active and flushing */
716e2df9bb4SMartin Matuska 			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
717e2df9bb4SMartin Matuska 			if (c < 0) {
718e2df9bb4SMartin Matuska 				/* active behind, take and advance */
719e2df9bb4SMartin Matuska 				ddle = ae;
720e2df9bb4SMartin Matuska 				ae = AVL_NEXT(al, ae);
721e2df9bb4SMartin Matuska 			} else if (c > 0) {
722e2df9bb4SMartin Matuska 				/* flushing behind, take and advance */
723e2df9bb4SMartin Matuska 				ddle = fe;
724e2df9bb4SMartin Matuska 				fe = AVL_NEXT(fl, fe);
725e2df9bb4SMartin Matuska 			} else {
726e2df9bb4SMartin Matuska 				/* match. remove from flushing, take active */
727e2df9bb4SMartin Matuska 				ddle = fe;
728e2df9bb4SMartin Matuska 				fe = AVL_NEXT(fl, fe);
729e2df9bb4SMartin Matuska 				avl_remove(fl, ddle);
730e2df9bb4SMartin Matuska 
731e2df9bb4SMartin Matuska 				ddle = ae;
732e2df9bb4SMartin Matuska 				ae = AVL_NEXT(al, ae);
733e2df9bb4SMartin Matuska 			}
734e2df9bb4SMartin Matuska 		}
735e2df9bb4SMartin Matuska 
736e2df9bb4SMartin Matuska 		ddt_lightweight_entry_t ddlwe;
737e2df9bb4SMartin Matuska 		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
738e2df9bb4SMartin Matuska 		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
739e2df9bb4SMartin Matuska 	}
740e2df9bb4SMartin Matuska 
741e2df9bb4SMartin Matuska 	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
742e2df9bb4SMartin Matuska 
743e2df9bb4SMartin Matuska 	ddt_log_update_stats(ddt);
744e2df9bb4SMartin Matuska 
745e2df9bb4SMartin Matuska 	return (0);
746e2df9bb4SMartin Matuska }
747e2df9bb4SMartin Matuska 
748e2df9bb4SMartin Matuska void
ddt_log_alloc(ddt_t * ddt)749e2df9bb4SMartin Matuska ddt_log_alloc(ddt_t *ddt)
750e2df9bb4SMartin Matuska {
751e2df9bb4SMartin Matuska 	ASSERT3P(ddt->ddt_log_active, ==, NULL);
752e2df9bb4SMartin Matuska 	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
753e2df9bb4SMartin Matuska 
754e2df9bb4SMartin Matuska 	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
755e2df9bb4SMartin Matuska 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
756e2df9bb4SMartin Matuska 	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
757e2df9bb4SMartin Matuska 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
758e2df9bb4SMartin Matuska 	ddt->ddt_log_active = &ddt->ddt_log[0];
759e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing = &ddt->ddt_log[1];
760e2df9bb4SMartin Matuska 	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
761e2df9bb4SMartin Matuska }
762e2df9bb4SMartin Matuska 
763e2df9bb4SMartin Matuska void
ddt_log_free(ddt_t * ddt)764e2df9bb4SMartin Matuska ddt_log_free(ddt_t *ddt)
765e2df9bb4SMartin Matuska {
766e2df9bb4SMartin Matuska 	ddt_log_empty(ddt, &ddt->ddt_log[0]);
767e2df9bb4SMartin Matuska 	ddt_log_empty(ddt, &ddt->ddt_log[1]);
768e2df9bb4SMartin Matuska 	avl_destroy(&ddt->ddt_log[0].ddl_tree);
769e2df9bb4SMartin Matuska 	avl_destroy(&ddt->ddt_log[1].ddl_tree);
770e2df9bb4SMartin Matuska }
771e2df9bb4SMartin Matuska 
772e2df9bb4SMartin Matuska ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
773e2df9bb4SMartin Matuska 	"Max transactions before starting to flush dedup logs");
774e2df9bb4SMartin Matuska 
775e2df9bb4SMartin Matuska ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
776e2df9bb4SMartin Matuska 	"Max memory for dedup logs");
777e2df9bb4SMartin Matuska 
778e2df9bb4SMartin Matuska ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
779e2df9bb4SMartin Matuska 	"Max memory for dedup logs, as % of total memory");
780