xref: /freebsd/sys/contrib/openzfs/module/zfs/txg.c (revision b985c9cafd2aedac5cf92428c0211485ea4ede24)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy /*
22eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23eda14cbcSMatt Macy  * Portions Copyright 2011 Martin Matuska
24eda14cbcSMatt Macy  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
25eda14cbcSMatt Macy  */
26eda14cbcSMatt Macy 
27eda14cbcSMatt Macy #include <sys/zfs_context.h>
28eda14cbcSMatt Macy #include <sys/txg_impl.h>
29eda14cbcSMatt Macy #include <sys/dmu_impl.h>
30eda14cbcSMatt Macy #include <sys/spa_impl.h>
31eda14cbcSMatt Macy #include <sys/dmu_tx.h>
32eda14cbcSMatt Macy #include <sys/dsl_pool.h>
33eda14cbcSMatt Macy #include <sys/dsl_scan.h>
34eda14cbcSMatt Macy #include <sys/zil.h>
35eda14cbcSMatt Macy #include <sys/callb.h>
36eda14cbcSMatt Macy #include <sys/trace_zfs.h>
37eda14cbcSMatt Macy 
38eda14cbcSMatt Macy /*
39eda14cbcSMatt Macy  * ZFS Transaction Groups
40eda14cbcSMatt Macy  * ----------------------
41eda14cbcSMatt Macy  *
42eda14cbcSMatt Macy  * ZFS transaction groups are, as the name implies, groups of transactions
43eda14cbcSMatt Macy  * that act on persistent state. ZFS asserts consistency at the granularity of
44eda14cbcSMatt Macy  * these transaction groups. Each successive transaction group (txg) is
45eda14cbcSMatt Macy  * assigned a 64-bit consecutive identifier. There are three active
46eda14cbcSMatt Macy  * transaction group states: open, quiescing, or syncing. At any given time,
47eda14cbcSMatt Macy  * there may be an active txg associated with each state; each active txg may
48eda14cbcSMatt Macy  * either be processing, or blocked waiting to enter the next state. There may
49eda14cbcSMatt Macy  * be up to three active txgs, and there is always a txg in the open state
50eda14cbcSMatt Macy  * (though it may be blocked waiting to enter the quiescing state). In broad
51eda14cbcSMatt Macy  * strokes, transactions -- operations that change in-memory structures -- are
52eda14cbcSMatt Macy  * accepted into the txg in the open state, and are completed while the txg is
53eda14cbcSMatt Macy  * in the open or quiescing states. The accumulated changes are written to
54eda14cbcSMatt Macy  * disk in the syncing state.
55eda14cbcSMatt Macy  *
56eda14cbcSMatt Macy  * Open
57eda14cbcSMatt Macy  *
58eda14cbcSMatt Macy  * When a new txg becomes active, it first enters the open state. New
59eda14cbcSMatt Macy  * transactions -- updates to in-memory structures -- are assigned to the
60eda14cbcSMatt Macy  * currently open txg. There is always a txg in the open state so that ZFS can
61eda14cbcSMatt Macy  * accept new changes (though the txg may refuse new changes if it has hit
62eda14cbcSMatt Macy  * some limit). ZFS advances the open txg to the next state for a variety of
63eda14cbcSMatt Macy  * reasons such as it hitting a time or size threshold, or the execution of an
64eda14cbcSMatt Macy  * administrative action that must be completed in the syncing state.
65eda14cbcSMatt Macy  *
66eda14cbcSMatt Macy  * Quiescing
67eda14cbcSMatt Macy  *
68eda14cbcSMatt Macy  * After a txg exits the open state, it enters the quiescing state. The
69eda14cbcSMatt Macy  * quiescing state is intended to provide a buffer between accepting new
70eda14cbcSMatt Macy  * transactions in the open state and writing them out to stable storage in
71eda14cbcSMatt Macy  * the syncing state. While quiescing, transactions can continue their
72eda14cbcSMatt Macy  * operation without delaying either of the other states. Typically, a txg is
73eda14cbcSMatt Macy  * in the quiescing state very briefly since the operations are bounded by
74eda14cbcSMatt Macy  * software latencies rather than, say, slower I/O latencies. After all
75eda14cbcSMatt Macy  * transactions complete, the txg is ready to enter the next state.
76eda14cbcSMatt Macy  *
77eda14cbcSMatt Macy  * Syncing
78eda14cbcSMatt Macy  *
79eda14cbcSMatt Macy  * In the syncing state, the in-memory state built up during the open and (to
80eda14cbcSMatt Macy  * a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn, modify more data. For
82eda14cbcSMatt Macy  * example when we write new blocks, we need to allocate space for them; those
83eda14cbcSMatt Macy  * allocations modify metadata (space maps)... which themselves must be
84eda14cbcSMatt Macy  * written to stable storage. During the sync state, ZFS iterates, writing out
85eda14cbcSMatt Macy  * data until it converges and all in-memory changes have been written out.
86eda14cbcSMatt Macy  * The first such pass is the largest as it encompasses all the modified user
87eda14cbcSMatt Macy  * data (as opposed to filesystem metadata). Subsequent passes typically have
88eda14cbcSMatt Macy  * far less data to write as they consist exclusively of filesystem metadata.
89eda14cbcSMatt Macy  *
90eda14cbcSMatt Macy  * To ensure convergence, after a certain number of passes ZFS begins
91eda14cbcSMatt Macy  * overwriting locations on stable storage that had been allocated earlier in
92eda14cbcSMatt Macy  * the syncing state (and subsequently freed). ZFS usually allocates new
 * blocks to optimize for large, continuous writes. For the syncing state to
94eda14cbcSMatt Macy  * converge however it must complete a pass where no new blocks are allocated
95eda14cbcSMatt Macy  * since each allocation requires a modification of persistent metadata.
96eda14cbcSMatt Macy  * Further, to hasten convergence, after a prescribed number of passes, ZFS
97eda14cbcSMatt Macy  * also defers frees, and stops compressing.
98eda14cbcSMatt Macy  *
99eda14cbcSMatt Macy  * In addition to writing out user data, we must also execute synctasks during
100eda14cbcSMatt Macy  * the syncing context. A synctask is the mechanism by which some
101eda14cbcSMatt Macy  * administrative activities work such as creating and destroying snapshots or
102eda14cbcSMatt Macy  * datasets. Note that when a synctask is initiated it enters the open txg,
103eda14cbcSMatt Macy  * and ZFS then pushes that txg as quickly as possible to completion of the
104eda14cbcSMatt Macy  * syncing state in order to reduce the latency of the administrative
105eda14cbcSMatt Macy  * activity. To complete the syncing state, ZFS writes out a new uberblock,
106eda14cbcSMatt Macy  * the root of the tree of blocks that comprise all state stored on the ZFS
107eda14cbcSMatt Macy  * pool. Finally, if there is a quiesced txg waiting, we signal that it can
108eda14cbcSMatt Macy  * now transition to the syncing state.
109eda14cbcSMatt Macy  */
110eda14cbcSMatt Macy 
111da5137abSMartin Matuska static __attribute__((noreturn)) void txg_sync_thread(void *arg);
112da5137abSMartin Matuska static __attribute__((noreturn)) void txg_quiesce_thread(void *arg);
113eda14cbcSMatt Macy 
114be181ee2SMartin Matuska uint_t zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */
115eda14cbcSMatt Macy 
116eda14cbcSMatt Macy /*
117eda14cbcSMatt Macy  * Prepare the txg subsystem.
118eda14cbcSMatt Macy  */
119eda14cbcSMatt Macy void
120eda14cbcSMatt Macy txg_init(dsl_pool_t *dp, uint64_t txg)
121eda14cbcSMatt Macy {
122eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
123eda14cbcSMatt Macy 	int c;
124da5137abSMartin Matuska 	memset(tx, 0, sizeof (tx_state_t));
125eda14cbcSMatt Macy 
126eda14cbcSMatt Macy 	tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
127eda14cbcSMatt Macy 
128eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
129eda14cbcSMatt Macy 		int i;
130eda14cbcSMatt Macy 
131eda14cbcSMatt Macy 		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
132eda14cbcSMatt Macy 		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP,
133eda14cbcSMatt Macy 		    NULL);
134eda14cbcSMatt Macy 		for (i = 0; i < TXG_SIZE; i++) {
135eda14cbcSMatt Macy 			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
136eda14cbcSMatt Macy 			    NULL);
137eda14cbcSMatt Macy 			list_create(&tx->tx_cpu[c].tc_callbacks[i],
138eda14cbcSMatt Macy 			    sizeof (dmu_tx_callback_t),
139eda14cbcSMatt Macy 			    offsetof(dmu_tx_callback_t, dcb_node));
140eda14cbcSMatt Macy 		}
141eda14cbcSMatt Macy 	}
142eda14cbcSMatt Macy 
143eda14cbcSMatt Macy 	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
144eda14cbcSMatt Macy 
145eda14cbcSMatt Macy 	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
146eda14cbcSMatt Macy 	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
147eda14cbcSMatt Macy 	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
148eda14cbcSMatt Macy 	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
149eda14cbcSMatt Macy 	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
150eda14cbcSMatt Macy 
151eda14cbcSMatt Macy 	tx->tx_open_txg = txg;
152eda14cbcSMatt Macy }
153eda14cbcSMatt Macy 
154eda14cbcSMatt Macy /*
155eda14cbcSMatt Macy  * Close down the txg subsystem.
156eda14cbcSMatt Macy  */
157eda14cbcSMatt Macy void
158eda14cbcSMatt Macy txg_fini(dsl_pool_t *dp)
159eda14cbcSMatt Macy {
160eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
161eda14cbcSMatt Macy 	int c;
162eda14cbcSMatt Macy 
163eda14cbcSMatt Macy 	ASSERT0(tx->tx_threads);
164eda14cbcSMatt Macy 
165eda14cbcSMatt Macy 	mutex_destroy(&tx->tx_sync_lock);
166eda14cbcSMatt Macy 
167eda14cbcSMatt Macy 	cv_destroy(&tx->tx_sync_more_cv);
168eda14cbcSMatt Macy 	cv_destroy(&tx->tx_sync_done_cv);
169eda14cbcSMatt Macy 	cv_destroy(&tx->tx_quiesce_more_cv);
170eda14cbcSMatt Macy 	cv_destroy(&tx->tx_quiesce_done_cv);
171eda14cbcSMatt Macy 	cv_destroy(&tx->tx_exit_cv);
172eda14cbcSMatt Macy 
173eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
174eda14cbcSMatt Macy 		int i;
175eda14cbcSMatt Macy 
176eda14cbcSMatt Macy 		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
177eda14cbcSMatt Macy 		mutex_destroy(&tx->tx_cpu[c].tc_lock);
178eda14cbcSMatt Macy 		for (i = 0; i < TXG_SIZE; i++) {
179eda14cbcSMatt Macy 			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
180eda14cbcSMatt Macy 			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
181eda14cbcSMatt Macy 		}
182eda14cbcSMatt Macy 	}
183eda14cbcSMatt Macy 
184eda14cbcSMatt Macy 	if (tx->tx_commit_cb_taskq != NULL)
185eda14cbcSMatt Macy 		taskq_destroy(tx->tx_commit_cb_taskq);
186eda14cbcSMatt Macy 
187eda14cbcSMatt Macy 	vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
188eda14cbcSMatt Macy 
189da5137abSMartin Matuska 	memset(tx, 0, sizeof (tx_state_t));
190eda14cbcSMatt Macy }
191eda14cbcSMatt Macy 
192eda14cbcSMatt Macy /*
193eda14cbcSMatt Macy  * Start syncing transaction groups.
194eda14cbcSMatt Macy  */
195eda14cbcSMatt Macy void
196eda14cbcSMatt Macy txg_sync_start(dsl_pool_t *dp)
197eda14cbcSMatt Macy {
198eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
199eda14cbcSMatt Macy 
200eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
201eda14cbcSMatt Macy 
202eda14cbcSMatt Macy 	dprintf("pool %p\n", dp);
203eda14cbcSMatt Macy 
204eda14cbcSMatt Macy 	ASSERT0(tx->tx_threads);
205eda14cbcSMatt Macy 
206eda14cbcSMatt Macy 	tx->tx_threads = 2;
207eda14cbcSMatt Macy 
208eda14cbcSMatt Macy 	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
209eda14cbcSMatt Macy 	    dp, 0, &p0, TS_RUN, defclsyspri);
210eda14cbcSMatt Macy 
211eda14cbcSMatt Macy 	/*
212eda14cbcSMatt Macy 	 * The sync thread can need a larger-than-default stack size on
213eda14cbcSMatt Macy 	 * 32-bit x86.  This is due in part to nested pools and
214eda14cbcSMatt Macy 	 * scrub_visitbp() recursion.
215eda14cbcSMatt Macy 	 */
216eda14cbcSMatt Macy 	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
217eda14cbcSMatt Macy 	    dp, 0, &p0, TS_RUN, defclsyspri);
218eda14cbcSMatt Macy 
219eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
220eda14cbcSMatt Macy }
221eda14cbcSMatt Macy 
222eda14cbcSMatt Macy static void
223eda14cbcSMatt Macy txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
224eda14cbcSMatt Macy {
225eda14cbcSMatt Macy 	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
226eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
227eda14cbcSMatt Macy }
228eda14cbcSMatt Macy 
229eda14cbcSMatt Macy static void
230eda14cbcSMatt Macy txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
231eda14cbcSMatt Macy {
232eda14cbcSMatt Macy 	ASSERT(*tpp != NULL);
233eda14cbcSMatt Macy 	*tpp = NULL;
234eda14cbcSMatt Macy 	tx->tx_threads--;
235eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_exit_cv);
236eda14cbcSMatt Macy 	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
237eda14cbcSMatt Macy 	thread_exit();
238eda14cbcSMatt Macy }
239eda14cbcSMatt Macy 
240eda14cbcSMatt Macy static void
241eda14cbcSMatt Macy txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
242eda14cbcSMatt Macy {
243eda14cbcSMatt Macy 	CALLB_CPR_SAFE_BEGIN(cpr);
244eda14cbcSMatt Macy 
245eda14cbcSMatt Macy 	if (time) {
2462c48331dSMatt Macy 		(void) cv_timedwait_idle(cv, &tx->tx_sync_lock,
247eda14cbcSMatt Macy 		    ddi_get_lbolt() + time);
248eda14cbcSMatt Macy 	} else {
2492c48331dSMatt Macy 		cv_wait_idle(cv, &tx->tx_sync_lock);
250eda14cbcSMatt Macy 	}
251eda14cbcSMatt Macy 
252eda14cbcSMatt Macy 	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
253eda14cbcSMatt Macy }
254eda14cbcSMatt Macy 
255eda14cbcSMatt Macy /*
256eda14cbcSMatt Macy  * Stop syncing transaction groups.
257eda14cbcSMatt Macy  */
258eda14cbcSMatt Macy void
259eda14cbcSMatt Macy txg_sync_stop(dsl_pool_t *dp)
260eda14cbcSMatt Macy {
261eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
262eda14cbcSMatt Macy 
263eda14cbcSMatt Macy 	dprintf("pool %p\n", dp);
264eda14cbcSMatt Macy 	/*
265eda14cbcSMatt Macy 	 * Finish off any work in progress.
266eda14cbcSMatt Macy 	 */
267eda14cbcSMatt Macy 	ASSERT3U(tx->tx_threads, ==, 2);
268eda14cbcSMatt Macy 
269eda14cbcSMatt Macy 	/*
270eda14cbcSMatt Macy 	 * We need to ensure that we've vacated the deferred metaslab trees.
271eda14cbcSMatt Macy 	 */
272eda14cbcSMatt Macy 	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
273eda14cbcSMatt Macy 
274eda14cbcSMatt Macy 	/*
275eda14cbcSMatt Macy 	 * Wake all sync threads and wait for them to die.
276eda14cbcSMatt Macy 	 */
277eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
278eda14cbcSMatt Macy 
279eda14cbcSMatt Macy 	ASSERT3U(tx->tx_threads, ==, 2);
280eda14cbcSMatt Macy 
281eda14cbcSMatt Macy 	tx->tx_exiting = 1;
282eda14cbcSMatt Macy 
283eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_quiesce_more_cv);
284eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_quiesce_done_cv);
285eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_sync_more_cv);
286eda14cbcSMatt Macy 
287eda14cbcSMatt Macy 	while (tx->tx_threads != 0)
288eda14cbcSMatt Macy 		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
289eda14cbcSMatt Macy 
290eda14cbcSMatt Macy 	tx->tx_exiting = 0;
291eda14cbcSMatt Macy 
292eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
293eda14cbcSMatt Macy }
294eda14cbcSMatt Macy 
295184c1b94SMartin Matuska /*
296184c1b94SMartin Matuska  * Get a handle on the currently open txg and keep it open.
297184c1b94SMartin Matuska  *
298184c1b94SMartin Matuska  * The txg is guaranteed to stay open until txg_rele_to_quiesce() is called for
299184c1b94SMartin Matuska  * the handle. Once txg_rele_to_quiesce() has been called, the txg stays
300184c1b94SMartin Matuska  * in quiescing state until txg_rele_to_sync() is called for the handle.
301184c1b94SMartin Matuska  *
302184c1b94SMartin Matuska  * It is guaranteed that subsequent calls return monotonically increasing
303184c1b94SMartin Matuska  * txgs for the same dsl_pool_t. Of course this is not strong monotonicity,
304184c1b94SMartin Matuska  * because the same txg can be returned multiple times in a row. This
305184c1b94SMartin Matuska  * guarantee holds both for subsequent calls from one thread and for multiple
306184c1b94SMartin Matuska  * threads. For example, it is impossible to observe the following sequence
307184c1b94SMartin Matuska  * of events:
308184c1b94SMartin Matuska  *
309184c1b94SMartin Matuska  *           Thread 1                            Thread 2
310184c1b94SMartin Matuska  *
311184c1b94SMartin Matuska  *   1 <- txg_hold_open(P, ...)
312184c1b94SMartin Matuska  *                                       2 <- txg_hold_open(P, ...)
313184c1b94SMartin Matuska  *   1 <- txg_hold_open(P, ...)
314184c1b94SMartin Matuska  *
315184c1b94SMartin Matuska  */
316eda14cbcSMatt Macy uint64_t
317eda14cbcSMatt Macy txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
318eda14cbcSMatt Macy {
319eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
320eda14cbcSMatt Macy 	tx_cpu_t *tc;
321eda14cbcSMatt Macy 	uint64_t txg;
322eda14cbcSMatt Macy 
323eda14cbcSMatt Macy 	/*
324eda14cbcSMatt Macy 	 * It appears the processor id is simply used as a "random"
325eda14cbcSMatt Macy 	 * number to index into the array, and there isn't any other
326eda14cbcSMatt Macy 	 * significance to the chosen tx_cpu. Because.. Why not use
327eda14cbcSMatt Macy 	 * the current cpu to index into the array?
328eda14cbcSMatt Macy 	 */
3297877fdebSMatt Macy 	tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE];
330eda14cbcSMatt Macy 
331eda14cbcSMatt Macy 	mutex_enter(&tc->tc_open_lock);
332eda14cbcSMatt Macy 	txg = tx->tx_open_txg;
333eda14cbcSMatt Macy 
334eda14cbcSMatt Macy 	mutex_enter(&tc->tc_lock);
335eda14cbcSMatt Macy 	tc->tc_count[txg & TXG_MASK]++;
336eda14cbcSMatt Macy 	mutex_exit(&tc->tc_lock);
337eda14cbcSMatt Macy 
338eda14cbcSMatt Macy 	th->th_cpu = tc;
339eda14cbcSMatt Macy 	th->th_txg = txg;
340eda14cbcSMatt Macy 
341eda14cbcSMatt Macy 	return (txg);
342eda14cbcSMatt Macy }
343eda14cbcSMatt Macy 
344eda14cbcSMatt Macy void
345eda14cbcSMatt Macy txg_rele_to_quiesce(txg_handle_t *th)
346eda14cbcSMatt Macy {
347eda14cbcSMatt Macy 	tx_cpu_t *tc = th->th_cpu;
348eda14cbcSMatt Macy 
349eda14cbcSMatt Macy 	ASSERT(!MUTEX_HELD(&tc->tc_lock));
350eda14cbcSMatt Macy 	mutex_exit(&tc->tc_open_lock);
351eda14cbcSMatt Macy }
352eda14cbcSMatt Macy 
353eda14cbcSMatt Macy void
354eda14cbcSMatt Macy txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
355eda14cbcSMatt Macy {
356eda14cbcSMatt Macy 	tx_cpu_t *tc = th->th_cpu;
357eda14cbcSMatt Macy 	int g = th->th_txg & TXG_MASK;
358eda14cbcSMatt Macy 
359eda14cbcSMatt Macy 	mutex_enter(&tc->tc_lock);
360eda14cbcSMatt Macy 	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
361eda14cbcSMatt Macy 	mutex_exit(&tc->tc_lock);
362eda14cbcSMatt Macy }
363eda14cbcSMatt Macy 
364eda14cbcSMatt Macy void
365eda14cbcSMatt Macy txg_rele_to_sync(txg_handle_t *th)
366eda14cbcSMatt Macy {
367eda14cbcSMatt Macy 	tx_cpu_t *tc = th->th_cpu;
368eda14cbcSMatt Macy 	int g = th->th_txg & TXG_MASK;
369eda14cbcSMatt Macy 
370eda14cbcSMatt Macy 	mutex_enter(&tc->tc_lock);
371eda14cbcSMatt Macy 	ASSERT(tc->tc_count[g] != 0);
372eda14cbcSMatt Macy 	if (--tc->tc_count[g] == 0)
373eda14cbcSMatt Macy 		cv_broadcast(&tc->tc_cv[g]);
374eda14cbcSMatt Macy 	mutex_exit(&tc->tc_lock);
375eda14cbcSMatt Macy 
376eda14cbcSMatt Macy 	th->th_cpu = NULL;	/* defensive */
377eda14cbcSMatt Macy }
378eda14cbcSMatt Macy 
379eda14cbcSMatt Macy /*
380eda14cbcSMatt Macy  * Blocks until all transactions in the group are committed.
381eda14cbcSMatt Macy  *
382eda14cbcSMatt Macy  * On return, the transaction group has reached a stable state in which it can
383eda14cbcSMatt Macy  * then be passed off to the syncing context.
384eda14cbcSMatt Macy  */
385eda14cbcSMatt Macy static void
386eda14cbcSMatt Macy txg_quiesce(dsl_pool_t *dp, uint64_t txg)
387eda14cbcSMatt Macy {
388eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
389eda14cbcSMatt Macy 	uint64_t tx_open_time;
390eda14cbcSMatt Macy 	int g = txg & TXG_MASK;
391eda14cbcSMatt Macy 	int c;
392eda14cbcSMatt Macy 
393eda14cbcSMatt Macy 	/*
394eda14cbcSMatt Macy 	 * Grab all tc_open_locks so nobody else can get into this txg.
395eda14cbcSMatt Macy 	 */
396eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++)
397eda14cbcSMatt Macy 		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
398eda14cbcSMatt Macy 
399eda14cbcSMatt Macy 	ASSERT(txg == tx->tx_open_txg);
400eda14cbcSMatt Macy 	tx->tx_open_txg++;
401eda14cbcSMatt Macy 	tx->tx_open_time = tx_open_time = gethrtime();
402eda14cbcSMatt Macy 
403eda14cbcSMatt Macy 	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
404eda14cbcSMatt Macy 	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
405eda14cbcSMatt Macy 
406eda14cbcSMatt Macy 	/*
407eda14cbcSMatt Macy 	 * Now that we've incremented tx_open_txg, we can let threads
408eda14cbcSMatt Macy 	 * enter the next transaction group.
409eda14cbcSMatt Macy 	 */
410eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++)
411eda14cbcSMatt Macy 		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
412eda14cbcSMatt Macy 
413eda14cbcSMatt Macy 	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
414eda14cbcSMatt Macy 	spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
415eda14cbcSMatt Macy 
416eda14cbcSMatt Macy 	/*
417184c1b94SMartin Matuska 	 * Quiesce the transaction group by waiting for everyone to
418184c1b94SMartin Matuska 	 * call txg_rele_to_sync() for their open transaction handles.
419eda14cbcSMatt Macy 	 */
420eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
421eda14cbcSMatt Macy 		tx_cpu_t *tc = &tx->tx_cpu[c];
422eda14cbcSMatt Macy 		mutex_enter(&tc->tc_lock);
423eda14cbcSMatt Macy 		while (tc->tc_count[g] != 0)
424eda14cbcSMatt Macy 			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
425eda14cbcSMatt Macy 		mutex_exit(&tc->tc_lock);
426eda14cbcSMatt Macy 	}
427eda14cbcSMatt Macy 
428eda14cbcSMatt Macy 	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
429eda14cbcSMatt Macy }
430eda14cbcSMatt Macy 
431eda14cbcSMatt Macy static void
432bb2d13b6SMartin Matuska txg_do_callbacks(void *cb_list)
433eda14cbcSMatt Macy {
434eda14cbcSMatt Macy 	dmu_tx_do_callbacks(cb_list, 0);
435eda14cbcSMatt Macy 
436eda14cbcSMatt Macy 	list_destroy(cb_list);
437eda14cbcSMatt Macy 
438eda14cbcSMatt Macy 	kmem_free(cb_list, sizeof (list_t));
439eda14cbcSMatt Macy }
440eda14cbcSMatt Macy 
441eda14cbcSMatt Macy /*
442eda14cbcSMatt Macy  * Dispatch the commit callbacks registered on this txg to worker threads.
443eda14cbcSMatt Macy  *
444eda14cbcSMatt Macy  * If no callbacks are registered for a given TXG, nothing happens.
445eda14cbcSMatt Macy  * This function creates a taskq for the associated pool, if needed.
446eda14cbcSMatt Macy  */
447eda14cbcSMatt Macy static void
448eda14cbcSMatt Macy txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
449eda14cbcSMatt Macy {
450eda14cbcSMatt Macy 	int c;
451eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
452eda14cbcSMatt Macy 	list_t *cb_list;
453eda14cbcSMatt Macy 
454eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
455eda14cbcSMatt Macy 		tx_cpu_t *tc = &tx->tx_cpu[c];
456eda14cbcSMatt Macy 		/*
457eda14cbcSMatt Macy 		 * No need to lock tx_cpu_t at this point, since this can
458eda14cbcSMatt Macy 		 * only be called once a txg has been synced.
459eda14cbcSMatt Macy 		 */
460eda14cbcSMatt Macy 
461eda14cbcSMatt Macy 		int g = txg & TXG_MASK;
462eda14cbcSMatt Macy 
463eda14cbcSMatt Macy 		if (list_is_empty(&tc->tc_callbacks[g]))
464eda14cbcSMatt Macy 			continue;
465eda14cbcSMatt Macy 
466eda14cbcSMatt Macy 		if (tx->tx_commit_cb_taskq == NULL) {
467eda14cbcSMatt Macy 			/*
468eda14cbcSMatt Macy 			 * Commit callback taskq hasn't been created yet.
469eda14cbcSMatt Macy 			 */
470eda14cbcSMatt Macy 			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
4717877fdebSMatt Macy 			    100, defclsyspri, boot_ncpus, boot_ncpus * 2,
4727877fdebSMatt Macy 			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
4737877fdebSMatt Macy 			    TASKQ_THREADS_CPU_PCT);
474eda14cbcSMatt Macy 		}
475eda14cbcSMatt Macy 
476eda14cbcSMatt Macy 		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
477eda14cbcSMatt Macy 		list_create(cb_list, sizeof (dmu_tx_callback_t),
478eda14cbcSMatt Macy 		    offsetof(dmu_tx_callback_t, dcb_node));
479eda14cbcSMatt Macy 
480eda14cbcSMatt Macy 		list_move_tail(cb_list, &tc->tc_callbacks[g]);
481eda14cbcSMatt Macy 
482bb2d13b6SMartin Matuska 		(void) taskq_dispatch(tx->tx_commit_cb_taskq,
483eda14cbcSMatt Macy 		    txg_do_callbacks, cb_list, TQ_SLEEP);
484eda14cbcSMatt Macy 	}
485eda14cbcSMatt Macy }
486eda14cbcSMatt Macy 
487eda14cbcSMatt Macy /*
488eda14cbcSMatt Macy  * Wait for pending commit callbacks of already-synced transactions to finish
489eda14cbcSMatt Macy  * processing.
490eda14cbcSMatt Macy  * Calling this function from within a commit callback will deadlock.
491eda14cbcSMatt Macy  */
492eda14cbcSMatt Macy void
493eda14cbcSMatt Macy txg_wait_callbacks(dsl_pool_t *dp)
494eda14cbcSMatt Macy {
495eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
496eda14cbcSMatt Macy 
497eda14cbcSMatt Macy 	if (tx->tx_commit_cb_taskq != NULL)
498eda14cbcSMatt Macy 		taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
499eda14cbcSMatt Macy }
500eda14cbcSMatt Macy 
501eda14cbcSMatt Macy static boolean_t
502eda14cbcSMatt Macy txg_is_quiescing(dsl_pool_t *dp)
503eda14cbcSMatt Macy {
504eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
505eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
506eda14cbcSMatt Macy 	return (tx->tx_quiescing_txg != 0);
507eda14cbcSMatt Macy }
508eda14cbcSMatt Macy 
509eda14cbcSMatt Macy static boolean_t
510eda14cbcSMatt Macy txg_has_quiesced_to_sync(dsl_pool_t *dp)
511eda14cbcSMatt Macy {
512eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
513eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
514eda14cbcSMatt Macy 	return (tx->tx_quiesced_txg != 0);
515eda14cbcSMatt Macy }
516eda14cbcSMatt Macy 
517da5137abSMartin Matuska static __attribute__((noreturn)) void
518eda14cbcSMatt Macy txg_sync_thread(void *arg)
519eda14cbcSMatt Macy {
520eda14cbcSMatt Macy 	dsl_pool_t *dp = arg;
521eda14cbcSMatt Macy 	spa_t *spa = dp->dp_spa;
522eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
523eda14cbcSMatt Macy 	callb_cpr_t cpr;
524eda14cbcSMatt Macy 	clock_t start, delta;
525eda14cbcSMatt Macy 
526eda14cbcSMatt Macy 	(void) spl_fstrans_mark();
527eda14cbcSMatt Macy 	txg_thread_enter(tx, &cpr);
528eda14cbcSMatt Macy 
529eda14cbcSMatt Macy 	start = delta = 0;
530eda14cbcSMatt Macy 	for (;;) {
531eda14cbcSMatt Macy 		clock_t timeout = zfs_txg_timeout * hz;
532eda14cbcSMatt Macy 		clock_t timer;
533eda14cbcSMatt Macy 		uint64_t txg;
534eda14cbcSMatt Macy 
535eda14cbcSMatt Macy 		/*
536eda14cbcSMatt Macy 		 * We sync when we're scanning, there's someone waiting
537eda14cbcSMatt Macy 		 * on us, or the quiesce thread has handed off a txg to
538eda14cbcSMatt Macy 		 * us, or we have reached our timeout.
539eda14cbcSMatt Macy 		 */
540eda14cbcSMatt Macy 		timer = (delta >= timeout ? 0 : timeout - delta);
541eda14cbcSMatt Macy 		while (!dsl_scan_active(dp->dp_scan) &&
542eda14cbcSMatt Macy 		    !tx->tx_exiting && timer > 0 &&
543eda14cbcSMatt Macy 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
5447cd22ac4SMartin Matuska 		    !txg_has_quiesced_to_sync(dp)) {
545eda14cbcSMatt Macy 			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
54633b8c039SMartin Matuska 			    (u_longlong_t)tx->tx_synced_txg,
54733b8c039SMartin Matuska 			    (u_longlong_t)tx->tx_sync_txg_waiting, dp);
548eda14cbcSMatt Macy 			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
549eda14cbcSMatt Macy 			delta = ddi_get_lbolt() - start;
550eda14cbcSMatt Macy 			timer = (delta > timeout ? 0 : timeout - delta);
551eda14cbcSMatt Macy 		}
552eda14cbcSMatt Macy 
553eda14cbcSMatt Macy 		/*
554*b985c9caSMartin Matuska 		 * When we're suspended, nothing should be changing and for
555*b985c9caSMartin Matuska 		 * MMP we don't want to bump anything that would make it
556*b985c9caSMartin Matuska 		 * harder to detect if another host is changing it when
557*b985c9caSMartin Matuska 		 * resuming after a MMP suspend.
558*b985c9caSMartin Matuska 		 */
559*b985c9caSMartin Matuska 		if (spa_suspended(spa))
560*b985c9caSMartin Matuska 			continue;
561*b985c9caSMartin Matuska 
562*b985c9caSMartin Matuska 		/*
563eda14cbcSMatt Macy 		 * Wait until the quiesce thread hands off a txg to us,
564eda14cbcSMatt Macy 		 * prompting it to do so if necessary.
565eda14cbcSMatt Macy 		 */
566eda14cbcSMatt Macy 		while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
5677cd22ac4SMartin Matuska 			if (txg_is_quiescing(dp)) {
5687cd22ac4SMartin Matuska 				txg_thread_wait(tx, &cpr,
5697cd22ac4SMartin Matuska 				    &tx->tx_quiesce_done_cv, 0);
5707cd22ac4SMartin Matuska 				continue;
5717cd22ac4SMartin Matuska 			}
572eda14cbcSMatt Macy 			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
573eda14cbcSMatt Macy 				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
574eda14cbcSMatt Macy 			cv_broadcast(&tx->tx_quiesce_more_cv);
575eda14cbcSMatt Macy 			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
576eda14cbcSMatt Macy 		}
577eda14cbcSMatt Macy 
578eda14cbcSMatt Macy 		if (tx->tx_exiting)
579eda14cbcSMatt Macy 			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
580eda14cbcSMatt Macy 
581eda14cbcSMatt Macy 		/*
582eda14cbcSMatt Macy 		 * Consume the quiesced txg which has been handed off to
583eda14cbcSMatt Macy 		 * us.  This may cause the quiescing thread to now be
584eda14cbcSMatt Macy 		 * able to quiesce another txg, so we must signal it.
585eda14cbcSMatt Macy 		 */
586eda14cbcSMatt Macy 		ASSERT(tx->tx_quiesced_txg != 0);
587eda14cbcSMatt Macy 		txg = tx->tx_quiesced_txg;
588eda14cbcSMatt Macy 		tx->tx_quiesced_txg = 0;
589eda14cbcSMatt Macy 		tx->tx_syncing_txg = txg;
590eda14cbcSMatt Macy 		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
591eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_quiesce_more_cv);
592eda14cbcSMatt Macy 
593eda14cbcSMatt Macy 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
59433b8c039SMartin Matuska 		    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
59533b8c039SMartin Matuska 		    (u_longlong_t)tx->tx_sync_txg_waiting);
596eda14cbcSMatt Macy 		mutex_exit(&tx->tx_sync_lock);
597eda14cbcSMatt Macy 
598eda14cbcSMatt Macy 		txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
599eda14cbcSMatt Macy 		start = ddi_get_lbolt();
600eda14cbcSMatt Macy 		spa_sync(spa, txg);
601eda14cbcSMatt Macy 		delta = ddi_get_lbolt() - start;
602eda14cbcSMatt Macy 		spa_txg_history_fini_io(spa, ts);
603eda14cbcSMatt Macy 
604eda14cbcSMatt Macy 		mutex_enter(&tx->tx_sync_lock);
605eda14cbcSMatt Macy 		tx->tx_synced_txg = txg;
606eda14cbcSMatt Macy 		tx->tx_syncing_txg = 0;
607eda14cbcSMatt Macy 		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
608eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_sync_done_cv);
609eda14cbcSMatt Macy 
610eda14cbcSMatt Macy 		/*
611eda14cbcSMatt Macy 		 * Dispatch commit callbacks to worker threads.
612eda14cbcSMatt Macy 		 */
613eda14cbcSMatt Macy 		txg_dispatch_callbacks(dp, txg);
614eda14cbcSMatt Macy 	}
615eda14cbcSMatt Macy }
616eda14cbcSMatt Macy 
617da5137abSMartin Matuska static __attribute__((noreturn)) void
618eda14cbcSMatt Macy txg_quiesce_thread(void *arg)
619eda14cbcSMatt Macy {
620eda14cbcSMatt Macy 	dsl_pool_t *dp = arg;
621eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
622eda14cbcSMatt Macy 	callb_cpr_t cpr;
623eda14cbcSMatt Macy 
624eda14cbcSMatt Macy 	txg_thread_enter(tx, &cpr);
625eda14cbcSMatt Macy 
626eda14cbcSMatt Macy 	for (;;) {
627eda14cbcSMatt Macy 		uint64_t txg;
628eda14cbcSMatt Macy 
629eda14cbcSMatt Macy 		/*
630eda14cbcSMatt Macy 		 * We quiesce when there's someone waiting on us.
631eda14cbcSMatt Macy 		 * However, we can only have one txg in "quiescing" or
632eda14cbcSMatt Macy 		 * "quiesced, waiting to sync" state.  So we wait until
633eda14cbcSMatt Macy 		 * the "quiesced, waiting to sync" txg has been consumed
634eda14cbcSMatt Macy 		 * by the sync thread.
635eda14cbcSMatt Macy 		 */
636eda14cbcSMatt Macy 		while (!tx->tx_exiting &&
637eda14cbcSMatt Macy 		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
638eda14cbcSMatt Macy 		    txg_has_quiesced_to_sync(dp)))
639eda14cbcSMatt Macy 			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
640eda14cbcSMatt Macy 
641eda14cbcSMatt Macy 		if (tx->tx_exiting)
642eda14cbcSMatt Macy 			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
643eda14cbcSMatt Macy 
644eda14cbcSMatt Macy 		txg = tx->tx_open_txg;
645eda14cbcSMatt Macy 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
64633b8c039SMartin Matuska 		    (u_longlong_t)txg,
64733b8c039SMartin Matuska 		    (u_longlong_t)tx->tx_quiesce_txg_waiting,
64833b8c039SMartin Matuska 		    (u_longlong_t)tx->tx_sync_txg_waiting);
649eda14cbcSMatt Macy 		tx->tx_quiescing_txg = txg;
650eda14cbcSMatt Macy 
651eda14cbcSMatt Macy 		mutex_exit(&tx->tx_sync_lock);
652eda14cbcSMatt Macy 		txg_quiesce(dp, txg);
653eda14cbcSMatt Macy 		mutex_enter(&tx->tx_sync_lock);
654eda14cbcSMatt Macy 
655eda14cbcSMatt Macy 		/*
656eda14cbcSMatt Macy 		 * Hand this txg off to the sync thread.
657eda14cbcSMatt Macy 		 */
65833b8c039SMartin Matuska 		dprintf("quiesce done, handing off txg %llu\n",
65933b8c039SMartin Matuska 		    (u_longlong_t)txg);
660eda14cbcSMatt Macy 		tx->tx_quiescing_txg = 0;
661eda14cbcSMatt Macy 		tx->tx_quiesced_txg = txg;
662eda14cbcSMatt Macy 		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
663eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_sync_more_cv);
664eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_quiesce_done_cv);
665eda14cbcSMatt Macy 	}
666eda14cbcSMatt Macy }
667eda14cbcSMatt Macy 
668eda14cbcSMatt Macy /*
669eda14cbcSMatt Macy  * Delay this thread by delay nanoseconds if we are still in the open
670eda14cbcSMatt Macy  * transaction group and there is already a waiting txg quiescing or quiesced.
671eda14cbcSMatt Macy  * Abort the delay if this txg stalls or enters the quiescing state.
672eda14cbcSMatt Macy  */
673eda14cbcSMatt Macy void
674eda14cbcSMatt Macy txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
675eda14cbcSMatt Macy {
676eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
677eda14cbcSMatt Macy 	hrtime_t start = gethrtime();
678eda14cbcSMatt Macy 
679eda14cbcSMatt Macy 	/* don't delay if this txg could transition to quiescing immediately */
680eda14cbcSMatt Macy 	if (tx->tx_open_txg > txg ||
681eda14cbcSMatt Macy 	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
682eda14cbcSMatt Macy 		return;
683eda14cbcSMatt Macy 
684eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
685eda14cbcSMatt Macy 	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
686eda14cbcSMatt Macy 		mutex_exit(&tx->tx_sync_lock);
687eda14cbcSMatt Macy 		return;
688eda14cbcSMatt Macy 	}
689eda14cbcSMatt Macy 
690eda14cbcSMatt Macy 	while (gethrtime() - start < delay &&
691eda14cbcSMatt Macy 	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
692eda14cbcSMatt Macy 		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
693eda14cbcSMatt Macy 		    &tx->tx_sync_lock, delay, resolution, 0);
694eda14cbcSMatt Macy 	}
695eda14cbcSMatt Macy 
696eda14cbcSMatt Macy 	DMU_TX_STAT_BUMP(dmu_tx_delay);
697eda14cbcSMatt Macy 
698eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
699eda14cbcSMatt Macy }
700eda14cbcSMatt Macy 
701eda14cbcSMatt Macy static boolean_t
702eda14cbcSMatt Macy txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
703eda14cbcSMatt Macy {
704eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
705eda14cbcSMatt Macy 
706eda14cbcSMatt Macy 	ASSERT(!dsl_pool_config_held(dp));
707eda14cbcSMatt Macy 
708eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
709eda14cbcSMatt Macy 	ASSERT3U(tx->tx_threads, ==, 2);
710eda14cbcSMatt Macy 	if (txg == 0)
711eda14cbcSMatt Macy 		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
712eda14cbcSMatt Macy 	if (tx->tx_sync_txg_waiting < txg)
713eda14cbcSMatt Macy 		tx->tx_sync_txg_waiting = txg;
714eda14cbcSMatt Macy 	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
71533b8c039SMartin Matuska 	    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
71633b8c039SMartin Matuska 	    (u_longlong_t)tx->tx_sync_txg_waiting);
717eda14cbcSMatt Macy 	while (tx->tx_synced_txg < txg) {
718eda14cbcSMatt Macy 		dprintf("broadcasting sync more "
719eda14cbcSMatt Macy 		    "tx_synced=%llu waiting=%llu dp=%px\n",
72033b8c039SMartin Matuska 		    (u_longlong_t)tx->tx_synced_txg,
72133b8c039SMartin Matuska 		    (u_longlong_t)tx->tx_sync_txg_waiting, dp);
722eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_sync_more_cv);
723eda14cbcSMatt Macy 		if (wait_sig) {
724eda14cbcSMatt Macy 			/*
725eda14cbcSMatt Macy 			 * Condition wait here but stop if the thread receives a
726eda14cbcSMatt Macy 			 * signal. The caller may call txg_wait_synced*() again
727eda14cbcSMatt Macy 			 * to resume waiting for this txg.
728eda14cbcSMatt Macy 			 */
729eda14cbcSMatt Macy 			if (cv_wait_io_sig(&tx->tx_sync_done_cv,
730eda14cbcSMatt Macy 			    &tx->tx_sync_lock) == 0) {
731eda14cbcSMatt Macy 				mutex_exit(&tx->tx_sync_lock);
732eda14cbcSMatt Macy 				return (B_TRUE);
733eda14cbcSMatt Macy 			}
734eda14cbcSMatt Macy 		} else {
735eda14cbcSMatt Macy 			cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
736eda14cbcSMatt Macy 		}
737eda14cbcSMatt Macy 	}
738eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
739eda14cbcSMatt Macy 	return (B_FALSE);
740eda14cbcSMatt Macy }
741eda14cbcSMatt Macy 
742eda14cbcSMatt Macy void
743eda14cbcSMatt Macy txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
744eda14cbcSMatt Macy {
745eda14cbcSMatt Macy 	VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
746eda14cbcSMatt Macy }
747eda14cbcSMatt Macy 
748eda14cbcSMatt Macy /*
749eda14cbcSMatt Macy  * Similar to a txg_wait_synced but it can be interrupted from a signal.
750eda14cbcSMatt Macy  * Returns B_TRUE if the thread was signaled while waiting.
751eda14cbcSMatt Macy  */
752eda14cbcSMatt Macy boolean_t
753eda14cbcSMatt Macy txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
754eda14cbcSMatt Macy {
755eda14cbcSMatt Macy 	return (txg_wait_synced_impl(dp, txg, B_TRUE));
756eda14cbcSMatt Macy }
757eda14cbcSMatt Macy 
758eda14cbcSMatt Macy /*
759eda14cbcSMatt Macy  * Wait for the specified open transaction group.  Set should_quiesce
760eda14cbcSMatt Macy  * when the current open txg should be quiesced immediately.
761eda14cbcSMatt Macy  */
762eda14cbcSMatt Macy void
763eda14cbcSMatt Macy txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
764eda14cbcSMatt Macy {
765eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
766eda14cbcSMatt Macy 
767eda14cbcSMatt Macy 	ASSERT(!dsl_pool_config_held(dp));
768eda14cbcSMatt Macy 
769eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
770eda14cbcSMatt Macy 	ASSERT3U(tx->tx_threads, ==, 2);
771eda14cbcSMatt Macy 	if (txg == 0)
772eda14cbcSMatt Macy 		txg = tx->tx_open_txg + 1;
773eda14cbcSMatt Macy 	if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
774eda14cbcSMatt Macy 		tx->tx_quiesce_txg_waiting = txg;
775eda14cbcSMatt Macy 	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
77633b8c039SMartin Matuska 	    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
77733b8c039SMartin Matuska 	    (u_longlong_t)tx->tx_sync_txg_waiting);
778eda14cbcSMatt Macy 	while (tx->tx_open_txg < txg) {
779eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_quiesce_more_cv);
780eda14cbcSMatt Macy 		/*
781eda14cbcSMatt Macy 		 * Callers setting should_quiesce will use cv_wait_io() and
782eda14cbcSMatt Macy 		 * be accounted for as iowait time.  Otherwise, the caller is
783eda14cbcSMatt Macy 		 * understood to be idle and cv_wait_sig() is used to prevent
784eda14cbcSMatt Macy 		 * incorrectly inflating the system load average.
785eda14cbcSMatt Macy 		 */
786eda14cbcSMatt Macy 		if (should_quiesce == B_TRUE) {
787eda14cbcSMatt Macy 			cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
788eda14cbcSMatt Macy 		} else {
7892c48331dSMatt Macy 			cv_wait_idle(&tx->tx_quiesce_done_cv,
7902c48331dSMatt Macy 			    &tx->tx_sync_lock);
791eda14cbcSMatt Macy 		}
792eda14cbcSMatt Macy 	}
793eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
794eda14cbcSMatt Macy }
795eda14cbcSMatt Macy 
796eda14cbcSMatt Macy /*
7977cd22ac4SMartin Matuska  * Pass in the txg number that should be synced.
798eda14cbcSMatt Macy  */
799eda14cbcSMatt Macy void
8007cd22ac4SMartin Matuska txg_kick(dsl_pool_t *dp, uint64_t txg)
801eda14cbcSMatt Macy {
802eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
803eda14cbcSMatt Macy 
804eda14cbcSMatt Macy 	ASSERT(!dsl_pool_config_held(dp));
805eda14cbcSMatt Macy 
8067cd22ac4SMartin Matuska 	if (tx->tx_sync_txg_waiting >= txg)
8077cd22ac4SMartin Matuska 		return;
8087cd22ac4SMartin Matuska 
809eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
8107cd22ac4SMartin Matuska 	if (tx->tx_sync_txg_waiting < txg) {
8117cd22ac4SMartin Matuska 		tx->tx_sync_txg_waiting = txg;
8127cd22ac4SMartin Matuska 		cv_broadcast(&tx->tx_sync_more_cv);
813eda14cbcSMatt Macy 	}
814eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
815eda14cbcSMatt Macy }
816eda14cbcSMatt Macy 
817eda14cbcSMatt Macy boolean_t
818eda14cbcSMatt Macy txg_stalled(dsl_pool_t *dp)
819eda14cbcSMatt Macy {
820eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
821eda14cbcSMatt Macy 	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
822eda14cbcSMatt Macy }
823eda14cbcSMatt Macy 
824eda14cbcSMatt Macy boolean_t
825eda14cbcSMatt Macy txg_sync_waiting(dsl_pool_t *dp)
826eda14cbcSMatt Macy {
827eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
828eda14cbcSMatt Macy 
829eda14cbcSMatt Macy 	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
830eda14cbcSMatt Macy 	    tx->tx_quiesced_txg != 0);
831eda14cbcSMatt Macy }
832eda14cbcSMatt Macy 
833eda14cbcSMatt Macy /*
834eda14cbcSMatt Macy  * Verify that this txg is active (open, quiescing, syncing).  Non-active
835eda14cbcSMatt Macy  * txg's should not be manipulated.
836eda14cbcSMatt Macy  */
837eda14cbcSMatt Macy #ifdef ZFS_DEBUG
838eda14cbcSMatt Macy void
839eda14cbcSMatt Macy txg_verify(spa_t *spa, uint64_t txg)
840eda14cbcSMatt Macy {
841eda14cbcSMatt Macy 	dsl_pool_t *dp __maybe_unused = spa_get_dsl(spa);
842eda14cbcSMatt Macy 	if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
843eda14cbcSMatt Macy 		return;
844eda14cbcSMatt Macy 	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
845eda14cbcSMatt Macy 	ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
846eda14cbcSMatt Macy 	ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
847eda14cbcSMatt Macy }
848eda14cbcSMatt Macy #endif
849eda14cbcSMatt Macy 
850eda14cbcSMatt Macy /*
851eda14cbcSMatt Macy  * Per-txg object lists.
852eda14cbcSMatt Macy  */
853eda14cbcSMatt Macy void
854eda14cbcSMatt Macy txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
855eda14cbcSMatt Macy {
856eda14cbcSMatt Macy 	int t;
857eda14cbcSMatt Macy 
858eda14cbcSMatt Macy 	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
859eda14cbcSMatt Macy 
860eda14cbcSMatt Macy 	tl->tl_offset = offset;
861eda14cbcSMatt Macy 	tl->tl_spa = spa;
862eda14cbcSMatt Macy 
863eda14cbcSMatt Macy 	for (t = 0; t < TXG_SIZE; t++)
864eda14cbcSMatt Macy 		tl->tl_head[t] = NULL;
865eda14cbcSMatt Macy }
866eda14cbcSMatt Macy 
867eda14cbcSMatt Macy static boolean_t
868eda14cbcSMatt Macy txg_list_empty_impl(txg_list_t *tl, uint64_t txg)
869eda14cbcSMatt Macy {
870eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&tl->tl_lock));
871eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
872eda14cbcSMatt Macy 	return (tl->tl_head[txg & TXG_MASK] == NULL);
873eda14cbcSMatt Macy }
874eda14cbcSMatt Macy 
875eda14cbcSMatt Macy boolean_t
876eda14cbcSMatt Macy txg_list_empty(txg_list_t *tl, uint64_t txg)
877eda14cbcSMatt Macy {
878eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
879eda14cbcSMatt Macy 	boolean_t ret = txg_list_empty_impl(tl, txg);
880eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
881eda14cbcSMatt Macy 
882eda14cbcSMatt Macy 	return (ret);
883eda14cbcSMatt Macy }
884eda14cbcSMatt Macy 
885eda14cbcSMatt Macy void
886eda14cbcSMatt Macy txg_list_destroy(txg_list_t *tl)
887eda14cbcSMatt Macy {
888eda14cbcSMatt Macy 	int t;
889eda14cbcSMatt Macy 
890eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
891eda14cbcSMatt Macy 	for (t = 0; t < TXG_SIZE; t++)
892eda14cbcSMatt Macy 		ASSERT(txg_list_empty_impl(tl, t));
893eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
894eda14cbcSMatt Macy 
895eda14cbcSMatt Macy 	mutex_destroy(&tl->tl_lock);
896eda14cbcSMatt Macy }
897eda14cbcSMatt Macy 
898eda14cbcSMatt Macy /*
899eda14cbcSMatt Macy  * Returns true if all txg lists are empty.
900eda14cbcSMatt Macy  *
901eda14cbcSMatt Macy  * Warning: this is inherently racy (an item could be added immediately
902eda14cbcSMatt Macy  * after this function returns).
903eda14cbcSMatt Macy  */
904eda14cbcSMatt Macy boolean_t
905eda14cbcSMatt Macy txg_all_lists_empty(txg_list_t *tl)
906eda14cbcSMatt Macy {
9077b5e6873SMartin Matuska 	boolean_t res = B_TRUE;
9087b5e6873SMartin Matuska 	for (int i = 0; i < TXG_SIZE; i++)
9097b5e6873SMartin Matuska 		res &= (tl->tl_head[i] == NULL);
9107b5e6873SMartin Matuska 	return (res);
911eda14cbcSMatt Macy }
912eda14cbcSMatt Macy 
913eda14cbcSMatt Macy /*
914eda14cbcSMatt Macy  * Add an entry to the list (unless it's already on the list).
915eda14cbcSMatt Macy  * Returns B_TRUE if it was actually added.
916eda14cbcSMatt Macy  */
917eda14cbcSMatt Macy boolean_t
918eda14cbcSMatt Macy txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
919eda14cbcSMatt Macy {
920eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
921eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
922eda14cbcSMatt Macy 	boolean_t add;
923eda14cbcSMatt Macy 
924eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
925eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
926eda14cbcSMatt Macy 	add = (tn->tn_member[t] == 0);
927eda14cbcSMatt Macy 	if (add) {
928eda14cbcSMatt Macy 		tn->tn_member[t] = 1;
929eda14cbcSMatt Macy 		tn->tn_next[t] = tl->tl_head[t];
930eda14cbcSMatt Macy 		tl->tl_head[t] = tn;
931eda14cbcSMatt Macy 	}
932eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
933eda14cbcSMatt Macy 
934eda14cbcSMatt Macy 	return (add);
935eda14cbcSMatt Macy }
936eda14cbcSMatt Macy 
937eda14cbcSMatt Macy /*
938eda14cbcSMatt Macy  * Add an entry to the end of the list, unless it's already on the list.
939eda14cbcSMatt Macy  * (walks list to find end)
940eda14cbcSMatt Macy  * Returns B_TRUE if it was actually added.
941eda14cbcSMatt Macy  */
942eda14cbcSMatt Macy boolean_t
943eda14cbcSMatt Macy txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
944eda14cbcSMatt Macy {
945eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
946eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
947eda14cbcSMatt Macy 	boolean_t add;
948eda14cbcSMatt Macy 
949eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
950eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
951eda14cbcSMatt Macy 	add = (tn->tn_member[t] == 0);
952eda14cbcSMatt Macy 	if (add) {
953eda14cbcSMatt Macy 		txg_node_t **tp;
954eda14cbcSMatt Macy 
955eda14cbcSMatt Macy 		for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
956eda14cbcSMatt Macy 			continue;
957eda14cbcSMatt Macy 
958eda14cbcSMatt Macy 		tn->tn_member[t] = 1;
959eda14cbcSMatt Macy 		tn->tn_next[t] = NULL;
960eda14cbcSMatt Macy 		*tp = tn;
961eda14cbcSMatt Macy 	}
962eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
963eda14cbcSMatt Macy 
964eda14cbcSMatt Macy 	return (add);
965eda14cbcSMatt Macy }
966eda14cbcSMatt Macy 
967eda14cbcSMatt Macy /*
968eda14cbcSMatt Macy  * Remove the head of the list and return it.
969eda14cbcSMatt Macy  */
970eda14cbcSMatt Macy void *
971eda14cbcSMatt Macy txg_list_remove(txg_list_t *tl, uint64_t txg)
972eda14cbcSMatt Macy {
973eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
974eda14cbcSMatt Macy 	txg_node_t *tn;
975eda14cbcSMatt Macy 	void *p = NULL;
976eda14cbcSMatt Macy 
977eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
978eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
979eda14cbcSMatt Macy 	if ((tn = tl->tl_head[t]) != NULL) {
980eda14cbcSMatt Macy 		ASSERT(tn->tn_member[t]);
981eda14cbcSMatt Macy 		ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
982eda14cbcSMatt Macy 		p = (char *)tn - tl->tl_offset;
983eda14cbcSMatt Macy 		tl->tl_head[t] = tn->tn_next[t];
984eda14cbcSMatt Macy 		tn->tn_next[t] = NULL;
985eda14cbcSMatt Macy 		tn->tn_member[t] = 0;
986eda14cbcSMatt Macy 	}
987eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
988eda14cbcSMatt Macy 
989eda14cbcSMatt Macy 	return (p);
990eda14cbcSMatt Macy }
991eda14cbcSMatt Macy 
992eda14cbcSMatt Macy /*
993eda14cbcSMatt Macy  * Remove a specific item from the list and return it.
994eda14cbcSMatt Macy  */
995eda14cbcSMatt Macy void *
996eda14cbcSMatt Macy txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
997eda14cbcSMatt Macy {
998eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
999eda14cbcSMatt Macy 	txg_node_t *tn, **tp;
1000eda14cbcSMatt Macy 
1001eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1002eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
1003eda14cbcSMatt Macy 
1004eda14cbcSMatt Macy 	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
1005eda14cbcSMatt Macy 		if ((char *)tn - tl->tl_offset == p) {
1006eda14cbcSMatt Macy 			*tp = tn->tn_next[t];
1007eda14cbcSMatt Macy 			tn->tn_next[t] = NULL;
1008eda14cbcSMatt Macy 			tn->tn_member[t] = 0;
1009eda14cbcSMatt Macy 			mutex_exit(&tl->tl_lock);
1010eda14cbcSMatt Macy 			return (p);
1011eda14cbcSMatt Macy 		}
1012eda14cbcSMatt Macy 	}
1013eda14cbcSMatt Macy 
1014eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
1015eda14cbcSMatt Macy 
1016eda14cbcSMatt Macy 	return (NULL);
1017eda14cbcSMatt Macy }
1018eda14cbcSMatt Macy 
1019eda14cbcSMatt Macy boolean_t
1020eda14cbcSMatt Macy txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
1021eda14cbcSMatt Macy {
1022eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
1023eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
1024eda14cbcSMatt Macy 
1025eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1026eda14cbcSMatt Macy 	return (tn->tn_member[t] != 0);
1027eda14cbcSMatt Macy }
1028eda14cbcSMatt Macy 
1029eda14cbcSMatt Macy /*
1030eda14cbcSMatt Macy  * Walk a txg list
1031eda14cbcSMatt Macy  */
1032eda14cbcSMatt Macy void *
1033eda14cbcSMatt Macy txg_list_head(txg_list_t *tl, uint64_t txg)
1034eda14cbcSMatt Macy {
1035eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
1036eda14cbcSMatt Macy 	txg_node_t *tn;
1037eda14cbcSMatt Macy 
1038eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
1039eda14cbcSMatt Macy 	tn = tl->tl_head[t];
1040eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
1041eda14cbcSMatt Macy 
1042eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1043eda14cbcSMatt Macy 	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
1044eda14cbcSMatt Macy }
1045eda14cbcSMatt Macy 
1046eda14cbcSMatt Macy void *
1047eda14cbcSMatt Macy txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
1048eda14cbcSMatt Macy {
1049eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
1050eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
1051eda14cbcSMatt Macy 
1052eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1053eda14cbcSMatt Macy 
1054eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
1055eda14cbcSMatt Macy 	tn = tn->tn_next[t];
1056eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
1057eda14cbcSMatt Macy 
1058eda14cbcSMatt Macy 	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
1059eda14cbcSMatt Macy }
1060eda14cbcSMatt Macy 
1061eda14cbcSMatt Macy EXPORT_SYMBOL(txg_init);
1062eda14cbcSMatt Macy EXPORT_SYMBOL(txg_fini);
1063eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_start);
1064eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_stop);
1065eda14cbcSMatt Macy EXPORT_SYMBOL(txg_hold_open);
1066eda14cbcSMatt Macy EXPORT_SYMBOL(txg_rele_to_quiesce);
1067eda14cbcSMatt Macy EXPORT_SYMBOL(txg_rele_to_sync);
1068eda14cbcSMatt Macy EXPORT_SYMBOL(txg_register_callbacks);
1069eda14cbcSMatt Macy EXPORT_SYMBOL(txg_delay);
1070eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_synced);
1071eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_open);
1072eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_callbacks);
1073eda14cbcSMatt Macy EXPORT_SYMBOL(txg_stalled);
1074eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_waiting);
1075eda14cbcSMatt Macy 
1076be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, UINT, ZMOD_RW,
1077eda14cbcSMatt Macy 	"Max seconds worth of delta per txg");
1078