// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright 2011 Martin Matuska
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/spa_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/zil.h>
#include <sys/callb.h>
#include <sys/trace_zfs.h>

/*
 * ZFS Transaction Groups
 * ----------------------
 *
 * ZFS transaction groups are, as the name implies, groups of transactions
 * that act on persistent state. ZFS asserts consistency at the granularity of
 * these transaction groups. Each successive transaction group (txg) is
 * assigned a 64-bit consecutive identifier. There are three active
 * transaction group states: open, quiescing, or syncing. At any given time,
 * there may be an active txg associated with each state; each active txg may
 * either be processing, or blocked waiting to enter the next state. There may
 * be up to three active txgs, and there is always a txg in the open state
 * (though it may be blocked waiting to enter the quiescing state). In broad
 * strokes, transactions -- operations that change in-memory structures -- are
 * accepted into the txg in the open state, and are completed while the txg is
 * in the open or quiescing states. The accumulated changes are written to
 * disk in the syncing state.
 *
 * Open
 *
 * When a new txg becomes active, it first enters the open state. New
 * transactions -- updates to in-memory structures -- are assigned to the
 * currently open txg. There is always a txg in the open state so that ZFS can
 * accept new changes (though the txg may refuse new changes if it has hit
 * some limit). ZFS advances the open txg to the next state for a variety of
 * reasons such as it hitting a time or size threshold, or the execution of an
 * administrative action that must be completed in the syncing state.
 *
 * Quiescing
 *
 * After a txg exits the open state, it enters the quiescing state. The
 * quiescing state is intended to provide a buffer between accepting new
 * transactions in the open state and writing them out to stable storage in
 * the syncing state. While quiescing, transactions can continue their
 * operation without delaying either of the other states. Typically, a txg is
 * in the quiescing state very briefly since the operations are bounded by
 * software latencies rather than, say, slower I/O latencies. After all
 * transactions complete, the txg is ready to enter the next state.
 *
 * Syncing
 *
 * In the syncing state, the in-memory state built up during the open and (to
 * a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn modify more data. For
 * example when we write new blocks, we need to allocate space for them; those
 * allocations modify metadata (space maps)... which themselves must be
 * written to stable storage. During the sync state, ZFS iterates, writing out
 * data until it converges and all in-memory changes have been written out.
 * The first such pass is the largest as it encompasses all the modified user
 * data (as opposed to filesystem metadata). Subsequent passes typically have
 * far less data to write as they consist exclusively of filesystem metadata.
 *
 * To ensure convergence, after a certain number of passes ZFS begins
 * overwriting locations on stable storage that had been allocated earlier in
 * the syncing state (and subsequently freed). ZFS usually allocates new
 * blocks to optimize for large, continuous, writes. For the syncing state to
 * converge however it must complete a pass where no new blocks are allocated
 * since each allocation requires a modification of persistent metadata.
 * Further, to hasten convergence, after a prescribed number of passes, ZFS
 * also defers frees, and stops compressing.
 *
 * In addition to writing out user data, we must also execute synctasks during
 * the syncing context. A synctask is the mechanism by which some
 * administrative activities work such as creating and destroying snapshots or
 * datasets. Note that when a synctask is initiated it enters the open txg,
 * and ZFS then pushes that txg as quickly as possible to completion of the
 * syncing state in order to reduce the latency of the administrative
 * activity. To complete the syncing state, ZFS writes out a new uberblock,
 * the root of the tree of blocks that comprise all state stored on the ZFS
 * pool. Finally, if there is a quiesced txg waiting, we signal that it can
 * now transition to the syncing state.
 */

112da5137abSMartin Matuska static __attribute__((noreturn)) void txg_sync_thread(void *arg);
113da5137abSMartin Matuska static __attribute__((noreturn)) void txg_quiesce_thread(void *arg);
114eda14cbcSMatt Macy 
115be181ee2SMartin Matuska uint_t zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */
116eda14cbcSMatt Macy 
117eda14cbcSMatt Macy /*
118eda14cbcSMatt Macy  * Prepare the txg subsystem.
119eda14cbcSMatt Macy  */
120eda14cbcSMatt Macy void
txg_init(dsl_pool_t * dp,uint64_t txg)121eda14cbcSMatt Macy txg_init(dsl_pool_t *dp, uint64_t txg)
122eda14cbcSMatt Macy {
123eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
124eda14cbcSMatt Macy 	int c;
125da5137abSMartin Matuska 	memset(tx, 0, sizeof (tx_state_t));
126eda14cbcSMatt Macy 
127eda14cbcSMatt Macy 	tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
128eda14cbcSMatt Macy 
129eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
130eda14cbcSMatt Macy 		int i;
131eda14cbcSMatt Macy 
132eda14cbcSMatt Macy 		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
133eda14cbcSMatt Macy 		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP,
134eda14cbcSMatt Macy 		    NULL);
135eda14cbcSMatt Macy 		for (i = 0; i < TXG_SIZE; i++) {
136eda14cbcSMatt Macy 			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
137eda14cbcSMatt Macy 			    NULL);
138eda14cbcSMatt Macy 			list_create(&tx->tx_cpu[c].tc_callbacks[i],
139eda14cbcSMatt Macy 			    sizeof (dmu_tx_callback_t),
140eda14cbcSMatt Macy 			    offsetof(dmu_tx_callback_t, dcb_node));
141eda14cbcSMatt Macy 		}
142eda14cbcSMatt Macy 	}
143eda14cbcSMatt Macy 
144eda14cbcSMatt Macy 	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
145eda14cbcSMatt Macy 
146eda14cbcSMatt Macy 	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
147eda14cbcSMatt Macy 	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
148eda14cbcSMatt Macy 	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
149eda14cbcSMatt Macy 	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
150eda14cbcSMatt Macy 	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
151eda14cbcSMatt Macy 
152eda14cbcSMatt Macy 	tx->tx_open_txg = txg;
153eda14cbcSMatt Macy }
154eda14cbcSMatt Macy 
155eda14cbcSMatt Macy /*
156eda14cbcSMatt Macy  * Close down the txg subsystem.
157eda14cbcSMatt Macy  */
158eda14cbcSMatt Macy void
txg_fini(dsl_pool_t * dp)159eda14cbcSMatt Macy txg_fini(dsl_pool_t *dp)
160eda14cbcSMatt Macy {
161eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
162eda14cbcSMatt Macy 	int c;
163eda14cbcSMatt Macy 
164eda14cbcSMatt Macy 	ASSERT0(tx->tx_threads);
165eda14cbcSMatt Macy 
166eda14cbcSMatt Macy 	mutex_destroy(&tx->tx_sync_lock);
167eda14cbcSMatt Macy 
168eda14cbcSMatt Macy 	cv_destroy(&tx->tx_sync_more_cv);
169eda14cbcSMatt Macy 	cv_destroy(&tx->tx_sync_done_cv);
170eda14cbcSMatt Macy 	cv_destroy(&tx->tx_quiesce_more_cv);
171eda14cbcSMatt Macy 	cv_destroy(&tx->tx_quiesce_done_cv);
172eda14cbcSMatt Macy 	cv_destroy(&tx->tx_exit_cv);
173eda14cbcSMatt Macy 
174eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
175eda14cbcSMatt Macy 		int i;
176eda14cbcSMatt Macy 
177eda14cbcSMatt Macy 		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
178eda14cbcSMatt Macy 		mutex_destroy(&tx->tx_cpu[c].tc_lock);
179eda14cbcSMatt Macy 		for (i = 0; i < TXG_SIZE; i++) {
180eda14cbcSMatt Macy 			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
181eda14cbcSMatt Macy 			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
182eda14cbcSMatt Macy 		}
183eda14cbcSMatt Macy 	}
184eda14cbcSMatt Macy 
185eda14cbcSMatt Macy 	if (tx->tx_commit_cb_taskq != NULL)
186eda14cbcSMatt Macy 		taskq_destroy(tx->tx_commit_cb_taskq);
187eda14cbcSMatt Macy 
188eda14cbcSMatt Macy 	vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
189eda14cbcSMatt Macy 
190da5137abSMartin Matuska 	memset(tx, 0, sizeof (tx_state_t));
191eda14cbcSMatt Macy }
192eda14cbcSMatt Macy 
193eda14cbcSMatt Macy /*
194eda14cbcSMatt Macy  * Start syncing transaction groups.
195eda14cbcSMatt Macy  */
196eda14cbcSMatt Macy void
txg_sync_start(dsl_pool_t * dp)197eda14cbcSMatt Macy txg_sync_start(dsl_pool_t *dp)
198eda14cbcSMatt Macy {
199eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
200eda14cbcSMatt Macy 
201eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
202eda14cbcSMatt Macy 
203eda14cbcSMatt Macy 	dprintf("pool %p\n", dp);
204eda14cbcSMatt Macy 
205eda14cbcSMatt Macy 	ASSERT0(tx->tx_threads);
206eda14cbcSMatt Macy 
207eda14cbcSMatt Macy 	tx->tx_threads = 2;
208eda14cbcSMatt Macy 
209eda14cbcSMatt Macy 	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
210eda14cbcSMatt Macy 	    dp, 0, &p0, TS_RUN, defclsyspri);
211eda14cbcSMatt Macy 
212eda14cbcSMatt Macy 	/*
213eda14cbcSMatt Macy 	 * The sync thread can need a larger-than-default stack size on
214eda14cbcSMatt Macy 	 * 32-bit x86.  This is due in part to nested pools and
215eda14cbcSMatt Macy 	 * scrub_visitbp() recursion.
216eda14cbcSMatt Macy 	 */
217eda14cbcSMatt Macy 	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
218eda14cbcSMatt Macy 	    dp, 0, &p0, TS_RUN, defclsyspri);
219eda14cbcSMatt Macy 
220eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
221eda14cbcSMatt Macy }
222eda14cbcSMatt Macy 
223eda14cbcSMatt Macy static void
txg_thread_enter(tx_state_t * tx,callb_cpr_t * cpr)224eda14cbcSMatt Macy txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
225eda14cbcSMatt Macy {
226eda14cbcSMatt Macy 	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
227eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
228eda14cbcSMatt Macy }
229eda14cbcSMatt Macy 
230eda14cbcSMatt Macy static void
txg_thread_exit(tx_state_t * tx,callb_cpr_t * cpr,kthread_t ** tpp)231eda14cbcSMatt Macy txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
232eda14cbcSMatt Macy {
233eda14cbcSMatt Macy 	ASSERT(*tpp != NULL);
234eda14cbcSMatt Macy 	*tpp = NULL;
235eda14cbcSMatt Macy 	tx->tx_threads--;
236eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_exit_cv);
237eda14cbcSMatt Macy 	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
238eda14cbcSMatt Macy 	thread_exit();
239eda14cbcSMatt Macy }
240eda14cbcSMatt Macy 
241eda14cbcSMatt Macy static void
txg_thread_wait(tx_state_t * tx,callb_cpr_t * cpr,kcondvar_t * cv,clock_t time)242eda14cbcSMatt Macy txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
243eda14cbcSMatt Macy {
244eda14cbcSMatt Macy 	CALLB_CPR_SAFE_BEGIN(cpr);
245eda14cbcSMatt Macy 
246eda14cbcSMatt Macy 	if (time) {
2472c48331dSMatt Macy 		(void) cv_timedwait_idle(cv, &tx->tx_sync_lock,
248eda14cbcSMatt Macy 		    ddi_get_lbolt() + time);
249eda14cbcSMatt Macy 	} else {
2502c48331dSMatt Macy 		cv_wait_idle(cv, &tx->tx_sync_lock);
251eda14cbcSMatt Macy 	}
252eda14cbcSMatt Macy 
253eda14cbcSMatt Macy 	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
254eda14cbcSMatt Macy }
255eda14cbcSMatt Macy 
256eda14cbcSMatt Macy /*
257eda14cbcSMatt Macy  * Stop syncing transaction groups.
258eda14cbcSMatt Macy  */
259eda14cbcSMatt Macy void
txg_sync_stop(dsl_pool_t * dp)260eda14cbcSMatt Macy txg_sync_stop(dsl_pool_t *dp)
261eda14cbcSMatt Macy {
262eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
263eda14cbcSMatt Macy 
264eda14cbcSMatt Macy 	dprintf("pool %p\n", dp);
265eda14cbcSMatt Macy 	/*
266eda14cbcSMatt Macy 	 * Finish off any work in progress.
267eda14cbcSMatt Macy 	 */
268eda14cbcSMatt Macy 	ASSERT3U(tx->tx_threads, ==, 2);
269eda14cbcSMatt Macy 
270eda14cbcSMatt Macy 	/*
271eda14cbcSMatt Macy 	 * We need to ensure that we've vacated the deferred metaslab trees.
272eda14cbcSMatt Macy 	 */
273eda14cbcSMatt Macy 	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
274eda14cbcSMatt Macy 
275eda14cbcSMatt Macy 	/*
276eda14cbcSMatt Macy 	 * Wake all sync threads and wait for them to die.
277eda14cbcSMatt Macy 	 */
278eda14cbcSMatt Macy 	mutex_enter(&tx->tx_sync_lock);
279eda14cbcSMatt Macy 
280eda14cbcSMatt Macy 	ASSERT3U(tx->tx_threads, ==, 2);
281eda14cbcSMatt Macy 
282eda14cbcSMatt Macy 	tx->tx_exiting = 1;
283eda14cbcSMatt Macy 
284eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_quiesce_more_cv);
285eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_quiesce_done_cv);
286eda14cbcSMatt Macy 	cv_broadcast(&tx->tx_sync_more_cv);
287eda14cbcSMatt Macy 
288eda14cbcSMatt Macy 	while (tx->tx_threads != 0)
289eda14cbcSMatt Macy 		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
290eda14cbcSMatt Macy 
291eda14cbcSMatt Macy 	tx->tx_exiting = 0;
292eda14cbcSMatt Macy 
293eda14cbcSMatt Macy 	mutex_exit(&tx->tx_sync_lock);
294eda14cbcSMatt Macy }
295eda14cbcSMatt Macy 
296184c1b94SMartin Matuska /*
297184c1b94SMartin Matuska  * Get a handle on the currently open txg and keep it open.
298184c1b94SMartin Matuska  *
299184c1b94SMartin Matuska  * The txg is guaranteed to stay open until txg_rele_to_quiesce() is called for
300184c1b94SMartin Matuska  * the handle. Once txg_rele_to_quiesce() has been called, the txg stays
301184c1b94SMartin Matuska  * in quiescing state until txg_rele_to_sync() is called for the handle.
302184c1b94SMartin Matuska  *
303184c1b94SMartin Matuska  * It is guaranteed that subsequent calls return monotonically increasing
304184c1b94SMartin Matuska  * txgs for the same dsl_pool_t. Of course this is not strong monotonicity,
305184c1b94SMartin Matuska  * because the same txg can be returned multiple times in a row. This
306184c1b94SMartin Matuska  * guarantee holds both for subsequent calls from one thread and for multiple
307184c1b94SMartin Matuska  * threads. For example, it is impossible to observe the following sequence
308184c1b94SMartin Matuska  * of events:
309184c1b94SMartin Matuska  *
310184c1b94SMartin Matuska  *           Thread 1                            Thread 2
311184c1b94SMartin Matuska  *
312184c1b94SMartin Matuska  *   1 <- txg_hold_open(P, ...)
313184c1b94SMartin Matuska  *                                       2 <- txg_hold_open(P, ...)
314184c1b94SMartin Matuska  *   1 <- txg_hold_open(P, ...)
315184c1b94SMartin Matuska  *
316184c1b94SMartin Matuska  */
317eda14cbcSMatt Macy uint64_t
txg_hold_open(dsl_pool_t * dp,txg_handle_t * th)318eda14cbcSMatt Macy txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
319eda14cbcSMatt Macy {
320eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
321eda14cbcSMatt Macy 	tx_cpu_t *tc;
322eda14cbcSMatt Macy 	uint64_t txg;
323eda14cbcSMatt Macy 
324eda14cbcSMatt Macy 	/*
325eda14cbcSMatt Macy 	 * It appears the processor id is simply used as a "random"
326eda14cbcSMatt Macy 	 * number to index into the array, and there isn't any other
327eda14cbcSMatt Macy 	 * significance to the chosen tx_cpu. Because.. Why not use
328eda14cbcSMatt Macy 	 * the current cpu to index into the array?
329eda14cbcSMatt Macy 	 */
3307877fdebSMatt Macy 	tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE];
331eda14cbcSMatt Macy 
332eda14cbcSMatt Macy 	mutex_enter(&tc->tc_open_lock);
333eda14cbcSMatt Macy 	txg = tx->tx_open_txg;
334eda14cbcSMatt Macy 
335eda14cbcSMatt Macy 	mutex_enter(&tc->tc_lock);
336eda14cbcSMatt Macy 	tc->tc_count[txg & TXG_MASK]++;
337eda14cbcSMatt Macy 	mutex_exit(&tc->tc_lock);
338eda14cbcSMatt Macy 
339eda14cbcSMatt Macy 	th->th_cpu = tc;
340eda14cbcSMatt Macy 	th->th_txg = txg;
341eda14cbcSMatt Macy 
342eda14cbcSMatt Macy 	return (txg);
343eda14cbcSMatt Macy }
344eda14cbcSMatt Macy 
345eda14cbcSMatt Macy void
txg_rele_to_quiesce(txg_handle_t * th)346eda14cbcSMatt Macy txg_rele_to_quiesce(txg_handle_t *th)
347eda14cbcSMatt Macy {
348eda14cbcSMatt Macy 	tx_cpu_t *tc = th->th_cpu;
349eda14cbcSMatt Macy 
350eda14cbcSMatt Macy 	ASSERT(!MUTEX_HELD(&tc->tc_lock));
351eda14cbcSMatt Macy 	mutex_exit(&tc->tc_open_lock);
352eda14cbcSMatt Macy }
353eda14cbcSMatt Macy 
354eda14cbcSMatt Macy void
txg_register_callbacks(txg_handle_t * th,list_t * tx_callbacks)355eda14cbcSMatt Macy txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
356eda14cbcSMatt Macy {
357eda14cbcSMatt Macy 	tx_cpu_t *tc = th->th_cpu;
358eda14cbcSMatt Macy 	int g = th->th_txg & TXG_MASK;
359eda14cbcSMatt Macy 
360eda14cbcSMatt Macy 	mutex_enter(&tc->tc_lock);
361eda14cbcSMatt Macy 	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
362eda14cbcSMatt Macy 	mutex_exit(&tc->tc_lock);
363eda14cbcSMatt Macy }
364eda14cbcSMatt Macy 
365eda14cbcSMatt Macy void
txg_rele_to_sync(txg_handle_t * th)366eda14cbcSMatt Macy txg_rele_to_sync(txg_handle_t *th)
367eda14cbcSMatt Macy {
368eda14cbcSMatt Macy 	tx_cpu_t *tc = th->th_cpu;
369eda14cbcSMatt Macy 	int g = th->th_txg & TXG_MASK;
370eda14cbcSMatt Macy 
371eda14cbcSMatt Macy 	mutex_enter(&tc->tc_lock);
372eda14cbcSMatt Macy 	ASSERT(tc->tc_count[g] != 0);
373eda14cbcSMatt Macy 	if (--tc->tc_count[g] == 0)
374eda14cbcSMatt Macy 		cv_broadcast(&tc->tc_cv[g]);
375eda14cbcSMatt Macy 	mutex_exit(&tc->tc_lock);
376eda14cbcSMatt Macy 
377eda14cbcSMatt Macy 	th->th_cpu = NULL;	/* defensive */
378eda14cbcSMatt Macy }
379eda14cbcSMatt Macy 
380eda14cbcSMatt Macy /*
381eda14cbcSMatt Macy  * Blocks until all transactions in the group are committed.
382eda14cbcSMatt Macy  *
383eda14cbcSMatt Macy  * On return, the transaction group has reached a stable state in which it can
384eda14cbcSMatt Macy  * then be passed off to the syncing context.
385eda14cbcSMatt Macy  */
386eda14cbcSMatt Macy static void
txg_quiesce(dsl_pool_t * dp,uint64_t txg)387eda14cbcSMatt Macy txg_quiesce(dsl_pool_t *dp, uint64_t txg)
388eda14cbcSMatt Macy {
389eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
390eda14cbcSMatt Macy 	uint64_t tx_open_time;
391eda14cbcSMatt Macy 	int g = txg & TXG_MASK;
392eda14cbcSMatt Macy 	int c;
393eda14cbcSMatt Macy 
394eda14cbcSMatt Macy 	/*
395eda14cbcSMatt Macy 	 * Grab all tc_open_locks so nobody else can get into this txg.
396eda14cbcSMatt Macy 	 */
397eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++)
398eda14cbcSMatt Macy 		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
399eda14cbcSMatt Macy 
400eda14cbcSMatt Macy 	ASSERT(txg == tx->tx_open_txg);
401eda14cbcSMatt Macy 	tx->tx_open_txg++;
402eda14cbcSMatt Macy 	tx->tx_open_time = tx_open_time = gethrtime();
403eda14cbcSMatt Macy 
404eda14cbcSMatt Macy 	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
405eda14cbcSMatt Macy 	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
406eda14cbcSMatt Macy 
407eda14cbcSMatt Macy 	/*
408eda14cbcSMatt Macy 	 * Now that we've incremented tx_open_txg, we can let threads
409eda14cbcSMatt Macy 	 * enter the next transaction group.
410eda14cbcSMatt Macy 	 */
411eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++)
412eda14cbcSMatt Macy 		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
413eda14cbcSMatt Macy 
414eda14cbcSMatt Macy 	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
415eda14cbcSMatt Macy 	spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
416eda14cbcSMatt Macy 
417eda14cbcSMatt Macy 	/*
418184c1b94SMartin Matuska 	 * Quiesce the transaction group by waiting for everyone to
419184c1b94SMartin Matuska 	 * call txg_rele_to_sync() for their open transaction handles.
420eda14cbcSMatt Macy 	 */
421eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
422eda14cbcSMatt Macy 		tx_cpu_t *tc = &tx->tx_cpu[c];
423eda14cbcSMatt Macy 		mutex_enter(&tc->tc_lock);
424eda14cbcSMatt Macy 		while (tc->tc_count[g] != 0)
425eda14cbcSMatt Macy 			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
426eda14cbcSMatt Macy 		mutex_exit(&tc->tc_lock);
427eda14cbcSMatt Macy 	}
428eda14cbcSMatt Macy 
429eda14cbcSMatt Macy 	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
430eda14cbcSMatt Macy }
431eda14cbcSMatt Macy 
432eda14cbcSMatt Macy static void
txg_do_callbacks(void * cb_list)433bb2d13b6SMartin Matuska txg_do_callbacks(void *cb_list)
434eda14cbcSMatt Macy {
435eda14cbcSMatt Macy 	dmu_tx_do_callbacks(cb_list, 0);
436eda14cbcSMatt Macy 
437eda14cbcSMatt Macy 	list_destroy(cb_list);
438eda14cbcSMatt Macy 
439eda14cbcSMatt Macy 	kmem_free(cb_list, sizeof (list_t));
440eda14cbcSMatt Macy }
441eda14cbcSMatt Macy 
442eda14cbcSMatt Macy /*
443eda14cbcSMatt Macy  * Dispatch the commit callbacks registered on this txg to worker threads.
444eda14cbcSMatt Macy  *
445eda14cbcSMatt Macy  * If no callbacks are registered for a given TXG, nothing happens.
446eda14cbcSMatt Macy  * This function creates a taskq for the associated pool, if needed.
447eda14cbcSMatt Macy  */
448eda14cbcSMatt Macy static void
txg_dispatch_callbacks(dsl_pool_t * dp,uint64_t txg)449eda14cbcSMatt Macy txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
450eda14cbcSMatt Macy {
451eda14cbcSMatt Macy 	int c;
452eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
453eda14cbcSMatt Macy 	list_t *cb_list;
454eda14cbcSMatt Macy 
455eda14cbcSMatt Macy 	for (c = 0; c < max_ncpus; c++) {
456eda14cbcSMatt Macy 		tx_cpu_t *tc = &tx->tx_cpu[c];
457eda14cbcSMatt Macy 		/*
458eda14cbcSMatt Macy 		 * No need to lock tx_cpu_t at this point, since this can
459eda14cbcSMatt Macy 		 * only be called once a txg has been synced.
460eda14cbcSMatt Macy 		 */
461eda14cbcSMatt Macy 
462eda14cbcSMatt Macy 		int g = txg & TXG_MASK;
463eda14cbcSMatt Macy 
464eda14cbcSMatt Macy 		if (list_is_empty(&tc->tc_callbacks[g]))
465eda14cbcSMatt Macy 			continue;
466eda14cbcSMatt Macy 
467eda14cbcSMatt Macy 		if (tx->tx_commit_cb_taskq == NULL) {
468eda14cbcSMatt Macy 			/*
469eda14cbcSMatt Macy 			 * Commit callback taskq hasn't been created yet.
470eda14cbcSMatt Macy 			 */
471eda14cbcSMatt Macy 			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
4727877fdebSMatt Macy 			    100, defclsyspri, boot_ncpus, boot_ncpus * 2,
4737877fdebSMatt Macy 			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
4747877fdebSMatt Macy 			    TASKQ_THREADS_CPU_PCT);
475eda14cbcSMatt Macy 		}
476eda14cbcSMatt Macy 
477eda14cbcSMatt Macy 		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
478eda14cbcSMatt Macy 		list_create(cb_list, sizeof (dmu_tx_callback_t),
479eda14cbcSMatt Macy 		    offsetof(dmu_tx_callback_t, dcb_node));
480eda14cbcSMatt Macy 
481eda14cbcSMatt Macy 		list_move_tail(cb_list, &tc->tc_callbacks[g]);
482eda14cbcSMatt Macy 
483bb2d13b6SMartin Matuska 		(void) taskq_dispatch(tx->tx_commit_cb_taskq,
484eda14cbcSMatt Macy 		    txg_do_callbacks, cb_list, TQ_SLEEP);
485eda14cbcSMatt Macy 	}
486eda14cbcSMatt Macy }
487eda14cbcSMatt Macy 
488eda14cbcSMatt Macy /*
489eda14cbcSMatt Macy  * Wait for pending commit callbacks of already-synced transactions to finish
490eda14cbcSMatt Macy  * processing.
491eda14cbcSMatt Macy  * Calling this function from within a commit callback will deadlock.
492eda14cbcSMatt Macy  */
493eda14cbcSMatt Macy void
txg_wait_callbacks(dsl_pool_t * dp)494eda14cbcSMatt Macy txg_wait_callbacks(dsl_pool_t *dp)
495eda14cbcSMatt Macy {
496eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
497eda14cbcSMatt Macy 
498eda14cbcSMatt Macy 	if (tx->tx_commit_cb_taskq != NULL)
499eda14cbcSMatt Macy 		taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
500eda14cbcSMatt Macy }
501eda14cbcSMatt Macy 
502eda14cbcSMatt Macy static boolean_t
txg_is_quiescing(dsl_pool_t * dp)503eda14cbcSMatt Macy txg_is_quiescing(dsl_pool_t *dp)
504eda14cbcSMatt Macy {
505eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
506eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
507eda14cbcSMatt Macy 	return (tx->tx_quiescing_txg != 0);
508eda14cbcSMatt Macy }
509eda14cbcSMatt Macy 
510eda14cbcSMatt Macy static boolean_t
txg_has_quiesced_to_sync(dsl_pool_t * dp)511eda14cbcSMatt Macy txg_has_quiesced_to_sync(dsl_pool_t *dp)
512eda14cbcSMatt Macy {
513eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
514eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
515eda14cbcSMatt Macy 	return (tx->tx_quiesced_txg != 0);
516eda14cbcSMatt Macy }
517eda14cbcSMatt Macy 
518da5137abSMartin Matuska static __attribute__((noreturn)) void
txg_sync_thread(void * arg)519eda14cbcSMatt Macy txg_sync_thread(void *arg)
520eda14cbcSMatt Macy {
521eda14cbcSMatt Macy 	dsl_pool_t *dp = arg;
522eda14cbcSMatt Macy 	spa_t *spa = dp->dp_spa;
523eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
524eda14cbcSMatt Macy 	callb_cpr_t cpr;
525eda14cbcSMatt Macy 	clock_t start, delta;
526eda14cbcSMatt Macy 
527eda14cbcSMatt Macy 	(void) spl_fstrans_mark();
528eda14cbcSMatt Macy 	txg_thread_enter(tx, &cpr);
529eda14cbcSMatt Macy 
530eda14cbcSMatt Macy 	start = delta = 0;
531eda14cbcSMatt Macy 	for (;;) {
532eda14cbcSMatt Macy 		clock_t timeout = zfs_txg_timeout * hz;
533eda14cbcSMatt Macy 		clock_t timer;
534eda14cbcSMatt Macy 		uint64_t txg;
535eda14cbcSMatt Macy 
536eda14cbcSMatt Macy 		/*
537eda14cbcSMatt Macy 		 * We sync when we're scanning, there's someone waiting
538eda14cbcSMatt Macy 		 * on us, or the quiesce thread has handed off a txg to
539eda14cbcSMatt Macy 		 * us, or we have reached our timeout.
540eda14cbcSMatt Macy 		 */
541eda14cbcSMatt Macy 		timer = (delta >= timeout ? 0 : timeout - delta);
542eda14cbcSMatt Macy 		while (!dsl_scan_active(dp->dp_scan) &&
543eda14cbcSMatt Macy 		    !tx->tx_exiting && timer > 0 &&
544eda14cbcSMatt Macy 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
5457cd22ac4SMartin Matuska 		    !txg_has_quiesced_to_sync(dp)) {
546eda14cbcSMatt Macy 			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
54733b8c039SMartin Matuska 			    (u_longlong_t)tx->tx_synced_txg,
54833b8c039SMartin Matuska 			    (u_longlong_t)tx->tx_sync_txg_waiting, dp);
549eda14cbcSMatt Macy 			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
550eda14cbcSMatt Macy 			delta = ddi_get_lbolt() - start;
551eda14cbcSMatt Macy 			timer = (delta > timeout ? 0 : timeout - delta);
552eda14cbcSMatt Macy 		}
553eda14cbcSMatt Macy 
554eda14cbcSMatt Macy 		/*
555b985c9caSMartin Matuska 		 * When we're suspended, nothing should be changing and for
556b985c9caSMartin Matuska 		 * MMP we don't want to bump anything that would make it
557b985c9caSMartin Matuska 		 * harder to detect if another host is changing it when
558b985c9caSMartin Matuska 		 * resuming after a MMP suspend.
559b985c9caSMartin Matuska 		 */
560b985c9caSMartin Matuska 		if (spa_suspended(spa))
561b985c9caSMartin Matuska 			continue;
562b985c9caSMartin Matuska 
563b985c9caSMartin Matuska 		/*
564eda14cbcSMatt Macy 		 * Wait until the quiesce thread hands off a txg to us,
565eda14cbcSMatt Macy 		 * prompting it to do so if necessary.
566eda14cbcSMatt Macy 		 */
567eda14cbcSMatt Macy 		while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
5687cd22ac4SMartin Matuska 			if (txg_is_quiescing(dp)) {
5697cd22ac4SMartin Matuska 				txg_thread_wait(tx, &cpr,
5707cd22ac4SMartin Matuska 				    &tx->tx_quiesce_done_cv, 0);
5717cd22ac4SMartin Matuska 				continue;
5727cd22ac4SMartin Matuska 			}
573eda14cbcSMatt Macy 			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
574eda14cbcSMatt Macy 				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
575eda14cbcSMatt Macy 			cv_broadcast(&tx->tx_quiesce_more_cv);
576eda14cbcSMatt Macy 			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
577eda14cbcSMatt Macy 		}
578eda14cbcSMatt Macy 
579eda14cbcSMatt Macy 		if (tx->tx_exiting)
580eda14cbcSMatt Macy 			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
581eda14cbcSMatt Macy 
582eda14cbcSMatt Macy 		/*
583eda14cbcSMatt Macy 		 * Consume the quiesced txg which has been handed off to
584eda14cbcSMatt Macy 		 * us.  This may cause the quiescing thread to now be
585eda14cbcSMatt Macy 		 * able to quiesce another txg, so we must signal it.
586eda14cbcSMatt Macy 		 */
587eda14cbcSMatt Macy 		ASSERT(tx->tx_quiesced_txg != 0);
588eda14cbcSMatt Macy 		txg = tx->tx_quiesced_txg;
589eda14cbcSMatt Macy 		tx->tx_quiesced_txg = 0;
590eda14cbcSMatt Macy 		tx->tx_syncing_txg = txg;
591eda14cbcSMatt Macy 		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
592eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_quiesce_more_cv);
593eda14cbcSMatt Macy 
594eda14cbcSMatt Macy 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
59533b8c039SMartin Matuska 		    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
59633b8c039SMartin Matuska 		    (u_longlong_t)tx->tx_sync_txg_waiting);
597eda14cbcSMatt Macy 		mutex_exit(&tx->tx_sync_lock);
598eda14cbcSMatt Macy 
599eda14cbcSMatt Macy 		txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
600eda14cbcSMatt Macy 		start = ddi_get_lbolt();
601eda14cbcSMatt Macy 		spa_sync(spa, txg);
602eda14cbcSMatt Macy 		delta = ddi_get_lbolt() - start;
603eda14cbcSMatt Macy 		spa_txg_history_fini_io(spa, ts);
604eda14cbcSMatt Macy 
605eda14cbcSMatt Macy 		mutex_enter(&tx->tx_sync_lock);
606eda14cbcSMatt Macy 		tx->tx_synced_txg = txg;
607eda14cbcSMatt Macy 		tx->tx_syncing_txg = 0;
608eda14cbcSMatt Macy 		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
609eda14cbcSMatt Macy 		cv_broadcast(&tx->tx_sync_done_cv);
610eda14cbcSMatt Macy 
611eda14cbcSMatt Macy 		/*
612eda14cbcSMatt Macy 		 * Dispatch commit callbacks to worker threads.
613eda14cbcSMatt Macy 		 */
614eda14cbcSMatt Macy 		txg_dispatch_callbacks(dp, txg);
615eda14cbcSMatt Macy 	}
616eda14cbcSMatt Macy }
617eda14cbcSMatt Macy 
/*
 * Quiesce thread: loops forever, waiting until a quiesce of the open txg
 * is wanted, then quiescing that txg and handing it off to the sync
 * thread.  At most one txg may be quiescing or quiesced-but-unsynced at
 * a time, so we also wait for the sync thread to consume the previously
 * quiesced txg before starting a new one.
 */
static __attribute__((noreturn)) void
txg_quiesce_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state.  So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
		 */
		while (!tx->tx_exiting &&
		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
		    txg_has_quiesced_to_sync(dp)))
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

		txg = tx->tx_open_txg;
		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    (u_longlong_t)txg,
		    (u_longlong_t)tx->tx_quiesce_txg_waiting,
		    (u_longlong_t)tx->tx_sync_txg_waiting);
		tx->tx_quiescing_txg = txg;

		/* Quiesce the txg without holding the state lock. */
		mutex_exit(&tx->tx_sync_lock);
		txg_quiesce(dp, txg);
		mutex_enter(&tx->tx_sync_lock);

		/*
		 * Hand this txg off to the sync thread.
		 */
		dprintf("quiesce done, handing off txg %llu\n",
		    (u_longlong_t)txg);
		tx->tx_quiescing_txg = 0;
		tx->tx_quiesced_txg = txg;
		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
		/* Wake the sync thread (new work) and any txg_wait_open()ers. */
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_broadcast(&tx->tx_quiesce_done_cv);
	}
}
668eda14cbcSMatt Macy 
/*
 * Delay this thread by delay nanoseconds if we are still in the open
 * transaction group and there is already a waiting txg quiescing or quiesced.
 * Abort the delay if this txg stalls or enters the quiescing state.
 */
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
	tx_state_t *tx = &dp->dp_tx;
	hrtime_t start = gethrtime();

	/* don't delay if this txg could transition to quiescing immediately */
	if (tx->tx_open_txg > txg ||
	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
		return;

	mutex_enter(&tx->tx_sync_lock);
	/* Re-check under the lock; the state may have advanced meanwhile. */
	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
		mutex_exit(&tx->tx_sync_lock);
		return;
	}

	/*
	 * Wait until the requested delay has elapsed, txg-1 starts
	 * syncing, or the pool stalls.  Broadcasts on tx_quiesce_more_cv
	 * wake us early so the conditions can be re-evaluated.
	 */
	while (gethrtime() - start < delay &&
	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
		    &tx->tx_sync_lock, delay, resolution, 0);
	}

	DMU_TX_STAT_BUMP(dmu_tx_delay);

	mutex_exit(&tx->tx_sync_lock);
}
701eda14cbcSMatt Macy 
/*
 * Block until the given txg has been synced.  If txg == 0, wait for
 * tx_open_txg + TXG_DEFER_SIZE to sync.  When wait_sig is B_TRUE the
 * wait may be interrupted by a signal, in which case B_TRUE is returned
 * and the caller may retry; B_FALSE means the txg has actually synced.
 */
static boolean_t
txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	/* Both the sync and quiesce threads must be running. */
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	if (tx->tx_sync_txg_waiting < txg)
		tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
	    (u_longlong_t)tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
		dprintf("broadcasting sync more "
		    "tx_synced=%llu waiting=%llu dp=%px\n",
		    (u_longlong_t)tx->tx_synced_txg,
		    (u_longlong_t)tx->tx_sync_txg_waiting, dp);
		/* Wake the sync thread in case it is idle. */
		cv_broadcast(&tx->tx_sync_more_cv);
		if (wait_sig) {
			/*
			 * Condition wait here but stop if the thread receives a
			 * signal. The caller may call txg_wait_synced*() again
			 * to resume waiting for this txg.
			 */
			if (cv_wait_io_sig(&tx->tx_sync_done_cv,
			    &tx->tx_sync_lock) == 0) {
				mutex_exit(&tx->tx_sync_lock);
				return (B_TRUE);
			}
		} else {
			cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
		}
	}
	mutex_exit(&tx->tx_sync_lock);
	return (B_FALSE);
}
742eda14cbcSMatt Macy 
743eda14cbcSMatt Macy void
txg_wait_synced(dsl_pool_t * dp,uint64_t txg)744eda14cbcSMatt Macy txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
745eda14cbcSMatt Macy {
746eda14cbcSMatt Macy 	VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
747eda14cbcSMatt Macy }
748eda14cbcSMatt Macy 
749eda14cbcSMatt Macy /*
750eda14cbcSMatt Macy  * Similar to a txg_wait_synced but it can be interrupted from a signal.
751eda14cbcSMatt Macy  * Returns B_TRUE if the thread was signaled while waiting.
752eda14cbcSMatt Macy  */
753eda14cbcSMatt Macy boolean_t
txg_wait_synced_sig(dsl_pool_t * dp,uint64_t txg)754eda14cbcSMatt Macy txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
755eda14cbcSMatt Macy {
756eda14cbcSMatt Macy 	return (txg_wait_synced_impl(dp, txg, B_TRUE));
757eda14cbcSMatt Macy }
758eda14cbcSMatt Macy 
/*
 * Wait for the specified open transaction group.  Set should_quiesce
 * when the current open txg should be quiesced immediately.  If
 * txg == 0, wait for the txg following the currently open one.
 */
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	/* Both the sync and quiesce threads must be running. */
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
		txg = tx->tx_open_txg + 1;
	if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
		tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
	    (u_longlong_t)tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
		/* Prod the quiesce thread, then wait for a handoff. */
		cv_broadcast(&tx->tx_quiesce_more_cv);
		/*
		 * Callers setting should_quiesce will use cv_wait_io() and
		 * be accounted for as iowait time.  Otherwise, the caller is
		 * understood to be idle and cv_wait_idle() is used to prevent
		 * incorrectly inflating the system load average.
		 */
		if (should_quiesce == B_TRUE) {
			cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
		} else {
			cv_wait_idle(&tx->tx_quiesce_done_cv,
			    &tx->tx_sync_lock);
		}
	}
	mutex_exit(&tx->tx_sync_lock);
}
796eda14cbcSMatt Macy 
/*
 * Pass in the txg number that should be synced.
 */
void
txg_kick(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	/* Cheap lockless check; a stale read is re-validated under the lock. */
	if (tx->tx_sync_txg_waiting >= txg)
		return;

	mutex_enter(&tx->tx_sync_lock);
	/* Re-check under the lock before waking the sync thread. */
	if (tx->tx_sync_txg_waiting < txg) {
		tx->tx_sync_txg_waiting = txg;
		cv_broadcast(&tx->tx_sync_more_cv);
	}
	mutex_exit(&tx->tx_sync_lock);
}
817eda14cbcSMatt Macy 
818eda14cbcSMatt Macy boolean_t
txg_stalled(dsl_pool_t * dp)819eda14cbcSMatt Macy txg_stalled(dsl_pool_t *dp)
820eda14cbcSMatt Macy {
821eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
822eda14cbcSMatt Macy 	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
823eda14cbcSMatt Macy }
824eda14cbcSMatt Macy 
825eda14cbcSMatt Macy boolean_t
txg_sync_waiting(dsl_pool_t * dp)826eda14cbcSMatt Macy txg_sync_waiting(dsl_pool_t *dp)
827eda14cbcSMatt Macy {
828eda14cbcSMatt Macy 	tx_state_t *tx = &dp->dp_tx;
829eda14cbcSMatt Macy 
830eda14cbcSMatt Macy 	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
831eda14cbcSMatt Macy 	    tx->tx_quiesced_txg != 0);
832eda14cbcSMatt Macy }
833eda14cbcSMatt Macy 
/*
 * Verify that this txg is active (open, quiescing, syncing).  Non-active
 * txg's should not be manipulated.
 */
#ifdef ZFS_DEBUG
void
txg_verify(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp __maybe_unused = spa_get_dsl(spa);

	/* Initial txgs and the ziltest txg are exempt from verification. */
	if (txg == ZILTEST_TXG || txg <= TXG_INITIAL)
		return;

	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
}
#endif
850eda14cbcSMatt Macy 
851eda14cbcSMatt Macy /*
852eda14cbcSMatt Macy  * Per-txg object lists.
853eda14cbcSMatt Macy  */
854eda14cbcSMatt Macy void
txg_list_create(txg_list_t * tl,spa_t * spa,size_t offset)855eda14cbcSMatt Macy txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
856eda14cbcSMatt Macy {
857eda14cbcSMatt Macy 	int t;
858eda14cbcSMatt Macy 
859eda14cbcSMatt Macy 	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
860eda14cbcSMatt Macy 
861eda14cbcSMatt Macy 	tl->tl_offset = offset;
862eda14cbcSMatt Macy 	tl->tl_spa = spa;
863eda14cbcSMatt Macy 
864eda14cbcSMatt Macy 	for (t = 0; t < TXG_SIZE; t++)
865eda14cbcSMatt Macy 		tl->tl_head[t] = NULL;
866eda14cbcSMatt Macy }
867eda14cbcSMatt Macy 
868eda14cbcSMatt Macy static boolean_t
txg_list_empty_impl(txg_list_t * tl,uint64_t txg)869eda14cbcSMatt Macy txg_list_empty_impl(txg_list_t *tl, uint64_t txg)
870eda14cbcSMatt Macy {
871eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&tl->tl_lock));
872eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
873eda14cbcSMatt Macy 	return (tl->tl_head[txg & TXG_MASK] == NULL);
874eda14cbcSMatt Macy }
875eda14cbcSMatt Macy 
876eda14cbcSMatt Macy boolean_t
txg_list_empty(txg_list_t * tl,uint64_t txg)877eda14cbcSMatt Macy txg_list_empty(txg_list_t *tl, uint64_t txg)
878eda14cbcSMatt Macy {
879eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
880eda14cbcSMatt Macy 	boolean_t ret = txg_list_empty_impl(tl, txg);
881eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
882eda14cbcSMatt Macy 
883eda14cbcSMatt Macy 	return (ret);
884eda14cbcSMatt Macy }
885eda14cbcSMatt Macy 
886eda14cbcSMatt Macy void
txg_list_destroy(txg_list_t * tl)887eda14cbcSMatt Macy txg_list_destroy(txg_list_t *tl)
888eda14cbcSMatt Macy {
889eda14cbcSMatt Macy 	int t;
890eda14cbcSMatt Macy 
891eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
892eda14cbcSMatt Macy 	for (t = 0; t < TXG_SIZE; t++)
893eda14cbcSMatt Macy 		ASSERT(txg_list_empty_impl(tl, t));
894eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
895eda14cbcSMatt Macy 
896eda14cbcSMatt Macy 	mutex_destroy(&tl->tl_lock);
897eda14cbcSMatt Macy }
898eda14cbcSMatt Macy 
899eda14cbcSMatt Macy /*
900eda14cbcSMatt Macy  * Returns true if all txg lists are empty.
901eda14cbcSMatt Macy  *
902eda14cbcSMatt Macy  * Warning: this is inherently racy (an item could be added immediately
903eda14cbcSMatt Macy  * after this function returns).
904eda14cbcSMatt Macy  */
905eda14cbcSMatt Macy boolean_t
txg_all_lists_empty(txg_list_t * tl)906eda14cbcSMatt Macy txg_all_lists_empty(txg_list_t *tl)
907eda14cbcSMatt Macy {
9087b5e6873SMartin Matuska 	boolean_t res = B_TRUE;
9097b5e6873SMartin Matuska 	for (int i = 0; i < TXG_SIZE; i++)
9107b5e6873SMartin Matuska 		res &= (tl->tl_head[i] == NULL);
9117b5e6873SMartin Matuska 	return (res);
912eda14cbcSMatt Macy }
913eda14cbcSMatt Macy 
914eda14cbcSMatt Macy /*
915eda14cbcSMatt Macy  * Add an entry to the list (unless it's already on the list).
916eda14cbcSMatt Macy  * Returns B_TRUE if it was actually added.
917eda14cbcSMatt Macy  */
918eda14cbcSMatt Macy boolean_t
txg_list_add(txg_list_t * tl,void * p,uint64_t txg)919eda14cbcSMatt Macy txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
920eda14cbcSMatt Macy {
921eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
922eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
923eda14cbcSMatt Macy 	boolean_t add;
924eda14cbcSMatt Macy 
925eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
926eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
927eda14cbcSMatt Macy 	add = (tn->tn_member[t] == 0);
928eda14cbcSMatt Macy 	if (add) {
929eda14cbcSMatt Macy 		tn->tn_member[t] = 1;
930eda14cbcSMatt Macy 		tn->tn_next[t] = tl->tl_head[t];
931eda14cbcSMatt Macy 		tl->tl_head[t] = tn;
932eda14cbcSMatt Macy 	}
933eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
934eda14cbcSMatt Macy 
935eda14cbcSMatt Macy 	return (add);
936eda14cbcSMatt Macy }
937eda14cbcSMatt Macy 
938eda14cbcSMatt Macy /*
939eda14cbcSMatt Macy  * Add an entry to the end of the list, unless it's already on the list.
940eda14cbcSMatt Macy  * (walks list to find end)
941eda14cbcSMatt Macy  * Returns B_TRUE if it was actually added.
942eda14cbcSMatt Macy  */
943eda14cbcSMatt Macy boolean_t
txg_list_add_tail(txg_list_t * tl,void * p,uint64_t txg)944eda14cbcSMatt Macy txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
945eda14cbcSMatt Macy {
946eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
947eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
948eda14cbcSMatt Macy 	boolean_t add;
949eda14cbcSMatt Macy 
950eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
951eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
952eda14cbcSMatt Macy 	add = (tn->tn_member[t] == 0);
953eda14cbcSMatt Macy 	if (add) {
954eda14cbcSMatt Macy 		txg_node_t **tp;
955eda14cbcSMatt Macy 
956eda14cbcSMatt Macy 		for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
957eda14cbcSMatt Macy 			continue;
958eda14cbcSMatt Macy 
959eda14cbcSMatt Macy 		tn->tn_member[t] = 1;
960eda14cbcSMatt Macy 		tn->tn_next[t] = NULL;
961eda14cbcSMatt Macy 		*tp = tn;
962eda14cbcSMatt Macy 	}
963eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
964eda14cbcSMatt Macy 
965eda14cbcSMatt Macy 	return (add);
966eda14cbcSMatt Macy }
967eda14cbcSMatt Macy 
968eda14cbcSMatt Macy /*
969eda14cbcSMatt Macy  * Remove the head of the list and return it.
970eda14cbcSMatt Macy  */
971eda14cbcSMatt Macy void *
txg_list_remove(txg_list_t * tl,uint64_t txg)972eda14cbcSMatt Macy txg_list_remove(txg_list_t *tl, uint64_t txg)
973eda14cbcSMatt Macy {
974eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
975eda14cbcSMatt Macy 	txg_node_t *tn;
976eda14cbcSMatt Macy 	void *p = NULL;
977eda14cbcSMatt Macy 
978eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
979eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
980eda14cbcSMatt Macy 	if ((tn = tl->tl_head[t]) != NULL) {
981eda14cbcSMatt Macy 		ASSERT(tn->tn_member[t]);
982eda14cbcSMatt Macy 		ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
983eda14cbcSMatt Macy 		p = (char *)tn - tl->tl_offset;
984eda14cbcSMatt Macy 		tl->tl_head[t] = tn->tn_next[t];
985eda14cbcSMatt Macy 		tn->tn_next[t] = NULL;
986eda14cbcSMatt Macy 		tn->tn_member[t] = 0;
987eda14cbcSMatt Macy 	}
988eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
989eda14cbcSMatt Macy 
990eda14cbcSMatt Macy 	return (p);
991eda14cbcSMatt Macy }
992eda14cbcSMatt Macy 
993eda14cbcSMatt Macy /*
994eda14cbcSMatt Macy  * Remove a specific item from the list and return it.
995eda14cbcSMatt Macy  */
996eda14cbcSMatt Macy void *
txg_list_remove_this(txg_list_t * tl,void * p,uint64_t txg)997eda14cbcSMatt Macy txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
998eda14cbcSMatt Macy {
999eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
1000eda14cbcSMatt Macy 	txg_node_t *tn, **tp;
1001eda14cbcSMatt Macy 
1002eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1003eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
1004eda14cbcSMatt Macy 
1005eda14cbcSMatt Macy 	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
1006eda14cbcSMatt Macy 		if ((char *)tn - tl->tl_offset == p) {
1007eda14cbcSMatt Macy 			*tp = tn->tn_next[t];
1008eda14cbcSMatt Macy 			tn->tn_next[t] = NULL;
1009eda14cbcSMatt Macy 			tn->tn_member[t] = 0;
1010eda14cbcSMatt Macy 			mutex_exit(&tl->tl_lock);
1011eda14cbcSMatt Macy 			return (p);
1012eda14cbcSMatt Macy 		}
1013eda14cbcSMatt Macy 	}
1014eda14cbcSMatt Macy 
1015eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
1016eda14cbcSMatt Macy 
1017eda14cbcSMatt Macy 	return (NULL);
1018eda14cbcSMatt Macy }
1019eda14cbcSMatt Macy 
1020eda14cbcSMatt Macy boolean_t
txg_list_member(txg_list_t * tl,void * p,uint64_t txg)1021eda14cbcSMatt Macy txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
1022eda14cbcSMatt Macy {
1023eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
1024eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
1025eda14cbcSMatt Macy 
1026eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1027eda14cbcSMatt Macy 	return (tn->tn_member[t] != 0);
1028eda14cbcSMatt Macy }
1029eda14cbcSMatt Macy 
1030eda14cbcSMatt Macy /*
1031eda14cbcSMatt Macy  * Walk a txg list
1032eda14cbcSMatt Macy  */
1033eda14cbcSMatt Macy void *
txg_list_head(txg_list_t * tl,uint64_t txg)1034eda14cbcSMatt Macy txg_list_head(txg_list_t *tl, uint64_t txg)
1035eda14cbcSMatt Macy {
1036eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
1037eda14cbcSMatt Macy 	txg_node_t *tn;
1038eda14cbcSMatt Macy 
1039eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
1040eda14cbcSMatt Macy 	tn = tl->tl_head[t];
1041eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
1042eda14cbcSMatt Macy 
1043eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1044eda14cbcSMatt Macy 	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
1045eda14cbcSMatt Macy }
1046eda14cbcSMatt Macy 
1047eda14cbcSMatt Macy void *
txg_list_next(txg_list_t * tl,void * p,uint64_t txg)1048eda14cbcSMatt Macy txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
1049eda14cbcSMatt Macy {
1050eda14cbcSMatt Macy 	int t = txg & TXG_MASK;
1051eda14cbcSMatt Macy 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
1052eda14cbcSMatt Macy 
1053eda14cbcSMatt Macy 	TXG_VERIFY(tl->tl_spa, txg);
1054eda14cbcSMatt Macy 
1055eda14cbcSMatt Macy 	mutex_enter(&tl->tl_lock);
1056eda14cbcSMatt Macy 	tn = tn->tn_next[t];
1057eda14cbcSMatt Macy 	mutex_exit(&tl->tl_lock);
1058eda14cbcSMatt Macy 
1059eda14cbcSMatt Macy 	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
1060eda14cbcSMatt Macy }
1061eda14cbcSMatt Macy 
1062eda14cbcSMatt Macy EXPORT_SYMBOL(txg_init);
1063eda14cbcSMatt Macy EXPORT_SYMBOL(txg_fini);
1064eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_start);
1065eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_stop);
1066eda14cbcSMatt Macy EXPORT_SYMBOL(txg_hold_open);
1067eda14cbcSMatt Macy EXPORT_SYMBOL(txg_rele_to_quiesce);
1068eda14cbcSMatt Macy EXPORT_SYMBOL(txg_rele_to_sync);
1069eda14cbcSMatt Macy EXPORT_SYMBOL(txg_register_callbacks);
1070eda14cbcSMatt Macy EXPORT_SYMBOL(txg_delay);
1071eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_synced);
1072eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_open);
1073eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_callbacks);
1074eda14cbcSMatt Macy EXPORT_SYMBOL(txg_stalled);
1075eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_waiting);
1076eda14cbcSMatt Macy 
1077be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, UINT, ZMOD_RW,
1078eda14cbcSMatt Macy 	"Max seconds worth of delta per txg");
1079