1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23eda14cbcSMatt Macy * Portions Copyright 2011 Martin Matuska 24eda14cbcSMatt Macy * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
25eda14cbcSMatt Macy */ 26eda14cbcSMatt Macy 27eda14cbcSMatt Macy #include <sys/zfs_context.h> 28eda14cbcSMatt Macy #include <sys/txg_impl.h> 29eda14cbcSMatt Macy #include <sys/dmu_impl.h> 30eda14cbcSMatt Macy #include <sys/spa_impl.h> 31eda14cbcSMatt Macy #include <sys/dmu_tx.h> 32eda14cbcSMatt Macy #include <sys/dsl_pool.h> 33eda14cbcSMatt Macy #include <sys/dsl_scan.h> 34eda14cbcSMatt Macy #include <sys/zil.h> 35eda14cbcSMatt Macy #include <sys/callb.h> 36eda14cbcSMatt Macy #include <sys/trace_zfs.h> 37eda14cbcSMatt Macy 38eda14cbcSMatt Macy /* 39eda14cbcSMatt Macy * ZFS Transaction Groups 40eda14cbcSMatt Macy * ---------------------- 41eda14cbcSMatt Macy * 42eda14cbcSMatt Macy * ZFS transaction groups are, as the name implies, groups of transactions 43eda14cbcSMatt Macy * that act on persistent state. ZFS asserts consistency at the granularity of 44eda14cbcSMatt Macy * these transaction groups. Each successive transaction group (txg) is 45eda14cbcSMatt Macy * assigned a 64-bit consecutive identifier. There are three active 46eda14cbcSMatt Macy * transaction group states: open, quiescing, or syncing. At any given time, 47eda14cbcSMatt Macy * there may be an active txg associated with each state; each active txg may 48eda14cbcSMatt Macy * either be processing, or blocked waiting to enter the next state. There may 49eda14cbcSMatt Macy * be up to three active txgs, and there is always a txg in the open state 50eda14cbcSMatt Macy * (though it may be blocked waiting to enter the quiescing state). In broad 51eda14cbcSMatt Macy * strokes, transactions -- operations that change in-memory structures -- are 52eda14cbcSMatt Macy * accepted into the txg in the open state, and are completed while the txg is 53eda14cbcSMatt Macy * in the open or quiescing states. The accumulated changes are written to 54eda14cbcSMatt Macy * disk in the syncing state. 
55eda14cbcSMatt Macy * 56eda14cbcSMatt Macy * Open 57eda14cbcSMatt Macy * 58eda14cbcSMatt Macy * When a new txg becomes active, it first enters the open state. New 59eda14cbcSMatt Macy * transactions -- updates to in-memory structures -- are assigned to the 60eda14cbcSMatt Macy * currently open txg. There is always a txg in the open state so that ZFS can 61eda14cbcSMatt Macy * accept new changes (though the txg may refuse new changes if it has hit 62eda14cbcSMatt Macy * some limit). ZFS advances the open txg to the next state for a variety of 63eda14cbcSMatt Macy * reasons such as it hitting a time or size threshold, or the execution of an 64eda14cbcSMatt Macy * administrative action that must be completed in the syncing state. 65eda14cbcSMatt Macy * 66eda14cbcSMatt Macy * Quiescing 67eda14cbcSMatt Macy * 68eda14cbcSMatt Macy * After a txg exits the open state, it enters the quiescing state. The 69eda14cbcSMatt Macy * quiescing state is intended to provide a buffer between accepting new 70eda14cbcSMatt Macy * transactions in the open state and writing them out to stable storage in 71eda14cbcSMatt Macy * the syncing state. While quiescing, transactions can continue their 72eda14cbcSMatt Macy * operation without delaying either of the other states. Typically, a txg is 73eda14cbcSMatt Macy * in the quiescing state very briefly since the operations are bounded by 74eda14cbcSMatt Macy * software latencies rather than, say, slower I/O latencies. After all 75eda14cbcSMatt Macy * transactions complete, the txg is ready to enter the next state. 76eda14cbcSMatt Macy * 77eda14cbcSMatt Macy * Syncing 78eda14cbcSMatt Macy * 79eda14cbcSMatt Macy * In the syncing state, the in-memory state built up during the open and (to 80eda14cbcSMatt Macy * a lesser degree) the quiescing states is written to stable storage. The 81eda14cbcSMatt Macy * process of writing out modified data can, in turn modify more data. 
For 82eda14cbcSMatt Macy * example when we write new blocks, we need to allocate space for them; those 83eda14cbcSMatt Macy * allocations modify metadata (space maps)... which themselves must be 84eda14cbcSMatt Macy * written to stable storage. During the sync state, ZFS iterates, writing out 85eda14cbcSMatt Macy * data until it converges and all in-memory changes have been written out. 86eda14cbcSMatt Macy * The first such pass is the largest as it encompasses all the modified user 87eda14cbcSMatt Macy * data (as opposed to filesystem metadata). Subsequent passes typically have 88eda14cbcSMatt Macy * far less data to write as they consist exclusively of filesystem metadata. 89eda14cbcSMatt Macy * 90eda14cbcSMatt Macy * To ensure convergence, after a certain number of passes ZFS begins 91eda14cbcSMatt Macy * overwriting locations on stable storage that had been allocated earlier in 92eda14cbcSMatt Macy * the syncing state (and subsequently freed). ZFS usually allocates new 93eda14cbcSMatt Macy * blocks to optimize for large, continuous, writes. For the syncing state to 94eda14cbcSMatt Macy * converge however it must complete a pass where no new blocks are allocated 95eda14cbcSMatt Macy * since each allocation requires a modification of persistent metadata. 96eda14cbcSMatt Macy * Further, to hasten convergence, after a prescribed number of passes, ZFS 97eda14cbcSMatt Macy * also defers frees, and stops compressing. 98eda14cbcSMatt Macy * 99eda14cbcSMatt Macy * In addition to writing out user data, we must also execute synctasks during 100eda14cbcSMatt Macy * the syncing context. A synctask is the mechanism by which some 101eda14cbcSMatt Macy * administrative activities work such as creating and destroying snapshots or 102eda14cbcSMatt Macy * datasets. 
 * Note that when a synctask is initiated it enters the open txg,
 * and ZFS then pushes that txg as quickly as possible to completion of the
 * syncing state in order to reduce the latency of the administrative
 * activity. To complete the syncing state, ZFS writes out a new uberblock,
 * the root of the tree of blocks that comprise all state stored on the ZFS
 * pool. Finally, if there is a quiesced txg waiting, we signal that it can
 * now transition to the syncing state.
 */

/*
 * Per-pool worker threads (created in txg_sync_start(), stopped in
 * txg_sync_stop()): the quiesce thread drains open txgs and the sync
 * thread writes quiesced txgs to stable storage.  Neither returns.
 */
static __attribute__((noreturn)) void txg_sync_thread(void *arg);
static __attribute__((noreturn)) void txg_quiesce_thread(void *arg);

/*
 * Tunable: the sync thread's timeout, i.e. how long a txg may stay open
 * before it is forced through quiesce/sync (see txg_sync_thread()).
 */
uint_t zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */

/*
 * Prepare the txg subsystem.
118eda14cbcSMatt Macy */ 119eda14cbcSMatt Macy void 120eda14cbcSMatt Macy txg_init(dsl_pool_t *dp, uint64_t txg) 121eda14cbcSMatt Macy { 122eda14cbcSMatt Macy tx_state_t *tx = &dp->dp_tx; 123eda14cbcSMatt Macy int c; 124da5137abSMartin Matuska memset(tx, 0, sizeof (tx_state_t)); 125eda14cbcSMatt Macy 126eda14cbcSMatt Macy tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); 127eda14cbcSMatt Macy 128eda14cbcSMatt Macy for (c = 0; c < max_ncpus; c++) { 129eda14cbcSMatt Macy int i; 130eda14cbcSMatt Macy 131eda14cbcSMatt Macy mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); 132eda14cbcSMatt Macy mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP, 133eda14cbcSMatt Macy NULL); 134eda14cbcSMatt Macy for (i = 0; i < TXG_SIZE; i++) { 135eda14cbcSMatt Macy cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, 136eda14cbcSMatt Macy NULL); 137eda14cbcSMatt Macy list_create(&tx->tx_cpu[c].tc_callbacks[i], 138eda14cbcSMatt Macy sizeof (dmu_tx_callback_t), 139eda14cbcSMatt Macy offsetof(dmu_tx_callback_t, dcb_node)); 140eda14cbcSMatt Macy } 141eda14cbcSMatt Macy } 142eda14cbcSMatt Macy 143eda14cbcSMatt Macy mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); 144eda14cbcSMatt Macy 145eda14cbcSMatt Macy cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); 146eda14cbcSMatt Macy cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); 147eda14cbcSMatt Macy cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); 148eda14cbcSMatt Macy cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); 149eda14cbcSMatt Macy cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); 150eda14cbcSMatt Macy 151eda14cbcSMatt Macy tx->tx_open_txg = txg; 152eda14cbcSMatt Macy } 153eda14cbcSMatt Macy 154eda14cbcSMatt Macy /* 155eda14cbcSMatt Macy * Close down the txg subsystem. 
156eda14cbcSMatt Macy */ 157eda14cbcSMatt Macy void 158eda14cbcSMatt Macy txg_fini(dsl_pool_t *dp) 159eda14cbcSMatt Macy { 160eda14cbcSMatt Macy tx_state_t *tx = &dp->dp_tx; 161eda14cbcSMatt Macy int c; 162eda14cbcSMatt Macy 163eda14cbcSMatt Macy ASSERT0(tx->tx_threads); 164eda14cbcSMatt Macy 165eda14cbcSMatt Macy mutex_destroy(&tx->tx_sync_lock); 166eda14cbcSMatt Macy 167eda14cbcSMatt Macy cv_destroy(&tx->tx_sync_more_cv); 168eda14cbcSMatt Macy cv_destroy(&tx->tx_sync_done_cv); 169eda14cbcSMatt Macy cv_destroy(&tx->tx_quiesce_more_cv); 170eda14cbcSMatt Macy cv_destroy(&tx->tx_quiesce_done_cv); 171eda14cbcSMatt Macy cv_destroy(&tx->tx_exit_cv); 172eda14cbcSMatt Macy 173eda14cbcSMatt Macy for (c = 0; c < max_ncpus; c++) { 174eda14cbcSMatt Macy int i; 175eda14cbcSMatt Macy 176eda14cbcSMatt Macy mutex_destroy(&tx->tx_cpu[c].tc_open_lock); 177eda14cbcSMatt Macy mutex_destroy(&tx->tx_cpu[c].tc_lock); 178eda14cbcSMatt Macy for (i = 0; i < TXG_SIZE; i++) { 179eda14cbcSMatt Macy cv_destroy(&tx->tx_cpu[c].tc_cv[i]); 180eda14cbcSMatt Macy list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); 181eda14cbcSMatt Macy } 182eda14cbcSMatt Macy } 183eda14cbcSMatt Macy 184eda14cbcSMatt Macy if (tx->tx_commit_cb_taskq != NULL) 185eda14cbcSMatt Macy taskq_destroy(tx->tx_commit_cb_taskq); 186eda14cbcSMatt Macy 187eda14cbcSMatt Macy vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); 188eda14cbcSMatt Macy 189da5137abSMartin Matuska memset(tx, 0, sizeof (tx_state_t)); 190eda14cbcSMatt Macy } 191eda14cbcSMatt Macy 192eda14cbcSMatt Macy /* 193eda14cbcSMatt Macy * Start syncing transaction groups. 
 */
void
txg_sync_start(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);

	dprintf("pool %p\n", dp);

	ASSERT0(tx->tx_threads);

	/* One quiesce thread plus one sync thread. */
	tx->tx_threads = 2;

	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
	    dp, 0, &p0, TS_RUN, defclsyspri);

	/*
	 * The sync thread can need a larger-than-default stack size on
	 * 32-bit x86.  This is due in part to nested pools and
	 * scrub_visitbp() recursion.
	 */
	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
	    dp, 0, &p0, TS_RUN, defclsyspri);

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Register the calling worker thread with the suspend/resume (CPR) callback
 * framework and enter the pool-wide sync lock.  Both worker threads call
 * this once at startup; the lock is held from here on except while waiting
 * in txg_thread_wait() or while actually quiescing/syncing.
 */
static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
	mutex_enter(&tx->tx_sync_lock);
}

/*
 * Tear down a worker thread: clear its thread pointer (*tpp), drop the
 * thread count, wake anyone waiting in txg_sync_stop(), and exit.
 * Does not return.
 */
static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
	ASSERT(*tpp != NULL);
	*tpp = NULL;
	tx->tx_threads--;
	cv_broadcast(&tx->tx_exit_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
	thread_exit();
}

/*
 * Wait on the given condition variable (bounded by 'time' ticks if nonzero),
 * marking the thread CPR-safe for the duration.  Called and returns with
 * tx_sync_lock held.
 */
static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
	CALLB_CPR_SAFE_BEGIN(cpr);

	if (time) {
		(void) cv_timedwait_idle(cv, &tx->tx_sync_lock,
		    ddi_get_lbolt() + time);
	} else {
		cv_wait_idle(cv, &tx->tx_sync_lock);
	}

	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}

/*
 * Stop syncing transaction groups.
 */
void
txg_sync_stop(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	dprintf("pool %p\n", dp);
	/*
	 * Finish off any work in progress.
	 */
	ASSERT3U(tx->tx_threads, ==, 2);

	/*
	 * We need to ensure that we've vacated the deferred metaslab trees.
	 */
	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

	/*
	 * Wake all sync threads and wait for them to die.
	 */
	mutex_enter(&tx->tx_sync_lock);

	ASSERT3U(tx->tx_threads, ==, 2);

	tx->tx_exiting = 1;

	/* Wake both threads out of whichever cv they are blocked on. */
	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_broadcast(&tx->tx_quiesce_done_cv);
	cv_broadcast(&tx->tx_sync_more_cv);

	while (tx->tx_threads != 0)
		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

	tx->tx_exiting = 0;

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Get a handle on the currently open txg and keep it open.
 *
 * The txg is guaranteed to stay open until txg_rele_to_quiesce() is called for
 * the handle. Once txg_rele_to_quiesce() has been called, the txg stays
 * in quiescing state until txg_rele_to_sync() is called for the handle.
 *
 * It is guaranteed that subsequent calls return monotonically increasing
 * txgs for the same dsl_pool_t. Of course this is not strong monotonicity,
 * because the same txg can be returned multiple times in a row. This
 * guarantee holds both for subsequent calls from one thread and for multiple
 * threads. For example, it is impossible to observe the following sequence
 * of events:
 *
 * Thread 1                            Thread 2
 *
 * 1 <- txg_hold_open(P, ...)
 *                                     2 <- txg_hold_open(P, ...)
 * 1 <- txg_hold_open(P, ...)
 *
 */
uint64_t
txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
{
	tx_state_t *tx = &dp->dp_tx;
	tx_cpu_t *tc;
	uint64_t txg;

	/*
	 * It appears the processor id is simply used as a "random"
	 * number to index into the array, and there isn't any other
	 * significance to the chosen tx_cpu. Because.. Why not use
	 * the current cpu to index into the array?
	 */
	tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE];

	/* tc_open_lock is held until txg_rele_to_quiesce() releases it. */
	mutex_enter(&tc->tc_open_lock);
	txg = tx->tx_open_txg;

	mutex_enter(&tc->tc_lock);
	tc->tc_count[txg & TXG_MASK]++;
	mutex_exit(&tc->tc_lock);

	th->th_cpu = tc;
	th->th_txg = txg;

	return (txg);
}

/*
 * Release the open-lock hold taken by txg_hold_open(), allowing the handle's
 * txg to begin quiescing.  The handle's reference count remains held until
 * txg_rele_to_sync().
 */
void
txg_rele_to_quiesce(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;

	/* Lock ordering: tc_lock must never be held across this release. */
	ASSERT(!MUTEX_HELD(&tc->tc_lock));
	mutex_exit(&tc->tc_open_lock);
}

/*
 * Move the caller's commit callbacks onto this handle's per-CPU callback
 * list; they will be dispatched after the handle's txg has synced (see
 * txg_dispatch_callbacks()).
 */
void
txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
	mutex_exit(&tc->tc_lock);
}

/*
 * Drop the reference taken by txg_hold_open().  When the last reference for
 * the txg on this tx_cpu is dropped, wake the quiesce thread waiting in
 * txg_quiesce().
 */
void
txg_rele_to_sync(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	ASSERT(tc->tc_count[g] != 0);
	if (--tc->tc_count[g] == 0)
		cv_broadcast(&tc->tc_cv[g]);
	mutex_exit(&tc->tc_lock);

	th->th_cpu = NULL;	/* defensive */
}

/*
 * Blocks until all transactions in the group are committed.
 *
 * On return, the transaction group has reached a stable state in which it can
 * then be passed off to the syncing context.
 */
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	uint64_t tx_open_time;
	int g = txg & TXG_MASK;
	int c;

	/*
	 * Grab all tc_open_locks so nobody else can get into this txg.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_enter(&tx->tx_cpu[c].tc_open_lock);

	ASSERT(txg == tx->tx_open_txg);
	tx->tx_open_txg++;
	tx->tx_open_time = tx_open_time = gethrtime();

	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);

	/*
	 * Now that we've incremented tx_open_txg, we can let threads
	 * enter the next transaction group.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_exit(&tx->tx_cpu[c].tc_open_lock);

	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
	spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);

	/*
	 * Quiesce the transaction group by waiting for everyone to
	 * call txg_rele_to_sync() for their open transaction handles.
	 */
	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		mutex_enter(&tc->tc_lock);
		while (tc->tc_count[g] != 0)
			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
		mutex_exit(&tc->tc_lock);
	}

	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
}

/*
 * Taskq callback: run one batch of commit callbacks (with error 0, i.e.
 * success) and free the list allocated by txg_dispatch_callbacks().
 */
static void
txg_do_callbacks(void *cb_list)
{
	dmu_tx_do_callbacks(cb_list, 0);

	list_destroy(cb_list);

	kmem_free(cb_list, sizeof (list_t));
}

/*
 * Dispatch the commit callbacks registered on this txg to worker threads.
 *
 * If no callbacks are registered for a given TXG, nothing happens.
 * This function creates a taskq for the associated pool, if needed.
 */
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
{
	int c;
	tx_state_t *tx = &dp->dp_tx;
	list_t *cb_list;

	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		/*
		 * No need to lock tx_cpu_t at this point, since this can
		 * only be called once a txg has been synced.
		 */

		int g = txg & TXG_MASK;

		if (list_is_empty(&tc->tc_callbacks[g]))
			continue;

		if (tx->tx_commit_cb_taskq == NULL) {
			/*
			 * Commit callback taskq hasn't been created yet.
			 */
			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
			    100, defclsyspri, boot_ncpus, boot_ncpus * 2,
			    TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
			    TASKQ_THREADS_CPU_PCT);
		}

		/* Ownership of cb_list passes to txg_do_callbacks(). */
		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(cb_list, sizeof (dmu_tx_callback_t),
		    offsetof(dmu_tx_callback_t, dcb_node));

		list_move_tail(cb_list, &tc->tc_callbacks[g]);

		(void) taskq_dispatch(tx->tx_commit_cb_taskq,
		    txg_do_callbacks, cb_list, TQ_SLEEP);
	}
}

/*
 * Wait for pending commit callbacks of already-synced transactions to finish
 * processing.
 * Calling this function from within a commit callback will deadlock.
491eda14cbcSMatt Macy */ 492eda14cbcSMatt Macy void 493eda14cbcSMatt Macy txg_wait_callbacks(dsl_pool_t *dp) 494eda14cbcSMatt Macy { 495eda14cbcSMatt Macy tx_state_t *tx = &dp->dp_tx; 496eda14cbcSMatt Macy 497eda14cbcSMatt Macy if (tx->tx_commit_cb_taskq != NULL) 498eda14cbcSMatt Macy taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); 499eda14cbcSMatt Macy } 500eda14cbcSMatt Macy 501eda14cbcSMatt Macy static boolean_t 502eda14cbcSMatt Macy txg_is_quiescing(dsl_pool_t *dp) 503eda14cbcSMatt Macy { 504eda14cbcSMatt Macy tx_state_t *tx = &dp->dp_tx; 505eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); 506eda14cbcSMatt Macy return (tx->tx_quiescing_txg != 0); 507eda14cbcSMatt Macy } 508eda14cbcSMatt Macy 509eda14cbcSMatt Macy static boolean_t 510eda14cbcSMatt Macy txg_has_quiesced_to_sync(dsl_pool_t *dp) 511eda14cbcSMatt Macy { 512eda14cbcSMatt Macy tx_state_t *tx = &dp->dp_tx; 513eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); 514eda14cbcSMatt Macy return (tx->tx_quiesced_txg != 0); 515eda14cbcSMatt Macy } 516eda14cbcSMatt Macy 517da5137abSMartin Matuska static __attribute__((noreturn)) void 518eda14cbcSMatt Macy txg_sync_thread(void *arg) 519eda14cbcSMatt Macy { 520eda14cbcSMatt Macy dsl_pool_t *dp = arg; 521eda14cbcSMatt Macy spa_t *spa = dp->dp_spa; 522eda14cbcSMatt Macy tx_state_t *tx = &dp->dp_tx; 523eda14cbcSMatt Macy callb_cpr_t cpr; 524eda14cbcSMatt Macy clock_t start, delta; 525eda14cbcSMatt Macy 526eda14cbcSMatt Macy (void) spl_fstrans_mark(); 527eda14cbcSMatt Macy txg_thread_enter(tx, &cpr); 528eda14cbcSMatt Macy 529eda14cbcSMatt Macy start = delta = 0; 530eda14cbcSMatt Macy for (;;) { 531eda14cbcSMatt Macy clock_t timeout = zfs_txg_timeout * hz; 532eda14cbcSMatt Macy clock_t timer; 533eda14cbcSMatt Macy uint64_t txg; 534eda14cbcSMatt Macy 535eda14cbcSMatt Macy /* 536eda14cbcSMatt Macy * We sync when we're scanning, there's someone waiting 537eda14cbcSMatt Macy * on us, or the quiesce thread has handed off a txg to 538eda14cbcSMatt Macy * 
us, or we have reached our timeout. 539eda14cbcSMatt Macy */ 540eda14cbcSMatt Macy timer = (delta >= timeout ? 0 : timeout - delta); 541eda14cbcSMatt Macy while (!dsl_scan_active(dp->dp_scan) && 542eda14cbcSMatt Macy !tx->tx_exiting && timer > 0 && 543eda14cbcSMatt Macy tx->tx_synced_txg >= tx->tx_sync_txg_waiting && 5447cd22ac4SMartin Matuska !txg_has_quiesced_to_sync(dp)) { 545eda14cbcSMatt Macy dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", 54633b8c039SMartin Matuska (u_longlong_t)tx->tx_synced_txg, 54733b8c039SMartin Matuska (u_longlong_t)tx->tx_sync_txg_waiting, dp); 548eda14cbcSMatt Macy txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); 549eda14cbcSMatt Macy delta = ddi_get_lbolt() - start; 550eda14cbcSMatt Macy timer = (delta > timeout ? 0 : timeout - delta); 551eda14cbcSMatt Macy } 552eda14cbcSMatt Macy 553eda14cbcSMatt Macy /* 554*b985c9caSMartin Matuska * When we're suspended, nothing should be changing and for 555*b985c9caSMartin Matuska * MMP we don't want to bump anything that would make it 556*b985c9caSMartin Matuska * harder to detect if another host is changing it when 557*b985c9caSMartin Matuska * resuming after a MMP suspend. 558*b985c9caSMartin Matuska */ 559*b985c9caSMartin Matuska if (spa_suspended(spa)) 560*b985c9caSMartin Matuska continue; 561*b985c9caSMartin Matuska 562*b985c9caSMartin Matuska /* 563eda14cbcSMatt Macy * Wait until the quiesce thread hands off a txg to us, 564eda14cbcSMatt Macy * prompting it to do so if necessary. 
565eda14cbcSMatt Macy */ 566eda14cbcSMatt Macy while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { 5677cd22ac4SMartin Matuska if (txg_is_quiescing(dp)) { 5687cd22ac4SMartin Matuska txg_thread_wait(tx, &cpr, 5697cd22ac4SMartin Matuska &tx->tx_quiesce_done_cv, 0); 5707cd22ac4SMartin Matuska continue; 5717cd22ac4SMartin Matuska } 572eda14cbcSMatt Macy if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) 573eda14cbcSMatt Macy tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; 574eda14cbcSMatt Macy cv_broadcast(&tx->tx_quiesce_more_cv); 575eda14cbcSMatt Macy txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); 576eda14cbcSMatt Macy } 577eda14cbcSMatt Macy 578eda14cbcSMatt Macy if (tx->tx_exiting) 579eda14cbcSMatt Macy txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); 580eda14cbcSMatt Macy 581eda14cbcSMatt Macy /* 582eda14cbcSMatt Macy * Consume the quiesced txg which has been handed off to 583eda14cbcSMatt Macy * us. This may cause the quiescing thread to now be 584eda14cbcSMatt Macy * able to quiesce another txg, so we must signal it. 
585eda14cbcSMatt Macy */ 586eda14cbcSMatt Macy ASSERT(tx->tx_quiesced_txg != 0); 587eda14cbcSMatt Macy txg = tx->tx_quiesced_txg; 588eda14cbcSMatt Macy tx->tx_quiesced_txg = 0; 589eda14cbcSMatt Macy tx->tx_syncing_txg = txg; 590eda14cbcSMatt Macy DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); 591eda14cbcSMatt Macy cv_broadcast(&tx->tx_quiesce_more_cv); 592eda14cbcSMatt Macy 593eda14cbcSMatt Macy dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 59433b8c039SMartin Matuska (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting, 59533b8c039SMartin Matuska (u_longlong_t)tx->tx_sync_txg_waiting); 596eda14cbcSMatt Macy mutex_exit(&tx->tx_sync_lock); 597eda14cbcSMatt Macy 598eda14cbcSMatt Macy txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp); 599eda14cbcSMatt Macy start = ddi_get_lbolt(); 600eda14cbcSMatt Macy spa_sync(spa, txg); 601eda14cbcSMatt Macy delta = ddi_get_lbolt() - start; 602eda14cbcSMatt Macy spa_txg_history_fini_io(spa, ts); 603eda14cbcSMatt Macy 604eda14cbcSMatt Macy mutex_enter(&tx->tx_sync_lock); 605eda14cbcSMatt Macy tx->tx_synced_txg = txg; 606eda14cbcSMatt Macy tx->tx_syncing_txg = 0; 607eda14cbcSMatt Macy DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); 608eda14cbcSMatt Macy cv_broadcast(&tx->tx_sync_done_cv); 609eda14cbcSMatt Macy 610eda14cbcSMatt Macy /* 611eda14cbcSMatt Macy * Dispatch commit callbacks to worker threads. 
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}

/*
 * Thread body that moves the open txg into the "quiescing" and then
 * "quiesced, waiting to sync" states whenever a waiter requests it
 * (tx_quiesce_txg_waiting).  Entered with tx_sync_lock held via
 * txg_thread_enter(); never returns except through txg_thread_exit().
 */
static __attribute__((noreturn)) void
txg_quiesce_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state. So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
		 */
		while (!tx->tx_exiting &&
		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
		    txg_has_quiesced_to_sync(dp)))
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

		txg = tx->tx_open_txg;
		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    (u_longlong_t)txg,
		    (u_longlong_t)tx->tx_quiesce_txg_waiting,
		    (u_longlong_t)tx->tx_sync_txg_waiting);
		tx->tx_quiescing_txg = txg;

		/*
		 * Drop the lock while quiescing so other txg state can
		 * advance; tx_quiescing_txg marks this txg as busy in
		 * the meantime.
		 */
		mutex_exit(&tx->tx_sync_lock);
		txg_quiesce(dp, txg);
		mutex_enter(&tx->tx_sync_lock);

		/*
		 * Hand this txg off to the sync thread.
		 */
		dprintf("quiesce done, handing off txg %llu\n",
		    (u_longlong_t)txg);
		tx->tx_quiescing_txg = 0;
		tx->tx_quiesced_txg = txg;
		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
		/* Wake the sync thread and any txg_wait_open() waiters. */
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_broadcast(&tx->tx_quiesce_done_cv);
	}
}

/*
 * Delay this thread by delay nanoseconds if we are still in the open
 * transaction group and there is already a waiting txg quiescing or quiesced.
 * Abort the delay if this txg stalls or enters the quiescing state.
 */
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
	tx_state_t *tx = &dp->dp_tx;
	hrtime_t start = gethrtime();

	/* don't delay if this txg could transition to quiescing immediately */
	if (tx->tx_open_txg > txg ||
	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
		return;

	/* Re-check under the lock; the state may have moved on. */
	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
		mutex_exit(&tx->tx_sync_lock);
		return;
	}

	/*
	 * Sleep in slices of up to `delay`, waking early if the previous
	 * txg starts syncing or the pipeline stalls.
	 */
	while (gethrtime() - start < delay &&
	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
		    &tx->tx_sync_lock, delay, resolution, 0);
	}

	DMU_TX_STAT_BUMP(dmu_tx_delay);

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Wait until the given txg has been synced (tx_synced_txg >= txg).
 * A txg of 0 means the currently open txg plus TXG_DEFER_SIZE.  When
 * wait_sig is set, the wait may be interrupted by a signal; returns
 * B_TRUE if interrupted, B_FALSE once the txg has synced.
 */
static boolean_t
txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	/* Both the sync and quiesce threads must be running. */
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	if (tx->tx_sync_txg_waiting < txg)
		tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
	    (u_longlong_t)tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
		dprintf("broadcasting sync more "
		    "tx_synced=%llu waiting=%llu dp=%px\n",
		    (u_longlong_t)tx->tx_synced_txg,
		    (u_longlong_t)tx->tx_sync_txg_waiting, dp);
		cv_broadcast(&tx->tx_sync_more_cv);
		if (wait_sig) {
			/*
			 * Condition wait here but stop if the thread receives a
			 * signal. The caller may call txg_wait_synced*() again
			 * to resume waiting for this txg.
			 */
			if (cv_wait_io_sig(&tx->tx_sync_done_cv,
			    &tx->tx_sync_lock) == 0) {
				mutex_exit(&tx->tx_sync_lock);
				return (B_TRUE);
			}
		} else {
			cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
		}
	}
	mutex_exit(&tx->tx_sync_lock);
	return (B_FALSE);
}

/*
 * Uninterruptible wait for the given txg to sync; an interrupted wait
 * (B_TRUE from the impl) would trip the VERIFY.
 */
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
	VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
}

/*
 * Similar to a txg_wait_synced but it can be interrupted from a signal.
 * Returns B_TRUE if the thread was signaled while waiting.
 */
boolean_t
txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
{
	return (txg_wait_synced_impl(dp, txg, B_TRUE));
}

/*
 * Wait for the specified open transaction group. Set should_quiesce
 * when the current open txg should be quiesced immediately.
 * A txg of 0 means "the txg after the currently open one".
 */
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
		txg = tx->tx_open_txg + 1;
	/* Only request quiescing of the current txg when asked to. */
	if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
		tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting,
	    (u_longlong_t)tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
		cv_broadcast(&tx->tx_quiesce_more_cv);
		/*
		 * Callers setting should_quiesce will use cv_wait_io() and
		 * be accounted for as iowait time. Otherwise, the caller is
		 * understood to be idle and cv_wait_sig() is used to prevent
		 * incorrectly inflating the system load average.
		 */
		if (should_quiesce == B_TRUE) {
			cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
		} else {
			cv_wait_idle(&tx->tx_quiesce_done_cv,
			    &tx->tx_sync_lock);
		}
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Pass in the txg number that should be synced.
 * Requests (without waiting) that the sync thread sync out through `txg`.
 */
void
txg_kick(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	/* Unlocked fast path: someone already asked for this txg or later. */
	if (tx->tx_sync_txg_waiting >= txg)
		return;

	mutex_enter(&tx->tx_sync_lock);
	if (tx->tx_sync_txg_waiting < txg) {
		tx->tx_sync_txg_waiting = txg;
		cv_broadcast(&tx->tx_sync_more_cv);
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
 * True when a waiter wants a txg beyond the currently open one quiesced,
 * i.e. the open txg is holding up the pipeline.
 */
boolean_t
txg_stalled(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
}

/*
 * True when there is sync work outstanding: a txg waiting to be synced,
 * or a quiesced txg not yet picked up by the sync thread.
 */
boolean_t
txg_sync_waiting(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
	    tx->tx_quiesced_txg != 0);
}

/*
 * Verify that this txg is active (open, quiescing, syncing). Non-active
 * txg's should not be manipulated.
 */
#ifdef ZFS_DEBUG
void
txg_verify(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp __maybe_unused = spa_get_dsl(spa);
	/* Initial and ZIL-test txgs are exempt from the active-range check. */
	if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
		return;
	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
	ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
}
#endif

/*
 * Per-txg object lists.
 */
void
txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
{
	int t;

	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);

	/* offset of the embedded txg_node_t within each listed object */
	tl->tl_offset = offset;
	tl->tl_spa = spa;

	for (t = 0; t < TXG_SIZE; t++)
		tl->tl_head[t] = NULL;
}

/* Lockless core of txg_list_empty(); caller must hold tl_lock. */
static boolean_t
txg_list_empty_impl(txg_list_t *tl, uint64_t txg)
{
	ASSERT(MUTEX_HELD(&tl->tl_lock));
	TXG_VERIFY(tl->tl_spa, txg);
	return (tl->tl_head[txg & TXG_MASK] == NULL);
}

/* Returns true if the list for the given txg has no entries. */
boolean_t
txg_list_empty(txg_list_t *tl, uint64_t txg)
{
	mutex_enter(&tl->tl_lock);
	boolean_t ret = txg_list_empty_impl(tl, txg);
	mutex_exit(&tl->tl_lock);

	return (ret);
}
884eda14cbcSMatt Macy 885eda14cbcSMatt Macy void 886eda14cbcSMatt Macy txg_list_destroy(txg_list_t *tl) 887eda14cbcSMatt Macy { 888eda14cbcSMatt Macy int t; 889eda14cbcSMatt Macy 890eda14cbcSMatt Macy mutex_enter(&tl->tl_lock); 891eda14cbcSMatt Macy for (t = 0; t < TXG_SIZE; t++) 892eda14cbcSMatt Macy ASSERT(txg_list_empty_impl(tl, t)); 893eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 894eda14cbcSMatt Macy 895eda14cbcSMatt Macy mutex_destroy(&tl->tl_lock); 896eda14cbcSMatt Macy } 897eda14cbcSMatt Macy 898eda14cbcSMatt Macy /* 899eda14cbcSMatt Macy * Returns true if all txg lists are empty. 900eda14cbcSMatt Macy * 901eda14cbcSMatt Macy * Warning: this is inherently racy (an item could be added immediately 902eda14cbcSMatt Macy * after this function returns). 903eda14cbcSMatt Macy */ 904eda14cbcSMatt Macy boolean_t 905eda14cbcSMatt Macy txg_all_lists_empty(txg_list_t *tl) 906eda14cbcSMatt Macy { 9077b5e6873SMartin Matuska boolean_t res = B_TRUE; 9087b5e6873SMartin Matuska for (int i = 0; i < TXG_SIZE; i++) 9097b5e6873SMartin Matuska res &= (tl->tl_head[i] == NULL); 9107b5e6873SMartin Matuska return (res); 911eda14cbcSMatt Macy } 912eda14cbcSMatt Macy 913eda14cbcSMatt Macy /* 914eda14cbcSMatt Macy * Add an entry to the list (unless it's already on the list). 915eda14cbcSMatt Macy * Returns B_TRUE if it was actually added. 
916eda14cbcSMatt Macy */ 917eda14cbcSMatt Macy boolean_t 918eda14cbcSMatt Macy txg_list_add(txg_list_t *tl, void *p, uint64_t txg) 919eda14cbcSMatt Macy { 920eda14cbcSMatt Macy int t = txg & TXG_MASK; 921eda14cbcSMatt Macy txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 922eda14cbcSMatt Macy boolean_t add; 923eda14cbcSMatt Macy 924eda14cbcSMatt Macy TXG_VERIFY(tl->tl_spa, txg); 925eda14cbcSMatt Macy mutex_enter(&tl->tl_lock); 926eda14cbcSMatt Macy add = (tn->tn_member[t] == 0); 927eda14cbcSMatt Macy if (add) { 928eda14cbcSMatt Macy tn->tn_member[t] = 1; 929eda14cbcSMatt Macy tn->tn_next[t] = tl->tl_head[t]; 930eda14cbcSMatt Macy tl->tl_head[t] = tn; 931eda14cbcSMatt Macy } 932eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 933eda14cbcSMatt Macy 934eda14cbcSMatt Macy return (add); 935eda14cbcSMatt Macy } 936eda14cbcSMatt Macy 937eda14cbcSMatt Macy /* 938eda14cbcSMatt Macy * Add an entry to the end of the list, unless it's already on the list. 939eda14cbcSMatt Macy * (walks list to find end) 940eda14cbcSMatt Macy * Returns B_TRUE if it was actually added. 
941eda14cbcSMatt Macy */ 942eda14cbcSMatt Macy boolean_t 943eda14cbcSMatt Macy txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) 944eda14cbcSMatt Macy { 945eda14cbcSMatt Macy int t = txg & TXG_MASK; 946eda14cbcSMatt Macy txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 947eda14cbcSMatt Macy boolean_t add; 948eda14cbcSMatt Macy 949eda14cbcSMatt Macy TXG_VERIFY(tl->tl_spa, txg); 950eda14cbcSMatt Macy mutex_enter(&tl->tl_lock); 951eda14cbcSMatt Macy add = (tn->tn_member[t] == 0); 952eda14cbcSMatt Macy if (add) { 953eda14cbcSMatt Macy txg_node_t **tp; 954eda14cbcSMatt Macy 955eda14cbcSMatt Macy for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) 956eda14cbcSMatt Macy continue; 957eda14cbcSMatt Macy 958eda14cbcSMatt Macy tn->tn_member[t] = 1; 959eda14cbcSMatt Macy tn->tn_next[t] = NULL; 960eda14cbcSMatt Macy *tp = tn; 961eda14cbcSMatt Macy } 962eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 963eda14cbcSMatt Macy 964eda14cbcSMatt Macy return (add); 965eda14cbcSMatt Macy } 966eda14cbcSMatt Macy 967eda14cbcSMatt Macy /* 968eda14cbcSMatt Macy * Remove the head of the list and return it. 
969eda14cbcSMatt Macy */ 970eda14cbcSMatt Macy void * 971eda14cbcSMatt Macy txg_list_remove(txg_list_t *tl, uint64_t txg) 972eda14cbcSMatt Macy { 973eda14cbcSMatt Macy int t = txg & TXG_MASK; 974eda14cbcSMatt Macy txg_node_t *tn; 975eda14cbcSMatt Macy void *p = NULL; 976eda14cbcSMatt Macy 977eda14cbcSMatt Macy TXG_VERIFY(tl->tl_spa, txg); 978eda14cbcSMatt Macy mutex_enter(&tl->tl_lock); 979eda14cbcSMatt Macy if ((tn = tl->tl_head[t]) != NULL) { 980eda14cbcSMatt Macy ASSERT(tn->tn_member[t]); 981eda14cbcSMatt Macy ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); 982eda14cbcSMatt Macy p = (char *)tn - tl->tl_offset; 983eda14cbcSMatt Macy tl->tl_head[t] = tn->tn_next[t]; 984eda14cbcSMatt Macy tn->tn_next[t] = NULL; 985eda14cbcSMatt Macy tn->tn_member[t] = 0; 986eda14cbcSMatt Macy } 987eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 988eda14cbcSMatt Macy 989eda14cbcSMatt Macy return (p); 990eda14cbcSMatt Macy } 991eda14cbcSMatt Macy 992eda14cbcSMatt Macy /* 993eda14cbcSMatt Macy * Remove a specific item from the list and return it. 
994eda14cbcSMatt Macy */ 995eda14cbcSMatt Macy void * 996eda14cbcSMatt Macy txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) 997eda14cbcSMatt Macy { 998eda14cbcSMatt Macy int t = txg & TXG_MASK; 999eda14cbcSMatt Macy txg_node_t *tn, **tp; 1000eda14cbcSMatt Macy 1001eda14cbcSMatt Macy TXG_VERIFY(tl->tl_spa, txg); 1002eda14cbcSMatt Macy mutex_enter(&tl->tl_lock); 1003eda14cbcSMatt Macy 1004eda14cbcSMatt Macy for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { 1005eda14cbcSMatt Macy if ((char *)tn - tl->tl_offset == p) { 1006eda14cbcSMatt Macy *tp = tn->tn_next[t]; 1007eda14cbcSMatt Macy tn->tn_next[t] = NULL; 1008eda14cbcSMatt Macy tn->tn_member[t] = 0; 1009eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 1010eda14cbcSMatt Macy return (p); 1011eda14cbcSMatt Macy } 1012eda14cbcSMatt Macy } 1013eda14cbcSMatt Macy 1014eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 1015eda14cbcSMatt Macy 1016eda14cbcSMatt Macy return (NULL); 1017eda14cbcSMatt Macy } 1018eda14cbcSMatt Macy 1019eda14cbcSMatt Macy boolean_t 1020eda14cbcSMatt Macy txg_list_member(txg_list_t *tl, void *p, uint64_t txg) 1021eda14cbcSMatt Macy { 1022eda14cbcSMatt Macy int t = txg & TXG_MASK; 1023eda14cbcSMatt Macy txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 1024eda14cbcSMatt Macy 1025eda14cbcSMatt Macy TXG_VERIFY(tl->tl_spa, txg); 1026eda14cbcSMatt Macy return (tn->tn_member[t] != 0); 1027eda14cbcSMatt Macy } 1028eda14cbcSMatt Macy 1029eda14cbcSMatt Macy /* 1030eda14cbcSMatt Macy * Walk a txg list 1031eda14cbcSMatt Macy */ 1032eda14cbcSMatt Macy void * 1033eda14cbcSMatt Macy txg_list_head(txg_list_t *tl, uint64_t txg) 1034eda14cbcSMatt Macy { 1035eda14cbcSMatt Macy int t = txg & TXG_MASK; 1036eda14cbcSMatt Macy txg_node_t *tn; 1037eda14cbcSMatt Macy 1038eda14cbcSMatt Macy mutex_enter(&tl->tl_lock); 1039eda14cbcSMatt Macy tn = tl->tl_head[t]; 1040eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 1041eda14cbcSMatt Macy 1042eda14cbcSMatt Macy TXG_VERIFY(tl->tl_spa, txg); 
1043eda14cbcSMatt Macy return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 1044eda14cbcSMatt Macy } 1045eda14cbcSMatt Macy 1046eda14cbcSMatt Macy void * 1047eda14cbcSMatt Macy txg_list_next(txg_list_t *tl, void *p, uint64_t txg) 1048eda14cbcSMatt Macy { 1049eda14cbcSMatt Macy int t = txg & TXG_MASK; 1050eda14cbcSMatt Macy txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 1051eda14cbcSMatt Macy 1052eda14cbcSMatt Macy TXG_VERIFY(tl->tl_spa, txg); 1053eda14cbcSMatt Macy 1054eda14cbcSMatt Macy mutex_enter(&tl->tl_lock); 1055eda14cbcSMatt Macy tn = tn->tn_next[t]; 1056eda14cbcSMatt Macy mutex_exit(&tl->tl_lock); 1057eda14cbcSMatt Macy 1058eda14cbcSMatt Macy return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 1059eda14cbcSMatt Macy } 1060eda14cbcSMatt Macy 1061eda14cbcSMatt Macy EXPORT_SYMBOL(txg_init); 1062eda14cbcSMatt Macy EXPORT_SYMBOL(txg_fini); 1063eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_start); 1064eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_stop); 1065eda14cbcSMatt Macy EXPORT_SYMBOL(txg_hold_open); 1066eda14cbcSMatt Macy EXPORT_SYMBOL(txg_rele_to_quiesce); 1067eda14cbcSMatt Macy EXPORT_SYMBOL(txg_rele_to_sync); 1068eda14cbcSMatt Macy EXPORT_SYMBOL(txg_register_callbacks); 1069eda14cbcSMatt Macy EXPORT_SYMBOL(txg_delay); 1070eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_synced); 1071eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_open); 1072eda14cbcSMatt Macy EXPORT_SYMBOL(txg_wait_callbacks); 1073eda14cbcSMatt Macy EXPORT_SYMBOL(txg_stalled); 1074eda14cbcSMatt Macy EXPORT_SYMBOL(txg_sync_waiting); 1075eda14cbcSMatt Macy 1076be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, UINT, ZMOD_RW, 1077eda14cbcSMatt Macy "Max seconds worth of delta per txg"); 1078