/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright 2011 Martin Matuska
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>

/*
 * ZFS Transaction Groups
 * ----------------------
 *
 * ZFS transaction groups are, as the name implies, groups of transactions
 * that act on persistent state. ZFS asserts consistency at the granularity of
 * these transaction groups. Each successive transaction group (txg) is
 * assigned a 64-bit consecutive identifier. There are three active
 * transaction group states: open, quiescing, or syncing. At any given time,
 * there may be an active txg associated with each state; each active txg may
 * either be processing, or blocked waiting to enter the next state. There may
 * be up to three active txgs, and there is always a txg in the open state
 * (though it may be blocked waiting to enter the quiescing state). In broad
 * strokes, transactions -- operations that change in-memory structures -- are
 * accepted into the txg in the open state, and are completed while the txg is
 * in the open or quiescing states. The accumulated changes are written to
 * disk in the syncing state.
 *
 * Open
 *
 * When a new txg becomes active, it first enters the open state. New
 * transactions -- updates to in-memory structures -- are assigned to the
 * currently open txg. There is always a txg in the open state so that ZFS can
 * accept new changes (though the txg may refuse new changes if it has hit
 * some limit). ZFS advances the open txg to the next state for a variety of
 * reasons such as it hitting a time or size threshold, or the execution of an
 * administrative action that must be completed in the syncing state.
 *
 * Quiescing
 *
 * After a txg exits the open state, it enters the quiescing state. The
 * quiescing state is intended to provide a buffer between accepting new
 * transactions in the open state and writing them out to stable storage in
 * the syncing state. While quiescing, transactions can continue their
 * operation without delaying either of the other states. Typically, a txg is
 * in the quiescing state very briefly since the operations are bounded by
 * software latencies rather than, say, slower I/O latencies. After all
 * transactions complete, the txg is ready to enter the next state.
 *
 * Syncing
 *
 * In the syncing state, the in-memory state built up during the open and (to
 * a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn, modify more data. For
 * example, when we write new blocks, we need to allocate space for them; those
 * allocations modify metadata (space maps)... which themselves must be
 * written to stable storage. During the sync state, ZFS iterates, writing out
 * data until it converges and all in-memory changes have been written out.
 * The first such pass is the largest as it encompasses all the modified user
 * data (as opposed to filesystem metadata). Subsequent passes typically have
 * far less data to write as they consist exclusively of filesystem metadata.
 *
 * To ensure convergence, after a certain number of passes ZFS begins
 * overwriting locations on stable storage that had been allocated earlier in
 * the syncing state (and subsequently freed). ZFS usually allocates new
 * blocks to optimize for large, continuous, writes. For the syncing state to
 * converge however it must complete a pass where no new blocks are allocated
 * since each allocation requires a modification of persistent metadata.
 * Further, to hasten convergence, after a prescribed number of passes, ZFS
 * also defers frees, and stops compressing.
 *
 * In addition to writing out user data, we must also execute synctasks during
 * the syncing context. A synctask is the mechanism by which some
 * administrative activities work such as creating and destroying snapshots or
 * datasets. Note that when a synctask is initiated it enters the open txg,
 * and ZFS then pushes that txg as quickly as possible to completion of the
 * syncing state in order to reduce the latency of the administrative
 * activity. To complete the syncing state, ZFS writes out a new uberblock,
Leventhal * the root of the tree of blocks that comprise all state stored on the ZFS 104adbbcffaSAdam H. Leventhal * pool. Finally, if there is a quiesced txg waiting, we signal that it can 105adbbcffaSAdam H. Leventhal * now transition to the syncing state. 106fa9e4066Sahrens */ 107fa9e4066Sahrens 108fa9e4066Sahrens static void txg_sync_thread(dsl_pool_t *dp); 109fa9e4066Sahrens static void txg_quiesce_thread(dsl_pool_t *dp); 110fa9e4066Sahrens 11144ecc532SGeorge Wilson int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ 112fa9e4066Sahrens 113fa9e4066Sahrens /* 114fa9e4066Sahrens * Prepare the txg subsystem. 115fa9e4066Sahrens */ 116fa9e4066Sahrens void 117fa9e4066Sahrens txg_init(dsl_pool_t *dp, uint64_t txg) 118fa9e4066Sahrens { 119fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 1205ad82045Snd150628 int c; 121fa9e4066Sahrens bzero(tx, sizeof (tx_state_t)); 122fa9e4066Sahrens 123fa9e4066Sahrens tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); 124fa9e4066Sahrens 1258f38d419Sek110237 for (c = 0; c < max_ncpus; c++) { 1268f38d419Sek110237 int i; 1278f38d419Sek110237 1285ad82045Snd150628 mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); 1294a923759SGeorge Wilson mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, 1304a923759SGeorge Wilson NULL); 1318f38d419Sek110237 for (i = 0; i < TXG_SIZE; i++) { 1328f38d419Sek110237 cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, 1338f38d419Sek110237 NULL); 134d20e665cSRicardo M. Correia list_create(&tx->tx_cpu[c].tc_callbacks[i], 135d20e665cSRicardo M. Correia sizeof (dmu_tx_callback_t), 136d20e665cSRicardo M. Correia offsetof(dmu_tx_callback_t, dcb_node)); 1378f38d419Sek110237 } 1388f38d419Sek110237 } 1395ad82045Snd150628 1405ad82045Snd150628 mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); 141fa9e4066Sahrens 142b5e70f97SRicardo M. Correia cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); 143b5e70f97SRicardo M. 
Correia cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); 144b5e70f97SRicardo M. Correia cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); 145b5e70f97SRicardo M. Correia cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); 146b5e70f97SRicardo M. Correia cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); 147b5e70f97SRicardo M. Correia 148fa9e4066Sahrens tx->tx_open_txg = txg; 149fa9e4066Sahrens } 150fa9e4066Sahrens 151fa9e4066Sahrens /* 152fa9e4066Sahrens * Close down the txg subsystem. 153fa9e4066Sahrens */ 154fa9e4066Sahrens void 155fa9e4066Sahrens txg_fini(dsl_pool_t *dp) 156fa9e4066Sahrens { 157fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 1585ad82045Snd150628 int c; 159fa9e4066Sahrens 160fa9e4066Sahrens ASSERT(tx->tx_threads == 0); 161fa9e4066Sahrens 1625ad82045Snd150628 mutex_destroy(&tx->tx_sync_lock); 1635ad82045Snd150628 164b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_sync_more_cv); 165b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_sync_done_cv); 166b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_quiesce_more_cv); 167b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_quiesce_done_cv); 168b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_exit_cv); 169b5e70f97SRicardo M. Correia 1708f38d419Sek110237 for (c = 0; c < max_ncpus; c++) { 1718f38d419Sek110237 int i; 1728f38d419Sek110237 1734a923759SGeorge Wilson mutex_destroy(&tx->tx_cpu[c].tc_open_lock); 1745ad82045Snd150628 mutex_destroy(&tx->tx_cpu[c].tc_lock); 175d20e665cSRicardo M. Correia for (i = 0; i < TXG_SIZE; i++) { 1768f38d419Sek110237 cv_destroy(&tx->tx_cpu[c].tc_cv[i]); 177d20e665cSRicardo M. Correia list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); 1788f38d419Sek110237 } 179d20e665cSRicardo M. Correia } 180d20e665cSRicardo M. Correia 181d20e665cSRicardo M. Correia if (tx->tx_commit_cb_taskq != NULL) 182d20e665cSRicardo M. 
Correia taskq_destroy(tx->tx_commit_cb_taskq); 183fa9e4066Sahrens 184fa9e4066Sahrens kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); 185fa9e4066Sahrens 186fa9e4066Sahrens bzero(tx, sizeof (tx_state_t)); 187fa9e4066Sahrens } 188fa9e4066Sahrens 189fa9e4066Sahrens /* 190fa9e4066Sahrens * Start syncing transaction groups. 191fa9e4066Sahrens */ 192fa9e4066Sahrens void 193fa9e4066Sahrens txg_sync_start(dsl_pool_t *dp) 194fa9e4066Sahrens { 195fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 196fa9e4066Sahrens 197fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 198fa9e4066Sahrens 199fa9e4066Sahrens dprintf("pool %p\n", dp); 200fa9e4066Sahrens 201fa9e4066Sahrens ASSERT(tx->tx_threads == 0); 202fa9e4066Sahrens 2031ab7f2deSmaybee tx->tx_threads = 2; 204fa9e4066Sahrens 205fa9e4066Sahrens tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, 206fa9e4066Sahrens dp, 0, &p0, TS_RUN, minclsyspri); 207fa9e4066Sahrens 208088f3894Sahrens /* 209088f3894Sahrens * The sync thread can need a larger-than-default stack size on 210088f3894Sahrens * 32-bit x86. This is due in part to nested pools and 211088f3894Sahrens * scrub_visitbp() recursion. 
212088f3894Sahrens */ 2133f9d6ad7SLin Ling tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, 214fa9e4066Sahrens dp, 0, &p0, TS_RUN, minclsyspri); 215fa9e4066Sahrens 216fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 217fa9e4066Sahrens } 218fa9e4066Sahrens 219fa9e4066Sahrens static void 220fa9e4066Sahrens txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) 221fa9e4066Sahrens { 222fa9e4066Sahrens CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); 223fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 224fa9e4066Sahrens } 225fa9e4066Sahrens 226fa9e4066Sahrens static void 227fa9e4066Sahrens txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) 228fa9e4066Sahrens { 229fa9e4066Sahrens ASSERT(*tpp != NULL); 230fa9e4066Sahrens *tpp = NULL; 231fa9e4066Sahrens tx->tx_threads--; 232fa9e4066Sahrens cv_broadcast(&tx->tx_exit_cv); 233fa9e4066Sahrens CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ 234fa9e4066Sahrens thread_exit(); 235fa9e4066Sahrens } 236fa9e4066Sahrens 237fa9e4066Sahrens static void 2380689f76cSAdam Leventhal txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) 239fa9e4066Sahrens { 240fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(cpr); 241fa9e4066Sahrens 2421ab7f2deSmaybee if (time) 243d3d50737SRafael Vanoni (void) cv_timedwait(cv, &tx->tx_sync_lock, 244d3d50737SRafael Vanoni ddi_get_lbolt() + time); 245fa9e4066Sahrens else 246fa9e4066Sahrens cv_wait(cv, &tx->tx_sync_lock); 247fa9e4066Sahrens 248fa9e4066Sahrens CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); 249fa9e4066Sahrens } 250fa9e4066Sahrens 251fa9e4066Sahrens /* 252fa9e4066Sahrens * Stop syncing transaction groups. 253fa9e4066Sahrens */ 254fa9e4066Sahrens void 255fa9e4066Sahrens txg_sync_stop(dsl_pool_t *dp) 256fa9e4066Sahrens { 257fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 258fa9e4066Sahrens 259fa9e4066Sahrens dprintf("pool %p\n", dp); 260fa9e4066Sahrens /* 261fa9e4066Sahrens * Finish off any work in progress. 
262fa9e4066Sahrens */ 2631ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 264468c413aSTim Haley 265468c413aSTim Haley /* 266468c413aSTim Haley * We need to ensure that we've vacated the deferred space_maps. 267468c413aSTim Haley */ 268468c413aSTim Haley txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); 269fa9e4066Sahrens 270fa9e4066Sahrens /* 2711ab7f2deSmaybee * Wake all sync threads and wait for them to die. 272fa9e4066Sahrens */ 273fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 274fa9e4066Sahrens 2751ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 276fa9e4066Sahrens 277fa9e4066Sahrens tx->tx_exiting = 1; 278fa9e4066Sahrens 279fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 280fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_done_cv); 281fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 282fa9e4066Sahrens 283fa9e4066Sahrens while (tx->tx_threads != 0) 284fa9e4066Sahrens cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); 285fa9e4066Sahrens 286fa9e4066Sahrens tx->tx_exiting = 0; 287fa9e4066Sahrens 288fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 289fa9e4066Sahrens } 290fa9e4066Sahrens 291fa9e4066Sahrens uint64_t 292fa9e4066Sahrens txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) 293fa9e4066Sahrens { 294fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 295fa9e4066Sahrens tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; 296fa9e4066Sahrens uint64_t txg; 297fa9e4066Sahrens 2984a923759SGeorge Wilson mutex_enter(&tc->tc_open_lock); 299fa9e4066Sahrens txg = tx->tx_open_txg; 3004a923759SGeorge Wilson 3014a923759SGeorge Wilson mutex_enter(&tc->tc_lock); 302fa9e4066Sahrens tc->tc_count[txg & TXG_MASK]++; 3034a923759SGeorge Wilson mutex_exit(&tc->tc_lock); 304fa9e4066Sahrens 305fa9e4066Sahrens th->th_cpu = tc; 306fa9e4066Sahrens th->th_txg = txg; 307fa9e4066Sahrens 308fa9e4066Sahrens return (txg); 309fa9e4066Sahrens } 310fa9e4066Sahrens 311fa9e4066Sahrens void 312fa9e4066Sahrens txg_rele_to_quiesce(txg_handle_t *th) 313fa9e4066Sahrens { 314fa9e4066Sahrens tx_cpu_t *tc = th->th_cpu; 
315fa9e4066Sahrens 3164a923759SGeorge Wilson ASSERT(!MUTEX_HELD(&tc->tc_lock)); 3174a923759SGeorge Wilson mutex_exit(&tc->tc_open_lock); 318fa9e4066Sahrens } 319fa9e4066Sahrens 320fa9e4066Sahrens void 321d20e665cSRicardo M. Correia txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) 322d20e665cSRicardo M. Correia { 323d20e665cSRicardo M. Correia tx_cpu_t *tc = th->th_cpu; 324d20e665cSRicardo M. Correia int g = th->th_txg & TXG_MASK; 325d20e665cSRicardo M. Correia 326d20e665cSRicardo M. Correia mutex_enter(&tc->tc_lock); 327d20e665cSRicardo M. Correia list_move_tail(&tc->tc_callbacks[g], tx_callbacks); 328d20e665cSRicardo M. Correia mutex_exit(&tc->tc_lock); 329d20e665cSRicardo M. Correia } 330d20e665cSRicardo M. Correia 331d20e665cSRicardo M. Correia void 332fa9e4066Sahrens txg_rele_to_sync(txg_handle_t *th) 333fa9e4066Sahrens { 334fa9e4066Sahrens tx_cpu_t *tc = th->th_cpu; 335fa9e4066Sahrens int g = th->th_txg & TXG_MASK; 336fa9e4066Sahrens 337fa9e4066Sahrens mutex_enter(&tc->tc_lock); 338fa9e4066Sahrens ASSERT(tc->tc_count[g] != 0); 339fa9e4066Sahrens if (--tc->tc_count[g] == 0) 340fa9e4066Sahrens cv_broadcast(&tc->tc_cv[g]); 341fa9e4066Sahrens mutex_exit(&tc->tc_lock); 342fa9e4066Sahrens 343fa9e4066Sahrens th->th_cpu = NULL; /* defensive */ 344fa9e4066Sahrens } 345fa9e4066Sahrens 3463e30c24aSWill Andrews /* 3473e30c24aSWill Andrews * Blocks until all transactions in the group are committed. 3483e30c24aSWill Andrews * 3493e30c24aSWill Andrews * On return, the transaction group has reached a stable state in which it can 3503e30c24aSWill Andrews * then be passed off to the syncing context. 
3513e30c24aSWill Andrews */ 352fa9e4066Sahrens static void 353fa9e4066Sahrens txg_quiesce(dsl_pool_t *dp, uint64_t txg) 354fa9e4066Sahrens { 355fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 356fa9e4066Sahrens int g = txg & TXG_MASK; 357fa9e4066Sahrens int c; 358fa9e4066Sahrens 359fa9e4066Sahrens /* 3604a923759SGeorge Wilson * Grab all tc_open_locks so nobody else can get into this txg. 361fa9e4066Sahrens */ 362fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 3634a923759SGeorge Wilson mutex_enter(&tx->tx_cpu[c].tc_open_lock); 364fa9e4066Sahrens 365fa9e4066Sahrens ASSERT(txg == tx->tx_open_txg); 366fa9e4066Sahrens tx->tx_open_txg++; 36769962b56SMatthew Ahrens tx->tx_open_time = gethrtime(); 368fa9e4066Sahrens 3690689f76cSAdam Leventhal DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); 3700689f76cSAdam Leventhal DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); 3710689f76cSAdam Leventhal 372fa9e4066Sahrens /* 373fa9e4066Sahrens * Now that we've incremented tx_open_txg, we can let threads 374fa9e4066Sahrens * enter the next transaction group. 375fa9e4066Sahrens */ 376fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 3774a923759SGeorge Wilson mutex_exit(&tx->tx_cpu[c].tc_open_lock); 378fa9e4066Sahrens 379fa9e4066Sahrens /* 380fa9e4066Sahrens * Quiesce the transaction group by waiting for everyone to txg_exit(). 381fa9e4066Sahrens */ 382fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) { 383fa9e4066Sahrens tx_cpu_t *tc = &tx->tx_cpu[c]; 384fa9e4066Sahrens mutex_enter(&tc->tc_lock); 385fa9e4066Sahrens while (tc->tc_count[g] != 0) 386fa9e4066Sahrens cv_wait(&tc->tc_cv[g], &tc->tc_lock); 387fa9e4066Sahrens mutex_exit(&tc->tc_lock); 388fa9e4066Sahrens } 389fa9e4066Sahrens } 390fa9e4066Sahrens 391fa9e4066Sahrens static void 392d20e665cSRicardo M. Correia txg_do_callbacks(list_t *cb_list) 393d20e665cSRicardo M. Correia { 394d20e665cSRicardo M. Correia dmu_tx_do_callbacks(cb_list, 0); 395d20e665cSRicardo M. Correia 396d20e665cSRicardo M. 
Correia list_destroy(cb_list); 397d20e665cSRicardo M. Correia 398d20e665cSRicardo M. Correia kmem_free(cb_list, sizeof (list_t)); 399d20e665cSRicardo M. Correia } 400d20e665cSRicardo M. Correia 401d20e665cSRicardo M. Correia /* 402d20e665cSRicardo M. Correia * Dispatch the commit callbacks registered on this txg to worker threads. 4033e30c24aSWill Andrews * 4043e30c24aSWill Andrews * If no callbacks are registered for a given TXG, nothing happens. 4053e30c24aSWill Andrews * This function creates a taskq for the associated pool, if needed. 406d20e665cSRicardo M. Correia */ 407d20e665cSRicardo M. Correia static void 408d20e665cSRicardo M. Correia txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) 409d20e665cSRicardo M. Correia { 410d20e665cSRicardo M. Correia int c; 411d20e665cSRicardo M. Correia tx_state_t *tx = &dp->dp_tx; 412d20e665cSRicardo M. Correia list_t *cb_list; 413d20e665cSRicardo M. Correia 414d20e665cSRicardo M. Correia for (c = 0; c < max_ncpus; c++) { 415d20e665cSRicardo M. Correia tx_cpu_t *tc = &tx->tx_cpu[c]; 4163e30c24aSWill Andrews /* 4173e30c24aSWill Andrews * No need to lock tx_cpu_t at this point, since this can 4183e30c24aSWill Andrews * only be called once a txg has been synced. 4193e30c24aSWill Andrews */ 420d20e665cSRicardo M. Correia 421d20e665cSRicardo M. Correia int g = txg & TXG_MASK; 422d20e665cSRicardo M. Correia 423d20e665cSRicardo M. Correia if (list_is_empty(&tc->tc_callbacks[g])) 424d20e665cSRicardo M. Correia continue; 425d20e665cSRicardo M. Correia 426d20e665cSRicardo M. Correia if (tx->tx_commit_cb_taskq == NULL) { 427d20e665cSRicardo M. Correia /* 428d20e665cSRicardo M. Correia * Commit callback taskq hasn't been created yet. 429d20e665cSRicardo M. Correia */ 430d20e665cSRicardo M. Correia tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", 431d20e665cSRicardo M. Correia max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, 432d20e665cSRicardo M. Correia TASKQ_PREPOPULATE); 433d20e665cSRicardo M. 
Correia } 434d20e665cSRicardo M. Correia 435d20e665cSRicardo M. Correia cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 436d20e665cSRicardo M. Correia list_create(cb_list, sizeof (dmu_tx_callback_t), 437d20e665cSRicardo M. Correia offsetof(dmu_tx_callback_t, dcb_node)); 438d20e665cSRicardo M. Correia 439b3d9f2e2SWill Andrews list_move_tail(cb_list, &tc->tc_callbacks[g]); 440d20e665cSRicardo M. Correia 441d20e665cSRicardo M. Correia (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) 442d20e665cSRicardo M. Correia txg_do_callbacks, cb_list, TQ_SLEEP); 443d20e665cSRicardo M. Correia } 444d20e665cSRicardo M. Correia } 445d20e665cSRicardo M. Correia 446d20e665cSRicardo M. Correia static void 447fa9e4066Sahrens txg_sync_thread(dsl_pool_t *dp) 448fa9e4066Sahrens { 449b16da2e2SGeorge Wilson spa_t *spa = dp->dp_spa; 450fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 451fa9e4066Sahrens callb_cpr_t cpr; 45205715f94SMark Maybee uint64_t start, delta; 453fa9e4066Sahrens 454fa9e4066Sahrens txg_thread_enter(tx, &cpr); 455fa9e4066Sahrens 4561ab7f2deSmaybee start = delta = 0; 457fa9e4066Sahrens for (;;) { 45869962b56SMatthew Ahrens uint64_t timeout = zfs_txg_timeout * hz; 45969962b56SMatthew Ahrens uint64_t timer; 46005715f94SMark Maybee uint64_t txg; 461fa9e4066Sahrens 462fa9e4066Sahrens /* 4633f9d6ad7SLin Ling * We sync when we're scanning, there's someone waiting 46488b7b0f2SMatthew Ahrens * on us, or the quiesce thread has handed off a txg to 46588b7b0f2SMatthew Ahrens * us, or we have reached our timeout. 466fa9e4066Sahrens */ 4671ab7f2deSmaybee timer = (delta >= timeout ? 
0 : timeout - delta); 468cde58dbcSMatthew Ahrens while (!dsl_scan_active(dp->dp_scan) && 46988b7b0f2SMatthew Ahrens !tx->tx_exiting && timer > 0 && 470fa9e4066Sahrens tx->tx_synced_txg >= tx->tx_sync_txg_waiting && 47169962b56SMatthew Ahrens tx->tx_quiesced_txg == 0 && 47269962b56SMatthew Ahrens dp->dp_dirty_total < zfs_dirty_data_sync) { 473fa9e4066Sahrens dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", 474fa9e4066Sahrens tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); 4751ab7f2deSmaybee txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); 476d3d50737SRafael Vanoni delta = ddi_get_lbolt() - start; 4771ab7f2deSmaybee timer = (delta > timeout ? 0 : timeout - delta); 478fa9e4066Sahrens } 479fa9e4066Sahrens 480fa9e4066Sahrens /* 481fa9e4066Sahrens * Wait until the quiesce thread hands off a txg to us, 482fa9e4066Sahrens * prompting it to do so if necessary. 483fa9e4066Sahrens */ 484fa9e4066Sahrens while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { 485fa9e4066Sahrens if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) 486fa9e4066Sahrens tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; 487fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 488fa9e4066Sahrens txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); 489fa9e4066Sahrens } 490fa9e4066Sahrens 491fa9e4066Sahrens if (tx->tx_exiting) 492fa9e4066Sahrens txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); 493fa9e4066Sahrens 494fa9e4066Sahrens /* 495fa9e4066Sahrens * Consume the quiesced txg which has been handed off to 496fa9e4066Sahrens * us. This may cause the quiescing thread to now be 497fa9e4066Sahrens * able to quiesce another txg, so we must signal it. 
498fa9e4066Sahrens */ 499fa9e4066Sahrens txg = tx->tx_quiesced_txg; 500fa9e4066Sahrens tx->tx_quiesced_txg = 0; 501fa9e4066Sahrens tx->tx_syncing_txg = txg; 5020689f76cSAdam Leventhal DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); 503fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 504fa9e4066Sahrens 505fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 5068f38d419Sek110237 txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 507fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 50805715f94SMark Maybee 509d3d50737SRafael Vanoni start = ddi_get_lbolt(); 510b16da2e2SGeorge Wilson spa_sync(spa, txg); 511d3d50737SRafael Vanoni delta = ddi_get_lbolt() - start; 5121ab7f2deSmaybee 513fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 514fa9e4066Sahrens tx->tx_synced_txg = txg; 515fa9e4066Sahrens tx->tx_syncing_txg = 0; 5160689f76cSAdam Leventhal DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); 517fa9e4066Sahrens cv_broadcast(&tx->tx_sync_done_cv); 518d20e665cSRicardo M. Correia 519d20e665cSRicardo M. Correia /* 520d20e665cSRicardo M. Correia * Dispatch commit callbacks to worker threads. 521d20e665cSRicardo M. Correia */ 522d20e665cSRicardo M. Correia txg_dispatch_callbacks(dp, txg); 523fa9e4066Sahrens } 524fa9e4066Sahrens } 525fa9e4066Sahrens 526fa9e4066Sahrens static void 527fa9e4066Sahrens txg_quiesce_thread(dsl_pool_t *dp) 528fa9e4066Sahrens { 529fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 530fa9e4066Sahrens callb_cpr_t cpr; 531fa9e4066Sahrens 532fa9e4066Sahrens txg_thread_enter(tx, &cpr); 533fa9e4066Sahrens 534fa9e4066Sahrens for (;;) { 535fa9e4066Sahrens uint64_t txg; 536fa9e4066Sahrens 537fa9e4066Sahrens /* 538fa9e4066Sahrens * We quiesce when there's someone waiting on us. 539fa9e4066Sahrens * However, we can only have one txg in "quiescing" or 540fa9e4066Sahrens * "quiesced, waiting to sync" state. 
So we wait until 541fa9e4066Sahrens * the "quiesced, waiting to sync" txg has been consumed 542fa9e4066Sahrens * by the sync thread. 543fa9e4066Sahrens */ 544fa9e4066Sahrens while (!tx->tx_exiting && 545fa9e4066Sahrens (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || 546fa9e4066Sahrens tx->tx_quiesced_txg != 0)) 547fa9e4066Sahrens txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); 548fa9e4066Sahrens 549fa9e4066Sahrens if (tx->tx_exiting) 550fa9e4066Sahrens txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); 551fa9e4066Sahrens 552fa9e4066Sahrens txg = tx->tx_open_txg; 553fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 554fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, 555fa9e4066Sahrens tx->tx_sync_txg_waiting); 556fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 557fa9e4066Sahrens txg_quiesce(dp, txg); 558fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 559fa9e4066Sahrens 560fa9e4066Sahrens /* 561fa9e4066Sahrens * Hand this txg off to the sync thread. 562fa9e4066Sahrens */ 563fa9e4066Sahrens dprintf("quiesce done, handing off txg %llu\n", txg); 564fa9e4066Sahrens tx->tx_quiesced_txg = txg; 5650689f76cSAdam Leventhal DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); 566fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 567fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_done_cv); 568fa9e4066Sahrens } 569fa9e4066Sahrens } 570fa9e4066Sahrens 5711ab7f2deSmaybee /* 5720689f76cSAdam Leventhal * Delay this thread by delay nanoseconds if we are still in the open 573f7170741SWill Andrews * transaction group and there is already a waiting txg quiescing or quiesced. 574f7170741SWill Andrews * Abort the delay if this txg stalls or enters the quiescing state. 
5751ab7f2deSmaybee */ 5761ab7f2deSmaybee void 5770689f76cSAdam Leventhal txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) 5781ab7f2deSmaybee { 5791ab7f2deSmaybee tx_state_t *tx = &dp->dp_tx; 5800689f76cSAdam Leventhal hrtime_t start = gethrtime(); 5811ab7f2deSmaybee 582f7170741SWill Andrews /* don't delay if this txg could transition to quiescing immediately */ 5831ab7f2deSmaybee if (tx->tx_open_txg > txg || 5841ab7f2deSmaybee tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) 5851ab7f2deSmaybee return; 5861ab7f2deSmaybee 5871ab7f2deSmaybee mutex_enter(&tx->tx_sync_lock); 5881ab7f2deSmaybee if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { 5891ab7f2deSmaybee mutex_exit(&tx->tx_sync_lock); 5901ab7f2deSmaybee return; 5911ab7f2deSmaybee } 5921ab7f2deSmaybee 5930689f76cSAdam Leventhal while (gethrtime() - start < delay && 5940689f76cSAdam Leventhal tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { 5950689f76cSAdam Leventhal (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, 5960689f76cSAdam Leventhal &tx->tx_sync_lock, delay, resolution, 0); 5970689f76cSAdam Leventhal } 5981ab7f2deSmaybee 5991ab7f2deSmaybee mutex_exit(&tx->tx_sync_lock); 6001ab7f2deSmaybee } 6011ab7f2deSmaybee 602fa9e4066Sahrens void 603fa9e4066Sahrens txg_wait_synced(dsl_pool_t *dp, uint64_t txg) 604fa9e4066Sahrens { 605fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 606fa9e4066Sahrens 6073b2aab18SMatthew Ahrens ASSERT(!dsl_pool_config_held(dp)); 6083b2aab18SMatthew Ahrens 609fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 6101ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 611fa9e4066Sahrens if (txg == 0) 612b24ab676SJeff Bonwick txg = tx->tx_open_txg + TXG_DEFER_SIZE; 613fa9e4066Sahrens if (tx->tx_sync_txg_waiting < txg) 614fa9e4066Sahrens tx->tx_sync_txg_waiting = txg; 615fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 616fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 617fa9e4066Sahrens while (tx->tx_synced_txg < 
txg) { 618fa9e4066Sahrens dprintf("broadcasting sync more " 619fa9e4066Sahrens "tx_synced=%llu waiting=%llu dp=%p\n", 620fa9e4066Sahrens tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); 621fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 622fa9e4066Sahrens cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); 623fa9e4066Sahrens } 624fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 625fa9e4066Sahrens } 626fa9e4066Sahrens 627fa9e4066Sahrens void 628fa9e4066Sahrens txg_wait_open(dsl_pool_t *dp, uint64_t txg) 629fa9e4066Sahrens { 630fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 631fa9e4066Sahrens 6323b2aab18SMatthew Ahrens ASSERT(!dsl_pool_config_held(dp)); 6333b2aab18SMatthew Ahrens 634fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 6351ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 636fa9e4066Sahrens if (txg == 0) 637fa9e4066Sahrens txg = tx->tx_open_txg + 1; 638fa9e4066Sahrens if (tx->tx_quiesce_txg_waiting < txg) 639fa9e4066Sahrens tx->tx_quiesce_txg_waiting = txg; 640fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 641fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 642fa9e4066Sahrens while (tx->tx_open_txg < txg) { 643fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 644fa9e4066Sahrens cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); 645fa9e4066Sahrens } 646fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 647fa9e4066Sahrens } 648fa9e4066Sahrens 64969962b56SMatthew Ahrens /* 65069962b56SMatthew Ahrens * If there isn't a txg syncing or in the pipeline, push another txg through 65169962b56SMatthew Ahrens * the pipeline by queiscing the open txg. 
65269962b56SMatthew Ahrens */ 65369962b56SMatthew Ahrens void 65469962b56SMatthew Ahrens txg_kick(dsl_pool_t *dp) 65569962b56SMatthew Ahrens { 65669962b56SMatthew Ahrens tx_state_t *tx = &dp->dp_tx; 65769962b56SMatthew Ahrens 65869962b56SMatthew Ahrens ASSERT(!dsl_pool_config_held(dp)); 65969962b56SMatthew Ahrens 66069962b56SMatthew Ahrens mutex_enter(&tx->tx_sync_lock); 66169962b56SMatthew Ahrens if (tx->tx_syncing_txg == 0 && 66269962b56SMatthew Ahrens tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && 66369962b56SMatthew Ahrens tx->tx_sync_txg_waiting <= tx->tx_synced_txg && 66469962b56SMatthew Ahrens tx->tx_quiesced_txg <= tx->tx_synced_txg) { 66569962b56SMatthew Ahrens tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; 66669962b56SMatthew Ahrens cv_broadcast(&tx->tx_quiesce_more_cv); 66769962b56SMatthew Ahrens } 66869962b56SMatthew Ahrens mutex_exit(&tx->tx_sync_lock); 66969962b56SMatthew Ahrens } 67069962b56SMatthew Ahrens 671088f3894Sahrens boolean_t 672fa9e4066Sahrens txg_stalled(dsl_pool_t *dp) 673fa9e4066Sahrens { 674fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 675fa9e4066Sahrens return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); 676fa9e4066Sahrens } 677fa9e4066Sahrens 678088f3894Sahrens boolean_t 679088f3894Sahrens txg_sync_waiting(dsl_pool_t *dp) 680088f3894Sahrens { 681088f3894Sahrens tx_state_t *tx = &dp->dp_tx; 682088f3894Sahrens 683088f3894Sahrens return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || 684088f3894Sahrens tx->tx_quiesced_txg != 0); 685088f3894Sahrens } 686088f3894Sahrens 687fa9e4066Sahrens /* 688fa9e4066Sahrens * Per-txg object lists. 
689fa9e4066Sahrens */ 690fa9e4066Sahrens void 691fa9e4066Sahrens txg_list_create(txg_list_t *tl, size_t offset) 692fa9e4066Sahrens { 693fa9e4066Sahrens int t; 694fa9e4066Sahrens 695fa9e4066Sahrens mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); 696fa9e4066Sahrens 697fa9e4066Sahrens tl->tl_offset = offset; 698fa9e4066Sahrens 699fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 700fa9e4066Sahrens tl->tl_head[t] = NULL; 701fa9e4066Sahrens } 702fa9e4066Sahrens 703fa9e4066Sahrens void 704fa9e4066Sahrens txg_list_destroy(txg_list_t *tl) 705fa9e4066Sahrens { 706fa9e4066Sahrens int t; 707fa9e4066Sahrens 708fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 709fa9e4066Sahrens ASSERT(txg_list_empty(tl, t)); 710fa9e4066Sahrens 711fa9e4066Sahrens mutex_destroy(&tl->tl_lock); 712fa9e4066Sahrens } 713fa9e4066Sahrens 714ce636f8bSMatthew Ahrens boolean_t 715fa9e4066Sahrens txg_list_empty(txg_list_t *tl, uint64_t txg) 716fa9e4066Sahrens { 717fa9e4066Sahrens return (tl->tl_head[txg & TXG_MASK] == NULL); 718fa9e4066Sahrens } 719fa9e4066Sahrens 720fa9e4066Sahrens /* 721*73527f44SAlex Reece * Returns true if all txg lists are empty. 722*73527f44SAlex Reece * 723*73527f44SAlex Reece * Warning: this is inherently racy (an item could be added immediately after this 724*73527f44SAlex Reece * function returns). We don't bother with the lock because it wouldn't change the 725*73527f44SAlex Reece * semantics. 726*73527f44SAlex Reece */ 727*73527f44SAlex Reece boolean_t 728*73527f44SAlex Reece txg_all_lists_empty(txg_list_t *tl) 729*73527f44SAlex Reece { 730*73527f44SAlex Reece for (int i = 0; i < TXG_SIZE; i++) { 731*73527f44SAlex Reece if (!txg_list_empty(tl, i)) { 732*73527f44SAlex Reece return (B_FALSE); 733*73527f44SAlex Reece } 734*73527f44SAlex Reece } 735*73527f44SAlex Reece return (B_TRUE); 736*73527f44SAlex Reece } 737*73527f44SAlex Reece 738*73527f44SAlex Reece /* 7393b2aab18SMatthew Ahrens * Add an entry to the list (unless it's already on the list). 
7403b2aab18SMatthew Ahrens * Returns B_TRUE if it was actually added. 741fa9e4066Sahrens */ 7423b2aab18SMatthew Ahrens boolean_t 743fa9e4066Sahrens txg_list_add(txg_list_t *tl, void *p, uint64_t txg) 744fa9e4066Sahrens { 745fa9e4066Sahrens int t = txg & TXG_MASK; 746fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 7473b2aab18SMatthew Ahrens boolean_t add; 748fa9e4066Sahrens 749fa9e4066Sahrens mutex_enter(&tl->tl_lock); 7503b2aab18SMatthew Ahrens add = (tn->tn_member[t] == 0); 7513b2aab18SMatthew Ahrens if (add) { 752fa9e4066Sahrens tn->tn_member[t] = 1; 753fa9e4066Sahrens tn->tn_next[t] = tl->tl_head[t]; 754fa9e4066Sahrens tl->tl_head[t] = tn; 755fa9e4066Sahrens } 756fa9e4066Sahrens mutex_exit(&tl->tl_lock); 757fa9e4066Sahrens 7583b2aab18SMatthew Ahrens return (add); 759fa9e4066Sahrens } 760fa9e4066Sahrens 761fa9e4066Sahrens /* 7623b2aab18SMatthew Ahrens * Add an entry to the end of the list, unless it's already on the list. 7633b2aab18SMatthew Ahrens * (walks list to find end) 7643b2aab18SMatthew Ahrens * Returns B_TRUE if it was actually added. 
765495807d7SMatthew Ahrens */ 7663b2aab18SMatthew Ahrens boolean_t 767495807d7SMatthew Ahrens txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) 768495807d7SMatthew Ahrens { 769495807d7SMatthew Ahrens int t = txg & TXG_MASK; 770495807d7SMatthew Ahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 7713b2aab18SMatthew Ahrens boolean_t add; 772495807d7SMatthew Ahrens 773495807d7SMatthew Ahrens mutex_enter(&tl->tl_lock); 7743b2aab18SMatthew Ahrens add = (tn->tn_member[t] == 0); 7753b2aab18SMatthew Ahrens if (add) { 776495807d7SMatthew Ahrens txg_node_t **tp; 777495807d7SMatthew Ahrens 778495807d7SMatthew Ahrens for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) 779495807d7SMatthew Ahrens continue; 780495807d7SMatthew Ahrens 781495807d7SMatthew Ahrens tn->tn_member[t] = 1; 782495807d7SMatthew Ahrens tn->tn_next[t] = NULL; 783495807d7SMatthew Ahrens *tp = tn; 784495807d7SMatthew Ahrens } 785495807d7SMatthew Ahrens mutex_exit(&tl->tl_lock); 786495807d7SMatthew Ahrens 7873b2aab18SMatthew Ahrens return (add); 788495807d7SMatthew Ahrens } 789495807d7SMatthew Ahrens 790495807d7SMatthew Ahrens /* 791fa9e4066Sahrens * Remove the head of the list and return it. 
792fa9e4066Sahrens */ 793fa9e4066Sahrens void * 794fa9e4066Sahrens txg_list_remove(txg_list_t *tl, uint64_t txg) 795fa9e4066Sahrens { 796fa9e4066Sahrens int t = txg & TXG_MASK; 797fa9e4066Sahrens txg_node_t *tn; 798fa9e4066Sahrens void *p = NULL; 799fa9e4066Sahrens 800fa9e4066Sahrens mutex_enter(&tl->tl_lock); 801fa9e4066Sahrens if ((tn = tl->tl_head[t]) != NULL) { 802fa9e4066Sahrens p = (char *)tn - tl->tl_offset; 803fa9e4066Sahrens tl->tl_head[t] = tn->tn_next[t]; 804fa9e4066Sahrens tn->tn_next[t] = NULL; 805fa9e4066Sahrens tn->tn_member[t] = 0; 806fa9e4066Sahrens } 807fa9e4066Sahrens mutex_exit(&tl->tl_lock); 808fa9e4066Sahrens 809fa9e4066Sahrens return (p); 810fa9e4066Sahrens } 811fa9e4066Sahrens 812fa9e4066Sahrens /* 813fa9e4066Sahrens * Remove a specific item from the list and return it. 814fa9e4066Sahrens */ 815fa9e4066Sahrens void * 816fa9e4066Sahrens txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) 817fa9e4066Sahrens { 818fa9e4066Sahrens int t = txg & TXG_MASK; 819fa9e4066Sahrens txg_node_t *tn, **tp; 820fa9e4066Sahrens 821fa9e4066Sahrens mutex_enter(&tl->tl_lock); 822fa9e4066Sahrens 823fa9e4066Sahrens for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { 824fa9e4066Sahrens if ((char *)tn - tl->tl_offset == p) { 825fa9e4066Sahrens *tp = tn->tn_next[t]; 826fa9e4066Sahrens tn->tn_next[t] = NULL; 827fa9e4066Sahrens tn->tn_member[t] = 0; 828fa9e4066Sahrens mutex_exit(&tl->tl_lock); 829fa9e4066Sahrens return (p); 830fa9e4066Sahrens } 831fa9e4066Sahrens } 832fa9e4066Sahrens 833fa9e4066Sahrens mutex_exit(&tl->tl_lock); 834fa9e4066Sahrens 835fa9e4066Sahrens return (NULL); 836fa9e4066Sahrens } 837fa9e4066Sahrens 8383b2aab18SMatthew Ahrens boolean_t 839fa9e4066Sahrens txg_list_member(txg_list_t *tl, void *p, uint64_t txg) 840fa9e4066Sahrens { 841fa9e4066Sahrens int t = txg & TXG_MASK; 842fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 843fa9e4066Sahrens 8443b2aab18SMatthew Ahrens return (tn->tn_member[t] != 0); 
845fa9e4066Sahrens } 846fa9e4066Sahrens 847fa9e4066Sahrens /* 848fa9e4066Sahrens * Walk a txg list -- only safe if you know it's not changing. 849fa9e4066Sahrens */ 850fa9e4066Sahrens void * 851fa9e4066Sahrens txg_list_head(txg_list_t *tl, uint64_t txg) 852fa9e4066Sahrens { 853fa9e4066Sahrens int t = txg & TXG_MASK; 854fa9e4066Sahrens txg_node_t *tn = tl->tl_head[t]; 855fa9e4066Sahrens 856fa9e4066Sahrens return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 857fa9e4066Sahrens } 858fa9e4066Sahrens 859fa9e4066Sahrens void * 860fa9e4066Sahrens txg_list_next(txg_list_t *tl, void *p, uint64_t txg) 861fa9e4066Sahrens { 862fa9e4066Sahrens int t = txg & TXG_MASK; 863fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 864fa9e4066Sahrens 865fa9e4066Sahrens tn = tn->tn_next[t]; 866fa9e4066Sahrens 867fa9e4066Sahrens return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 868fa9e4066Sahrens } 869