/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h>
#include <sys/spa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/trace_zfs.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

dmu_tx_stats_t dmu_tx_stats = {
	{ "dmu_tx_assigned", KSTAT_DATA_UINT64 },
	{ "dmu_tx_delay", KSTAT_DATA_UINT64 },
	{ "dmu_tx_error", KSTAT_DATA_UINT64 },
	{ "dmu_tx_suspended", KSTAT_DATA_UINT64 },
	{ "dmu_tx_group", KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
	{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
};

static kstat_t *dmu_tx_ksp;

dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd != NULL)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
	    offsetof(dmu_tx_callback_t, dcb_node));
	tx->tx_start = gethrtime();
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	return (tx);
}

dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	TXG_VERIFY(dp->dp_spa, txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
    uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;

	if (dn != NULL) {
		(void) zfs_refcount_add(&dn->dn_holds, tx);
		if (tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
	zfs_refcount_create(&txh->txh_space_towrite);
	zfs_refcount_create(&txh->txh_memory_tohold);
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dnode_t *dn = NULL;
	dmu_tx_hold_t *txh;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os, object, FTAG, &dn);
		if (err != 0) {
			tx->tx_err = err;
			return (NULL);
		}
	}
	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
	if (dn != NULL)
		dnode_rele(dn, FTAG);
	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx))
		(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
}

/*
 * This function reads specified data from disk. The specified data will
 * be needed to perform the transaction -- i.e., it will be read after
 * we do dmu_tx_assign(). There are two reasons that we read the data now
 * (before dmu_tx_assign()):
 *
 * 1. Reading it now has potentially better performance. The transaction
 * has not yet been assigned, so the TXG is not held open, and also the
 * caller typically has fewer locks held when calling dmu_tx_hold_*() than
 * after the transaction has been assigned. This reduces the lock (and txg)
 * hold times, thus reducing lock contention.
 *
 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
 * that are detected before they start making changes to the DMU state
 * (i.e. now). Once the transaction has been assigned, and some DMU
 * state has been changed, it can be difficult to recover from an i/o
 * error (e.g. to undo the changes already made in memory at the DMU
 * layer). Typically code to do so does not exist in the caller -- it
 * assumes that the data has already been cached and thus i/o errors are
 * not possible.
 *
 * It has been observed that the i/o initiated here can be a performance
 * problem, and it appears to be optional, because we don't look at the
 * data which is read. However, removing this read would only serve to
 * move the work elsewhere (after the dmu_tx_assign()), where it may
 * have a greater impact on performance (in addition to the impact on
 * fault tolerance noted above).
 */
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (SET_ERROR(EIO));
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
		err = SET_ERROR(EFBIG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}

static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    DNODE_MIN_SIZE, FTAG);
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

void
dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

/*
 * This function marks the transaction as being a "net free". The end
 * result is that refquotas will be disabled for this transaction, and
 * this transaction will be able to use half of the pool space overhead
 * (see dsl_pool_adjustedsize()). Therefore this function should only
 * be called for transactions that we expect will not cause a net increase
 * in the amount of space used (but it's OK if that is occasionally not true).
 */
void
dmu_tx_mark_netfree(dmu_tx_t *tx)
{
	tx->tx_netfree = B_TRUE;
}

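/*
 * For example, a caller that only frees data (and therefore expects no net
 * increase in space used) might set up its transaction roughly as follows.
 * This is an illustrative sketch; the object number, error handling and the
 * work itself are placeholders rather than code taken from a real caller:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 *	dmu_tx_mark_netfree(tx);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	... perform the free ...
 *	dmu_tx_commit(tx);
 */
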
static void
dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_count_dnode(txh);

	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	dmu_tx_count_dnode(txh);

	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note: dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed. Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift == 0) {
		if (off != 0 || len < dn->dn_datablksz)
			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	} else {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	}

	/*
	 * Check level-1 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		uint64_t start = off >> shift;
		uint64_t end = (off + len) >> shift;

		ASSERT(dn->dn_indblkshift != 0);

		/*
		 * dnode_reallocate() can result in an object with indirect
		 * blocks having an odd data block size. In this case,
		 * just check the single block.
		 */
		if (dn->dn_datablkshift == 0)
			start = end = 0;

		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (uint64_t i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH || i > end)
				break;
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}

			(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
			    1 << dn->dn_indblkshift, FTAG);

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}
		}
		err = zio_wait(zio);
		if (err != 0) {
			tx->tx_err = err;
			return;
		}
	}
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh != NULL)
		(void) dmu_tx_hold_free_impl(txh, off, len);
}

void
dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
	if (txh != NULL)
		(void) dmu_tx_hold_free_impl(txh, off, len);
}

static void
dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_count_dnode(txh);

	/*
	 * Modifying an almost-full microzap is around the worst case (128KB)
	 *
	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * - 4 new blocks written if adding:
	 *    - 2 blocks for possibly split leaves,
	 *    - 2 grown ptrtbl blocks
	 */
	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    MZAP_MAX_BLKSZ, FTAG);

	if (dn == NULL)
		return;

	ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);

	if (dn->dn_maxblkid == 0 || name == NULL) {
		/*
		 * This is a microzap (only one block), or we don't know
		 * the name. Check the first block for i/o errors.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err != 0) {
			tx->tx_err = err;
		}
	} else {
		/*
		 * Access the name so that we'll check for i/o errors to
		 * the leaf blocks, etc. We ignore ENOENT, as this name
		 * may not yet exist.
		 */
		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
		if (err == EIO || err == ECKSUM || err == ENXIO) {
			tx->tx_err = err;
		}
	}
}

void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, name);
}

void
dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT(dn != NULL);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
	if (txh != NULL)
		dmu_tx_hold_zap_impl(txh, name);
}

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
	if (txh) {
		(void) zfs_refcount_add_many(
		    &txh->txh_space_towrite, space, FTAG);
	}
}

#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs. Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				cmn_err(CE_PANIC, "bad txh_type %d",
				    txh->txh_type);
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

/*
 * If we can't do 10 iops, something is wrong. Let us go ahead
 * and hit zfs_dirty_data_max.
 */
hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */

/*
 * We delay transactions when we've determined that the backend storage
 * isn't able to accommodate the rate of incoming writes.
 *
 * If there is already a transaction waiting, we delay relative to when
 * that transaction finishes waiting. This way the calculated min_time
 * is independent of the number of threads concurrently executing
 * transactions.
 *
 * If we are the only waiter, wait relative to when the transaction
 * started, rather than the current time. This credits the transaction for
 * "time already served", e.g. reading indirect blocks.
 *
 * The minimum time for a transaction to take is calculated as:
 *     min_time = scale * (dirty - min) / (max - dirty)
 *     min_time is then capped at zfs_delay_max_ns.
 *
 * The delay has two degrees of freedom that can be adjusted via tunables.
 * The percentage of dirty data at which we start to delay is defined by
 * zfs_delay_min_dirty_percent. This should typically be at or above
 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 * delay after writing at full speed has failed to keep up with the incoming
 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
 * speaking, this variable determines the amount of delay at the midpoint of
 * the curve.
 *
 * delay
 *  10ms +-------------------------------------------------------------*+
 *       |                                                             *|
 *   9ms +                                                             *+
 *       |                                                             *|
 *   8ms +                                                             *+
 *       |                                                            * |
 *   7ms +                                                            * +
 *       |                                                            * |
 *   6ms +                                                            * +
 *       |                                                            * |
 *   5ms +                                                           *  +
 *       |                                                           *  |
 *   4ms +                                                           *  +
 *       |                                                           *  |
 *   3ms +                                                          *   +
 *       |                                                          *   |
 *   2ms +                                              (midpoint) *    +
 *       |                                                  |    **     |
 *   1ms +                                                  v ***       +
 *       |             zfs_delay_scale ---------->     ********         |
 *     0 +-------------------------------------*********----------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note that since the delay is added to the outstanding time remaining on the
 * most recent transaction, the delay is effectively the inverse of IOPS.
 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
 * was chosen such that small changes in the amount of accumulated dirty data
 * in the first 3/4 of the curve yield relatively small differences in the
 * amount of delay.
 *
 * The effects can be easier to understand when the amount of delay is
 * represented on a log scale:
 *
 * delay
 * 100ms +-------------------------------------------------------------++
 *       +                                                              +
 *       |                                                              |
 *       +                                                             *+
 *  10ms +                                                             *+
 *       +                                                           ** +
 *       |                                              (midpoint)  **  |
 *       +                                                  |     **    +
 *   1ms +                                                  v ****      +
 *       +             zfs_delay_scale ---------->        *****         +
 *       |                                             ****             |
 *       +                                          ****                +
 * 100us +                                        **                    +
 *       +                                       *                      +
 *       |                                      *                       |
 *       +                                     *                        +
 *  10us +                                     *                        +
 *       +                                                              +
 *       |                                                              |
 *       +                                                              +
 *       +--------------------------------------------------------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note here that only as the amount of dirty data approaches its limit does
 * the delay start to increase rapidly. The goal of a properly tuned system
 * should be to keep the amount of dirty data out of that range by first
 * ensuring that the appropriate limits are set for the I/O scheduler to reach
 * optimal throughput on the backend storage, and then by changing the value
 * of zfs_delay_scale to increase the steepness of the curve.
 */
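/*
 * As a worked example of the min_time formula above (the tunable values are
 * purely illustrative): with zfs_dirty_data_max = 4 GiB,
 * zfs_delay_min_dirty_percent = 60 (so delay_min_bytes = 2.4 GiB) and
 * zfs_delay_scale = 500,000 ns, a transaction arriving with 3.2 GiB dirty --
 * the midpoint of the delay range -- gets:
 *
 *	min_time = 500000 * (3.2 GiB - 2.4 GiB) / (4 GiB - 3.2 GiB)
 *	         = 500000 ns = 500us
 *
 * i.e. roughly 2000 delayed transactions per second, matching the "midpoint
 * of 500us translates to 2000 IOPS" note above. As dirty data approaches
 * zfs_dirty_data_max the denominator shrinks, so min_time rises sharply
 * until it is capped at zfs_delay_max_ns.
 */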
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
	dsl_pool_t *dp = tx->tx_pool;
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	hrtime_t wakeup, min_tx_time, now;

	if (dirty <= delay_min_bytes)
		return;

	/*
	 * The caller has already waited until we are under the max.
	 * We make them pass us the amount of dirty data so we don't
	 * have to handle the case of it being >= the max, which could
	 * cause a divide-by-zero if it's == the max.
	 */
	ASSERT3U(dirty, <, zfs_dirty_data_max);

	now = gethrtime();
	min_tx_time = zfs_delay_scale *
	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
	if (now > tx->tx_start + min_tx_time)
		return;

	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
	    uint64_t, min_tx_time);

	mutex_enter(&dp->dp_lock);
	wakeup = MAX(tx->tx_start + min_tx_time,
	    dp->dp_last_wakeup + min_tx_time);
	dp->dp_last_wakeup = wakeup;
	mutex_exit(&dp->dp_lock);

	zfs_sleep_until(wakeup);
}

/*
 * This routine attempts to assign the transaction to a transaction group.
 * To do so, we must determine if there is sufficient free space on disk.
 *
 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
 * on it), then it is assumed that there is sufficient free space,
 * unless there's insufficient slop space in the pool (see the comment
 * above spa_slop_shift in spa_misc.c).
 *
 * If it is not a "netfree" transaction, then if the data already on disk
 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
 * ENOSPC. Otherwise, if the current rough estimate of pending changes,
 * plus the rough estimate of this transaction's changes, may exceed the
 * allowed usage, then this will fail with ERESTART, which will cause the
 * caller to wait for the pending changes to be written to disk (by waiting
 * for the next TXG to open), and then check the space usage again.
 *
 * The rough estimate of pending changes is comprised of the sum of:
 *
 *  - this transaction's holds' txh_space_towrite
 *
 *  - dd_tempreserved[], which is the sum of in-flight transactions'
 *    holds' txh_space_towrite (i.e. those transactions that have called
 *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 *
 *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 *
 * Note that all of these values are inflated by spa_get_worst_case_asize(),
 * which means that we may get ERESTART well before we are actually in danger
 * of running out of space, but this also mitigates any small inaccuracies
 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 * to the MOS).
 *
 * Note that due to this algorithm, it is possible to exceed the allowed
 * usage by one transaction. Also, as we approach the allowed usage,
 * we will allow a very limited amount of changes into each TXG, thus
 * decreasing performance.
 */
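/*
 * Schematically (a simplification of the description above; the names are
 * informal and the details live in dsl_dir_tempreserve_space()), the check
 * performed for a non-netfree transaction amounts to:
 *
 *	towrite = sum of this tx's holds' txh_space_towrite
 *	asize   = spa_get_worst_case_asize(spa, towrite)
 *
 *	if (space used on disk + dd_tempreserved[] + dd_space_towrite[] +
 *	    asize exceeds the quota or the available space)
 *		fail with ERESTART (or EDQUOT/ENOSPC if already over);
 */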
static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	if (tx->tx_err) {
		DMU_TX_STAT_BUMP(dmu_tx_error);
		return (tx->tx_err);
	}

	if (spa_suspended(spa)) {
		DMU_TX_STAT_BUMP(dmu_tx_suspended);

		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    !(txg_how & TXG_WAIT))
			return (SET_ERROR(EIO));

		return (SET_ERROR(ERESTART));
	}

	if (!tx->tx_dirty_delayed &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
		return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			/*
			 * This thread can't hold the dn_struct_rwlock
			 * while assigning the tx, because this can lead to
			 * deadlock. Specifically, if this dnode is already
			 * assigned to an earlier txg, this thread may need
			 * to wait for that txg to sync (the ERESTART case
			 * below). The other thread that has assigned this
			 * dnode to an earlier txg prevents this txg from
			 * syncing until its tx can complete (calling
			 * dmu_tx_commit()), but it may need to acquire the
			 * dn_struct_rwlock to do so (e.g. via
			 * dmu_buf_hold*()).
			 *
			 * Note that this thread can't hold the lock for
			 * read either, but the rwlock doesn't record
			 * enough information to make that assertion.
			 */
			ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));

			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				DMU_TX_STAT_BUMP(dmu_tx_group);
				return (SET_ERROR(ERESTART));
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += zfs_refcount_count(&txh->txh_space_towrite);
		tohold += zfs_refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	DMU_TX_STAT_BUMP(dmu_tx_assigned);

	return (0);
}

static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	/*
	 * Walk the transaction's hold list, removing the hold on the
	 * associated dnode, and notifying waiters if the refcount drops to 0.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
	    txh && txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group; txg_how is a bitmask:
 *
 * If TXG_WAIT is set and the currently open txg is full, this function
 * will wait until there's a new txg. This should be used when no locks
 * are being held. With this bit set, this function will only fail if
 * we're truly out of space (or over quota).
 *
 * If TXG_WAIT is *not* set and we can't assign into the currently open
 * txg without blocking, this function will return immediately with
 * ERESTART. This should be used whenever locks are being held. On an
 * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
 * and try again.
 *
 * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
 * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
 * details on the throttle). This is used by the VFS operations, after
 * they have already called dmu_tx_wait() (though most likely on a
 * different tx).
 */
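/*
 * Under these rules, a caller that must not block while holding locks
 * typically follows a pattern roughly like the one below. This is only a
 * sketch -- the hold, the locking and the modification are placeholders:
 *
 * top:
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	error = dmu_tx_assign(tx, TXG_NOWAIT);
 *	if (error != 0) {
 *		... drop locks ...
 *		dmu_tx_abort(tx);
 *		if (error == ERESTART) {
 *			dmu_tx_wait(tx);
 *			goto top;
 *		}
 *		return (error);
 *	}
 *	... dirty the held buffers and make the change ...
 *	dmu_tx_commit(tx);
 *
 * Callers holding no locks can instead pass TXG_WAIT and skip the
 * ERESTART/dmu_tx_wait() loop; callers retrying after dmu_tx_wait() may add
 * TXG_NOTHROTTLE, as described above.
 */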
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	/* If we might wait, we must not hold the config lock. */
	IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));

	if ((txg_how & TXG_NOTHROTTLE))
		tx->tx_dirty_delayed = B_TRUE;

	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || !(txg_how & TXG_WAIT))
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}

void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;
	hrtime_t before;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	before = gethrtime();

	if (tx->tx_wait_dirty) {
		uint64_t dirty;

		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.
		 */
		mutex_enter(&dp->dp_lock);
		if (dp->dp_dirty_total >= zfs_dirty_data_max)
			DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
		while (dp->dp_dirty_total >= zfs_dirty_data_max)
			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
		dirty = dp->dp_dirty_total;
		mutex_exit(&dp->dp_lock);

		dmu_tx_delay(tx, dirty);

		tx->tx_wait_dirty = B_FALSE;

		/*
		 * Note: setting tx_dirty_delayed only has effect if the
		 * caller used TXG_WAIT. Otherwise they are going to
		 * destroy this tx and try again. The common case,
		 * zfs_write(), uses TXG_WAIT.
		 */
		tx->tx_dirty_delayed = B_TRUE;
	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		/*
		 * If the pool is suspended we need to wait until it
		 * is resumed. Note that it's possible that the pool
		 * has become active after this thread has tried to
		 * obtain a tx. If that's the case then tx_lasttried_txg
		 * would not have been set.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		/*
		 * If we have a lot of dirty data just wait until we sync
		 * out a TXG at which point we'll hopefully have synced
		 * a portion of the changes.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	}

	spa_tx_assign_add_nsecs(spa, gethrtime() - before);
}

static void
dmu_tx_destroy(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		zfs_refcount_destroy_many(&txh->txh_space_towrite,
		    zfs_refcount_count(&txh->txh_space_towrite));
		zfs_refcount_destroy_many(&txh->txh_memory_tohold,
		    zfs_refcount_count(&txh->txh_memory_tohold));
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
	kmem_free(tx, sizeof (dmu_tx_t));
}

void
dmu_tx_commit(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
void
dmu_tx_commit(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;

		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	dmu_tx_destroy(tx);
}

void
dmu_tx_abort(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg == 0);

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED));

	dmu_tx_destroy(tx);
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
	ASSERT(tx->tx_pool != NULL);
	return (tx->tx_pool);
}

void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func,
    void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}
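/*
 * Illustrative sketch (not compiled): registering a commit callback on an
 * assigned transaction.  The callback runs with error == 0 once the assigned
 * txg has been synced; dmu_tx_abort() (above) instead invokes it with
 * ECANCELED.  "my_record_t" and the example_* names are hypothetical.
 */
#if 0
typedef struct my_record {
	uint64_t mr_txg;
} my_record_t;

static void
example_commit_cb(void *arg, int error)
{
	my_record_t *mr = arg;

	if (error == 0) {
		/* Everything dirtied under mr->mr_txg is now on stable storage. */
	}
	kmem_free(mr, sizeof (my_record_t));
}

static void
example_register_cb(dmu_tx_t *tx)	/* tx must already be assigned */
{
	my_record_t *mr = kmem_alloc(sizeof (my_record_t), KM_SLEEP);

	mr->mr_txg = dmu_tx_get_txg(tx);
	dmu_tx_callback_register(tx, example_commit_cb, mr);
	dmu_tx_commit(tx);
}
#endif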
/*
 * Call all the commit callbacks on a list, with a given error code.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_tail(cb_list)) != NULL) {
		list_remove(cb_list, dcb);
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}

/*
 * Interface to hold a bunch of attributes; used when creating new files.
 * attrsize is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */

/*
 * Hold the necessary attribute name for attribute registration.
 * It should be a very rare case where this is needed.  If it does
 * happen, it would only happen on the first write to the file system.
 */
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
	if (!sa->sa_need_attr_registration)
		return;

	for (int i = 0; i != sa->sa_num_attrs; i++) {
		if (!sa->sa_attr_table[i].sa_registered) {
			if (sa->sa_reg_attr_obj)
				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
			else
				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
		}
	}
}

void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_SPILL, 0, 0);
	if (txh != NULL)
		(void) zfs_refcount_add_many(&txh->txh_space_towrite,
		    SPA_OLD_MAXBLOCKSIZE, FTAG);
}

void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}

/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 *
 * may_grow indicates that the operation may add or enlarge variable-sized
 * attributes; in that case the layout ZAP and the spill block are held as
 * well, since the bonus buffer may no longer be large enough.
 */
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
	uint64_t object;
	sa_os_t *sa = tx->tx_objset->os_sa;

	ASSERT(hdl != NULL);

	object = sa_handle_object(hdl);

	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
	DB_DNODE_ENTER(db);
	dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
	DB_DNODE_EXIT(db);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
		ASSERT(tx->tx_txg == 0);
		dmu_tx_hold_spill(tx, object);
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		if (dn->dn_have_spill) {
			ASSERT(tx->tx_txg == 0);
			dmu_tx_hold_spill(tx, object);
		}
		DB_DNODE_EXIT(db);
	}
}
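/*
 * Illustrative sketch (not compiled): the single-attribute update path
 * described above, where dmu_tx_hold_sa() is the hold to use.  "attr",
 * "value", and the example_ name are hypothetical; error handling is
 * reduced to the minimum.
 */
#if 0
static int
example_sa_update(objset_t *os, sa_handle_t *hdl, sa_attr_type_t attr,
    uint64_t value)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	/* Fixed-size attribute, so the layout cannot grow: may_grow = B_FALSE. */
	dmu_tx_hold_sa(tx, hdl, B_FALSE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = sa_update(hdl, attr, &value, sizeof (value), tx);
	dmu_tx_commit(tx);

	return (error);
}
#endif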
void
dmu_tx_init(void)
{
	dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (dmu_tx_ksp != NULL) {
		dmu_tx_ksp->ks_data = &dmu_tx_stats;
		kstat_install(dmu_tx_ksp);
	}
}

void
dmu_tx_fini(void)
{
	if (dmu_tx_ksp != NULL) {
		kstat_delete(dmu_tx_ksp);
		dmu_tx_ksp = NULL;
	}
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_bonus);
EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_mark_netfree);
EXPORT_SYMBOL(dmu_tx_get_txg);
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);
#endif