// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2024, 2025, Klara, Inc.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h>
#include <sys/spa.h>
#include <sys/brt_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/trace_zfs.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

dmu_tx_stats_t dmu_tx_stats = {
	{ "dmu_tx_assigned",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_error",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_suspended",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_group",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reserve",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reclaim",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_over_max",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_frees_delay",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_wrlog_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
};

static kstat_t *dmu_tx_ksp;

dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd != NULL)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
	    offsetof(dmu_tx_callback_t, dcb_node));
	tx->tx_start = gethrtime();
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	return (tx);
}

dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	TXG_VERIFY(dp->dp_spa, txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
    uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;

	if (dn != NULL) {
		(void) zfs_refcount_add(&dn->dn_holds, tx);
		if (tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
	zfs_refcount_create(&txh->txh_space_towrite);
	zfs_refcount_create(&txh->txh_memory_tohold);
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dnode_t *dn = NULL;
	dmu_tx_hold_t *txh;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os, object, FTAG, &dn);
		if (err != 0) {
			tx->tx_err = err;
			return (NULL);
		}
	}
	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
	if (dn != NULL)
		dnode_rele(dn, FTAG);
	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx))
		(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
}

/*
 * This function reads specified data from disk. The specified data will
 * be needed to perform the transaction -- i.e., it will be read after
 * we do dmu_tx_assign(). There are two reasons that we read the data now
 * (before dmu_tx_assign()):
 *
 * 1. Reading it now has potentially better performance. The transaction
 * has not yet been assigned, so the TXG is not held open, and also the
 * caller typically holds fewer locks when calling dmu_tx_hold_*() than
 * after the transaction has been assigned. This reduces the lock (and txg)
 * hold times, thus reducing lock contention.
 *
 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
 * that are detected before they start making changes to the DMU state
 * (i.e. now). Once the transaction has been assigned, and some DMU
 * state has been changed, it can be difficult to recover from an i/o
 * error (e.g. to undo the changes already made in memory at the DMU
 * layer). Typically code to do so does not exist in the caller -- it
 * assumes that the data has already been cached and thus i/o errors are
 * not possible.
 *
 * It has been observed that the i/o initiated here can be a performance
 * problem, and it appears to be optional, because we don't look at the
 * data which is read. However, removing this read would only serve to
 * move the work elsewhere (after the dmu_tx_assign()), where it may
 * have a greater impact on performance (in addition to the impact on
 * fault tolerance noted above).
 */
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db);
	rw_exit(&dn->dn_struct_rwlock);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);
	/*
	 * PARTIAL_FIRST allows caching for uncacheable blocks. It will
	 * be cleared after dmu_buf_will_dirty() calls dbuf_read() again.
	 */
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DMU_READ_NO_PREFETCH |
	    (level == 0 ? (DMU_UNCACHEDIO | DMU_PARTIAL_FIRST) : 0));
	dbuf_rele(db, FTAG);
	return (err);
}

static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}

static void
dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the append: only the first level-0 block (if it is
	 * not aligned, i.e. if it is a partial-block write). No
	 * additional blocks are read.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}

static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    DNODE_MIN_SIZE, FTAG);
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

void
dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
	if (txh != NULL) {
		dmu_tx_count_write(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

/*
 * Should be used when appending to an object and the exact offset is unknown.
 * The write must occur at or beyond the specified offset. Only the L0 block
 * at the provided offset will be prefetched.
 */
void
dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_APPEND, off, DMU_OBJECT_END);
	if (txh != NULL) {
		dmu_tx_count_append(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}

void
dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END);
	if (txh != NULL) {
		dmu_tx_count_append(txh, off, len);
		dmu_tx_count_dnode(txh);
	}
}
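
/*
 * Illustrative sketch (not compiled; the object and offset names below are
 * hypothetical): a caller that always writes at or beyond the current end
 * of an object, but does not know the final offset until the write is
 * actually issued, might declare the hold as
 *
 *	dmu_tx_hold_append(tx, obj, last_synced_eof, len);
 *
 * whereas a caller that knows the exact range would instead use
 * dmu_tx_hold_write(tx, obj, off, len).
 */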

/*
 * This function marks the transaction as being a "net free". The end
 * result is that refquotas will be disabled for this transaction, and
 * this transaction will be able to use half of the pool space overhead
 * (see dsl_pool_adjustedsize()). Therefore this function should only
 * be called for transactions that we expect will not cause a net increase
 * in the amount of space used (but it's OK if that is occasionally not true).
 */
void
dmu_tx_mark_netfree(dmu_tx_t *tx)
{
	tx->tx_netfree = B_TRUE;
}

static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT(tx->tx_txg == 0);

	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note: dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed. Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift == 0) {
		if (off != 0 || len < dn->dn_datablksz)
			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	} else {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	}

	/*
	 * Check level-1 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		uint64_t start = off >> shift;
		uint64_t end = (off + len) >> shift;

		ASSERT(dn->dn_indblkshift != 0);

		/*
		 * dnode_reallocate() can result in an object with indirect
		 * blocks having an odd data block size. In this case,
		 * just check the single block.
		 */
		if (dn->dn_datablkshift == 0)
			start = end = 0;

		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (uint64_t i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH || i > end)
				break;
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}

			(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
			    1 << dn->dn_indblkshift, FTAG);

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}
		}
		err = zio_wait(zio);
		if (err != 0) {
			tx->tx_err = err;
			return;
		}
	}
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh != NULL) {
		dmu_tx_count_dnode(txh);
		dmu_tx_count_free(txh, off, len);
	}
}

void
dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
	if (txh != NULL) {
		dmu_tx_count_dnode(txh);
		dmu_tx_count_free(txh, off, len);
	}
}
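
/*
 * Illustrative sketch (not compiled): a caller removing an entire object
 * would typically declare a free of the whole range and mark the
 * transaction as a net free, so that refquota enforcement is relaxed:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 *	dmu_tx_mark_netfree(tx);
 *	error = dmu_tx_assign(tx, DMU_TX_WAIT);
 *
 * Here "os" and "object" are assumed to be the caller's objset and
 * object number.
 */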

static void
dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len,
    uint_t blksz)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT0(tx->tx_txg);
	ASSERT(dn->dn_indblkshift != 0);
	ASSERT(blksz != 0);
	ASSERT0(off % blksz);

	(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
	    len / blksz * sizeof (brt_entry_t), FTAG);

	int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t start = off / blksz >> shift;
	uint64_t end = (off + len) / blksz >> shift;

	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    (end - start + 1) << dn->dn_indblkshift, FTAG);

	zio_t *zio = zio_root(tx->tx_pool->dp_spa,
	    NULL, NULL, ZIO_FLAG_CANFAIL);
	for (uint64_t i = start; i <= end; i++) {
		err = dmu_tx_check_ioerr(zio, dn, 1, i);
		if (err != 0) {
			tx->tx_err = err;
			break;
		}
	}
	err = zio_wait(zio);
	if (err != 0)
		tx->tx_err = err;
}

void
dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
    uint64_t len, uint_t blksz)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len);
	if (txh != NULL) {
		dmu_tx_count_dnode(txh);
		dmu_tx_count_clone(txh, off, len, blksz);
	}
}

static void
dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_count_dnode(txh);

	/*
	 * Modifying an almost-full microzap is around the worst case (128KB).
	 *
	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * - 4 new blocks written if adding:
	 *    - 2 blocks for possibly split leaves,
	 *    - 2 grown ptrtbl blocks
	 */
	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
	    zap_get_micro_max_size(tx->tx_pool->dp_spa), FTAG);

	if (dn == NULL)
		return;

	ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);

	if (dn->dn_maxblkid == 0 || name == NULL) {
		/*
		 * This is a microzap (only one block), or we don't know
		 * the name. Check the first block for i/o errors.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err != 0) {
			tx->tx_err = err;
		}
	} else {
		/*
		 * Access the name so that we'll check for i/o errors to
		 * the leaf blocks, etc. We ignore ENOENT, as this name
		 * may not yet exist.
		 */
		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
		if (err == EIO || err == ECKSUM || err == ENXIO) {
			tx->tx_err = err;
		}
	}
}
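
/*
 * Illustrative sketch (not compiled; "dzp" is a hypothetical directory
 * znode): callers that add or remove a ZAP entry declare the hold with
 * the entry name when they know it, which lets the hold prefetch and
 * error-check the relevant leaf block:
 *
 *	dmu_tx_hold_zap(tx, dzp->z_id, B_TRUE, name);	adding an entry
 *	dmu_tx_hold_zap(tx, dzp->z_id, B_FALSE, name);	removing an entry
 */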
645 */ 646 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); 647 if (err == EIO || err == ECKSUM || err == ENXIO) { 648 tx->tx_err = err; 649 } 650 } 651 } 652 653 void 654 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) 655 { 656 dmu_tx_hold_t *txh; 657 658 ASSERT0(tx->tx_txg); 659 660 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 661 object, THT_ZAP, add, (uintptr_t)name); 662 if (txh != NULL) 663 dmu_tx_hold_zap_impl(txh, name); 664 } 665 666 void 667 dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) 668 { 669 dmu_tx_hold_t *txh; 670 671 ASSERT0(tx->tx_txg); 672 ASSERT(dn != NULL); 673 674 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); 675 if (txh != NULL) 676 dmu_tx_hold_zap_impl(txh, name); 677 } 678 679 void 680 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 681 { 682 dmu_tx_hold_t *txh; 683 684 ASSERT(tx->tx_txg == 0); 685 686 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 687 object, THT_BONUS, 0, 0); 688 if (txh) 689 dmu_tx_count_dnode(txh); 690 } 691 692 void 693 dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) 694 { 695 dmu_tx_hold_t *txh; 696 697 ASSERT0(tx->tx_txg); 698 699 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); 700 if (txh) 701 dmu_tx_count_dnode(txh); 702 } 703 704 void 705 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 706 { 707 dmu_tx_hold_t *txh; 708 709 ASSERT(tx->tx_txg == 0); 710 711 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 712 DMU_NEW_OBJECT, THT_SPACE, space, 0); 713 if (txh) { 714 (void) zfs_refcount_add_many( 715 &txh->txh_space_towrite, space, FTAG); 716 } 717 } 718 719 #ifdef ZFS_DEBUG 720 void 721 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) 722 { 723 boolean_t match_object = B_FALSE; 724 boolean_t match_offset = B_FALSE; 725 726 DB_DNODE_ENTER(db); 727 dnode_t *dn = DB_DNODE(db); 728 ASSERT(tx->tx_txg != 0); 729 ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); 730 ASSERT3U(dn->dn_object, ==, db->db.db_object); 731 732 if (tx->tx_anyobj) { 733 DB_DNODE_EXIT(db); 734 return; 735 } 736 737 /* XXX No checking on the meta dnode for now */ 738 if (db->db.db_object == DMU_META_DNODE_OBJECT) { 739 DB_DNODE_EXIT(db); 740 return; 741 } 742 743 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; 744 txh = list_next(&tx->tx_holds, txh)) { 745 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 746 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) 747 match_object = TRUE; 748 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { 749 int datablkshift = dn->dn_datablkshift ? 750 dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; 751 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 752 int shift = datablkshift + epbs * db->db_level; 753 uint64_t beginblk = shift >= 64 ? 0 : 754 (txh->txh_arg1 >> shift); 755 uint64_t endblk = shift >= 64 ? 0 : 756 ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); 757 uint64_t blkid = db->db_blkid; 758 759 /* XXX txh_arg2 better not be zero... */ 760 761 dprintf("found txh type %x beginblk=%llx endblk=%llx\n", 762 txh->txh_type, (u_longlong_t)beginblk, 763 (u_longlong_t)endblk); 764 765 switch (txh->txh_type) { 766 case THT_WRITE: 767 if (blkid >= beginblk && blkid <= endblk) 768 match_offset = TRUE; 769 /* 770 * We will let this hold work for the bonus 771 * or spill buffer so that we don't need to 772 * hold it when creating a new object. 
773 */ 774 if (blkid == DMU_BONUS_BLKID || 775 blkid == DMU_SPILL_BLKID) 776 match_offset = TRUE; 777 /* 778 * They might have to increase nlevels, 779 * thus dirtying the new TLIBs. Or the 780 * might have to change the block size, 781 * thus dirying the new lvl=0 blk=0. 782 */ 783 if (blkid == 0) 784 match_offset = TRUE; 785 break; 786 case THT_APPEND: 787 if (blkid >= beginblk && (blkid <= endblk || 788 txh->txh_arg2 == DMU_OBJECT_END)) 789 match_offset = TRUE; 790 791 /* 792 * THT_WRITE used for bonus and spill blocks. 793 */ 794 ASSERT(blkid != DMU_BONUS_BLKID && 795 blkid != DMU_SPILL_BLKID); 796 797 /* 798 * They might have to increase nlevels, 799 * thus dirtying the new TLIBs. Or the 800 * might have to change the block size, 801 * thus dirying the new lvl=0 blk=0. 802 */ 803 if (blkid == 0) 804 match_offset = TRUE; 805 break; 806 case THT_FREE: 807 /* 808 * We will dirty all the level 1 blocks in 809 * the free range and perhaps the first and 810 * last level 0 block. 811 */ 812 if (blkid >= beginblk && (blkid <= endblk || 813 txh->txh_arg2 == DMU_OBJECT_END)) 814 match_offset = TRUE; 815 break; 816 case THT_SPILL: 817 if (blkid == DMU_SPILL_BLKID) 818 match_offset = TRUE; 819 break; 820 case THT_BONUS: 821 if (blkid == DMU_BONUS_BLKID) 822 match_offset = TRUE; 823 break; 824 case THT_ZAP: 825 match_offset = TRUE; 826 break; 827 case THT_NEWOBJECT: 828 match_object = TRUE; 829 break; 830 case THT_CLONE: 831 if (blkid >= beginblk && blkid <= endblk) 832 match_offset = TRUE; 833 /* 834 * They might have to increase nlevels, 835 * thus dirtying the new TLIBs. Or the 836 * might have to change the block size, 837 * thus dirying the new lvl=0 blk=0. 838 */ 839 if (blkid == 0) 840 match_offset = TRUE; 841 break; 842 default: 843 cmn_err(CE_PANIC, "bad txh_type %d", 844 txh->txh_type); 845 } 846 } 847 if (match_object && match_offset) { 848 DB_DNODE_EXIT(db); 849 return; 850 } 851 } 852 DB_DNODE_EXIT(db); 853 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", 854 (u_longlong_t)db->db.db_object, db->db_level, 855 (u_longlong_t)db->db_blkid); 856 } 857 #endif 858 859 /* 860 * If we can't do 10 iops, something is wrong. Let us go ahead 861 * and hit zfs_dirty_data_max. 862 */ 863 static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ 864 865 /* 866 * We delay transactions when we've determined that the backend storage 867 * isn't able to accommodate the rate of incoming writes. 868 * 869 * If there is already a transaction waiting, we delay relative to when 870 * that transaction finishes waiting. This way the calculated min_time 871 * is independent of the number of threads concurrently executing 872 * transactions. 873 * 874 * If we are the only waiter, wait relative to when the transaction 875 * started, rather than the current time. This credits the transaction for 876 * "time already served", e.g. reading indirect blocks. 877 * 878 * The minimum time for a transaction to take is calculated as: 879 * min_time = scale * (dirty - min) / (max - dirty) 880 * min_time is then capped at zfs_delay_max_ns. 881 * 882 * The delay has two degrees of freedom that can be adjusted via tunables. 883 * The percentage of dirty data at which we start to delay is defined by 884 * zfs_delay_min_dirty_percent. This should typically be at or above 885 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to 886 * delay after writing at full speed has failed to keep up with the incoming 887 * write rate. 
 * speaking, this variable determines the amount of delay at the midpoint of
 * the curve.
 *
 * delay
 *  10ms +-------------------------------------------------------------*+
 *       |                                                             *|
 *   9ms +                                                             *+
 *       |                                                             *|
 *   8ms +                                                             *+
 *       |                                                            * |
 *   7ms +                                                            * +
 *       |                                                            * |
 *   6ms +                                                            * +
 *       |                                                            * |
 *   5ms +                                                           *  +
 *       |                                                           *  |
 *   4ms +                                                           *  +
 *       |                                                           *  |
 *   3ms +                                                          *   +
 *       |                                                          *   |
 *   2ms +                                              (midpoint) *    +
 *       |                                              |    **         |
 *   1ms +                                              v ***           +
 *       |             zfs_delay_scale ---------->     ********         |
 *     0 +-------------------------------------*********----------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note that since the delay is added to the outstanding time remaining on the
 * most recent transaction, the delay is effectively the inverse of IOPS.
 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
 * was chosen such that small changes in the amount of accumulated dirty data
 * in the first 3/4 of the curve yield relatively small differences in the
 * amount of delay.
 *
 * The effects can be easier to understand when the amount of delay is
 * represented on a log scale:
 *
 * delay
 * 100ms +-------------------------------------------------------------++
 *       +                                                              +
 *       |                                                              |
 *       +                                                             *+
 *  10ms +                                                             *+
 *       +                                                           ** +
 *       | (midpoint)                                               **  |
 *       +      |                                                 **    +
 *   1ms +      v                                               ****    +
 *       +             zfs_delay_scale ---------->            *****     +
 *       |                                                 ****         |
 *       +                                              ****            +
 * 100us +                                            **                +
 *       +                                           *                  +
 *       |                                          *                   |
 *       +                                         *                    +
 *  10us +                                        *                     +
 *       +                                                              +
 *       |                                                              |
 *       +                                                              +
 *       +--------------------------------------------------------------+
 *       0%                    <- zfs_dirty_data_max ->               100%
 *
 * Note here that only as the amount of dirty data approaches its limit does
 * the delay start to increase rapidly. The goal of a properly tuned system
 * should be to keep the amount of dirty data out of that range by first
 * ensuring that the appropriate limits are set for the I/O scheduler to reach
 * optimal throughput on the backend storage, and then by changing the value
 * of zfs_delay_scale to increase the steepness of the curve.
 */
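/*
 * Worked example (the tunable values are assumed for illustration only:
 * zfs_dirty_data_max = 4 GiB, zfs_delay_min_dirty_percent = 60, and
 * zfs_delay_scale = 500,000 ns). Because min_time depends only on the
 * ratio of dirty data to the maximum, it can be computed from percentages:
 *
 *	dirty at 70% of max:
 *	    min_time = 500000 * (0.70 - 0.60) / (1.00 - 0.70) ~= 167 us
 *	dirty at 90% of max:
 *	    min_time = 500000 * (0.90 - 0.60) / (1.00 - 0.90) = 1.5 ms
 *
 * In all cases the result is capped at zfs_delay_max_ns (100 ms).
 */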
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
	dsl_pool_t *dp = tx->tx_pool;
	uint64_t delay_min_bytes, wrlog;
	hrtime_t wakeup, tx_time = 0, now;

	/* Calculate minimum transaction time for the dirty data amount. */
	delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	if (dirty > delay_min_bytes) {
		/*
		 * The caller has already waited until we are under the max.
		 * We make them pass us the amount of dirty data so we don't
		 * have to handle the case of it being >= the max, which
		 * could cause a divide-by-zero if it's == the max.
		 */
		ASSERT3U(dirty, <, zfs_dirty_data_max);

		tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
		    (zfs_dirty_data_max - dirty);
	}

	/* Calculate minimum transaction time for the TX_WRITE log size. */
	wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
	delay_min_bytes =
	    zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
	if (wrlog >= zfs_wrlog_data_max) {
		tx_time = zfs_delay_max_ns;
	} else if (wrlog > delay_min_bytes) {
		tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
		    (zfs_wrlog_data_max - wrlog), tx_time);
	}

	if (tx_time == 0)
		return;

	tx_time = MIN(tx_time, zfs_delay_max_ns);
	now = gethrtime();
	if (now > tx->tx_start + tx_time)
		return;

	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
	    uint64_t, tx_time);

	mutex_enter(&dp->dp_lock);
	wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
	dp->dp_last_wakeup = wakeup;
	mutex_exit(&dp->dp_lock);

	zfs_sleep_until(wakeup);
}

/*
 * This routine attempts to assign the transaction to a transaction group.
 * To do so, we must determine if there is sufficient free space on disk.
 *
 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
 * on it), then it is assumed that there is sufficient free space,
 * unless there's insufficient slop space in the pool (see the comment
 * above spa_slop_shift in spa_misc.c).
 *
 * If it is not a "netfree" transaction, then if the data already on disk
 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
 * ENOSPC. Otherwise, if the current rough estimate of pending changes,
 * plus the rough estimate of this transaction's changes, may exceed the
 * allowed usage, then this will fail with ERESTART, which will cause the
 * caller to wait for the pending changes to be written to disk (by waiting
 * for the next TXG to open), and then check the space usage again.
 *
 * The rough estimate of pending changes is comprised of the sum of:
 *
 *  - this transaction's holds' txh_space_towrite
 *
 *  - dd_tempreserved[], which is the sum of in-flight transactions'
 *    holds' txh_space_towrite (i.e. those transactions that have called
 *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 *
 *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 *
 * Note that all of these values are inflated by spa_get_worst_case_asize(),
 * which means that we may get ERESTART well before we are actually in danger
 * of running out of space, but this also mitigates any small inaccuracies
 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 * to the MOS).
 *
 * Note that due to this algorithm, it is possible to exceed the allowed
 * usage by one transaction. Also, as we approach the allowed usage,
 * we will allow a very limited amount of changes into each TXG, thus
 * decreasing performance.
 */
static int
dmu_tx_try_assign(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	if (tx->tx_err) {
		DMU_TX_STAT_BUMP(dmu_tx_error);
		return (SET_ERROR(EIO));
	}

	if (spa_suspended(spa)) {
		DMU_TX_STAT_BUMP(dmu_tx_suspended);

		/*
		 * Let dmu_tx_assign() know specifically what happened, so
		 * it can make the right choice based on the caller flags.
1066 */ 1067 return (SET_ERROR(ESHUTDOWN)); 1068 } 1069 1070 if (!tx->tx_dirty_delayed && 1071 dsl_pool_need_wrlog_delay(tx->tx_pool)) { 1072 tx->tx_wait_dirty = B_TRUE; 1073 DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); 1074 return (SET_ERROR(ERESTART)); 1075 } 1076 1077 if (!tx->tx_dirty_delayed && 1078 dsl_pool_need_dirty_delay(tx->tx_pool)) { 1079 tx->tx_wait_dirty = B_TRUE; 1080 DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); 1081 return (SET_ERROR(ERESTART)); 1082 } 1083 1084 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); 1085 tx->tx_needassign_txh = NULL; 1086 1087 /* 1088 * NB: No error returns are allowed after txg_hold_open, but 1089 * before processing the dnode holds, due to the 1090 * dmu_tx_unassign() logic. 1091 */ 1092 1093 uint64_t towrite = 0; 1094 uint64_t tohold = 0; 1095 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; 1096 txh = list_next(&tx->tx_holds, txh)) { 1097 dnode_t *dn = txh->txh_dnode; 1098 if (dn != NULL) { 1099 /* 1100 * This thread can't hold the dn_struct_rwlock 1101 * while assigning the tx, because this can lead to 1102 * deadlock. Specifically, if this dnode is already 1103 * assigned to an earlier txg, this thread may need 1104 * to wait for that txg to sync (the ERESTART case 1105 * below). The other thread that has assigned this 1106 * dnode to an earlier txg prevents this txg from 1107 * syncing until its tx can complete (calling 1108 * dmu_tx_commit()), but it may need to acquire the 1109 * dn_struct_rwlock to do so (e.g. via 1110 * dmu_buf_hold*()). 1111 * 1112 * Note that this thread can't hold the lock for 1113 * read either, but the rwlock doesn't record 1114 * enough information to make that assertion. 1115 */ 1116 ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1117 1118 mutex_enter(&dn->dn_mtx); 1119 if (dn->dn_assigned_txg == tx->tx_txg - 1) { 1120 mutex_exit(&dn->dn_mtx); 1121 tx->tx_needassign_txh = txh; 1122 DMU_TX_STAT_BUMP(dmu_tx_group); 1123 return (SET_ERROR(ERESTART)); 1124 } 1125 if (dn->dn_assigned_txg == 0) 1126 dn->dn_assigned_txg = tx->tx_txg; 1127 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1128 (void) zfs_refcount_add(&dn->dn_tx_holds, tx); 1129 mutex_exit(&dn->dn_mtx); 1130 } 1131 towrite += zfs_refcount_count(&txh->txh_space_towrite); 1132 tohold += zfs_refcount_count(&txh->txh_memory_tohold); 1133 } 1134 1135 /* needed allocation: worst-case estimate of write space */ 1136 uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); 1137 /* calculate memory footprint estimate */ 1138 uint64_t memory = towrite + tohold; 1139 1140 if (tx->tx_dir != NULL && asize != 0) { 1141 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, 1142 asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); 1143 if (err != 0) 1144 return (err); 1145 } 1146 1147 DMU_TX_STAT_BUMP(dmu_tx_assigned); 1148 1149 return (0); 1150 } 1151 1152 static void 1153 dmu_tx_unassign(dmu_tx_t *tx) 1154 { 1155 if (tx->tx_txg == 0) 1156 return; 1157 1158 txg_rele_to_quiesce(&tx->tx_txgh); 1159 1160 /* 1161 * Walk the transaction's hold list, removing the hold on the 1162 * associated dnode, and notifying waiters if the refcount drops to 0. 
1163 */ 1164 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); 1165 txh && txh != tx->tx_needassign_txh; 1166 txh = list_next(&tx->tx_holds, txh)) { 1167 dnode_t *dn = txh->txh_dnode; 1168 1169 if (dn == NULL) 1170 continue; 1171 mutex_enter(&dn->dn_mtx); 1172 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1173 1174 if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { 1175 dn->dn_assigned_txg = 0; 1176 cv_broadcast(&dn->dn_notxholds); 1177 } 1178 mutex_exit(&dn->dn_mtx); 1179 } 1180 1181 txg_rele_to_sync(&tx->tx_txgh); 1182 1183 tx->tx_lasttried_txg = tx->tx_txg; 1184 tx->tx_txg = 0; 1185 } 1186 1187 /* 1188 * Assign tx to a transaction group; `flags` is a bitmask: 1189 * 1190 * If DMU_TX_WAIT is set and the currently open txg is full, this function 1191 * will wait until there's a new txg. This should be used when no locks 1192 * are being held. With this bit set, this function will only fail if 1193 * we're truly out of space (ENOSPC), over quota (EDQUOT), or required 1194 * data for the transaction could not be read from disk (EIO). 1195 * 1196 * If DMU_TX_WAIT is *not* set and we can't assign into the currently open 1197 * txg without blocking, this function will return immediately with 1198 * ERESTART. This should be used whenever locks are being held. On an 1199 * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), 1200 * and try again. 1201 * 1202 * If DMU_TX_NOTHROTTLE is set, this indicates that this tx should not be 1203 * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for 1204 * details on the throttle). This is used by the VFS operations, after 1205 * they have already called dmu_tx_wait() (though most likely on a 1206 * different tx). 1207 * 1208 * If DMU_TX_SUSPEND is set, this indicates that this tx should ignore 1209 * the pool being or becoming suspending while it is in progress. This will 1210 * cause dmu_tx_assign() (and dmu_tx_wait()) to block until the pool resumes. 1211 * If this flag is not set and the pool suspends, the return will be either 1212 * ERESTART or EIO, depending on the value of the pool's failmode= property. 1213 * 1214 * It is guaranteed that subsequent successful calls to dmu_tx_assign() 1215 * will assign the tx to monotonically increasing txgs. Of course this is 1216 * not strong monotonicity, because the same txg can be returned multiple 1217 * times in a row. This guarantee holds both for subsequent calls from 1218 * one thread and for multiple threads. For example, it is impossible to 1219 * observe the following sequence of events: 1220 * 1221 * Thread 1 Thread 2 1222 * 1223 * dmu_tx_assign(T1, ...) 1224 * 1 <- dmu_tx_get_txg(T1) 1225 * dmu_tx_assign(T2, ...) 1226 * 2 <- dmu_tx_get_txg(T2) 1227 * dmu_tx_assign(T3, ...) 1228 * 1 <- dmu_tx_get_txg(T3) 1229 */ 1230 int 1231 dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags) 1232 { 1233 int err; 1234 1235 ASSERT(tx->tx_txg == 0); 1236 ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND)); 1237 IMPLY(flags & DMU_TX_SUSPEND, flags & DMU_TX_WAIT); 1238 ASSERT(!dsl_pool_sync_context(tx->tx_pool)); 1239 1240 /* If we might wait, we must not hold the config lock. 
int
dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND));
	IMPLY(flags & DMU_TX_SUSPEND, flags & DMU_TX_WAIT);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	/* If we might wait, we must not hold the config lock. */
	IMPLY((flags & DMU_TX_WAIT), !dsl_pool_config_held(tx->tx_pool));

	if ((flags & DMU_TX_NOTHROTTLE))
		tx->tx_dirty_delayed = B_TRUE;

	if (!(flags & DMU_TX_SUSPEND))
		tx->tx_break_on_suspend = B_TRUE;

	while ((err = dmu_tx_try_assign(tx)) != 0) {
		dmu_tx_unassign(tx);

		boolean_t suspended = (err == ESHUTDOWN);
		if (suspended) {
			/*
			 * Pool suspended. We need to decide whether to block
			 * and retry, or return error, depending on the
			 * caller's flags and the pool config.
			 */
			if (flags & DMU_TX_SUSPEND)
				/*
				 * The caller expressly does not care about
				 * suspend, so treat it as a normal retry.
				 */
				err = SET_ERROR(ERESTART);
			else if ((flags & DMU_TX_WAIT) &&
			    spa_get_failmode(tx->tx_pool->dp_spa) ==
			    ZIO_FAILURE_MODE_CONTINUE)
				/*
				 * Caller wants to wait, but pool config is
				 * overriding that, so return EIO to be
				 * propagated back to userspace.
				 */
				err = SET_ERROR(EIO);
			else
				/* Anything else, we should just block. */
				err = SET_ERROR(ERESTART);
		}

		/*
		 * Return unless we decided to retry, or the caller does not
		 * want to block.
		 */
		if (err != ERESTART || !(flags & DMU_TX_WAIT)) {
			ASSERT(err == EDQUOT || err == ENOSPC ||
			    err == ERESTART || err == EIO);
			return (err);
		}

		/*
		 * Wait until there's room in this txg, or until it's been
		 * synced out and a new one is available.
		 *
		 * If we're here because the pool suspended above, then we
		 * unset tx_break_on_suspend to make sure that if dmu_tx_wait()
		 * has to fall back to a txg_wait_synced_flags(), it doesn't
		 * immediately return because the pool is suspended. That would
		 * then immediately return here, and we'd end up in a busy loop
		 * until the pool resumes.
		 *
		 * On the other hand, if the pool hasn't suspended yet, then it
		 * should be allowed to break a txg wait if the pool does
		 * suspend, so we can loop and reassess it in
		 * dmu_tx_try_assign().
		 */
		if (suspended)
			tx->tx_break_on_suspend = B_FALSE;

		dmu_tx_wait(tx);

		/*
		 * Reset tx_break_on_suspend for DMU_TX_SUSPEND. We do this
		 * here so that it's available if we return for some other
		 * reason, and then the caller calls dmu_tx_wait().
		 */
		if (!(flags & DMU_TX_SUSPEND))
			tx->tx_break_on_suspend = B_TRUE;
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}

void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;
	hrtime_t before;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	/*
	 * Break on suspend according to whether or not DMU_TX_SUSPEND was
	 * supplied to the previous dmu_tx_assign() call. For clients, this
	 * ensures that after dmu_tx_assign() fails, the followup dmu_tx_wait()
	 * gets the same behaviour wrt suspend. See also the comments in
	 * dmu_tx_assign().
	 */
	txg_wait_flag_t flags =
	    (tx->tx_break_on_suspend ? TXG_WAIT_SUSPEND : TXG_WAIT_NONE);

	before = gethrtime();

	if (tx->tx_wait_dirty) {
		uint64_t dirty;

		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.
1353 */ 1354 mutex_enter(&dp->dp_lock); 1355 if (dp->dp_dirty_total >= zfs_dirty_data_max) 1356 DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); 1357 while (dp->dp_dirty_total >= zfs_dirty_data_max) 1358 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); 1359 dirty = dp->dp_dirty_total; 1360 mutex_exit(&dp->dp_lock); 1361 1362 dmu_tx_delay(tx, dirty); 1363 1364 tx->tx_wait_dirty = B_FALSE; 1365 1366 /* 1367 * Note: setting tx_dirty_delayed only has effect if the 1368 * caller used DMU_TX_WAIT. Otherwise they are going to 1369 * destroy this tx and try again. The common case, 1370 * zfs_write(), uses DMU_TX_WAIT. 1371 */ 1372 tx->tx_dirty_delayed = B_TRUE; 1373 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { 1374 /* 1375 * If the pool is suspended we need to wait until it 1376 * is resumed. Note that it's possible that the pool 1377 * has become active after this thread has tried to 1378 * obtain a tx. If that's the case then tx_lasttried_txg 1379 * would not have been set. 1380 */ 1381 txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags); 1382 } else if (tx->tx_needassign_txh) { 1383 dnode_t *dn = tx->tx_needassign_txh->txh_dnode; 1384 1385 mutex_enter(&dn->dn_mtx); 1386 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) 1387 cv_wait(&dn->dn_notxholds, &dn->dn_mtx); 1388 mutex_exit(&dn->dn_mtx); 1389 tx->tx_needassign_txh = NULL; 1390 } else { 1391 /* 1392 * If we have a lot of dirty data just wait until we sync 1393 * out a TXG at which point we'll hopefully have synced 1394 * a portion of the changes. 1395 */ 1396 txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags); 1397 } 1398 1399 spa_tx_assign_add_nsecs(spa, gethrtime() - before); 1400 } 1401 1402 static void 1403 dmu_tx_destroy(dmu_tx_t *tx) 1404 { 1405 dmu_tx_hold_t *txh; 1406 1407 while ((txh = list_head(&tx->tx_holds)) != NULL) { 1408 dnode_t *dn = txh->txh_dnode; 1409 1410 list_remove(&tx->tx_holds, txh); 1411 zfs_refcount_destroy_many(&txh->txh_space_towrite, 1412 zfs_refcount_count(&txh->txh_space_towrite)); 1413 zfs_refcount_destroy_many(&txh->txh_memory_tohold, 1414 zfs_refcount_count(&txh->txh_memory_tohold)); 1415 kmem_free(txh, sizeof (dmu_tx_hold_t)); 1416 if (dn != NULL) 1417 dnode_rele(dn, tx); 1418 } 1419 1420 list_destroy(&tx->tx_callbacks); 1421 list_destroy(&tx->tx_holds); 1422 kmem_free(tx, sizeof (dmu_tx_t)); 1423 } 1424 1425 void 1426 dmu_tx_commit(dmu_tx_t *tx) 1427 { 1428 /* This function should only be used on assigned transactions. */ 1429 ASSERT(tx->tx_txg != 0); 1430 1431 /* 1432 * Go through the transaction's hold list and remove holds on 1433 * associated dnodes, notifying waiters if no holds remain. 
1434 */ 1435 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; 1436 txh = list_next(&tx->tx_holds, txh)) { 1437 dnode_t *dn = txh->txh_dnode; 1438 1439 if (dn == NULL) 1440 continue; 1441 1442 mutex_enter(&dn->dn_mtx); 1443 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1444 1445 if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { 1446 dn->dn_assigned_txg = 0; 1447 cv_broadcast(&dn->dn_notxholds); 1448 } 1449 mutex_exit(&dn->dn_mtx); 1450 } 1451 1452 if (tx->tx_tempreserve_cookie) 1453 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); 1454 1455 if (!list_is_empty(&tx->tx_callbacks)) 1456 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); 1457 1458 if (tx->tx_anyobj == FALSE) 1459 txg_rele_to_sync(&tx->tx_txgh); 1460 1461 dmu_tx_destroy(tx); 1462 } 1463 1464 void 1465 dmu_tx_abort(dmu_tx_t *tx) 1466 { 1467 /* This function should not be used on assigned transactions. */ 1468 ASSERT0(tx->tx_txg); 1469 1470 /* Should not be needed, but better be safe than sorry. */ 1471 if (tx->tx_tempreserve_cookie) 1472 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); 1473 1474 /* 1475 * Call any registered callbacks with an error code. 1476 */ 1477 if (!list_is_empty(&tx->tx_callbacks)) 1478 dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED)); 1479 1480 /* Should not be needed, but better be safe than sorry. */ 1481 dmu_tx_unassign(tx); 1482 1483 dmu_tx_destroy(tx); 1484 } 1485 1486 uint64_t 1487 dmu_tx_get_txg(dmu_tx_t *tx) 1488 { 1489 ASSERT(tx->tx_txg != 0); 1490 return (tx->tx_txg); 1491 } 1492 1493 dsl_pool_t * 1494 dmu_tx_pool(dmu_tx_t *tx) 1495 { 1496 ASSERT(tx->tx_pool != NULL); 1497 return (tx->tx_pool); 1498 } 1499 1500 /* 1501 * Register a callback to be executed at the end of a TXG. 1502 * 1503 * Note: This currently exists for outside consumers, specifically the ZFS OSD 1504 * for Lustre. Please do not remove before checking that project. For examples 1505 * on how to use this see `ztest_commit_callback`. 1506 */ 1507 void 1508 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) 1509 { 1510 dmu_tx_callback_t *dcb; 1511 1512 dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); 1513 1514 dcb->dcb_func = func; 1515 dcb->dcb_data = data; 1516 1517 list_insert_tail(&tx->tx_callbacks, dcb); 1518 } 1519 1520 /* 1521 * Call all the commit callbacks on a list, with a given error code. 1522 */ 1523 void 1524 dmu_tx_do_callbacks(list_t *cb_list, int error) 1525 { 1526 dmu_tx_callback_t *dcb; 1527 1528 while ((dcb = list_remove_tail(cb_list)) != NULL) { 1529 dcb->dcb_func(dcb->dcb_data, error); 1530 kmem_free(dcb, sizeof (dmu_tx_callback_t)); 1531 } 1532 } 1533 1534 /* 1535 * Interface to hold a bunch of attributes. 1536 * used for creating new files. 1537 * attrsize is the total size of all attributes 1538 * to be added during object creation 1539 * 1540 * For updating/adding a single attribute dmu_tx_hold_sa() should be used. 1541 */ 1542 1543 /* 1544 * hold necessary attribute name for attribute registration. 1545 * should be a very rare case where this is needed. If it does 1546 * happen it would only happen on the first write to the file system. 
1547 */ 1548 static void 1549 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) 1550 { 1551 if (!sa->sa_need_attr_registration) 1552 return; 1553 1554 for (int i = 0; i != sa->sa_num_attrs; i++) { 1555 if (!sa->sa_attr_table[i].sa_registered) { 1556 if (sa->sa_reg_attr_obj) 1557 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, 1558 B_TRUE, sa->sa_attr_table[i].sa_name); 1559 else 1560 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1561 B_TRUE, sa->sa_attr_table[i].sa_name); 1562 } 1563 } 1564 } 1565 1566 void 1567 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) 1568 { 1569 dmu_tx_hold_t *txh; 1570 1571 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, 1572 THT_SPILL, 0, 0); 1573 if (txh != NULL) 1574 (void) zfs_refcount_add_many(&txh->txh_space_towrite, 1575 SPA_OLD_MAXBLOCKSIZE, FTAG); 1576 } 1577 1578 void 1579 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) 1580 { 1581 sa_os_t *sa = tx->tx_objset->os_sa; 1582 1583 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1584 1585 if (tx->tx_objset->os_sa->sa_master_obj == 0) 1586 return; 1587 1588 if (tx->tx_objset->os_sa->sa_layout_attr_obj) { 1589 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); 1590 } else { 1591 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); 1592 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); 1593 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1594 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1595 } 1596 1597 dmu_tx_sa_registration_hold(sa, tx); 1598 1599 if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) 1600 return; 1601 1602 (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, 1603 THT_SPILL, 0, 0); 1604 } 1605 1606 /* 1607 * Hold SA attribute 1608 * 1609 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) 1610 * 1611 * variable_size is the total size of all variable sized attributes 1612 * passed to this function. It is not the total size of all 1613 * variable size attributes that *may* exist on this object. 
1614 */ 1615 void 1616 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) 1617 { 1618 uint64_t object; 1619 sa_os_t *sa = tx->tx_objset->os_sa; 1620 1621 ASSERT(hdl != NULL); 1622 1623 object = sa_handle_object(hdl); 1624 1625 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; 1626 DB_DNODE_ENTER(db); 1627 dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db)); 1628 DB_DNODE_EXIT(db); 1629 1630 if (tx->tx_objset->os_sa->sa_master_obj == 0) 1631 return; 1632 1633 if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || 1634 tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { 1635 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); 1636 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); 1637 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1638 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1639 } 1640 1641 dmu_tx_sa_registration_hold(sa, tx); 1642 1643 if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) 1644 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); 1645 1646 if (sa->sa_force_spill || may_grow || hdl->sa_spill) { 1647 ASSERT(tx->tx_txg == 0); 1648 dmu_tx_hold_spill(tx, object); 1649 } else { 1650 DB_DNODE_ENTER(db); 1651 if (DB_DNODE(db)->dn_have_spill) { 1652 ASSERT(tx->tx_txg == 0); 1653 dmu_tx_hold_spill(tx, object); 1654 } 1655 DB_DNODE_EXIT(db); 1656 } 1657 } 1658 1659 void 1660 dmu_tx_init(void) 1661 { 1662 dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", 1663 KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), 1664 KSTAT_FLAG_VIRTUAL); 1665 1666 if (dmu_tx_ksp != NULL) { 1667 dmu_tx_ksp->ks_data = &dmu_tx_stats; 1668 kstat_install(dmu_tx_ksp); 1669 } 1670 } 1671 1672 void 1673 dmu_tx_fini(void) 1674 { 1675 if (dmu_tx_ksp != NULL) { 1676 kstat_delete(dmu_tx_ksp); 1677 dmu_tx_ksp = NULL; 1678 } 1679 } 1680 1681 #if defined(_KERNEL) 1682 EXPORT_SYMBOL(dmu_tx_create); 1683 EXPORT_SYMBOL(dmu_tx_hold_write); 1684 EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); 1685 EXPORT_SYMBOL(dmu_tx_hold_append); 1686 EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); 1687 EXPORT_SYMBOL(dmu_tx_hold_free); 1688 EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); 1689 EXPORT_SYMBOL(dmu_tx_hold_zap); 1690 EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode); 1691 EXPORT_SYMBOL(dmu_tx_hold_bonus); 1692 EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode); 1693 EXPORT_SYMBOL(dmu_tx_abort); 1694 EXPORT_SYMBOL(dmu_tx_assign); 1695 EXPORT_SYMBOL(dmu_tx_wait); 1696 EXPORT_SYMBOL(dmu_tx_commit); 1697 EXPORT_SYMBOL(dmu_tx_mark_netfree); 1698 EXPORT_SYMBOL(dmu_tx_get_txg); 1699 EXPORT_SYMBOL(dmu_tx_callback_register); 1700 EXPORT_SYMBOL(dmu_tx_do_callbacks); 1701 EXPORT_SYMBOL(dmu_tx_hold_spill); 1702 EXPORT_SYMBOL(dmu_tx_hold_sa_create); 1703 EXPORT_SYMBOL(dmu_tx_hold_sa); 1704 #endif 1705