1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/dmu.h> 29 #include <sys/dmu_impl.h> 30 #include <sys/dbuf.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/dmu_objset.h> 33 #include <sys/dsl_dataset.h> 34 #include <sys/dsl_dir.h> 35 #include <sys/dsl_pool.h> 36 #include <sys/zap_impl.h> 37 #include <sys/spa.h> 38 #include <sys/sa.h> 39 #include <sys/sa_impl.h> 40 #include <sys/zfs_context.h> 41 #include <sys/varargs.h> 42 43 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, 44 uint64_t arg1, uint64_t arg2); 45 46 47 dmu_tx_t * 48 dmu_tx_create_dd(dsl_dir_t *dd) 49 { 50 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); 51 tx->tx_dir = dd; 52 if (dd != NULL) 53 tx->tx_pool = dd->dd_pool; 54 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), 55 offsetof(dmu_tx_hold_t, txh_node)); 56 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), 57 offsetof(dmu_tx_callback_t, dcb_node)); 58 tx->tx_start = gethrtime(); 59 return (tx); 60 } 61 62 dmu_tx_t * 63 dmu_tx_create(objset_t *os) 64 { 65 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); 66 tx->tx_objset = os; 67 return (tx); 68 } 69 70 dmu_tx_t * 71 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) 72 { 73 dmu_tx_t *tx = dmu_tx_create_dd(NULL); 74 75 ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 76 tx->tx_pool = dp; 77 tx->tx_txg = txg; 78 tx->tx_anyobj = TRUE; 79 80 return (tx); 81 } 82 83 int 84 dmu_tx_is_syncing(dmu_tx_t *tx) 85 { 86 return (tx->tx_anyobj); 87 } 88 89 int 90 dmu_tx_private_ok(dmu_tx_t *tx) 91 { 92 return (tx->tx_anyobj); 93 } 94 95 static dmu_tx_hold_t * 96 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, 97 enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) 98 { 99 dmu_tx_hold_t *txh; 100 dnode_t *dn = NULL; 101 int err; 102 103 if (object != DMU_NEW_OBJECT) { 104 err = dnode_hold(os, object, tx, &dn); 105 if (err) { 106 tx->tx_err = err; 107 return (NULL); 108 } 109 110 if (err == 0 && tx->tx_txg != 0) { 111 mutex_enter(&dn->dn_mtx); 112 /* 113 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a 114 * problem, but there's no way for it to happen (for 115 * now, at least). 116 */ 117 ASSERT(dn->dn_assigned_txg == 0); 118 dn->dn_assigned_txg = tx->tx_txg; 119 (void) refcount_add(&dn->dn_tx_holds, tx); 120 mutex_exit(&dn->dn_mtx); 121 } 122 } 123 124 txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); 125 txh->txh_tx = tx; 126 txh->txh_dnode = dn; 127 refcount_create(&txh->txh_space_towrite); 128 refcount_create(&txh->txh_memory_tohold); 129 txh->txh_type = type; 130 txh->txh_arg1 = arg1; 131 txh->txh_arg2 = arg2; 132 list_insert_tail(&tx->tx_holds, txh); 133 134 return (txh); 135 } 136 137 void 138 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) 139 { 140 /* 141 * If we're syncing, they can manipulate any object anyhow, and 142 * the hold on the dnode_t can cause problems. 143 */ 144 if (!dmu_tx_is_syncing(tx)) { 145 (void) dmu_tx_hold_object_impl(tx, os, 146 object, THT_NEWOBJECT, 0, 0); 147 } 148 } 149 150 /* 151 * This function reads specified data from disk. The specified data will 152 * be needed to perform the transaction -- i.e, it will be read after 153 * we do dmu_tx_assign(). There are two reasons that we read the data now 154 * (before dmu_tx_assign()): 155 * 156 * 1. Reading it now has potentially better performance. The transaction 157 * has not yet been assigned, so the TXG is not held open, and also the 158 * caller typically has less locks held when calling dmu_tx_hold_*() than 159 * after the transaction has been assigned. This reduces the lock (and txg) 160 * hold times, thus reducing lock contention. 161 * 162 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors 163 * that are detected before they start making changes to the DMU state 164 * (i.e. now). Once the transaction has been assigned, and some DMU 165 * state has been changed, it can be difficult to recover from an i/o 166 * error (e.g. to undo the changes already made in memory at the DMU 167 * layer). Typically code to do so does not exist in the caller -- it 168 * assumes that the data has already been cached and thus i/o errors are 169 * not possible. 170 * 171 * It has been observed that the i/o initiated here can be a performance 172 * problem, and it appears to be optional, because we don't look at the 173 * data which is read. However, removing this read would only serve to 174 * move the work elsewhere (after the dmu_tx_assign()), where it may 175 * have a greater impact on performance (in addition to the impact on 176 * fault tolerance noted above). 177 */ 178 static int 179 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) 180 { 181 int err; 182 dmu_buf_impl_t *db; 183 184 rw_enter(&dn->dn_struct_rwlock, RW_READER); 185 db = dbuf_hold_level(dn, level, blkid, FTAG); 186 rw_exit(&dn->dn_struct_rwlock); 187 if (db == NULL) 188 return (SET_ERROR(EIO)); 189 err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); 190 dbuf_rele(db, FTAG); 191 return (err); 192 } 193 194 /* ARGSUSED */ 195 static void 196 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 197 { 198 dnode_t *dn = txh->txh_dnode; 199 int err = 0; 200 201 if (len == 0) 202 return; 203 204 (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); 205 206 if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) 207 err = SET_ERROR(EFBIG); 208 209 if (dn == NULL) 210 return; 211 212 /* 213 * For i/o error checking, read the blocks that will be needed 214 * to perform the write: the first and last level-0 blocks (if 215 * they are not aligned, i.e. if they are partial-block writes), 216 * and all the level-1 blocks. 217 */ 218 if (dn->dn_maxblkid == 0) { 219 if (off < dn->dn_datablksz && 220 (off > 0 || len < dn->dn_datablksz)) { 221 err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 222 if (err != 0) { 223 txh->txh_tx->tx_err = err; 224 } 225 } 226 } else { 227 zio_t *zio = zio_root(dn->dn_objset->os_spa, 228 NULL, NULL, ZIO_FLAG_CANFAIL); 229 230 /* first level-0 block */ 231 uint64_t start = off >> dn->dn_datablkshift; 232 if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { 233 err = dmu_tx_check_ioerr(zio, dn, 0, start); 234 if (err != 0) { 235 txh->txh_tx->tx_err = err; 236 } 237 } 238 239 /* last level-0 block */ 240 uint64_t end = (off + len - 1) >> dn->dn_datablkshift; 241 if (end != start && end <= dn->dn_maxblkid && 242 P2PHASE(off + len, dn->dn_datablksz)) { 243 err = dmu_tx_check_ioerr(zio, dn, 0, end); 244 if (err != 0) { 245 txh->txh_tx->tx_err = err; 246 } 247 } 248 249 /* level-1 blocks */ 250 if (dn->dn_nlevels > 1) { 251 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 252 for (uint64_t i = (start >> shft) + 1; 253 i < end >> shft; i++) { 254 err = dmu_tx_check_ioerr(zio, dn, 1, i); 255 if (err != 0) { 256 txh->txh_tx->tx_err = err; 257 } 258 } 259 } 260 261 err = zio_wait(zio); 262 if (err != 0) { 263 txh->txh_tx->tx_err = err; 264 } 265 } 266 } 267 268 static void 269 dmu_tx_count_dnode(dmu_tx_hold_t *txh) 270 { 271 (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG); 272 } 273 274 void 275 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) 276 { 277 dmu_tx_hold_t *txh; 278 279 ASSERT0(tx->tx_txg); 280 ASSERT3U(len, <=, DMU_MAX_ACCESS); 281 ASSERT(len == 0 || UINT64_MAX - off >= len - 1); 282 283 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 284 object, THT_WRITE, off, len); 285 if (txh == NULL) 286 return; 287 288 dmu_tx_count_write(txh, off, len); 289 dmu_tx_count_dnode(txh); 290 } 291 292 /* 293 * This function marks the transaction as being a "net free". The end 294 * result is that refquotas will be disabled for this transaction, and 295 * this transaction will be able to use half of the pool space overhead 296 * (see dsl_pool_adjustedsize()). Therefore this function should only 297 * be called for transactions that we expect will not cause a net increase 298 * in the amount of space used (but it's OK if that is occasionally not true). 299 */ 300 void 301 dmu_tx_mark_netfree(dmu_tx_t *tx) 302 { 303 tx->tx_netfree = B_TRUE; 304 } 305 306 void 307 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) 308 { 309 int err; 310 311 ASSERT(tx->tx_txg == 0); 312 313 dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 314 object, THT_FREE, off, len); 315 if (txh == NULL) 316 return; 317 dnode_t *dn = txh->txh_dnode; 318 dmu_tx_count_dnode(txh); 319 320 if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) 321 return; 322 if (len == DMU_OBJECT_END) 323 len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; 324 325 /* 326 * For i/o error checking, we read the first and last level-0 327 * blocks if they are not aligned, and all the level-1 blocks. 328 * 329 * Note: dbuf_free_range() assumes that we have not instantiated 330 * any level-0 dbufs that will be completely freed. Therefore we must 331 * exercise care to not read or count the first and last blocks 332 * if they are blocksize-aligned. 333 */ 334 if (dn->dn_datablkshift == 0) { 335 if (off != 0 || len < dn->dn_datablksz) 336 dmu_tx_count_write(txh, 0, dn->dn_datablksz); 337 } else { 338 /* first block will be modified if it is not aligned */ 339 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) 340 dmu_tx_count_write(txh, off, 1); 341 /* last block will be modified if it is not aligned */ 342 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) 343 dmu_tx_count_write(txh, off + len, 1); 344 } 345 346 /* 347 * Check level-1 blocks. 348 */ 349 if (dn->dn_nlevels > 1) { 350 int shift = dn->dn_datablkshift + dn->dn_indblkshift - 351 SPA_BLKPTRSHIFT; 352 uint64_t start = off >> shift; 353 uint64_t end = (off + len) >> shift; 354 355 ASSERT(dn->dn_indblkshift != 0); 356 357 /* 358 * dnode_reallocate() can result in an object with indirect 359 * blocks having an odd data block size. In this case, 360 * just check the single block. 361 */ 362 if (dn->dn_datablkshift == 0) 363 start = end = 0; 364 365 zio_t *zio = zio_root(tx->tx_pool->dp_spa, 366 NULL, NULL, ZIO_FLAG_CANFAIL); 367 for (uint64_t i = start; i <= end; i++) { 368 uint64_t ibyte = i << shift; 369 err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); 370 i = ibyte >> shift; 371 if (err == ESRCH || i > end) 372 break; 373 if (err != 0) { 374 tx->tx_err = err; 375 (void) zio_wait(zio); 376 return; 377 } 378 379 (void) refcount_add_many(&txh->txh_memory_tohold, 380 1 << dn->dn_indblkshift, FTAG); 381 382 err = dmu_tx_check_ioerr(zio, dn, 1, i); 383 if (err != 0) { 384 tx->tx_err = err; 385 (void) zio_wait(zio); 386 return; 387 } 388 } 389 err = zio_wait(zio); 390 if (err != 0) { 391 tx->tx_err = err; 392 return; 393 } 394 } 395 } 396 397 void 398 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) 399 { 400 int err; 401 402 ASSERT(tx->tx_txg == 0); 403 404 dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 405 object, THT_ZAP, add, (uintptr_t)name); 406 if (txh == NULL) 407 return; 408 dnode_t *dn = txh->txh_dnode; 409 410 dmu_tx_count_dnode(txh); 411 412 /* 413 * Modifying a almost-full microzap is around the worst case (128KB) 414 * 415 * If it is a fat zap, the worst case would be 7*16KB=112KB: 416 * - 3 blocks overwritten: target leaf, ptrtbl block, header block 417 * - 4 new blocks written if adding: 418 * - 2 blocks for possibly split leaves, 419 * - 2 grown ptrtbl blocks 420 */ 421 (void) refcount_add_many(&txh->txh_space_towrite, 422 MZAP_MAX_BLKSZ, FTAG); 423 424 if (dn == NULL) 425 return; 426 427 ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); 428 429 if (dn->dn_maxblkid == 0 || name == NULL) { 430 /* 431 * This is a microzap (only one block), or we don't know 432 * the name. Check the first block for i/o errors. 433 */ 434 err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 435 if (err != 0) { 436 tx->tx_err = err; 437 } 438 } else { 439 /* 440 * Access the name so that we'll check for i/o errors to 441 * the leaf blocks, etc. We ignore ENOENT, as this name 442 * may not yet exist. 443 */ 444 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); 445 if (err == EIO || err == ECKSUM || err == ENXIO) { 446 tx->tx_err = err; 447 } 448 } 449 } 450 451 void 452 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 453 { 454 dmu_tx_hold_t *txh; 455 456 ASSERT(tx->tx_txg == 0); 457 458 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 459 object, THT_BONUS, 0, 0); 460 if (txh) 461 dmu_tx_count_dnode(txh); 462 } 463 464 void 465 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 466 { 467 dmu_tx_hold_t *txh; 468 ASSERT(tx->tx_txg == 0); 469 470 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 471 DMU_NEW_OBJECT, THT_SPACE, space, 0); 472 473 (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); 474 } 475 476 #ifdef ZFS_DEBUG 477 void 478 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) 479 { 480 boolean_t match_object = B_FALSE; 481 boolean_t match_offset = B_FALSE; 482 483 DB_DNODE_ENTER(db); 484 dnode_t *dn = DB_DNODE(db); 485 ASSERT(tx->tx_txg != 0); 486 ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); 487 ASSERT3U(dn->dn_object, ==, db->db.db_object); 488 489 if (tx->tx_anyobj) { 490 DB_DNODE_EXIT(db); 491 return; 492 } 493 494 /* XXX No checking on the meta dnode for now */ 495 if (db->db.db_object == DMU_META_DNODE_OBJECT) { 496 DB_DNODE_EXIT(db); 497 return; 498 } 499 500 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; 501 txh = list_next(&tx->tx_holds, txh)) { 502 ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); 503 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) 504 match_object = TRUE; 505 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { 506 int datablkshift = dn->dn_datablkshift ? 507 dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; 508 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 509 int shift = datablkshift + epbs * db->db_level; 510 uint64_t beginblk = shift >= 64 ? 0 : 511 (txh->txh_arg1 >> shift); 512 uint64_t endblk = shift >= 64 ? 0 : 513 ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); 514 uint64_t blkid = db->db_blkid; 515 516 /* XXX txh_arg2 better not be zero... */ 517 518 dprintf("found txh type %x beginblk=%llx endblk=%llx\n", 519 txh->txh_type, beginblk, endblk); 520 521 switch (txh->txh_type) { 522 case THT_WRITE: 523 if (blkid >= beginblk && blkid <= endblk) 524 match_offset = TRUE; 525 /* 526 * We will let this hold work for the bonus 527 * or spill buffer so that we don't need to 528 * hold it when creating a new object. 529 */ 530 if (blkid == DMU_BONUS_BLKID || 531 blkid == DMU_SPILL_BLKID) 532 match_offset = TRUE; 533 /* 534 * They might have to increase nlevels, 535 * thus dirtying the new TLIBs. Or the 536 * might have to change the block size, 537 * thus dirying the new lvl=0 blk=0. 538 */ 539 if (blkid == 0) 540 match_offset = TRUE; 541 break; 542 case THT_FREE: 543 /* 544 * We will dirty all the level 1 blocks in 545 * the free range and perhaps the first and 546 * last level 0 block. 547 */ 548 if (blkid >= beginblk && (blkid <= endblk || 549 txh->txh_arg2 == DMU_OBJECT_END)) 550 match_offset = TRUE; 551 break; 552 case THT_SPILL: 553 if (blkid == DMU_SPILL_BLKID) 554 match_offset = TRUE; 555 break; 556 case THT_BONUS: 557 if (blkid == DMU_BONUS_BLKID) 558 match_offset = TRUE; 559 break; 560 case THT_ZAP: 561 match_offset = TRUE; 562 break; 563 case THT_NEWOBJECT: 564 match_object = TRUE; 565 break; 566 default: 567 ASSERT(!"bad txh_type"); 568 } 569 } 570 if (match_object && match_offset) { 571 DB_DNODE_EXIT(db); 572 return; 573 } 574 } 575 DB_DNODE_EXIT(db); 576 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", 577 (u_longlong_t)db->db.db_object, db->db_level, 578 (u_longlong_t)db->db_blkid); 579 } 580 #endif 581 582 /* 583 * If we can't do 10 iops, something is wrong. Let us go ahead 584 * and hit zfs_dirty_data_max. 585 */ 586 hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); 587 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ 588 589 /* 590 * We delay transactions when we've determined that the backend storage 591 * isn't able to accommodate the rate of incoming writes. 592 * 593 * If there is already a transaction waiting, we delay relative to when 594 * that transaction finishes waiting. This way the calculated min_time 595 * is independent of the number of threads concurrently executing 596 * transactions. 597 * 598 * If we are the only waiter, wait relative to when the transaction 599 * started, rather than the current time. This credits the transaction for 600 * "time already served", e.g. reading indirect blocks. 601 * 602 * The minimum time for a transaction to take is calculated as: 603 * min_time = scale * (dirty - min) / (max - dirty) 604 * min_time is then capped at zfs_delay_max_ns. 605 * 606 * The delay has two degrees of freedom that can be adjusted via tunables. 607 * The percentage of dirty data at which we start to delay is defined by 608 * zfs_delay_min_dirty_percent. This should typically be at or above 609 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to 610 * delay after writing at full speed has failed to keep up with the incoming 611 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly 612 * speaking, this variable determines the amount of delay at the midpoint of 613 * the curve. 614 * 615 * delay 616 * 10ms +-------------------------------------------------------------*+ 617 * | *| 618 * 9ms + *+ 619 * | *| 620 * 8ms + *+ 621 * | * | 622 * 7ms + * + 623 * | * | 624 * 6ms + * + 625 * | * | 626 * 5ms + * + 627 * | * | 628 * 4ms + * + 629 * | * | 630 * 3ms + * + 631 * | * | 632 * 2ms + (midpoint) * + 633 * | | ** | 634 * 1ms + v *** + 635 * | zfs_delay_scale ----------> ******** | 636 * 0 +-------------------------------------*********----------------+ 637 * 0% <- zfs_dirty_data_max -> 100% 638 * 639 * Note that since the delay is added to the outstanding time remaining on the 640 * most recent transaction, the delay is effectively the inverse of IOPS. 641 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve 642 * was chosen such that small changes in the amount of accumulated dirty data 643 * in the first 3/4 of the curve yield relatively small differences in the 644 * amount of delay. 645 * 646 * The effects can be easier to understand when the amount of delay is 647 * represented on a log scale: 648 * 649 * delay 650 * 100ms +-------------------------------------------------------------++ 651 * + + 652 * | | 653 * + *+ 654 * 10ms + *+ 655 * + ** + 656 * | (midpoint) ** | 657 * + | ** + 658 * 1ms + v **** + 659 * + zfs_delay_scale ----------> ***** + 660 * | **** | 661 * + **** + 662 * 100us + ** + 663 * + * + 664 * | * | 665 * + * + 666 * 10us + * + 667 * + + 668 * | | 669 * + + 670 * +--------------------------------------------------------------+ 671 * 0% <- zfs_dirty_data_max -> 100% 672 * 673 * Note here that only as the amount of dirty data approaches its limit does 674 * the delay start to increase rapidly. The goal of a properly tuned system 675 * should be to keep the amount of dirty data out of that range by first 676 * ensuring that the appropriate limits are set for the I/O scheduler to reach 677 * optimal throughput on the backend storage, and then by changing the value 678 * of zfs_delay_scale to increase the steepness of the curve. 679 */ 680 static void 681 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) 682 { 683 dsl_pool_t *dp = tx->tx_pool; 684 uint64_t delay_min_bytes = 685 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; 686 hrtime_t wakeup, min_tx_time, now; 687 688 if (dirty <= delay_min_bytes) 689 return; 690 691 /* 692 * The caller has already waited until we are under the max. 693 * We make them pass us the amount of dirty data so we don't 694 * have to handle the case of it being >= the max, which could 695 * cause a divide-by-zero if it's == the max. 696 */ 697 ASSERT3U(dirty, <, zfs_dirty_data_max); 698 699 now = gethrtime(); 700 min_tx_time = zfs_delay_scale * 701 (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); 702 if (now > tx->tx_start + min_tx_time) 703 return; 704 705 min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); 706 707 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, 708 uint64_t, min_tx_time); 709 710 mutex_enter(&dp->dp_lock); 711 wakeup = MAX(tx->tx_start + min_tx_time, 712 dp->dp_last_wakeup + min_tx_time); 713 dp->dp_last_wakeup = wakeup; 714 mutex_exit(&dp->dp_lock); 715 716 #ifdef _KERNEL 717 mutex_enter(&curthread->t_delay_lock); 718 while (cv_timedwait_hires(&curthread->t_delay_cv, 719 &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, 720 CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) 721 continue; 722 mutex_exit(&curthread->t_delay_lock); 723 #else 724 hrtime_t delta = wakeup - gethrtime(); 725 struct timespec ts; 726 ts.tv_sec = delta / NANOSEC; 727 ts.tv_nsec = delta % NANOSEC; 728 (void) nanosleep(&ts, NULL); 729 #endif 730 } 731 732 /* 733 * This routine attempts to assign the transaction to a transaction group. 734 * To do so, we must determine if there is sufficient free space on disk. 735 * 736 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() 737 * on it), then it is assumed that there is sufficient free space, 738 * unless there's insufficient slop space in the pool (see the comment 739 * above spa_slop_shift in spa_misc.c). 740 * 741 * If it is not a "netfree" transaction, then if the data already on disk 742 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or 743 * ENOSPC. Otherwise, if the current rough estimate of pending changes, 744 * plus the rough estimate of this transaction's changes, may exceed the 745 * allowed usage, then this will fail with ERESTART, which will cause the 746 * caller to wait for the pending changes to be written to disk (by waiting 747 * for the next TXG to open), and then check the space usage again. 748 * 749 * The rough estimate of pending changes is comprised of the sum of: 750 * 751 * - this transaction's holds' txh_space_towrite 752 * 753 * - dd_tempreserved[], which is the sum of in-flight transactions' 754 * holds' txh_space_towrite (i.e. those transactions that have called 755 * dmu_tx_assign() but not yet called dmu_tx_commit()). 756 * 757 * - dd_space_towrite[], which is the amount of dirtied dbufs. 758 * 759 * Note that all of these values are inflated by spa_get_worst_case_asize(), 760 * which means that we may get ERESTART well before we are actually in danger 761 * of running out of space, but this also mitigates any small inaccuracies 762 * in the rough estimate (e.g. txh_space_towrite doesn't take into account 763 * indirect blocks, and dd_space_towrite[] doesn't take into account changes 764 * to the MOS). 765 * 766 * Note that due to this algorithm, it is possible to exceed the allowed 767 * usage by one transaction. Also, as we approach the allowed usage, 768 * we will allow a very limited amount of changes into each TXG, thus 769 * decreasing performance. 770 */ 771 static int 772 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) 773 { 774 spa_t *spa = tx->tx_pool->dp_spa; 775 776 ASSERT0(tx->tx_txg); 777 778 if (tx->tx_err) 779 return (tx->tx_err); 780 781 if (spa_suspended(spa)) { 782 /* 783 * If the user has indicated a blocking failure mode 784 * then return ERESTART which will block in dmu_tx_wait(). 785 * Otherwise, return EIO so that an error can get 786 * propagated back to the VOP calls. 787 * 788 * Note that we always honor the txg_how flag regardless 789 * of the failuremode setting. 790 */ 791 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && 792 txg_how != TXG_WAIT) 793 return (SET_ERROR(EIO)); 794 795 return (SET_ERROR(ERESTART)); 796 } 797 798 if (!tx->tx_waited && 799 dsl_pool_need_dirty_delay(tx->tx_pool)) { 800 tx->tx_wait_dirty = B_TRUE; 801 return (SET_ERROR(ERESTART)); 802 } 803 804 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); 805 tx->tx_needassign_txh = NULL; 806 807 /* 808 * NB: No error returns are allowed after txg_hold_open, but 809 * before processing the dnode holds, due to the 810 * dmu_tx_unassign() logic. 811 */ 812 813 uint64_t towrite = 0; 814 uint64_t tohold = 0; 815 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; 816 txh = list_next(&tx->tx_holds, txh)) { 817 dnode_t *dn = txh->txh_dnode; 818 if (dn != NULL) { 819 mutex_enter(&dn->dn_mtx); 820 if (dn->dn_assigned_txg == tx->tx_txg - 1) { 821 mutex_exit(&dn->dn_mtx); 822 tx->tx_needassign_txh = txh; 823 return (SET_ERROR(ERESTART)); 824 } 825 if (dn->dn_assigned_txg == 0) 826 dn->dn_assigned_txg = tx->tx_txg; 827 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 828 (void) refcount_add(&dn->dn_tx_holds, tx); 829 mutex_exit(&dn->dn_mtx); 830 } 831 towrite += refcount_count(&txh->txh_space_towrite); 832 tohold += refcount_count(&txh->txh_memory_tohold); 833 } 834 835 /* needed allocation: worst-case estimate of write space */ 836 uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); 837 /* calculate memory footprint estimate */ 838 uint64_t memory = towrite + tohold; 839 840 if (tx->tx_dir != NULL && asize != 0) { 841 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, 842 asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); 843 if (err != 0) 844 return (err); 845 } 846 847 return (0); 848 } 849 850 static void 851 dmu_tx_unassign(dmu_tx_t *tx) 852 { 853 if (tx->tx_txg == 0) 854 return; 855 856 txg_rele_to_quiesce(&tx->tx_txgh); 857 858 /* 859 * Walk the transaction's hold list, removing the hold on the 860 * associated dnode, and notifying waiters if the refcount drops to 0. 861 */ 862 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); 863 txh != tx->tx_needassign_txh; 864 txh = list_next(&tx->tx_holds, txh)) { 865 dnode_t *dn = txh->txh_dnode; 866 867 if (dn == NULL) 868 continue; 869 mutex_enter(&dn->dn_mtx); 870 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 871 872 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 873 dn->dn_assigned_txg = 0; 874 cv_broadcast(&dn->dn_notxholds); 875 } 876 mutex_exit(&dn->dn_mtx); 877 } 878 879 txg_rele_to_sync(&tx->tx_txgh); 880 881 tx->tx_lasttried_txg = tx->tx_txg; 882 tx->tx_txg = 0; 883 } 884 885 /* 886 * Assign tx to a transaction group. txg_how can be one of: 887 * 888 * (1) TXG_WAIT. If the current open txg is full, waits until there's 889 * a new one. This should be used when you're not holding locks. 890 * It will only fail if we're truly out of space (or over quota). 891 * 892 * (2) TXG_NOWAIT. If we can't assign into the current open txg without 893 * blocking, returns immediately with ERESTART. This should be used 894 * whenever you're holding locks. On an ERESTART error, the caller 895 * should drop locks, do a dmu_tx_wait(tx), and try again. 896 * 897 * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() 898 * has already been called on behalf of this operation (though 899 * most likely on a different tx). 900 */ 901 int 902 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) 903 { 904 int err; 905 906 ASSERT(tx->tx_txg == 0); 907 ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || 908 txg_how == TXG_WAITED); 909 ASSERT(!dsl_pool_sync_context(tx->tx_pool)); 910 911 /* If we might wait, we must not hold the config lock. */ 912 ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); 913 914 if (txg_how == TXG_WAITED) 915 tx->tx_waited = B_TRUE; 916 917 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { 918 dmu_tx_unassign(tx); 919 920 if (err != ERESTART || txg_how != TXG_WAIT) 921 return (err); 922 923 dmu_tx_wait(tx); 924 } 925 926 txg_rele_to_quiesce(&tx->tx_txgh); 927 928 return (0); 929 } 930 931 void 932 dmu_tx_wait(dmu_tx_t *tx) 933 { 934 spa_t *spa = tx->tx_pool->dp_spa; 935 dsl_pool_t *dp = tx->tx_pool; 936 937 ASSERT(tx->tx_txg == 0); 938 ASSERT(!dsl_pool_config_held(tx->tx_pool)); 939 940 if (tx->tx_wait_dirty) { 941 /* 942 * dmu_tx_try_assign() has determined that we need to wait 943 * because we've consumed much or all of the dirty buffer 944 * space. 945 */ 946 mutex_enter(&dp->dp_lock); 947 while (dp->dp_dirty_total >= zfs_dirty_data_max) 948 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); 949 uint64_t dirty = dp->dp_dirty_total; 950 mutex_exit(&dp->dp_lock); 951 952 dmu_tx_delay(tx, dirty); 953 954 tx->tx_wait_dirty = B_FALSE; 955 956 /* 957 * Note: setting tx_waited only has effect if the caller 958 * used TX_WAIT. Otherwise they are going to destroy 959 * this tx and try again. The common case, zfs_write(), 960 * uses TX_WAIT. 961 */ 962 tx->tx_waited = B_TRUE; 963 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { 964 /* 965 * If the pool is suspended we need to wait until it 966 * is resumed. Note that it's possible that the pool 967 * has become active after this thread has tried to 968 * obtain a tx. If that's the case then tx_lasttried_txg 969 * would not have been set. 970 */ 971 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); 972 } else if (tx->tx_needassign_txh) { 973 /* 974 * A dnode is assigned to the quiescing txg. Wait for its 975 * transaction to complete. 976 */ 977 dnode_t *dn = tx->tx_needassign_txh->txh_dnode; 978 979 mutex_enter(&dn->dn_mtx); 980 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) 981 cv_wait(&dn->dn_notxholds, &dn->dn_mtx); 982 mutex_exit(&dn->dn_mtx); 983 tx->tx_needassign_txh = NULL; 984 } else { 985 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); 986 } 987 } 988 989 static void 990 dmu_tx_destroy(dmu_tx_t *tx) 991 { 992 dmu_tx_hold_t *txh; 993 994 while ((txh = list_head(&tx->tx_holds)) != NULL) { 995 dnode_t *dn = txh->txh_dnode; 996 997 list_remove(&tx->tx_holds, txh); 998 refcount_destroy_many(&txh->txh_space_towrite, 999 refcount_count(&txh->txh_space_towrite)); 1000 refcount_destroy_many(&txh->txh_memory_tohold, 1001 refcount_count(&txh->txh_memory_tohold)); 1002 kmem_free(txh, sizeof (dmu_tx_hold_t)); 1003 if (dn != NULL) 1004 dnode_rele(dn, tx); 1005 } 1006 1007 list_destroy(&tx->tx_callbacks); 1008 list_destroy(&tx->tx_holds); 1009 kmem_free(tx, sizeof (dmu_tx_t)); 1010 } 1011 1012 void 1013 dmu_tx_commit(dmu_tx_t *tx) 1014 { 1015 ASSERT(tx->tx_txg != 0); 1016 1017 /* 1018 * Go through the transaction's hold list and remove holds on 1019 * associated dnodes, notifying waiters if no holds remain. 1020 */ 1021 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; 1022 txh = list_next(&tx->tx_holds, txh)) { 1023 dnode_t *dn = txh->txh_dnode; 1024 1025 if (dn == NULL) 1026 continue; 1027 1028 mutex_enter(&dn->dn_mtx); 1029 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1030 1031 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 1032 dn->dn_assigned_txg = 0; 1033 cv_broadcast(&dn->dn_notxholds); 1034 } 1035 mutex_exit(&dn->dn_mtx); 1036 } 1037 1038 if (tx->tx_tempreserve_cookie) 1039 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); 1040 1041 if (!list_is_empty(&tx->tx_callbacks)) 1042 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); 1043 1044 if (tx->tx_anyobj == FALSE) 1045 txg_rele_to_sync(&tx->tx_txgh); 1046 1047 dmu_tx_destroy(tx); 1048 } 1049 1050 void 1051 dmu_tx_abort(dmu_tx_t *tx) 1052 { 1053 ASSERT(tx->tx_txg == 0); 1054 1055 /* 1056 * Call any registered callbacks with an error code. 1057 */ 1058 if (!list_is_empty(&tx->tx_callbacks)) 1059 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); 1060 1061 dmu_tx_destroy(tx); 1062 } 1063 1064 uint64_t 1065 dmu_tx_get_txg(dmu_tx_t *tx) 1066 { 1067 ASSERT(tx->tx_txg != 0); 1068 return (tx->tx_txg); 1069 } 1070 1071 dsl_pool_t * 1072 dmu_tx_pool(dmu_tx_t *tx) 1073 { 1074 ASSERT(tx->tx_pool != NULL); 1075 return (tx->tx_pool); 1076 } 1077 1078 void 1079 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) 1080 { 1081 dmu_tx_callback_t *dcb; 1082 1083 dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); 1084 1085 dcb->dcb_func = func; 1086 dcb->dcb_data = data; 1087 1088 list_insert_tail(&tx->tx_callbacks, dcb); 1089 } 1090 1091 /* 1092 * Call all the commit callbacks on a list, with a given error code. 1093 */ 1094 void 1095 dmu_tx_do_callbacks(list_t *cb_list, int error) 1096 { 1097 dmu_tx_callback_t *dcb; 1098 1099 while ((dcb = list_head(cb_list)) != NULL) { 1100 list_remove(cb_list, dcb); 1101 dcb->dcb_func(dcb->dcb_data, error); 1102 kmem_free(dcb, sizeof (dmu_tx_callback_t)); 1103 } 1104 } 1105 1106 /* 1107 * Interface to hold a bunch of attributes. 1108 * used for creating new files. 1109 * attrsize is the total size of all attributes 1110 * to be added during object creation 1111 * 1112 * For updating/adding a single attribute dmu_tx_hold_sa() should be used. 1113 */ 1114 1115 /* 1116 * hold necessary attribute name for attribute registration. 1117 * should be a very rare case where this is needed. If it does 1118 * happen it would only happen on the first write to the file system. 1119 */ 1120 static void 1121 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) 1122 { 1123 if (!sa->sa_need_attr_registration) 1124 return; 1125 1126 for (int i = 0; i != sa->sa_num_attrs; i++) { 1127 if (!sa->sa_attr_table[i].sa_registered) { 1128 if (sa->sa_reg_attr_obj) 1129 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, 1130 B_TRUE, sa->sa_attr_table[i].sa_name); 1131 else 1132 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1133 B_TRUE, sa->sa_attr_table[i].sa_name); 1134 } 1135 } 1136 } 1137 1138 void 1139 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) 1140 { 1141 dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, 1142 tx->tx_objset, object, THT_SPILL, 0, 0); 1143 1144 (void) refcount_add_many(&txh->txh_space_towrite, 1145 SPA_OLD_MAXBLOCKSIZE, FTAG); 1146 } 1147 1148 void 1149 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) 1150 { 1151 sa_os_t *sa = tx->tx_objset->os_sa; 1152 1153 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1154 1155 if (tx->tx_objset->os_sa->sa_master_obj == 0) 1156 return; 1157 1158 if (tx->tx_objset->os_sa->sa_layout_attr_obj) { 1159 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); 1160 } else { 1161 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); 1162 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); 1163 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1164 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1165 } 1166 1167 dmu_tx_sa_registration_hold(sa, tx); 1168 1169 if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) 1170 return; 1171 1172 (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, 1173 THT_SPILL, 0, 0); 1174 } 1175 1176 /* 1177 * Hold SA attribute 1178 * 1179 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) 1180 * 1181 * variable_size is the total size of all variable sized attributes 1182 * passed to this function. It is not the total size of all 1183 * variable size attributes that *may* exist on this object. 1184 */ 1185 void 1186 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) 1187 { 1188 uint64_t object; 1189 sa_os_t *sa = tx->tx_objset->os_sa; 1190 1191 ASSERT(hdl != NULL); 1192 1193 object = sa_handle_object(hdl); 1194 1195 dmu_tx_hold_bonus(tx, object); 1196 1197 if (tx->tx_objset->os_sa->sa_master_obj == 0) 1198 return; 1199 1200 if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || 1201 tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { 1202 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); 1203 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); 1204 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1205 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1206 } 1207 1208 dmu_tx_sa_registration_hold(sa, tx); 1209 1210 if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) 1211 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); 1212 1213 if (sa->sa_force_spill || may_grow || hdl->sa_spill) { 1214 ASSERT(tx->tx_txg == 0); 1215 dmu_tx_hold_spill(tx, object); 1216 } else { 1217 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; 1218 dnode_t *dn; 1219 1220 DB_DNODE_ENTER(db); 1221 dn = DB_DNODE(db); 1222 if (dn->dn_have_spill) { 1223 ASSERT(tx->tx_txg == 0); 1224 dmu_tx_hold_spill(tx, object); 1225 } 1226 DB_DNODE_EXIT(db); 1227 } 1228 } 1229