/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h> /* for fzap_default_block_shift */
#include <sys/spa.h>
#include <sys/zfs_context.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);


dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
#ifdef ZFS_DEBUG
	refcount_create(&tx->tx_space_written);
	refcount_create(&tx->tx_space_freed);
#endif
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
	return (tx);
}

dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn = NULL;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os->os, object, tx, &dn);
		if (err) {
			tx->tx_err = err;
			return (NULL);
		}

		if (err == 0 && tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
#ifdef ZFS_DEBUG
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
#endif
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		(void) dmu_tx_hold_object_impl(tx, os,
		    object, THT_NEWOBJECT, 0, 0);
	}
}

static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (EIO);
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;


	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks (if they are not aligned), and all the level-1 blocks.
	 */

	if (dn) {
		if (dn->dn_maxblkid == 0) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err)
				goto out;
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (dn->dn_nlevels > 1) {
				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = start+1; i < end; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
		}
	}

	/*
	 * If there's more than one block, the blocksize can't change,
	 * so we can make a more precise estimate.  Alternatively,
	 * if the dnode's ibs is larger than max_ibs, always use that.
	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
	 * the code will still work correctly on existing pools.
	 */
	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
		min_ibs = max_ibs = dn->dn_indblkshift;
		if (dn->dn_datablkshift != 0)
			min_bs = max_bs = dn->dn_datablkshift;
	}

	/*
	 * 'end' is the last thing we will access, not one past.
	 * This way we won't overflow when accessing the last byte.
	 */
	start = P2ALIGN(off, 1ULL << max_bs);
	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
	txh->txh_space_towrite += end - start + 1;

	start >>= min_bs;
	end >>= min_bs;

	epbs = min_ibs - SPA_BLKPTRSHIFT;

	/*
	 * The object contains at most 2^(64 - min_bs) blocks,
	 * and each indirect level maps 2^epbs.
	 */
	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
		start >>= epbs;
		end >>= epbs;
		/*
		 * If we increase the number of levels of indirection,
		 * we'll need new blkid=0 indirect blocks.  If start == 0,
		 * we're already accounting for that block; and if end == 0,
		 * we can't increase the number of levels beyond that.
		 */
		if (start != 0 && end != 0)
			txh->txh_space_towrite += 1ULL << max_ibs;
		txh->txh_space_towrite += (end - start + 1) << max_ibs;
	}

	ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);

out:
	if (err)
		txh->txh_tx->tx_err = err;
}

static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	dnode_t *dn = txh->txh_dnode;
	dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
	uint64_t space = mdn->dn_datablksz +
	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);

	if (dn && dn->dn_dbuf->db_blkptr &&
	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
	    dn->dn_dbuf->db_blkptr->blk_birth)) {
		txh->txh_space_tooverwrite += space;
	} else {
		txh->txh_space_towrite += space;
		if (dn && dn->dn_dbuf->db_blkptr)
			txh->txh_space_tounref += space;
	}
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);
	ASSERT(len < DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh == NULL)
		return;

	dmu_tx_count_write(txh, off, len);
	dmu_tx_count_dnode(txh);
}

static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	uint64_t blkid, nblks, lastblk;
	uint64_t space = 0, unref = 0, skipped = 0;
	dnode_t *dn = txh->txh_dnode;
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
	int epbs;

	if (dn->dn_nlevels == 0)
		return;

	/*
	 * The struct_rwlock protects us against dn_nlevels
	 * changing, in case (against all odds) we manage to dirty &
	 * sync out the changes after we check for being dirty.
	 * Also, dbuf_hold_level() wants us to have the struct_rwlock.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	if (dn->dn_maxblkid == 0) {
		if (off == 0 && len >= dn->dn_datablksz) {
			blkid = 0;
			nblks = 1;
		} else {
			rw_exit(&dn->dn_struct_rwlock);
			return;
		}
	} else {
		blkid = off >> dn->dn_datablkshift;
		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;

		if (blkid >= dn->dn_maxblkid) {
			rw_exit(&dn->dn_struct_rwlock);
			return;
		}
		if (blkid + nblks > dn->dn_maxblkid)
			nblks = dn->dn_maxblkid - blkid;

	}
	if (dn->dn_nlevels == 1) {
		int i;
		for (i = 0; i < nblks; i++) {
			blkptr_t *bp = dn->dn_phys->dn_blkptr;
			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
			bp += blkid + i;
			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
				dprintf_bp(bp, "can free old%s", "");
				space += bp_get_dasize(spa, bp);
			}
			unref += BP_GET_ASIZE(bp);
		}
		nblks = 0;
	}

	/*
	 * Add in memory requirements of higher-level indirects.
	 * This assumes a worst-possible scenario for dn_nlevels.
	 */
	{
		uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
		int level = (dn->dn_nlevels > 1) ? 2 : 1;

		while (level++ < DN_MAX_LEVELS) {
			txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
			blkcnt = 1 + (blkcnt >> epbs);
		}
		ASSERT(blkcnt <= dn->dn_nblkptr);
	}

	lastblk = blkid + nblks - 1;
	while (nblks) {
		dmu_buf_impl_t *dbuf;
		uint64_t ibyte, new_blkid;
		int epb = 1 << epbs;
		int err, i, blkoff, tochk;
		blkptr_t *bp;

		ibyte = blkid << dn->dn_datablkshift;
		err = dnode_next_offset(dn,
		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
		new_blkid = ibyte >> dn->dn_datablkshift;
		if (err == ESRCH) {
			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
			break;
		}
		if (err) {
			txh->txh_tx->tx_err = err;
			break;
		}
		if (new_blkid > lastblk) {
			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
			break;
		}

		if (new_blkid > blkid) {
			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
			nblks -= new_blkid - blkid;
			blkid = new_blkid;
		}
		blkoff = P2PHASE(blkid, epb);
		tochk = MIN(epb - blkoff, nblks);

		dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);

		txh->txh_memory_tohold += dbuf->db.db_size;
		if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
			txh->txh_tx->tx_err = E2BIG;
			dbuf_rele(dbuf, FTAG);
			break;
		}
		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
			dbuf_rele(dbuf, FTAG);
			break;
		}

		bp = dbuf->db.db_data;
		bp += blkoff;

		for (i = 0; i < tochk; i++) {
			if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
				dprintf_bp(&bp[i], "can free old%s", "");
				space += bp_get_dasize(spa, &bp[i]);
			}
			unref += BP_GET_ASIZE(bp);
		}
		dbuf_rele(dbuf, FTAG);

		blkid += tochk;
		nblks -= tochk;
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* account for new level 1 indirect blocks that might show up */
	if (skipped > 0) {
		txh->txh_fudge += skipped << dn->dn_indblkshift;
		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
	}
	txh->txh_space_tofree += space;
	txh->txh_space_tounref += unref;
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn;
	uint64_t start, end, i;
	int err, shift;
	zio_t *zio;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh == NULL)
		return;
	dn = txh->txh_dnode;

	/* first block */
	if (off != 0)
		dmu_tx_count_write(txh, off, 1);
	/* last block */
	if (len != DMU_OBJECT_END)
		dmu_tx_count_write(txh, off+len, 1);

	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks, and all the level-1 blocks.  The above count_write's
	 * have already taken care of the level-0 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		start = off >> shift;
		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;

		zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH)
				break;
			if (err) {
				tx->tx_err = err;
				return;
			}

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err) {
				tx->tx_err = err;
				return;
			}
		}
		err = zio_wait(zio);
		if (err) {
			tx->tx_err = err;
			return;
		}
	}

	dmu_tx_count_dnode(txh);
	dmu_tx_count_free(txh, off, len);
}

void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn;
	uint64_t nblocks;
	int epbs, err;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh == NULL)
		return;
	dn = txh->txh_dnode;

	dmu_tx_count_dnode(txh);

	if (dn == NULL) {
		/*
		 * We will be able to fit a new object's entries into one leaf
		 * block.  So there will be at most 2 blocks total,
		 * including the header block.
		 */
		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
		return;
	}

	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);

	if (dn->dn_maxblkid == 0 && !add) {
		/*
		 * If there is only one block (i.e. this is a micro-zap)
		 * and we are not adding anything, the accounting is simple.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err) {
			tx->tx_err = err;
			return;
		}

		/*
		 * Use max block size here, since we don't know how much
		 * the size will change between now and the dbuf dirty call.
		 */
		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
		} else {
			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
			txh->txh_space_tounref +=
			    BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
		}
		return;
	}

	if (dn->dn_maxblkid > 0 && name) {
		/*
		 * access the name in this fat-zap so that we'll check
		 * for i/o errors to the leaf blocks, etc.
		 */
		err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
		    8, 0, NULL);
		if (err == EIO) {
			tx->tx_err = err;
			return;
		}
	}

	/*
	 * 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
	 */
	dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
	    (3 + (add ? 3 : 0)) << dn->dn_datablkshift);

	/*
	 * If the modified blocks are scattered to the four winds,
	 * we'll have to modify an indirect twig for each.
	 */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
		txh->txh_space_towrite += 3 << dn->dn_indblkshift;
}

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;
	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);

	txh->txh_space_towrite += space;
}

int
dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;
	int holds = 0;

	/*
	 * By asserting that the tx is assigned, we're counting the
	 * number of dn_tx_holds, which is the same as the number of
	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
	 * dn_tx_holds could be 0.
	 */
	ASSERT(tx->tx_txg != 0);

	/* if (tx->tx_anyobj == TRUE) */
		/* return (0); */

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
			holds++;
	}

	return (holds);
}

#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	dmu_tx_hold_t *txh;
	int match_object = FALSE, match_offset = FALSE;
	dnode_t *dn = db->db_dnode;

	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj)
		return;

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT)
		return;

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * buffer so that we don't need to hold it
				 * when creating a new object.
				 */
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset)
			return;
	}
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	dmu_tx_hold_t *txh;
	spa_t *spa = tx->tx_pool->dp_spa;
	uint64_t memory, asize, fsize, usize;
	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;

	ASSERT3U(tx->tx_txg, ==, 0);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (EIO);

		return (ERESTART);
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (ERESTART);
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += txh->txh_space_towrite;
		tofree += txh->txh_space_tofree;
		tooverwrite += txh->txh_space_tooverwrite;
		tounref += txh->txh_space_tounref;
		tohold += txh->txh_memory_tohold;
		fudge += txh->txh_fudge;
	}

	/*
	 * NB: This check must be after we've held the dnodes, so that
	 * the dmu_tx_unassign() logic will work properly.
	 */
	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
		return (ERESTART);

	/*
	 * If a snapshot has been taken since we made our estimates,
	 * assume that we won't be able to free or overwrite anything.
	 */
	if (tx->tx_objset &&
	    dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
	    tx->tx_lastsnap_txg) {
		towrite += tooverwrite;
		tooverwrite = tofree = 0;
	}

	/* needed allocation: worst-case estimate of write space */
	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
	/* freed space estimate: worst-case overwrite + free estimate */
	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
	/* convert unrefd space to worst-case estimate */
	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
	/* calculate memory footprint estimate */
	memory = towrite + tooverwrite + tohold;

#ifdef ZFS_DEBUG
	/*
	 * Add in 'tohold' to account for our dirty holds on this memory
	 * XXX - the "fudge" factor is to account for skipped blocks that
	 * we missed because dnode_next_offset() misses in-core-only blocks.
	 */
	tx->tx_space_towrite = asize +
	    spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
	tx->tx_space_tofree = tofree;
	tx->tx_space_tooverwrite = tooverwrite;
	tx->tx_space_tounref = tounref;
#endif

	if (tx->tx_dir && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
		if (err)
			return (err);
	}

	return (0);
}

static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
 *
 * (3)	A specific txg.  Use this if you need to ensure that multiple
 *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
 *	returns ERESTART if it can't assign you into the requested txg.
 */
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how != 0);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}

void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT(tx->tx_txg == 0);

	/*
	 * It's possible that the pool has become active after this thread
	 * has tried to obtain a tx.  If that's the case then its
	 * tx_lasttried_txg would not have been assigned.
	 */
	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
}

void
dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
{
#ifdef ZFS_DEBUG
	if (tx->tx_dir == NULL || delta == 0)
		return;

	if (delta > 0) {
		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
		    tx->tx_space_towrite);
		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
	} else {
		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
	}
#endif
}

void
dmu_tx_commit(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg != 0);

	while (txh = list_head(&tx->tx_holds)) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
		dnode_rele(dn, tx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

void
dmu_tx_abort(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	while (txh = list_head(&tx->tx_holds)) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}
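
#ifdef DMU_TX_USAGE_EXAMPLE
/*
 * Illustrative sketch only (compiled out): the retry loop that callers of
 * this interface are expected to use with TXG_NOWAIT, following the
 * dmu_tx_assign() comment above.  The function name, object number, offset,
 * and length below are hypothetical placeholders, not part of the DMU.
 */
static int
dmu_tx_example_write(objset_t *os, uint64_t object, uint64_t off, int len)
{
	dmu_tx_t *tx;
	int err;

top:
	/* declare the intent to write [off, off+len) of 'object' */
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);

	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err) {
		if (err == ERESTART) {
			/*
			 * The open txg is full (or the pool is suspended);
			 * wait for the next open txg, then retry with a
			 * fresh tx.  Any locks would be dropped here first.
			 */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (err);
	}

	/* ... dirty the held range here, e.g. via dmu_write(..., tx) ... */

	dmu_tx_commit(tx);
	return (0);
}
#endif	/* DMU_TX_USAGE_EXAMPLE */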