/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>	/* for dsl_dataset_block_freeable() */
#include <sys/dsl_dir.h>	/* for dsl_dir_tempreserve_*() */
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h>	/* for ZAP_BLOCK_SHIFT */
#include <sys/spa.h>
#include <sys/zfs_context.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

#ifdef ZFS_DEBUG
int dmu_use_tx_debug_bufs = 1;
#endif

dmu_tx_t *
dmu_tx_create_ds(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, dth_node));
	refcount_create(&tx->tx_space_written);
	refcount_create(&tx->tx_space_freed);
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
	return (tx);
}

dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_ds(NULL);

	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

static void
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, dmu_tx_hold_func_t func,
    uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *dth;
	dnode_t *dn = NULL;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os->os, object, tx, &dn);
		if (err) {
			tx->tx_err = err;
			return;
		}

		if (err == 0 && tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			ASSERT(dn->dn_assigned_tx == NULL);
			dn->dn_assigned_txg = tx->tx_txg;
			dn->dn_assigned_tx = tx;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	dth->dth_dnode = dn;
	dth->dth_type = type;
	dth->dth_arg1 = arg1;
	dth->dth_arg2 = arg2;
	list_insert_tail(&tx->tx_holds, dth);

	if (func)
		func(tx, dn, arg1, arg2);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT,
		    NULL, 0, 0);
	}
}

static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (EIO);
	err = dbuf_read(db, zio, DB_RF_CANFAIL);
	dbuf_rele(db, FTAG);
	return (err);
}

/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	uint64_t start, end, i, space;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;

	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks, and all the level-1 blocks.  We needn't do this on
	 * the meta-dnode, because we've already read it in.
	 */

	if (dn && dn->dn_object != DMU_META_DNODE_OBJECT) {
		int err;

		if (dn->dn_maxblkid == 0) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err) {
				tx->tx_err = err;
				return;
			}
		} else {
			zio_t *zio = zio_root(tx->tx_pool->dp_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off / dn->dn_datablksz;
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err) {
				tx->tx_err = err;
				return;
			}

			/* last level-0 block */
			end = (off + len) / dn->dn_datablksz;
			if (end != start) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err) {
					tx->tx_err = err;
					return;
				}
			}

			/* level-1 blocks */
			if (dn->dn_nlevels > 1) {
				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = start + 1; i < end; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err) {
						tx->tx_err = err;
						return;
					}
				}
			}

			err = zio_wait(zio);
			if (err) {
				tx->tx_err = err;
				return;
			}
		}
	}

	/*
	 * If there's more than one block, the blocksize can't change,
	 * so we can make a more precise estimate.  Alternatively,
	 * if the dnode's ibs is larger than max_ibs, always use that.
	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
	 * the code will still work correctly on existing pools.
	 */
	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
		min_ibs = max_ibs = dn->dn_indblkshift;
		if (dn->dn_datablkshift != 0)
			min_bs = max_bs = dn->dn_datablkshift;
	}

	/*
	 * 'end' is the last thing we will access, not one past.
	 * This way we won't overflow when accessing the last byte.
	 */
	start = P2ALIGN(off, 1ULL << max_bs);
	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
	space = end - start + 1;

	start >>= min_bs;
	end >>= min_bs;

	epbs = min_ibs - SPA_BLKPTRSHIFT;

	/*
	 * The object contains at most 2^(64 - min_bs) blocks,
	 * and each indirect level maps 2^epbs.
	 */
	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
		start >>= epbs;
		end >>= epbs;
		/*
		 * If we increase the number of levels of indirection,
		 * we'll need new blkid=0 indirect blocks.  If start == 0,
		 * we're already accounting for those blocks; and if end == 0,
		 * we can't increase the number of levels beyond that.
		 */
		if (start != 0 && end != 0)
			space += 1ULL << max_ibs;
		space += (end - start + 1) << max_ibs;
	}

	ASSERT(space < 2 * DMU_MAX_ACCESS);

	tx->tx_space_towrite += space;
}
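
/*
 * Editorial worked example of the estimate above (a sketch, not from the
 * original source).  Assume an existing object with 4K data blocks
 * (dn_datablkshift == 12) and 16K indirect blocks (dn_indblkshift == 14),
 * so epbs = 14 - SPA_BLKPTRSHIFT = 7, and a 16K write at offset 0:
 *
 *	start = 0, end = 16383, space = 16K	(four level-0 blocks)
 *	start >>= 12 --> 0, end >>= 12 --> 3
 *	the loop runs for bits = 52, 45, ..., 3	(eight iterations)
 *	each iteration adds (end - start + 1) << 14 = 16K
 *
 * yielding 16K of data plus 8 * 16K = 128K of worst-case indirect blocks,
 * i.e. tx_space_towrite grows by 144K for a 16K write.  The estimate
 * deliberately charges one full indirect block per possible level.
 */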

static void
dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn)
{
	dnode_t *mdn = tx->tx_objset->os->os_meta_dnode;
	uint64_t object = dn ? dn->dn_object : DN_MAX_OBJECT - 1;
	uint64_t pre_write_space;

	ASSERT(object < DN_MAX_OBJECT);
	pre_write_space = tx->tx_space_towrite;
	dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT);
	if (dn && dn->dn_dbuf->db_blkptr &&
	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
	    dn->dn_dbuf->db_blkptr->blk_birth)) {
		tx->tx_space_tooverwrite +=
		    tx->tx_space_towrite - pre_write_space;
		tx->tx_space_towrite = pre_write_space;
	}
}

/* ARGSUSED */
static void
dmu_tx_hold_write_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	dmu_tx_count_write(tx, dn, off, len);
	dmu_tx_count_dnode(tx, dn);
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	ASSERT(tx->tx_txg == 0);
	ASSERT(len < DMU_MAX_ACCESS);
	ASSERT(UINT64_MAX - off >= len - 1);

	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE,
	    dmu_tx_hold_write_impl, off, len);
}
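
/*
 * Illustrative usage sketch (editorial addition, not from the original
 * source): a typical write path declares the write before assigning the
 * tx, and performs the write only after dmu_tx_assign() succeeds.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 *		dmu_tx_abort(tx);	-- truly out of space, or over quota
 *		return (...);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);
 */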

static void
dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	uint64_t blkid, nblks;
	uint64_t space = 0;
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;

	if (dn->dn_datablkshift == 0)
		return;
	/*
	 * Not that the dnode can change, since it isn't dirty, but
	 * dbuf_hold_impl() wants us to have the struct_rwlock.
	 * We also need it to protect dn_maxblkid.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	blkid = off >> dn->dn_datablkshift;
	nblks = (off + len) >> dn->dn_datablkshift;

	if (blkid >= dn->dn_maxblkid) {
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}
	if (blkid + nblks > dn->dn_maxblkid)
		nblks = dn->dn_maxblkid - blkid;

	/* don't bother counting past the first 128K (128*1024) blocks */
	nblks = MIN(nblks, 128*1024);

	if (dn->dn_phys->dn_nlevels == 1) {
		int i;
		for (i = 0; i < nblks; i++) {
			blkptr_t *bp = dn->dn_phys->dn_blkptr;
			ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
			bp += blkid + i;
			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
				dprintf_bp(bp, "can free old%s", "");
				space += BP_GET_ASIZE(bp);
			}
		}
		nblks = 0;
	}

	while (nblks) {
		dmu_buf_impl_t *dbuf;
		int err, epbs, blkoff, tochk;

		epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
		blkoff = P2PHASE(blkid, 1 << epbs);
		tochk = MIN((1 << epbs) - blkoff, nblks);

		err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
		if (err == 0) {
			int i;
			blkptr_t *bp;

			err = dbuf_read(dbuf, NULL,
			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
			if (err != 0) {
				tx->tx_err = err;
				dbuf_rele(dbuf, FTAG);
				break;
			}

			bp = dbuf->db.db_data;
			bp += blkoff;

			for (i = 0; i < tochk; i++) {
				if (dsl_dataset_block_freeable(ds,
				    bp[i].blk_birth)) {
					dprintf_bp(&bp[i],
					    "can free old%s", "");
					space += BP_GET_ASIZE(&bp[i]);
				}
			}
			dbuf_rele(dbuf, FTAG);
		} else {
			/* the indirect block is sparse */
			ASSERT(err == ENOENT);
		}

		blkid += tochk;
		nblks -= tochk;
	}
	rw_exit(&dn->dn_struct_rwlock);

	tx->tx_space_tofree += space;
}

static void
dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	uint64_t start, end, i;
	int dirty, err, shift;
	zio_t *zio;

	/* first block */
	if (off != 0 /* || dn->dn_maxblkid == 0 */)
		dmu_tx_count_write(tx, dn, off, 1);
	/* last block */
	if (len != DMU_OBJECT_END)
		dmu_tx_count_write(tx, dn, off + len, 1);

	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks, and all the level-1 blocks.  The above count_write's
	 * will take care of the level-0 blocks.
	 */
	shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	start = off >> shift;
	end = dn->dn_datablkshift ? ((off + len) >> shift) : 0;

	zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (i = start + 1; i < end; i++) {
		uint64_t ibyte = i << shift;
		err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1);
		i = ibyte >> shift;
		if (err == ESRCH)
			break;
		if (err) {
			tx->tx_err = err;
			return;
		}

		err = dmu_tx_check_ioerr(zio, dn, 1, i);
		if (err) {
			tx->tx_err = err;
			return;
		}
	}
	err = zio_wait(zio);
	if (err) {
		tx->tx_err = err;
		return;
	}

	dmu_tx_count_dnode(tx, dn);

	/* XXX locking */
	dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
	    dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
	if (dn->dn_assigned_tx != NULL && !dirty)
		dmu_tx_count_free(tx, dn, off, len);
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	ASSERT(tx->tx_txg == 0);

	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE,
	    dmu_tx_hold_free_impl, off, len);
}
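
/*
 * Illustrative usage sketch (editorial addition, not from the original
 * source): freeing an object's entire data uses DMU_OBJECT_END so the
 * hold covers whatever length the object currently has.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		(void) dmu_free_range(os, object, 0, DMU_OBJECT_END, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 */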

/* ARGSUSED */
static void
dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t add, uint64_t iname)
{
	uint64_t nblocks;
	int epbs, err;
	char *name = (char *)(uintptr_t)iname;

	dmu_tx_count_dnode(tx, dn);

	if (dn == NULL) {
		/*
		 * We will be able to fit a new object's entries into one
		 * leaf block.  So there will be at most 2 blocks total,
		 * including the header block.
		 */
		dmu_tx_count_write(tx, dn, 0, 2 << fzap_default_block_shift);
		return;
	}

	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);

	if (dn->dn_maxblkid == 0 && !add) {
		/*
		 * If there is only one block (i.e. this is a micro-zap)
		 * and we are not adding anything, the accounting is simple.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err) {
			tx->tx_err = err;
			return;
		}

		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
		    dn->dn_phys->dn_blkptr[0].blk_birth))
			tx->tx_space_tooverwrite += dn->dn_datablksz;
		else
			tx->tx_space_towrite += dn->dn_datablksz;
		return;
	}

	if (dn->dn_maxblkid > 0 && name) {
		/*
		 * access the name in this fat-zap so that we'll check
		 * for i/o errors to the leaf blocks, etc.
		 */
		err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
		    8, 0, NULL);
		if (err == EIO) {
			tx->tx_err = err;
			return;
		}
	}

	/*
	 * 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl
	 * blocks
	 */
	dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz,
	    (3 + (add ? 3 : 0)) << dn->dn_datablkshift);

	/*
	 * If the modified blocks are scattered to the four winds,
	 * we'll have to modify an indirect twig for each.
	 */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
		tx->tx_space_towrite += 3 << dn->dn_indblkshift;
}

void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
{
	ASSERT(tx->tx_txg == 0);

	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP,
	    dmu_tx_hold_zap_impl, add, (uintptr_t)name);
}
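
/*
 * Illustrative usage sketch (editorial addition, not from the original
 * source; dirobj, name, and value are hypothetical): adding a directory
 * entry declares the zap hold by name so the fat-zap leaf blocks get
 * i/o-error-checked up front.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_zap(tx, dirobj, TRUE, name);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		(void) zap_add(os, dirobj, name, 8, 1, &value, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 */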

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	ASSERT(tx->tx_txg == 0);

	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS,
	    dmu_tx_hold_write_impl, 0, 0);
}

/* ARGSUSED */
static void
dmu_tx_hold_space_impl(dmu_tx_t *tx, dnode_t *dn,
    uint64_t space, uint64_t unused)
{
	tx->tx_space_towrite += space;
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	ASSERT(tx->tx_txg == 0);

	dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE,
	    dmu_tx_hold_space_impl, space, 0);
}

int
dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *dth;
	int holds = 0;

	/*
	 * By asserting that the tx is assigned, we're counting the
	 * number of dn_tx_holds, which is the same as the number of
	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
	 * dn_tx_holds could be 0.
	 */
	ASSERT(tx->tx_txg != 0);

	/* if (tx->tx_anyobj == TRUE) */
	/*	return (0); */

	for (dth = list_head(&tx->tx_holds); dth;
	    dth = list_next(&tx->tx_holds, dth)) {
		if (dth->dth_dnode && dth->dth_dnode->dn_object == object)
			holds++;
	}

	return (holds);
}

#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	dmu_tx_hold_t *dth;
	int match_object = FALSE, match_offset = FALSE;
	dnode_t *dn = db->db_dnode;

	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj)
		return;

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT)
		return;

	for (dth = list_head(&tx->tx_holds); dth;
	    dth = list_next(&tx->tx_holds, dth)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (dth->dth_dnode == dn && dth->dth_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (dth->dth_dnode == NULL || dth->dth_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (dth->dth_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((dth->dth_arg1 + dth->dth_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX dth_arg2 better not be zero... */

			dprintf("found dth type %x beginblk=%llx endblk=%llx\n",
			    dth->dth_type, beginblk, endblk);

			switch (dth->dth_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * buffer so that we don't need to hold it
				 * when creating a new object.
				 */
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				if (blkid == beginblk &&
				    (dth->dth_arg1 != 0 ||
				    dn->dn_maxblkid == 0))
					match_offset = TRUE;
				if (blkid == endblk &&
				    dth->dth_arg2 != DMU_OBJECT_END)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad dth_type");
			}
		}
		if (match_object && match_offset)
			return;
	}
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
{
	dmu_tx_hold_t *dth;
	uint64_t lsize, asize, fsize, towrite;

	*last_dth = NULL;

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);

	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
		return (ERESTART);
	if (tx->tx_err)
		return (tx->tx_err);

	for (dth = list_head(&tx->tx_holds); dth;
	    dth = list_next(&tx->tx_holds, dth)) {
		dnode_t *dn = dth->dth_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			while (dn->dn_assigned_txg == tx->tx_txg - 1) {
				if (txg_how != TXG_WAIT) {
					mutex_exit(&dn->dn_mtx);
					return (ERESTART);
				}
				cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
			}
			if (dn->dn_assigned_txg == 0) {
				ASSERT(dn->dn_assigned_tx == NULL);
				dn->dn_assigned_txg = tx->tx_txg;
				dn->dn_assigned_tx = tx;
			} else {
				ASSERT(dn->dn_assigned_txg == tx->tx_txg);
				if (dn->dn_assigned_tx != tx)
					dn->dn_assigned_tx = NULL;
			}
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		*last_dth = dth;
		if (tx->tx_err)
			return (tx->tx_err);
	}

	/*
	 * If a snapshot has been taken since we made our estimates,
	 * assume that we won't be able to free or overwrite anything.
	 */
	if (tx->tx_objset &&
	    dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
	    tx->tx_lastsnap_txg) {
		tx->tx_space_towrite += tx->tx_space_tooverwrite;
		tx->tx_space_tooverwrite = 0;
		tx->tx_space_tofree = 0;
	}

	/*
	 * Convert logical size to worst-case allocated size.
	 */
	fsize = spa_get_asize(tx->tx_pool->dp_spa, tx->tx_space_tooverwrite) +
	    tx->tx_space_tofree;
	lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite;
	asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
	towrite = tx->tx_space_towrite;
	tx->tx_space_towrite = asize;

	if (tx->tx_dir && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir,
		    lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
		if (err) {
			tx->tx_space_towrite = towrite;
			return (err);
		}
	}

	return (0);
}

static uint64_t
dmu_tx_unassign(dmu_tx_t *tx, dmu_tx_hold_t *last_dth)
{
	uint64_t txg = tx->tx_txg;
	dmu_tx_hold_t *dth;

	ASSERT(txg != 0);

	txg_rele_to_quiesce(&tx->tx_txgh);

	for (dth = last_dth; dth; dth = list_prev(&tx->tx_holds, dth)) {
		dnode_t *dn = dth->dth_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			dn->dn_assigned_tx = NULL;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_txg = 0;
	return (txg);
}

/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a txg_wait_open(dp, 0), and try again.
 *
 * (3)	A specific txg.  Use this if you need to ensure that multiple
 *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
 *	returns ERESTART if it can't assign you into the requested txg.
 */
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	dmu_tx_hold_t *last_dth;
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how != 0);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
		uint64_t txg = dmu_tx_unassign(tx, last_dth);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		txg_wait_open(tx->tx_pool, txg + 1);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}
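
/*
 * Illustrative TXG_NOWAIT retry loop (editorial sketch following case (2)
 * in the comment above; the locks and "dp" are whatever the real caller
 * holds):
 *
 * top:
 *	... acquire locks ...
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_*(tx, ...);
 *	err = dmu_tx_assign(tx, TXG_NOWAIT);
 *	if (err == ERESTART) {
 *		... drop locks ...
 *		dmu_tx_abort(tx);
 *		txg_wait_open(dp, 0);
 *		goto top;
 *	}
 */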

void
dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
{
	if (tx->tx_dir == NULL || delta == 0)
		return;

	if (delta > 0) {
		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
		    tx->tx_space_towrite);
		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
	} else {
		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
	}
}

void
dmu_tx_commit(dmu_tx_t *tx)
{
	dmu_tx_hold_t *dth;

	ASSERT(tx->tx_txg != 0);

	while ((dth = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = dth->dth_dnode;

		list_remove(&tx->tx_holds, dth);
		kmem_free(dth, sizeof (dmu_tx_hold_t));
		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			dn->dn_assigned_tx = NULL;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
		dnode_rele(dn, tx);
	}

	if (tx->tx_dir && tx->tx_space_towrite > 0) {
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
	}

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);
	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#ifdef ZFS_DEBUG
	if (tx->tx_debug_buf)
		kmem_free(tx->tx_debug_buf, 4096);
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

void
dmu_tx_abort(dmu_tx_t *tx)
{
	dmu_tx_hold_t *dth;

	ASSERT(tx->tx_txg == 0);

	while ((dth = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = dth->dth_dnode;

		list_remove(&tx->tx_holds, dth);
		kmem_free(dth, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#ifdef ZFS_DEBUG
	if (tx->tx_debug_buf)
		kmem_free(tx->tx_debug_buf, 4096);
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}