/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

/* Swap two uint64_t values in place. */
#define	SWITCH64(x, y) \
{ \
	uint64_t __tmp = (x); \
	(x) = (y); \
	(y) = __tmp; \
}

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)


/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer.  If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}
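
/*
 * Worked example (illustrative, values hypothetical): with
 * ds_unique_bytes == 10,000 and ds_reserved == 50,000, a delta of
 * +5,000 leaves both MAX() terms at the 50,000 reservation floor, so
 * parent_delta() returns 0 and the dsl_dir is not charged; the new
 * unique space is still covered by the refreservation.  With
 * ds_unique_bytes == 48,000, the same +5,000 delta moves new_bytes to
 * 53,000, so only the 3,000 bytes above the reservation are propagated
 * to the parent.
 */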

void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "ds=%p", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dsl_dir.
		 */
		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    used, compressed, uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_used_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(bp->blk_birth <= tx->tx_txg);

	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(used > 0);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dataset.
		 */
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    -used, -compressed, -uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int64_t delta;

		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		mutex_enter(&ds->ds_dir->dd_lock);
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
		mutex_exit(&ds->ds_dir->dd_lock);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		if (async) {
			/*
			 * We are here as part of zio's write done callback,
			 * which means we're a zio interrupt thread.  We can't
			 * call dsl_deadlist_insert() now because it may block
			 * waiting for I/O.  Instead, put bp on the deferred
			 * queue and let dsl_pool_sync() finish the job.
			 */
			bplist_append(&ds->ds_pending_deadlist, bp);
		} else {
			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
	ds->ds_phys->ds_used_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}

uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase.  So this should only be used as a guess.
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t blk_birth)
{
	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
		return (B_FALSE);

	ddt_prefetch(dsl_dataset_get_spa(ds), bp);

	return (B_TRUE);
}
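
/*
 * Illustrative use (sketch): a block born in txg 100 is not freeable
 * if the most recent snapshot (or a pending one, per
 * dsl_dataset_prev_snap_txg() above) was taken at txg >= 100; freeing
 * it would merely move it to the deadlist, releasing no space.  DMU
 * callers use this to decide whether an overwrite or free will
 * actually return space to the pool:
 *
 *	if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth))
 *		... count this block as reclaimable ...
 */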

/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_objset != NULL)
		dmu_objset_evict(ds->ds_objset);

	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_destroy(&ds->ds_pending_deadlist);
	if (db != NULL) {
		dsl_deadlist_close(&ds->ds_deadlist);
	} else {
		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
		ASSERT(!ds->ds_deadlist.dl_oldfmt);
	}
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_recvlock);
	mutex_destroy(&ds->ds_opening_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);
	return (err);
}
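
/*
 * Note on the normalization fallback above (illustrative): on a
 * case-insensitive dataset (DS_FLAG_CI_DATASET), a lookup of "Snap1"
 * with MT_FIRST matches an entry stored as "snap1".  If the ZAP turns
 * out to be a non-normalized object, zap_lookup_norm() and
 * zap_remove_norm() return ENOTSUP, and we retry with a plain
 * case-sensitive zap_lookup()/zap_remove().
 */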

static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_type != DMU_OT_DSL_DATASET) {
		/* drop the hold we just took before failing */
		dmu_buf_rele(dbuf, tag);
		return (EINVAL);
	}

	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		rw_init(&ds->ds_rwlock, 0, 0, 0);
		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);

		bplist_create(&ds->ds_pending_deadlist);
		dsl_deadlist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);

		if (err == 0) {
			err = dsl_dir_open_obj(dp,
			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
		}
		if (err) {
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				err = dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}
		} else {
			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
				err = dsl_dataset_get_snapname(ds);
			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
				err = zap_count(
				    ds->ds_dir->dd_pool->dp_meta_objset,
				    ds->ds_phys->ds_userrefs_obj,
				    &ds->ds_userrefs);
			}
		}

		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
			/*
			 * In sync context, we're called with either no lock
			 * or with the write lock.  If we're not syncing,
			 * we're always called with the read lock held.
			 */
			boolean_t need_lock =
			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
			    dsl_pool_sync_context(dp);

			if (need_lock)
				rw_enter(&dp->dp_config_rwlock, RW_READER);

			err = dsl_prop_get_ds(ds,
			    "refreservation", sizeof (uint64_t), 1,
			    &ds->ds_reserved, NULL);
			if (err == 0) {
				err = dsl_prop_get_ds(ds,
				    "refquota", sizeof (uint64_t), 1,
				    &ds->ds_quota, NULL);
			}

			if (need_lock)
				rw_exit(&dp->dp_config_rwlock);
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err == 0) {
			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
			    dsl_dataset_evict);
		}
		if (err || winner) {
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_drop_ref(ds->ds_prev, ds);
			dsl_dir_close(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	mutex_enter(&ds->ds_lock);
	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
		mutex_exit(&ds->ds_lock);
		dmu_buf_rele(ds->ds_dbuf, tag);
		return (ENOENT);
	}
	mutex_exit(&ds->ds_lock);
	*dsp = ds;
	return (0);
}

static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/*
	 * In syncing context we don't want to take the rwlock: there
	 * may be an existing writer waiting for sync phase to
	 * finish.  We don't need to worry about such writers, since
	 * sync phase is single-threaded, so the writer can't be
	 * doing anything while we are active.
	 */
	if (dsl_pool_sync_context(dp)) {
		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
		return (0);
	}

	/*
	 * Normal users will hold the ds_rwlock as a READER until they
	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
	 * drop their READER lock after they set the ds_owner field.
	 *
	 * If the dataset is being destroyed, the destroy thread will
	 * obtain a WRITER lock for exclusive access after it's done its
	 * open-context work and then change the ds_owner to
	 * dsl_reaper once destruction is assured.  So threads
	 * may block here temporarily, until the "destructability" of
	 * the dataset is determined.
	 */
	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
	mutex_enter(&ds->ds_lock);
	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
		rw_exit(&dp->dp_config_rwlock);
		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
		if (DSL_DATASET_IS_DESTROYED(ds)) {
			mutex_exit(&ds->ds_lock);
			dsl_dataset_drop_ref(ds, tag);
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			return (ENOENT);
		}
		/*
		 * The dp_config_rwlock lives above the ds_lock.  And
		 * we need to check DSL_DATASET_IS_DESTROYED() while
		 * holding the ds_lock, so we have to drop and reacquire
		 * the ds_lock here.
		 */
		mutex_exit(&ds->ds_lock);
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		mutex_enter(&ds->ds_lock);
	}
	mutex_exit(&ds->ds_lock);
	return (0);
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);

	if (err)
		return (err);
	return (dsl_dataset_hold_ref(*dsp, tag));
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		*dsp = NULL;
		return (EBUSY);
	}
	return (0);
}

int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj)
		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
	else
		err = ENOENT;
	if (err)
		goto out;

	err = dsl_dataset_hold_ref(*dsp, tag);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds = NULL;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			err = ENOENT;
			goto out;
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
		dsl_dataset_rele(*dsp, tag);

		ASSERT3U((err == 0), ==, (ds != NULL));

		if (ds) {
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			err = dsl_dataset_hold_ref(ds, tag);
			*dsp = err ? NULL : ds;
		}
	}
out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);
	return (err);
}

int
dsl_dataset_own(const char *name, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold(name, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		return (EBUSY);
	}
	return (0);
}
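
/*
 * Typical usage (illustrative sketch): a reader pairs dsl_dataset_hold()
 * with dsl_dataset_rele(); an exclusive consumer (e.g. destroy or
 * receive) pairs dsl_dataset_own() with dsl_dataset_disown():
 *
 *	dsl_dataset_t *ds;
 *	int err = dsl_dataset_hold("pool/fs@snap", FTAG, &ds);
 *	if (err == 0) {
 *		... inspect ds ...
 *		dsl_dataset_rele(ds, FTAG);
 *	}
 *
 * FTAG is the usual hold tag; any stable pointer works.
 */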

void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
	if (ds == NULL) {
		(void) strcpy(name, "mos");
	} else {
		dsl_dir_name(ds->ds_dir, name);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			(void) strcat(name, "@");
			/*
			 * We use a "recursive" mutex so that we
			 * can call dprintf_ds() with ds_lock held.
			 */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				(void) strcat(name, ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				(void) strcat(name, ds->ds_snapname);
			}
		}
	}
}

static int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
	int result;

	if (ds == NULL) {
		result = 3;	/* "mos" */
	} else {
		result = dsl_dir_namelen(ds->ds_dir);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			++result;	/* adding one for the @-sign */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				result += strlen(ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				result += strlen(ds->ds_snapname);
			}
		}
	}

	return (result);
}

void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
		rw_exit(&ds->ds_rwlock);
	}
	dsl_dataset_drop_ref(ds, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
		rw_exit(&ds->ds_rwlock);
		cv_broadcast(&ds->ds_exclusive_cv);
	}
	mutex_exit(&ds->ds_lock);
	if (ds->ds_dbuf)
		dsl_dataset_drop_ref(ds, tag);
	else
		dsl_dataset_evict(NULL, ds);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL &&
	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
		ds->ds_owner = tag;
		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
			rw_exit(&ds->ds_rwlock);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}

void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
	ASSERT3P(owner, ==, ds->ds_owner);
	if (!RW_WRITE_HELD(&ds->ds_rwlock))
		rw_enter(&ds->ds_rwlock, RW_WRITER);
}
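
/*
 * The namelen/name pair is typically used to size a buffer before
 * formatting the full name, as dsl_dataset_origin_rm_prep() does
 * below:
 *
 *	namelen = dsl_dataset_namelen(origin) + 1;
 *	name = kmem_alloc(namelen, KM_SLEEP);
 *	dsl_dataset_name(origin, name);
 *	...
 *	kmem_free(name, namelen);
 */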

uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

	if (origin == NULL) {
		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
	} else {
		dsl_dataset_t *ohds;

		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_used_bytes =
		    origin->ds_phys->ds_used_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;
		dsphys->ds_flags |= origin->ds_phys->ds_flags;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
		dsl_dataset_rele(ohds, FTAG);

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			if (origin->ds_phys->ds_next_clones_obj == 0) {
				origin->ds_phys->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY(0 == zap_add_int(mos,
			    origin->ds_phys->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			if (origin->ds_dir->dd_phys->dd_clones == 0) {
				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
				origin->ds_dir->dd_phys->dd_clones =
				    zap_create(mos,
				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY3U(0, ==, zap_add_int(mos,
			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
		}
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

	dsl_deleg_set_create_perms(dd, tx, cr);

	dsl_dir_close(dd, FTAG);

	/*
	 * If we are creating a clone, make sure we zero out any stale
	 * data from the origin snapshot's zil header.
	 */
	if (origin != NULL) {
		dsl_dataset_t *ds;
		objset_t *os;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
		dsl_dataset_dirty(ds, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	return (dsobj);
}
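
/*
 * Both creation paths run in syncing context.  A caller creating a new
 * filesystem (or a clone, with origin != NULL) typically does so from
 * a sync task, roughly (illustrative sketch):
 *
 *	dsobj = dsl_dataset_create_sync(pdd, "newfs", origin, 0, cr, tx);
 *
 * where pdd is the parent dsl_dir and tx is the sync task's transaction.
 */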

/*
 * The snapshots must all be in the same pool.
 */
int
dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
{
	int err;
	dsl_sync_task_t *dst;
	spa_t *spa;
	nvpair_t *pair;
	dsl_sync_task_group_t *dstg;

	pair = nvlist_next_nvpair(snaps, NULL);
	if (pair == NULL)
		return (0);

	err = spa_open(nvpair_name(pair), &spa, FTAG);
	if (err)
		return (err);
	dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(snaps, pair)) {
		dsl_dataset_t *ds;
		int err;

		err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
		if (err == 0) {
			struct dsl_ds_destroyarg *dsda;

			dsl_dataset_make_exclusive(ds, dstg);
			dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
			    KM_SLEEP);
			dsda->ds = ds;
			dsda->defer = defer;
			dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
			    dsl_dataset_destroy_sync, dsda, dstg, 0);
		} else if (err == ENOENT) {
			err = 0;
		} else {
			(void) strcpy(failed, nvpair_name(pair));
			break;
		}
	}

	if (err == 0)
		err = dsl_sync_task_group_wait(dstg);

	for (dst = list_head(&dstg->dstg_tasks); dst;
	    dst = list_next(&dstg->dstg_tasks, dst)) {
		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
		dsl_dataset_t *ds = dsda->ds;

		/*
		 * Return the file system name that triggered the error.
		 */
		if (dst->dst_err) {
			dsl_dataset_name(ds, failed);
		}
		ASSERT3P(dsda->rm_origin, ==, NULL);
		dsl_dataset_disown(ds, dstg);
		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
	}

	dsl_sync_task_group_destroy(dstg);
	spa_close(spa, FTAG);
	return (err);
}
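
/*
 * Example caller setup for dmu_snapshots_destroy_nvl() (illustrative
 * sketch; the ioctl path builds this list from userland input):
 *
 *	nvlist_t *snaps;
 *	VERIFY(nvlist_alloc(&snaps, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_boolean(snaps, "pool/fs@snap1") == 0);
 *	VERIFY(nvlist_add_boolean(snaps, "pool/fs@snap2") == 0);
 *	err = dmu_snapshots_destroy_nvl(snaps, B_FALSE, failed);
 *	nvlist_free(snaps);
 *
 * Only the nvpair names are significant; all snapshots must belong to
 * the same pool.
 */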

static boolean_t
dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
{
	boolean_t might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
	    DS_IS_DEFER_DESTROY(ds))
		might_destroy = B_TRUE;
	mutex_exit(&ds->ds_lock);

	return (might_destroy);
}

/*
 * If we're removing a clone, and these three conditions are true:
 *	1) the clone's origin has no other children
 *	2) the clone's origin has no user references
 *	3) the clone's origin has been marked for deferred destruction
 * Then, prepare to remove the origin as part of this sync task group.
 */
static int
dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *origin = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(origin)) {
		char *name;
		int namelen;
		int error;

		namelen = dsl_dataset_namelen(origin) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(origin, name);
#ifdef _KERNEL
		error = zfs_unmount_snap(name, NULL);
		if (error) {
			kmem_free(name, namelen);
			return (error);
		}
#endif
		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
		kmem_free(name, namelen);
		if (error)
			return (error);
		dsda->rm_origin = origin;
		dsl_dataset_make_exclusive(origin, tag);
	}

	return (0);
}

/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;
	struct dsl_ds_destroyarg dsda = { 0 };
	dsl_dataset_t dummy_ds = { 0 };

	dsda.ds = ds;

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		dsl_dataset_make_exclusive(ds, tag);

		dsda.defer = defer;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    &dsda, tag, 0);
		ASSERT3P(dsda.rm_origin, ==, NULL);
		goto out;
	} else if (defer) {
		err = EINVAL;
		goto out;
	}

	dd = ds->ds_dir;
	dummy_ds.ds_dir = dd;
	dummy_ds.ds_object = ds->ds_object;

	/*
	 * Check for errors and mark this ds as inconsistent, in
	 * case we crash while freeing the objects.
	 */
	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
	if (err)
		goto out;

	err = dmu_objset_from_ds(ds, &os);
	if (err)
		goto out;

	/*
	 * remove the objects in open context, so that we won't
	 * have too much to do in syncing context.
	 */
	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
	    ds->ds_phys->ds_prev_snap_txg)) {
		/*
		 * Ignore errors, if there is not enough disk space
		 * we will deal with it in dsl_dataset_destroy_sync().
		 */
		(void) dmu_free_object(os, obj);
	}
	if (err != ESRCH)
		goto out;

	/*
	 * Only the ZIL knows how to free log blocks.
	 */
	zil_destroy(dmu_objset_zil(os), B_FALSE);

	/*
	 * Sync out all in-flight IO.
	 */
	txg_wait_synced(dd->dd_pool, 0);

	/*
	 * If we managed to free all the objects in open
	 * context, the user space accounting should be zero.
	 */
	if (ds->ds_phys->ds_bp.blk_fill == 0 &&
	    dmu_objset_userused_enabled(os)) {
		uint64_t count;

		ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
		    count == 0);
		ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
		    count == 0);
	}

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dsl_dataset_make_exclusive(ds, tag);
	/*
	 * If we're removing a clone, we might also need to remove its
	 * origin.
	 */
	do {
		dsda.need_prep = B_FALSE;
		if (dsl_dir_is_clone(dd)) {
			err = dsl_dataset_origin_rm_prep(&dsda, tag);
			if (err) {
				dsl_dir_close(dd, FTAG);
				goto out;
			}
		}

		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, &dsda, tag, 0);
		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
		    dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
		err = dsl_sync_task_group_wait(dstg);
		dsl_sync_task_group_destroy(dstg);

		/*
		 * We could be racing against 'zfs release' or 'zfs destroy -d'
		 * on the origin snap, in which case we can get EBUSY if we
		 * needed to destroy the origin snap but were not ready to
		 * do so.
		 */
		if (dsda.need_prep) {
			ASSERT(err == EBUSY);
			ASSERT(dsl_dir_is_clone(dd));
			ASSERT(dsda.rm_origin == NULL);
		}
	} while (dsda.need_prep);

	if (dsda.rm_origin != NULL)
		dsl_dataset_disown(dsda.rm_origin, tag);

	/* if it is successful, dsl_dir_destroy_sync will close the dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	dsl_dataset_disown(ds, tag);
	return (err);
}

blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&ds->ds_phys->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	/* If it's the meta-objset, set dp_meta_rootbp */
	if (ds == NULL) {
		tx->tx_pool->dp_meta_rootbp = *bp;
	} else {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_bp = *bp;
	}
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_objset != NULL);

	if (ds->ds_phys->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot that is still being used
 * in this file system from the space currently in use.  To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(!dsl_dataset_is_snapshot(ds));

	if (ds->ds_phys->ds_prev_snap_obj != 0)
		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
	else
		mrs_used = 0;

	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);

	ASSERT3U(dlused, <=, mrs_used);
	ds->ds_phys->ds_unique_bytes =
	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
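
/*
 * Worked example (hypothetical numbers): if the head uses 100MB
 * (ds_used_bytes), the most recent snapshot used 80MB (mrs_used), and
 * 30MB of that snapshot's blocks have since been freed from the head
 * (dlused, on the head's deadlist), then 80 - 30 = 50MB of snapshot
 * blocks are still shared with the head, so the head's unique space is
 * 100 - 50 = 50MB.
 */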

struct killarg {
	dsl_dataset_t *ds;
	dmu_tx_t *tx;
};

/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct killarg *ka = arg;
	dmu_tx_t *tx = ka->tx;

	if (bp == NULL)
		return (0);

	if (zb->zb_level == ZB_ZIL_LEVEL) {
		ASSERT(zilog != NULL);
		/*
		 * It's a block in the intent log.  It has no
		 * accounting, so just free it.
		 */
		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
	} else {
		ASSERT(zilog == NULL);
		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
	}

	return (0);
}

/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * This is really a dsl_dir thing, but check it here so that
	 * we'll be less likely to leave this dataset inconsistent &
	 * nearly destroyed.
	 */
	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
	    "dataset = %llu", ds->ds_object);
}

static int
dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
    dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *ds_prev = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(ds_prev)) {
		struct dsl_ds_destroyarg ndsda = {0};

		/*
		 * If we're not prepared to remove the origin, don't remove
		 * the clone either.
		 */
		if (dsda->rm_origin == NULL) {
			dsda->need_prep = B_TRUE;
			return (EBUSY);
		}

		ndsda.ds = ds_prev;
		ndsda.is_origin_rm = B_TRUE;
		return (dsl_dataset_destroy_check(&ndsda, tag, tx));
	}

	/*
	 * If we're not going to remove the origin after all,
	 * undo the open context setup.
	 */
	if (dsda->rm_origin != NULL) {
		dsl_dataset_disown(dsda->rm_origin, tag);
		dsda->rm_origin = NULL;
	}

	return (0);
}
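
/*
 * Illustrative sequence (sketch): destroying a clone whose origin was
 * marked with "zfs destroy -d" and has no other children or holds
 * takes two passes.  The first destroy_check hits the rm_origin ==
 * NULL case above, sets dsda->need_prep, and fails with EBUSY; the
 * caller (see the retry loop in dsl_dataset_destroy() above) then runs
 * dsl_dataset_origin_rm_prep() in open context and retries with
 * dsda->rm_origin set, and both the clone and its origin are destroyed
 * in the same sync task group.
 */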

/*
 * If you add new checks here, you may need to add
 * additional checks to the "temporary" case in
 * snapshot_check() in dmu_objset.c.
 */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;

	/* we have an owner hold, so no one else can destroy us */
	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

	/*
	 * Only allow deferred destroy on pools that support it.
	 * NOTE: deferred destroy is only supported on snapshots.
	 */
	if (dsda->defer) {
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
		    SPA_VERSION_USERREFS)
			return (ENOTSUP);
		ASSERT(dsl_dataset_is_snapshot(ds));
		return (0);
	}

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	if (dsl_dataset_is_snapshot(ds)) {
		/*
		 * If this snapshot has an elevated user reference count,
		 * we can't destroy it yet.
		 */
		if (ds->ds_userrefs > 0 && !dsda->releasing)
			return (EBUSY);

		mutex_enter(&ds->ds_lock);
		/*
		 * Can't delete a branch point.  However, if we're destroying
		 * a clone and removing its origin due to it having a user
		 * hold count of 0 and having been marked for deferred destroy,
		 * it's OK for the origin to have a single clone.
		 */
		if (ds->ds_phys->ds_num_children >
		    (dsda->is_origin_rm ? 2 : 1)) {
			mutex_exit(&ds->ds_lock);
			return (EEXIST);
		}
		mutex_exit(&ds->ds_lock);
	} else if (dsl_dir_is_clone(ds->ds_dir)) {
		return (dsl_dataset_origin_check(dsda, arg2, tx));
	}

	/* XXX we should do some i/o error checking... */
	return (0);
}

struct refsarg {
	kmutex_t lock;
	boolean_t gone;
	kcondvar_t cv;
};

/* ARGSUSED */
static void
dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
{
	struct refsarg *arg = argv;

	mutex_enter(&arg->lock);
	arg->gone = TRUE;
	cv_signal(&arg->cv);
	mutex_exit(&arg->lock);
}

static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
	struct refsarg arg;

	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
	arg.gone = FALSE;
	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
	    dsl_dataset_refs_gone);
	dmu_buf_rele(ds->ds_dbuf, tag);
	mutex_enter(&arg.lock);
	while (!arg.gone)
		cv_wait(&arg.cv, &arg.lock);
	ASSERT(arg.gone);
	mutex_exit(&arg.lock);
	ds->ds_dbuf = NULL;
	ds->ds_phys = NULL;
	mutex_destroy(&arg.lock);
	cv_destroy(&arg.cv);
}

static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	ASSERT(ds->ds_phys->ds_num_children >= 2);
	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT.  However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT) {
		VERIFY3U(err, ==, 0);
	}
	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}

static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;

	/*
	 * If it is the old version, dd_clones doesn't exist so we can't
	 * find the clones, but deadlist_remove_key() is a no-op so it
	 * doesn't matter.
	 */
	if (ds->ds_dir->dd_phys->dd_clones == 0)
		return;

	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		if (clone->ds_dir->dd_origin_txg > mintxg) {
			dsl_deadlist_remove_key(&clone->ds_deadlist,
			    mintxg, tx);
			dsl_dataset_remove_clones_key(clone, mintxg, tx);
		}
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
}

struct process_old_arg {
	dsl_dataset_t *ds;
	dsl_dataset_t *ds_prev;
	boolean_t after_branch_point;
	zio_t *pio;
	uint64_t used, comp, uncomp;
};

static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct process_old_arg *poa = arg;
	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
		if (poa->ds_prev && !poa->after_branch_point &&
		    bp->blk_birth >
		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
			poa->ds_prev->ds_phys->ds_unique_bytes +=
			    bp_get_dsize_sync(dp->dp_spa, bp);
		}
	} else {
		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
		poa->comp += BP_GET_PSIZE(bp);
		poa->uncomp += BP_GET_UCSIZE(bp);
		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
	}
	return (0);
}

static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY3U(zio_wait(poa.pio), ==, 0);
	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
	    ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    ds_next->ds_phys->ds_deadlist_obj);
}

void
dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	boolean_t wont_destroy;
	uint64_t obj;

	wont_destroy = (dsda->defer &&
	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));

	ASSERT(ds->ds_owner || wont_destroy);
	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
	ASSERT(ds->ds_prev == NULL ||
	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);

	if (wont_destroy) {
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
		return;
	}

	/* signal any waiters that this dataset is going away */
	mutex_enter(&ds->ds_lock);
	ds->ds_owner = dsl_reaper;
	cv_broadcast(&ds->ds_exclusive_cv);
	mutex_exit(&ds->ds_lock);

	/* Remove our reservation */
	if (ds->ds_reserved != 0) {
		dsl_prop_setarg_t psa;
		uint64_t value = 0;

		dsl_prop_setarg_init_uint64(&psa, "refreservation",
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    &value);
		psa.psa_effective_value = 0;	/* predict default value */

		dsl_dataset_set_reservation_sync(ds, &psa, tx);
		ASSERT3U(ds->ds_reserved, ==, 0);
	}

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		if (ds->ds_prev) {
			ds_prev = ds->ds_prev;
		} else {
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
		}
		after_branch_point =
		    (ds_prev->ds_phys->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
			remove_from_next_clones(ds_prev, obj, tx);
			if (ds->ds_phys->ds_next_snap_obj != 0) {
				VERIFY(0 == zap_add_int(mos,
				    ds_prev->ds_phys->ds_next_clones_obj,
				    ds->ds_phys->ds_next_snap_obj, tx));
			}
		}
		if (after_branch_point &&
		    ds->ds_phys->ds_next_snap_obj == 0) {
			/* This clone is toast. */
			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
			ds_prev->ds_phys->ds_num_children--;

			/*
			 * If the clone's origin has no other clones, no
			 * user holds, and has been marked for deferred
			 * deletion, then we should have done the necessary
			 * destroy setup for it.
			 */
			if (ds_prev->ds_phys->ds_num_children == 1 &&
			    ds_prev->ds_userrefs == 0 &&
			    DS_IS_DEFER_DESTROY(ds_prev)) {
				ASSERT3P(dsda->rm_origin, !=, NULL);
			} else {
				ASSERT3P(dsda->rm_origin, ==, NULL);
			}
		} else if (!after_branch_point) {
			ds_prev->ds_phys->ds_next_snap_obj =
			    ds->ds_phys->ds_next_snap_obj;
		}
	}

	if (dsl_dataset_is_snapshot(ds)) {
		dsl_dataset_t *ds_next;
		uint64_t old_unique;
		uint64_t used = 0, comp = 0, uncomp = 0;

		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

		old_unique = ds_next->ds_phys->ds_unique_bytes;

		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
		ds_next->ds_phys->ds_prev_snap_obj =
		    ds->ds_phys->ds_prev_snap_obj;
		ds_next->ds_phys->ds_prev_snap_txg =
		    ds->ds_phys->ds_prev_snap_txg;
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

		if (ds_next->ds_deadlist.dl_oldfmt) {
			process_old_deadlist(ds, ds_prev, ds_next,
			    after_branch_point, tx);
		} else {
			/* Adjust prev's unique space. */
			if (ds_prev && !after_branch_point) {
				dsl_deadlist_space_range(&ds_next->ds_deadlist,
				    ds_prev->ds_phys->ds_prev_snap_txg,
				    ds->ds_phys->ds_prev_snap_txg,
				    &used, &comp, &uncomp);
				ds_prev->ds_phys->ds_unique_bytes += used;
			}

			/* Adjust snapused. */
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
			    &used, &comp, &uncomp);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
			    -used, -comp, -uncomp, tx);

			/* Move blocks to be freed to pool's free list. */
			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
			    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
			    tx);
			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
			    DD_USED_HEAD, used, comp, uncomp, tx);
			dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);

			/* Merge our deadlist into next's and free it. */
			dsl_deadlist_merge(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_deadlist_obj, tx);
		}
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);

		/* Collapse range in clone heads */
		dsl_dataset_remove_clones_key(ds,
		    ds->ds_phys->ds_creation_txg, tx);

		if (dsl_dataset_is_snapshot(ds_next)) {
			dsl_dataset_t *ds_nextnext;

			/*
			 * Update next's unique to include blocks which
			 * were previously shared by only this snapshot
			 * and it.  Those blocks will be born after the
			 * prev snap and before this snap, and will have
			 * died after the next snap and before the one
			 * after that (i.e. be on the deadlist of the
			 * snap after next).
			 */
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds_next->ds_phys->ds_next_snap_obj,
			    FTAG, &ds_nextnext));
			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg,
			    ds->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
			ds_next->ds_phys->ds_unique_bytes += used;
			dsl_dataset_rele(ds_nextnext, FTAG);
			ASSERT3P(ds_next->ds_prev, ==, NULL);

			/* Collapse range in this head. */
			dsl_dataset_t *hds;
			VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_head_dataset_obj,
			    FTAG, &hds));
			dsl_deadlist_remove_key(&hds->ds_deadlist,
			    ds->ds_phys->ds_creation_txg, tx);
			dsl_dataset_rele(hds, FTAG);
		} else {
			ASSERT3P(ds_next->ds_prev, ==, ds);
			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
			ds_next->ds_prev = NULL;
			if (ds_prev) {
				VERIFY(0 == dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds_next, &ds_next->ds_prev));
			}

			dsl_dataset_recalc_head_uniq(ds_next);

			/*
			 * Reduce the amount of our unconsumed refreservation
			 * being charged to our parent by the amount of
			 * new unique data we have gained.
			 */
			if (old_unique < ds_next->ds_reserved) {
				int64_t mrsdelta;
				uint64_t new_unique =
				    ds_next->ds_phys->ds_unique_bytes;

				ASSERT(old_unique <= new_unique);
				mrsdelta = MIN(new_unique - old_unique,
				    ds_next->ds_reserved - old_unique);
				dsl_dir_diduse_space(ds->ds_dir,
				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
			}
		}
		dsl_dataset_rele(ds_next, FTAG);
	} else {
		/*
		 * There's no next snapshot, so this is a head dataset.
		 * Destroy the deadlist.  Unless it's a clone, the
		 * deadlist should be empty.  (If it's a clone, it's
		 * safe to ignore the deadlist contents.)
		 */
		struct killarg ka;

		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
		ds->ds_phys->ds_deadlist_obj = 0;

		/*
		 * Free everything that we point to (that's born after
		 * the previous snapshot, if we are a clone)
		 *
		 * NB: this should be very quick, because we already
		 * freed all the objects in open context.
		 */
		ka.ds = ds;
		ka.tx = tx;
		err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    TRAVERSE_POST, kill_blkptr, &ka);
		ASSERT3U(err, ==, 0);
		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
		    ds->ds_phys->ds_unique_bytes == 0);

		if (ds->ds_prev != NULL) {
			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
				VERIFY3U(0, ==, zap_remove_int(mos,
				    ds->ds_prev->ds_dir->dd_phys->dd_clones,
				    ds->ds_object, tx));
			}
			dsl_dataset_rele(ds->ds_prev, ds);
			ds->ds_prev = ds_prev = NULL;
		}
	}

	/*
	 * This must be done after the dsl_traverse(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
		/* Erase the link in the dir */
		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
		ASSERT(err == 0);
	} else {
		/* remove from snapshot namespace */
		dsl_dataset_t *ds_head;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
		VERIFY(0 == dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
		{
			uint64_t val;

			err = dsl_dataset_snap_lookup(ds_head,
			    ds->ds_snapname, &val);
			ASSERT3U(err, ==, 0);
			ASSERT3U(val, ==, obj);
		}
#endif
		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
		ASSERT(err == 0);
		dsl_dataset_rele(ds_head, FTAG);
	}

	if (ds_prev && ds->ds_prev != ds_prev)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
	spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
	    "dataset = %llu", ds->ds_object);

	if (ds->ds_phys->ds_next_clones_obj != 0) {
		uint64_t count;
		ASSERT(0 == zap_count(mos,
		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
		VERIFY(0 == dmu_object_free(mos,
		    ds->ds_phys->ds_next_clones_obj, tx));
	}
	if (ds->ds_phys->ds_props_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
	if (ds->ds_phys->ds_userrefs_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
	dsl_dir_close(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dsl_dataset_drain_refs(ds, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));

	if (dsda->rm_origin) {
		/*
		 * Remove the origin of the clone we just destroyed.
		 */
		struct dsl_ds_destroyarg ndsda = {0};

		ndsda.ds = dsda->rm_origin;
		dsl_dataset_destroy_sync(&ndsda, tag, tx);
	}
}

static int
dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t asize;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * If there's an fs-only reservation, any blocks that might become
	 * owned by the snapshot dataset must be accommodated by space
	 * outside of the reservation.
	 */
	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
	asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
		return (ENOSPC);

	/*
	 * Propagate any reserved space for this snapshot to other
	 * snapshot checks in this sync group.
	 */
	if (asize > 0)
		dsl_dir_willuse_space(ds->ds_dir, asize, tx);

	return (0);
}
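
/*
 * The check/sync pair below is normally driven through a sync task,
 * e.g. (illustrative sketch):
 *
 *	err = dsl_sync_task_do(dp, dsl_dataset_snapshot_check,
 *	    dsl_dataset_snapshot_sync, ds, (void *)snapname, 0);
 *
 * The check runs first (possibly in open context as a preflight, then
 * again in syncing context); the sync function performs the actual
 * on-disk changes.
 */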
1979 */ 1980 err = dsl_dataset_snap_lookup(ds, snapname, &value); 1981 if (err == 0) 1982 return (EEXIST); 1983 if (err != ENOENT) 1984 return (err); 1985 1986 /* 1987 * Check that the dataset's name is not too long. Name consists 1988 * of the dataset's length + 1 for the @-sign + snapshot name's length 1989 */ 1990 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 1991 return (ENAMETOOLONG); 1992 1993 err = dsl_dataset_snapshot_reserve_space(ds, tx); 1994 if (err) 1995 return (err); 1996 1997 ds->ds_trysnap_txg = tx->tx_txg; 1998 return (0); 1999 } 2000 2001 void 2002 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2003 { 2004 dsl_dataset_t *ds = arg1; 2005 const char *snapname = arg2; 2006 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2007 dmu_buf_t *dbuf; 2008 dsl_dataset_phys_t *dsphys; 2009 uint64_t dsobj, crtxg; 2010 objset_t *mos = dp->dp_meta_objset; 2011 int err; 2012 2013 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 2014 2015 /* 2016 * The origin's ds_creation_txg has to be < TXG_INITIAL 2017 */ 2018 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 2019 crtxg = 1; 2020 else 2021 crtxg = tx->tx_txg; 2022 2023 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 2024 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 2025 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 2026 dmu_buf_will_dirty(dbuf, tx); 2027 dsphys = dbuf->db_data; 2028 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 2029 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 2030 dsphys->ds_fsid_guid = unique_create(); 2031 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 2032 sizeof (dsphys->ds_guid)); 2033 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 2034 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 2035 dsphys->ds_next_snap_obj = ds->ds_object; 2036 dsphys->ds_num_children = 1; 2037 dsphys->ds_creation_time = gethrestime_sec(); 2038 dsphys->ds_creation_txg = crtxg; 2039 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 2040 dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; 2041 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 2042 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 2043 dsphys->ds_flags = ds->ds_phys->ds_flags; 2044 dsphys->ds_bp = ds->ds_phys->ds_bp; 2045 dmu_buf_rele(dbuf, FTAG); 2046 2047 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 2048 if (ds->ds_prev) { 2049 uint64_t next_clones_obj = 2050 ds->ds_prev->ds_phys->ds_next_clones_obj; 2051 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == 2052 ds->ds_object || 2053 ds->ds_prev->ds_phys->ds_num_children > 1); 2054 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 2055 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2056 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 2057 ds->ds_prev->ds_phys->ds_creation_txg); 2058 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; 2059 } else if (next_clones_obj != 0) { 2060 remove_from_next_clones(ds->ds_prev, 2061 dsphys->ds_next_snap_obj, tx); 2062 VERIFY3U(0, ==, zap_add_int(mos, 2063 next_clones_obj, dsobj, tx)); 2064 } 2065 } 2066 2067 /* 2068 * If we have a reference-reservation on this dataset, we will 2069 * need to increase the amount of refreservation being charged 2070 * since our unique space is going to zero. 
2071 */ 2072 if (ds->ds_reserved) { 2073 int64_t delta; 2074 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 2075 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2076 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, 2077 delta, 0, 0, tx); 2078 } 2079 2080 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2081 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", 2082 ds->ds_dir->dd_myname, snapname, dsobj, 2083 ds->ds_phys->ds_prev_snap_txg); 2084 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, 2085 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); 2086 dsl_deadlist_close(&ds->ds_deadlist); 2087 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 2088 dsl_deadlist_add_key(&ds->ds_deadlist, 2089 ds->ds_phys->ds_prev_snap_txg, tx); 2090 2091 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); 2092 ds->ds_phys->ds_prev_snap_obj = dsobj; 2093 ds->ds_phys->ds_prev_snap_txg = crtxg; 2094 ds->ds_phys->ds_unique_bytes = 0; 2095 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 2096 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 2097 2098 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, 2099 snapname, 8, 1, &dsobj, tx); 2100 ASSERT(err == 0); 2101 2102 if (ds->ds_prev) 2103 dsl_dataset_drop_ref(ds->ds_prev, ds); 2104 VERIFY(0 == dsl_dataset_get_ref(dp, 2105 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 2106 2107 dsl_scan_ds_snapshotted(ds, tx); 2108 2109 dsl_dir_snap_cmtime_update(ds->ds_dir); 2110 2111 spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, 2112 "dataset = %llu", dsobj); 2113 } 2114 2115 void 2116 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) 2117 { 2118 ASSERT(dmu_tx_is_syncing(tx)); 2119 ASSERT(ds->ds_objset != NULL); 2120 ASSERT(ds->ds_phys->ds_next_snap_obj == 0); 2121 2122 /* 2123 * in case we had to change ds_fsid_guid when we opened it, 2124 * sync it out now. 2125 */ 2126 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2127 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; 2128 2129 dsl_dir_dirty(ds->ds_dir, tx); 2130 dmu_objset_sync(ds->ds_objset, zio, tx); 2131 } 2132 2133 static void 2134 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) 2135 { 2136 uint64_t count = 0; 2137 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 2138 zap_cursor_t zc; 2139 zap_attribute_t za; 2140 nvlist_t *propval; 2141 nvlist_t *val; 2142 2143 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2144 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2145 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2146 2147 /* 2148 * There may me missing entries in ds_next_clones_obj 2149 * due to a bug in a previous version of the code. 2150 * Only trust it if it has the right number of entries. 
2151 */ 2152 if (ds->ds_phys->ds_next_clones_obj != 0) { 2153 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, 2154 &count)); 2155 } 2156 if (count != ds->ds_phys->ds_num_children - 1) { 2157 goto fail; 2158 } 2159 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); 2160 zap_cursor_retrieve(&zc, &za) == 0; 2161 zap_cursor_advance(&zc)) { 2162 dsl_dataset_t *clone; 2163 char buf[ZFS_MAXNAMELEN]; 2164 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 2165 za.za_first_integer, FTAG, &clone) != 0) { 2166 goto fail; 2167 } 2168 dsl_dir_name(clone->ds_dir, buf); 2169 VERIFY(nvlist_add_boolean(val, buf) == 0); 2170 dsl_dataset_rele(clone, FTAG); 2171 } 2172 zap_cursor_fini(&zc); 2173 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); 2174 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), 2175 propval) == 0); 2176 fail: 2177 nvlist_free(val); 2178 nvlist_free(propval); 2179 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2180 } 2181 2182 void 2183 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2184 { 2185 uint64_t refd, avail, uobjs, aobjs, ratio; 2186 2187 dsl_dir_stats(ds->ds_dir, nv); 2188 2189 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2190 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2191 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2192 2193 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2194 ds->ds_phys->ds_creation_time); 2195 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2196 ds->ds_phys->ds_creation_txg); 2197 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2198 ds->ds_quota); 2199 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2200 ds->ds_reserved); 2201 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2202 ds->ds_phys->ds_guid); 2203 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, 2204 ds->ds_phys->ds_unique_bytes); 2205 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, 2206 ds->ds_object); 2207 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, 2208 ds->ds_userrefs); 2209 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2210 DS_IS_DEFER_DESTROY(ds) ? 1 : 0); 2211 2212 if (ds->ds_phys->ds_prev_snap_obj != 0) { 2213 uint64_t written, comp, uncomp; 2214 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2215 dsl_dataset_t *prev; 2216 2217 rw_enter(&dp->dp_config_rwlock, RW_READER); 2218 int err = dsl_dataset_hold_obj(dp, 2219 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 2220 rw_exit(&dp->dp_config_rwlock); 2221 if (err == 0) { 2222 err = dsl_dataset_space_written(prev, ds, &written, 2223 &comp, &uncomp); 2224 dsl_dataset_rele(prev, FTAG); 2225 if (err == 0) { 2226 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, 2227 written); 2228 } 2229 } 2230 } 2231 2232 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : 2233 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2234 ds->ds_phys->ds_compressed_bytes); 2235 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); 2236 2237 if (ds->ds_phys->ds_next_snap_obj) { 2238 /* 2239 * This is a snapshot; override the dd's space used with 2240 * our unique space and compression ratio. 
2241 */ 2242 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2243 ds->ds_phys->ds_unique_bytes); 2244 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); 2245 2246 get_clones_stat(ds, nv); 2247 } 2248 } 2249 2250 void 2251 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2252 { 2253 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2254 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2255 stat->dds_guid = ds->ds_phys->ds_guid; 2256 if (ds->ds_phys->ds_next_snap_obj) { 2257 stat->dds_is_snapshot = B_TRUE; 2258 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2259 } else { 2260 stat->dds_is_snapshot = B_FALSE; 2261 stat->dds_num_clones = 0; 2262 } 2263 2264 /* clone origin is really a dsl_dir thing... */ 2265 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2266 if (dsl_dir_is_clone(ds->ds_dir)) { 2267 dsl_dataset_t *ods; 2268 2269 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2270 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2271 dsl_dataset_name(ods, stat->dds_origin); 2272 dsl_dataset_drop_ref(ods, FTAG); 2273 } else { 2274 stat->dds_origin[0] = '\0'; 2275 } 2276 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2277 } 2278 2279 uint64_t 2280 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2281 { 2282 return (ds->ds_fsid_guid); 2283 } 2284 2285 void 2286 dsl_dataset_space(dsl_dataset_t *ds, 2287 uint64_t *refdbytesp, uint64_t *availbytesp, 2288 uint64_t *usedobjsp, uint64_t *availobjsp) 2289 { 2290 *refdbytesp = ds->ds_phys->ds_used_bytes; 2291 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2292 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2293 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2294 if (ds->ds_quota != 0) { 2295 /* 2296 * Adjust available bytes according to refquota 2297 */ 2298 if (*refdbytesp < ds->ds_quota) 2299 *availbytesp = MIN(*availbytesp, 2300 ds->ds_quota - *refdbytesp); 2301 else 2302 *availbytesp = 0; 2303 } 2304 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2305 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2306 } 2307 2308 boolean_t 2309 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2310 { 2311 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2312 2313 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2314 dsl_pool_sync_context(dp)); 2315 if (ds->ds_prev == NULL) 2316 return (B_FALSE); 2317 if (ds->ds_phys->ds_bp.blk_birth > 2318 ds->ds_prev->ds_phys->ds_creation_txg) { 2319 objset_t *os, *os_prev; 2320 /* 2321 * It may be that only the ZIL differs, because it was 2322 * reset in the head. Don't count that as being 2323 * modified. 
2324 */ 2325 if (dmu_objset_from_ds(ds, &os) != 0) 2326 return (B_TRUE); 2327 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) 2328 return (B_TRUE); 2329 return (bcmp(&os->os_phys->os_meta_dnode, 2330 &os_prev->os_phys->os_meta_dnode, 2331 sizeof (os->os_phys->os_meta_dnode)) != 0); 2332 } 2333 return (B_FALSE); 2334 } 2335 2336 /* ARGSUSED */ 2337 static int 2338 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2339 { 2340 dsl_dataset_t *ds = arg1; 2341 char *newsnapname = arg2; 2342 dsl_dir_t *dd = ds->ds_dir; 2343 dsl_dataset_t *hds; 2344 uint64_t val; 2345 int err; 2346 2347 err = dsl_dataset_hold_obj(dd->dd_pool, 2348 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2349 if (err) 2350 return (err); 2351 2352 /* new name better not be in use */ 2353 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2354 dsl_dataset_rele(hds, FTAG); 2355 2356 if (err == 0) 2357 err = EEXIST; 2358 else if (err == ENOENT) 2359 err = 0; 2360 2361 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2362 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2363 err = ENAMETOOLONG; 2364 2365 return (err); 2366 } 2367 2368 static void 2369 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2370 { 2371 dsl_dataset_t *ds = arg1; 2372 const char *newsnapname = arg2; 2373 dsl_dir_t *dd = ds->ds_dir; 2374 objset_t *mos = dd->dd_pool->dp_meta_objset; 2375 dsl_dataset_t *hds; 2376 int err; 2377 2378 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2379 2380 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2381 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2382 2383 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2384 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2385 ASSERT3U(err, ==, 0); 2386 mutex_enter(&ds->ds_lock); 2387 (void) strcpy(ds->ds_snapname, newsnapname); 2388 mutex_exit(&ds->ds_lock); 2389 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2390 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2391 ASSERT3U(err, ==, 0); 2392 2393 spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, 2394 "dataset = %llu", ds->ds_object); 2395 dsl_dataset_rele(hds, FTAG); 2396 } 2397 2398 struct renamesnaparg { 2399 dsl_sync_task_group_t *dstg; 2400 char failed[MAXPATHLEN]; 2401 char *oldsnap; 2402 char *newsnap; 2403 }; 2404 2405 static int 2406 dsl_snapshot_rename_one(const char *name, void *arg) 2407 { 2408 struct renamesnaparg *ra = arg; 2409 dsl_dataset_t *ds = NULL; 2410 char *snapname; 2411 int err; 2412 2413 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); 2414 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); 2415 2416 /* 2417 * For recursive snapshot renames the parent won't be changing 2418 * so we just pass name for both the to/from argument. 2419 */ 2420 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); 2421 if (err != 0) { 2422 strfree(snapname); 2423 return (err == ENOENT ? 0 : err); 2424 } 2425 2426 #ifdef _KERNEL 2427 /* 2428 * For all filesystems undergoing rename, we'll need to unmount it. 2429 */ 2430 (void) zfs_unmount_snap(snapname, NULL); 2431 #endif 2432 err = dsl_dataset_hold(snapname, ra->dstg, &ds); 2433 strfree(snapname); 2434 if (err != 0) 2435 return (err == ENOENT ? 
0 : err); 2436 2437 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2438 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2439 2440 return (0); 2441 } 2442 2443 static int 2444 dsl_recursive_rename(char *oldname, const char *newname) 2445 { 2446 int err; 2447 struct renamesnaparg *ra; 2448 dsl_sync_task_t *dst; 2449 spa_t *spa; 2450 char *cp, *fsname = spa_strdup(oldname); 2451 int len = strlen(oldname) + 1; 2452 2453 /* truncate the snapshot name to get the fsname */ 2454 cp = strchr(fsname, '@'); 2455 *cp = '\0'; 2456 2457 err = spa_open(fsname, &spa, FTAG); 2458 if (err) { 2459 kmem_free(fsname, len); 2460 return (err); 2461 } 2462 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2463 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2464 2465 ra->oldsnap = strchr(oldname, '@') + 1; 2466 ra->newsnap = strchr(newname, '@') + 1; 2467 *ra->failed = '\0'; 2468 2469 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2470 DS_FIND_CHILDREN); 2471 kmem_free(fsname, len); 2472 2473 if (err == 0) { 2474 err = dsl_sync_task_group_wait(ra->dstg); 2475 } 2476 2477 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2478 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2479 dsl_dataset_t *ds = dst->dst_arg1; 2480 if (dst->dst_err) { 2481 dsl_dir_name(ds->ds_dir, ra->failed); 2482 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2483 (void) strlcat(ra->failed, ra->newsnap, 2484 sizeof (ra->failed)); 2485 } 2486 dsl_dataset_rele(ds, ra->dstg); 2487 } 2488 2489 if (err) 2490 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2491 2492 dsl_sync_task_group_destroy(ra->dstg); 2493 kmem_free(ra, sizeof (struct renamesnaparg)); 2494 spa_close(spa, FTAG); 2495 return (err); 2496 } 2497 2498 static int 2499 dsl_valid_rename(const char *oldname, void *arg) 2500 { 2501 int delta = *(int *)arg; 2502 2503 if (strlen(oldname) + delta >= MAXNAMELEN) 2504 return (ENAMETOOLONG); 2505 2506 return (0); 2507 } 2508 2509 #pragma weak dmu_objset_rename = dsl_dataset_rename 2510 int 2511 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2512 { 2513 dsl_dir_t *dd; 2514 dsl_dataset_t *ds; 2515 const char *tail; 2516 int err; 2517 2518 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2519 if (err) 2520 return (err); 2521 2522 if (tail == NULL) { 2523 int delta = strlen(newname) - strlen(oldname); 2524 2525 /* if we're growing, validate child name lengths */ 2526 if (delta > 0) 2527 err = dmu_objset_find(oldname, dsl_valid_rename, 2528 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2529 2530 if (err == 0) 2531 err = dsl_dir_rename(dd, newname); 2532 dsl_dir_close(dd, FTAG); 2533 return (err); 2534 } 2535 2536 if (tail[0] != '@') { 2537 /* the name ended in a nonexistent component */ 2538 dsl_dir_close(dd, FTAG); 2539 return (ENOENT); 2540 } 2541 2542 dsl_dir_close(dd, FTAG); 2543 2544 /* new name must be snapshot in same filesystem */ 2545 tail = strchr(newname, '@'); 2546 if (tail == NULL) 2547 return (EINVAL); 2548 tail++; 2549 if (strncmp(oldname, newname, tail - newname) != 0) 2550 return (EXDEV); 2551 2552 if (recursive) { 2553 err = dsl_recursive_rename(oldname, newname); 2554 } else { 2555 err = dsl_dataset_hold(oldname, FTAG, &ds); 2556 if (err) 2557 return (err); 2558 2559 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2560 dsl_dataset_snapshot_rename_check, 2561 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2562 2563 dsl_dataset_rele(ds, FTAG); 2564 } 2565 2566 return (err); 2567 } 2568 2569 struct promotenode { 2570 list_node_t 
link; 2571 dsl_dataset_t *ds; 2572 }; 2573 2574 struct promotearg { 2575 list_t shared_snaps, origin_snaps, clone_snaps; 2576 dsl_dataset_t *origin_origin; 2577 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2578 char *err_ds; 2579 }; 2580 2581 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2582 static boolean_t snaplist_unstable(list_t *l); 2583 2584 static int 2585 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2586 { 2587 dsl_dataset_t *hds = arg1; 2588 struct promotearg *pa = arg2; 2589 struct promotenode *snap = list_head(&pa->shared_snaps); 2590 dsl_dataset_t *origin_ds = snap->ds; 2591 int err; 2592 uint64_t unused; 2593 2594 /* Check that it is a real clone */ 2595 if (!dsl_dir_is_clone(hds->ds_dir)) 2596 return (EINVAL); 2597 2598 /* Since this is so expensive, don't do the preliminary check */ 2599 if (!dmu_tx_is_syncing(tx)) 2600 return (0); 2601 2602 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2603 return (EXDEV); 2604 2605 /* compute origin's new unique space */ 2606 snap = list_tail(&pa->clone_snaps); 2607 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2608 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2609 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2610 &pa->unique, &unused, &unused); 2611 2612 /* 2613 * Walk the snapshots that we are moving 2614 * 2615 * Compute space to transfer. Consider the incremental changes 2616 * to used for each snapshot: 2617 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2618 * So each snapshot gave birth to: 2619 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2620 * So a sequence would look like: 2621 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2622 * Which simplifies to: 2623 * uN + kN + kN-1 + ... + k1 + k0 2624 * Note however, if we stop before we reach the ORIGIN we get: 2625 * uN + kN + kN-1 + ... + kM - uM-1 2626 */ 2627 pa->used = origin_ds->ds_phys->ds_used_bytes; 2628 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2629 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2630 for (snap = list_head(&pa->shared_snaps); snap; 2631 snap = list_next(&pa->shared_snaps, snap)) { 2632 uint64_t val, dlused, dlcomp, dluncomp; 2633 dsl_dataset_t *ds = snap->ds; 2634 2635 /* Check that the snapshot name does not conflict */ 2636 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2637 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2638 if (err == 0) { 2639 err = EEXIST; 2640 goto out; 2641 } 2642 if (err != ENOENT) 2643 goto out; 2644 2645 /* The very first snapshot does not have a deadlist */ 2646 if (ds->ds_phys->ds_prev_snap_obj == 0) 2647 continue; 2648 2649 dsl_deadlist_space(&ds->ds_deadlist, 2650 &dlused, &dlcomp, &dluncomp); 2651 pa->used += dlused; 2652 pa->comp += dlcomp; 2653 pa->uncomp += dluncomp; 2654 } 2655 2656 /* 2657 * If we are a clone of a clone then we never reached ORIGIN, 2658 * so we need to subtract out the clone origin's used space. 
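 *
 * A worked instance of the sum above (hypothetical figures): if
 * the origin snapshot uses uN = 5G and the walked snapshots'
 * deadlists hold 3G, 2G and 1G of killed blocks, the space to
 * transfer is uN + kN + ... + k0 = 5G + 3G + 2G + 1G = 11G; a
 * clone of a clone then subtracts the uM-1 term, i.e. its origin's
 * origin used space, which is what happens below.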
2659 */ 2660 if (pa->origin_origin) { 2661 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; 2662 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2663 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2664 } 2665 2666 /* Check that there is enough space here */ 2667 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2668 pa->used); 2669 if (err) 2670 return (err); 2671 2672 /* 2673 * Compute the amounts of space that will be used by snapshots 2674 * after the promotion (for both origin and clone). For each, 2675 * it is the amount of space that will be on all of their 2676 * deadlists (that was not born before their new origin). 2677 */ 2678 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2679 uint64_t space; 2680 2681 /* 2682 * Note, typically this will not be a clone of a clone, 2683 * so dd_origin_txg will be < TXG_INITIAL, so 2684 * these snaplist_space() -> dsl_deadlist_space_range() 2685 * calls will be fast because they do not have to 2686 * iterate over all bps. 2687 */ 2688 snap = list_head(&pa->origin_snaps); 2689 err = snaplist_space(&pa->shared_snaps, 2690 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); 2691 if (err) 2692 return (err); 2693 2694 err = snaplist_space(&pa->clone_snaps, 2695 snap->ds->ds_dir->dd_origin_txg, &space); 2696 if (err) 2697 return (err); 2698 pa->cloneusedsnap += space; 2699 } 2700 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2701 err = snaplist_space(&pa->origin_snaps, 2702 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2703 if (err) 2704 return (err); 2705 } 2706 2707 return (0); 2708 out: 2709 pa->err_ds = snap->ds->ds_snapname; 2710 return (err); 2711 } 2712 2713 static void 2714 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2715 { 2716 dsl_dataset_t *hds = arg1; 2717 struct promotearg *pa = arg2; 2718 struct promotenode *snap = list_head(&pa->shared_snaps); 2719 dsl_dataset_t *origin_ds = snap->ds; 2720 dsl_dataset_t *origin_head; 2721 dsl_dir_t *dd = hds->ds_dir; 2722 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2723 dsl_dir_t *odd = NULL; 2724 uint64_t oldnext_obj; 2725 int64_t delta; 2726 2727 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2728 2729 snap = list_head(&pa->origin_snaps); 2730 origin_head = snap->ds; 2731 2732 /* 2733 * We need to explicitly open odd, since origin_ds's dd will be 2734 * changing. 
2735 */ 2736 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2737 NULL, FTAG, &odd)); 2738 2739 /* change origin's next snap */ 2740 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2741 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2742 snap = list_tail(&pa->clone_snaps); 2743 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2744 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2745 2746 /* change the origin's next clone */ 2747 if (origin_ds->ds_phys->ds_next_clones_obj) { 2748 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2749 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2750 origin_ds->ds_phys->ds_next_clones_obj, 2751 oldnext_obj, tx)); 2752 } 2753 2754 /* change origin */ 2755 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2756 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2757 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2758 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; 2759 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2760 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2761 origin_head->ds_dir->dd_origin_txg = 2762 origin_ds->ds_phys->ds_creation_txg; 2763 2764 /* change dd_clone entries */ 2765 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2766 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2767 odd->dd_phys->dd_clones, hds->ds_object, tx)); 2768 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2769 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2770 hds->ds_object, tx)); 2771 2772 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2773 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2774 origin_head->ds_object, tx)); 2775 if (dd->dd_phys->dd_clones == 0) { 2776 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, 2777 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 2778 } 2779 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2780 dd->dd_phys->dd_clones, origin_head->ds_object, tx)); 2781 2782 } 2783 2784 /* move snapshots to this dir */ 2785 for (snap = list_head(&pa->shared_snaps); snap; 2786 snap = list_next(&pa->shared_snaps, snap)) { 2787 dsl_dataset_t *ds = snap->ds; 2788 2789 /* unregister props as dsl_dir is changing */ 2790 if (ds->ds_objset) { 2791 dmu_objset_evict(ds->ds_objset); 2792 ds->ds_objset = NULL; 2793 } 2794 /* move snap name entry */ 2795 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2796 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 2797 ds->ds_snapname, tx)); 2798 VERIFY(0 == zap_add(dp->dp_meta_objset, 2799 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 2800 8, 1, &ds->ds_object, tx)); 2801 2802 /* change containing dsl_dir */ 2803 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2804 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 2805 ds->ds_phys->ds_dir_obj = dd->dd_object; 2806 ASSERT3P(ds->ds_dir, ==, odd); 2807 dsl_dir_close(ds->ds_dir, ds); 2808 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 2809 NULL, ds, &ds->ds_dir)); 2810 2811 /* move any clone references */ 2812 if (ds->ds_phys->ds_next_clones_obj && 2813 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2814 zap_cursor_t zc; 2815 zap_attribute_t za; 2816 2817 for (zap_cursor_init(&zc, dp->dp_meta_objset, 2818 ds->ds_phys->ds_next_clones_obj); 2819 zap_cursor_retrieve(&zc, &za) == 0; 2820 zap_cursor_advance(&zc)) { 2821 dsl_dataset_t *cnds; 2822 uint64_t o; 2823 2824 if (za.za_first_integer == oldnext_obj) { 2825 /* 2826 * We've already moved the 2827 * origin's reference. 
2828 */ 2829 continue; 2830 } 2831 2832 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 2833 za.za_first_integer, FTAG, &cnds)); 2834 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; 2835 2836 VERIFY3U(zap_remove_int(dp->dp_meta_objset, 2837 odd->dd_phys->dd_clones, o, tx), ==, 0); 2838 VERIFY3U(zap_add_int(dp->dp_meta_objset, 2839 dd->dd_phys->dd_clones, o, tx), ==, 0); 2840 dsl_dataset_rele(cnds, FTAG); 2841 } 2842 zap_cursor_fini(&zc); 2843 } 2844 2845 ASSERT3U(dsl_prop_numcb(ds), ==, 0); 2846 } 2847 2848 /* 2849 * Change space accounting. 2850 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 2851 * both be valid, or both be 0 (resulting in delta == 0). This 2852 * is true for each of {clone,origin} independently. 2853 */ 2854 2855 delta = pa->cloneusedsnap - 2856 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2857 ASSERT3S(delta, >=, 0); 2858 ASSERT3U(pa->used, >=, delta); 2859 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 2860 dsl_dir_diduse_space(dd, DD_USED_HEAD, 2861 pa->used - delta, pa->comp, pa->uncomp, tx); 2862 2863 delta = pa->originusedsnap - 2864 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2865 ASSERT3S(delta, <=, 0); 2866 ASSERT3U(pa->used, >=, -delta); 2867 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 2868 dsl_dir_diduse_space(odd, DD_USED_HEAD, 2869 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 2870 2871 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 2872 2873 /* log history record */ 2874 spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, 2875 "dataset = %llu", hds->ds_object); 2876 2877 dsl_dir_close(odd, FTAG); 2878 } 2879 2880 static char *snaplist_tag = "snaplist"; 2881 /* 2882 * Make a list of dsl_dataset_t's for the snapshots between first_obj 2883 * (exclusive) and last_obj (inclusive). The list will be in reverse 2884 * order (last_obj will be the list_head()). If first_obj == 0, do all 2885 * snapshots back to this dataset's origin. 
2886 */ 2887 static int 2888 snaplist_make(dsl_pool_t *dp, boolean_t own, 2889 uint64_t first_obj, uint64_t last_obj, list_t *l) 2890 { 2891 uint64_t obj = last_obj; 2892 2893 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 2894 2895 list_create(l, sizeof (struct promotenode), 2896 offsetof(struct promotenode, link)); 2897 2898 while (obj != first_obj) { 2899 dsl_dataset_t *ds; 2900 struct promotenode *snap; 2901 int err; 2902 2903 if (own) { 2904 err = dsl_dataset_own_obj(dp, obj, 2905 0, snaplist_tag, &ds); 2906 if (err == 0) 2907 dsl_dataset_make_exclusive(ds, snaplist_tag); 2908 } else { 2909 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 2910 } 2911 if (err == ENOENT) { 2912 /* lost race with snapshot destroy */ 2913 struct promotenode *last = list_tail(l); 2914 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 2915 obj = last->ds->ds_phys->ds_prev_snap_obj; 2916 continue; 2917 } else if (err) { 2918 return (err); 2919 } 2920 2921 if (first_obj == 0) 2922 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 2923 2924 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 2925 snap->ds = ds; 2926 list_insert_tail(l, snap); 2927 obj = ds->ds_phys->ds_prev_snap_obj; 2928 } 2929 2930 return (0); 2931 } 2932 2933 static int 2934 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 2935 { 2936 struct promotenode *snap; 2937 2938 *spacep = 0; 2939 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 2940 uint64_t used, comp, uncomp; 2941 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2942 mintxg, UINT64_MAX, &used, &comp, &uncomp); 2943 *spacep += used; 2944 } 2945 return (0); 2946 } 2947 2948 static void 2949 snaplist_destroy(list_t *l, boolean_t own) 2950 { 2951 struct promotenode *snap; 2952 2953 if (!l || !list_link_active(&l->list_head)) 2954 return; 2955 2956 while ((snap = list_tail(l)) != NULL) { 2957 list_remove(l, snap); 2958 if (own) 2959 dsl_dataset_disown(snap->ds, snaplist_tag); 2960 else 2961 dsl_dataset_rele(snap->ds, snaplist_tag); 2962 kmem_free(snap, sizeof (struct promotenode)); 2963 } 2964 list_destroy(l); 2965 } 2966 2967 /* 2968 * Promote a clone. Nomenclature note: 2969 * "clone" or "cds": the original clone which is being promoted 2970 * "origin" or "ods": the snapshot which is originally clone's origin 2971 * "origin head" or "ohds": the dataset which is the head 2972 * (filesystem/volume) for the origin 2973 * "origin origin": the origin of the origin's filesystem (typically 2974 * NULL, indicating that the clone is not a clone of a clone). 2975 */ 2976 int 2977 dsl_dataset_promote(const char *name, char *conflsnap) 2978 { 2979 dsl_dataset_t *ds; 2980 dsl_dir_t *dd; 2981 dsl_pool_t *dp; 2982 dmu_object_info_t doi; 2983 struct promotearg pa = { 0 }; 2984 struct promotenode *snap; 2985 int err; 2986 2987 err = dsl_dataset_hold(name, FTAG, &ds); 2988 if (err) 2989 return (err); 2990 dd = ds->ds_dir; 2991 dp = dd->dd_pool; 2992 2993 err = dmu_object_info(dp->dp_meta_objset, 2994 ds->ds_phys->ds_snapnames_zapobj, &doi); 2995 if (err) { 2996 dsl_dataset_rele(ds, FTAG); 2997 return (err); 2998 } 2999 3000 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 3001 dsl_dataset_rele(ds, FTAG); 3002 return (EINVAL); 3003 } 3004 3005 /* 3006 * We are going to inherit all the snapshots taken before our 3007 * origin (i.e., our new origin will be our parent's origin). 3008 * Take ownership of them so that we can rename them into our 3009 * namespace. 
3010 */ 3011 rw_enter(&dp->dp_config_rwlock, RW_READER); 3012 3013 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 3014 &pa.shared_snaps); 3015 if (err != 0) 3016 goto out; 3017 3018 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 3019 if (err != 0) 3020 goto out; 3021 3022 snap = list_head(&pa.shared_snaps); 3023 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 3024 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 3025 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 3026 if (err != 0) 3027 goto out; 3028 3029 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { 3030 err = dsl_dataset_hold_obj(dp, 3031 snap->ds->ds_dir->dd_phys->dd_origin_obj, 3032 FTAG, &pa.origin_origin); 3033 if (err != 0) 3034 goto out; 3035 } 3036 3037 out: 3038 rw_exit(&dp->dp_config_rwlock); 3039 3040 /* 3041 * Add in 128x the snapnames zapobj size, since we will be moving 3042 * a bunch of snapnames to the promoted ds, and dirtying their 3043 * bonus buffers. 3044 */ 3045 if (err == 0) { 3046 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 3047 dsl_dataset_promote_sync, ds, &pa, 3048 2 + 2 * doi.doi_physical_blocks_512); 3049 if (err && pa.err_ds && conflsnap) 3050 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 3051 } 3052 3053 snaplist_destroy(&pa.shared_snaps, B_TRUE); 3054 snaplist_destroy(&pa.clone_snaps, B_FALSE); 3055 snaplist_destroy(&pa.origin_snaps, B_FALSE); 3056 if (pa.origin_origin) 3057 dsl_dataset_rele(pa.origin_origin, FTAG); 3058 dsl_dataset_rele(ds, FTAG); 3059 return (err); 3060 } 3061 3062 struct cloneswaparg { 3063 dsl_dataset_t *cds; /* clone dataset */ 3064 dsl_dataset_t *ohds; /* origin's head dataset */ 3065 boolean_t force; 3066 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 3067 }; 3068 3069 /* ARGSUSED */ 3070 static int 3071 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 3072 { 3073 struct cloneswaparg *csa = arg1; 3074 3075 /* they should both be heads */ 3076 if (dsl_dataset_is_snapshot(csa->cds) || 3077 dsl_dataset_is_snapshot(csa->ohds)) 3078 return (EINVAL); 3079 3080 /* the branch point should be just before them */ 3081 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3082 return (EINVAL); 3083 3084 /* cds should be the clone (unless they are unrelated) */ 3085 if (csa->cds->ds_prev != NULL && 3086 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 3087 csa->ohds->ds_object != 3088 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 3089 return (EINVAL); 3090 3091 /* the clone should be a child of the origin */ 3092 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3093 return (EINVAL); 3094 3095 /* ohds shouldn't be modified unless 'force' */ 3096 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3097 return (ETXTBSY); 3098 3099 /* adjust amount of any unconsumed refreservation */ 3100 csa->unused_refres_delta = 3101 (int64_t)MIN(csa->ohds->ds_reserved, 3102 csa->ohds->ds_phys->ds_unique_bytes) - 3103 (int64_t)MIN(csa->ohds->ds_reserved, 3104 csa->cds->ds_phys->ds_unique_bytes); 3105 3106 if (csa->unused_refres_delta > 0 && 3107 csa->unused_refres_delta > 3108 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3109 return (ENOSPC); 3110 3111 if (csa->ohds->ds_quota != 0 && 3112 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 3113 return (EDQUOT); 3114 3115 return (0); 3116 } 3117 3118 /* ARGSUSED */ 3119 static void 3120 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3121 { 3122 struct 
cloneswaparg *csa = arg1; 3123 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 3124 3125 ASSERT(csa->cds->ds_reserved == 0); 3126 ASSERT(csa->ohds->ds_quota == 0 || 3127 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 3128 3129 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 3130 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 3131 3132 if (csa->cds->ds_objset != NULL) { 3133 dmu_objset_evict(csa->cds->ds_objset); 3134 csa->cds->ds_objset = NULL; 3135 } 3136 3137 if (csa->ohds->ds_objset != NULL) { 3138 dmu_objset_evict(csa->ohds->ds_objset); 3139 csa->ohds->ds_objset = NULL; 3140 } 3141 3142 /* 3143 * Reset origin's unique bytes, if it exists. 3144 */ 3145 if (csa->cds->ds_prev) { 3146 dsl_dataset_t *origin = csa->cds->ds_prev; 3147 uint64_t comp, uncomp; 3148 3149 dmu_buf_will_dirty(origin->ds_dbuf, tx); 3150 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3151 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 3152 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); 3153 } 3154 3155 /* swap blkptrs */ 3156 { 3157 blkptr_t tmp; 3158 tmp = csa->ohds->ds_phys->ds_bp; 3159 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 3160 csa->cds->ds_phys->ds_bp = tmp; 3161 } 3162 3163 /* set dd_*_bytes */ 3164 { 3165 int64_t dused, dcomp, duncomp; 3166 uint64_t cdl_used, cdl_comp, cdl_uncomp; 3167 uint64_t odl_used, odl_comp, odl_uncomp; 3168 3169 ASSERT3U(csa->cds->ds_dir->dd_phys-> 3170 dd_used_breakdown[DD_USED_SNAP], ==, 0); 3171 3172 dsl_deadlist_space(&csa->cds->ds_deadlist, 3173 &cdl_used, &cdl_comp, &cdl_uncomp); 3174 dsl_deadlist_space(&csa->ohds->ds_deadlist, 3175 &odl_used, &odl_comp, &odl_uncomp); 3176 3177 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - 3178 (csa->ohds->ds_phys->ds_used_bytes + odl_used); 3179 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 3180 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 3181 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 3182 cdl_uncomp - 3183 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 3184 3185 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 3186 dused, dcomp, duncomp, tx); 3187 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 3188 -dused, -dcomp, -duncomp, tx); 3189 3190 /* 3191 * The difference in the space used by snapshots is the 3192 * difference in snapshot space due to the head's 3193 * deadlist (since that's the only thing that's 3194 * changing that affects the snapused). 3195 */ 3196 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3197 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3198 &cdl_used, &cdl_comp, &cdl_uncomp); 3199 dsl_deadlist_space_range(&csa->ohds->ds_deadlist, 3200 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3201 &odl_used, &odl_comp, &odl_uncomp); 3202 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 3203 DD_USED_HEAD, DD_USED_SNAP, tx); 3204 } 3205 3206 /* swap ds_*_bytes */ 3207 SWITCH64(csa->ohds->ds_phys->ds_used_bytes, 3208 csa->cds->ds_phys->ds_used_bytes); 3209 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 3210 csa->cds->ds_phys->ds_compressed_bytes); 3211 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 3212 csa->cds->ds_phys->ds_uncompressed_bytes); 3213 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 3214 csa->cds->ds_phys->ds_unique_bytes); 3215 3216 /* apply any parent delta for change in unconsumed refreservation */ 3217 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 3218 csa->unused_refres_delta, 0, 0, tx); 3219 3220 /* 3221 * Swap deadlists. 
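 *
 * The deadlists travel with the block trees swapped above: after
 * the ds_bp swap, each head's dead (no-longer-referenced) blocks
 * are the ones described by the other head's deadlist, so the
 * deadlist objects are exchanged and reopened as well.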
3222 */ 3223 dsl_deadlist_close(&csa->cds->ds_deadlist); 3224 dsl_deadlist_close(&csa->ohds->ds_deadlist); 3225 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 3226 csa->cds->ds_phys->ds_deadlist_obj); 3227 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 3228 csa->cds->ds_phys->ds_deadlist_obj); 3229 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 3230 csa->ohds->ds_phys->ds_deadlist_obj); 3231 3232 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); 3233 } 3234 3235 /* 3236 * Swap 'clone' with its origin head datasets. Used at the end of "zfs 3237 * recv" into an existing fs to swizzle the file system to the new 3238 * version, and by "zfs rollback". Can also be used to swap two 3239 * independent head datasets if neither has any snapshots. 3240 */ 3241 int 3242 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 3243 boolean_t force) 3244 { 3245 struct cloneswaparg csa; 3246 int error; 3247 3248 ASSERT(clone->ds_owner); 3249 ASSERT(origin_head->ds_owner); 3250 retry: 3251 /* 3252 * Need exclusive access for the swap. If we're swapping these 3253 * datasets back after an error, we already hold the locks. 3254 */ 3255 if (!RW_WRITE_HELD(&clone->ds_rwlock)) 3256 rw_enter(&clone->ds_rwlock, RW_WRITER); 3257 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && 3258 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 3259 rw_exit(&clone->ds_rwlock); 3260 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 3261 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3262 rw_exit(&origin_head->ds_rwlock); 3263 goto retry; 3264 } 3265 } 3266 csa.cds = clone; 3267 csa.ohds = origin_head; 3268 csa.force = force; 3269 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3270 dsl_dataset_clone_swap_check, 3271 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3272 return (error); 3273 } 3274 3275 /* 3276 * Given a pool name and a dataset object number in that pool, 3277 * return the name of that dataset. 3278 */ 3279 int 3280 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3281 { 3282 spa_t *spa; 3283 dsl_pool_t *dp; 3284 dsl_dataset_t *ds; 3285 int error; 3286 3287 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3288 return (error); 3289 dp = spa_get_dsl(spa); 3290 rw_enter(&dp->dp_config_rwlock, RW_READER); 3291 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3292 dsl_dataset_name(ds, buf); 3293 dsl_dataset_rele(ds, FTAG); 3294 } 3295 rw_exit(&dp->dp_config_rwlock); 3296 spa_close(spa, FTAG); 3297 3298 return (error); 3299 } 3300 3301 int 3302 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3303 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3304 { 3305 int error = 0; 3306 3307 ASSERT3S(asize, >, 0); 3308 3309 /* 3310 * *ref_rsrv is the portion of asize that will come from any 3311 * unconsumed refreservation space. 3312 */ 3313 *ref_rsrv = 0; 3314 3315 mutex_enter(&ds->ds_lock); 3316 /* 3317 * Make a space adjustment for reserved bytes. 
3318 */ 3319 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { 3320 ASSERT3U(*used, >=, 3321 ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3322 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3323 *ref_rsrv = 3324 asize - MIN(asize, parent_delta(ds, asize + inflight)); 3325 } 3326 3327 if (!check_quota || ds->ds_quota == 0) { 3328 mutex_exit(&ds->ds_lock); 3329 return (0); 3330 } 3331 /* 3332 * If they are requesting more space, and our current estimate 3333 * is over quota, they get to try again unless the actual 3334 * on-disk is over quota and there are no pending changes (which 3335 * may free up space for us). 3336 */ 3337 if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { 3338 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) 3339 error = ERESTART; 3340 else 3341 error = EDQUOT; 3342 } 3343 mutex_exit(&ds->ds_lock); 3344 3345 return (error); 3346 } 3347 3348 /* ARGSUSED */ 3349 static int 3350 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) 3351 { 3352 dsl_dataset_t *ds = arg1; 3353 dsl_prop_setarg_t *psa = arg2; 3354 int err; 3355 3356 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) 3357 return (ENOTSUP); 3358 3359 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3360 return (err); 3361 3362 if (psa->psa_effective_value == 0) 3363 return (0); 3364 3365 if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || 3366 psa->psa_effective_value < ds->ds_reserved) 3367 return (ENOSPC); 3368 3369 return (0); 3370 } 3371 3372 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); 3373 3374 void 3375 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3376 { 3377 dsl_dataset_t *ds = arg1; 3378 dsl_prop_setarg_t *psa = arg2; 3379 uint64_t effective_value = psa->psa_effective_value; 3380 3381 dsl_prop_set_sync(ds, psa, tx); 3382 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3383 3384 if (ds->ds_quota != effective_value) { 3385 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3386 ds->ds_quota = effective_value; 3387 3388 spa_history_log_internal(LOG_DS_REFQUOTA, 3389 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", 3390 (longlong_t)ds->ds_quota, ds->ds_object); 3391 } 3392 } 3393 3394 int 3395 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) 3396 { 3397 dsl_dataset_t *ds; 3398 dsl_prop_setarg_t psa; 3399 int err; 3400 3401 dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); 3402 3403 err = dsl_dataset_hold(dsname, FTAG, &ds); 3404 if (err) 3405 return (err); 3406 3407 /* 3408 * If someone removes a file, then tries to set the quota, we 3409 * want to make sure the file freeing takes effect. 
3410 */ 3411 txg_wait_open(ds->ds_dir->dd_pool, 0); 3412 3413 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3414 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, 3415 ds, &psa, 0); 3416 3417 dsl_dataset_rele(ds, FTAG); 3418 return (err); 3419 } 3420 3421 static int 3422 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) 3423 { 3424 dsl_dataset_t *ds = arg1; 3425 dsl_prop_setarg_t *psa = arg2; 3426 uint64_t effective_value; 3427 uint64_t unique; 3428 int err; 3429 3430 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 3431 SPA_VERSION_REFRESERVATION) 3432 return (ENOTSUP); 3433 3434 if (dsl_dataset_is_snapshot(ds)) 3435 return (EINVAL); 3436 3437 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3438 return (err); 3439 3440 effective_value = psa->psa_effective_value; 3441 3442 /* 3443 * If we are doing the preliminary check in open context, the 3444 * space estimates may be inaccurate. 3445 */ 3446 if (!dmu_tx_is_syncing(tx)) 3447 return (0); 3448 3449 mutex_enter(&ds->ds_lock); 3450 if (!DS_UNIQUE_IS_ACCURATE(ds)) 3451 dsl_dataset_recalc_head_uniq(ds); 3452 unique = ds->ds_phys->ds_unique_bytes; 3453 mutex_exit(&ds->ds_lock); 3454 3455 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3456 uint64_t delta = MAX(unique, effective_value) - 3457 MAX(unique, ds->ds_reserved); 3458 3459 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3460 return (ENOSPC); 3461 if (ds->ds_quota > 0 && 3462 effective_value > ds->ds_quota) 3463 return (ENOSPC); 3464 } 3465 3466 return (0); 3467 } 3468 3469 static void 3470 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3471 { 3472 dsl_dataset_t *ds = arg1; 3473 dsl_prop_setarg_t *psa = arg2; 3474 uint64_t effective_value = psa->psa_effective_value; 3475 uint64_t unique; 3476 int64_t delta; 3477 3478 dsl_prop_set_sync(ds, psa, tx); 3479 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3480 3481 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3482 3483 mutex_enter(&ds->ds_dir->dd_lock); 3484 mutex_enter(&ds->ds_lock); 3485 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 3486 unique = ds->ds_phys->ds_unique_bytes; 3487 delta = MAX(0, (int64_t)(effective_value - unique)) - 3488 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3489 ds->ds_reserved = effective_value; 3490 mutex_exit(&ds->ds_lock); 3491 3492 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3493 mutex_exit(&ds->ds_dir->dd_lock); 3494 3495 spa_history_log_internal(LOG_DS_REFRESERV, 3496 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", 3497 (longlong_t)effective_value, ds->ds_object); 3498 } 3499 3500 int 3501 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3502 uint64_t reservation) 3503 { 3504 dsl_dataset_t *ds; 3505 dsl_prop_setarg_t psa; 3506 int err; 3507 3508 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3509 &reservation); 3510 3511 err = dsl_dataset_hold(dsname, FTAG, &ds); 3512 if (err) 3513 return (err); 3514 3515 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3516 dsl_dataset_set_reservation_check, 3517 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3518 3519 dsl_dataset_rele(ds, FTAG); 3520 return (err); 3521 } 3522 3523 typedef struct zfs_hold_cleanup_arg { 3524 dsl_pool_t *dp; 3525 uint64_t dsobj; 3526 char htag[MAXNAMELEN]; 3527 } zfs_hold_cleanup_arg_t; 3528 3529 static void 3530 dsl_dataset_user_release_onexit(void *arg) 3531 { 3532 zfs_hold_cleanup_arg_t *ca = arg; 3533 3534 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, 3535 B_TRUE); 3536 kmem_free(ca, 
sizeof (zfs_hold_cleanup_arg_t)); 3537 } 3538 3539 void 3540 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, 3541 minor_t minor) 3542 { 3543 zfs_hold_cleanup_arg_t *ca; 3544 3545 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); 3546 ca->dp = ds->ds_dir->dd_pool; 3547 ca->dsobj = ds->ds_object; 3548 (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); 3549 VERIFY3U(0, ==, zfs_onexit_add_cb(minor, 3550 dsl_dataset_user_release_onexit, ca, NULL)); 3551 } 3552 3553 /* 3554 * If you add new checks here, you may need to add 3555 * additional checks to the "temporary" case in 3556 * snapshot_check() in dmu_objset.c. 3557 */ 3558 static int 3559 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) 3560 { 3561 dsl_dataset_t *ds = arg1; 3562 struct dsl_ds_holdarg *ha = arg2; 3563 char *htag = ha->htag; 3564 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3565 int error = 0; 3566 uint64_t tmp; 3567 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3568 return (ENOTSUP); 3569 3570 if (!dsl_dataset_is_snapshot(ds)) 3571 return (EINVAL); 3572 3573 /* tags must be unique; we only probe for existence here */ 3574 mutex_enter(&ds->ds_lock); 3575 if (ds->ds_phys->ds_userrefs_obj) { 3576 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 3577 8, 1, &tmp); 3578 if (error == 0) 3579 error = EEXIST; 3580 else if (error == ENOENT) 3581 error = 0; 3582 } 3583 mutex_exit(&ds->ds_lock); 3584 3585 if (error == 0 && ha->temphold && 3586 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 3587 error = E2BIG; 3588 3589 return (error); 3590 } 3591 3592 void 3593 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3594 { 3595 dsl_dataset_t *ds = arg1; 3596 struct dsl_ds_holdarg *ha = arg2; 3597 char *htag = ha->htag; 3598 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3599 objset_t *mos = dp->dp_meta_objset; 3600 uint64_t now = gethrestime_sec(); 3601 uint64_t zapobj; 3602 3603 mutex_enter(&ds->ds_lock); 3604 if (ds->ds_phys->ds_userrefs_obj == 0) { 3605 /* 3606 * This is the first user hold for this dataset. Create 3607 * the userrefs zap object.
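 *
 * The userrefs object is a simple ZAP mapping each hold tag to
 * the time the hold was taken, e.g. (hypothetical entry):
 *
 *	"backup-job" -> 1314320400	(gethrestime_sec() at hold time)
 *
 * so a release can later find and remove the tag by name.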
3608 */ 3609 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3610 zapobj = ds->ds_phys->ds_userrefs_obj = 3611 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); 3612 } else { 3613 zapobj = ds->ds_phys->ds_userrefs_obj; 3614 } 3615 ds->ds_userrefs++; 3616 mutex_exit(&ds->ds_lock); 3617 3618 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); 3619 3620 if (ha->temphold) { 3621 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, 3622 htag, &now, tx)); 3623 } 3624 3625 spa_history_log_internal(LOG_DS_USER_HOLD, 3626 dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, 3627 (int)ha->temphold, ds->ds_object); 3628 } 3629 3630 static int 3631 dsl_dataset_user_hold_one(const char *dsname, void *arg) 3632 { 3633 struct dsl_ds_holdarg *ha = arg; 3634 dsl_dataset_t *ds; 3635 int error; 3636 char *name; 3637 3638 /* alloc a buffer to hold dsname@snapname plus terminating NULL */ 3639 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3640 error = dsl_dataset_hold(name, ha->dstg, &ds); 3641 strfree(name); 3642 if (error == 0) { 3643 ha->gotone = B_TRUE; 3644 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, 3645 dsl_dataset_user_hold_sync, ds, ha, 0); 3646 } else if (error == ENOENT && ha->recursive) { 3647 error = 0; 3648 } else { 3649 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3650 } 3651 return (error); 3652 } 3653 3654 int 3655 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, 3656 boolean_t temphold) 3657 { 3658 struct dsl_ds_holdarg *ha; 3659 int error; 3660 3661 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3662 ha->htag = htag; 3663 ha->temphold = temphold; 3664 error = dsl_sync_task_do(ds->ds_dir->dd_pool, 3665 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, 3666 ds, ha, 0); 3667 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3668 3669 return (error); 3670 } 3671 3672 int 3673 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, 3674 boolean_t recursive, boolean_t temphold, int cleanup_fd) 3675 { 3676 struct dsl_ds_holdarg *ha; 3677 dsl_sync_task_t *dst; 3678 spa_t *spa; 3679 int error; 3680 minor_t minor = 0; 3681 3682 if (cleanup_fd != -1) { 3683 /* Currently we only support cleanup-on-exit of tempholds. */ 3684 if (!temphold) 3685 return (EINVAL); 3686 error = zfs_onexit_fd_hold(cleanup_fd, &minor); 3687 if (error) 3688 return (error); 3689 } 3690 3691 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3692 3693 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3694 3695 error = spa_open(dsname, &spa, FTAG); 3696 if (error) { 3697 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3698 if (cleanup_fd != -1) 3699 zfs_onexit_fd_rele(cleanup_fd); 3700 return (error); 3701 } 3702 3703 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3704 ha->htag = htag; 3705 ha->snapname = snapname; 3706 ha->recursive = recursive; 3707 ha->temphold = temphold; 3708 3709 if (recursive) { 3710 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, 3711 ha, DS_FIND_CHILDREN); 3712 } else { 3713 error = dsl_dataset_user_hold_one(dsname, ha); 3714 } 3715 if (error == 0) 3716 error = dsl_sync_task_group_wait(ha->dstg); 3717 3718 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3719 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3720 dsl_dataset_t *ds = dst->dst_arg1; 3721 3722 if (dst->dst_err) { 3723 dsl_dataset_name(ds, ha->failed); 3724 *strchr(ha->failed, '@') = '\0'; 3725 } else if (error == 0 && minor != 0 && temphold) { 3726 /* 3727 * If this hold is to be released upon process exit, 3728 * register that action now. 
3729 */ 3730 dsl_register_onexit_hold_cleanup(ds, htag, minor); 3731 } 3732 dsl_dataset_rele(ds, ha->dstg); 3733 } 3734 3735 if (error == 0 && recursive && !ha->gotone) 3736 error = ENOENT; 3737 3738 if (error) 3739 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3740 3741 dsl_sync_task_group_destroy(ha->dstg); 3742 3743 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3744 spa_close(spa, FTAG); 3745 if (cleanup_fd != -1) 3746 zfs_onexit_fd_rele(cleanup_fd); 3747 return (error); 3748 } 3749 3750 struct dsl_ds_releasearg { 3751 dsl_dataset_t *ds; 3752 const char *htag; 3753 boolean_t own; /* do we own or just hold ds? */ 3754 }; 3755 3756 static int 3757 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, 3758 boolean_t *might_destroy) 3759 { 3760 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3761 uint64_t zapobj; 3762 uint64_t tmp; 3763 int error; 3764 3765 *might_destroy = B_FALSE; 3766 3767 mutex_enter(&ds->ds_lock); 3768 zapobj = ds->ds_phys->ds_userrefs_obj; 3769 if (zapobj == 0) { 3770 /* The tag can't possibly exist */ 3771 mutex_exit(&ds->ds_lock); 3772 return (ESRCH); 3773 } 3774 3775 /* Make sure the tag exists */ 3776 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); 3777 if (error) { 3778 mutex_exit(&ds->ds_lock); 3779 if (error == ENOENT) 3780 error = ESRCH; 3781 return (error); 3782 } 3783 3784 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && 3785 DS_IS_DEFER_DESTROY(ds)) 3786 *might_destroy = B_TRUE; 3787 3788 mutex_exit(&ds->ds_lock); 3789 return (0); 3790 } 3791 3792 static int 3793 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) 3794 { 3795 struct dsl_ds_releasearg *ra = arg1; 3796 dsl_dataset_t *ds = ra->ds; 3797 boolean_t might_destroy; 3798 int error; 3799 3800 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3801 return (ENOTSUP); 3802 3803 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); 3804 if (error) 3805 return (error); 3806 3807 if (might_destroy) { 3808 struct dsl_ds_destroyarg dsda = {0}; 3809 3810 if (dmu_tx_is_syncing(tx)) { 3811 /* 3812 * If we're not prepared to remove the snapshot, 3813 * we can't allow the release to happen right now. 
3814 */ 3815 if (!ra->own) 3816 return (EBUSY); 3817 } 3818 dsda.ds = ds; 3819 dsda.releasing = B_TRUE; 3820 return (dsl_dataset_destroy_check(&dsda, tag, tx)); 3821 } 3822 3823 return (0); 3824 } 3825 3826 static void 3827 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) 3828 { 3829 struct dsl_ds_releasearg *ra = arg1; 3830 dsl_dataset_t *ds = ra->ds; 3831 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3832 objset_t *mos = dp->dp_meta_objset; 3833 uint64_t zapobj; 3834 uint64_t dsobj = ds->ds_object; 3835 uint64_t refs; 3836 int error; 3837 3838 mutex_enter(&ds->ds_lock); 3839 ds->ds_userrefs--; 3840 refs = ds->ds_userrefs; 3841 mutex_exit(&ds->ds_lock); 3842 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); 3843 VERIFY(error == 0 || error == ENOENT); 3844 zapobj = ds->ds_phys->ds_userrefs_obj; 3845 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); 3846 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && 3847 DS_IS_DEFER_DESTROY(ds)) { 3848 struct dsl_ds_destroyarg dsda = {0}; 3849 3850 ASSERT(ra->own); 3851 dsda.ds = ds; 3852 dsda.releasing = B_TRUE; 3853 /* We already did the destroy_check */ 3854 dsl_dataset_destroy_sync(&dsda, tag, tx); 3855 } 3856 3857 spa_history_log_internal(LOG_DS_USER_RELEASE, 3858 dp->dp_spa, tx, "<%s> %lld dataset = %llu", 3859 ra->htag, (longlong_t)refs, dsobj); 3860 } 3861 3862 static int 3863 dsl_dataset_user_release_one(const char *dsname, void *arg) 3864 { 3865 struct dsl_ds_holdarg *ha = arg; 3866 struct dsl_ds_releasearg *ra; 3867 dsl_dataset_t *ds; 3868 int error; 3869 void *dtag = ha->dstg; 3870 char *name; 3871 boolean_t own = B_FALSE; 3872 boolean_t might_destroy; 3873 3874 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ 3875 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3876 error = dsl_dataset_hold(name, dtag, &ds); 3877 strfree(name); 3878 if (error == ENOENT && ha->recursive) 3879 return (0); 3880 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3881 if (error) 3882 return (error); 3883 3884 ha->gotone = B_TRUE; 3885 3886 ASSERT(dsl_dataset_is_snapshot(ds)); 3887 3888 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); 3889 if (error) { 3890 dsl_dataset_rele(ds, dtag); 3891 return (error); 3892 } 3893 3894 if (might_destroy) { 3895 #ifdef _KERNEL 3896 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3897 error = zfs_unmount_snap(name, NULL); 3898 strfree(name); 3899 if (error) { 3900 dsl_dataset_rele(ds, dtag); 3901 return (error); 3902 } 3903 #endif 3904 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { 3905 dsl_dataset_rele(ds, dtag); 3906 return (EBUSY); 3907 } else { 3908 own = B_TRUE; 3909 dsl_dataset_make_exclusive(ds, dtag); 3910 } 3911 } 3912 3913 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); 3914 ra->ds = ds; 3915 ra->htag = ha->htag; 3916 ra->own = own; 3917 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, 3918 dsl_dataset_user_release_sync, ra, dtag, 0); 3919 3920 return (0); 3921 } 3922 3923 int 3924 dsl_dataset_user_release(char *dsname, char *snapname, char *htag, 3925 boolean_t recursive) 3926 { 3927 struct dsl_ds_holdarg *ha; 3928 dsl_sync_task_t *dst; 3929 spa_t *spa; 3930 int error; 3931 3932 top: 3933 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3934 3935 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3936 3937 error = spa_open(dsname, &spa, FTAG); 3938 if (error) { 3939 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3940 return (error); 3941 } 3942 3943 ha->dstg = 
int
dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
    boolean_t recursive)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;

top:
	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_release_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		struct dsl_ds_releasearg *ra = dst->dst_arg1;
		dsl_dataset_t *ds = ra->ds;

		if (dst->dst_err)
			dsl_dataset_name(ds, ha->failed);

		if (ra->own)
			dsl_dataset_disown(ds, ha->dstg);
		else
			dsl_dataset_rele(ds, ha->dstg);

		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error && error != EBUSY)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);
	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);

	/*
	 * We can get EBUSY if we were racing with deferred destroy and
	 * dsl_dataset_user_release_check() hadn't done the necessary
	 * open context setup.  We can also get EBUSY if we're racing
	 * with destroy and that thread is the ds_owner.  Either way
	 * the busy condition should be transient, and we should retry
	 * the release operation.
	 */
	if (error == EBUSY)
		goto top;

	return (error);
}

/*
 * Called at spa_load time (with retry == B_FALSE) to release a stale
 * temporary user hold.  Also called by the onexit code (with retry == B_TRUE).
 */
int
dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
    boolean_t retry)
{
	dsl_dataset_t *ds;
	char *snap;
	char *name;
	int namelen;
	int error;

	do {
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
		rw_exit(&dp->dp_config_rwlock);
		if (error)
			return (error);
		namelen = dsl_dataset_namelen(ds) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(ds, name);
		dsl_dataset_rele(ds, FTAG);

		snap = strchr(name, '@');
		*snap = '\0';
		++snap;
		error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
		kmem_free(name, namelen);

		/*
		 * The object can't have been destroyed because we have a hold,
		 * but it might have been renamed, resulting in ENOENT.  Retry
		 * if we've been requested to do so.
		 *
		 * It would be nice if we could use the dsobj all the way
		 * through and avoid ENOENT entirely.  But we might need to
		 * unmount the snapshot, and there's currently no way to look
		 * up a vfsp using a ZFS object id.
		 */
	} while ((error == ENOENT) && retry);

	return (error);
}

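/*
 * Worked example (illustrative): if dsl_dataset_name() above wrote
 * "tank/home@nightly" into the buffer, the strchr()/'\0' surgery splits
 * it in place into name = "tank/home" and snap = "nightly", which are
 * then handed to dsl_dataset_user_release().
 */
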
int
dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
	if (ds->ds_phys->ds_userrefs_obj != 0) {
		zap_attribute_t *za;
		zap_cursor_t zc;

		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
		    ds->ds_phys->ds_userrefs_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
			    za->za_first_integer));
		}
		zap_cursor_fini(&zc);
		kmem_free(za, sizeof (zap_attribute_t));
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

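/*
 * Example (illustrative, not part of the original source): consuming
 * the nvlist built above.  Each pair maps a hold tag to the uint64
 * stored in the userrefs ZAP -- judging by the hold path, the time at
 * which the hold was created:
 *
 *	nvlist_t *holds;
 *	nvpair_t *pair = NULL;
 *	uint64_t when;
 *
 *	if (dsl_dataset_get_holds("tank/home@nightly", &holds) == 0) {
 *		while ((pair = nvlist_next_nvpair(holds, pair)) != NULL) {
 *			VERIFY(0 == nvpair_value_uint64(pair, &when));
 *			dprintf("tag %s held since %llu\n",
 *			    nvpair_name(pair), (u_longlong_t)when);
 *		}
 *		nvlist_free(holds);
 *	}
 */
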
/*
 * Note, this function is used as the callback for dmu_objset_find().  We
 * always return 0 so that we will continue to find and process
 * inconsistent datasets, even if we encounter an error trying to
 * process one of them.
 */
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
	dsl_dataset_t *ds;

	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
		if (DS_IS_INCONSISTENT(ds))
			(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
		else
			dsl_dataset_disown(ds, FTAG);
	}
	return (0);
}

/*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap.  New may be a snapshot or the head.  Old must be
 * a snapshot before new, in new's filesystem (or its origin).  If not then
 * fail and return EINVAL.
 *
 * The written space is calculated by considering two components:  First, we
 * ignore any freed space, and calculate the written as new's used space
 * minus old's used space.  Next, we add in the amount of space that was freed
 * between the two snapshots, thus reducing new's used space relative to old's.
 * Specifically, this is the space that was born before old->ds_creation_txg,
 * and freed before new (i.e. on new's deadlist or a previous deadlist).
 *
 * space freed                         [---------------------]
 * snapshots                       ---O-------O--------O-------O------
 *                                        oldsnap            new
 */
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = new->ds_dir->dd_pool;

	*usedp = 0;
	*usedp += new->ds_phys->ds_used_bytes;
	*usedp -= oldsnap->ds_phys->ds_used_bytes;

	*compp = 0;
	*compp += new->ds_phys->ds_compressed_bytes;
	*compp -= oldsnap->ds_phys->ds_compressed_bytes;

	*uncompp = 0;
	*uncompp += new->ds_phys->ds_uncompressed_bytes;
	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = new->ds_object;
	while (snapobj != oldsnap->ds_object) {
		dsl_dataset_t *snap;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
		if (err != 0)
			break;

		if (snap->ds_phys->ds_prev_snap_txg ==
		    oldsnap->ds_phys->ds_creation_txg) {
			/*
			 * The blocks in the deadlist cannot be born after
			 * ds_prev_snap_txg, so get the whole deadlist space,
			 * which is more efficient (especially for old-format
			 * deadlists).  Unfortunately the deadlist code
			 * doesn't have enough information to make this
			 * optimization itself.
			 */
			dsl_deadlist_space(&snap->ds_deadlist,
			    &used, &comp, &uncomp);
		} else {
			dsl_deadlist_space_range(&snap->ds_deadlist,
			    0, oldsnap->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
		}
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		/*
		 * If we get to the beginning of the chain of snapshots
		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
		 * was not a snapshot of/before new.
		 */
		snapobj = snap->ds_phys->ds_prev_snap_obj;
		dsl_dataset_rele(snap, FTAG);
		if (snapobj == 0) {
			err = EINVAL;
			break;
		}
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}

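/*
 * Worked example (illustrative): suppose oldsnap referenced 10G and
 * new references 12G, and that 3G of blocks born before oldsnap's
 * creation txg were freed between oldsnap and new (so they show up on
 * the deadlists walked above).  Then
 *
 *	*usedp = (12G - 10G) + 3G = 5G
 *
 * i.e. 5G was written after oldsnap: a net 2G of growth plus 3G that
 * merely replaced data predating oldsnap.
 */
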
/*
 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
 * lastsnap, and all snapshots in between are deleted.
 *
 * blocks that would be freed            [---------------------------]
 * snapshots                       ---O-------O--------O-------O--------O
 *                                        firstsnap        lastsnap
 *
 * This is the set of blocks that were born after the snap before firstsnap
 * (birth > firstsnap->prev_snap_txg) and died before the snap after the last
 * snap (i.e. is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
 * We calculate this by iterating over the relevant deadlists (from the snap
 * after lastsnap, backward to the snap after firstsnap), summing up the
 * space on the deadlist that was born after the snap before firstsnap.
 */
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

	ASSERT(dsl_dataset_is_snapshot(firstsnap));
	ASSERT(dsl_dataset_is_snapshot(lastsnap));

	/*
	 * Check that the snapshots are in the same dsl_dir, and firstsnap
	 * is before lastsnap.
	 */
	if (firstsnap->ds_dir != lastsnap->ds_dir ||
	    firstsnap->ds_phys->ds_creation_txg >
	    lastsnap->ds_phys->ds_creation_txg)
		return (EINVAL);

	*usedp = *compp = *uncompp = 0;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
	while (snapobj != firstsnap->ds_object) {
		dsl_dataset_t *ds;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
		if (err != 0)
			break;

		dsl_deadlist_space_range(&ds->ds_deadlist,
		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		snapobj = ds->ds_phys->ds_prev_snap_obj;
		ASSERT3U(snapobj, !=, 0);
		dsl_dataset_rele(ds, FTAG);
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
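
/*
 * Example (illustrative): for a snapshot chain A-B-C-D, calling
 * dsl_dataset_space_wouldfree(B, C, ...) starts at D (C's
 * ds_next_snap_obj), walks back to C, and sums the deadlist entries of
 * D and C that were born after A (B's ds_prev_snap_txg) -- roughly
 * what a dry-run destroy of the B..C snapshot range would report as
 * reclaimable.
 */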