/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)


/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer.  If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}
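
/*
 * Illustrative example (not part of the original source): with
 * ds_reserved == 100 and ds_unique_bytes == 90, a delta of +30 gives
 * old_bytes = MAX(90, 100) = 100 and new_bytes = MAX(120, 100) = 120,
 * so only +20 is propagated to the dsl_dir -- the first 10 bytes were
 * already charged to our ancestors as part of the refreservation.
 */
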
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "born, ds=%p\n", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dsl_dir.
		 */
		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    used, compressed, uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_used_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(bp->blk_birth <= tx->tx_txg);

	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(used > 0);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dataset.
		 */
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    -used, -compressed, -uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int64_t delta;

		dprintf_bp(bp, "freeing: %s", "");
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		mutex_enter(&ds->ds_dir->dd_lock);
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
		mutex_exit(&ds->ds_dir->dd_lock);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		if (async) {
			/*
			 * We are here as part of zio's write done callback,
			 * which means we're a zio interrupt thread.  We can't
			 * call bplist_enqueue() now because it may block
			 * waiting for I/O.  Instead, put bp on the deferred
			 * queue and let dsl_pool_sync() finish the job.
			 */
			bplist_enqueue_deferred(&ds->ds_deadlist, bp);
		} else {
			VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
	ds->ds_phys->ds_used_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}
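
/*
 * Illustrative note (not part of the original source): the blk_birth
 * comparison in dsl_dataset_block_kill() is what makes snapshots cheap.
 * If ds_prev_snap_txg == 100, killing a block born in txg 150 frees it
 * immediately (no snapshot can reference it), while killing a block
 * born in txg 50 only moves it to the deadlist, since the most recent
 * snapshot still references it.
 */
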
uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase.  So this should only be used as a guess.
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
{
	return (blk_birth > dsl_dataset_prev_snap_txg(ds));
}

/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_objset != NULL)
		dmu_objset_evict(ds->ds_objset);

	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_close(&ds->ds_deadlist);
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_recvlock);
	mutex_destroy(&ds->ds_opening_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);
	bplist_fini(&ds->ds_deadlist);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0,
	    ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);
	return (err);
}
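
/*
 * Illustrative note (not part of the original source): on a
 * case-insensitive dataset (DS_FLAG_CI_DATASET), the MT_FIRST match
 * above lets "FOO" find an existing snapshot named "foo".  The ENOTSUP
 * fallback handles snapnames ZAPs that carry no normalization
 * information, where only an exact-match operation is possible.
 */
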
static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);
	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		rw_init(&ds->ds_rwlock, 0, 0, 0);
		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
		bplist_init(&ds->ds_deadlist);

		err = bplist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);
		if (err == 0) {
			err = dsl_dir_open_obj(dp,
			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
		}
		if (err) {
			/*
			 * we don't really need to close the bplist if we
			 * just opened it.
			 */
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			bplist_fini(&ds->ds_deadlist);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				err = dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}

			if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
				dsl_dataset_t *origin;

				err = dsl_dataset_hold_obj(dp,
				    ds->ds_dir->dd_phys->dd_origin_obj,
				    FTAG, &origin);
				if (err == 0) {
					ds->ds_origin_txg =
					    origin->ds_phys->ds_creation_txg;
					dsl_dataset_rele(origin, FTAG);
				}
			}
		} else {
			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
				err = dsl_dataset_get_snapname(ds);
			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
				err = zap_count(
				    ds->ds_dir->dd_pool->dp_meta_objset,
				    ds->ds_phys->ds_userrefs_obj,
				    &ds->ds_userrefs);
			}
		}

		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
			/*
			 * In sync context, we're called with either no lock
			 * or with the write lock.  If we're not syncing,
			 * we're always called with the read lock held.
			 */
			boolean_t need_lock =
			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
			    dsl_pool_sync_context(dp);

			if (need_lock)
				rw_enter(&dp->dp_config_rwlock, RW_READER);

			err = dsl_prop_get_ds(ds,
			    "refreservation", sizeof (uint64_t), 1,
			    &ds->ds_reserved, NULL);
			if (err == 0) {
				err = dsl_prop_get_ds(ds,
				    "refquota", sizeof (uint64_t), 1,
				    &ds->ds_quota, NULL);
			}

			if (need_lock)
				rw_exit(&dp->dp_config_rwlock);
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err == 0) {
			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
			    dsl_dataset_evict);
		}
		if (err || winner) {
			bplist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_drop_ref(ds->ds_prev, ds);
			dsl_dir_close(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			bplist_fini(&ds->ds_deadlist);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	mutex_enter(&ds->ds_lock);
	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
		mutex_exit(&ds->ds_lock);
		dmu_buf_rele(ds->ds_dbuf, tag);
		return (ENOENT);
	}
	mutex_exit(&ds->ds_lock);
	*dsp = ds;
	return (0);
}
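
/*
 * Illustrative note (not part of the original source): the winner/loser
 * dance above handles two threads racing to instantiate the in-core
 * dataset for the same dbuf.  dmu_buf_set_user_ie() returns the
 * already-installed user (the "winner") if another thread got there
 * first, in which case we tear down the copy we just built and use the
 * winner's instead.
 */
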
static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/*
	 * In syncing context we don't want the rwlock lock: there
	 * may be an existing writer waiting for sync phase to
	 * finish.  We don't need to worry about such writers, since
	 * sync phase is single-threaded, so the writer can't be
	 * doing anything while we are active.
	 */
	if (dsl_pool_sync_context(dp)) {
		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
		return (0);
	}

	/*
	 * Normal users will hold the ds_rwlock as a READER until they
	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
	 * drop their READER lock after they set the ds_owner field.
	 *
	 * If the dataset is being destroyed, the destroy thread will
	 * obtain a WRITER lock for exclusive access after it's done its
	 * open-context work and then change the ds_owner to
	 * dsl_reaper once destruction is assured.  So threads
	 * may block here temporarily, until the "destructability" of
	 * the dataset is determined.
	 */
	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
	mutex_enter(&ds->ds_lock);
	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
		rw_exit(&dp->dp_config_rwlock);
		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
		if (DSL_DATASET_IS_DESTROYED(ds)) {
			mutex_exit(&ds->ds_lock);
			dsl_dataset_drop_ref(ds, tag);
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			return (ENOENT);
		}
		/*
		 * The dp_config_rwlock lives above the ds_lock.  And
		 * we need to check DSL_DATASET_IS_DESTROYED() while
		 * holding the ds_lock, so we have to drop and reacquire
		 * the ds_lock here.
		 */
		mutex_exit(&ds->ds_lock);
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		mutex_enter(&ds->ds_lock);
	}
	mutex_exit(&ds->ds_lock);
	return (0);
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);

	if (err)
		return (err);
	return (dsl_dataset_hold_ref(*dsp, tag));
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		*dsp = NULL;
		return (EBUSY);
	}
	return (0);
}

int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj)
		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
	else
		err = ENOENT;
	if (err)
		goto out;

	err = dsl_dataset_hold_ref(*dsp, tag);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds = NULL;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			err = ENOENT;
			goto out;
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
		dsl_dataset_rele(*dsp, tag);

		ASSERT3U((err == 0), ==, (ds != NULL));

		if (ds) {
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			err = dsl_dataset_hold_ref(ds, tag);
			*dsp = err ? NULL : ds;
		}
	}
out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);
	return (err);
}
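
/*
 * Usage sketch (illustrative, not part of the original source): a
 * typical open-context caller pairs dsl_dataset_hold() with
 * dsl_dataset_rele():
 *
 *	dsl_dataset_t *ds;
 *	int err = dsl_dataset_hold("tank/fs@snap", FTAG, &ds);
 *	if (err == 0) {
 *		... inspect ds ...
 *		dsl_dataset_rele(ds, FTAG);
 *	}
 *
 * The name "tank/fs@snap" is hypothetical; both filesystem and
 * snapshot names are accepted.
 */
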
int
dsl_dataset_own(const char *name, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold(name, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		return (EBUSY);
	}
	return (0);
}

void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
	if (ds == NULL) {
		(void) strcpy(name, "mos");
	} else {
		dsl_dir_name(ds->ds_dir, name);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			(void) strcat(name, "@");
			/*
			 * We use a "recursive" mutex so that we
			 * can call dprintf_ds() with ds_lock held.
			 */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				(void) strcat(name, ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				(void) strcat(name, ds->ds_snapname);
			}
		}
	}
}

static int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
	int result;

	if (ds == NULL) {
		result = 3;	/* "mos" */
	} else {
		result = dsl_dir_namelen(ds->ds_dir);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			++result;	/* adding one for the @-sign */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				result += strlen(ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				result += strlen(ds->ds_snapname);
			}
		}
	}

	return (result);
}

void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
		rw_exit(&ds->ds_rwlock);
	}
	dsl_dataset_drop_ref(ds, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
		rw_exit(&ds->ds_rwlock);
		cv_broadcast(&ds->ds_exclusive_cv);
	}
	mutex_exit(&ds->ds_lock);
	if (ds->ds_dbuf)
		dsl_dataset_drop_ref(ds, tag);
	else
		dsl_dataset_evict(ds->ds_dbuf, ds);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL &&
	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
		ds->ds_owner = tag;
		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
			rw_exit(&ds->ds_rwlock);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}

void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
	ASSERT3P(owner, ==, ds->ds_owner);
	if (!RW_WRITE_HELD(&ds->ds_rwlock))
		rw_enter(&ds->ds_rwlock, RW_WRITER);
}
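
/*
 * Usage sketch (illustrative, not part of the original source): callers
 * that need exclusive use of a dataset (e.g. destroy) take ownership
 * rather than a plain hold, and later give it back:
 *
 *	dsl_dataset_t *ds;
 *	int err = dsl_dataset_own("tank/fs", B_FALSE, FTAG, &ds);
 *	if (err == 0) {
 *		dsl_dataset_make_exclusive(ds, FTAG);
 *		... exclusive work ...
 *		dsl_dataset_disown(ds, FTAG);
 *	}
 *
 * Passing B_FALSE for inconsistentok refuses datasets that are marked
 * DS_FLAG_INCONSISTENT; "tank/fs" is a hypothetical name.
 */
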
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
	dsphys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);

	if (origin) {
		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_used_bytes =
		    origin->ds_phys->ds_used_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;
		dsphys->ds_flags |= origin->ds_phys->ds_flags;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			if (origin->ds_phys->ds_next_clones_obj == 0) {
				origin->ds_phys->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY(0 == zap_add_int(mos,
			    origin->ds_phys->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

	dsl_deleg_set_create_perms(dd, tx, cr);

	dsl_dir_close(dd, FTAG);

	return (dsobj);
}

struct destroyarg {
	dsl_sync_task_group_t *dstg;
	char *snapname;
	char *failed;
	boolean_t defer;
};

static int
dsl_snapshot_destroy_one(const char *name, void *arg)
{
	struct destroyarg *da = arg;
	dsl_dataset_t *ds;
	int err;
	char *dsname;

	dsname = kmem_asprintf("%s@%s", name, da->snapname);
	err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds);
	strfree(dsname);
	if (err == 0) {
		struct dsl_ds_destroyarg *dsda;

		dsl_dataset_make_exclusive(ds, da->dstg);
		if (ds->ds_objset != NULL) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}
		dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
		dsda->ds = ds;
		dsda->defer = da->defer;
		dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, dsda, da->dstg, 0);
	} else if (err == ENOENT) {
		err = 0;
	} else {
		(void) strcpy(da->failed, name);
	}
	return (err);
}

/*
 * Destroy 'snapname' in all descendants of 'fsname'.
 */
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer)
{
	int err;
	struct destroyarg da;
	dsl_sync_task_t *dst;
	spa_t *spa;

	err = spa_open(fsname, &spa, FTAG);
	if (err)
		return (err);
	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	da.snapname = snapname;
	da.failed = fsname;
	da.defer = defer;

	err = dmu_objset_find(fsname,
	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);

	if (err == 0)
		err = dsl_sync_task_group_wait(da.dstg);

	for (dst = list_head(&da.dstg->dstg_tasks); dst;
	    dst = list_next(&da.dstg->dstg_tasks, dst)) {
		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
		dsl_dataset_t *ds = dsda->ds;

		/*
		 * Return the file system name that triggered the error
		 */
		if (dst->dst_err) {
			dsl_dataset_name(ds, fsname);
			*strchr(fsname, '@') = '\0';
		}
		ASSERT3P(dsda->rm_origin, ==, NULL);
		dsl_dataset_disown(ds, da.dstg);
		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
	}

	dsl_sync_task_group_destroy(da.dstg);
	spa_close(spa, FTAG);
	return (err);
}
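
/*
 * Usage sketch (illustrative, not part of the original source): to
 * destroy the snapshot "daily" on "tank/fs" and on every descendant
 * filesystem that has one, without deferring:
 *
 *	err = dsl_snapshots_destroy("tank/fs", "daily", B_FALSE);
 *
 * On failure, fsname is overwritten with the name of the filesystem
 * whose snapshot could not be destroyed; the names are hypothetical.
 */
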
static boolean_t
dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
{
	boolean_t might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
	    DS_IS_DEFER_DESTROY(ds))
		might_destroy = B_TRUE;
	mutex_exit(&ds->ds_lock);

	return (might_destroy);
}

/*
 * If we're removing a clone, and these three conditions are true:
 *	1) the clone's origin has no other children
 *	2) the clone's origin has no user references
 *	3) the clone's origin has been marked for deferred destruction
 * Then, prepare to remove the origin as part of this sync task group.
 */
static int
dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *origin = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(origin)) {
		char *name;
		int namelen;
		int error;

		namelen = dsl_dataset_namelen(origin) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(origin, name);
#ifdef _KERNEL
		error = zfs_unmount_snap(name, NULL);
		if (error) {
			kmem_free(name, namelen);
			return (error);
		}
#endif
		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
		kmem_free(name, namelen);
		if (error)
			return (error);
		dsda->rm_origin = origin;
		dsl_dataset_make_exclusive(origin, tag);

		if (origin->ds_objset != NULL) {
			dmu_objset_evict(origin->ds_objset);
			origin->ds_objset = NULL;
		}
	}

	return (0);
}

/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;
	struct dsl_ds_destroyarg dsda = { 0 };
	dsl_dataset_t dummy_ds = { 0 };

	dsda.ds = ds;

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		dsl_dataset_make_exclusive(ds, tag);

		if (ds->ds_objset != NULL) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}
		dsda.defer = defer;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    &dsda, tag, 0);
		ASSERT3P(dsda.rm_origin, ==, NULL);
		goto out;
	} else if (defer) {
		err = EINVAL;
		goto out;
	}

	dd = ds->ds_dir;
	dummy_ds.ds_dir = dd;
	dummy_ds.ds_object = ds->ds_object;

	/*
	 * Check for errors and mark this ds as inconsistent, in
	 * case we crash while freeing the objects.
	 */
	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
	if (err)
		goto out;

	err = dmu_objset_from_ds(ds, &os);
	if (err)
		goto out;

	/*
	 * remove the objects in open context, so that we won't
	 * have too much to do in syncing context.
	 */
	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
	    ds->ds_phys->ds_prev_snap_txg)) {
		/*
		 * Ignore errors, if there is not enough disk space
		 * we will deal with it in dsl_dataset_destroy_sync().
		 */
		(void) dmu_free_object(os, obj);
	}

	/*
	 * We need to sync out all in-flight IO before we try to evict
	 * (the dataset evict func is trying to clear the cached entries
	 * for this dataset in the ARC).
	 */
	txg_wait_synced(dd->dd_pool, 0);

	/*
	 * If we managed to free all the objects in open
	 * context, the user space accounting should be zero.
	 */
	if (ds->ds_phys->ds_bp.blk_fill == 0 &&
	    dmu_objset_userused_enabled(os)) {
		uint64_t count;

		ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
		    count == 0);
		ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
		    count == 0);
	}

	if (err != ESRCH)
		goto out;

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	if (ds->ds_objset) {
		/*
		 * We need to sync out all in-flight IO before we try
		 * to evict (the dataset evict func is trying to clear
		 * the cached entries for this dataset in the ARC).
		 */
		txg_wait_synced(dd->dd_pool, 0);
	}

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dsl_dataset_make_exclusive(ds, tag);
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	/*
	 * If we're removing a clone, we might also need to remove its
	 * origin.
	 */
	do {
		dsda.need_prep = B_FALSE;
		if (dsl_dir_is_clone(dd)) {
			err = dsl_dataset_origin_rm_prep(&dsda, tag);
			if (err) {
				dsl_dir_close(dd, FTAG);
				goto out;
			}
		}

		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, &dsda, tag, 0);
		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
		    dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
		err = dsl_sync_task_group_wait(dstg);
		dsl_sync_task_group_destroy(dstg);

		/*
		 * We could be racing against 'zfs release' or 'zfs destroy -d'
		 * on the origin snap, in which case we can get EBUSY if we
		 * needed to destroy the origin snap but were not ready to
		 * do so.
		 */
		if (dsda.need_prep) {
			ASSERT(err == EBUSY);
			ASSERT(dsl_dir_is_clone(dd));
			ASSERT(dsda.rm_origin == NULL);
		}
	} while (dsda.need_prep);

	if (dsda.rm_origin != NULL)
		dsl_dataset_disown(dsda.rm_origin, tag);

	/* if it is successful, dsl_dir_destroy_sync will close the dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	dsl_dataset_disown(ds, tag);
	return (err);
}

blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&ds->ds_phys->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	/* If it's the meta-objset, set dp_meta_rootbp */
	if (ds == NULL) {
		tx->tx_pool->dp_meta_rootbp = *bp;
	} else {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_bp = *bp;
	}
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_objset != NULL);

	if (ds->ds_phys->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot, that is still being used
 * in this file system, from the space currently in use.  To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);

	if (ds->ds_phys->ds_prev_snap_obj != 0)
		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
	else
		mrs_used = 0;

	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
	    &dluncomp));

	ASSERT3U(dlused, <=, mrs_used);
	ds->ds_phys->ds_unique_bytes =
	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);

	if (!DS_UNIQUE_IS_ACCURATE(ds) &&
	    spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}

static uint64_t
dsl_dataset_unique(dsl_dataset_t *ds)
{
	if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
		dsl_dataset_recalc_head_uniq(ds);

	return (ds->ds_phys->ds_unique_bytes);
}
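
/*
 * Illustrative example (not part of the original source): if the head
 * uses 10G, its most recent snapshot uses 8G, and 3G of that snapshot's
 * blocks have since been freed onto the deadlist, then the snapshot
 * still shares 8G - 3G = 5G with the head, so the head's unique space
 * is 10G - 5G = 5G.
 */
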
struct killarg {
	dsl_dataset_t *ds;
	dmu_tx_t *tx;
};

/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct killarg *ka = arg;
	dmu_tx_t *tx = ka->tx;

	if (bp == NULL)
		return (0);

	if (zb->zb_level == ZB_ZIL_LEVEL) {
		ASSERT(zilog != NULL);
		/*
		 * It's a block in the intent log.  It has no
		 * accounting, so just free it.
		 */
		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
	} else {
		ASSERT(zilog == NULL);
		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
	}

	return (0);
}

/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * This is really a dsl_dir thing, but check it here so that
	 * we'll be less likely to leave this dataset inconsistent &
	 * nearly destroyed.
	 */
	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
	    cr, "dataset = %llu", ds->ds_object);
}

static int
dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
    dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *ds_prev = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(ds_prev)) {
		struct dsl_ds_destroyarg ndsda = {0};

		/*
		 * If we're not prepared to remove the origin, don't remove
		 * the clone either.
		 */
		if (dsda->rm_origin == NULL) {
			dsda->need_prep = B_TRUE;
			return (EBUSY);
		}

		ndsda.ds = ds_prev;
		ndsda.is_origin_rm = B_TRUE;
		return (dsl_dataset_destroy_check(&ndsda, tag, tx));
	}

	/*
	 * If we're not going to remove the origin after all,
	 * undo the open context setup.
	 */
	if (dsda->rm_origin != NULL) {
		dsl_dataset_disown(dsda->rm_origin, tag);
		dsda->rm_origin = NULL;
	}

	return (0);
}

/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;

	/* we have an owner hold, so no one else can destroy us */
	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

	/*
	 * Only allow deferred destroy on pools that support it.
	 * NOTE: deferred destroy is only supported on snapshots.
	 */
	if (dsda->defer) {
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
		    SPA_VERSION_USERREFS)
			return (ENOTSUP);
		ASSERT(dsl_dataset_is_snapshot(ds));
		return (0);
	}

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	if (dsl_dataset_is_snapshot(ds)) {
		/*
		 * If this snapshot has an elevated user reference count,
		 * we can't destroy it yet.
		 */
		if (ds->ds_userrefs > 0 && !dsda->releasing)
			return (EBUSY);

		mutex_enter(&ds->ds_lock);
		/*
		 * Can't delete a branch point.  However, if we're destroying
		 * a clone and removing its origin due to it having a user
		 * hold count of 0 and having been marked for deferred destroy,
		 * it's OK for the origin to have a single clone.
		 */
		if (ds->ds_phys->ds_num_children >
		    (dsda->is_origin_rm ? 2 : 1)) {
			mutex_exit(&ds->ds_lock);
			return (EEXIST);
		}
		mutex_exit(&ds->ds_lock);
	} else if (dsl_dir_is_clone(ds->ds_dir)) {
		return (dsl_dataset_origin_check(dsda, arg2, tx));
	}

	/* XXX we should do some i/o error checking... */
	return (0);
}

struct refsarg {
	kmutex_t lock;
	boolean_t gone;
	kcondvar_t cv;
};

/* ARGSUSED */
static void
dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
{
	struct refsarg *arg = argv;

	mutex_enter(&arg->lock);
	arg->gone = TRUE;
	cv_signal(&arg->cv);
	mutex_exit(&arg->lock);
}

static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
	struct refsarg arg;

	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
	arg.gone = FALSE;
	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
	    dsl_dataset_refs_gone);
	dmu_buf_rele(ds->ds_dbuf, tag);
	mutex_enter(&arg.lock);
	while (!arg.gone)
		cv_wait(&arg.cv, &arg.lock);
	ASSERT(arg.gone);
	mutex_exit(&arg.lock);
	ds->ds_dbuf = NULL;
	ds->ds_phys = NULL;
	mutex_destroy(&arg.lock);
	cv_destroy(&arg.cv);
}

static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	ASSERT(ds->ds_phys->ds_num_children >= 2);
	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT.  However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT) {
		VERIFY3U(err, ==, 0);
	}
	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}

void
dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	uint64_t obj;

	ASSERT(ds->ds_owner);
	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
	ASSERT(ds->ds_prev == NULL ||
	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);

	if (dsda->defer) {
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) {
			dmu_buf_will_dirty(ds->ds_dbuf, tx);
			ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
			return;
		}
	}

	/* signal any waiters that this dataset is going away */
	mutex_enter(&ds->ds_lock);
	ds->ds_owner = dsl_reaper;
	cv_broadcast(&ds->ds_exclusive_cv);
	mutex_exit(&ds->ds_lock);

	/* Remove our reservation */
	if (ds->ds_reserved != 0) {
		dsl_prop_setarg_t psa;
		uint64_t value = 0;

		dsl_prop_setarg_init_uint64(&psa, "refreservation",
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    &value);
		psa.psa_effective_value = 0;	/* predict default value */

		dsl_dataset_set_reservation_sync(ds, &psa, cr, tx);
		ASSERT3U(ds->ds_reserved, ==, 0);
	}

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	dsl_pool_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		if (ds->ds_prev) {
			ds_prev = ds->ds_prev;
		} else {
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
		}
		after_branch_point =
		    (ds_prev->ds_phys->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
			remove_from_next_clones(ds_prev, obj, tx);
			if (ds->ds_phys->ds_next_snap_obj != 0) {
				VERIFY(0 == zap_add_int(mos,
				    ds_prev->ds_phys->ds_next_clones_obj,
				    ds->ds_phys->ds_next_snap_obj, tx));
			}
		}
		if (after_branch_point &&
		    ds->ds_phys->ds_next_snap_obj == 0) {
			/* This clone is toast. */
			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
			ds_prev->ds_phys->ds_num_children--;

			/*
			 * If the clone's origin has no other clones, no
			 * user holds, and has been marked for deferred
			 * deletion, then we should have done the necessary
			 * destroy setup for it.
			 */
			if (ds_prev->ds_phys->ds_num_children == 1 &&
			    ds_prev->ds_userrefs == 0 &&
			    DS_IS_DEFER_DESTROY(ds_prev)) {
				ASSERT3P(dsda->rm_origin, !=, NULL);
			} else {
				ASSERT3P(dsda->rm_origin, ==, NULL);
			}
		} else if (!after_branch_point) {
			ds_prev->ds_phys->ds_next_snap_obj =
			    ds->ds_phys->ds_next_snap_obj;
		}
	}

	if (ds->ds_phys->ds_next_snap_obj != 0) {
		blkptr_t bp;
		dsl_dataset_t *ds_next;
		uint64_t itor = 0;
		uint64_t old_unique;
		int64_t used = 0, compressed = 0, uncompressed = 0;

		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

		old_unique = dsl_dataset_unique(ds_next);

		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
		ds_next->ds_phys->ds_prev_snap_obj =
		    ds->ds_phys->ds_prev_snap_obj;
		ds_next->ds_phys->ds_prev_snap_txg =
		    ds->ds_phys->ds_prev_snap_txg;
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

		/*
		 * Transfer to our deadlist (which will become next's
		 * new deadlist) any entries from next's current
		 * deadlist which were born before prev, and free the
		 * other entries.
		 *
		 * XXX we're doing this long task with the config lock held
		 */
		while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
				    &bp, tx));
				if (ds_prev && !after_branch_point &&
				    bp.blk_birth >
				    ds_prev->ds_phys->ds_prev_snap_txg) {
					ds_prev->ds_phys->ds_unique_bytes +=
					    bp_get_dsize_sync(dp->dp_spa, &bp);
				}
			} else {
				used += bp_get_dsize_sync(dp->dp_spa, &bp);
				compressed += BP_GET_PSIZE(&bp);
				uncompressed += BP_GET_UCSIZE(&bp);
				dsl_free(dp, tx->tx_txg, &bp);
			}
		}

		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);

		/* change snapused */
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
		    -used, -compressed, -uncompressed, tx);

		/* free next's deadlist */
		bplist_close(&ds_next->ds_deadlist);
		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);

		/* set next's deadlist to our deadlist */
		bplist_close(&ds->ds_deadlist);
		ds_next->ds_phys->ds_deadlist_obj =
		    ds->ds_phys->ds_deadlist_obj;
		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
		    ds_next->ds_phys->ds_deadlist_obj));
		ds->ds_phys->ds_deadlist_obj = 0;

		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
			/*
			 * Update next's unique to include blocks which
			 * were previously shared by only this snapshot
			 * and it.  Those blocks will be born after the
			 * prev snap and before this snap, and will have
			 * died after the next snap and before the one
			 * after that (i.e. be on the snap after next's
			 * deadlist).
			 *
			 * XXX we're doing this long task with the
			 * config lock held
			 */
			dsl_dataset_t *ds_after_next;
			uint64_t space;

			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds_next->ds_phys->ds_next_snap_obj,
			    FTAG, &ds_after_next));

			VERIFY(0 ==
			    bplist_space_birthrange(&ds_after_next->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg,
			    ds->ds_phys->ds_creation_txg, &space));
			ds_next->ds_phys->ds_unique_bytes += space;

			dsl_dataset_rele(ds_after_next, FTAG);
			ASSERT3P(ds_next->ds_prev, ==, NULL);
		} else {
			ASSERT3P(ds_next->ds_prev, ==, ds);
			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
			ds_next->ds_prev = NULL;
			if (ds_prev) {
				VERIFY(0 == dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds_next, &ds_next->ds_prev));
			}

			dsl_dataset_recalc_head_uniq(ds_next);

			/*
			 * Reduce the amount of our unconsumed refreservation
			 * being charged to our parent by the amount of
			 * new unique data we have gained.
			 */
			if (old_unique < ds_next->ds_reserved) {
				int64_t mrsdelta;
				uint64_t new_unique =
				    ds_next->ds_phys->ds_unique_bytes;

				ASSERT(old_unique <= new_unique);
				mrsdelta = MIN(new_unique - old_unique,
				    ds_next->ds_reserved - old_unique);
				dsl_dir_diduse_space(ds->ds_dir,
				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
			}
		}
		dsl_dataset_rele(ds_next, FTAG);
	} else {
		/*
		 * There's no next snapshot, so this is a head dataset.
		 * Destroy the deadlist.  Unless it's a clone, the
		 * deadlist should be empty.  (If it's a clone, it's
		 * safe to ignore the deadlist contents.)
		 */
		struct killarg ka;

		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
		bplist_close(&ds->ds_deadlist);
		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
		ds->ds_phys->ds_deadlist_obj = 0;

		/*
		 * Free everything that we point to (that's born after
		 * the previous snapshot, if we are a clone)
		 *
		 * NB: this should be very quick, because we already
		 * freed all the objects in open context.
		 */
		ka.ds = ds;
		ka.tx = tx;
		err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    TRAVERSE_POST, kill_blkptr, &ka);
		ASSERT3U(err, ==, 0);
		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
		    ds->ds_phys->ds_unique_bytes == 0);

		if (ds->ds_prev != NULL) {
			dsl_dataset_rele(ds->ds_prev, ds);
			ds->ds_prev = ds_prev = NULL;
		}
	}

	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
		/* Erase the link in the dir */
		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
		ASSERT(err == 0);
	} else {
		/* remove from snapshot namespace */
		dsl_dataset_t *ds_head;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
		VERIFY(0 == dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
		{
			uint64_t val;

			err = dsl_dataset_snap_lookup(ds_head,
			    ds->ds_snapname, &val);
			ASSERT3U(err, ==, 0);
			ASSERT3U(val, ==, obj);
		}
#endif
		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
		ASSERT(err == 0);
		dsl_dataset_rele(ds_head, FTAG);
	}

	if (ds_prev && ds->ds_prev != ds_prev)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
	spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
	    cr, "dataset = %llu", ds->ds_object);

	if (ds->ds_phys->ds_next_clones_obj != 0) {
		uint64_t count;
		ASSERT(0 == zap_count(mos,
		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
		VERIFY(0 == dmu_object_free(mos,
		    ds->ds_phys->ds_next_clones_obj, tx));
	}
	if (ds->ds_phys->ds_props_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
	if (ds->ds_phys->ds_userrefs_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
	dsl_dir_close(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dsl_dataset_drain_refs(ds, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));

	if (dsda->rm_origin) {
		/*
		 * Remove the origin of the clone we just destroyed.
		 */
		struct dsl_ds_destroyarg ndsda = {0};

		ndsda.ds = dsda->rm_origin;
		dsl_dataset_destroy_sync(&ndsda, tag, cr, tx);
	}
}

static int
dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t asize;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * If there's an fs-only reservation, any blocks that might become
	 * owned by the snapshot dataset must be accommodated by space
	 * outside of the reservation.
	 */
	asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
		return (ENOSPC);

	/*
	 * Propagate any reserved space for this snapshot to other
	 * snapshot checks in this sync group.
	 */
	if (asize > 0)
		dsl_dir_willuse_space(ds->ds_dir, asize, tx);

	return (0);
}
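
/*
 * Illustrative example (not part of the original source): with a 10G
 * refreservation and 2G of unique data, taking a snapshot transfers
 * ownership of up to MIN(2G, 10G) = 2G of blocks to the snapshot, and
 * that 2G must fit outside the reservation or the snapshot fails with
 * ENOSPC.
 */
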
/* ARGSUSED */
int
dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	const char *snapname = arg2;
	int err;
	uint64_t value;

	/*
	 * We don't allow multiple snapshots of the same txg.  If there
	 * is already one, try again.
	 */
	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
		return (EAGAIN);

	/*
	 * Check for a conflicting snapshot name.
	 */
	err = dsl_dataset_snap_lookup(ds, snapname, &value);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	/*
	 * Check that the dataset's name is not too long.  Name consists
	 * of the dataset's length + 1 for the @-sign + snapshot name's length
	 */
	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
		return (ENAMETOOLONG);

	err = dsl_dataset_snapshot_reserve_space(ds, tx);
	if (err)
		return (err);

	ds->ds_trysnap_txg = tx->tx_txg;
	return (0);
}

void
dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	const char *snapname = arg2;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj, crtxg;
	objset_t *mos = dp->dp_meta_objset;
	int err;

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	/*
	 * The origin's ds_creation_txg has to be < TXG_INITIAL
	 */
	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
		crtxg = 1;
	else
		crtxg = tx->tx_txg;

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
	dsphys->ds_next_snap_obj = ds->ds_object;
	dsphys->ds_num_children = 1;
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = crtxg;
	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
	dsphys->ds_flags = ds->ds_phys->ds_flags;
	dsphys->ds_bp = ds->ds_phys->ds_bp;
	dmu_buf_rele(dbuf, FTAG);

	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
	if (ds->ds_prev) {
		uint64_t next_clones_obj =
		    ds->ds_prev->ds_phys->ds_next_clones_obj;
		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object ||
		    ds->ds_prev->ds_phys->ds_num_children > 1);
		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
			    ds->ds_prev->ds_phys->ds_creation_txg);
			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
		} else if (next_clones_obj != 0) {
			remove_from_next_clones(ds->ds_prev,
			    dsphys->ds_next_snap_obj, tx);
			VERIFY3U(0, ==, zap_add_int(mos,
			    next_clones_obj, dsobj, tx));
		}
	}

	/*
	 * If we have a reference-reservation on this dataset, we will
	 * need to increase the amount of refreservation being charged
	 * since our unique space is going to zero.
	 */
	if (ds->ds_reserved) {
		int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
		    add, 0, 0, tx);
	}

	bplist_close(&ds->ds_deadlist);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
	ds->ds_phys->ds_prev_snap_obj = dsobj;
	ds->ds_phys->ds_prev_snap_txg = crtxg;
	ds->ds_phys->ds_unique_bytes = 0;
	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
	    ds->ds_phys->ds_deadlist_obj));

	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
	    snapname, 8, 1, &dsobj, tx);
	ASSERT(err == 0);

	if (ds->ds_prev)
		dsl_dataset_drop_ref(ds->ds_prev, ds);
	VERIFY(0 == dsl_dataset_get_ref(dp,
	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));

	dsl_pool_ds_snapshotted(ds, tx);

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
	    "dataset = %llu", dsobj);
}

void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(ds->ds_objset != NULL);
	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);

	/*
	 * in case we had to change ds_fsid_guid when we opened it,
	 * sync it out now.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;

	dsl_dir_dirty(ds->ds_dir, tx);
	dmu_objset_sync(ds->ds_objset, zio, tx);
}

void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
	uint64_t refd, avail, uobjs, aobjs;

	dsl_dir_stats(ds->ds_dir, nv);

	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
	    ds->ds_phys->ds_creation_time);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
	    ds->ds_phys->ds_creation_txg);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
	    ds->ds_quota);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
	    ds->ds_reserved);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
	    ds->ds_phys->ds_guid);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
	    dsl_dataset_unique(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
	    ds->ds_object);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
	    ds->ds_userrefs);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);

	if (ds->ds_phys->ds_next_snap_obj) {
		/*
		 * This is a snapshot; override the dd's space used with
		 * our unique space and compression ratio.
		 */
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
		    ds->ds_phys->ds_unique_bytes);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
		    ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
		    (ds->ds_phys->ds_uncompressed_bytes * 100 /
		    ds->ds_phys->ds_compressed_bytes));
	}
}

void
dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
{
	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
	stat->dds_guid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_next_snap_obj) {
		stat->dds_is_snapshot = B_TRUE;
		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
	} else {
		stat->dds_is_snapshot = B_FALSE;
		stat->dds_num_clones = 0;
	}

	/* clone origin is really a dsl_dir thing... */
	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
	if (dsl_dir_is_clone(ds->ds_dir)) {
		dsl_dataset_t *ods;

		VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
		dsl_dataset_name(ods, stat->dds_origin);
		dsl_dataset_drop_ref(ods, FTAG);
	} else {
		stat->dds_origin[0] = '\0';
	}
	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
}

uint64_t
dsl_dataset_fsid_guid(dsl_dataset_t *ds)
{
	return (ds->ds_fsid_guid);
}

void
dsl_dataset_space(dsl_dataset_t *ds,
    uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	*refdbytesp = ds->ds_phys->ds_used_bytes;
	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
	if (ds->ds_quota != 0) {
		/*
		 * Adjust available bytes according to refquota
		 */
		if (*refdbytesp < ds->ds_quota)
			*availbytesp = MIN(*availbytesp,
			    ds->ds_quota - *refdbytesp);
		else
			*availbytesp = 0;
	}
	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}
100 : 2052 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2053 ds->ds_phys->ds_compressed_bytes)); 2054 } 2055 } 2056 2057 void 2058 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2059 { 2060 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2061 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2062 stat->dds_guid = ds->ds_phys->ds_guid; 2063 if (ds->ds_phys->ds_next_snap_obj) { 2064 stat->dds_is_snapshot = B_TRUE; 2065 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2066 } else { 2067 stat->dds_is_snapshot = B_FALSE; 2068 stat->dds_num_clones = 0; 2069 } 2070 2071 /* clone origin is really a dsl_dir thing... */ 2072 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2073 if (dsl_dir_is_clone(ds->ds_dir)) { 2074 dsl_dataset_t *ods; 2075 2076 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2077 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2078 dsl_dataset_name(ods, stat->dds_origin); 2079 dsl_dataset_drop_ref(ods, FTAG); 2080 } else { 2081 stat->dds_origin[0] = '\0'; 2082 } 2083 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2084 } 2085 2086 uint64_t 2087 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2088 { 2089 return (ds->ds_fsid_guid); 2090 } 2091 2092 void 2093 dsl_dataset_space(dsl_dataset_t *ds, 2094 uint64_t *refdbytesp, uint64_t *availbytesp, 2095 uint64_t *usedobjsp, uint64_t *availobjsp) 2096 { 2097 *refdbytesp = ds->ds_phys->ds_used_bytes; 2098 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2099 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2100 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2101 if (ds->ds_quota != 0) { 2102 /* 2103 * Adjust available bytes according to refquota 2104 */ 2105 if (*refdbytesp < ds->ds_quota) 2106 *availbytesp = MIN(*availbytesp, 2107 ds->ds_quota - *refdbytesp); 2108 else 2109 *availbytesp = 0; 2110 } 2111 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2112 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2113 } 2114 2115 boolean_t 2116 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2117 { 2118 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2119 2120 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2121 dsl_pool_sync_context(dp)); 2122 if (ds->ds_prev == NULL) 2123 return (B_FALSE); 2124 if (ds->ds_phys->ds_bp.blk_birth > 2125 ds->ds_prev->ds_phys->ds_creation_txg) 2126 return (B_TRUE); 2127 return (B_FALSE); 2128 } 2129 2130 /* ARGSUSED */ 2131 static int 2132 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2133 { 2134 dsl_dataset_t *ds = arg1; 2135 char *newsnapname = arg2; 2136 dsl_dir_t *dd = ds->ds_dir; 2137 dsl_dataset_t *hds; 2138 uint64_t val; 2139 int err; 2140 2141 err = dsl_dataset_hold_obj(dd->dd_pool, 2142 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2143 if (err) 2144 return (err); 2145 2146 /* new name better not be in use */ 2147 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2148 dsl_dataset_rele(hds, FTAG); 2149 2150 if (err == 0) 2151 err = EEXIST; 2152 else if (err == ENOENT) 2153 err = 0; 2154 2155 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2156 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2157 err = ENAMETOOLONG; 2158 2159 return (err); 2160 } 2161 2162 static void 2163 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, 2164 cred_t *cr, dmu_tx_t *tx) 2165 { 2166 dsl_dataset_t *ds = arg1; 2167 const char *newsnapname = arg2; 2168 dsl_dir_t *dd = ds->ds_dir; 2169 objset_t *mos = dd->dd_pool->dp_meta_objset; 2170 dsl_dataset_t *hds; 2171 int 
err; 2172 2173 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2174 2175 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2176 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2177 2178 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2179 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2180 ASSERT3U(err, ==, 0); 2181 mutex_enter(&ds->ds_lock); 2182 (void) strcpy(ds->ds_snapname, newsnapname); 2183 mutex_exit(&ds->ds_lock); 2184 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2185 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2186 ASSERT3U(err, ==, 0); 2187 2188 spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, 2189 cr, "dataset = %llu", ds->ds_object); 2190 dsl_dataset_rele(hds, FTAG); 2191 } 2192 2193 struct renamesnaparg { 2194 dsl_sync_task_group_t *dstg; 2195 char failed[MAXPATHLEN]; 2196 char *oldsnap; 2197 char *newsnap; 2198 }; 2199 2200 static int 2201 dsl_snapshot_rename_one(const char *name, void *arg) 2202 { 2203 struct renamesnaparg *ra = arg; 2204 dsl_dataset_t *ds = NULL; 2205 char *snapname; 2206 int err; 2207 2208 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); 2209 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); 2210 2211 /* 2212 * For recursive snapshot renames the parent won't be changing, 2213 * so we just pass name for both the to and from arguments. 2214 */ 2215 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); 2216 if (err != 0) { 2217 strfree(snapname); 2218 return (err == ENOENT ? 0 : err); 2219 } 2220 2221 #ifdef _KERNEL 2222 /* 2223 * Each filesystem undergoing rename needs its snapshot unmounted. 2224 */ 2225 (void) zfs_unmount_snap(snapname, NULL); 2226 #endif 2227 err = dsl_dataset_hold(snapname, ra->dstg, &ds); 2228 strfree(snapname); 2229 if (err != 0) 2230 return (err == ENOENT ?
0 : err); 2231 2232 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2233 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2234 2235 return (0); 2236 } 2237 2238 static int 2239 dsl_recursive_rename(char *oldname, const char *newname) 2240 { 2241 int err; 2242 struct renamesnaparg *ra; 2243 dsl_sync_task_t *dst; 2244 spa_t *spa; 2245 char *cp, *fsname = spa_strdup(oldname); 2246 int len = strlen(oldname) + 1; 2247 2248 /* truncate the snapshot name to get the fsname */ 2249 cp = strchr(fsname, '@'); 2250 *cp = '\0'; 2251 2252 err = spa_open(fsname, &spa, FTAG); 2253 if (err) { 2254 kmem_free(fsname, len); 2255 return (err); 2256 } 2257 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2258 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2259 2260 ra->oldsnap = strchr(oldname, '@') + 1; 2261 ra->newsnap = strchr(newname, '@') + 1; 2262 *ra->failed = '\0'; 2263 2264 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2265 DS_FIND_CHILDREN); 2266 kmem_free(fsname, len); 2267 2268 if (err == 0) { 2269 err = dsl_sync_task_group_wait(ra->dstg); 2270 } 2271 2272 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2273 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2274 dsl_dataset_t *ds = dst->dst_arg1; 2275 if (dst->dst_err) { 2276 dsl_dir_name(ds->ds_dir, ra->failed); 2277 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2278 (void) strlcat(ra->failed, ra->newsnap, 2279 sizeof (ra->failed)); 2280 } 2281 dsl_dataset_rele(ds, ra->dstg); 2282 } 2283 2284 if (err) 2285 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2286 2287 dsl_sync_task_group_destroy(ra->dstg); 2288 kmem_free(ra, sizeof (struct renamesnaparg)); 2289 spa_close(spa, FTAG); 2290 return (err); 2291 } 2292 2293 static int 2294 dsl_valid_rename(const char *oldname, void *arg) 2295 { 2296 int delta = *(int *)arg; 2297 2298 if (strlen(oldname) + delta >= MAXNAMELEN) 2299 return (ENAMETOOLONG); 2300 2301 return (0); 2302 } 2303 2304 #pragma weak dmu_objset_rename = dsl_dataset_rename 2305 int 2306 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2307 { 2308 dsl_dir_t *dd; 2309 dsl_dataset_t *ds; 2310 const char *tail; 2311 int err; 2312 2313 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2314 if (err) 2315 return (err); 2316 2317 if (tail == NULL) { 2318 int delta = strlen(newname) - strlen(oldname); 2319 2320 /* if we're growing, validate child name lengths */ 2321 if (delta > 0) 2322 err = dmu_objset_find(oldname, dsl_valid_rename, 2323 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2324 2325 if (err == 0) 2326 err = dsl_dir_rename(dd, newname); 2327 dsl_dir_close(dd, FTAG); 2328 return (err); 2329 } 2330 2331 if (tail[0] != '@') { 2332 /* the name ended in a nonexistent component */ 2333 dsl_dir_close(dd, FTAG); 2334 return (ENOENT); 2335 } 2336 2337 dsl_dir_close(dd, FTAG); 2338 2339 /* new name must be snapshot in same filesystem */ 2340 tail = strchr(newname, '@'); 2341 if (tail == NULL) 2342 return (EINVAL); 2343 tail++; 2344 if (strncmp(oldname, newname, tail - newname) != 0) 2345 return (EXDEV); 2346 2347 if (recursive) { 2348 err = dsl_recursive_rename(oldname, newname); 2349 } else { 2350 err = dsl_dataset_hold(oldname, FTAG, &ds); 2351 if (err) 2352 return (err); 2353 2354 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2355 dsl_dataset_snapshot_rename_check, 2356 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2357 2358 dsl_dataset_rele(ds, FTAG); 2359 } 2360 2361 return (err); 2362 } 2363 2364 struct promotenode { 2365 list_node_t 
link; 2366 dsl_dataset_t *ds; 2367 }; 2368 2369 struct promotearg { 2370 list_t shared_snaps, origin_snaps, clone_snaps; 2371 dsl_dataset_t *origin_origin, *origin_head; 2372 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2373 char *err_ds; 2374 }; 2375 2376 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2377 2378 /* ARGSUSED */ 2379 static int 2380 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2381 { 2382 dsl_dataset_t *hds = arg1; 2383 struct promotearg *pa = arg2; 2384 struct promotenode *snap = list_head(&pa->shared_snaps); 2385 dsl_dataset_t *origin_ds = snap->ds; 2386 int err; 2387 2388 /* Check that it is a real clone */ 2389 if (!dsl_dir_is_clone(hds->ds_dir)) 2390 return (EINVAL); 2391 2392 /* Since this is so expensive, don't do the preliminary check */ 2393 if (!dmu_tx_is_syncing(tx)) 2394 return (0); 2395 2396 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2397 return (EXDEV); 2398 2399 /* compute origin's new unique space */ 2400 snap = list_tail(&pa->clone_snaps); 2401 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2402 err = bplist_space_birthrange(&snap->ds->ds_deadlist, 2403 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); 2404 if (err) 2405 return (err); 2406 2407 /* 2408 * Walk the snapshots that we are moving 2409 * 2410 * Compute space to transfer. Consider the incremental changes 2411 * to used for each snapshot: 2412 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2413 * So each snapshot gave birth to: 2414 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2415 * So a sequence would look like: 2416 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2417 * Which simplifies to: 2418 * uN + kN + kN-1 + ... + k1 + k0 2419 * Note however, if we stop before we reach the ORIGIN we get: 2420 * uN + kN + kN-1 + ... + kM - uM-1 2421 */ 2422 pa->used = origin_ds->ds_phys->ds_used_bytes; 2423 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2424 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2425 for (snap = list_head(&pa->shared_snaps); snap; 2426 snap = list_next(&pa->shared_snaps, snap)) { 2427 uint64_t val, dlused, dlcomp, dluncomp; 2428 dsl_dataset_t *ds = snap->ds; 2429 2430 /* Check that the snapshot name does not conflict */ 2431 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2432 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2433 if (err == 0) { 2434 err = EEXIST; 2435 goto out; 2436 } 2437 if (err != ENOENT) 2438 goto out; 2439 2440 /* The very first snapshot does not have a deadlist */ 2441 if (ds->ds_phys->ds_prev_snap_obj == 0) 2442 continue; 2443 2444 if (err = bplist_space(&ds->ds_deadlist, 2445 &dlused, &dlcomp, &dluncomp)) 2446 goto out; 2447 pa->used += dlused; 2448 pa->comp += dlcomp; 2449 pa->uncomp += dluncomp; 2450 } 2451 2452 /* 2453 * If we are a clone of a clone then we never reached ORIGIN, 2454 * so we need to subtract out the clone origin's used space. 2455 */ 2456 if (pa->origin_origin) { 2457 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; 2458 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2459 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2460 } 2461 2462 /* Check that there is enough space here */ 2463 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2464 pa->used); 2465 if (err) 2466 return (err); 2467 2468 /* 2469 * Compute the amounts of space that will be used by snapshots 2470 * after the promotion (for both origin and clone). 
For each, 2471 * it is the amount of space that will be on all of their 2472 * deadlists (that was not born before their new origin). 2473 */ 2474 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2475 uint64_t space; 2476 2477 /* 2478 * Note, typically this will not be a clone of a clone, 2479 * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so 2480 * these snaplist_space() -> bplist_space_birthrange() 2481 * calls will be fast because they do not have to 2482 * iterate over all bps. 2483 */ 2484 snap = list_head(&pa->origin_snaps); 2485 err = snaplist_space(&pa->shared_snaps, 2486 snap->ds->ds_origin_txg, &pa->cloneusedsnap); 2487 if (err) 2488 return (err); 2489 2490 err = snaplist_space(&pa->clone_snaps, 2491 snap->ds->ds_origin_txg, &space); 2492 if (err) 2493 return (err); 2494 pa->cloneusedsnap += space; 2495 } 2496 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2497 err = snaplist_space(&pa->origin_snaps, 2498 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2499 if (err) 2500 return (err); 2501 } 2502 2503 return (0); 2504 out: 2505 pa->err_ds = snap->ds->ds_snapname; 2506 return (err); 2507 } 2508 2509 static void 2510 dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 2511 { 2512 dsl_dataset_t *hds = arg1; 2513 struct promotearg *pa = arg2; 2514 struct promotenode *snap = list_head(&pa->shared_snaps); 2515 dsl_dataset_t *origin_ds = snap->ds; 2516 dsl_dataset_t *origin_head; 2517 dsl_dir_t *dd = hds->ds_dir; 2518 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2519 dsl_dir_t *odd = NULL; 2520 uint64_t oldnext_obj; 2521 int64_t delta; 2522 2523 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2524 2525 snap = list_head(&pa->origin_snaps); 2526 origin_head = snap->ds; 2527 2528 /* 2529 * We need to explicitly open odd, since origin_ds's dd will be 2530 * changing. 
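 * (The FTAG hold taken here keeps odd valid while dd_origin_obj is
 * rewritten below; it is dropped via dsl_dir_close() at the end of
 * this function.)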
2531 */ 2532 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2533 NULL, FTAG, &odd)); 2534 2535 /* change origin's next snap */ 2536 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2537 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2538 snap = list_tail(&pa->clone_snaps); 2539 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2540 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2541 2542 /* change the origin's next clone */ 2543 if (origin_ds->ds_phys->ds_next_clones_obj) { 2544 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2545 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2546 origin_ds->ds_phys->ds_next_clones_obj, 2547 oldnext_obj, tx)); 2548 } 2549 2550 /* change origin */ 2551 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2552 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2553 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2554 hds->ds_origin_txg = origin_head->ds_origin_txg; 2555 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2556 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2557 origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; 2558 2559 /* move snapshots to this dir */ 2560 for (snap = list_head(&pa->shared_snaps); snap; 2561 snap = list_next(&pa->shared_snaps, snap)) { 2562 dsl_dataset_t *ds = snap->ds; 2563 2564 /* unregister props as dsl_dir is changing */ 2565 if (ds->ds_objset) { 2566 dmu_objset_evict(ds->ds_objset); 2567 ds->ds_objset = NULL; 2568 } 2569 /* move snap name entry */ 2570 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2571 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 2572 ds->ds_snapname, tx)); 2573 VERIFY(0 == zap_add(dp->dp_meta_objset, 2574 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 2575 8, 1, &ds->ds_object, tx)); 2576 /* change containing dsl_dir */ 2577 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2578 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 2579 ds->ds_phys->ds_dir_obj = dd->dd_object; 2580 ASSERT3P(ds->ds_dir, ==, odd); 2581 dsl_dir_close(ds->ds_dir, ds); 2582 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 2583 NULL, ds, &ds->ds_dir)); 2584 2585 ASSERT3U(dsl_prop_numcb(ds), ==, 0); 2586 } 2587 2588 /* 2589 * Change space accounting. 2590 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 2591 * both be valid, or both be 0 (resulting in delta == 0). This 2592 * is true for each of {clone,origin} independently. 2593 */ 2594 2595 delta = pa->cloneusedsnap - 2596 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2597 ASSERT3S(delta, >=, 0); 2598 ASSERT3U(pa->used, >=, delta); 2599 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 2600 dsl_dir_diduse_space(dd, DD_USED_HEAD, 2601 pa->used - delta, pa->comp, pa->uncomp, tx); 2602 2603 delta = pa->originusedsnap - 2604 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2605 ASSERT3S(delta, <=, 0); 2606 ASSERT3U(pa->used, >=, -delta); 2607 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 2608 dsl_dir_diduse_space(odd, DD_USED_HEAD, 2609 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 2610 2611 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 2612 2613 /* log history record */ 2614 spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, 2615 cr, "dataset = %llu", hds->ds_object); 2616 2617 dsl_dir_close(odd, FTAG); 2618 } 2619 2620 static char *snaplist_tag = "snaplist"; 2621 /* 2622 * Make a list of dsl_dataset_t's for the snapshots between first_obj 2623 * (exclusive) and last_obj (inclusive). 
The list will be in reverse 2624 * order (last_obj will be the list_head()). If first_obj == 0, do all 2625 * snapshots back to this dataset's origin. 2626 */ 2627 static int 2628 snaplist_make(dsl_pool_t *dp, boolean_t own, 2629 uint64_t first_obj, uint64_t last_obj, list_t *l) 2630 { 2631 uint64_t obj = last_obj; 2632 2633 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 2634 2635 list_create(l, sizeof (struct promotenode), 2636 offsetof(struct promotenode, link)); 2637 2638 while (obj != first_obj) { 2639 dsl_dataset_t *ds; 2640 struct promotenode *snap; 2641 int err; 2642 2643 if (own) { 2644 err = dsl_dataset_own_obj(dp, obj, 2645 0, snaplist_tag, &ds); 2646 if (err == 0) 2647 dsl_dataset_make_exclusive(ds, snaplist_tag); 2648 } else { 2649 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 2650 } 2651 if (err == ENOENT) { 2652 /* lost race with snapshot destroy */ 2653 struct promotenode *last = list_tail(l); 2654 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 2655 obj = last->ds->ds_phys->ds_prev_snap_obj; 2656 continue; 2657 } else if (err) { 2658 return (err); 2659 } 2660 2661 if (first_obj == 0) 2662 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 2663 2664 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 2665 snap->ds = ds; 2666 list_insert_tail(l, snap); 2667 obj = ds->ds_phys->ds_prev_snap_obj; 2668 } 2669 2670 return (0); 2671 } 2672 2673 static int 2674 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 2675 { 2676 struct promotenode *snap; 2677 2678 *spacep = 0; 2679 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 2680 uint64_t used; 2681 int err = bplist_space_birthrange(&snap->ds->ds_deadlist, 2682 mintxg, UINT64_MAX, &used); 2683 if (err) 2684 return (err); 2685 *spacep += used; 2686 } 2687 return (0); 2688 } 2689 2690 static void 2691 snaplist_destroy(list_t *l, boolean_t own) 2692 { 2693 struct promotenode *snap; 2694 2695 if (!l || !list_link_active(&l->list_head)) 2696 return; 2697 2698 while ((snap = list_tail(l)) != NULL) { 2699 list_remove(l, snap); 2700 if (own) 2701 dsl_dataset_disown(snap->ds, snaplist_tag); 2702 else 2703 dsl_dataset_rele(snap->ds, snaplist_tag); 2704 kmem_free(snap, sizeof (struct promotenode)); 2705 } 2706 list_destroy(l); 2707 } 2708 2709 /* 2710 * Promote a clone. Nomenclature note: 2711 * "clone" or "cds": the original clone which is being promoted 2712 * "origin" or "ods": the snapshot which is originally the clone's origin 2713 * "origin head" or "ohds": the dataset which is the head 2714 * (filesystem/volume) for the origin 2715 * "origin origin": the origin of the origin's filesystem (typically 2716 * NULL, indicating that the clone is not a clone of a clone). 2717 */ 2718 int 2719 dsl_dataset_promote(const char *name, char *conflsnap) 2720 { 2721 dsl_dataset_t *ds; 2722 dsl_dir_t *dd; 2723 dsl_pool_t *dp; 2724 dmu_object_info_t doi; 2725 struct promotearg pa = { 0 }; 2726 struct promotenode *snap; 2727 int err; 2728 2729 err = dsl_dataset_hold(name, FTAG, &ds); 2730 if (err) 2731 return (err); 2732 dd = ds->ds_dir; 2733 dp = dd->dd_pool; 2734 2735 err = dmu_object_info(dp->dp_meta_objset, 2736 ds->ds_phys->ds_snapnames_zapobj, &doi); 2737 if (err) { 2738 dsl_dataset_rele(ds, FTAG); 2739 return (err); 2740 } 2741 2742 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 2743 dsl_dataset_rele(ds, FTAG); 2744 return (EINVAL); 2745 } 2746 2747 /* 2748 * We are going to inherit all the snapshots taken before our 2749 * origin (i.e., our new origin will be our parent's origin).
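 * For example (hypothetical names): if pool/fs@snap is the origin of
 * pool/clone, then promoting pool/clone moves pool/fs@snap (and any
 * older snapshots of pool/fs) under pool/clone, and pool/fs becomes a
 * clone of pool/clone@snap.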
2750 * Take ownership of them so that we can rename them into our 2751 * namespace. 2752 */ 2753 rw_enter(&dp->dp_config_rwlock, RW_READER); 2754 2755 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 2756 &pa.shared_snaps); 2757 if (err != 0) 2758 goto out; 2759 2760 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 2761 if (err != 0) 2762 goto out; 2763 2764 snap = list_head(&pa.shared_snaps); 2765 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 2766 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 2767 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 2768 if (err != 0) 2769 goto out; 2770 2771 if (dsl_dir_is_clone(snap->ds->ds_dir)) { 2772 err = dsl_dataset_own_obj(dp, 2773 snap->ds->ds_dir->dd_phys->dd_origin_obj, 2774 0, FTAG, &pa.origin_origin); 2775 if (err != 0) 2776 goto out; 2777 } 2778 2779 out: 2780 rw_exit(&dp->dp_config_rwlock); 2781 2782 /* 2783 * Add in 128x the snapnames zapobj size, since we will be moving 2784 * a bunch of snapnames to the promoted ds, and dirtying their 2785 * bonus buffers. 2786 */ 2787 if (err == 0) { 2788 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 2789 dsl_dataset_promote_sync, ds, &pa, 2790 2 + 2 * doi.doi_physical_blocks_512); 2791 if (err && pa.err_ds && conflsnap) 2792 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 2793 } 2794 2795 snaplist_destroy(&pa.shared_snaps, B_TRUE); 2796 snaplist_destroy(&pa.clone_snaps, B_FALSE); 2797 snaplist_destroy(&pa.origin_snaps, B_FALSE); 2798 if (pa.origin_origin) 2799 dsl_dataset_disown(pa.origin_origin, FTAG); 2800 dsl_dataset_rele(ds, FTAG); 2801 return (err); 2802 } 2803 2804 struct cloneswaparg { 2805 dsl_dataset_t *cds; /* clone dataset */ 2806 dsl_dataset_t *ohds; /* origin's head dataset */ 2807 boolean_t force; 2808 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 2809 }; 2810 2811 /* ARGSUSED */ 2812 static int 2813 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 2814 { 2815 struct cloneswaparg *csa = arg1; 2816 2817 /* they should both be heads */ 2818 if (dsl_dataset_is_snapshot(csa->cds) || 2819 dsl_dataset_is_snapshot(csa->ohds)) 2820 return (EINVAL); 2821 2822 /* the branch point should be just before them */ 2823 if (csa->cds->ds_prev != csa->ohds->ds_prev) 2824 return (EINVAL); 2825 2826 /* cds should be the clone (unless they are unrelated) */ 2827 if (csa->cds->ds_prev != NULL && 2828 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 2829 csa->ohds->ds_object != 2830 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 2831 return (EINVAL); 2832 2833 /* the clone should be a child of the origin */ 2834 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 2835 return (EINVAL); 2836 2837 /* ohds shouldn't be modified unless 'force' */ 2838 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 2839 return (ETXTBSY); 2840 2841 /* adjust amount of any unconsumed refreservation */ 2842 csa->unused_refres_delta = 2843 (int64_t)MIN(csa->ohds->ds_reserved, 2844 csa->ohds->ds_phys->ds_unique_bytes) - 2845 (int64_t)MIN(csa->ohds->ds_reserved, 2846 csa->cds->ds_phys->ds_unique_bytes); 2847 2848 if (csa->unused_refres_delta > 0 && 2849 csa->unused_refres_delta > 2850 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 2851 return (ENOSPC); 2852 2853 if (csa->ohds->ds_quota != 0 && 2854 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 2855 return (EDQUOT); 2856 2857 return (0); 2858 } 2859 2860 /* ARGSUSED */ 2861 static void 2862 
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 2863 { 2864 struct cloneswaparg *csa = arg1; 2865 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 2866 2867 ASSERT(csa->cds->ds_reserved == 0); 2868 ASSERT(csa->ohds->ds_quota == 0 || 2869 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 2870 2871 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 2872 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 2873 2874 if (csa->cds->ds_objset != NULL) { 2875 dmu_objset_evict(csa->cds->ds_objset); 2876 csa->cds->ds_objset = NULL; 2877 } 2878 2879 if (csa->ohds->ds_objset != NULL) { 2880 dmu_objset_evict(csa->ohds->ds_objset); 2881 csa->ohds->ds_objset = NULL; 2882 } 2883 2884 /* 2885 * Reset origin's unique bytes, if it exists. 2886 */ 2887 if (csa->cds->ds_prev) { 2888 dsl_dataset_t *origin = csa->cds->ds_prev; 2889 dmu_buf_will_dirty(origin->ds_dbuf, tx); 2890 VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, 2891 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2892 &origin->ds_phys->ds_unique_bytes)); 2893 } 2894 2895 /* swap blkptrs */ 2896 { 2897 blkptr_t tmp; 2898 tmp = csa->ohds->ds_phys->ds_bp; 2899 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 2900 csa->cds->ds_phys->ds_bp = tmp; 2901 } 2902 2903 /* set dd_*_bytes */ 2904 { 2905 int64_t dused, dcomp, duncomp; 2906 uint64_t cdl_used, cdl_comp, cdl_uncomp; 2907 uint64_t odl_used, odl_comp, odl_uncomp; 2908 2909 ASSERT3U(csa->cds->ds_dir->dd_phys-> 2910 dd_used_breakdown[DD_USED_SNAP], ==, 0); 2911 2912 VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, 2913 &cdl_comp, &cdl_uncomp)); 2914 VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, 2915 &odl_comp, &odl_uncomp)); 2916 2917 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - 2918 (csa->ohds->ds_phys->ds_used_bytes + odl_used); 2919 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 2920 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 2921 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 2922 cdl_uncomp - 2923 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 2924 2925 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 2926 dused, dcomp, duncomp, tx); 2927 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 2928 -dused, -dcomp, -duncomp, tx); 2929 2930 /* 2931 * The difference in the space used by snapshots is the 2932 * difference in snapshot space due to the head's 2933 * deadlist (since that's the only thing that's 2934 * changing that affects the snapused). 
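 * For example (hypothetical numbers): if the clone's deadlist holds 5G
 * of blocks born since ds_origin_txg and the head's deadlist holds 2G,
 * then cdl_used - odl_used = 3G is transferred from DD_USED_HEAD to
 * DD_USED_SNAP below.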
2935 */ 2936 VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, 2937 csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); 2938 VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, 2939 csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); 2940 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 2941 DD_USED_HEAD, DD_USED_SNAP, tx); 2942 } 2943 2944 #define SWITCH64(x, y) \ 2945 { \ 2946 uint64_t __tmp = (x); \ 2947 (x) = (y); \ 2948 (y) = __tmp; \ 2949 } 2950 2951 /* swap ds_*_bytes */ 2952 SWITCH64(csa->ohds->ds_phys->ds_used_bytes, 2953 csa->cds->ds_phys->ds_used_bytes); 2954 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 2955 csa->cds->ds_phys->ds_compressed_bytes); 2956 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 2957 csa->cds->ds_phys->ds_uncompressed_bytes); 2958 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 2959 csa->cds->ds_phys->ds_unique_bytes); 2960 2961 /* apply any parent delta for change in unconsumed refreservation */ 2962 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 2963 csa->unused_refres_delta, 0, 0, tx); 2964 2965 /* swap deadlists */ 2966 bplist_close(&csa->cds->ds_deadlist); 2967 bplist_close(&csa->ohds->ds_deadlist); 2968 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 2969 csa->cds->ds_phys->ds_deadlist_obj); 2970 VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 2971 csa->cds->ds_phys->ds_deadlist_obj)); 2972 VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 2973 csa->ohds->ds_phys->ds_deadlist_obj)); 2974 2975 dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx); 2976 } 2977 2978 /* 2979 * Swap 'clone' with its origin head dataset. Used at the end of "zfs 2980 * recv" into an existing fs to swizzle the file system to the new 2981 * version, and by "zfs rollback". Can also be used to swap two 2982 * independent head datasets if neither has any snapshots. 2983 */ 2984 int 2985 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 2986 boolean_t force) 2987 { 2988 struct cloneswaparg csa; 2989 int error; 2990 2991 ASSERT(clone->ds_owner); 2992 ASSERT(origin_head->ds_owner); 2993 retry: 2994 /* Need exclusive access for the swap */ 2995 rw_enter(&clone->ds_rwlock, RW_WRITER); 2996 if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 2997 rw_exit(&clone->ds_rwlock); 2998 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 2999 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3000 rw_exit(&origin_head->ds_rwlock); 3001 goto retry; 3002 } 3003 } 3004 csa.cds = clone; 3005 csa.ohds = origin_head; 3006 csa.force = force; 3007 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3008 dsl_dataset_clone_swap_check, 3009 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3010 return (error); 3011 } 3012 3013 /* 3014 * Given a pool name and a dataset object number in that pool, 3015 * return the name of that dataset.
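 * (Callers, e.g. the ZFS_IOC_DSOBJ_TO_DSNAME ioctl handler, are
 * assumed to pass a buf large enough for a full dataset name.)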
3016 */ 3017 int 3018 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3019 { 3020 spa_t *spa; 3021 dsl_pool_t *dp; 3022 dsl_dataset_t *ds; 3023 int error; 3024 3025 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3026 return (error); 3027 dp = spa_get_dsl(spa); 3028 rw_enter(&dp->dp_config_rwlock, RW_READER); 3029 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3030 dsl_dataset_name(ds, buf); 3031 dsl_dataset_rele(ds, FTAG); 3032 } 3033 rw_exit(&dp->dp_config_rwlock); 3034 spa_close(spa, FTAG); 3035 3036 return (error); 3037 } 3038 3039 int 3040 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3041 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3042 { 3043 int error = 0; 3044 3045 ASSERT3S(asize, >, 0); 3046 3047 /* 3048 * *ref_rsrv is the portion of asize that will come from any 3049 * unconsumed refreservation space. 3050 */ 3051 *ref_rsrv = 0; 3052 3053 mutex_enter(&ds->ds_lock); 3054 /* 3055 * Make a space adjustment for reserved bytes. 3056 */ 3057 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { 3058 ASSERT3U(*used, >=, 3059 ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3060 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3061 *ref_rsrv = 3062 asize - MIN(asize, parent_delta(ds, asize + inflight)); 3063 } 3064 3065 if (!check_quota || ds->ds_quota == 0) { 3066 mutex_exit(&ds->ds_lock); 3067 return (0); 3068 } 3069 /* 3070 * If they are requesting more space, and our current estimate 3071 * is over quota, they get to try again unless the actual 3072 * on-disk is over quota and there are no pending changes (which 3073 * may free up space for us). 3074 */ 3075 if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { 3076 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) 3077 error = ERESTART; 3078 else 3079 error = EDQUOT; 3080 } 3081 mutex_exit(&ds->ds_lock); 3082 3083 return (error); 3084 } 3085 3086 /* ARGSUSED */ 3087 static int 3088 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) 3089 { 3090 dsl_dataset_t *ds = arg1; 3091 dsl_prop_setarg_t *psa = arg2; 3092 int err; 3093 3094 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) 3095 return (ENOTSUP); 3096 3097 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3098 return (err); 3099 3100 if (psa->psa_effective_value == 0) 3101 return (0); 3102 3103 if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || 3104 psa->psa_effective_value < ds->ds_reserved) 3105 return (ENOSPC); 3106 3107 return (0); 3108 } 3109 3110 extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); 3111 3112 void 3113 dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3114 { 3115 dsl_dataset_t *ds = arg1; 3116 dsl_prop_setarg_t *psa = arg2; 3117 uint64_t effective_value = psa->psa_effective_value; 3118 3119 dsl_prop_set_sync(ds, psa, cr, tx); 3120 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3121 3122 if (ds->ds_quota != effective_value) { 3123 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3124 ds->ds_quota = effective_value; 3125 3126 spa_history_internal_log(LOG_DS_REFQUOTA, 3127 ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", 3128 (longlong_t)ds->ds_quota, ds->ds_object); 3129 } 3130 } 3131 3132 int 3133 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) 3134 { 3135 dsl_dataset_t *ds; 3136 dsl_prop_setarg_t psa; 3137 int err; 3138 3139 dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota); 3140 3141 err = dsl_dataset_hold(dsname, FTAG, &ds);
3142 if (err) 3143 return (err); 3144 3145 /* 3146 * If someone removes a file, then tries to set the quota, we 3147 * want to make sure the file freeing takes effect. 3148 */ 3149 txg_wait_open(ds->ds_dir->dd_pool, 0); 3150 3151 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3152 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, 3153 ds, &psa, 0); 3154 3155 dsl_dataset_rele(ds, FTAG); 3156 return (err); 3157 } 3158 3159 static int 3160 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) 3161 { 3162 dsl_dataset_t *ds = arg1; 3163 dsl_prop_setarg_t *psa = arg2; 3164 uint64_t effective_value; 3165 uint64_t unique; 3166 int err; 3167 3168 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 3169 SPA_VERSION_REFRESERVATION) 3170 return (ENOTSUP); 3171 3172 if (dsl_dataset_is_snapshot(ds)) 3173 return (EINVAL); 3174 3175 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3176 return (err); 3177 3178 effective_value = psa->psa_effective_value; 3179 3180 /* 3181 * If we are doing the preliminary check in open context, the 3182 * space estimates may be inaccurate. 3183 */ 3184 if (!dmu_tx_is_syncing(tx)) 3185 return (0); 3186 3187 mutex_enter(&ds->ds_lock); 3188 unique = dsl_dataset_unique(ds); 3189 mutex_exit(&ds->ds_lock); 3190 3191 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3192 uint64_t delta = MAX(unique, effective_value) - 3193 MAX(unique, ds->ds_reserved); 3194 3195 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3196 return (ENOSPC); 3197 if (ds->ds_quota > 0 && 3198 effective_value > ds->ds_quota) 3199 return (ENOSPC); 3200 } 3201 3202 return (0); 3203 } 3204 3205 /* ARGSUSED */ 3206 static void 3207 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, 3208 dmu_tx_t *tx) 3209 { 3210 dsl_dataset_t *ds = arg1; 3211 dsl_prop_setarg_t *psa = arg2; 3212 uint64_t effective_value = psa->psa_effective_value; 3213 uint64_t unique; 3214 int64_t delta; 3215 3216 dsl_prop_set_sync(ds, psa, cr, tx); 3217 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3218 3219 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3220 3221 mutex_enter(&ds->ds_dir->dd_lock); 3222 mutex_enter(&ds->ds_lock); 3223 unique = dsl_dataset_unique(ds); 3224 delta = MAX(0, (int64_t)(effective_value - unique)) - 3225 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3226 ds->ds_reserved = effective_value; 3227 mutex_exit(&ds->ds_lock); 3228 3229 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3230 mutex_exit(&ds->ds_dir->dd_lock); 3231 3232 spa_history_internal_log(LOG_DS_REFRESERV, 3233 ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", 3234 (longlong_t)effective_value, ds->ds_object); 3235 } 3236 3237 int 3238 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3239 uint64_t reservation) 3240 { 3241 dsl_dataset_t *ds; 3242 dsl_prop_setarg_t psa; 3243 int err; 3244 3245 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3246 &reservation); 3247 3248 err = dsl_dataset_hold(dsname, FTAG, &ds); 3249 if (err) 3250 return (err); 3251 3252 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3253 dsl_dataset_set_reservation_check, 3254 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3255 3256 dsl_dataset_rele(ds, FTAG); 3257 return (err); 3258 } 3259 3260 struct dsl_ds_holdarg { 3261 dsl_sync_task_group_t *dstg; 3262 char *htag; 3263 char *snapname; 3264 boolean_t recursive; 3265 boolean_t gotone; 3266 boolean_t temphold; 3267 char failed[MAXPATHLEN]; 3268 }; 3269 3270 /* 3271 * The max length of a temporary tag prefix is the number 
of hex digits 3272 * required to express UINT64_MAX plus one for the hyphen. 3273 */ 3274 #define MAX_TAG_PREFIX_LEN 17 3275 3276 static int 3277 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) 3278 { 3279 dsl_dataset_t *ds = arg1; 3280 struct dsl_ds_holdarg *ha = arg2; 3281 char *htag = ha->htag; 3282 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3283 int error = 0; 3284 3285 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3286 return (ENOTSUP); 3287 3288 if (!dsl_dataset_is_snapshot(ds)) 3289 return (EINVAL); 3290 3291 /* tags must be unique */ 3292 mutex_enter(&ds->ds_lock); 3293 if (ds->ds_phys->ds_userrefs_obj) { uint64_t tmp; 3294 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 3295 8, 1, &tmp); 3296 if (error == 0) 3297 error = EEXIST; 3298 else if (error == ENOENT) 3299 error = 0; 3300 } 3301 mutex_exit(&ds->ds_lock); 3302 3303 if (error == 0 && ha->temphold && 3304 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 3305 error = E2BIG; 3306 3307 return (error); 3308 } 3309 3310 static void 3311 dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3312 { 3313 dsl_dataset_t *ds = arg1; 3314 struct dsl_ds_holdarg *ha = arg2; 3315 char *htag = ha->htag; 3316 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3317 objset_t *mos = dp->dp_meta_objset; 3318 uint64_t now = gethrestime_sec(); 3319 uint64_t zapobj; 3320 3321 mutex_enter(&ds->ds_lock); 3322 if (ds->ds_phys->ds_userrefs_obj == 0) { 3323 /* 3324 * This is the first user hold for this dataset. Create 3325 * the userrefs zap object. 3326 */ 3327 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3328 zapobj = ds->ds_phys->ds_userrefs_obj = 3329 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); 3330 } else { 3331 zapobj = ds->ds_phys->ds_userrefs_obj; 3332 } 3333 ds->ds_userrefs++; 3334 mutex_exit(&ds->ds_lock); 3335 3336 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); 3337 3338 if (ha->temphold) { 3339 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, 3340 htag, &now, tx)); 3341 } 3342 3343 spa_history_internal_log(LOG_DS_USER_HOLD, 3344 dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag, 3345 (int)ha->temphold, ds->ds_object); 3346 } 3347 3348 static int 3349 dsl_dataset_user_hold_one(const char *dsname, void *arg) 3350 { 3351 struct dsl_ds_holdarg *ha = arg; 3352 dsl_dataset_t *ds; 3353 int error; 3354 char *name; 3355 3356 /* alloc a buffer to hold dsname@snapname plus terminating NULL */ 3357 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3358 error = dsl_dataset_hold(name, ha->dstg, &ds); 3359 strfree(name); 3360 if (error == 0) { 3361 ha->gotone = B_TRUE; 3362 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, 3363 dsl_dataset_user_hold_sync, ds, ha, 0); 3364 } else if (error == ENOENT && ha->recursive) { 3365 error = 0; 3366 } else { 3367 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3368 } 3369 return (error); 3370 } 3371 3372 int 3373 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, 3374 boolean_t recursive, boolean_t temphold) 3375 { 3376 struct dsl_ds_holdarg *ha; 3377 dsl_sync_task_t *dst; 3378 spa_t *spa; 3379 int error; 3380 3381 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3382 3383 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3384 3385 error = spa_open(dsname, &spa, FTAG); 3386 if (error) { 3387 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3388 return (error); 3389 } 3390 3391 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3392 ha->htag = htag; 3393 ha->snapname = snapname; 3394
ha->recursive = recursive; 3395 ha->temphold = temphold; 3396 if (recursive) { 3397 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, 3398 ha, DS_FIND_CHILDREN); 3399 } else { 3400 error = dsl_dataset_user_hold_one(dsname, ha); 3401 } 3402 if (error == 0) 3403 error = dsl_sync_task_group_wait(ha->dstg); 3404 3405 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3406 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3407 dsl_dataset_t *ds = dst->dst_arg1; 3408 3409 if (dst->dst_err) { 3410 dsl_dataset_name(ds, ha->failed); 3411 *strchr(ha->failed, '@') = '\0'; 3412 } 3413 dsl_dataset_rele(ds, ha->dstg); 3414 } 3415 3416 if (error == 0 && recursive && !ha->gotone) 3417 error = ENOENT; 3418 3419 if (error) 3420 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3421 3422 dsl_sync_task_group_destroy(ha->dstg); 3423 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3424 spa_close(spa, FTAG); 3425 return (error); 3426 } 3427 3428 struct dsl_ds_releasearg { 3429 dsl_dataset_t *ds; 3430 const char *htag; 3431 boolean_t own; /* do we own or just hold ds? */ 3432 }; 3433 3434 static int 3435 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, 3436 boolean_t *might_destroy) 3437 { 3438 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3439 uint64_t zapobj; 3440 uint64_t tmp; 3441 int error; 3442 3443 *might_destroy = B_FALSE; 3444 3445 mutex_enter(&ds->ds_lock); 3446 zapobj = ds->ds_phys->ds_userrefs_obj; 3447 if (zapobj == 0) { 3448 /* The tag can't possibly exist */ 3449 mutex_exit(&ds->ds_lock); 3450 return (ESRCH); 3451 } 3452 3453 /* Make sure the tag exists */ 3454 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); 3455 if (error) { 3456 mutex_exit(&ds->ds_lock); 3457 if (error == ENOENT) 3458 error = ESRCH; 3459 return (error); 3460 } 3461 3462 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && 3463 DS_IS_DEFER_DESTROY(ds)) 3464 *might_destroy = B_TRUE; 3465 3466 mutex_exit(&ds->ds_lock); 3467 return (0); 3468 } 3469 3470 static int 3471 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) 3472 { 3473 struct dsl_ds_releasearg *ra = arg1; 3474 dsl_dataset_t *ds = ra->ds; 3475 boolean_t might_destroy; 3476 int error; 3477 3478 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3479 return (ENOTSUP); 3480 3481 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); 3482 if (error) 3483 return (error); 3484 3485 if (might_destroy) { 3486 struct dsl_ds_destroyarg dsda = {0}; 3487 3488 if (dmu_tx_is_syncing(tx)) { 3489 /* 3490 * If we're not prepared to remove the snapshot, 3491 * we can't allow the release to happen right now. 
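 * (dsl_dataset_user_release_one() arranges this in open context by
 * taking ownership via dsl_dataset_tryown() before queueing the sync
 * task; otherwise we return EBUSY here and the caller retries.)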
3492 */ 3493 if (!ra->own) 3494 return (EBUSY); 3495 if (ds->ds_objset) { 3496 dmu_objset_evict(ds->ds_objset); 3497 ds->ds_objset = NULL; 3498 } 3499 } 3500 dsda.ds = ds; 3501 dsda.releasing = B_TRUE; 3502 return (dsl_dataset_destroy_check(&dsda, tag, tx)); 3503 } 3504 3505 return (0); 3506 } 3507 3508 static void 3509 dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) 3510 { 3511 struct dsl_ds_releasearg *ra = arg1; 3512 dsl_dataset_t *ds = ra->ds; 3513 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3514 objset_t *mos = dp->dp_meta_objset; 3515 uint64_t zapobj; 3516 uint64_t dsobj = ds->ds_object; 3517 uint64_t refs; 3518 int error; 3519 3520 mutex_enter(&ds->ds_lock); 3521 ds->ds_userrefs--; 3522 refs = ds->ds_userrefs; 3523 mutex_exit(&ds->ds_lock); 3524 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); 3525 VERIFY(error == 0 || error == ENOENT); 3526 zapobj = ds->ds_phys->ds_userrefs_obj; 3527 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); 3528 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && 3529 DS_IS_DEFER_DESTROY(ds)) { 3530 struct dsl_ds_destroyarg dsda = {0}; 3531 3532 ASSERT(ra->own); 3533 dsda.ds = ds; 3534 dsda.releasing = B_TRUE; 3535 /* We already did the destroy_check */ 3536 dsl_dataset_destroy_sync(&dsda, tag, cr, tx); 3537 } 3538 3539 spa_history_internal_log(LOG_DS_USER_RELEASE, 3540 dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu", 3541 ra->htag, (longlong_t)refs, dsobj); 3542 } 3543 3544 static int 3545 dsl_dataset_user_release_one(const char *dsname, void *arg) 3546 { 3547 struct dsl_ds_holdarg *ha = arg; 3548 struct dsl_ds_releasearg *ra; 3549 dsl_dataset_t *ds; 3550 int error; 3551 void *dtag = ha->dstg; 3552 char *name; 3553 boolean_t own = B_FALSE; 3554 boolean_t might_destroy; 3555 3556 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ 3557 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3558 error = dsl_dataset_hold(name, dtag, &ds); 3559 strfree(name); 3560 if (error == ENOENT && ha->recursive) 3561 return (0); 3562 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3563 if (error) 3564 return (error); 3565 3566 ha->gotone = B_TRUE; 3567 3568 ASSERT(dsl_dataset_is_snapshot(ds)); 3569 3570 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); 3571 if (error) { 3572 dsl_dataset_rele(ds, dtag); 3573 return (error); 3574 } 3575 3576 if (might_destroy) { 3577 #ifdef _KERNEL 3578 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3579 error = zfs_unmount_snap(name, NULL); 3580 strfree(name); 3581 if (error) { 3582 dsl_dataset_rele(ds, dtag); 3583 return (error); 3584 } 3585 #endif 3586 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { 3587 dsl_dataset_rele(ds, dtag); 3588 return (EBUSY); 3589 } else { 3590 own = B_TRUE; 3591 dsl_dataset_make_exclusive(ds, dtag); 3592 } 3593 } 3594 3595 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); 3596 ra->ds = ds; 3597 ra->htag = ha->htag; 3598 ra->own = own; 3599 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, 3600 dsl_dataset_user_release_sync, ra, dtag, 0); 3601 3602 return (0); 3603 } 3604 3605 int 3606 dsl_dataset_user_release(char *dsname, char *snapname, char *htag, 3607 boolean_t recursive) 3608 { 3609 struct dsl_ds_holdarg *ha; 3610 dsl_sync_task_t *dst; 3611 spa_t *spa; 3612 int error; 3613 3614 top: 3615 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3616 3617 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3618 3619 error = spa_open(dsname, &spa, FTAG); 3620 if 
(error) { 3621 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3622 return (error); 3623 } 3624 3625 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3626 ha->htag = htag; 3627 ha->snapname = snapname; 3628 ha->recursive = recursive; 3629 if (recursive) { 3630 error = dmu_objset_find(dsname, dsl_dataset_user_release_one, 3631 ha, DS_FIND_CHILDREN); 3632 } else { 3633 error = dsl_dataset_user_release_one(dsname, ha); 3634 } 3635 if (error == 0) 3636 error = dsl_sync_task_group_wait(ha->dstg); 3637 3638 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3639 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3640 struct dsl_ds_releasearg *ra = dst->dst_arg1; 3641 dsl_dataset_t *ds = ra->ds; 3642 3643 if (dst->dst_err) 3644 dsl_dataset_name(ds, ha->failed); 3645 3646 if (ra->own) 3647 dsl_dataset_disown(ds, ha->dstg); 3648 else 3649 dsl_dataset_rele(ds, ha->dstg); 3650 3651 kmem_free(ra, sizeof (struct dsl_ds_releasearg)); 3652 } 3653 3654 if (error == 0 && recursive && !ha->gotone) 3655 error = ENOENT; 3656 3657 if (error && error != EBUSY) 3658 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3659 3660 dsl_sync_task_group_destroy(ha->dstg); 3661 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3662 spa_close(spa, FTAG); 3663 3664 /* 3665 * We can get EBUSY if we were racing with deferred destroy and 3666 * dsl_dataset_user_release_check() hadn't done the necessary 3667 * open context setup. We can also get EBUSY if we're racing 3668 * with destroy and that thread is the ds_owner. Either way 3669 * the busy condition should be transient, and we should retry 3670 * the release operation. 3671 */ 3672 if (error == EBUSY) 3673 goto top; 3674 3675 return (error); 3676 } 3677 3678 /* 3679 * Called at spa_load time to release a stale temporary user hold. 3680 */ 3681 int 3682 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) 3683 { 3684 dsl_dataset_t *ds; 3685 char *snap; 3686 char *name; 3687 int namelen; 3688 int error; 3689 3690 rw_enter(&dp->dp_config_rwlock, RW_READER); 3691 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); 3692 rw_exit(&dp->dp_config_rwlock); 3693 if (error) 3694 return (error); 3695 namelen = dsl_dataset_namelen(ds)+1; 3696 name = kmem_alloc(namelen, KM_SLEEP); 3697 dsl_dataset_name(ds, name); 3698 dsl_dataset_rele(ds, FTAG); 3699 3700 snap = strchr(name, '@'); 3701 *snap = '\0'; 3702 ++snap; 3703 error = dsl_dataset_user_release(name, snap, htag, B_FALSE); kmem_free(name, namelen); return (error); 3704 } 3705 3706 int 3707 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) 3708 { 3709 dsl_dataset_t *ds; 3710 int err; 3711 3712 err = dsl_dataset_hold(dsname, FTAG, &ds); 3713 if (err) 3714 return (err); 3715 3716 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); 3717 if (ds->ds_phys->ds_userrefs_obj != 0) { 3718 zap_attribute_t *za; 3719 zap_cursor_t zc; 3720 3721 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 3722 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, 3723 ds->ds_phys->ds_userrefs_obj); 3724 zap_cursor_retrieve(&zc, za) == 0; 3725 zap_cursor_advance(&zc)) { 3726 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, 3727 za->za_first_integer)); 3728 } 3729 zap_cursor_fini(&zc); 3730 kmem_free(za, sizeof (zap_attribute_t)); 3731 } 3732 dsl_dataset_rele(ds, FTAG); 3733 return (0); 3734 } 3735 3736 /* 3737 * Note, this function is used as the callback for dmu_objset_find(). We 3738 * always return 0 so that we will continue to find and process 3739 * inconsistent datasets, even if we encounter an error trying to 3740 * process one of them.
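 * (Hence the unconditional return (0) below, even when the destroy
 * attempt itself fails.)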
3741 */ 3742 /* ARGSUSED */ 3743 int 3744 dsl_destroy_inconsistent(const char *dsname, void *arg) 3745 { 3746 dsl_dataset_t *ds; 3747 3748 if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { 3749 if (DS_IS_INCONSISTENT(ds)) 3750 (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); 3751 else 3752 dsl_dataset_disown(ds, FTAG); 3753 } 3754 return (0); 3755 } 3756
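/*
 * Usage sketch (hypothetical caller, mirroring what spa_load() does at
 * pool import): walk all child datasets and destroy any that were left
 * DS_FLAG_INCONSISTENT by an interrupted receive:
 *
 *	(void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent,
 *	    NULL, DS_FIND_CHILDREN);
 */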