1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/dmu_objset.h> 27 #include <sys/dsl_dataset.h> 28 #include <sys/dsl_dir.h> 29 #include <sys/dsl_prop.h> 30 #include <sys/dsl_synctask.h> 31 #include <sys/dmu_traverse.h> 32 #include <sys/dmu_tx.h> 33 #include <sys/arc.h> 34 #include <sys/zio.h> 35 #include <sys/zap.h> 36 #include <sys/unique.h> 37 #include <sys/zfs_context.h> 38 #include <sys/zfs_ioctl.h> 39 #include <sys/spa.h> 40 #include <sys/zfs_znode.h> 41 #include <sys/zfs_onexit.h> 42 #include <sys/zvol.h> 43 #include <sys/dsl_scan.h> 44 #include <sys/dsl_deadlist.h> 45 46 static char *dsl_reaper = "the grim reaper"; 47 48 static dsl_checkfunc_t dsl_dataset_destroy_begin_check; 49 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; 50 static dsl_syncfunc_t dsl_dataset_set_reservation_sync; 51 52 #define SWITCH64(x, y) \ 53 { \ 54 uint64_t __tmp = (x); \ 55 (x) = (y); \ 56 (y) = __tmp; \ 57 } 58 59 #define DS_REF_MAX (1ULL << 62) 60 61 #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE 62 63 #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) 64 65 66 /* 67 * Figure out how much of this delta should be propogated to the dsl_dir 68 * layer. If there's a refreservation, that space has already been 69 * partially accounted for in our ancestors. 70 */ 71 static int64_t 72 parent_delta(dsl_dataset_t *ds, int64_t delta) 73 { 74 uint64_t old_bytes, new_bytes; 75 76 if (ds->ds_reserved == 0) 77 return (delta); 78 79 old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 80 new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); 81 82 ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); 83 return (new_bytes - old_bytes); 84 } 85 86 void 87 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) 88 { 89 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); 90 int compressed = BP_GET_PSIZE(bp); 91 int uncompressed = BP_GET_UCSIZE(bp); 92 int64_t delta; 93 94 dprintf_bp(bp, "ds=%p", ds); 95 96 ASSERT(dmu_tx_is_syncing(tx)); 97 /* It could have been compressed away to nothing */ 98 if (BP_IS_HOLE(bp)) 99 return; 100 ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); 101 ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); 102 if (ds == NULL) { 103 /* 104 * Account for the meta-objset space in its placeholder 105 * dsl_dir. 106 */ 107 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ 108 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, 109 used, compressed, uncompressed, tx); 110 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); 111 return; 112 } 113 dmu_buf_will_dirty(ds->ds_dbuf, tx); 114 115 mutex_enter(&ds->ds_dir->dd_lock); 116 mutex_enter(&ds->ds_lock); 117 delta = parent_delta(ds, used); 118 ds->ds_phys->ds_used_bytes += used; 119 ds->ds_phys->ds_compressed_bytes += compressed; 120 ds->ds_phys->ds_uncompressed_bytes += uncompressed; 121 ds->ds_phys->ds_unique_bytes += used; 122 mutex_exit(&ds->ds_lock); 123 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, 124 compressed, uncompressed, tx); 125 dsl_dir_transfer_space(ds->ds_dir, used - delta, 126 DD_USED_REFRSRV, DD_USED_HEAD, tx); 127 mutex_exit(&ds->ds_dir->dd_lock); 128 } 129 130 int 131 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, 132 boolean_t async) 133 { 134 if (BP_IS_HOLE(bp)) 135 return (0); 136 137 ASSERT(dmu_tx_is_syncing(tx)); 138 ASSERT(bp->blk_birth <= tx->tx_txg); 139 140 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); 141 int compressed = BP_GET_PSIZE(bp); 142 int uncompressed = BP_GET_UCSIZE(bp); 143 144 ASSERT(used > 0); 145 if (ds == NULL) { 146 /* 147 * Account for the meta-objset space in its placeholder 148 * dataset. 149 */ 150 dsl_free(tx->tx_pool, tx->tx_txg, bp); 151 152 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, 153 -used, -compressed, -uncompressed, tx); 154 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); 155 return (used); 156 } 157 ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); 158 159 ASSERT(!dsl_dataset_is_snapshot(ds)); 160 dmu_buf_will_dirty(ds->ds_dbuf, tx); 161 162 if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { 163 int64_t delta; 164 165 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); 166 dsl_free(tx->tx_pool, tx->tx_txg, bp); 167 168 mutex_enter(&ds->ds_dir->dd_lock); 169 mutex_enter(&ds->ds_lock); 170 ASSERT(ds->ds_phys->ds_unique_bytes >= used || 171 !DS_UNIQUE_IS_ACCURATE(ds)); 172 delta = parent_delta(ds, -used); 173 ds->ds_phys->ds_unique_bytes -= used; 174 mutex_exit(&ds->ds_lock); 175 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, 176 delta, -compressed, -uncompressed, tx); 177 dsl_dir_transfer_space(ds->ds_dir, -used - delta, 178 DD_USED_REFRSRV, DD_USED_HEAD, tx); 179 mutex_exit(&ds->ds_dir->dd_lock); 180 } else { 181 dprintf_bp(bp, "putting on dead list: %s", ""); 182 if (async) { 183 /* 184 * We are here as part of zio's write done callback, 185 * which means we're a zio interrupt thread. We can't 186 * call dsl_deadlist_insert() now because it may block 187 * waiting for I/O. Instead, put bp on the deferred 188 * queue and let dsl_pool_sync() finish the job. 189 */ 190 bplist_append(&ds->ds_pending_deadlist, bp); 191 } else { 192 dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); 193 } 194 ASSERT3U(ds->ds_prev->ds_object, ==, 195 ds->ds_phys->ds_prev_snap_obj); 196 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); 197 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ 198 if (ds->ds_prev->ds_phys->ds_next_snap_obj == 199 ds->ds_object && bp->blk_birth > 200 ds->ds_prev->ds_phys->ds_prev_snap_txg) { 201 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 202 mutex_enter(&ds->ds_prev->ds_lock); 203 ds->ds_prev->ds_phys->ds_unique_bytes += used; 204 mutex_exit(&ds->ds_prev->ds_lock); 205 } 206 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { 207 dsl_dir_transfer_space(ds->ds_dir, used, 208 DD_USED_HEAD, DD_USED_SNAP, tx); 209 } 210 } 211 mutex_enter(&ds->ds_lock); 212 ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); 213 ds->ds_phys->ds_used_bytes -= used; 214 ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); 215 ds->ds_phys->ds_compressed_bytes -= compressed; 216 ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); 217 ds->ds_phys->ds_uncompressed_bytes -= uncompressed; 218 mutex_exit(&ds->ds_lock); 219 220 return (used); 221 } 222 223 uint64_t 224 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) 225 { 226 uint64_t trysnap = 0; 227 228 if (ds == NULL) 229 return (0); 230 /* 231 * The snapshot creation could fail, but that would cause an 232 * incorrect FALSE return, which would only result in an 233 * overestimation of the amount of space that an operation would 234 * consume, which is OK. 235 * 236 * There's also a small window where we could miss a pending 237 * snapshot, because we could set the sync task in the quiescing 238 * phase. So this should only be used as a guess. 239 */ 240 if (ds->ds_trysnap_txg > 241 spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) 242 trysnap = ds->ds_trysnap_txg; 243 return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); 244 } 245 246 boolean_t 247 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, 248 uint64_t blk_birth) 249 { 250 if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) 251 return (B_FALSE); 252 253 ddt_prefetch(dsl_dataset_get_spa(ds), bp); 254 255 return (B_TRUE); 256 } 257 258 /* ARGSUSED */ 259 static void 260 dsl_dataset_evict(dmu_buf_t *db, void *dsv) 261 { 262 dsl_dataset_t *ds = dsv; 263 264 ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); 265 266 unique_remove(ds->ds_fsid_guid); 267 268 if (ds->ds_objset != NULL) 269 dmu_objset_evict(ds->ds_objset); 270 271 if (ds->ds_prev) { 272 dsl_dataset_drop_ref(ds->ds_prev, ds); 273 ds->ds_prev = NULL; 274 } 275 276 bplist_destroy(&ds->ds_pending_deadlist); 277 if (db != NULL) { 278 dsl_deadlist_close(&ds->ds_deadlist); 279 } else { 280 ASSERT(ds->ds_deadlist.dl_dbuf == NULL); 281 ASSERT(!ds->ds_deadlist.dl_oldfmt); 282 } 283 if (ds->ds_dir) 284 dsl_dir_close(ds->ds_dir, ds); 285 286 ASSERT(!list_link_active(&ds->ds_synced_link)); 287 288 mutex_destroy(&ds->ds_lock); 289 mutex_destroy(&ds->ds_recvlock); 290 mutex_destroy(&ds->ds_opening_lock); 291 rw_destroy(&ds->ds_rwlock); 292 cv_destroy(&ds->ds_exclusive_cv); 293 294 kmem_free(ds, sizeof (dsl_dataset_t)); 295 } 296 297 static int 298 dsl_dataset_get_snapname(dsl_dataset_t *ds) 299 { 300 dsl_dataset_phys_t *headphys; 301 int err; 302 dmu_buf_t *headdbuf; 303 dsl_pool_t *dp = ds->ds_dir->dd_pool; 304 objset_t *mos = dp->dp_meta_objset; 305 306 if (ds->ds_snapname[0]) 307 return (0); 308 if (ds->ds_phys->ds_next_snap_obj == 0) 309 return (0); 310 311 err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, 312 FTAG, &headdbuf); 313 if (err) 314 return (err); 315 headphys = headdbuf->db_data; 316 err = zap_value_search(dp->dp_meta_objset, 317 headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); 318 dmu_buf_rele(headdbuf, FTAG); 319 return (err); 320 } 321 322 static int 323 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) 324 { 325 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 326 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 327 matchtype_t mt; 328 int err; 329 330 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 331 mt = MT_FIRST; 332 else 333 mt = MT_EXACT; 334 335 err = zap_lookup_norm(mos, snapobj, name, 8, 1, 336 value, mt, NULL, 0, NULL); 337 if (err == ENOTSUP && mt == MT_FIRST) 338 err = zap_lookup(mos, snapobj, name, 8, 1, value); 339 return (err); 340 } 341 342 static int 343 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) 344 { 345 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 346 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 347 matchtype_t mt; 348 int err; 349 350 dsl_dir_snap_cmtime_update(ds->ds_dir); 351 352 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 353 mt = MT_FIRST; 354 else 355 mt = MT_EXACT; 356 357 err = zap_remove_norm(mos, snapobj, name, mt, tx); 358 if (err == ENOTSUP && mt == MT_FIRST) 359 err = zap_remove(mos, snapobj, name, tx); 360 return (err); 361 } 362 363 static int 364 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, 365 dsl_dataset_t **dsp) 366 { 367 objset_t *mos = dp->dp_meta_objset; 368 dmu_buf_t *dbuf; 369 dsl_dataset_t *ds; 370 int err; 371 dmu_object_info_t doi; 372 373 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 374 dsl_pool_sync_context(dp)); 375 376 err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); 377 if (err) 378 return (err); 379 380 /* Make sure dsobj has the correct object type. */ 381 dmu_object_info_from_db(dbuf, &doi); 382 if (doi.doi_type != DMU_OT_DSL_DATASET) 383 return (EINVAL); 384 385 ds = dmu_buf_get_user(dbuf); 386 if (ds == NULL) { 387 dsl_dataset_t *winner; 388 389 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); 390 ds->ds_dbuf = dbuf; 391 ds->ds_object = dsobj; 392 ds->ds_phys = dbuf->db_data; 393 394 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); 395 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); 396 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); 397 rw_init(&ds->ds_rwlock, 0, 0, 0); 398 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); 399 400 bplist_create(&ds->ds_pending_deadlist); 401 dsl_deadlist_open(&ds->ds_deadlist, 402 mos, ds->ds_phys->ds_deadlist_obj); 403 404 if (err == 0) { 405 err = dsl_dir_open_obj(dp, 406 ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); 407 } 408 if (err) { 409 mutex_destroy(&ds->ds_lock); 410 mutex_destroy(&ds->ds_recvlock); 411 mutex_destroy(&ds->ds_opening_lock); 412 rw_destroy(&ds->ds_rwlock); 413 cv_destroy(&ds->ds_exclusive_cv); 414 bplist_destroy(&ds->ds_pending_deadlist); 415 dsl_deadlist_close(&ds->ds_deadlist); 416 kmem_free(ds, sizeof (dsl_dataset_t)); 417 dmu_buf_rele(dbuf, tag); 418 return (err); 419 } 420 421 if (!dsl_dataset_is_snapshot(ds)) { 422 ds->ds_snapname[0] = '\0'; 423 if (ds->ds_phys->ds_prev_snap_obj) { 424 err = dsl_dataset_get_ref(dp, 425 ds->ds_phys->ds_prev_snap_obj, 426 ds, &ds->ds_prev); 427 } 428 } else { 429 if (zfs_flags & ZFS_DEBUG_SNAPNAMES) 430 err = dsl_dataset_get_snapname(ds); 431 if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { 432 err = zap_count( 433 ds->ds_dir->dd_pool->dp_meta_objset, 434 ds->ds_phys->ds_userrefs_obj, 435 &ds->ds_userrefs); 436 } 437 } 438 439 if (err == 0 && !dsl_dataset_is_snapshot(ds)) { 440 /* 441 * In sync context, we're called with either no lock 442 * or with the write lock. If we're not syncing, 443 * we're always called with the read lock held. 444 */ 445 boolean_t need_lock = 446 !RW_WRITE_HELD(&dp->dp_config_rwlock) && 447 dsl_pool_sync_context(dp); 448 449 if (need_lock) 450 rw_enter(&dp->dp_config_rwlock, RW_READER); 451 452 err = dsl_prop_get_ds(ds, 453 "refreservation", sizeof (uint64_t), 1, 454 &ds->ds_reserved, NULL); 455 if (err == 0) { 456 err = dsl_prop_get_ds(ds, 457 "refquota", sizeof (uint64_t), 1, 458 &ds->ds_quota, NULL); 459 } 460 461 if (need_lock) 462 rw_exit(&dp->dp_config_rwlock); 463 } else { 464 ds->ds_reserved = ds->ds_quota = 0; 465 } 466 467 if (err == 0) { 468 winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, 469 dsl_dataset_evict); 470 } 471 if (err || winner) { 472 bplist_destroy(&ds->ds_pending_deadlist); 473 dsl_deadlist_close(&ds->ds_deadlist); 474 if (ds->ds_prev) 475 dsl_dataset_drop_ref(ds->ds_prev, ds); 476 dsl_dir_close(ds->ds_dir, ds); 477 mutex_destroy(&ds->ds_lock); 478 mutex_destroy(&ds->ds_recvlock); 479 mutex_destroy(&ds->ds_opening_lock); 480 rw_destroy(&ds->ds_rwlock); 481 cv_destroy(&ds->ds_exclusive_cv); 482 kmem_free(ds, sizeof (dsl_dataset_t)); 483 if (err) { 484 dmu_buf_rele(dbuf, tag); 485 return (err); 486 } 487 ds = winner; 488 } else { 489 ds->ds_fsid_guid = 490 unique_insert(ds->ds_phys->ds_fsid_guid); 491 } 492 } 493 ASSERT3P(ds->ds_dbuf, ==, dbuf); 494 ASSERT3P(ds->ds_phys, ==, dbuf->db_data); 495 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || 496 spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || 497 dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); 498 mutex_enter(&ds->ds_lock); 499 if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { 500 mutex_exit(&ds->ds_lock); 501 dmu_buf_rele(ds->ds_dbuf, tag); 502 return (ENOENT); 503 } 504 mutex_exit(&ds->ds_lock); 505 *dsp = ds; 506 return (0); 507 } 508 509 static int 510 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) 511 { 512 dsl_pool_t *dp = ds->ds_dir->dd_pool; 513 514 /* 515 * In syncing context we don't want the rwlock lock: there 516 * may be an existing writer waiting for sync phase to 517 * finish. We don't need to worry about such writers, since 518 * sync phase is single-threaded, so the writer can't be 519 * doing anything while we are active. 520 */ 521 if (dsl_pool_sync_context(dp)) { 522 ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); 523 return (0); 524 } 525 526 /* 527 * Normal users will hold the ds_rwlock as a READER until they 528 * are finished (i.e., call dsl_dataset_rele()). "Owners" will 529 * drop their READER lock after they set the ds_owner field. 530 * 531 * If the dataset is being destroyed, the destroy thread will 532 * obtain a WRITER lock for exclusive access after it's done its 533 * open-context work and then change the ds_owner to 534 * dsl_reaper once destruction is assured. So threads 535 * may block here temporarily, until the "destructability" of 536 * the dataset is determined. 537 */ 538 ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); 539 mutex_enter(&ds->ds_lock); 540 while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { 541 rw_exit(&dp->dp_config_rwlock); 542 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); 543 if (DSL_DATASET_IS_DESTROYED(ds)) { 544 mutex_exit(&ds->ds_lock); 545 dsl_dataset_drop_ref(ds, tag); 546 rw_enter(&dp->dp_config_rwlock, RW_READER); 547 return (ENOENT); 548 } 549 /* 550 * The dp_config_rwlock lives above the ds_lock. And 551 * we need to check DSL_DATASET_IS_DESTROYED() while 552 * holding the ds_lock, so we have to drop and reacquire 553 * the ds_lock here. 554 */ 555 mutex_exit(&ds->ds_lock); 556 rw_enter(&dp->dp_config_rwlock, RW_READER); 557 mutex_enter(&ds->ds_lock); 558 } 559 mutex_exit(&ds->ds_lock); 560 return (0); 561 } 562 563 int 564 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, 565 dsl_dataset_t **dsp) 566 { 567 int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); 568 569 if (err) 570 return (err); 571 return (dsl_dataset_hold_ref(*dsp, tag)); 572 } 573 574 int 575 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, 576 void *tag, dsl_dataset_t **dsp) 577 { 578 int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); 579 if (err) 580 return (err); 581 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { 582 dsl_dataset_rele(*dsp, tag); 583 *dsp = NULL; 584 return (EBUSY); 585 } 586 return (0); 587 } 588 589 int 590 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) 591 { 592 dsl_dir_t *dd; 593 dsl_pool_t *dp; 594 const char *snapname; 595 uint64_t obj; 596 int err = 0; 597 598 err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); 599 if (err) 600 return (err); 601 602 dp = dd->dd_pool; 603 obj = dd->dd_phys->dd_head_dataset_obj; 604 rw_enter(&dp->dp_config_rwlock, RW_READER); 605 if (obj) 606 err = dsl_dataset_get_ref(dp, obj, tag, dsp); 607 else 608 err = ENOENT; 609 if (err) 610 goto out; 611 612 err = dsl_dataset_hold_ref(*dsp, tag); 613 614 /* we may be looking for a snapshot */ 615 if (err == 0 && snapname != NULL) { 616 dsl_dataset_t *ds = NULL; 617 618 if (*snapname++ != '@') { 619 dsl_dataset_rele(*dsp, tag); 620 err = ENOENT; 621 goto out; 622 } 623 624 dprintf("looking for snapshot '%s'\n", snapname); 625 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); 626 if (err == 0) 627 err = dsl_dataset_get_ref(dp, obj, tag, &ds); 628 dsl_dataset_rele(*dsp, tag); 629 630 ASSERT3U((err == 0), ==, (ds != NULL)); 631 632 if (ds) { 633 mutex_enter(&ds->ds_lock); 634 if (ds->ds_snapname[0] == 0) 635 (void) strlcpy(ds->ds_snapname, snapname, 636 sizeof (ds->ds_snapname)); 637 mutex_exit(&ds->ds_lock); 638 err = dsl_dataset_hold_ref(ds, tag); 639 *dsp = err ? NULL : ds; 640 } 641 } 642 out: 643 rw_exit(&dp->dp_config_rwlock); 644 dsl_dir_close(dd, FTAG); 645 return (err); 646 } 647 648 int 649 dsl_dataset_own(const char *name, boolean_t inconsistentok, 650 void *tag, dsl_dataset_t **dsp) 651 { 652 int err = dsl_dataset_hold(name, tag, dsp); 653 if (err) 654 return (err); 655 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { 656 dsl_dataset_rele(*dsp, tag); 657 return (EBUSY); 658 } 659 return (0); 660 } 661 662 void 663 dsl_dataset_name(dsl_dataset_t *ds, char *name) 664 { 665 if (ds == NULL) { 666 (void) strcpy(name, "mos"); 667 } else { 668 dsl_dir_name(ds->ds_dir, name); 669 VERIFY(0 == dsl_dataset_get_snapname(ds)); 670 if (ds->ds_snapname[0]) { 671 (void) strcat(name, "@"); 672 /* 673 * We use a "recursive" mutex so that we 674 * can call dprintf_ds() with ds_lock held. 675 */ 676 if (!MUTEX_HELD(&ds->ds_lock)) { 677 mutex_enter(&ds->ds_lock); 678 (void) strcat(name, ds->ds_snapname); 679 mutex_exit(&ds->ds_lock); 680 } else { 681 (void) strcat(name, ds->ds_snapname); 682 } 683 } 684 } 685 } 686 687 static int 688 dsl_dataset_namelen(dsl_dataset_t *ds) 689 { 690 int result; 691 692 if (ds == NULL) { 693 result = 3; /* "mos" */ 694 } else { 695 result = dsl_dir_namelen(ds->ds_dir); 696 VERIFY(0 == dsl_dataset_get_snapname(ds)); 697 if (ds->ds_snapname[0]) { 698 ++result; /* adding one for the @-sign */ 699 if (!MUTEX_HELD(&ds->ds_lock)) { 700 mutex_enter(&ds->ds_lock); 701 result += strlen(ds->ds_snapname); 702 mutex_exit(&ds->ds_lock); 703 } else { 704 result += strlen(ds->ds_snapname); 705 } 706 } 707 } 708 709 return (result); 710 } 711 712 void 713 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) 714 { 715 dmu_buf_rele(ds->ds_dbuf, tag); 716 } 717 718 void 719 dsl_dataset_rele(dsl_dataset_t *ds, void *tag) 720 { 721 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { 722 rw_exit(&ds->ds_rwlock); 723 } 724 dsl_dataset_drop_ref(ds, tag); 725 } 726 727 void 728 dsl_dataset_disown(dsl_dataset_t *ds, void *tag) 729 { 730 ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || 731 (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); 732 733 mutex_enter(&ds->ds_lock); 734 ds->ds_owner = NULL; 735 if (RW_WRITE_HELD(&ds->ds_rwlock)) { 736 rw_exit(&ds->ds_rwlock); 737 cv_broadcast(&ds->ds_exclusive_cv); 738 } 739 mutex_exit(&ds->ds_lock); 740 if (ds->ds_dbuf) 741 dsl_dataset_drop_ref(ds, tag); 742 else 743 dsl_dataset_evict(NULL, ds); 744 } 745 746 boolean_t 747 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) 748 { 749 boolean_t gotit = FALSE; 750 751 mutex_enter(&ds->ds_lock); 752 if (ds->ds_owner == NULL && 753 (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { 754 ds->ds_owner = tag; 755 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) 756 rw_exit(&ds->ds_rwlock); 757 gotit = TRUE; 758 } 759 mutex_exit(&ds->ds_lock); 760 return (gotit); 761 } 762 763 void 764 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) 765 { 766 ASSERT3P(owner, ==, ds->ds_owner); 767 if (!RW_WRITE_HELD(&ds->ds_rwlock)) 768 rw_enter(&ds->ds_rwlock, RW_WRITER); 769 } 770 771 uint64_t 772 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, 773 uint64_t flags, dmu_tx_t *tx) 774 { 775 dsl_pool_t *dp = dd->dd_pool; 776 dmu_buf_t *dbuf; 777 dsl_dataset_phys_t *dsphys; 778 uint64_t dsobj; 779 objset_t *mos = dp->dp_meta_objset; 780 781 if (origin == NULL) 782 origin = dp->dp_origin_snap; 783 784 ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); 785 ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); 786 ASSERT(dmu_tx_is_syncing(tx)); 787 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); 788 789 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 790 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 791 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 792 dmu_buf_will_dirty(dbuf, tx); 793 dsphys = dbuf->db_data; 794 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 795 dsphys->ds_dir_obj = dd->dd_object; 796 dsphys->ds_flags = flags; 797 dsphys->ds_fsid_guid = unique_create(); 798 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 799 sizeof (dsphys->ds_guid)); 800 dsphys->ds_snapnames_zapobj = 801 zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, 802 DMU_OT_NONE, 0, tx); 803 dsphys->ds_creation_time = gethrestime_sec(); 804 dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; 805 806 if (origin == NULL) { 807 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); 808 } else { 809 dsl_dataset_t *ohds; 810 811 dsphys->ds_prev_snap_obj = origin->ds_object; 812 dsphys->ds_prev_snap_txg = 813 origin->ds_phys->ds_creation_txg; 814 dsphys->ds_used_bytes = 815 origin->ds_phys->ds_used_bytes; 816 dsphys->ds_compressed_bytes = 817 origin->ds_phys->ds_compressed_bytes; 818 dsphys->ds_uncompressed_bytes = 819 origin->ds_phys->ds_uncompressed_bytes; 820 dsphys->ds_bp = origin->ds_phys->ds_bp; 821 dsphys->ds_flags |= origin->ds_phys->ds_flags; 822 823 dmu_buf_will_dirty(origin->ds_dbuf, tx); 824 origin->ds_phys->ds_num_children++; 825 826 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 827 origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); 828 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, 829 dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); 830 dsl_dataset_rele(ohds, FTAG); 831 832 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { 833 if (origin->ds_phys->ds_next_clones_obj == 0) { 834 origin->ds_phys->ds_next_clones_obj = 835 zap_create(mos, 836 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 837 } 838 VERIFY(0 == zap_add_int(mos, 839 origin->ds_phys->ds_next_clones_obj, 840 dsobj, tx)); 841 } 842 843 dmu_buf_will_dirty(dd->dd_dbuf, tx); 844 dd->dd_phys->dd_origin_obj = origin->ds_object; 845 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 846 if (origin->ds_dir->dd_phys->dd_clones == 0) { 847 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); 848 origin->ds_dir->dd_phys->dd_clones = 849 zap_create(mos, 850 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 851 } 852 VERIFY3U(0, ==, zap_add_int(mos, 853 origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); 854 } 855 } 856 857 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 858 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 859 860 dmu_buf_rele(dbuf, FTAG); 861 862 dmu_buf_will_dirty(dd->dd_dbuf, tx); 863 dd->dd_phys->dd_head_dataset_obj = dsobj; 864 865 return (dsobj); 866 } 867 868 uint64_t 869 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, 870 dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) 871 { 872 dsl_pool_t *dp = pdd->dd_pool; 873 uint64_t dsobj, ddobj; 874 dsl_dir_t *dd; 875 876 ASSERT(lastname[0] != '@'); 877 878 ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); 879 VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); 880 881 dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); 882 883 dsl_deleg_set_create_perms(dd, tx, cr); 884 885 dsl_dir_close(dd, FTAG); 886 887 /* 888 * If we are creating a clone, make sure we zero out any stale 889 * data from the origin snapshots zil header. 890 */ 891 if (origin != NULL) { 892 dsl_dataset_t *ds; 893 objset_t *os; 894 895 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 896 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); 897 bzero(&os->os_zil_header, sizeof (os->os_zil_header)); 898 dsl_dataset_dirty(ds, tx); 899 dsl_dataset_rele(ds, FTAG); 900 } 901 902 return (dsobj); 903 } 904 905 struct destroyarg { 906 dsl_sync_task_group_t *dstg; 907 char *snapname; 908 char *failed; 909 boolean_t defer; 910 }; 911 912 static int 913 dsl_snapshot_destroy_one(const char *name, void *arg) 914 { 915 struct destroyarg *da = arg; 916 dsl_dataset_t *ds; 917 int err; 918 char *dsname; 919 920 dsname = kmem_asprintf("%s@%s", name, da->snapname); 921 err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); 922 strfree(dsname); 923 if (err == 0) { 924 struct dsl_ds_destroyarg *dsda; 925 926 dsl_dataset_make_exclusive(ds, da->dstg); 927 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); 928 dsda->ds = ds; 929 dsda->defer = da->defer; 930 dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, 931 dsl_dataset_destroy_sync, dsda, da->dstg, 0); 932 } else if (err == ENOENT) { 933 err = 0; 934 } else { 935 (void) strcpy(da->failed, name); 936 } 937 return (err); 938 } 939 940 /* 941 * Destroy 'snapname' in all descendants of 'fsname'. 942 */ 943 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy 944 int 945 dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) 946 { 947 int err; 948 struct destroyarg da; 949 dsl_sync_task_t *dst; 950 spa_t *spa; 951 952 err = spa_open(fsname, &spa, FTAG); 953 if (err) 954 return (err); 955 da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 956 da.snapname = snapname; 957 da.failed = fsname; 958 da.defer = defer; 959 960 err = dmu_objset_find(fsname, 961 dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); 962 963 if (err == 0) 964 err = dsl_sync_task_group_wait(da.dstg); 965 966 for (dst = list_head(&da.dstg->dstg_tasks); dst; 967 dst = list_next(&da.dstg->dstg_tasks, dst)) { 968 struct dsl_ds_destroyarg *dsda = dst->dst_arg1; 969 dsl_dataset_t *ds = dsda->ds; 970 971 /* 972 * Return the file system name that triggered the error 973 */ 974 if (dst->dst_err) { 975 dsl_dataset_name(ds, fsname); 976 *strchr(fsname, '@') = '\0'; 977 } 978 ASSERT3P(dsda->rm_origin, ==, NULL); 979 dsl_dataset_disown(ds, da.dstg); 980 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); 981 } 982 983 dsl_sync_task_group_destroy(da.dstg); 984 spa_close(spa, FTAG); 985 return (err); 986 } 987 988 static boolean_t 989 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) 990 { 991 boolean_t might_destroy = B_FALSE; 992 993 mutex_enter(&ds->ds_lock); 994 if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && 995 DS_IS_DEFER_DESTROY(ds)) 996 might_destroy = B_TRUE; 997 mutex_exit(&ds->ds_lock); 998 999 return (might_destroy); 1000 } 1001 1002 /* 1003 * If we're removing a clone, and these three conditions are true: 1004 * 1) the clone's origin has no other children 1005 * 2) the clone's origin has no user references 1006 * 3) the clone's origin has been marked for deferred destruction 1007 * Then, prepare to remove the origin as part of this sync task group. 1008 */ 1009 static int 1010 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) 1011 { 1012 dsl_dataset_t *ds = dsda->ds; 1013 dsl_dataset_t *origin = ds->ds_prev; 1014 1015 if (dsl_dataset_might_destroy_origin(origin)) { 1016 char *name; 1017 int namelen; 1018 int error; 1019 1020 namelen = dsl_dataset_namelen(origin) + 1; 1021 name = kmem_alloc(namelen, KM_SLEEP); 1022 dsl_dataset_name(origin, name); 1023 #ifdef _KERNEL 1024 error = zfs_unmount_snap(name, NULL); 1025 if (error) { 1026 kmem_free(name, namelen); 1027 return (error); 1028 } 1029 #endif 1030 error = dsl_dataset_own(name, B_TRUE, tag, &origin); 1031 kmem_free(name, namelen); 1032 if (error) 1033 return (error); 1034 dsda->rm_origin = origin; 1035 dsl_dataset_make_exclusive(origin, tag); 1036 } 1037 1038 return (0); 1039 } 1040 1041 /* 1042 * ds must be opened as OWNER. On return (whether successful or not), 1043 * ds will be closed and caller can no longer dereference it. 1044 */ 1045 int 1046 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) 1047 { 1048 int err; 1049 dsl_sync_task_group_t *dstg; 1050 objset_t *os; 1051 dsl_dir_t *dd; 1052 uint64_t obj; 1053 struct dsl_ds_destroyarg dsda = { 0 }; 1054 dsl_dataset_t dummy_ds = { 0 }; 1055 1056 dsda.ds = ds; 1057 1058 if (dsl_dataset_is_snapshot(ds)) { 1059 /* Destroying a snapshot is simpler */ 1060 dsl_dataset_make_exclusive(ds, tag); 1061 1062 dsda.defer = defer; 1063 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1064 dsl_dataset_destroy_check, dsl_dataset_destroy_sync, 1065 &dsda, tag, 0); 1066 ASSERT3P(dsda.rm_origin, ==, NULL); 1067 goto out; 1068 } else if (defer) { 1069 err = EINVAL; 1070 goto out; 1071 } 1072 1073 dd = ds->ds_dir; 1074 dummy_ds.ds_dir = dd; 1075 dummy_ds.ds_object = ds->ds_object; 1076 1077 /* 1078 * Check for errors and mark this ds as inconsistent, in 1079 * case we crash while freeing the objects. 1080 */ 1081 err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, 1082 dsl_dataset_destroy_begin_sync, ds, NULL, 0); 1083 if (err) 1084 goto out; 1085 1086 err = dmu_objset_from_ds(ds, &os); 1087 if (err) 1088 goto out; 1089 1090 /* 1091 * remove the objects in open context, so that we won't 1092 * have too much to do in syncing context. 1093 */ 1094 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 1095 ds->ds_phys->ds_prev_snap_txg)) { 1096 /* 1097 * Ignore errors, if there is not enough disk space 1098 * we will deal with it in dsl_dataset_destroy_sync(). 1099 */ 1100 (void) dmu_free_object(os, obj); 1101 } 1102 if (err != ESRCH) 1103 goto out; 1104 1105 /* 1106 * Only the ZIL knows how to free log blocks. 1107 */ 1108 zil_destroy(dmu_objset_zil(os), B_FALSE); 1109 1110 /* 1111 * Sync out all in-flight IO. 1112 */ 1113 txg_wait_synced(dd->dd_pool, 0); 1114 1115 /* 1116 * If we managed to free all the objects in open 1117 * context, the user space accounting should be zero. 1118 */ 1119 if (ds->ds_phys->ds_bp.blk_fill == 0 && 1120 dmu_objset_userused_enabled(os)) { 1121 uint64_t count; 1122 1123 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || 1124 count == 0); 1125 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || 1126 count == 0); 1127 } 1128 1129 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); 1130 err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); 1131 rw_exit(&dd->dd_pool->dp_config_rwlock); 1132 1133 if (err) 1134 goto out; 1135 1136 /* 1137 * Blow away the dsl_dir + head dataset. 1138 */ 1139 dsl_dataset_make_exclusive(ds, tag); 1140 /* 1141 * If we're removing a clone, we might also need to remove its 1142 * origin. 1143 */ 1144 do { 1145 dsda.need_prep = B_FALSE; 1146 if (dsl_dir_is_clone(dd)) { 1147 err = dsl_dataset_origin_rm_prep(&dsda, tag); 1148 if (err) { 1149 dsl_dir_close(dd, FTAG); 1150 goto out; 1151 } 1152 } 1153 1154 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); 1155 dsl_sync_task_create(dstg, dsl_dataset_destroy_check, 1156 dsl_dataset_destroy_sync, &dsda, tag, 0); 1157 dsl_sync_task_create(dstg, dsl_dir_destroy_check, 1158 dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); 1159 err = dsl_sync_task_group_wait(dstg); 1160 dsl_sync_task_group_destroy(dstg); 1161 1162 /* 1163 * We could be racing against 'zfs release' or 'zfs destroy -d' 1164 * on the origin snap, in which case we can get EBUSY if we 1165 * needed to destroy the origin snap but were not ready to 1166 * do so. 1167 */ 1168 if (dsda.need_prep) { 1169 ASSERT(err == EBUSY); 1170 ASSERT(dsl_dir_is_clone(dd)); 1171 ASSERT(dsda.rm_origin == NULL); 1172 } 1173 } while (dsda.need_prep); 1174 1175 if (dsda.rm_origin != NULL) 1176 dsl_dataset_disown(dsda.rm_origin, tag); 1177 1178 /* if it is successful, dsl_dir_destroy_sync will close the dd */ 1179 if (err) 1180 dsl_dir_close(dd, FTAG); 1181 out: 1182 dsl_dataset_disown(ds, tag); 1183 return (err); 1184 } 1185 1186 blkptr_t * 1187 dsl_dataset_get_blkptr(dsl_dataset_t *ds) 1188 { 1189 return (&ds->ds_phys->ds_bp); 1190 } 1191 1192 void 1193 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) 1194 { 1195 ASSERT(dmu_tx_is_syncing(tx)); 1196 /* If it's the meta-objset, set dp_meta_rootbp */ 1197 if (ds == NULL) { 1198 tx->tx_pool->dp_meta_rootbp = *bp; 1199 } else { 1200 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1201 ds->ds_phys->ds_bp = *bp; 1202 } 1203 } 1204 1205 spa_t * 1206 dsl_dataset_get_spa(dsl_dataset_t *ds) 1207 { 1208 return (ds->ds_dir->dd_pool->dp_spa); 1209 } 1210 1211 void 1212 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) 1213 { 1214 dsl_pool_t *dp; 1215 1216 if (ds == NULL) /* this is the meta-objset */ 1217 return; 1218 1219 ASSERT(ds->ds_objset != NULL); 1220 1221 if (ds->ds_phys->ds_next_snap_obj != 0) 1222 panic("dirtying snapshot!"); 1223 1224 dp = ds->ds_dir->dd_pool; 1225 1226 if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { 1227 /* up the hold count until we can be written out */ 1228 dmu_buf_add_ref(ds->ds_dbuf, ds); 1229 } 1230 } 1231 1232 /* 1233 * The unique space in the head dataset can be calculated by subtracting 1234 * the space used in the most recent snapshot, that is still being used 1235 * in this file system, from the space currently in use. To figure out 1236 * the space in the most recent snapshot still in use, we need to take 1237 * the total space used in the snapshot and subtract out the space that 1238 * has been freed up since the snapshot was taken. 1239 */ 1240 static void 1241 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) 1242 { 1243 uint64_t mrs_used; 1244 uint64_t dlused, dlcomp, dluncomp; 1245 1246 ASSERT(!dsl_dataset_is_snapshot(ds)); 1247 1248 if (ds->ds_phys->ds_prev_snap_obj != 0) 1249 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; 1250 else 1251 mrs_used = 0; 1252 1253 dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); 1254 1255 ASSERT3U(dlused, <=, mrs_used); 1256 ds->ds_phys->ds_unique_bytes = 1257 ds->ds_phys->ds_used_bytes - (mrs_used - dlused); 1258 1259 if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= 1260 SPA_VERSION_UNIQUE_ACCURATE) 1261 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 1262 } 1263 1264 struct killarg { 1265 dsl_dataset_t *ds; 1266 dmu_tx_t *tx; 1267 }; 1268 1269 /* ARGSUSED */ 1270 static int 1271 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, 1272 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1273 { 1274 struct killarg *ka = arg; 1275 dmu_tx_t *tx = ka->tx; 1276 1277 if (bp == NULL) 1278 return (0); 1279 1280 if (zb->zb_level == ZB_ZIL_LEVEL) { 1281 ASSERT(zilog != NULL); 1282 /* 1283 * It's a block in the intent log. It has no 1284 * accounting, so just free it. 1285 */ 1286 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); 1287 } else { 1288 ASSERT(zilog == NULL); 1289 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); 1290 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); 1291 } 1292 1293 return (0); 1294 } 1295 1296 /* ARGSUSED */ 1297 static int 1298 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) 1299 { 1300 dsl_dataset_t *ds = arg1; 1301 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1302 uint64_t count; 1303 int err; 1304 1305 /* 1306 * Can't delete a head dataset if there are snapshots of it. 1307 * (Except if the only snapshots are from the branch we cloned 1308 * from.) 1309 */ 1310 if (ds->ds_prev != NULL && 1311 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1312 return (EBUSY); 1313 1314 /* 1315 * This is really a dsl_dir thing, but check it here so that 1316 * we'll be less likely to leave this dataset inconsistent & 1317 * nearly destroyed. 1318 */ 1319 err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); 1320 if (err) 1321 return (err); 1322 if (count != 0) 1323 return (EEXIST); 1324 1325 return (0); 1326 } 1327 1328 /* ARGSUSED */ 1329 static void 1330 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) 1331 { 1332 dsl_dataset_t *ds = arg1; 1333 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1334 1335 /* Mark it as inconsistent on-disk, in case we crash */ 1336 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1337 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; 1338 1339 spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, 1340 "dataset = %llu", ds->ds_object); 1341 } 1342 1343 static int 1344 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, 1345 dmu_tx_t *tx) 1346 { 1347 dsl_dataset_t *ds = dsda->ds; 1348 dsl_dataset_t *ds_prev = ds->ds_prev; 1349 1350 if (dsl_dataset_might_destroy_origin(ds_prev)) { 1351 struct dsl_ds_destroyarg ndsda = {0}; 1352 1353 /* 1354 * If we're not prepared to remove the origin, don't remove 1355 * the clone either. 1356 */ 1357 if (dsda->rm_origin == NULL) { 1358 dsda->need_prep = B_TRUE; 1359 return (EBUSY); 1360 } 1361 1362 ndsda.ds = ds_prev; 1363 ndsda.is_origin_rm = B_TRUE; 1364 return (dsl_dataset_destroy_check(&ndsda, tag, tx)); 1365 } 1366 1367 /* 1368 * If we're not going to remove the origin after all, 1369 * undo the open context setup. 1370 */ 1371 if (dsda->rm_origin != NULL) { 1372 dsl_dataset_disown(dsda->rm_origin, tag); 1373 dsda->rm_origin = NULL; 1374 } 1375 1376 return (0); 1377 } 1378 1379 /* 1380 * If you add new checks here, you may need to add 1381 * additional checks to the "temporary" case in 1382 * snapshot_check() in dmu_objset.c. 1383 */ 1384 /* ARGSUSED */ 1385 int 1386 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) 1387 { 1388 struct dsl_ds_destroyarg *dsda = arg1; 1389 dsl_dataset_t *ds = dsda->ds; 1390 1391 /* we have an owner hold, so noone else can destroy us */ 1392 ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); 1393 1394 /* 1395 * Only allow deferred destroy on pools that support it. 1396 * NOTE: deferred destroy is only supported on snapshots. 1397 */ 1398 if (dsda->defer) { 1399 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 1400 SPA_VERSION_USERREFS) 1401 return (ENOTSUP); 1402 ASSERT(dsl_dataset_is_snapshot(ds)); 1403 return (0); 1404 } 1405 1406 /* 1407 * Can't delete a head dataset if there are snapshots of it. 1408 * (Except if the only snapshots are from the branch we cloned 1409 * from.) 1410 */ 1411 if (ds->ds_prev != NULL && 1412 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1413 return (EBUSY); 1414 1415 /* 1416 * If we made changes this txg, traverse_dsl_dataset won't find 1417 * them. Try again. 1418 */ 1419 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) 1420 return (EAGAIN); 1421 1422 if (dsl_dataset_is_snapshot(ds)) { 1423 /* 1424 * If this snapshot has an elevated user reference count, 1425 * we can't destroy it yet. 1426 */ 1427 if (ds->ds_userrefs > 0 && !dsda->releasing) 1428 return (EBUSY); 1429 1430 mutex_enter(&ds->ds_lock); 1431 /* 1432 * Can't delete a branch point. However, if we're destroying 1433 * a clone and removing its origin due to it having a user 1434 * hold count of 0 and having been marked for deferred destroy, 1435 * it's OK for the origin to have a single clone. 1436 */ 1437 if (ds->ds_phys->ds_num_children > 1438 (dsda->is_origin_rm ? 2 : 1)) { 1439 mutex_exit(&ds->ds_lock); 1440 return (EEXIST); 1441 } 1442 mutex_exit(&ds->ds_lock); 1443 } else if (dsl_dir_is_clone(ds->ds_dir)) { 1444 return (dsl_dataset_origin_check(dsda, arg2, tx)); 1445 } 1446 1447 /* XXX we should do some i/o error checking... */ 1448 return (0); 1449 } 1450 1451 struct refsarg { 1452 kmutex_t lock; 1453 boolean_t gone; 1454 kcondvar_t cv; 1455 }; 1456 1457 /* ARGSUSED */ 1458 static void 1459 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) 1460 { 1461 struct refsarg *arg = argv; 1462 1463 mutex_enter(&arg->lock); 1464 arg->gone = TRUE; 1465 cv_signal(&arg->cv); 1466 mutex_exit(&arg->lock); 1467 } 1468 1469 static void 1470 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) 1471 { 1472 struct refsarg arg; 1473 1474 mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); 1475 cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); 1476 arg.gone = FALSE; 1477 (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, 1478 dsl_dataset_refs_gone); 1479 dmu_buf_rele(ds->ds_dbuf, tag); 1480 mutex_enter(&arg.lock); 1481 while (!arg.gone) 1482 cv_wait(&arg.cv, &arg.lock); 1483 ASSERT(arg.gone); 1484 mutex_exit(&arg.lock); 1485 ds->ds_dbuf = NULL; 1486 ds->ds_phys = NULL; 1487 mutex_destroy(&arg.lock); 1488 cv_destroy(&arg.cv); 1489 } 1490 1491 static void 1492 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) 1493 { 1494 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1495 uint64_t count; 1496 int err; 1497 1498 ASSERT(ds->ds_phys->ds_num_children >= 2); 1499 err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); 1500 /* 1501 * The err should not be ENOENT, but a bug in a previous version 1502 * of the code could cause upgrade_clones_cb() to not set 1503 * ds_next_snap_obj when it should, leading to a missing entry. 1504 * If we knew that the pool was created after 1505 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't 1506 * ENOENT. However, at least we can check that we don't have 1507 * too many entries in the next_clones_obj even after failing to 1508 * remove this one. 1509 */ 1510 if (err != ENOENT) { 1511 VERIFY3U(err, ==, 0); 1512 } 1513 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, 1514 &count)); 1515 ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); 1516 } 1517 1518 static void 1519 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) 1520 { 1521 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1522 zap_cursor_t zc; 1523 zap_attribute_t za; 1524 1525 /* 1526 * If it is the old version, dd_clones doesn't exist so we can't 1527 * find the clones, but deadlist_remove_key() is a no-op so it 1528 * doesn't matter. 1529 */ 1530 if (ds->ds_dir->dd_phys->dd_clones == 0) 1531 return; 1532 1533 for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); 1534 zap_cursor_retrieve(&zc, &za) == 0; 1535 zap_cursor_advance(&zc)) { 1536 dsl_dataset_t *clone; 1537 1538 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 1539 za.za_first_integer, FTAG, &clone)); 1540 if (clone->ds_dir->dd_origin_txg > mintxg) { 1541 dsl_deadlist_remove_key(&clone->ds_deadlist, 1542 mintxg, tx); 1543 dsl_dataset_remove_clones_key(clone, mintxg, tx); 1544 } 1545 dsl_dataset_rele(clone, FTAG); 1546 } 1547 zap_cursor_fini(&zc); 1548 } 1549 1550 struct process_old_arg { 1551 dsl_dataset_t *ds; 1552 dsl_dataset_t *ds_prev; 1553 boolean_t after_branch_point; 1554 zio_t *pio; 1555 uint64_t used, comp, uncomp; 1556 }; 1557 1558 static int 1559 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1560 { 1561 struct process_old_arg *poa = arg; 1562 dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; 1563 1564 if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { 1565 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); 1566 if (poa->ds_prev && !poa->after_branch_point && 1567 bp->blk_birth > 1568 poa->ds_prev->ds_phys->ds_prev_snap_txg) { 1569 poa->ds_prev->ds_phys->ds_unique_bytes += 1570 bp_get_dsize_sync(dp->dp_spa, bp); 1571 } 1572 } else { 1573 poa->used += bp_get_dsize_sync(dp->dp_spa, bp); 1574 poa->comp += BP_GET_PSIZE(bp); 1575 poa->uncomp += BP_GET_UCSIZE(bp); 1576 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); 1577 } 1578 return (0); 1579 } 1580 1581 static void 1582 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, 1583 dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) 1584 { 1585 struct process_old_arg poa = { 0 }; 1586 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1587 objset_t *mos = dp->dp_meta_objset; 1588 1589 ASSERT(ds->ds_deadlist.dl_oldfmt); 1590 ASSERT(ds_next->ds_deadlist.dl_oldfmt); 1591 1592 poa.ds = ds; 1593 poa.ds_prev = ds_prev; 1594 poa.after_branch_point = after_branch_point; 1595 poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 1596 VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, 1597 process_old_cb, &poa, tx)); 1598 VERIFY3U(zio_wait(poa.pio), ==, 0); 1599 ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); 1600 1601 /* change snapused */ 1602 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, 1603 -poa.used, -poa.comp, -poa.uncomp, tx); 1604 1605 /* swap next's deadlist to our deadlist */ 1606 dsl_deadlist_close(&ds->ds_deadlist); 1607 dsl_deadlist_close(&ds_next->ds_deadlist); 1608 SWITCH64(ds_next->ds_phys->ds_deadlist_obj, 1609 ds->ds_phys->ds_deadlist_obj); 1610 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 1611 dsl_deadlist_open(&ds_next->ds_deadlist, mos, 1612 ds_next->ds_phys->ds_deadlist_obj); 1613 } 1614 1615 void 1616 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) 1617 { 1618 struct dsl_ds_destroyarg *dsda = arg1; 1619 dsl_dataset_t *ds = dsda->ds; 1620 int err; 1621 int after_branch_point = FALSE; 1622 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1623 objset_t *mos = dp->dp_meta_objset; 1624 dsl_dataset_t *ds_prev = NULL; 1625 boolean_t wont_destroy; 1626 uint64_t obj; 1627 1628 wont_destroy = (dsda->defer && 1629 (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); 1630 1631 ASSERT(ds->ds_owner || wont_destroy); 1632 ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); 1633 ASSERT(ds->ds_prev == NULL || 1634 ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); 1635 ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); 1636 1637 if (wont_destroy) { 1638 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 1639 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1640 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; 1641 return; 1642 } 1643 1644 /* signal any waiters that this dataset is going away */ 1645 mutex_enter(&ds->ds_lock); 1646 ds->ds_owner = dsl_reaper; 1647 cv_broadcast(&ds->ds_exclusive_cv); 1648 mutex_exit(&ds->ds_lock); 1649 1650 /* Remove our reservation */ 1651 if (ds->ds_reserved != 0) { 1652 dsl_prop_setarg_t psa; 1653 uint64_t value = 0; 1654 1655 dsl_prop_setarg_init_uint64(&psa, "refreservation", 1656 (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), 1657 &value); 1658 psa.psa_effective_value = 0; /* predict default value */ 1659 1660 dsl_dataset_set_reservation_sync(ds, &psa, tx); 1661 ASSERT3U(ds->ds_reserved, ==, 0); 1662 } 1663 1664 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 1665 1666 dsl_scan_ds_destroyed(ds, tx); 1667 1668 obj = ds->ds_object; 1669 1670 if (ds->ds_phys->ds_prev_snap_obj != 0) { 1671 if (ds->ds_prev) { 1672 ds_prev = ds->ds_prev; 1673 } else { 1674 VERIFY(0 == dsl_dataset_hold_obj(dp, 1675 ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); 1676 } 1677 after_branch_point = 1678 (ds_prev->ds_phys->ds_next_snap_obj != obj); 1679 1680 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); 1681 if (after_branch_point && 1682 ds_prev->ds_phys->ds_next_clones_obj != 0) { 1683 remove_from_next_clones(ds_prev, obj, tx); 1684 if (ds->ds_phys->ds_next_snap_obj != 0) { 1685 VERIFY(0 == zap_add_int(mos, 1686 ds_prev->ds_phys->ds_next_clones_obj, 1687 ds->ds_phys->ds_next_snap_obj, tx)); 1688 } 1689 } 1690 if (after_branch_point && 1691 ds->ds_phys->ds_next_snap_obj == 0) { 1692 /* This clone is toast. */ 1693 ASSERT(ds_prev->ds_phys->ds_num_children > 1); 1694 ds_prev->ds_phys->ds_num_children--; 1695 1696 /* 1697 * If the clone's origin has no other clones, no 1698 * user holds, and has been marked for deferred 1699 * deletion, then we should have done the necessary 1700 * destroy setup for it. 1701 */ 1702 if (ds_prev->ds_phys->ds_num_children == 1 && 1703 ds_prev->ds_userrefs == 0 && 1704 DS_IS_DEFER_DESTROY(ds_prev)) { 1705 ASSERT3P(dsda->rm_origin, !=, NULL); 1706 } else { 1707 ASSERT3P(dsda->rm_origin, ==, NULL); 1708 } 1709 } else if (!after_branch_point) { 1710 ds_prev->ds_phys->ds_next_snap_obj = 1711 ds->ds_phys->ds_next_snap_obj; 1712 } 1713 } 1714 1715 if (dsl_dataset_is_snapshot(ds)) { 1716 dsl_dataset_t *ds_next; 1717 uint64_t old_unique; 1718 uint64_t used = 0, comp = 0, uncomp = 0; 1719 1720 VERIFY(0 == dsl_dataset_hold_obj(dp, 1721 ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); 1722 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); 1723 1724 old_unique = ds_next->ds_phys->ds_unique_bytes; 1725 1726 dmu_buf_will_dirty(ds_next->ds_dbuf, tx); 1727 ds_next->ds_phys->ds_prev_snap_obj = 1728 ds->ds_phys->ds_prev_snap_obj; 1729 ds_next->ds_phys->ds_prev_snap_txg = 1730 ds->ds_phys->ds_prev_snap_txg; 1731 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 1732 ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); 1733 1734 1735 if (ds_next->ds_deadlist.dl_oldfmt) { 1736 process_old_deadlist(ds, ds_prev, ds_next, 1737 after_branch_point, tx); 1738 } else { 1739 /* Adjust prev's unique space. */ 1740 if (ds_prev && !after_branch_point) { 1741 dsl_deadlist_space_range(&ds_next->ds_deadlist, 1742 ds_prev->ds_phys->ds_prev_snap_txg, 1743 ds->ds_phys->ds_prev_snap_txg, 1744 &used, &comp, &uncomp); 1745 ds_prev->ds_phys->ds_unique_bytes += used; 1746 } 1747 1748 /* Adjust snapused. */ 1749 dsl_deadlist_space_range(&ds_next->ds_deadlist, 1750 ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 1751 &used, &comp, &uncomp); 1752 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, 1753 -used, -comp, -uncomp, tx); 1754 1755 /* Move blocks to be freed to pool's free list. */ 1756 dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, 1757 &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, 1758 tx); 1759 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, 1760 DD_USED_HEAD, used, comp, uncomp, tx); 1761 dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); 1762 1763 /* Merge our deadlist into next's and free it. */ 1764 dsl_deadlist_merge(&ds_next->ds_deadlist, 1765 ds->ds_phys->ds_deadlist_obj, tx); 1766 } 1767 dsl_deadlist_close(&ds->ds_deadlist); 1768 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); 1769 1770 /* Collapse range in clone heads */ 1771 dsl_dataset_remove_clones_key(ds, 1772 ds->ds_phys->ds_creation_txg, tx); 1773 1774 if (dsl_dataset_is_snapshot(ds_next)) { 1775 dsl_dataset_t *ds_nextnext; 1776 1777 /* 1778 * Update next's unique to include blocks which 1779 * were previously shared by only this snapshot 1780 * and it. Those blocks will be born after the 1781 * prev snap and before this snap, and will have 1782 * died after the next snap and before the one 1783 * after that (ie. be on the snap after next's 1784 * deadlist). 1785 */ 1786 VERIFY(0 == dsl_dataset_hold_obj(dp, 1787 ds_next->ds_phys->ds_next_snap_obj, 1788 FTAG, &ds_nextnext)); 1789 dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, 1790 ds->ds_phys->ds_prev_snap_txg, 1791 ds->ds_phys->ds_creation_txg, 1792 &used, &comp, &uncomp); 1793 ds_next->ds_phys->ds_unique_bytes += used; 1794 dsl_dataset_rele(ds_nextnext, FTAG); 1795 ASSERT3P(ds_next->ds_prev, ==, NULL); 1796 1797 /* Collapse range in this head. */ 1798 dsl_dataset_t *hds; 1799 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 1800 ds->ds_dir->dd_phys->dd_head_dataset_obj, 1801 FTAG, &hds)); 1802 dsl_deadlist_remove_key(&hds->ds_deadlist, 1803 ds->ds_phys->ds_creation_txg, tx); 1804 dsl_dataset_rele(hds, FTAG); 1805 1806 } else { 1807 ASSERT3P(ds_next->ds_prev, ==, ds); 1808 dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); 1809 ds_next->ds_prev = NULL; 1810 if (ds_prev) { 1811 VERIFY(0 == dsl_dataset_get_ref(dp, 1812 ds->ds_phys->ds_prev_snap_obj, 1813 ds_next, &ds_next->ds_prev)); 1814 } 1815 1816 dsl_dataset_recalc_head_uniq(ds_next); 1817 1818 /* 1819 * Reduce the amount of our unconsmed refreservation 1820 * being charged to our parent by the amount of 1821 * new unique data we have gained. 1822 */ 1823 if (old_unique < ds_next->ds_reserved) { 1824 int64_t mrsdelta; 1825 uint64_t new_unique = 1826 ds_next->ds_phys->ds_unique_bytes; 1827 1828 ASSERT(old_unique <= new_unique); 1829 mrsdelta = MIN(new_unique - old_unique, 1830 ds_next->ds_reserved - old_unique); 1831 dsl_dir_diduse_space(ds->ds_dir, 1832 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); 1833 } 1834 } 1835 dsl_dataset_rele(ds_next, FTAG); 1836 } else { 1837 /* 1838 * There's no next snapshot, so this is a head dataset. 1839 * Destroy the deadlist. Unless it's a clone, the 1840 * deadlist should be empty. (If it's a clone, it's 1841 * safe to ignore the deadlist contents.) 1842 */ 1843 struct killarg ka; 1844 1845 dsl_deadlist_close(&ds->ds_deadlist); 1846 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); 1847 ds->ds_phys->ds_deadlist_obj = 0; 1848 1849 /* 1850 * Free everything that we point to (that's born after 1851 * the previous snapshot, if we are a clone) 1852 * 1853 * NB: this should be very quick, because we already 1854 * freed all the objects in open context. 1855 */ 1856 ka.ds = ds; 1857 ka.tx = tx; 1858 err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, 1859 TRAVERSE_POST, kill_blkptr, &ka); 1860 ASSERT3U(err, ==, 0); 1861 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || 1862 ds->ds_phys->ds_unique_bytes == 0); 1863 1864 if (ds->ds_prev != NULL) { 1865 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 1866 VERIFY3U(0, ==, zap_remove_int(mos, 1867 ds->ds_prev->ds_dir->dd_phys->dd_clones, 1868 ds->ds_object, tx)); 1869 } 1870 dsl_dataset_rele(ds->ds_prev, ds); 1871 ds->ds_prev = ds_prev = NULL; 1872 } 1873 } 1874 1875 /* 1876 * This must be done after the dsl_traverse(), because it will 1877 * re-open the objset. 1878 */ 1879 if (ds->ds_objset) { 1880 dmu_objset_evict(ds->ds_objset); 1881 ds->ds_objset = NULL; 1882 } 1883 1884 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { 1885 /* Erase the link in the dir */ 1886 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1887 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; 1888 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); 1889 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); 1890 ASSERT(err == 0); 1891 } else { 1892 /* remove from snapshot namespace */ 1893 dsl_dataset_t *ds_head; 1894 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); 1895 VERIFY(0 == dsl_dataset_hold_obj(dp, 1896 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); 1897 VERIFY(0 == dsl_dataset_get_snapname(ds)); 1898 #ifdef ZFS_DEBUG 1899 { 1900 uint64_t val; 1901 1902 err = dsl_dataset_snap_lookup(ds_head, 1903 ds->ds_snapname, &val); 1904 ASSERT3U(err, ==, 0); 1905 ASSERT3U(val, ==, obj); 1906 } 1907 #endif 1908 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); 1909 ASSERT(err == 0); 1910 dsl_dataset_rele(ds_head, FTAG); 1911 } 1912 1913 if (ds_prev && ds->ds_prev != ds_prev) 1914 dsl_dataset_rele(ds_prev, FTAG); 1915 1916 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); 1917 spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, 1918 "dataset = %llu", ds->ds_object); 1919 1920 if (ds->ds_phys->ds_next_clones_obj != 0) { 1921 uint64_t count; 1922 ASSERT(0 == zap_count(mos, 1923 ds->ds_phys->ds_next_clones_obj, &count) && count == 0); 1924 VERIFY(0 == dmu_object_free(mos, 1925 ds->ds_phys->ds_next_clones_obj, tx)); 1926 } 1927 if (ds->ds_phys->ds_props_obj != 0) 1928 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); 1929 if (ds->ds_phys->ds_userrefs_obj != 0) 1930 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); 1931 dsl_dir_close(ds->ds_dir, ds); 1932 ds->ds_dir = NULL; 1933 dsl_dataset_drain_refs(ds, tag); 1934 VERIFY(0 == dmu_object_free(mos, obj, tx)); 1935 1936 if (dsda->rm_origin) { 1937 /* 1938 * Remove the origin of the clone we just destroyed. 1939 */ 1940 struct dsl_ds_destroyarg ndsda = {0}; 1941 1942 ndsda.ds = dsda->rm_origin; 1943 dsl_dataset_destroy_sync(&ndsda, tag, tx); 1944 } 1945 } 1946 1947 static int 1948 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) 1949 { 1950 uint64_t asize; 1951 1952 if (!dmu_tx_is_syncing(tx)) 1953 return (0); 1954 1955 /* 1956 * If there's an fs-only reservation, any blocks that might become 1957 * owned by the snapshot dataset must be accommodated by space 1958 * outside of the reservation. 1959 */ 1960 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); 1961 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 1962 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 1963 return (ENOSPC); 1964 1965 /* 1966 * Propogate any reserved space for this snapshot to other 1967 * snapshot checks in this sync group. 1968 */ 1969 if (asize > 0) 1970 dsl_dir_willuse_space(ds->ds_dir, asize, tx); 1971 1972 return (0); 1973 } 1974 1975 int 1976 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) 1977 { 1978 dsl_dataset_t *ds = arg1; 1979 const char *snapname = arg2; 1980 int err; 1981 uint64_t value; 1982 1983 /* 1984 * We don't allow multiple snapshots of the same txg. If there 1985 * is already one, try again. 1986 */ 1987 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) 1988 return (EAGAIN); 1989 1990 /* 1991 * Check for conflicting name snapshot name. 1992 */ 1993 err = dsl_dataset_snap_lookup(ds, snapname, &value); 1994 if (err == 0) 1995 return (EEXIST); 1996 if (err != ENOENT) 1997 return (err); 1998 1999 /* 2000 * Check that the dataset's name is not too long. Name consists 2001 * of the dataset's length + 1 for the @-sign + snapshot name's length 2002 */ 2003 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 2004 return (ENAMETOOLONG); 2005 2006 err = dsl_dataset_snapshot_reserve_space(ds, tx); 2007 if (err) 2008 return (err); 2009 2010 ds->ds_trysnap_txg = tx->tx_txg; 2011 return (0); 2012 } 2013 2014 void 2015 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2016 { 2017 dsl_dataset_t *ds = arg1; 2018 const char *snapname = arg2; 2019 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2020 dmu_buf_t *dbuf; 2021 dsl_dataset_phys_t *dsphys; 2022 uint64_t dsobj, crtxg; 2023 objset_t *mos = dp->dp_meta_objset; 2024 int err; 2025 2026 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 2027 2028 /* 2029 * The origin's ds_creation_txg has to be < TXG_INITIAL 2030 */ 2031 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 2032 crtxg = 1; 2033 else 2034 crtxg = tx->tx_txg; 2035 2036 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 2037 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 2038 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 2039 dmu_buf_will_dirty(dbuf, tx); 2040 dsphys = dbuf->db_data; 2041 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 2042 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 2043 dsphys->ds_fsid_guid = unique_create(); 2044 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 2045 sizeof (dsphys->ds_guid)); 2046 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 2047 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 2048 dsphys->ds_next_snap_obj = ds->ds_object; 2049 dsphys->ds_num_children = 1; 2050 dsphys->ds_creation_time = gethrestime_sec(); 2051 dsphys->ds_creation_txg = crtxg; 2052 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 2053 dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; 2054 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 2055 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 2056 dsphys->ds_flags = ds->ds_phys->ds_flags; 2057 dsphys->ds_bp = ds->ds_phys->ds_bp; 2058 dmu_buf_rele(dbuf, FTAG); 2059 2060 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 2061 if (ds->ds_prev) { 2062 uint64_t next_clones_obj = 2063 ds->ds_prev->ds_phys->ds_next_clones_obj; 2064 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == 2065 ds->ds_object || 2066 ds->ds_prev->ds_phys->ds_num_children > 1); 2067 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 2068 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2069 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 2070 ds->ds_prev->ds_phys->ds_creation_txg); 2071 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; 2072 } else if (next_clones_obj != 0) { 2073 remove_from_next_clones(ds->ds_prev, 2074 dsphys->ds_next_snap_obj, tx); 2075 VERIFY3U(0, ==, zap_add_int(mos, 2076 next_clones_obj, dsobj, tx)); 2077 } 2078 } 2079 2080 /* 2081 * If we have a reference-reservation on this dataset, we will 2082 * need to increase the amount of refreservation being charged 2083 * since our unique space is going to zero. 2084 */ 2085 if (ds->ds_reserved) { 2086 int64_t delta; 2087 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 2088 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2089 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, 2090 delta, 0, 0, tx); 2091 } 2092 2093 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2094 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", 2095 ds->ds_dir->dd_myname, snapname, dsobj, 2096 ds->ds_phys->ds_prev_snap_txg); 2097 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, 2098 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); 2099 dsl_deadlist_close(&ds->ds_deadlist); 2100 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 2101 dsl_deadlist_add_key(&ds->ds_deadlist, 2102 ds->ds_phys->ds_prev_snap_txg, tx); 2103 2104 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); 2105 ds->ds_phys->ds_prev_snap_obj = dsobj; 2106 ds->ds_phys->ds_prev_snap_txg = crtxg; 2107 ds->ds_phys->ds_unique_bytes = 0; 2108 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 2109 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 2110 2111 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, 2112 snapname, 8, 1, &dsobj, tx); 2113 ASSERT(err == 0); 2114 2115 if (ds->ds_prev) 2116 dsl_dataset_drop_ref(ds->ds_prev, ds); 2117 VERIFY(0 == dsl_dataset_get_ref(dp, 2118 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 2119 2120 dsl_scan_ds_snapshotted(ds, tx); 2121 2122 dsl_dir_snap_cmtime_update(ds->ds_dir); 2123 2124 spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, 2125 "dataset = %llu", dsobj); 2126 } 2127 2128 void 2129 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) 2130 { 2131 ASSERT(dmu_tx_is_syncing(tx)); 2132 ASSERT(ds->ds_objset != NULL); 2133 ASSERT(ds->ds_phys->ds_next_snap_obj == 0); 2134 2135 /* 2136 * in case we had to change ds_fsid_guid when we opened it, 2137 * sync it out now. 2138 */ 2139 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2140 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; 2141 2142 dsl_dir_dirty(ds->ds_dir, tx); 2143 dmu_objset_sync(ds->ds_objset, zio, tx); 2144 } 2145 2146 void 2147 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2148 { 2149 uint64_t refd, avail, uobjs, aobjs, ratio; 2150 2151 dsl_dir_stats(ds->ds_dir, nv); 2152 2153 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2154 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2155 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2156 2157 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2158 ds->ds_phys->ds_creation_time); 2159 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2160 ds->ds_phys->ds_creation_txg); 2161 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2162 ds->ds_quota); 2163 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2164 ds->ds_reserved); 2165 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2166 ds->ds_phys->ds_guid); 2167 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, 2168 ds->ds_phys->ds_unique_bytes); 2169 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, 2170 ds->ds_object); 2171 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, 2172 ds->ds_userrefs); 2173 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2174 DS_IS_DEFER_DESTROY(ds) ? 1 : 0); 2175 2176 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : 2177 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2178 ds->ds_phys->ds_compressed_bytes); 2179 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); 2180 2181 if (ds->ds_phys->ds_next_snap_obj) { 2182 /* 2183 * This is a snapshot; override the dd's space used with 2184 * our unique space and compression ratio. 2185 */ 2186 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2187 ds->ds_phys->ds_unique_bytes); 2188 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); 2189 } 2190 } 2191 2192 void 2193 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2194 { 2195 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2196 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2197 stat->dds_guid = ds->ds_phys->ds_guid; 2198 if (ds->ds_phys->ds_next_snap_obj) { 2199 stat->dds_is_snapshot = B_TRUE; 2200 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2201 } else { 2202 stat->dds_is_snapshot = B_FALSE; 2203 stat->dds_num_clones = 0; 2204 } 2205 2206 /* clone origin is really a dsl_dir thing... */ 2207 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2208 if (dsl_dir_is_clone(ds->ds_dir)) { 2209 dsl_dataset_t *ods; 2210 2211 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2212 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2213 dsl_dataset_name(ods, stat->dds_origin); 2214 dsl_dataset_drop_ref(ods, FTAG); 2215 } else { 2216 stat->dds_origin[0] = '\0'; 2217 } 2218 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2219 } 2220 2221 uint64_t 2222 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2223 { 2224 return (ds->ds_fsid_guid); 2225 } 2226 2227 void 2228 dsl_dataset_space(dsl_dataset_t *ds, 2229 uint64_t *refdbytesp, uint64_t *availbytesp, 2230 uint64_t *usedobjsp, uint64_t *availobjsp) 2231 { 2232 *refdbytesp = ds->ds_phys->ds_used_bytes; 2233 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2234 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2235 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2236 if (ds->ds_quota != 0) { 2237 /* 2238 * Adjust available bytes according to refquota 2239 */ 2240 if (*refdbytesp < ds->ds_quota) 2241 *availbytesp = MIN(*availbytesp, 2242 ds->ds_quota - *refdbytesp); 2243 else 2244 *availbytesp = 0; 2245 } 2246 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2247 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2248 } 2249 2250 boolean_t 2251 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2252 { 2253 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2254 2255 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2256 dsl_pool_sync_context(dp)); 2257 if (ds->ds_prev == NULL) 2258 return (B_FALSE); 2259 if (ds->ds_phys->ds_bp.blk_birth > 2260 ds->ds_prev->ds_phys->ds_creation_txg) { 2261 objset_t *os, *os_prev; 2262 /* 2263 * It may be that only the ZIL differs, because it was 2264 * reset in the head. Don't count that as being 2265 * modified. 2266 */ 2267 if (dmu_objset_from_ds(ds, &os) != 0) 2268 return (B_TRUE); 2269 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) 2270 return (B_TRUE); 2271 return (bcmp(&os->os_phys->os_meta_dnode, 2272 &os_prev->os_phys->os_meta_dnode, 2273 sizeof (os->os_phys->os_meta_dnode)) != 0); 2274 } 2275 return (B_FALSE); 2276 } 2277 2278 /* ARGSUSED */ 2279 static int 2280 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2281 { 2282 dsl_dataset_t *ds = arg1; 2283 char *newsnapname = arg2; 2284 dsl_dir_t *dd = ds->ds_dir; 2285 dsl_dataset_t *hds; 2286 uint64_t val; 2287 int err; 2288 2289 err = dsl_dataset_hold_obj(dd->dd_pool, 2290 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2291 if (err) 2292 return (err); 2293 2294 /* new name better not be in use */ 2295 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2296 dsl_dataset_rele(hds, FTAG); 2297 2298 if (err == 0) 2299 err = EEXIST; 2300 else if (err == ENOENT) 2301 err = 0; 2302 2303 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2304 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2305 err = ENAMETOOLONG; 2306 2307 return (err); 2308 } 2309 2310 static void 2311 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2312 { 2313 dsl_dataset_t *ds = arg1; 2314 const char *newsnapname = arg2; 2315 dsl_dir_t *dd = ds->ds_dir; 2316 objset_t *mos = dd->dd_pool->dp_meta_objset; 2317 dsl_dataset_t *hds; 2318 int err; 2319 2320 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2321 2322 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2323 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2324 2325 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2326 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2327 ASSERT3U(err, ==, 0); 2328 mutex_enter(&ds->ds_lock); 2329 (void) strcpy(ds->ds_snapname, newsnapname); 2330 mutex_exit(&ds->ds_lock); 2331 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2332 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2333 ASSERT3U(err, ==, 0); 2334 2335 spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, 2336 "dataset = %llu", ds->ds_object); 2337 dsl_dataset_rele(hds, FTAG); 2338 } 2339 2340 struct renamesnaparg { 2341 dsl_sync_task_group_t *dstg; 2342 char failed[MAXPATHLEN]; 2343 char *oldsnap; 2344 char *newsnap; 2345 }; 2346 2347 static int 2348 dsl_snapshot_rename_one(const char *name, void *arg) 2349 { 2350 struct renamesnaparg *ra = arg; 2351 dsl_dataset_t *ds = NULL; 2352 char *snapname; 2353 int err; 2354 2355 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); 2356 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); 2357 2358 /* 2359 * For recursive snapshot renames the parent won't be changing 2360 * so we just pass name for both the to/from argument. 2361 */ 2362 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); 2363 if (err != 0) { 2364 strfree(snapname); 2365 return (err == ENOENT ? 0 : err); 2366 } 2367 2368 #ifdef _KERNEL 2369 /* 2370 * For all filesystems undergoing rename, we'll need to unmount it. 2371 */ 2372 (void) zfs_unmount_snap(snapname, NULL); 2373 #endif 2374 err = dsl_dataset_hold(snapname, ra->dstg, &ds); 2375 strfree(snapname); 2376 if (err != 0) 2377 return (err == ENOENT ? 0 : err); 2378 2379 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2380 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2381 2382 return (0); 2383 } 2384 2385 static int 2386 dsl_recursive_rename(char *oldname, const char *newname) 2387 { 2388 int err; 2389 struct renamesnaparg *ra; 2390 dsl_sync_task_t *dst; 2391 spa_t *spa; 2392 char *cp, *fsname = spa_strdup(oldname); 2393 int len = strlen(oldname) + 1; 2394 2395 /* truncate the snapshot name to get the fsname */ 2396 cp = strchr(fsname, '@'); 2397 *cp = '\0'; 2398 2399 err = spa_open(fsname, &spa, FTAG); 2400 if (err) { 2401 kmem_free(fsname, len); 2402 return (err); 2403 } 2404 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2405 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2406 2407 ra->oldsnap = strchr(oldname, '@') + 1; 2408 ra->newsnap = strchr(newname, '@') + 1; 2409 *ra->failed = '\0'; 2410 2411 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2412 DS_FIND_CHILDREN); 2413 kmem_free(fsname, len); 2414 2415 if (err == 0) { 2416 err = dsl_sync_task_group_wait(ra->dstg); 2417 } 2418 2419 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2420 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2421 dsl_dataset_t *ds = dst->dst_arg1; 2422 if (dst->dst_err) { 2423 dsl_dir_name(ds->ds_dir, ra->failed); 2424 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2425 (void) strlcat(ra->failed, ra->newsnap, 2426 sizeof (ra->failed)); 2427 } 2428 dsl_dataset_rele(ds, ra->dstg); 2429 } 2430 2431 if (err) 2432 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2433 2434 dsl_sync_task_group_destroy(ra->dstg); 2435 kmem_free(ra, sizeof (struct renamesnaparg)); 2436 spa_close(spa, FTAG); 2437 return (err); 2438 } 2439 2440 static int 2441 dsl_valid_rename(const char *oldname, void *arg) 2442 { 2443 int delta = *(int *)arg; 2444 2445 if (strlen(oldname) + delta >= MAXNAMELEN) 2446 return (ENAMETOOLONG); 2447 2448 return (0); 2449 } 2450 2451 #pragma weak dmu_objset_rename = dsl_dataset_rename 2452 int 2453 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2454 { 2455 dsl_dir_t *dd; 2456 dsl_dataset_t *ds; 2457 const char *tail; 2458 int err; 2459 2460 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2461 if (err) 2462 return (err); 2463 2464 if (tail == NULL) { 2465 int delta = strlen(newname) - strlen(oldname); 2466 2467 /* if we're growing, validate child name lengths */ 2468 if (delta > 0) 2469 err = dmu_objset_find(oldname, dsl_valid_rename, 2470 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2471 2472 if (err == 0) 2473 err = dsl_dir_rename(dd, newname); 2474 dsl_dir_close(dd, FTAG); 2475 return (err); 2476 } 2477 2478 if (tail[0] != '@') { 2479 /* the name ended in a nonexistent component */ 2480 dsl_dir_close(dd, FTAG); 2481 return (ENOENT); 2482 } 2483 2484 dsl_dir_close(dd, FTAG); 2485 2486 /* new name must be snapshot in same filesystem */ 2487 tail = strchr(newname, '@'); 2488 if (tail == NULL) 2489 return (EINVAL); 2490 tail++; 2491 if (strncmp(oldname, newname, tail - newname) != 0) 2492 return (EXDEV); 2493 2494 if (recursive) { 2495 err = dsl_recursive_rename(oldname, newname); 2496 } else { 2497 err = dsl_dataset_hold(oldname, FTAG, &ds); 2498 if (err) 2499 return (err); 2500 2501 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2502 dsl_dataset_snapshot_rename_check, 2503 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2504 2505 dsl_dataset_rele(ds, FTAG); 2506 } 2507 2508 return (err); 2509 } 2510 2511 struct promotenode { 2512 list_node_t link; 2513 dsl_dataset_t *ds; 2514 }; 2515 2516 struct promotearg { 2517 list_t shared_snaps, origin_snaps, clone_snaps; 2518 dsl_dataset_t *origin_origin; 2519 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2520 char *err_ds; 2521 }; 2522 2523 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2524 static boolean_t snaplist_unstable(list_t *l); 2525 2526 static int 2527 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2528 { 2529 dsl_dataset_t *hds = arg1; 2530 struct promotearg *pa = arg2; 2531 struct promotenode *snap = list_head(&pa->shared_snaps); 2532 dsl_dataset_t *origin_ds = snap->ds; 2533 int err; 2534 uint64_t unused; 2535 2536 /* Check that it is a real clone */ 2537 if (!dsl_dir_is_clone(hds->ds_dir)) 2538 return (EINVAL); 2539 2540 /* Since this is so expensive, don't do the preliminary check */ 2541 if (!dmu_tx_is_syncing(tx)) 2542 return (0); 2543 2544 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2545 return (EXDEV); 2546 2547 /* compute origin's new unique space */ 2548 snap = list_tail(&pa->clone_snaps); 2549 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2550 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2551 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2552 &pa->unique, &unused, &unused); 2553 2554 /* 2555 * Walk the snapshots that we are moving 2556 * 2557 * Compute space to transfer. Consider the incremental changes 2558 * to used for each snapshot: 2559 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2560 * So each snapshot gave birth to: 2561 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2562 * So a sequence would look like: 2563 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2564 * Which simplifies to: 2565 * uN + kN + kN-1 + ... + k1 + k0 2566 * Note however, if we stop before we reach the ORIGIN we get: 2567 * uN + kN + kN-1 + ... + kM - uM-1 2568 */ 2569 pa->used = origin_ds->ds_phys->ds_used_bytes; 2570 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2571 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2572 for (snap = list_head(&pa->shared_snaps); snap; 2573 snap = list_next(&pa->shared_snaps, snap)) { 2574 uint64_t val, dlused, dlcomp, dluncomp; 2575 dsl_dataset_t *ds = snap->ds; 2576 2577 /* Check that the snapshot name does not conflict */ 2578 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2579 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2580 if (err == 0) { 2581 err = EEXIST; 2582 goto out; 2583 } 2584 if (err != ENOENT) 2585 goto out; 2586 2587 /* The very first snapshot does not have a deadlist */ 2588 if (ds->ds_phys->ds_prev_snap_obj == 0) 2589 continue; 2590 2591 dsl_deadlist_space(&ds->ds_deadlist, 2592 &dlused, &dlcomp, &dluncomp); 2593 pa->used += dlused; 2594 pa->comp += dlcomp; 2595 pa->uncomp += dluncomp; 2596 } 2597 2598 /* 2599 * If we are a clone of a clone then we never reached ORIGIN, 2600 * so we need to subtract out the clone origin's used space. 2601 */ 2602 if (pa->origin_origin) { 2603 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; 2604 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2605 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2606 } 2607 2608 /* Check that there is enough space here */ 2609 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2610 pa->used); 2611 if (err) 2612 return (err); 2613 2614 /* 2615 * Compute the amounts of space that will be used by snapshots 2616 * after the promotion (for both origin and clone). For each, 2617 * it is the amount of space that will be on all of their 2618 * deadlists (that was not born before their new origin). 2619 */ 2620 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2621 uint64_t space; 2622 2623 /* 2624 * Note, typically this will not be a clone of a clone, 2625 * so dd_origin_txg will be < TXG_INITIAL, so 2626 * these snaplist_space() -> dsl_deadlist_space_range() 2627 * calls will be fast because they do not have to 2628 * iterate over all bps. 2629 */ 2630 snap = list_head(&pa->origin_snaps); 2631 err = snaplist_space(&pa->shared_snaps, 2632 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); 2633 if (err) 2634 return (err); 2635 2636 err = snaplist_space(&pa->clone_snaps, 2637 snap->ds->ds_dir->dd_origin_txg, &space); 2638 if (err) 2639 return (err); 2640 pa->cloneusedsnap += space; 2641 } 2642 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2643 err = snaplist_space(&pa->origin_snaps, 2644 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2645 if (err) 2646 return (err); 2647 } 2648 2649 return (0); 2650 out: 2651 pa->err_ds = snap->ds->ds_snapname; 2652 return (err); 2653 } 2654 2655 static void 2656 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2657 { 2658 dsl_dataset_t *hds = arg1; 2659 struct promotearg *pa = arg2; 2660 struct promotenode *snap = list_head(&pa->shared_snaps); 2661 dsl_dataset_t *origin_ds = snap->ds; 2662 dsl_dataset_t *origin_head; 2663 dsl_dir_t *dd = hds->ds_dir; 2664 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2665 dsl_dir_t *odd = NULL; 2666 uint64_t oldnext_obj; 2667 int64_t delta; 2668 2669 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2670 2671 snap = list_head(&pa->origin_snaps); 2672 origin_head = snap->ds; 2673 2674 /* 2675 * We need to explicitly open odd, since origin_ds's dd will be 2676 * changing. 2677 */ 2678 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2679 NULL, FTAG, &odd)); 2680 2681 /* change origin's next snap */ 2682 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2683 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2684 snap = list_tail(&pa->clone_snaps); 2685 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2686 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2687 2688 /* change the origin's next clone */ 2689 if (origin_ds->ds_phys->ds_next_clones_obj) { 2690 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2691 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2692 origin_ds->ds_phys->ds_next_clones_obj, 2693 oldnext_obj, tx)); 2694 } 2695 2696 /* change origin */ 2697 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2698 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2699 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2700 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; 2701 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2702 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2703 origin_head->ds_dir->dd_origin_txg = 2704 origin_ds->ds_phys->ds_creation_txg; 2705 2706 /* change dd_clone entries */ 2707 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2708 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2709 odd->dd_phys->dd_clones, hds->ds_object, tx)); 2710 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2711 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2712 hds->ds_object, tx)); 2713 2714 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2715 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2716 origin_head->ds_object, tx)); 2717 if (dd->dd_phys->dd_clones == 0) { 2718 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, 2719 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 2720 } 2721 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2722 dd->dd_phys->dd_clones, origin_head->ds_object, tx)); 2723 2724 } 2725 2726 /* move snapshots to this dir */ 2727 for (snap = list_head(&pa->shared_snaps); snap; 2728 snap = list_next(&pa->shared_snaps, snap)) { 2729 dsl_dataset_t *ds = snap->ds; 2730 2731 /* unregister props as dsl_dir is changing */ 2732 if (ds->ds_objset) { 2733 dmu_objset_evict(ds->ds_objset); 2734 ds->ds_objset = NULL; 2735 } 2736 /* move snap name entry */ 2737 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2738 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 2739 ds->ds_snapname, tx)); 2740 VERIFY(0 == zap_add(dp->dp_meta_objset, 2741 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 2742 8, 1, &ds->ds_object, tx)); 2743 2744 /* change containing dsl_dir */ 2745 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2746 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 2747 ds->ds_phys->ds_dir_obj = dd->dd_object; 2748 ASSERT3P(ds->ds_dir, ==, odd); 2749 dsl_dir_close(ds->ds_dir, ds); 2750 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 2751 NULL, ds, &ds->ds_dir)); 2752 2753 /* move any clone references */ 2754 if (ds->ds_phys->ds_next_clones_obj && 2755 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2756 zap_cursor_t zc; 2757 zap_attribute_t za; 2758 2759 for (zap_cursor_init(&zc, dp->dp_meta_objset, 2760 ds->ds_phys->ds_next_clones_obj); 2761 zap_cursor_retrieve(&zc, &za) == 0; 2762 zap_cursor_advance(&zc)) { 2763 dsl_dataset_t *cnds; 2764 uint64_t o; 2765 2766 if (za.za_first_integer == oldnext_obj) { 2767 /* 2768 * We've already moved the 2769 * origin's reference. 2770 */ 2771 continue; 2772 } 2773 2774 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 2775 za.za_first_integer, FTAG, &cnds)); 2776 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; 2777 2778 VERIFY3U(zap_remove_int(dp->dp_meta_objset, 2779 odd->dd_phys->dd_clones, o, tx), ==, 0); 2780 VERIFY3U(zap_add_int(dp->dp_meta_objset, 2781 dd->dd_phys->dd_clones, o, tx), ==, 0); 2782 dsl_dataset_rele(cnds, FTAG); 2783 } 2784 zap_cursor_fini(&zc); 2785 } 2786 2787 ASSERT3U(dsl_prop_numcb(ds), ==, 0); 2788 } 2789 2790 /* 2791 * Change space accounting. 2792 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 2793 * both be valid, or both be 0 (resulting in delta == 0). This 2794 * is true for each of {clone,origin} independently. 2795 */ 2796 2797 delta = pa->cloneusedsnap - 2798 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2799 ASSERT3S(delta, >=, 0); 2800 ASSERT3U(pa->used, >=, delta); 2801 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 2802 dsl_dir_diduse_space(dd, DD_USED_HEAD, 2803 pa->used - delta, pa->comp, pa->uncomp, tx); 2804 2805 delta = pa->originusedsnap - 2806 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2807 ASSERT3S(delta, <=, 0); 2808 ASSERT3U(pa->used, >=, -delta); 2809 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 2810 dsl_dir_diduse_space(odd, DD_USED_HEAD, 2811 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 2812 2813 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 2814 2815 /* log history record */ 2816 spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, 2817 "dataset = %llu", hds->ds_object); 2818 2819 dsl_dir_close(odd, FTAG); 2820 } 2821 2822 static char *snaplist_tag = "snaplist"; 2823 /* 2824 * Make a list of dsl_dataset_t's for the snapshots between first_obj 2825 * (exclusive) and last_obj (inclusive). The list will be in reverse 2826 * order (last_obj will be the list_head()). If first_obj == 0, do all 2827 * snapshots back to this dataset's origin. 2828 */ 2829 static int 2830 snaplist_make(dsl_pool_t *dp, boolean_t own, 2831 uint64_t first_obj, uint64_t last_obj, list_t *l) 2832 { 2833 uint64_t obj = last_obj; 2834 2835 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 2836 2837 list_create(l, sizeof (struct promotenode), 2838 offsetof(struct promotenode, link)); 2839 2840 while (obj != first_obj) { 2841 dsl_dataset_t *ds; 2842 struct promotenode *snap; 2843 int err; 2844 2845 if (own) { 2846 err = dsl_dataset_own_obj(dp, obj, 2847 0, snaplist_tag, &ds); 2848 if (err == 0) 2849 dsl_dataset_make_exclusive(ds, snaplist_tag); 2850 } else { 2851 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 2852 } 2853 if (err == ENOENT) { 2854 /* lost race with snapshot destroy */ 2855 struct promotenode *last = list_tail(l); 2856 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 2857 obj = last->ds->ds_phys->ds_prev_snap_obj; 2858 continue; 2859 } else if (err) { 2860 return (err); 2861 } 2862 2863 if (first_obj == 0) 2864 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 2865 2866 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 2867 snap->ds = ds; 2868 list_insert_tail(l, snap); 2869 obj = ds->ds_phys->ds_prev_snap_obj; 2870 } 2871 2872 return (0); 2873 } 2874 2875 static int 2876 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 2877 { 2878 struct promotenode *snap; 2879 2880 *spacep = 0; 2881 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 2882 uint64_t used, comp, uncomp; 2883 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2884 mintxg, UINT64_MAX, &used, &comp, &uncomp); 2885 *spacep += used; 2886 } 2887 return (0); 2888 } 2889 2890 static void 2891 snaplist_destroy(list_t *l, boolean_t own) 2892 { 2893 struct promotenode *snap; 2894 2895 if (!l || !list_link_active(&l->list_head)) 2896 return; 2897 2898 while ((snap = list_tail(l)) != NULL) { 2899 list_remove(l, snap); 2900 if (own) 2901 dsl_dataset_disown(snap->ds, snaplist_tag); 2902 else 2903 dsl_dataset_rele(snap->ds, snaplist_tag); 2904 kmem_free(snap, sizeof (struct promotenode)); 2905 } 2906 list_destroy(l); 2907 } 2908 2909 /* 2910 * Promote a clone. Nomenclature note: 2911 * "clone" or "cds": the original clone which is being promoted 2912 * "origin" or "ods": the snapshot which is originally clone's origin 2913 * "origin head" or "ohds": the dataset which is the head 2914 * (filesystem/volume) for the origin 2915 * "origin origin": the origin of the origin's filesystem (typically 2916 * NULL, indicating that the clone is not a clone of a clone). 2917 */ 2918 int 2919 dsl_dataset_promote(const char *name, char *conflsnap) 2920 { 2921 dsl_dataset_t *ds; 2922 dsl_dir_t *dd; 2923 dsl_pool_t *dp; 2924 dmu_object_info_t doi; 2925 struct promotearg pa = { 0 }; 2926 struct promotenode *snap; 2927 int err; 2928 2929 err = dsl_dataset_hold(name, FTAG, &ds); 2930 if (err) 2931 return (err); 2932 dd = ds->ds_dir; 2933 dp = dd->dd_pool; 2934 2935 err = dmu_object_info(dp->dp_meta_objset, 2936 ds->ds_phys->ds_snapnames_zapobj, &doi); 2937 if (err) { 2938 dsl_dataset_rele(ds, FTAG); 2939 return (err); 2940 } 2941 2942 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 2943 dsl_dataset_rele(ds, FTAG); 2944 return (EINVAL); 2945 } 2946 2947 /* 2948 * We are going to inherit all the snapshots taken before our 2949 * origin (i.e., our new origin will be our parent's origin). 2950 * Take ownership of them so that we can rename them into our 2951 * namespace. 2952 */ 2953 rw_enter(&dp->dp_config_rwlock, RW_READER); 2954 2955 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 2956 &pa.shared_snaps); 2957 if (err != 0) 2958 goto out; 2959 2960 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 2961 if (err != 0) 2962 goto out; 2963 2964 snap = list_head(&pa.shared_snaps); 2965 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 2966 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 2967 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 2968 if (err != 0) 2969 goto out; 2970 2971 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { 2972 err = dsl_dataset_hold_obj(dp, 2973 snap->ds->ds_dir->dd_phys->dd_origin_obj, 2974 FTAG, &pa.origin_origin); 2975 if (err != 0) 2976 goto out; 2977 } 2978 2979 out: 2980 rw_exit(&dp->dp_config_rwlock); 2981 2982 /* 2983 * Add in 128x the snapnames zapobj size, since we will be moving 2984 * a bunch of snapnames to the promoted ds, and dirtying their 2985 * bonus buffers. 2986 */ 2987 if (err == 0) { 2988 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 2989 dsl_dataset_promote_sync, ds, &pa, 2990 2 + 2 * doi.doi_physical_blocks_512); 2991 if (err && pa.err_ds && conflsnap) 2992 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 2993 } 2994 2995 snaplist_destroy(&pa.shared_snaps, B_TRUE); 2996 snaplist_destroy(&pa.clone_snaps, B_FALSE); 2997 snaplist_destroy(&pa.origin_snaps, B_FALSE); 2998 if (pa.origin_origin) 2999 dsl_dataset_rele(pa.origin_origin, FTAG); 3000 dsl_dataset_rele(ds, FTAG); 3001 return (err); 3002 } 3003 3004 struct cloneswaparg { 3005 dsl_dataset_t *cds; /* clone dataset */ 3006 dsl_dataset_t *ohds; /* origin's head dataset */ 3007 boolean_t force; 3008 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 3009 }; 3010 3011 /* ARGSUSED */ 3012 static int 3013 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 3014 { 3015 struct cloneswaparg *csa = arg1; 3016 3017 /* they should both be heads */ 3018 if (dsl_dataset_is_snapshot(csa->cds) || 3019 dsl_dataset_is_snapshot(csa->ohds)) 3020 return (EINVAL); 3021 3022 /* the branch point should be just before them */ 3023 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3024 return (EINVAL); 3025 3026 /* cds should be the clone (unless they are unrelated) */ 3027 if (csa->cds->ds_prev != NULL && 3028 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 3029 csa->ohds->ds_object != 3030 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 3031 return (EINVAL); 3032 3033 /* the clone should be a child of the origin */ 3034 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3035 return (EINVAL); 3036 3037 /* ohds shouldn't be modified unless 'force' */ 3038 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3039 return (ETXTBSY); 3040 3041 /* adjust amount of any unconsumed refreservation */ 3042 csa->unused_refres_delta = 3043 (int64_t)MIN(csa->ohds->ds_reserved, 3044 csa->ohds->ds_phys->ds_unique_bytes) - 3045 (int64_t)MIN(csa->ohds->ds_reserved, 3046 csa->cds->ds_phys->ds_unique_bytes); 3047 3048 if (csa->unused_refres_delta > 0 && 3049 csa->unused_refres_delta > 3050 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3051 return (ENOSPC); 3052 3053 if (csa->ohds->ds_quota != 0 && 3054 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 3055 return (EDQUOT); 3056 3057 return (0); 3058 } 3059 3060 /* ARGSUSED */ 3061 static void 3062 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3063 { 3064 struct cloneswaparg *csa = arg1; 3065 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 3066 3067 ASSERT(csa->cds->ds_reserved == 0); 3068 ASSERT(csa->ohds->ds_quota == 0 || 3069 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 3070 3071 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 3072 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 3073 3074 if (csa->cds->ds_objset != NULL) { 3075 dmu_objset_evict(csa->cds->ds_objset); 3076 csa->cds->ds_objset = NULL; 3077 } 3078 3079 if (csa->ohds->ds_objset != NULL) { 3080 dmu_objset_evict(csa->ohds->ds_objset); 3081 csa->ohds->ds_objset = NULL; 3082 } 3083 3084 /* 3085 * Reset origin's unique bytes, if it exists. 3086 */ 3087 if (csa->cds->ds_prev) { 3088 dsl_dataset_t *origin = csa->cds->ds_prev; 3089 uint64_t comp, uncomp; 3090 3091 dmu_buf_will_dirty(origin->ds_dbuf, tx); 3092 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3093 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 3094 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); 3095 } 3096 3097 /* swap blkptrs */ 3098 { 3099 blkptr_t tmp; 3100 tmp = csa->ohds->ds_phys->ds_bp; 3101 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 3102 csa->cds->ds_phys->ds_bp = tmp; 3103 } 3104 3105 /* set dd_*_bytes */ 3106 { 3107 int64_t dused, dcomp, duncomp; 3108 uint64_t cdl_used, cdl_comp, cdl_uncomp; 3109 uint64_t odl_used, odl_comp, odl_uncomp; 3110 3111 ASSERT3U(csa->cds->ds_dir->dd_phys-> 3112 dd_used_breakdown[DD_USED_SNAP], ==, 0); 3113 3114 dsl_deadlist_space(&csa->cds->ds_deadlist, 3115 &cdl_used, &cdl_comp, &cdl_uncomp); 3116 dsl_deadlist_space(&csa->ohds->ds_deadlist, 3117 &odl_used, &odl_comp, &odl_uncomp); 3118 3119 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - 3120 (csa->ohds->ds_phys->ds_used_bytes + odl_used); 3121 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 3122 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 3123 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 3124 cdl_uncomp - 3125 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 3126 3127 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 3128 dused, dcomp, duncomp, tx); 3129 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 3130 -dused, -dcomp, -duncomp, tx); 3131 3132 /* 3133 * The difference in the space used by snapshots is the 3134 * difference in snapshot space due to the head's 3135 * deadlist (since that's the only thing that's 3136 * changing that affects the snapused). 3137 */ 3138 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3139 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3140 &cdl_used, &cdl_comp, &cdl_uncomp); 3141 dsl_deadlist_space_range(&csa->ohds->ds_deadlist, 3142 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3143 &odl_used, &odl_comp, &odl_uncomp); 3144 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 3145 DD_USED_HEAD, DD_USED_SNAP, tx); 3146 } 3147 3148 /* swap ds_*_bytes */ 3149 SWITCH64(csa->ohds->ds_phys->ds_used_bytes, 3150 csa->cds->ds_phys->ds_used_bytes); 3151 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 3152 csa->cds->ds_phys->ds_compressed_bytes); 3153 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 3154 csa->cds->ds_phys->ds_uncompressed_bytes); 3155 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 3156 csa->cds->ds_phys->ds_unique_bytes); 3157 3158 /* apply any parent delta for change in unconsumed refreservation */ 3159 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 3160 csa->unused_refres_delta, 0, 0, tx); 3161 3162 /* 3163 * Swap deadlists. 3164 */ 3165 dsl_deadlist_close(&csa->cds->ds_deadlist); 3166 dsl_deadlist_close(&csa->ohds->ds_deadlist); 3167 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 3168 csa->cds->ds_phys->ds_deadlist_obj); 3169 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 3170 csa->cds->ds_phys->ds_deadlist_obj); 3171 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 3172 csa->ohds->ds_phys->ds_deadlist_obj); 3173 3174 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); 3175 } 3176 3177 /* 3178 * Swap 'clone' with its origin head datasets. Used at the end of "zfs 3179 * recv" into an existing fs to swizzle the file system to the new 3180 * version, and by "zfs rollback". Can also be used to swap two 3181 * independent head datasets if neither has any snapshots. 3182 */ 3183 int 3184 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 3185 boolean_t force) 3186 { 3187 struct cloneswaparg csa; 3188 int error; 3189 3190 ASSERT(clone->ds_owner); 3191 ASSERT(origin_head->ds_owner); 3192 retry: 3193 /* 3194 * Need exclusive access for the swap. If we're swapping these 3195 * datasets back after an error, we already hold the locks. 3196 */ 3197 if (!RW_WRITE_HELD(&clone->ds_rwlock)) 3198 rw_enter(&clone->ds_rwlock, RW_WRITER); 3199 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && 3200 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 3201 rw_exit(&clone->ds_rwlock); 3202 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 3203 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3204 rw_exit(&origin_head->ds_rwlock); 3205 goto retry; 3206 } 3207 } 3208 csa.cds = clone; 3209 csa.ohds = origin_head; 3210 csa.force = force; 3211 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3212 dsl_dataset_clone_swap_check, 3213 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3214 return (error); 3215 } 3216 3217 /* 3218 * Given a pool name and a dataset object number in that pool, 3219 * return the name of that dataset. 3220 */ 3221 int 3222 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3223 { 3224 spa_t *spa; 3225 dsl_pool_t *dp; 3226 dsl_dataset_t *ds; 3227 int error; 3228 3229 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3230 return (error); 3231 dp = spa_get_dsl(spa); 3232 rw_enter(&dp->dp_config_rwlock, RW_READER); 3233 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3234 dsl_dataset_name(ds, buf); 3235 dsl_dataset_rele(ds, FTAG); 3236 } 3237 rw_exit(&dp->dp_config_rwlock); 3238 spa_close(spa, FTAG); 3239 3240 return (error); 3241 } 3242 3243 int 3244 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3245 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3246 { 3247 int error = 0; 3248 3249 ASSERT3S(asize, >, 0); 3250 3251 /* 3252 * *ref_rsrv is the portion of asize that will come from any 3253 * unconsumed refreservation space. 3254 */ 3255 *ref_rsrv = 0; 3256 3257 mutex_enter(&ds->ds_lock); 3258 /* 3259 * Make a space adjustment for reserved bytes. 3260 */ 3261 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { 3262 ASSERT3U(*used, >=, 3263 ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3264 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3265 *ref_rsrv = 3266 asize - MIN(asize, parent_delta(ds, asize + inflight)); 3267 } 3268 3269 if (!check_quota || ds->ds_quota == 0) { 3270 mutex_exit(&ds->ds_lock); 3271 return (0); 3272 } 3273 /* 3274 * If they are requesting more space, and our current estimate 3275 * is over quota, they get to try again unless the actual 3276 * on-disk is over quota and there are no pending changes (which 3277 * may free up space for us). 3278 */ 3279 if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { 3280 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) 3281 error = ERESTART; 3282 else 3283 error = EDQUOT; 3284 } 3285 mutex_exit(&ds->ds_lock); 3286 3287 return (error); 3288 } 3289 3290 /* ARGSUSED */ 3291 static int 3292 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) 3293 { 3294 dsl_dataset_t *ds = arg1; 3295 dsl_prop_setarg_t *psa = arg2; 3296 int err; 3297 3298 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) 3299 return (ENOTSUP); 3300 3301 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3302 return (err); 3303 3304 if (psa->psa_effective_value == 0) 3305 return (0); 3306 3307 if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || 3308 psa->psa_effective_value < ds->ds_reserved) 3309 return (ENOSPC); 3310 3311 return (0); 3312 } 3313 3314 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); 3315 3316 void 3317 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3318 { 3319 dsl_dataset_t *ds = arg1; 3320 dsl_prop_setarg_t *psa = arg2; 3321 uint64_t effective_value = psa->psa_effective_value; 3322 3323 dsl_prop_set_sync(ds, psa, tx); 3324 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3325 3326 if (ds->ds_quota != effective_value) { 3327 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3328 ds->ds_quota = effective_value; 3329 3330 spa_history_log_internal(LOG_DS_REFQUOTA, 3331 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", 3332 (longlong_t)ds->ds_quota, ds->ds_object); 3333 } 3334 } 3335 3336 int 3337 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) 3338 { 3339 dsl_dataset_t *ds; 3340 dsl_prop_setarg_t psa; 3341 int err; 3342 3343 dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); 3344 3345 err = dsl_dataset_hold(dsname, FTAG, &ds); 3346 if (err) 3347 return (err); 3348 3349 /* 3350 * If someone removes a file, then tries to set the quota, we 3351 * want to make sure the file freeing takes effect. 3352 */ 3353 txg_wait_open(ds->ds_dir->dd_pool, 0); 3354 3355 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3356 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, 3357 ds, &psa, 0); 3358 3359 dsl_dataset_rele(ds, FTAG); 3360 return (err); 3361 } 3362 3363 static int 3364 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) 3365 { 3366 dsl_dataset_t *ds = arg1; 3367 dsl_prop_setarg_t *psa = arg2; 3368 uint64_t effective_value; 3369 uint64_t unique; 3370 int err; 3371 3372 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 3373 SPA_VERSION_REFRESERVATION) 3374 return (ENOTSUP); 3375 3376 if (dsl_dataset_is_snapshot(ds)) 3377 return (EINVAL); 3378 3379 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3380 return (err); 3381 3382 effective_value = psa->psa_effective_value; 3383 3384 /* 3385 * If we are doing the preliminary check in open context, the 3386 * space estimates may be inaccurate. 3387 */ 3388 if (!dmu_tx_is_syncing(tx)) 3389 return (0); 3390 3391 mutex_enter(&ds->ds_lock); 3392 if (!DS_UNIQUE_IS_ACCURATE(ds)) 3393 dsl_dataset_recalc_head_uniq(ds); 3394 unique = ds->ds_phys->ds_unique_bytes; 3395 mutex_exit(&ds->ds_lock); 3396 3397 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3398 uint64_t delta = MAX(unique, effective_value) - 3399 MAX(unique, ds->ds_reserved); 3400 3401 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3402 return (ENOSPC); 3403 if (ds->ds_quota > 0 && 3404 effective_value > ds->ds_quota) 3405 return (ENOSPC); 3406 } 3407 3408 return (0); 3409 } 3410 3411 static void 3412 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3413 { 3414 dsl_dataset_t *ds = arg1; 3415 dsl_prop_setarg_t *psa = arg2; 3416 uint64_t effective_value = psa->psa_effective_value; 3417 uint64_t unique; 3418 int64_t delta; 3419 3420 dsl_prop_set_sync(ds, psa, tx); 3421 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3422 3423 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3424 3425 mutex_enter(&ds->ds_dir->dd_lock); 3426 mutex_enter(&ds->ds_lock); 3427 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 3428 unique = ds->ds_phys->ds_unique_bytes; 3429 delta = MAX(0, (int64_t)(effective_value - unique)) - 3430 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3431 ds->ds_reserved = effective_value; 3432 mutex_exit(&ds->ds_lock); 3433 3434 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3435 mutex_exit(&ds->ds_dir->dd_lock); 3436 3437 spa_history_log_internal(LOG_DS_REFRESERV, 3438 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", 3439 (longlong_t)effective_value, ds->ds_object); 3440 } 3441 3442 int 3443 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3444 uint64_t reservation) 3445 { 3446 dsl_dataset_t *ds; 3447 dsl_prop_setarg_t psa; 3448 int err; 3449 3450 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3451 &reservation); 3452 3453 err = dsl_dataset_hold(dsname, FTAG, &ds); 3454 if (err) 3455 return (err); 3456 3457 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3458 dsl_dataset_set_reservation_check, 3459 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3460 3461 dsl_dataset_rele(ds, FTAG); 3462 return (err); 3463 } 3464 3465 typedef struct zfs_hold_cleanup_arg { 3466 dsl_pool_t *dp; 3467 uint64_t dsobj; 3468 char htag[MAXNAMELEN]; 3469 } zfs_hold_cleanup_arg_t; 3470 3471 static void 3472 dsl_dataset_user_release_onexit(void *arg) 3473 { 3474 zfs_hold_cleanup_arg_t *ca = arg; 3475 3476 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, 3477 B_TRUE); 3478 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); 3479 } 3480 3481 void 3482 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, 3483 minor_t minor) 3484 { 3485 zfs_hold_cleanup_arg_t *ca; 3486 3487 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); 3488 ca->dp = ds->ds_dir->dd_pool; 3489 ca->dsobj = ds->ds_object; 3490 (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); 3491 VERIFY3U(0, ==, zfs_onexit_add_cb(minor, 3492 dsl_dataset_user_release_onexit, ca, NULL)); 3493 } 3494 3495 /* 3496 * If you add new checks here, you may need to add 3497 * additional checks to the "temporary" case in 3498 * snapshot_check() in dmu_objset.c. 3499 */ 3500 static int 3501 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) 3502 { 3503 dsl_dataset_t *ds = arg1; 3504 struct dsl_ds_holdarg *ha = arg2; 3505 char *htag = ha->htag; 3506 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3507 int error = 0; 3508 3509 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3510 return (ENOTSUP); 3511 3512 if (!dsl_dataset_is_snapshot(ds)) 3513 return (EINVAL); 3514 3515 /* tags must be unique */ 3516 mutex_enter(&ds->ds_lock); 3517 if (ds->ds_phys->ds_userrefs_obj) { 3518 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 3519 8, 1, tx); 3520 if (error == 0) 3521 error = EEXIST; 3522 else if (error == ENOENT) 3523 error = 0; 3524 } 3525 mutex_exit(&ds->ds_lock); 3526 3527 if (error == 0 && ha->temphold && 3528 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 3529 error = E2BIG; 3530 3531 return (error); 3532 } 3533 3534 void 3535 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3536 { 3537 dsl_dataset_t *ds = arg1; 3538 struct dsl_ds_holdarg *ha = arg2; 3539 char *htag = ha->htag; 3540 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3541 objset_t *mos = dp->dp_meta_objset; 3542 uint64_t now = gethrestime_sec(); 3543 uint64_t zapobj; 3544 3545 mutex_enter(&ds->ds_lock); 3546 if (ds->ds_phys->ds_userrefs_obj == 0) { 3547 /* 3548 * This is the first user hold for this dataset. Create 3549 * the userrefs zap object. 3550 */ 3551 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3552 zapobj = ds->ds_phys->ds_userrefs_obj = 3553 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); 3554 } else { 3555 zapobj = ds->ds_phys->ds_userrefs_obj; 3556 } 3557 ds->ds_userrefs++; 3558 mutex_exit(&ds->ds_lock); 3559 3560 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); 3561 3562 if (ha->temphold) { 3563 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, 3564 htag, &now, tx)); 3565 } 3566 3567 spa_history_log_internal(LOG_DS_USER_HOLD, 3568 dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, 3569 (int)ha->temphold, ds->ds_object); 3570 } 3571 3572 static int 3573 dsl_dataset_user_hold_one(const char *dsname, void *arg) 3574 { 3575 struct dsl_ds_holdarg *ha = arg; 3576 dsl_dataset_t *ds; 3577 int error; 3578 char *name; 3579 3580 /* alloc a buffer to hold dsname@snapname plus terminating NULL */ 3581 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3582 error = dsl_dataset_hold(name, ha->dstg, &ds); 3583 strfree(name); 3584 if (error == 0) { 3585 ha->gotone = B_TRUE; 3586 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, 3587 dsl_dataset_user_hold_sync, ds, ha, 0); 3588 } else if (error == ENOENT && ha->recursive) { 3589 error = 0; 3590 } else { 3591 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3592 } 3593 return (error); 3594 } 3595 3596 int 3597 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, 3598 boolean_t temphold) 3599 { 3600 struct dsl_ds_holdarg *ha; 3601 int error; 3602 3603 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3604 ha->htag = htag; 3605 ha->temphold = temphold; 3606 error = dsl_sync_task_do(ds->ds_dir->dd_pool, 3607 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, 3608 ds, ha, 0); 3609 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3610 3611 return (error); 3612 } 3613 3614 int 3615 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, 3616 boolean_t recursive, boolean_t temphold, int cleanup_fd) 3617 { 3618 struct dsl_ds_holdarg *ha; 3619 dsl_sync_task_t *dst; 3620 spa_t *spa; 3621 int error; 3622 minor_t minor = 0; 3623 3624 if (cleanup_fd != -1) { 3625 /* Currently we only support cleanup-on-exit of tempholds. */ 3626 if (!temphold) 3627 return (EINVAL); 3628 error = zfs_onexit_fd_hold(cleanup_fd, &minor); 3629 if (error) 3630 return (error); 3631 } 3632 3633 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3634 3635 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3636 3637 error = spa_open(dsname, &spa, FTAG); 3638 if (error) { 3639 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3640 if (cleanup_fd != -1) 3641 zfs_onexit_fd_rele(cleanup_fd); 3642 return (error); 3643 } 3644 3645 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3646 ha->htag = htag; 3647 ha->snapname = snapname; 3648 ha->recursive = recursive; 3649 ha->temphold = temphold; 3650 3651 if (recursive) { 3652 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, 3653 ha, DS_FIND_CHILDREN); 3654 } else { 3655 error = dsl_dataset_user_hold_one(dsname, ha); 3656 } 3657 if (error == 0) 3658 error = dsl_sync_task_group_wait(ha->dstg); 3659 3660 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3661 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3662 dsl_dataset_t *ds = dst->dst_arg1; 3663 3664 if (dst->dst_err) { 3665 dsl_dataset_name(ds, ha->failed); 3666 *strchr(ha->failed, '@') = '\0'; 3667 } else if (error == 0 && minor != 0 && temphold) { 3668 /* 3669 * If this hold is to be released upon process exit, 3670 * register that action now. 3671 */ 3672 dsl_register_onexit_hold_cleanup(ds, htag, minor); 3673 } 3674 dsl_dataset_rele(ds, ha->dstg); 3675 } 3676 3677 if (error == 0 && recursive && !ha->gotone) 3678 error = ENOENT; 3679 3680 if (error) 3681 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3682 3683 dsl_sync_task_group_destroy(ha->dstg); 3684 3685 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3686 spa_close(spa, FTAG); 3687 if (cleanup_fd != -1) 3688 zfs_onexit_fd_rele(cleanup_fd); 3689 return (error); 3690 } 3691 3692 struct dsl_ds_releasearg { 3693 dsl_dataset_t *ds; 3694 const char *htag; 3695 boolean_t own; /* do we own or just hold ds? */ 3696 }; 3697 3698 static int 3699 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, 3700 boolean_t *might_destroy) 3701 { 3702 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3703 uint64_t zapobj; 3704 uint64_t tmp; 3705 int error; 3706 3707 *might_destroy = B_FALSE; 3708 3709 mutex_enter(&ds->ds_lock); 3710 zapobj = ds->ds_phys->ds_userrefs_obj; 3711 if (zapobj == 0) { 3712 /* The tag can't possibly exist */ 3713 mutex_exit(&ds->ds_lock); 3714 return (ESRCH); 3715 } 3716 3717 /* Make sure the tag exists */ 3718 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); 3719 if (error) { 3720 mutex_exit(&ds->ds_lock); 3721 if (error == ENOENT) 3722 error = ESRCH; 3723 return (error); 3724 } 3725 3726 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && 3727 DS_IS_DEFER_DESTROY(ds)) 3728 *might_destroy = B_TRUE; 3729 3730 mutex_exit(&ds->ds_lock); 3731 return (0); 3732 } 3733 3734 static int 3735 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) 3736 { 3737 struct dsl_ds_releasearg *ra = arg1; 3738 dsl_dataset_t *ds = ra->ds; 3739 boolean_t might_destroy; 3740 int error; 3741 3742 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3743 return (ENOTSUP); 3744 3745 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); 3746 if (error) 3747 return (error); 3748 3749 if (might_destroy) { 3750 struct dsl_ds_destroyarg dsda = {0}; 3751 3752 if (dmu_tx_is_syncing(tx)) { 3753 /* 3754 * If we're not prepared to remove the snapshot, 3755 * we can't allow the release to happen right now. 3756 */ 3757 if (!ra->own) 3758 return (EBUSY); 3759 } 3760 dsda.ds = ds; 3761 dsda.releasing = B_TRUE; 3762 return (dsl_dataset_destroy_check(&dsda, tag, tx)); 3763 } 3764 3765 return (0); 3766 } 3767 3768 static void 3769 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) 3770 { 3771 struct dsl_ds_releasearg *ra = arg1; 3772 dsl_dataset_t *ds = ra->ds; 3773 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3774 objset_t *mos = dp->dp_meta_objset; 3775 uint64_t zapobj; 3776 uint64_t dsobj = ds->ds_object; 3777 uint64_t refs; 3778 int error; 3779 3780 mutex_enter(&ds->ds_lock); 3781 ds->ds_userrefs--; 3782 refs = ds->ds_userrefs; 3783 mutex_exit(&ds->ds_lock); 3784 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); 3785 VERIFY(error == 0 || error == ENOENT); 3786 zapobj = ds->ds_phys->ds_userrefs_obj; 3787 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); 3788 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && 3789 DS_IS_DEFER_DESTROY(ds)) { 3790 struct dsl_ds_destroyarg dsda = {0}; 3791 3792 ASSERT(ra->own); 3793 dsda.ds = ds; 3794 dsda.releasing = B_TRUE; 3795 /* We already did the destroy_check */ 3796 dsl_dataset_destroy_sync(&dsda, tag, tx); 3797 } 3798 3799 spa_history_log_internal(LOG_DS_USER_RELEASE, 3800 dp->dp_spa, tx, "<%s> %lld dataset = %llu", 3801 ra->htag, (longlong_t)refs, dsobj); 3802 } 3803 3804 static int 3805 dsl_dataset_user_release_one(const char *dsname, void *arg) 3806 { 3807 struct dsl_ds_holdarg *ha = arg; 3808 struct dsl_ds_releasearg *ra; 3809 dsl_dataset_t *ds; 3810 int error; 3811 void *dtag = ha->dstg; 3812 char *name; 3813 boolean_t own = B_FALSE; 3814 boolean_t might_destroy; 3815 3816 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ 3817 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3818 error = dsl_dataset_hold(name, dtag, &ds); 3819 strfree(name); 3820 if (error == ENOENT && ha->recursive) 3821 return (0); 3822 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3823 if (error) 3824 return (error); 3825 3826 ha->gotone = B_TRUE; 3827 3828 ASSERT(dsl_dataset_is_snapshot(ds)); 3829 3830 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); 3831 if (error) { 3832 dsl_dataset_rele(ds, dtag); 3833 return (error); 3834 } 3835 3836 if (might_destroy) { 3837 #ifdef _KERNEL 3838 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3839 error = zfs_unmount_snap(name, NULL); 3840 strfree(name); 3841 if (error) { 3842 dsl_dataset_rele(ds, dtag); 3843 return (error); 3844 } 3845 #endif 3846 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { 3847 dsl_dataset_rele(ds, dtag); 3848 return (EBUSY); 3849 } else { 3850 own = B_TRUE; 3851 dsl_dataset_make_exclusive(ds, dtag); 3852 } 3853 } 3854 3855 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); 3856 ra->ds = ds; 3857 ra->htag = ha->htag; 3858 ra->own = own; 3859 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, 3860 dsl_dataset_user_release_sync, ra, dtag, 0); 3861 3862 return (0); 3863 } 3864 3865 int 3866 dsl_dataset_user_release(char *dsname, char *snapname, char *htag, 3867 boolean_t recursive) 3868 { 3869 struct dsl_ds_holdarg *ha; 3870 dsl_sync_task_t *dst; 3871 spa_t *spa; 3872 int error; 3873 3874 top: 3875 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3876 3877 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3878 3879 error = spa_open(dsname, &spa, FTAG); 3880 if (error) { 3881 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3882 return (error); 3883 } 3884 3885 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3886 ha->htag = htag; 3887 ha->snapname = snapname; 3888 ha->recursive = recursive; 3889 if (recursive) { 3890 error = dmu_objset_find(dsname, dsl_dataset_user_release_one, 3891 ha, DS_FIND_CHILDREN); 3892 } else { 3893 error = dsl_dataset_user_release_one(dsname, ha); 3894 } 3895 if (error == 0) 3896 error = dsl_sync_task_group_wait(ha->dstg); 3897 3898 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3899 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3900 struct dsl_ds_releasearg *ra = dst->dst_arg1; 3901 dsl_dataset_t *ds = ra->ds; 3902 3903 if (dst->dst_err) 3904 dsl_dataset_name(ds, ha->failed); 3905 3906 if (ra->own) 3907 dsl_dataset_disown(ds, ha->dstg); 3908 else 3909 dsl_dataset_rele(ds, ha->dstg); 3910 3911 kmem_free(ra, sizeof (struct dsl_ds_releasearg)); 3912 } 3913 3914 if (error == 0 && recursive && !ha->gotone) 3915 error = ENOENT; 3916 3917 if (error && error != EBUSY) 3918 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3919 3920 dsl_sync_task_group_destroy(ha->dstg); 3921 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3922 spa_close(spa, FTAG); 3923 3924 /* 3925 * We can get EBUSY if we were racing with deferred destroy and 3926 * dsl_dataset_user_release_check() hadn't done the necessary 3927 * open context setup. We can also get EBUSY if we're racing 3928 * with destroy and that thread is the ds_owner. Either way 3929 * the busy condition should be transient, and we should retry 3930 * the release operation. 3931 */ 3932 if (error == EBUSY) 3933 goto top; 3934 3935 return (error); 3936 } 3937 3938 /* 3939 * Called at spa_load time (with retry == B_FALSE) to release a stale 3940 * temporary user hold. Also called by the onexit code (with retry == B_TRUE). 3941 */ 3942 int 3943 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, 3944 boolean_t retry) 3945 { 3946 dsl_dataset_t *ds; 3947 char *snap; 3948 char *name; 3949 int namelen; 3950 int error; 3951 3952 do { 3953 rw_enter(&dp->dp_config_rwlock, RW_READER); 3954 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); 3955 rw_exit(&dp->dp_config_rwlock); 3956 if (error) 3957 return (error); 3958 namelen = dsl_dataset_namelen(ds)+1; 3959 name = kmem_alloc(namelen, KM_SLEEP); 3960 dsl_dataset_name(ds, name); 3961 dsl_dataset_rele(ds, FTAG); 3962 3963 snap = strchr(name, '@'); 3964 *snap = '\0'; 3965 ++snap; 3966 error = dsl_dataset_user_release(name, snap, htag, B_FALSE); 3967 kmem_free(name, namelen); 3968 3969 /* 3970 * The object can't have been destroyed because we have a hold, 3971 * but it might have been renamed, resulting in ENOENT. Retry 3972 * if we've been requested to do so. 3973 * 3974 * It would be nice if we could use the dsobj all the way 3975 * through and avoid ENOENT entirely. But we might need to 3976 * unmount the snapshot, and there's currently no way to lookup 3977 * a vfsp using a ZFS object id. 3978 */ 3979 } while ((error == ENOENT) && retry); 3980 3981 return (error); 3982 } 3983 3984 int 3985 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) 3986 { 3987 dsl_dataset_t *ds; 3988 int err; 3989 3990 err = dsl_dataset_hold(dsname, FTAG, &ds); 3991 if (err) 3992 return (err); 3993 3994 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); 3995 if (ds->ds_phys->ds_userrefs_obj != 0) { 3996 zap_attribute_t *za; 3997 zap_cursor_t zc; 3998 3999 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 4000 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, 4001 ds->ds_phys->ds_userrefs_obj); 4002 zap_cursor_retrieve(&zc, za) == 0; 4003 zap_cursor_advance(&zc)) { 4004 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, 4005 za->za_first_integer)); 4006 } 4007 zap_cursor_fini(&zc); 4008 kmem_free(za, sizeof (zap_attribute_t)); 4009 } 4010 dsl_dataset_rele(ds, FTAG); 4011 return (0); 4012 } 4013 4014 /* 4015 * Note, this fuction is used as the callback for dmu_objset_find(). We 4016 * always return 0 so that we will continue to find and process 4017 * inconsistent datasets, even if we encounter an error trying to 4018 * process one of them. 4019 */ 4020 /* ARGSUSED */ 4021 int 4022 dsl_destroy_inconsistent(const char *dsname, void *arg) 4023 { 4024 dsl_dataset_t *ds; 4025 4026 if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { 4027 if (DS_IS_INCONSISTENT(ds)) 4028 (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); 4029 else 4030 dsl_dataset_disown(ds, FTAG); 4031 } 4032 return (0); 4033 } 4034