1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/dmu_objset.h> 27 #include <sys/dsl_dataset.h> 28 #include <sys/dsl_dir.h> 29 #include <sys/dsl_prop.h> 30 #include <sys/dsl_synctask.h> 31 #include <sys/dmu_traverse.h> 32 #include <sys/dmu_tx.h> 33 #include <sys/arc.h> 34 #include <sys/zio.h> 35 #include <sys/zap.h> 36 #include <sys/unique.h> 37 #include <sys/zfs_context.h> 38 #include <sys/zfs_ioctl.h> 39 #include <sys/spa.h> 40 #include <sys/zfs_znode.h> 41 #include <sys/sunddi.h> 42 #include <sys/zvol.h> 43 44 static char *dsl_reaper = "the grim reaper"; 45 46 static dsl_checkfunc_t dsl_dataset_destroy_begin_check; 47 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; 48 static dsl_checkfunc_t dsl_dataset_rollback_check; 49 static dsl_syncfunc_t dsl_dataset_rollback_sync; 50 static dsl_syncfunc_t dsl_dataset_set_reservation_sync; 51 52 #define DS_REF_MAX (1ULL << 62) 53 54 #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE 55 56 #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) 57 58 59 /* 60 * Figure out how much of this 
delta should be propogated to the dsl_dir 61 * layer. If there's a refreservation, that space has already been 62 * partially accounted for in our ancestors. 63 */ 64 static int64_t 65 parent_delta(dsl_dataset_t *ds, int64_t delta) 66 { 67 uint64_t old_bytes, new_bytes; 68 69 if (ds->ds_reserved == 0) 70 return (delta); 71 72 old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 73 new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); 74 75 ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); 76 return (new_bytes - old_bytes); 77 } 78 79 void 80 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) 81 { 82 int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); 83 int compressed = BP_GET_PSIZE(bp); 84 int uncompressed = BP_GET_UCSIZE(bp); 85 int64_t delta; 86 87 dprintf_bp(bp, "born, ds=%p\n", ds); 88 89 ASSERT(dmu_tx_is_syncing(tx)); 90 /* It could have been compressed away to nothing */ 91 if (BP_IS_HOLE(bp)) 92 return; 93 ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); 94 ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); 95 if (ds == NULL) { 96 /* 97 * Account for the meta-objset space in its placeholder 98 * dsl_dir. 
99 */ 100 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ 101 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, 102 used, compressed, uncompressed, tx); 103 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); 104 return; 105 } 106 dmu_buf_will_dirty(ds->ds_dbuf, tx); 107 mutex_enter(&ds->ds_dir->dd_lock); 108 mutex_enter(&ds->ds_lock); 109 delta = parent_delta(ds, used); 110 ds->ds_phys->ds_used_bytes += used; 111 ds->ds_phys->ds_compressed_bytes += compressed; 112 ds->ds_phys->ds_uncompressed_bytes += uncompressed; 113 ds->ds_phys->ds_unique_bytes += used; 114 mutex_exit(&ds->ds_lock); 115 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, 116 compressed, uncompressed, tx); 117 dsl_dir_transfer_space(ds->ds_dir, used - delta, 118 DD_USED_REFRSRV, DD_USED_HEAD, tx); 119 mutex_exit(&ds->ds_dir->dd_lock); 120 } 121 122 int 123 dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, 124 dmu_tx_t *tx) 125 { 126 int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); 127 int compressed = BP_GET_PSIZE(bp); 128 int uncompressed = BP_GET_UCSIZE(bp); 129 130 ASSERT(pio != NULL); 131 ASSERT(dmu_tx_is_syncing(tx)); 132 /* No block pointer => nothing to free */ 133 if (BP_IS_HOLE(bp)) 134 return (0); 135 136 ASSERT(used > 0); 137 if (ds == NULL) { 138 int err; 139 /* 140 * Account for the meta-objset space in its placeholder 141 * dataset. 
142 */ 143 err = dsl_free(pio, tx->tx_pool, 144 tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); 145 ASSERT(err == 0); 146 147 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, 148 -used, -compressed, -uncompressed, tx); 149 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); 150 return (used); 151 } 152 ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); 153 154 ASSERT(!dsl_dataset_is_snapshot(ds)); 155 dmu_buf_will_dirty(ds->ds_dbuf, tx); 156 157 if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { 158 int err; 159 int64_t delta; 160 161 dprintf_bp(bp, "freeing: %s", ""); 162 err = dsl_free(pio, tx->tx_pool, 163 tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); 164 ASSERT(err == 0); 165 166 mutex_enter(&ds->ds_dir->dd_lock); 167 mutex_enter(&ds->ds_lock); 168 ASSERT(ds->ds_phys->ds_unique_bytes >= used || 169 !DS_UNIQUE_IS_ACCURATE(ds)); 170 delta = parent_delta(ds, -used); 171 ds->ds_phys->ds_unique_bytes -= used; 172 mutex_exit(&ds->ds_lock); 173 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, 174 delta, -compressed, -uncompressed, tx); 175 dsl_dir_transfer_space(ds->ds_dir, -used - delta, 176 DD_USED_REFRSRV, DD_USED_HEAD, tx); 177 mutex_exit(&ds->ds_dir->dd_lock); 178 } else { 179 dprintf_bp(bp, "putting on dead list: %s", ""); 180 VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); 181 ASSERT3U(ds->ds_prev->ds_object, ==, 182 ds->ds_phys->ds_prev_snap_obj); 183 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); 184 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ 185 if (ds->ds_prev->ds_phys->ds_next_snap_obj == 186 ds->ds_object && bp->blk_birth > 187 ds->ds_prev->ds_phys->ds_prev_snap_txg) { 188 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 189 mutex_enter(&ds->ds_prev->ds_lock); 190 ds->ds_prev->ds_phys->ds_unique_bytes += used; 191 mutex_exit(&ds->ds_prev->ds_lock); 192 } 193 if (bp->blk_birth > ds->ds_origin_txg) { 194 dsl_dir_transfer_space(ds->ds_dir, used, 195 DD_USED_HEAD, DD_USED_SNAP, tx); 196 } 197 } 198 mutex_enter(&ds->ds_lock); 199 
ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); 200 ds->ds_phys->ds_used_bytes -= used; 201 ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); 202 ds->ds_phys->ds_compressed_bytes -= compressed; 203 ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); 204 ds->ds_phys->ds_uncompressed_bytes -= uncompressed; 205 mutex_exit(&ds->ds_lock); 206 207 return (used); 208 } 209 210 uint64_t 211 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) 212 { 213 uint64_t trysnap = 0; 214 215 if (ds == NULL) 216 return (0); 217 /* 218 * The snapshot creation could fail, but that would cause an 219 * incorrect FALSE return, which would only result in an 220 * overestimation of the amount of space that an operation would 221 * consume, which is OK. 222 * 223 * There's also a small window where we could miss a pending 224 * snapshot, because we could set the sync task in the quiescing 225 * phase. So this should only be used as a guess. 226 */ 227 if (ds->ds_trysnap_txg > 228 spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) 229 trysnap = ds->ds_trysnap_txg; 230 return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); 231 } 232 233 boolean_t 234 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) 235 { 236 return (blk_birth > dsl_dataset_prev_snap_txg(ds)); 237 } 238 239 /* ARGSUSED */ 240 static void 241 dsl_dataset_evict(dmu_buf_t *db, void *dsv) 242 { 243 dsl_dataset_t *ds = dsv; 244 245 ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); 246 247 dprintf_ds(ds, "evicting %s\n", ""); 248 249 unique_remove(ds->ds_fsid_guid); 250 251 if (ds->ds_user_ptr != NULL) 252 ds->ds_user_evict_func(ds, ds->ds_user_ptr); 253 254 if (ds->ds_prev) { 255 dsl_dataset_drop_ref(ds->ds_prev, ds); 256 ds->ds_prev = NULL; 257 } 258 259 bplist_close(&ds->ds_deadlist); 260 if (ds->ds_dir) 261 dsl_dir_close(ds->ds_dir, ds); 262 263 ASSERT(!list_link_active(&ds->ds_synced_link)); 264 265 mutex_destroy(&ds->ds_lock); 266 mutex_destroy(&ds->ds_recvlock); 267 
mutex_destroy(&ds->ds_opening_lock); 268 mutex_destroy(&ds->ds_deadlist.bpl_lock); 269 rw_destroy(&ds->ds_rwlock); 270 cv_destroy(&ds->ds_exclusive_cv); 271 272 kmem_free(ds, sizeof (dsl_dataset_t)); 273 } 274 275 static int 276 dsl_dataset_get_snapname(dsl_dataset_t *ds) 277 { 278 dsl_dataset_phys_t *headphys; 279 int err; 280 dmu_buf_t *headdbuf; 281 dsl_pool_t *dp = ds->ds_dir->dd_pool; 282 objset_t *mos = dp->dp_meta_objset; 283 284 if (ds->ds_snapname[0]) 285 return (0); 286 if (ds->ds_phys->ds_next_snap_obj == 0) 287 return (0); 288 289 err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, 290 FTAG, &headdbuf); 291 if (err) 292 return (err); 293 headphys = headdbuf->db_data; 294 err = zap_value_search(dp->dp_meta_objset, 295 headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); 296 dmu_buf_rele(headdbuf, FTAG); 297 return (err); 298 } 299 300 static int 301 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) 302 { 303 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 304 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 305 matchtype_t mt; 306 int err; 307 308 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 309 mt = MT_FIRST; 310 else 311 mt = MT_EXACT; 312 313 err = zap_lookup_norm(mos, snapobj, name, 8, 1, 314 value, mt, NULL, 0, NULL); 315 if (err == ENOTSUP && mt == MT_FIRST) 316 err = zap_lookup(mos, snapobj, name, 8, 1, value); 317 return (err); 318 } 319 320 static int 321 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) 322 { 323 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 324 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 325 matchtype_t mt; 326 int err; 327 328 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 329 mt = MT_FIRST; 330 else 331 mt = MT_EXACT; 332 333 err = zap_remove_norm(mos, snapobj, name, mt, tx); 334 if (err == ENOTSUP && mt == MT_FIRST) 335 err = zap_remove(mos, snapobj, name, tx); 336 return (err); 337 } 338 339 static int 340 
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, 341 dsl_dataset_t **dsp) 342 { 343 objset_t *mos = dp->dp_meta_objset; 344 dmu_buf_t *dbuf; 345 dsl_dataset_t *ds; 346 int err; 347 348 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 349 dsl_pool_sync_context(dp)); 350 351 err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); 352 if (err) 353 return (err); 354 ds = dmu_buf_get_user(dbuf); 355 if (ds == NULL) { 356 dsl_dataset_t *winner; 357 358 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); 359 ds->ds_dbuf = dbuf; 360 ds->ds_object = dsobj; 361 ds->ds_phys = dbuf->db_data; 362 363 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); 364 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); 365 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); 366 mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, 367 NULL); 368 rw_init(&ds->ds_rwlock, 0, 0, 0); 369 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); 370 371 err = bplist_open(&ds->ds_deadlist, 372 mos, ds->ds_phys->ds_deadlist_obj); 373 if (err == 0) { 374 err = dsl_dir_open_obj(dp, 375 ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); 376 } 377 if (err) { 378 /* 379 * we don't really need to close the blist if we 380 * just opened it. 
381 */ 382 mutex_destroy(&ds->ds_lock); 383 mutex_destroy(&ds->ds_recvlock); 384 mutex_destroy(&ds->ds_opening_lock); 385 mutex_destroy(&ds->ds_deadlist.bpl_lock); 386 rw_destroy(&ds->ds_rwlock); 387 cv_destroy(&ds->ds_exclusive_cv); 388 kmem_free(ds, sizeof (dsl_dataset_t)); 389 dmu_buf_rele(dbuf, tag); 390 return (err); 391 } 392 393 if (!dsl_dataset_is_snapshot(ds)) { 394 ds->ds_snapname[0] = '\0'; 395 if (ds->ds_phys->ds_prev_snap_obj) { 396 err = dsl_dataset_get_ref(dp, 397 ds->ds_phys->ds_prev_snap_obj, 398 ds, &ds->ds_prev); 399 } 400 401 if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { 402 dsl_dataset_t *origin; 403 404 err = dsl_dataset_hold_obj(dp, 405 ds->ds_dir->dd_phys->dd_origin_obj, 406 FTAG, &origin); 407 if (err == 0) { 408 ds->ds_origin_txg = 409 origin->ds_phys->ds_creation_txg; 410 dsl_dataset_rele(origin, FTAG); 411 } 412 } 413 } else { 414 if (zfs_flags & ZFS_DEBUG_SNAPNAMES) 415 err = dsl_dataset_get_snapname(ds); 416 if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { 417 err = zap_count( 418 ds->ds_dir->dd_pool->dp_meta_objset, 419 ds->ds_phys->ds_userrefs_obj, 420 &ds->ds_userrefs); 421 } 422 } 423 424 if (err == 0 && !dsl_dataset_is_snapshot(ds)) { 425 /* 426 * In sync context, we're called with either no lock 427 * or with the write lock. If we're not syncing, 428 * we're always called with the read lock held. 
429 */ 430 boolean_t need_lock = 431 !RW_WRITE_HELD(&dp->dp_config_rwlock) && 432 dsl_pool_sync_context(dp); 433 434 if (need_lock) 435 rw_enter(&dp->dp_config_rwlock, RW_READER); 436 437 err = dsl_prop_get_ds(ds, 438 "refreservation", sizeof (uint64_t), 1, 439 &ds->ds_reserved, NULL); 440 if (err == 0) { 441 err = dsl_prop_get_ds(ds, 442 "refquota", sizeof (uint64_t), 1, 443 &ds->ds_quota, NULL); 444 } 445 446 if (need_lock) 447 rw_exit(&dp->dp_config_rwlock); 448 } else { 449 ds->ds_reserved = ds->ds_quota = 0; 450 } 451 452 if (err == 0) { 453 winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, 454 dsl_dataset_evict); 455 } 456 if (err || winner) { 457 bplist_close(&ds->ds_deadlist); 458 if (ds->ds_prev) 459 dsl_dataset_drop_ref(ds->ds_prev, ds); 460 dsl_dir_close(ds->ds_dir, ds); 461 mutex_destroy(&ds->ds_lock); 462 mutex_destroy(&ds->ds_recvlock); 463 mutex_destroy(&ds->ds_opening_lock); 464 mutex_destroy(&ds->ds_deadlist.bpl_lock); 465 rw_destroy(&ds->ds_rwlock); 466 cv_destroy(&ds->ds_exclusive_cv); 467 kmem_free(ds, sizeof (dsl_dataset_t)); 468 if (err) { 469 dmu_buf_rele(dbuf, tag); 470 return (err); 471 } 472 ds = winner; 473 } else { 474 ds->ds_fsid_guid = 475 unique_insert(ds->ds_phys->ds_fsid_guid); 476 } 477 } 478 ASSERT3P(ds->ds_dbuf, ==, dbuf); 479 ASSERT3P(ds->ds_phys, ==, dbuf->db_data); 480 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || 481 spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || 482 dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); 483 mutex_enter(&ds->ds_lock); 484 if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { 485 mutex_exit(&ds->ds_lock); 486 dmu_buf_rele(ds->ds_dbuf, tag); 487 return (ENOENT); 488 } 489 mutex_exit(&ds->ds_lock); 490 *dsp = ds; 491 return (0); 492 } 493 494 static int 495 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) 496 { 497 dsl_pool_t *dp = ds->ds_dir->dd_pool; 498 499 /* 500 * In syncing context we don't want the rwlock lock: there 501 * may be an existing writer waiting for sync 
phase to 502 * finish. We don't need to worry about such writers, since 503 * sync phase is single-threaded, so the writer can't be 504 * doing anything while we are active. 505 */ 506 if (dsl_pool_sync_context(dp)) { 507 ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); 508 return (0); 509 } 510 511 /* 512 * Normal users will hold the ds_rwlock as a READER until they 513 * are finished (i.e., call dsl_dataset_rele()). "Owners" will 514 * drop their READER lock after they set the ds_owner field. 515 * 516 * If the dataset is being destroyed, the destroy thread will 517 * obtain a WRITER lock for exclusive access after it's done its 518 * open-context work and then change the ds_owner to 519 * dsl_reaper once destruction is assured. So threads 520 * may block here temporarily, until the "destructability" of 521 * the dataset is determined. 522 */ 523 ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); 524 mutex_enter(&ds->ds_lock); 525 while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { 526 rw_exit(&dp->dp_config_rwlock); 527 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); 528 if (DSL_DATASET_IS_DESTROYED(ds)) { 529 mutex_exit(&ds->ds_lock); 530 dsl_dataset_drop_ref(ds, tag); 531 rw_enter(&dp->dp_config_rwlock, RW_READER); 532 return (ENOENT); 533 } 534 rw_enter(&dp->dp_config_rwlock, RW_READER); 535 } 536 mutex_exit(&ds->ds_lock); 537 return (0); 538 } 539 540 int 541 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, 542 dsl_dataset_t **dsp) 543 { 544 int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); 545 546 if (err) 547 return (err); 548 return (dsl_dataset_hold_ref(*dsp, tag)); 549 } 550 551 int 552 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, 553 dsl_dataset_t **dsp) 554 { 555 int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); 556 557 ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); 558 559 if (err) 560 return (err); 561 if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { 562 dsl_dataset_rele(*dsp, owner); 563 *dsp = 
NULL; 564 return (EBUSY); 565 } 566 return (0); 567 } 568 569 int 570 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) 571 { 572 dsl_dir_t *dd; 573 dsl_pool_t *dp; 574 const char *snapname; 575 uint64_t obj; 576 int err = 0; 577 578 err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); 579 if (err) 580 return (err); 581 582 dp = dd->dd_pool; 583 obj = dd->dd_phys->dd_head_dataset_obj; 584 rw_enter(&dp->dp_config_rwlock, RW_READER); 585 if (obj) 586 err = dsl_dataset_get_ref(dp, obj, tag, dsp); 587 else 588 err = ENOENT; 589 if (err) 590 goto out; 591 592 err = dsl_dataset_hold_ref(*dsp, tag); 593 594 /* we may be looking for a snapshot */ 595 if (err == 0 && snapname != NULL) { 596 dsl_dataset_t *ds = NULL; 597 598 if (*snapname++ != '@') { 599 dsl_dataset_rele(*dsp, tag); 600 err = ENOENT; 601 goto out; 602 } 603 604 dprintf("looking for snapshot '%s'\n", snapname); 605 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); 606 if (err == 0) 607 err = dsl_dataset_get_ref(dp, obj, tag, &ds); 608 dsl_dataset_rele(*dsp, tag); 609 610 ASSERT3U((err == 0), ==, (ds != NULL)); 611 612 if (ds) { 613 mutex_enter(&ds->ds_lock); 614 if (ds->ds_snapname[0] == 0) 615 (void) strlcpy(ds->ds_snapname, snapname, 616 sizeof (ds->ds_snapname)); 617 mutex_exit(&ds->ds_lock); 618 err = dsl_dataset_hold_ref(ds, tag); 619 *dsp = err ? 
NULL : ds; 620 } 621 } 622 out: 623 rw_exit(&dp->dp_config_rwlock); 624 dsl_dir_close(dd, FTAG); 625 return (err); 626 } 627 628 int 629 dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) 630 { 631 int err = dsl_dataset_hold(name, owner, dsp); 632 if (err) 633 return (err); 634 if ((*dsp)->ds_phys->ds_num_children > 0 && 635 !DS_MODE_IS_READONLY(flags)) { 636 dsl_dataset_rele(*dsp, owner); 637 return (EROFS); 638 } 639 if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { 640 dsl_dataset_rele(*dsp, owner); 641 return (EBUSY); 642 } 643 return (0); 644 } 645 646 void 647 dsl_dataset_name(dsl_dataset_t *ds, char *name) 648 { 649 if (ds == NULL) { 650 (void) strcpy(name, "mos"); 651 } else { 652 dsl_dir_name(ds->ds_dir, name); 653 VERIFY(0 == dsl_dataset_get_snapname(ds)); 654 if (ds->ds_snapname[0]) { 655 (void) strcat(name, "@"); 656 /* 657 * We use a "recursive" mutex so that we 658 * can call dprintf_ds() with ds_lock held. 659 */ 660 if (!MUTEX_HELD(&ds->ds_lock)) { 661 mutex_enter(&ds->ds_lock); 662 (void) strcat(name, ds->ds_snapname); 663 mutex_exit(&ds->ds_lock); 664 } else { 665 (void) strcat(name, ds->ds_snapname); 666 } 667 } 668 } 669 } 670 671 static int 672 dsl_dataset_namelen(dsl_dataset_t *ds) 673 { 674 int result; 675 676 if (ds == NULL) { 677 result = 3; /* "mos" */ 678 } else { 679 result = dsl_dir_namelen(ds->ds_dir); 680 VERIFY(0 == dsl_dataset_get_snapname(ds)); 681 if (ds->ds_snapname[0]) { 682 ++result; /* adding one for the @-sign */ 683 if (!MUTEX_HELD(&ds->ds_lock)) { 684 mutex_enter(&ds->ds_lock); 685 result += strlen(ds->ds_snapname); 686 mutex_exit(&ds->ds_lock); 687 } else { 688 result += strlen(ds->ds_snapname); 689 } 690 } 691 } 692 693 return (result); 694 } 695 696 void 697 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) 698 { 699 dmu_buf_rele(ds->ds_dbuf, tag); 700 } 701 702 void 703 dsl_dataset_rele(dsl_dataset_t *ds, void *tag) 704 { 705 if 
(!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { 706 rw_exit(&ds->ds_rwlock); 707 } 708 dsl_dataset_drop_ref(ds, tag); 709 } 710 711 void 712 dsl_dataset_disown(dsl_dataset_t *ds, void *owner) 713 { 714 ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || 715 (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); 716 717 mutex_enter(&ds->ds_lock); 718 ds->ds_owner = NULL; 719 if (RW_WRITE_HELD(&ds->ds_rwlock)) { 720 rw_exit(&ds->ds_rwlock); 721 cv_broadcast(&ds->ds_exclusive_cv); 722 } 723 mutex_exit(&ds->ds_lock); 724 if (ds->ds_dbuf) 725 dsl_dataset_drop_ref(ds, owner); 726 else 727 dsl_dataset_evict(ds->ds_dbuf, ds); 728 } 729 730 boolean_t 731 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) 732 { 733 boolean_t gotit = FALSE; 734 735 mutex_enter(&ds->ds_lock); 736 if (ds->ds_owner == NULL && 737 (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { 738 ds->ds_owner = owner; 739 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) 740 rw_exit(&ds->ds_rwlock); 741 gotit = TRUE; 742 } 743 mutex_exit(&ds->ds_lock); 744 return (gotit); 745 } 746 747 void 748 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) 749 { 750 ASSERT3P(owner, ==, ds->ds_owner); 751 if (!RW_WRITE_HELD(&ds->ds_rwlock)) 752 rw_enter(&ds->ds_rwlock, RW_WRITER); 753 } 754 755 uint64_t 756 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, 757 uint64_t flags, dmu_tx_t *tx) 758 { 759 dsl_pool_t *dp = dd->dd_pool; 760 dmu_buf_t *dbuf; 761 dsl_dataset_phys_t *dsphys; 762 uint64_t dsobj; 763 objset_t *mos = dp->dp_meta_objset; 764 765 if (origin == NULL) 766 origin = dp->dp_origin_snap; 767 768 ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); 769 ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); 770 ASSERT(dmu_tx_is_syncing(tx)); 771 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); 772 773 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 774 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 775 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, 
&dbuf)); 776 dmu_buf_will_dirty(dbuf, tx); 777 dsphys = dbuf->db_data; 778 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 779 dsphys->ds_dir_obj = dd->dd_object; 780 dsphys->ds_flags = flags; 781 dsphys->ds_fsid_guid = unique_create(); 782 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 783 sizeof (dsphys->ds_guid)); 784 dsphys->ds_snapnames_zapobj = 785 zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, 786 DMU_OT_NONE, 0, tx); 787 dsphys->ds_creation_time = gethrestime_sec(); 788 dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; 789 dsphys->ds_deadlist_obj = 790 bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); 791 792 if (origin) { 793 dsphys->ds_prev_snap_obj = origin->ds_object; 794 dsphys->ds_prev_snap_txg = 795 origin->ds_phys->ds_creation_txg; 796 dsphys->ds_used_bytes = 797 origin->ds_phys->ds_used_bytes; 798 dsphys->ds_compressed_bytes = 799 origin->ds_phys->ds_compressed_bytes; 800 dsphys->ds_uncompressed_bytes = 801 origin->ds_phys->ds_uncompressed_bytes; 802 dsphys->ds_bp = origin->ds_phys->ds_bp; 803 dsphys->ds_flags |= origin->ds_phys->ds_flags; 804 805 dmu_buf_will_dirty(origin->ds_dbuf, tx); 806 origin->ds_phys->ds_num_children++; 807 808 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { 809 if (origin->ds_phys->ds_next_clones_obj == 0) { 810 origin->ds_phys->ds_next_clones_obj = 811 zap_create(mos, 812 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 813 } 814 VERIFY(0 == zap_add_int(mos, 815 origin->ds_phys->ds_next_clones_obj, 816 dsobj, tx)); 817 } 818 819 dmu_buf_will_dirty(dd->dd_dbuf, tx); 820 dd->dd_phys->dd_origin_obj = origin->ds_object; 821 } 822 823 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 824 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 825 826 dmu_buf_rele(dbuf, FTAG); 827 828 dmu_buf_will_dirty(dd->dd_dbuf, tx); 829 dd->dd_phys->dd_head_dataset_obj = dsobj; 830 831 return (dsobj); 832 } 833 834 uint64_t 835 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, 836 
dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) 837 { 838 dsl_pool_t *dp = pdd->dd_pool; 839 uint64_t dsobj, ddobj; 840 dsl_dir_t *dd; 841 842 ASSERT(lastname[0] != '@'); 843 844 ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); 845 VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); 846 847 dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); 848 849 dsl_deleg_set_create_perms(dd, tx, cr); 850 851 dsl_dir_close(dd, FTAG); 852 853 return (dsobj); 854 } 855 856 struct destroyarg { 857 dsl_sync_task_group_t *dstg; 858 char *snapname; 859 char *failed; 860 boolean_t defer; 861 }; 862 863 static int 864 dsl_snapshot_destroy_one(char *name, void *arg) 865 { 866 struct destroyarg *da = arg; 867 dsl_dataset_t *ds; 868 int err; 869 char *dsname; 870 size_t buflen; 871 872 /* alloc a buffer to hold name@snapname, plus the terminating NULL */ 873 buflen = strlen(name) + strlen(da->snapname) + 2; 874 dsname = kmem_alloc(buflen, KM_SLEEP); 875 (void) snprintf(dsname, buflen, "%s@%s", name, da->snapname); 876 err = dsl_dataset_own(dsname, DS_MODE_READONLY | DS_MODE_INCONSISTENT, 877 da->dstg, &ds); 878 kmem_free(dsname, buflen); 879 if (err == 0) { 880 struct dsl_ds_destroyarg *dsda; 881 882 dsl_dataset_make_exclusive(ds, da->dstg); 883 if (ds->ds_user_ptr) { 884 ds->ds_user_evict_func(ds, ds->ds_user_ptr); 885 ds->ds_user_ptr = NULL; 886 } 887 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); 888 dsda->ds = ds; 889 dsda->defer = da->defer; 890 dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, 891 dsl_dataset_destroy_sync, dsda, da->dstg, 0); 892 } else if (err == ENOENT) { 893 err = 0; 894 } else { 895 (void) strcpy(da->failed, name); 896 } 897 return (err); 898 } 899 900 /* 901 * Destroy 'snapname' in all descendants of 'fsname'. 
902 */ 903 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy 904 int 905 dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) 906 { 907 int err; 908 struct destroyarg da; 909 dsl_sync_task_t *dst; 910 spa_t *spa; 911 912 err = spa_open(fsname, &spa, FTAG); 913 if (err) 914 return (err); 915 da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 916 da.snapname = snapname; 917 da.failed = fsname; 918 da.defer = defer; 919 920 err = dmu_objset_find(fsname, 921 dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); 922 923 if (err == 0) 924 err = dsl_sync_task_group_wait(da.dstg); 925 926 for (dst = list_head(&da.dstg->dstg_tasks); dst; 927 dst = list_next(&da.dstg->dstg_tasks, dst)) { 928 struct dsl_ds_destroyarg *dsda = dst->dst_arg1; 929 dsl_dataset_t *ds = dsda->ds; 930 931 /* 932 * Return the file system name that triggered the error 933 */ 934 if (dst->dst_err) { 935 dsl_dataset_name(ds, fsname); 936 *strchr(fsname, '@') = '\0'; 937 } 938 ASSERT3P(dsda->rm_origin, ==, NULL); 939 dsl_dataset_disown(ds, da.dstg); 940 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); 941 } 942 943 dsl_sync_task_group_destroy(da.dstg); 944 spa_close(spa, FTAG); 945 return (err); 946 } 947 948 static boolean_t 949 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) 950 { 951 boolean_t might_destroy = B_FALSE; 952 953 mutex_enter(&ds->ds_lock); 954 if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && 955 DS_IS_DEFER_DESTROY(ds)) 956 might_destroy = B_TRUE; 957 mutex_exit(&ds->ds_lock); 958 959 return (might_destroy); 960 } 961 962 #ifdef _KERNEL 963 static int 964 dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name) 965 { 966 int error; 967 objset_t *os; 968 969 error = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); 970 if (error) 971 return (error); 972 973 if (dmu_objset_type(os) == DMU_OST_ZVOL) 974 error = zvol_remove_minor(name); 975 dmu_objset_close(os); 976 977 return (error); 978 } 979 #endif 980 981 /* 982 * If we're removing a clone, 
and these three conditions are true: 983 * 1) the clone's origin has no other children 984 * 2) the clone's origin has no user references 985 * 3) the clone's origin has been marked for deferred destruction 986 * Then, prepare to remove the origin as part of this sync task group. 987 */ 988 static int 989 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) 990 { 991 dsl_dataset_t *ds = dsda->ds; 992 dsl_dataset_t *origin = ds->ds_prev; 993 994 if (dsl_dataset_might_destroy_origin(origin)) { 995 char *name; 996 int namelen; 997 int error; 998 999 namelen = dsl_dataset_namelen(origin) + 1; 1000 name = kmem_alloc(namelen, KM_SLEEP); 1001 dsl_dataset_name(origin, name); 1002 #ifdef _KERNEL 1003 error = zfs_unmount_snap(name, NULL); 1004 if (error) { 1005 kmem_free(name, namelen); 1006 return (error); 1007 } 1008 error = dsl_dataset_zvol_cleanup(origin, name); 1009 if (error) { 1010 kmem_free(name, namelen); 1011 return (error); 1012 } 1013 #endif 1014 error = dsl_dataset_own(name, 1015 DS_MODE_READONLY | DS_MODE_INCONSISTENT, 1016 tag, &origin); 1017 kmem_free(name, namelen); 1018 if (error) 1019 return (error); 1020 dsda->rm_origin = origin; 1021 dsl_dataset_make_exclusive(origin, tag); 1022 } 1023 1024 return (0); 1025 } 1026 1027 /* 1028 * ds must be opened as OWNER. On return (whether successful or not), 1029 * ds will be closed and caller can no longer dereference it. 
1030 */ 1031 int 1032 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) 1033 { 1034 int err; 1035 dsl_sync_task_group_t *dstg; 1036 objset_t *os; 1037 dsl_dir_t *dd; 1038 uint64_t obj; 1039 struct dsl_ds_destroyarg dsda = {0}; 1040 1041 dsda.ds = ds; 1042 1043 if (dsl_dataset_is_snapshot(ds)) { 1044 /* Destroying a snapshot is simpler */ 1045 dsl_dataset_make_exclusive(ds, tag); 1046 1047 if (ds->ds_user_ptr) { 1048 ds->ds_user_evict_func(ds, ds->ds_user_ptr); 1049 ds->ds_user_ptr = NULL; 1050 } 1051 /* NOTE: defer is always B_FALSE for non-snapshots */ 1052 dsda.defer = defer; 1053 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1054 dsl_dataset_destroy_check, dsl_dataset_destroy_sync, 1055 &dsda, tag, 0); 1056 ASSERT3P(dsda.rm_origin, ==, NULL); 1057 goto out; 1058 } 1059 1060 dd = ds->ds_dir; 1061 1062 /* 1063 * Check for errors and mark this ds as inconsistent, in 1064 * case we crash while freeing the objects. 1065 */ 1066 err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, 1067 dsl_dataset_destroy_begin_sync, ds, NULL, 0); 1068 if (err) 1069 goto out; 1070 1071 err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); 1072 if (err) 1073 goto out; 1074 1075 /* 1076 * remove the objects in open context, so that we won't 1077 * have too much to do in syncing context. 1078 */ 1079 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 1080 ds->ds_phys->ds_prev_snap_txg)) { 1081 /* 1082 * Ignore errors, if there is not enough disk space 1083 * we will deal with it in dsl_dataset_destroy_sync(). 1084 */ 1085 (void) dmu_free_object(os, obj); 1086 } 1087 1088 /* 1089 * We need to sync out all in-flight IO before we try to evict 1090 * (the dataset evict func is trying to clear the cached entries 1091 * for this dataset in the ARC). 1092 */ 1093 txg_wait_synced(dd->dd_pool, 0); 1094 1095 /* 1096 * If we managed to free all the objects in open 1097 * context, the user space accounting should be zero. 
1098 */ 1099 if (ds->ds_phys->ds_bp.blk_fill == 0 && 1100 dmu_objset_userused_enabled(os->os)) { 1101 uint64_t count; 1102 1103 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || 1104 count == 0); 1105 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || 1106 count == 0); 1107 } 1108 1109 dmu_objset_close(os); 1110 if (err != ESRCH) 1111 goto out; 1112 1113 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); 1114 err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); 1115 rw_exit(&dd->dd_pool->dp_config_rwlock); 1116 1117 if (err) 1118 goto out; 1119 1120 if (ds->ds_user_ptr) { 1121 /* 1122 * We need to sync out all in-flight IO before we try 1123 * to evict (the dataset evict func is trying to clear 1124 * the cached entries for this dataset in the ARC). 1125 */ 1126 txg_wait_synced(dd->dd_pool, 0); 1127 } 1128 1129 /* 1130 * Blow away the dsl_dir + head dataset. 1131 */ 1132 dsl_dataset_make_exclusive(ds, tag); 1133 if (ds->ds_user_ptr) { 1134 ds->ds_user_evict_func(ds, ds->ds_user_ptr); 1135 ds->ds_user_ptr = NULL; 1136 } 1137 1138 /* 1139 * If we're removing a clone, we might also need to remove its 1140 * origin. 1141 */ 1142 do { 1143 dsda.need_prep = B_FALSE; 1144 if (dsl_dir_is_clone(dd)) { 1145 err = dsl_dataset_origin_rm_prep(&dsda, tag); 1146 if (err) { 1147 dsl_dir_close(dd, FTAG); 1148 goto out; 1149 } 1150 } 1151 1152 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); 1153 dsl_sync_task_create(dstg, dsl_dataset_destroy_check, 1154 dsl_dataset_destroy_sync, &dsda, tag, 0); 1155 dsl_sync_task_create(dstg, dsl_dir_destroy_check, 1156 dsl_dir_destroy_sync, dd, FTAG, 0); 1157 err = dsl_sync_task_group_wait(dstg); 1158 dsl_sync_task_group_destroy(dstg); 1159 1160 /* 1161 * We could be racing against 'zfs release' or 'zfs destroy -d' 1162 * on the origin snap, in which case we can get EBUSY if we 1163 * needed to destroy the origin snap but were not ready to 1164 * do so. 
1165 */ 1166 if (dsda.need_prep) { 1167 ASSERT(err == EBUSY); 1168 ASSERT(dsl_dir_is_clone(dd)); 1169 ASSERT(dsda.rm_origin == NULL); 1170 } 1171 } while (dsda.need_prep); 1172 1173 if (dsda.rm_origin != NULL) 1174 dsl_dataset_disown(dsda.rm_origin, tag); 1175 1176 /* if it is successful, dsl_dir_destroy_sync will close the dd */ 1177 if (err) 1178 dsl_dir_close(dd, FTAG); 1179 out: /* every exit path, success or failure, gives up the ownership of ds held under tag */ 1180 dsl_dataset_disown(ds, tag); 1181 return (err); 1182 } 1183 /* Roll back an owned dataset: take exclusive access on behalf of the current owner, run dsl_dataset_rollback_check / dsl_dataset_rollback_sync as one sync task (ost is handed to them by address as arg2), then drop exclusive access and wake waiters on ds_exclusive_cv. Returns the sync task result. */ 1184 int 1185 dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) 1186 { 1187 int err; 1188 1189 ASSERT(ds->ds_owner); 1190 1191 dsl_dataset_make_exclusive(ds, ds->ds_owner); 1192 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1193 dsl_dataset_rollback_check, dsl_dataset_rollback_sync, 1194 ds, &ost, 0); 1195 /* drop exclusive access */ 1196 mutex_enter(&ds->ds_lock); 1197 rw_exit(&ds->ds_rwlock); 1198 cv_broadcast(&ds->ds_exclusive_cv); 1199 mutex_exit(&ds->ds_lock); 1200 return (err); 1201 } 1202 /* Install p and its evict callback, but only when no user pointer is set yet; the test and the store happen under ds_lock. Returns the previous pointer, so NULL means p was installed. */ 1203 void * 1204 dsl_dataset_set_user_ptr(dsl_dataset_t *ds, 1205 void *p, dsl_dataset_evict_func_t func) 1206 { 1207 void *old; 1208 1209 mutex_enter(&ds->ds_lock); 1210 old = ds->ds_user_ptr; 1211 if (old == NULL) { 1212 ds->ds_user_ptr = p; 1213 ds->ds_user_evict_func = func; 1214 } 1215 mutex_exit(&ds->ds_lock); 1216 return (old); 1217 } 1218 /* Return the current user pointer; ds_lock is not taken here. */ 1219 void * 1220 dsl_dataset_get_user_ptr(dsl_dataset_t *ds) 1221 { 1222 return (ds->ds_user_ptr); 1223 } 1224 /* Return the address of the block pointer stored in ds_phys (not a copy). */ 1225 blkptr_t * 1226 dsl_dataset_get_blkptr(dsl_dataset_t *ds) 1227 { 1228 return (&ds->ds_phys->ds_bp); 1229 } 1230 /* Syncing context only (asserted). Copies *bp into the pool's dp_meta_rootbp when ds is NULL, otherwise dirties the dataset dbuf in tx and copies *bp into ds_phys->ds_bp. */ 1231 void 1232 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) 1233 { 1234 ASSERT(dmu_tx_is_syncing(tx)); 1235 /* If it's the meta-objset, set dp_meta_rootbp */ 1236 if (ds == NULL) { 1237 tx->tx_pool->dp_meta_rootbp = *bp; 1238 } else { 1239 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1240 ds->ds_phys->ds_bp = *bp; 1241 } 1242 } 1243 /* Return the spa reached through the dataset's dsl_dir and its pool. */ 1244 spa_t * 1245 dsl_dataset_get_spa(dsl_dataset_t *ds) 1246 { 1247 return (ds->ds_dir->dd_pool->dp_spa);
1248 } 1249 /* Put ds on the pool's dp_dirty_datasets list for tx's txg. A NULL ds (the meta-objset) is ignored, and dirtying a snapshot panics. An extra reference on the dataset dbuf is taken only when txg_list_add() returns 0. */ 1250 void 1251 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) 1252 { 1253 dsl_pool_t *dp; 1254 1255 if (ds == NULL) /* this is the meta-objset */ 1256 return; 1257 1258 ASSERT(ds->ds_user_ptr != NULL); 1259 1260 if (ds->ds_phys->ds_next_snap_obj != 0) 1261 panic("dirtying snapshot!"); 1262 1263 dp = ds->ds_dir->dd_pool; 1264 1265 if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { 1266 /* up the hold count until we can be written out */ 1267 dmu_buf_add_ref(ds->ds_dbuf, ds); 1268 } 1269 } 1270 1271 /* 1272 * The unique space in the head dataset can be calculated by subtracting 1273 * the space used in the most recent snapshot, that is still being used 1274 * in this file system, from the space currently in use. To figure out 1275 * the space in the most recent snapshot still in use, we need to take 1276 * the total space used in the snapshot and subtract out the space that 1277 * has been freed up since the snapshot was taken. 1278 */ 1279 static void 1280 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) 1281 { 1282 uint64_t mrs_used; 1283 uint64_t dlused, dlcomp, dluncomp; 1284 1285 ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); 1286 /* mrs_used: bytes used by the most recent snapshot, 0 when there is none */ 1287 if (ds->ds_phys->ds_prev_snap_obj != 0) 1288 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; 1289 else 1290 mrs_used = 0; 1291 /* only dlused is consumed below; the compressed and uncompressed totals are ignored */ 1292 VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, 1293 &dluncomp)); 1294 1295 ASSERT3U(dlused, <=, mrs_used); 1296 ds->ds_phys->ds_unique_bytes = 1297 ds->ds_phys->ds_used_bytes - (mrs_used - dlused); 1298 /* flag the freshly computed value as accurate when the pool version allows it */ 1299 if (!DS_UNIQUE_IS_ACCURATE(ds) && 1300 spa_version(ds->ds_dir->dd_pool->dp_spa) >= 1301 SPA_VERSION_UNIQUE_ACCURATE) 1302 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 1303 } 1304 /* Return ds_unique_bytes, first recomputing it for a non-snapshot dataset whose value is not flagged accurate. */ 1305 static uint64_t 1306 dsl_dataset_unique(dsl_dataset_t *ds) 1307 { 1308 if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) 1309 dsl_dataset_recalc_head_uniq(ds); 1310 1311 return (ds->ds_phys->ds_unique_bytes); 1312 } 1313 1314 struct
killarg { /* context handed to kill_blkptr() through the arg pointer of traverse_dataset() */ 1315 dsl_dataset_t *ds; 1316 zio_t *zio; 1317 dmu_tx_t *tx; 1318 }; 1319 /* Per-block callback given to traverse_dataset(); arg is a struct killarg. A NULL bp is skipped. Blocks that match the intent-log test are freed directly with dsl_free(); all others go through dsl_dataset_block_kill(). Always returns 0. */ 1320 /* ARGSUSED */ 1321 static int 1322 kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, 1323 const dnode_phys_t *dnp, void *arg) 1324 { 1325 struct killarg *ka = arg; 1326 1327 if (bp == NULL) 1328 return (0); 1329 1330 if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) || 1331 (zb->zb_object != 0 && dnp == NULL)) { 1332 /* 1333 * It's a block in the intent log. It has no 1334 * accounting, so just free it. 1335 */ 1336 VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool, 1337 ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT)); 1338 } else { 1339 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); 1340 (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); 1341 } 1342 1343 return (0); 1344 } 1345 /* Check function for the rollback sync task: arg1 is the dataset, arg2 points at the objset type. Returns EINVAL or EAGAIN as explained at each test, 0 when the rollback may proceed. */ 1346 /* ARGSUSED */ 1347 static int 1348 dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) 1349 { 1350 dsl_dataset_t *ds = arg1; 1351 dmu_objset_type_t *ost = arg2; 1352 1353 /* 1354 * We can only roll back to emptiness if it is a ZPL objset. 1355 */ 1356 if (*ost != DMU_OST_ZFS && 1357 ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) 1358 return (EINVAL); 1359 1360 /* 1361 * This must not be a snapshot. 1362 */ 1363 if (ds->ds_phys->ds_next_snap_obj != 0) 1364 return (EINVAL); 1365 1366 /* 1367 * If we made changes this txg, traverse_dataset won't find 1368 * them. Try again.
1369 */ 1370 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) 1371 return (EAGAIN); 1372 1373 return (0); 1374 } 1375 /* Sync function for the rollback sync task (arg1 is the dataset, arg2 points at the objset type). In order: evict the user pointer, move space freed since the last snapshot back to the head, replace the deadlist with a new empty one, free the blocks born after the previous snapshot, then either copy the previous snapshot's contents into the head or recreate an empty objset. */ 1376 /* ARGSUSED */ 1377 static void 1378 dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 1379 { 1380 dsl_dataset_t *ds = arg1; 1381 dmu_objset_type_t *ost = arg2; 1382 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1383 1384 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1385 1386 if (ds->ds_user_ptr != NULL) { 1387 /* 1388 * We need to make sure that the objset_impl_t is reopened after 1389 * we do the rollback, otherwise it will have the wrong 1390 * objset_phys_t. Normally this would happen when this 1391 * dataset-open is closed, thus causing the 1392 * dataset to be immediately evicted. But when doing "zfs recv 1393 * -F", we reopen the objset before that, so that there is no 1394 * window where the dataset is closed and inconsistent. 1395 */ 1396 ds->ds_user_evict_func(ds, ds->ds_user_ptr); 1397 ds->ds_user_ptr = NULL; 1398 } 1399 1400 /* Transfer space that was freed since last snap back to the head. */ 1401 { 1402 uint64_t used; 1403 1404 VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, 1405 ds->ds_origin_txg, UINT64_MAX, &used)); 1406 dsl_dir_transfer_space(ds->ds_dir, used, 1407 DD_USED_SNAP, DD_USED_HEAD, tx); 1408 } 1409 1410 /* Zero out the deadlist. */ /* done by destroying the old bplist object and opening a newly created one */ 1411 bplist_close(&ds->ds_deadlist); 1412 bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); 1413 ds->ds_phys->ds_deadlist_obj = 1414 bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); 1415 VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, 1416 ds->ds_phys->ds_deadlist_obj)); 1417 1418 { 1419 /* 1420 * Free blkptrs that we gave birth to - this covers 1421 * claimed but not played log blocks too.
1422 */ 1423 zio_t *zio; 1424 struct killarg ka; 1425 1426 zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, 1427 ZIO_FLAG_MUSTSUCCEED); 1428 ka.ds = ds; 1429 ka.zio = zio; 1430 ka.tx = tx; 1431 /* NOTE(review): the traverse_dataset() and zio_wait() results are discarded here - confirm that is intended */ (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, 1432 TRAVERSE_POST, kill_blkptr, &ka); 1433 (void) zio_wait(zio); 1434 } 1435 1436 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); 1437 1438 if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { 1439 /* Change our contents to that of the prev snapshot */ 1440 1441 ASSERT3U(ds->ds_prev->ds_object, ==, 1442 ds->ds_phys->ds_prev_snap_obj); 1443 ASSERT3U(ds->ds_phys->ds_used_bytes, <=, 1444 ds->ds_prev->ds_phys->ds_used_bytes); 1445 1446 ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; 1447 ds->ds_phys->ds_used_bytes = 1448 ds->ds_prev->ds_phys->ds_used_bytes; 1449 ds->ds_phys->ds_compressed_bytes = 1450 ds->ds_prev->ds_phys->ds_compressed_bytes; 1451 ds->ds_phys->ds_uncompressed_bytes = 1452 ds->ds_prev->ds_phys->ds_uncompressed_bytes; 1453 ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; 1454 /* when the head directly follows that snapshot, the snapshot no longer has any unique data */ 1455 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 1456 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 1457 ds->ds_prev->ds_phys->ds_unique_bytes = 0; 1458 } 1459 } else { /* no previous snapshot, or it is the pool's origin snap: start over with an empty objset of type *ost */ 1460 objset_impl_t *osi; 1461 1462 ASSERT(*ost != DMU_OST_ZVOL); 1463 ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); 1464 ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); 1465 ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); 1466 1467 bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); 1468 ds->ds_phys->ds_flags = 0; 1469 ds->ds_phys->ds_unique_bytes = 0; 1470 if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= 1471 SPA_VERSION_UNIQUE_ACCURATE) 1472 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 1473 1474 osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, 1475 &ds->ds_phys->ds_bp, *ost, tx); 1476 #ifdef _KERNEL 1477 zfs_create_fs(&osi->os, kcred, NULL, tx); 1478 #endif 1479 } 1480 1481
spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, 1482 tx, cr, "dataset = %llu", ds->ds_object); 1483 } 1484 /* Check function for the first phase of a head-dataset destroy (paired with dsl_dataset_destroy_begin_sync). Returns EINVAL when the head still has snapshots of its own, the zap_count() error if counting child dirs fails, EEXIST when the dsl_dir has child dirs, otherwise 0. */ 1485 /* ARGSUSED */ 1486 static int 1487 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) 1488 { 1489 dsl_dataset_t *ds = arg1; 1490 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1491 uint64_t count; 1492 int err; 1493 1494 /* 1495 * Can't delete a head dataset if there are snapshots of it. 1496 * (Except if the only snapshots are from the branch we cloned 1497 * from.) 1498 */ 1499 if (ds->ds_prev != NULL && 1500 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1501 return (EINVAL); 1502 1503 /* 1504 * This is really a dsl_dir thing, but check it here so that 1505 * we'll be less likely to leave this dataset inconsistent & 1506 * nearly destroyed. 1507 */ 1508 err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); 1509 if (err) 1510 return (err); 1511 if (count != 0) 1512 return (EEXIST); 1513 1514 return (0); 1515 } 1516 /* Sync function for the first destroy phase: sets DS_FLAG_INCONSISTENT in the on-disk flags and records LOG_DS_DESTROY_BEGIN in the pool history. */ 1517 /* ARGSUSED */ 1518 static void 1519 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 1520 { 1521 dsl_dataset_t *ds = arg1; 1522 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1523 1524 /* Mark it as inconsistent on-disk, in case we crash */ 1525 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1526 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; 1527 1528 spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, 1529 cr, "dataset = %llu", ds->ds_object); 1530 } 1531 /* Clone-specific part of the destroy check. When the origin might be destroyed together with the clone, the caller must already have set up dsda->rm_origin (otherwise need_prep is set and EBUSY returned) and the destroy check is run on the origin itself; when it will not be destroyed, any rm_origin set up earlier is disowned and cleared. */ 1532 static int 1533 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, 1534 dmu_tx_t *tx) 1535 { 1536 dsl_dataset_t *ds = dsda->ds; 1537 dsl_dataset_t *ds_prev = ds->ds_prev; 1538 1539 if (dsl_dataset_might_destroy_origin(ds_prev)) { 1540 struct dsl_ds_destroyarg ndsda = {0}; 1541 1542 /* 1543 * If we're not prepared to remove the origin, don't remove 1544 * the clone either.
1545 */ 1546 if (dsda->rm_origin == NULL) { 1547 dsda->need_prep = B_TRUE; 1548 return (EBUSY); 1549 } 1550 /* run the ordinary destroy check on the origin, flagged so that one remaining clone is tolerated */ 1551 ndsda.ds = ds_prev; 1552 ndsda.is_origin_rm = B_TRUE; 1553 return (dsl_dataset_destroy_check(&ndsda, tag, tx)); 1554 } 1555 1556 /* 1557 * If we're not going to remove the origin after all, 1558 * undo the open context setup. 1559 */ 1560 if (dsda->rm_origin != NULL) { 1561 dsl_dataset_disown(dsda->rm_origin, tag); 1562 dsda->rm_origin = NULL; 1563 } 1564 1565 return (0); 1566 } 1567 /* Check function for the destroy sync task. arg1 is a struct dsl_ds_destroyarg; arg2 is only used as the tag forwarded to dsl_dataset_origin_check() for clones. Returns 0 when the destroy may proceed, otherwise ENOTSUP, EINVAL, EAGAIN, EBUSY or EEXIST as explained at each test. */ 1568 /* ARGSUSED */ 1569 int 1570 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) 1571 { 1572 struct dsl_ds_destroyarg *dsda = arg1; 1573 dsl_dataset_t *ds = dsda->ds; 1574 1575 /* we have an owner hold, so no one else can destroy us */ 1576 ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); 1577 1578 /* 1579 * Only allow deferred destroy on pools that support it. 1580 * NOTE: deferred destroy is only supported on snapshots. 1581 */ /* a deferred request skips every other test below */ 1582 if (dsda->defer) { 1583 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 1584 SPA_VERSION_USERREFS) 1585 return (ENOTSUP); 1586 ASSERT(dsl_dataset_is_snapshot(ds)); 1587 return (0); 1588 } 1589 1590 /* 1591 * Can't delete a head dataset if there are snapshots of it. 1592 * (Except if the only snapshots are from the branch we cloned 1593 * from.) 1594 */ 1595 if (ds->ds_prev != NULL && 1596 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1597 return (EINVAL); 1598 1599 /* 1600 * If we made changes this txg, traverse_dataset won't find 1601 * them. Try again. 1602 */ 1603 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) 1604 return (EAGAIN); 1605 1606 if (dsl_dataset_is_snapshot(ds)) { 1607 /* 1608 * If this snapshot has an elevated user reference count, 1609 * we can't destroy it yet. 1610 */ 1611 if (ds->ds_userrefs > 0 && !dsda->releasing) 1612 return (EBUSY); 1613 1614 mutex_enter(&ds->ds_lock); 1615 /* 1616 * Can't delete a branch point.
However, if we're destroying 1617 * a clone and removing its origin due to it having a user 1618 * hold count of 0 and having been marked for deferred destroy, 1619 * it's OK for the origin to have a single clone. 1620 */ 1621 if (ds->ds_phys->ds_num_children > 1622 (dsda->is_origin_rm ? 2 : 1)) { 1623 mutex_exit(&ds->ds_lock); 1624 return (EEXIST); 1625 } 1626 mutex_exit(&ds->ds_lock); 1627 } else if (dsl_dir_is_clone(ds->ds_dir)) { 1628 return (dsl_dataset_origin_check(dsda, arg2, tx)); 1629 } 1630 1631 /* XXX we should do some i/o error checking... */ 1632 return (0); 1633 } 1634 /* Rendezvous state shared by dsl_dataset_drain_refs() and its callback: gone is set under lock and announced on cv. */ 1635 struct refsarg { 1636 kmutex_t lock; 1637 boolean_t gone; 1638 kcondvar_t cv; 1639 }; 1640 /* Callback registered by dsl_dataset_drain_refs(); argv is a struct refsarg. Marks it gone and signals the waiter. */ 1641 /* ARGSUSED */ 1642 static void 1643 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) 1644 { 1645 struct refsarg *arg = argv; 1646 1647 mutex_enter(&arg->lock); 1648 arg->gone = TRUE; 1649 cv_signal(&arg->cv); 1650 mutex_exit(&arg->lock); 1651 } 1652 /* Register a stack-local refsarg and dsl_dataset_refs_gone() on the dataset dbuf through dmu_buf_update_user(), release the hold identified by tag, and block until the callback has run. On return ds_dbuf and ds_phys are NULL. NOTE(review): presumably the callback fires once the last hold on the dbuf is dropped - confirm against dmu_buf_update_user(). */ 1653 static void 1654 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) 1655 { 1656 struct refsarg arg; 1657 1658 mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); 1659 cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); 1660 arg.gone = FALSE; 1661 (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, 1662 dsl_dataset_refs_gone); 1663 dmu_buf_rele(ds->ds_dbuf, tag); 1664 mutex_enter(&arg.lock); 1665 while (!arg.gone) 1666 cv_wait(&arg.cv, &arg.lock); 1667 ASSERT(arg.gone); 1668 mutex_exit(&arg.lock); 1669 ds->ds_dbuf = NULL; 1670 ds->ds_phys = NULL; 1671 mutex_destroy(&arg.lock); 1672 cv_destroy(&arg.cv); 1673 } 1674 /* Sync function for the destroy sync task: arg1 is a struct dsl_ds_destroyarg naming the dataset, tag is the hold released when the dataset references are drained. With dsda->defer set and user holds or clones still present it only records DS_FLAG_DEFER_DESTROY. Calls itself once more for dsda->rm_origin after a clone has been destroyed. */ 1675 void 1676 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) 1677 { 1678 struct dsl_ds_destroyarg *dsda = arg1; 1679 dsl_dataset_t *ds = dsda->ds; 1680 zio_t *zio; 1681 int err; 1682 int after_branch_point = FALSE; 1683 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1684 objset_t *mos = dp->dp_meta_objset; 1685 dsl_dataset_t *ds_prev = NULL; 1686 uint64_t obj; 1687 1688 ASSERT(ds->ds_owner); 1689
ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); 1690 ASSERT(ds->ds_prev == NULL || 1691 ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); 1692 ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); 1693 1694 if (dsda->defer) { /* deferred request: while user holds or clones remain, only record the request on disk and return */ 1695 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 1696 if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { 1697 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1698 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; 1699 return; 1700 } 1701 } 1702 1703 /* signal any waiters that this dataset is going away */ 1704 mutex_enter(&ds->ds_lock); 1705 ds->ds_owner = dsl_reaper; 1706 cv_broadcast(&ds->ds_exclusive_cv); 1707 mutex_exit(&ds->ds_lock); 1708 1709 /* Remove our reservation */ 1710 if (ds->ds_reserved != 0) { 1711 uint64_t val = 0; 1712 dsl_dataset_set_reservation_sync(ds, &val, cr, tx); 1713 ASSERT3U(ds->ds_reserved, ==, 0); 1714 } 1715 1716 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 1717 1718 dsl_pool_ds_destroyed(ds, tx); 1719 /* keep the object number in a local; it is still needed after ds_phys has been torn down */ 1720 obj = ds->ds_object; 1721 1722 if (ds->ds_phys->ds_prev_snap_obj != 0) { /* use the cached ds_prev when present, otherwise take a temporary FTAG hold on the previous snapshot */ 1723 if (ds->ds_prev) { 1724 ds_prev = ds->ds_prev; 1725 } else { 1726 VERIFY(0 == dsl_dataset_hold_obj(dp, 1727 ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); 1728 } /* true when the previous snapshot's next pointer names some other dataset than this one */ 1729 after_branch_point = 1730 (ds_prev->ds_phys->ds_next_snap_obj != obj); 1731 1732 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); 1733 if (after_branch_point && 1734 ds_prev->ds_phys->ds_next_clones_obj != 0) { 1735 VERIFY3U(0, ==, zap_remove_int(mos, 1736 ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); 1737 if (ds->ds_phys->ds_next_snap_obj != 0) { 1738 VERIFY(0 == zap_add_int(mos, 1739 ds_prev->ds_phys->ds_next_clones_obj, 1740 ds->ds_phys->ds_next_snap_obj, tx)); 1741 } 1742 } 1743 if (after_branch_point && 1744 ds->ds_phys->ds_next_snap_obj == 0) { 1745 /* This clone is toast.
*/ 1746 ASSERT(ds_prev->ds_phys->ds_num_children > 1); 1747 ds_prev->ds_phys->ds_num_children--; 1748 1749 /* 1750 * If the clone's origin has no other clones, no 1751 * user holds, and has been marked for deferred 1752 * deletion, then we should have done the necessary 1753 * destroy setup for it. 1754 */ 1755 if (ds_prev->ds_phys->ds_num_children == 1 && 1756 ds_prev->ds_userrefs == 0 && 1757 DS_IS_DEFER_DESTROY(ds_prev)) { 1758 ASSERT3P(dsda->rm_origin, !=, NULL); 1759 } else { 1760 ASSERT3P(dsda->rm_origin, ==, NULL); 1761 } 1762 } else if (!after_branch_point) { /* unlink this dataset from the chain: prev now points at our next */ 1763 ds_prev->ds_phys->ds_next_snap_obj = 1764 ds->ds_phys->ds_next_snap_obj; 1765 } 1766 } 1767 /* parent zio for the frees issued in either branch below; it is waited on after the branches */ 1768 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 1769 1770 if (ds->ds_phys->ds_next_snap_obj != 0) { /* there is a next dataset in the chain: merge the deadlists and fix up its accounting */ 1771 blkptr_t bp; 1772 dsl_dataset_t *ds_next; 1773 uint64_t itor = 0; 1774 uint64_t old_unique; 1775 int64_t used = 0, compressed = 0, uncompressed = 0; 1776 1777 VERIFY(0 == dsl_dataset_hold_obj(dp, 1778 ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); 1779 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); 1780 1781 old_unique = dsl_dataset_unique(ds_next); 1782 1783 dmu_buf_will_dirty(ds_next->ds_dbuf, tx); 1784 ds_next->ds_phys->ds_prev_snap_obj = 1785 ds->ds_phys->ds_prev_snap_obj; 1786 ds_next->ds_phys->ds_prev_snap_txg = 1787 ds->ds_phys->ds_prev_snap_txg; 1788 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 1789 ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); 1790 1791 /* 1792 * Transfer to our deadlist (which will become next's 1793 * new deadlist) any entries from next's current 1794 * deadlist which were born before prev, and free the 1795 * other entries.
1796 * 1797 * XXX we're doing this long task with the config lock held 1798 */ 1799 while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { 1800 if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { 1801 VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, 1802 &bp, tx)); 1803 if (ds_prev && !after_branch_point && 1804 bp.blk_birth > 1805 ds_prev->ds_phys->ds_prev_snap_txg) { 1806 ds_prev->ds_phys->ds_unique_bytes += 1807 bp_get_dasize(dp->dp_spa, &bp); 1808 } 1809 } else { 1810 used += bp_get_dasize(dp->dp_spa, &bp); 1811 compressed += BP_GET_PSIZE(&bp); 1812 uncompressed += BP_GET_UCSIZE(&bp); 1813 /* XXX check return value? */ 1814 (void) dsl_free(zio, dp, tx->tx_txg, 1815 &bp, NULL, NULL, ARC_NOWAIT); 1816 } 1817 } 1818 /* the space freed by the loop is expected to equal this snapshot's unique bytes */ 1819 ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); 1820 1821 /* change snapused */ 1822 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, 1823 -used, -compressed, -uncompressed, tx); 1824 1825 /* free next's deadlist */ 1826 bplist_close(&ds_next->ds_deadlist); 1827 bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); 1828 1829 /* set next's deadlist to our deadlist */ 1830 bplist_close(&ds->ds_deadlist); 1831 ds_next->ds_phys->ds_deadlist_obj = 1832 ds->ds_phys->ds_deadlist_obj; 1833 VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, 1834 ds_next->ds_phys->ds_deadlist_obj)); 1835 ds->ds_phys->ds_deadlist_obj = 0; 1836 1837 if (ds_next->ds_phys->ds_next_snap_obj != 0) { 1838 /* 1839 * Update next's unique to include blocks which 1840 * were previously shared by only this snapshot 1841 * and it. Those blocks will be born after the 1842 * prev snap and before this snap, and will have 1843 * died after the next snap and before the one 1844 * after that (ie. be on the snap after next's 1845 * deadlist).
1846 * 1847 * XXX we're doing this long task with the 1848 * config lock held 1849 */ 1850 dsl_dataset_t *ds_after_next; 1851 uint64_t space; 1852 1853 VERIFY(0 == dsl_dataset_hold_obj(dp, 1854 ds_next->ds_phys->ds_next_snap_obj, 1855 FTAG, &ds_after_next)); 1856 1857 VERIFY(0 == 1858 bplist_space_birthrange(&ds_after_next->ds_deadlist, 1859 ds->ds_phys->ds_prev_snap_txg, 1860 ds->ds_phys->ds_creation_txg, &space)); 1861 ds_next->ds_phys->ds_unique_bytes += space; 1862 1863 dsl_dataset_rele(ds_after_next, FTAG); 1864 ASSERT3P(ds_next->ds_prev, ==, NULL); 1865 } else { /* next has no next of its own: move its in-core ds_prev reference from us to our previous snapshot, if any */ 1866 ASSERT3P(ds_next->ds_prev, ==, ds); 1867 dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); 1868 ds_next->ds_prev = NULL; 1869 if (ds_prev) { 1870 VERIFY(0 == dsl_dataset_get_ref(dp, 1871 ds->ds_phys->ds_prev_snap_obj, 1872 ds_next, &ds_next->ds_prev)); 1873 } 1874 1875 dsl_dataset_recalc_head_uniq(ds_next); 1876 1877 /* 1878 * Reduce the amount of our unconsumed refreservation 1879 * being charged to our parent by the amount of 1880 * new unique data we have gained. 1881 */ 1882 if (old_unique < ds_next->ds_reserved) { 1883 int64_t mrsdelta; 1884 uint64_t new_unique = 1885 ds_next->ds_phys->ds_unique_bytes; 1886 1887 ASSERT(old_unique <= new_unique); 1888 mrsdelta = MIN(new_unique - old_unique, 1889 ds_next->ds_reserved - old_unique); 1890 dsl_dir_diduse_space(ds->ds_dir, 1891 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); 1892 } 1893 } 1894 dsl_dataset_rele(ds_next, FTAG); 1895 } else { 1896 /* 1897 * There's no next snapshot, so this is a head dataset. 1898 * Destroy the deadlist. Unless it's a clone, the 1899 * deadlist should be empty. (If it's a clone, it's 1900 * safe to ignore the deadlist contents.)
1901 */ 1902 struct killarg ka; 1903 1904 ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); 1905 bplist_close(&ds->ds_deadlist); 1906 bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); 1907 ds->ds_phys->ds_deadlist_obj = 0; 1908 1909 /* 1910 * Free everything that we point to (that's born after 1911 * the previous snapshot, if we are a clone) 1912 * 1913 * NB: this should be very quick, because we already 1914 * freed all the objects in open context. 1915 */ 1916 ka.ds = ds; 1917 ka.zio = zio; 1918 ka.tx = tx; 1919 err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, 1920 TRAVERSE_POST, kill_blkptr, &ka); 1921 ASSERT3U(err, ==, 0); 1922 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || 1923 ds->ds_phys->ds_unique_bytes == 0); 1924 } 1925 /* wait for every free issued under the root zio above */ 1926 err = zio_wait(zio); 1927 ASSERT3U(err, ==, 0); 1928 1929 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { 1930 /* Erase the link in the dir */ 1931 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1932 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; 1933 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); 1934 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); 1935 ASSERT(err == 0); 1936 } else { 1937 /* remove from snapshot namespace */ 1938 dsl_dataset_t *ds_head; 1939 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); 1940 VERIFY(0 == dsl_dataset_hold_obj(dp, 1941 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); 1942 VERIFY(0 == dsl_dataset_get_snapname(ds)); 1943 #ifdef ZFS_DEBUG 1944 { 1945 uint64_t val; 1946 1947 err = dsl_dataset_snap_lookup(ds_head, 1948 ds->ds_snapname, &val); 1949 ASSERT3U(err, ==, 0); 1950 ASSERT3U(val, ==, obj); 1951 } 1952 #endif 1953 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); 1954 ASSERT(err == 0); 1955 dsl_dataset_rele(ds_head, FTAG); 1956 } 1957 /* release the temporary FTAG hold on the previous snapshot, if one had to be taken earlier */ 1958 if (ds_prev && ds->ds_prev != ds_prev) 1959 dsl_dataset_rele(ds_prev, FTAG); 1960 1961 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); 1962 spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa,
tx, 1963 cr, "dataset = %llu", ds->ds_object); 1964 /* free the auxiliary objects that hang off the dataset, then the dataset object itself */ 1965 if (ds->ds_phys->ds_next_clones_obj != 0) { 1966 uint64_t count; 1967 ASSERT(0 == zap_count(mos, 1968 ds->ds_phys->ds_next_clones_obj, &count) && count == 0); 1969 VERIFY(0 == dmu_object_free(mos, 1970 ds->ds_phys->ds_next_clones_obj, tx)); 1971 } 1972 if (ds->ds_phys->ds_props_obj != 0) 1973 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); 1974 if (ds->ds_phys->ds_userrefs_obj != 0) 1975 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); 1976 dsl_dir_close(ds->ds_dir, ds); 1977 ds->ds_dir = NULL; 1978 dsl_dataset_drain_refs(ds, tag); 1979 /* ds_phys is NULL after the drain, so the object number saved in obj is used */ VERIFY(0 == dmu_object_free(mos, obj, tx)); 1980 1981 if (dsda->rm_origin) { 1982 /* 1983 * Remove the origin of the clone we just destroyed. 1984 */ 1985 dsl_dataset_t *origin = ds->ds_prev; 1986 struct dsl_ds_destroyarg ndsda = {0}; 1987 1988 ASSERT3P(origin, ==, dsda->rm_origin); 1989 if (origin->ds_user_ptr) { 1990 origin->ds_user_evict_func(origin, origin->ds_user_ptr); 1991 origin->ds_user_ptr = NULL; 1992 } 1993 1994 dsl_dataset_rele(origin, tag); 1995 ds->ds_prev = NULL; 1996 /* the nested call runs with a zeroed argument block: no defer and no further rm_origin */ 1997 ndsda.ds = origin; 1998 dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); 1999 } 2000 } 2001 /* Helper for the snapshot check. Outside syncing context it does nothing and returns 0. In syncing context it returns ENOSPC when MIN(unique bytes, refreservation) exceeds the space available to the dsl_dir, and otherwise announces that amount with dsl_dir_willuse_space(). */ 2002 static int 2003 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) 2004 { 2005 uint64_t asize; 2006 2007 if (!dmu_tx_is_syncing(tx)) 2008 return (0); 2009 2010 /* 2011 * If there's an fs-only reservation, any blocks that might become 2012 * owned by the snapshot dataset must be accommodated by space 2013 * outside of the reservation. 2014 */ 2015 asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); 2016 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) 2017 return (ENOSPC); 2018 2019 /* 2020 * Propagate any reserved space for this snapshot to other 2021 * snapshot checks in this sync group.
2022 */ 2023 if (asize > 0) 2024 dsl_dir_willuse_space(ds->ds_dir, asize, tx); 2025 2026 return (0); 2027 } 2028 /* Check function for taking a snapshot: arg1 is the dataset, arg2 the snapshot name. Returns EAGAIN, EEXIST, a lookup error, ENAMETOOLONG or a space error as explained at each test. On success it records tx's txg in ds_trysnap_txg and returns 0. */ 2029 /* ARGSUSED */ 2030 int 2031 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) 2032 { 2033 dsl_dataset_t *ds = arg1; 2034 const char *snapname = arg2; 2035 int err; 2036 uint64_t value; 2037 2038 /* 2039 * We don't allow multiple snapshots of the same txg. If there 2040 * is already one, try again. 2041 */ 2042 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) 2043 return (EAGAIN); 2044 2045 /* 2046 * Check for a conflicting snapshot name. 2047 */ 2048 err = dsl_dataset_snap_lookup(ds, snapname, &value); 2049 if (err == 0) 2050 return (EEXIST); 2051 if (err != ENOENT) 2052 return (err); 2053 2054 /* 2055 * Check that the dataset's name is not too long. Name consists 2056 * of the dataset's length + 1 for the @-sign + snapshot name's length 2057 */ 2058 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 2059 return (ENAMETOOLONG); 2060 2061 err = dsl_dataset_snapshot_reserve_space(ds, tx); 2062 if (err) 2063 return (err); 2064 2065 ds->ds_trysnap_txg = tx->tx_txg; 2066 return (0); 2067 } 2068 /* Sync function for taking a snapshot: arg1 is the dataset being snapshotted, arg2 the snapshot name. Allocates a new dataset object in the meta-objset, fills it from the head's current state, links it into the snapshot chain and the head's snapnames zap, and resets the head's deadlist and unique-bytes accounting. */ 2069 void 2070 dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 2071 { 2072 dsl_dataset_t *ds = arg1; 2073 const char *snapname = arg2; 2074 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2075 dmu_buf_t *dbuf; 2076 dsl_dataset_phys_t *dsphys; 2077 uint64_t dsobj, crtxg; 2078 objset_t *mos = dp->dp_meta_objset; 2079 int err; 2080 2081 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 2082 2083 /* 2084 * The origin's ds_creation_txg has to be < TXG_INITIAL 2085 */ 2086 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 2087 crtxg = 1; 2088 else 2089 crtxg = tx->tx_txg; 2090 2091 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 2092 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 2093 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 2094 dmu_buf_will_dirty(dbuf, tx); 2095 dsphys = dbuf->db_data; 2096
bzero(dsphys, sizeof (dsl_dataset_phys_t)); /* The new snapshot is placed between the head's previous snapshot and the head itself, and takes over the head's current deadlist object, space counters, flags and root block pointer. */ 2097 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 2098 dsphys->ds_fsid_guid = unique_create(); 2099 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 2100 sizeof (dsphys->ds_guid)); 2101 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 2102 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 2103 dsphys->ds_next_snap_obj = ds->ds_object; 2104 dsphys->ds_num_children = 1; 2105 dsphys->ds_creation_time = gethrestime_sec(); 2106 dsphys->ds_creation_txg = crtxg; 2107 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 2108 dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; 2109 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 2110 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 2111 dsphys->ds_flags = ds->ds_phys->ds_flags; 2112 dsphys->ds_bp = ds->ds_phys->ds_bp; 2113 dmu_buf_rele(dbuf, FTAG); 2114 2115 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 2116 if (ds->ds_prev) { 2117 uint64_t next_clones_obj = 2118 ds->ds_prev->ds_phys->ds_next_clones_obj; 2119 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == 2120 ds->ds_object || 2121 ds->ds_prev->ds_phys->ds_num_children > 1); 2122 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 2123 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2124 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 2125 ds->ds_prev->ds_phys->ds_creation_txg); 2126 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; 2127 } else if (next_clones_obj != 0) { /* NOTE(review): dsphys is read here after dmu_buf_rele(dbuf, FTAG) above; the field read was set to ds->ds_object - confirm the buffer is still valid at this point */ 2128 VERIFY3U(0, ==, zap_remove_int(mos, 2129 next_clones_obj, dsphys->ds_next_snap_obj, tx)); 2130 VERIFY3U(0, ==, zap_add_int(mos, 2131 next_clones_obj, dsobj, tx)); 2132 } 2133 } 2134 2135 /* 2136 * If we have a reference-reservation on this dataset, we will 2137 * need to increase the amount of refreservation being charged 2138 * since our unique space is going to zero.
2139 */ 2140 if (ds->ds_reserved) { 2141 int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); 2142 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, 2143 add, 0, 0, tx); 2144 } 2145 /* the head continues with the new snapshot as its previous snapshot, zero unique bytes and a newly created deadlist */ 2146 bplist_close(&ds->ds_deadlist); 2147 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2148 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); 2149 ds->ds_phys->ds_prev_snap_obj = dsobj; 2150 ds->ds_phys->ds_prev_snap_txg = crtxg; 2151 ds->ds_phys->ds_unique_bytes = 0; 2152 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 2153 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 2154 ds->ds_phys->ds_deadlist_obj = 2155 bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); 2156 VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, 2157 ds->ds_phys->ds_deadlist_obj)); 2158 2159 dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); 2160 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, 2161 snapname, 8, 1, &dsobj, tx); 2162 ASSERT(err == 0); 2163 /* move the in-core ds_prev reference over to the snapshot that was just created */ 2164 if (ds->ds_prev) 2165 dsl_dataset_drop_ref(ds->ds_prev, ds); 2166 VERIFY(0 == dsl_dataset_get_ref(dp, 2167 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 2168 2169 dsl_pool_ds_snapshotted(ds, tx); 2170 2171 spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, 2172 "dataset = %llu", dsobj); 2173 } 2174 /* Syncing context only (asserted), for a non-snapshot dataset that has a user pointer: writes the in-core ds_fsid_guid back to ds_phys, dirties the dsl_dir, and hands ds_user_ptr to dmu_objset_sync() together with zio and tx. */ 2175 void 2176 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) 2177 { 2178 ASSERT(dmu_tx_is_syncing(tx)); 2179 ASSERT(ds->ds_user_ptr != NULL); 2180 ASSERT(ds->ds_phys->ds_next_snap_obj == 0); 2181 2182 /* 2183 * in case we had to change ds_fsid_guid when we opened it, 2184 * sync it out now.
2185 */ 2186 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2187 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; 2188 2189 dsl_dir_dirty(ds->ds_dir, tx); 2190 dmu_objset_sync(ds->ds_user_ptr, zio, tx); 2191 } 2192 /* Add this dataset's properties to nv. The dsl_dir's stats are added first, then the dataset-level values; for a snapshot, USED and COMPRESSRATIO are added again from the snapshot's own counters. */ 2193 void 2194 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2195 { 2196 uint64_t refd, avail, uobjs, aobjs; 2197 2198 dsl_dir_stats(ds->ds_dir, nv); 2199 /* the object counts returned in uobjs and aobjs are not reported here */ 2200 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2201 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2202 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2203 2204 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2205 ds->ds_phys->ds_creation_time); 2206 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2207 ds->ds_phys->ds_creation_txg); 2208 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2209 ds->ds_quota); 2210 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2211 ds->ds_reserved); 2212 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2213 ds->ds_phys->ds_guid); 2214 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs); 2215 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2216 DS_IS_DEFER_DESTROY(ds) ? 1 : 0); 2217 2218 if (ds->ds_phys->ds_next_snap_obj) { 2219 /* 2220 * This is a snapshot; override the dd's space used with 2221 * our unique space and compression ratio. 2222 */ 2223 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2224 ds->ds_phys->ds_unique_bytes); 2225 /* the ratio is uncompressed * 100 / compressed; a zero compressed size reports 100 */ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, 2226 ds->ds_phys->ds_compressed_bytes == 0 ?
100 : 2227 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2228 ds->ds_phys->ds_compressed_bytes)); 2229 } 2230 } 2231 2232 void 2233 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2234 { 2235 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2236 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2237 stat->dds_guid = ds->ds_phys->ds_guid; 2238 if (ds->ds_phys->ds_next_snap_obj) { 2239 stat->dds_is_snapshot = B_TRUE; 2240 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2241 } else { 2242 stat->dds_is_snapshot = B_FALSE; 2243 stat->dds_num_clones = 0; 2244 } 2245 2246 /* clone origin is really a dsl_dir thing... */ 2247 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2248 if (dsl_dir_is_clone(ds->ds_dir)) { 2249 dsl_dataset_t *ods; 2250 2251 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2252 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2253 dsl_dataset_name(ods, stat->dds_origin); 2254 dsl_dataset_drop_ref(ods, FTAG); 2255 } else { 2256 stat->dds_origin[0] = '\0'; 2257 } 2258 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2259 } 2260 2261 uint64_t 2262 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2263 { 2264 return (ds->ds_fsid_guid); 2265 } 2266 2267 void 2268 dsl_dataset_space(dsl_dataset_t *ds, 2269 uint64_t *refdbytesp, uint64_t *availbytesp, 2270 uint64_t *usedobjsp, uint64_t *availobjsp) 2271 { 2272 *refdbytesp = ds->ds_phys->ds_used_bytes; 2273 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2274 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2275 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2276 if (ds->ds_quota != 0) { 2277 /* 2278 * Adjust available bytes according to refquota 2279 */ 2280 if (*refdbytesp < ds->ds_quota) 2281 *availbytesp = MIN(*availbytesp, 2282 ds->ds_quota - *refdbytesp); 2283 else 2284 *availbytesp = 0; 2285 } 2286 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2287 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2288 } 2289 2290 
boolean_t 2291 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2292 { 2293 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2294 2295 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2296 dsl_pool_sync_context(dp)); 2297 if (ds->ds_prev == NULL) 2298 return (B_FALSE); 2299 if (ds->ds_phys->ds_bp.blk_birth > 2300 ds->ds_prev->ds_phys->ds_creation_txg) 2301 return (B_TRUE); 2302 return (B_FALSE); 2303 } 2304 2305 /* ARGSUSED */ 2306 static int 2307 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2308 { 2309 dsl_dataset_t *ds = arg1; 2310 char *newsnapname = arg2; 2311 dsl_dir_t *dd = ds->ds_dir; 2312 dsl_dataset_t *hds; 2313 uint64_t val; 2314 int err; 2315 2316 err = dsl_dataset_hold_obj(dd->dd_pool, 2317 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2318 if (err) 2319 return (err); 2320 2321 /* new name better not be in use */ 2322 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2323 dsl_dataset_rele(hds, FTAG); 2324 2325 if (err == 0) 2326 err = EEXIST; 2327 else if (err == ENOENT) 2328 err = 0; 2329 2330 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2331 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2332 err = ENAMETOOLONG; 2333 2334 return (err); 2335 } 2336 2337 static void 2338 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, 2339 cred_t *cr, dmu_tx_t *tx) 2340 { 2341 dsl_dataset_t *ds = arg1; 2342 const char *newsnapname = arg2; 2343 dsl_dir_t *dd = ds->ds_dir; 2344 objset_t *mos = dd->dd_pool->dp_meta_objset; 2345 dsl_dataset_t *hds; 2346 int err; 2347 2348 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2349 2350 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2351 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2352 2353 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2354 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2355 ASSERT3U(err, ==, 0); 2356 mutex_enter(&ds->ds_lock); 2357 (void) strcpy(ds->ds_snapname, newsnapname); 2358 mutex_exit(&ds->ds_lock); 2359 err = 
zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2360 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2361 ASSERT3U(err, ==, 0); 2362 2363 spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, 2364 cr, "dataset = %llu", ds->ds_object); 2365 dsl_dataset_rele(hds, FTAG); 2366 } 2367 2368 struct renamesnaparg { 2369 dsl_sync_task_group_t *dstg; 2370 char failed[MAXPATHLEN]; 2371 char *oldsnap; 2372 char *newsnap; 2373 }; 2374 2375 static int 2376 dsl_snapshot_rename_one(char *name, void *arg) 2377 { 2378 struct renamesnaparg *ra = arg; 2379 dsl_dataset_t *ds = NULL; 2380 char *cp; 2381 int err; 2382 2383 cp = name + strlen(name); 2384 *cp = '@'; 2385 (void) strcpy(cp + 1, ra->oldsnap); 2386 2387 /* 2388 * For recursive snapshot renames the parent won't be changing 2389 * so we just pass name for both the to/from argument. 2390 */ 2391 err = zfs_secpolicy_rename_perms(name, name, CRED()); 2392 if (err == ENOENT) { 2393 return (0); 2394 } else if (err) { 2395 (void) strcpy(ra->failed, name); 2396 return (err); 2397 } 2398 2399 #ifdef _KERNEL 2400 /* 2401 * For all filesystems undergoing rename, we'll need to unmount it. 
2402 */ 2403 (void) zfs_unmount_snap(name, NULL); 2404 #endif 2405 err = dsl_dataset_hold(name, ra->dstg, &ds); 2406 *cp = '\0'; 2407 if (err == ENOENT) { 2408 return (0); 2409 } else if (err) { 2410 (void) strcpy(ra->failed, name); 2411 return (err); 2412 } 2413 2414 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2415 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2416 2417 return (0); 2418 } 2419 2420 static int 2421 dsl_recursive_rename(char *oldname, const char *newname) 2422 { 2423 int err; 2424 struct renamesnaparg *ra; 2425 dsl_sync_task_t *dst; 2426 spa_t *spa; 2427 char *cp, *fsname = spa_strdup(oldname); 2428 int len = strlen(oldname); 2429 2430 /* truncate the snapshot name to get the fsname */ 2431 cp = strchr(fsname, '@'); 2432 *cp = '\0'; 2433 2434 err = spa_open(fsname, &spa, FTAG); 2435 if (err) { 2436 kmem_free(fsname, len + 1); 2437 return (err); 2438 } 2439 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2440 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2441 2442 ra->oldsnap = strchr(oldname, '@') + 1; 2443 ra->newsnap = strchr(newname, '@') + 1; 2444 *ra->failed = '\0'; 2445 2446 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2447 DS_FIND_CHILDREN); 2448 kmem_free(fsname, len + 1); 2449 2450 if (err == 0) { 2451 err = dsl_sync_task_group_wait(ra->dstg); 2452 } 2453 2454 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2455 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2456 dsl_dataset_t *ds = dst->dst_arg1; 2457 if (dst->dst_err) { 2458 dsl_dir_name(ds->ds_dir, ra->failed); 2459 (void) strcat(ra->failed, "@"); 2460 (void) strcat(ra->failed, ra->newsnap); 2461 } 2462 dsl_dataset_rele(ds, ra->dstg); 2463 } 2464 2465 if (err) 2466 (void) strcpy(oldname, ra->failed); 2467 2468 dsl_sync_task_group_destroy(ra->dstg); 2469 kmem_free(ra, sizeof (struct renamesnaparg)); 2470 spa_close(spa, FTAG); 2471 return (err); 2472 } 2473 2474 static int 2475 dsl_valid_rename(char *oldname, void *arg) 
2476 { 2477 int delta = *(int *)arg; 2478 2479 if (strlen(oldname) + delta >= MAXNAMELEN) 2480 return (ENAMETOOLONG); 2481 2482 return (0); 2483 } 2484 2485 #pragma weak dmu_objset_rename = dsl_dataset_rename 2486 int 2487 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2488 { 2489 dsl_dir_t *dd; 2490 dsl_dataset_t *ds; 2491 const char *tail; 2492 int err; 2493 2494 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2495 if (err) 2496 return (err); 2497 /* 2498 * If there are more than 2 references there may be holds 2499 * hanging around that haven't been cleared out yet. 2500 */ 2501 if (dmu_buf_refcount(dd->dd_dbuf) > 2) 2502 txg_wait_synced(dd->dd_pool, 0); 2503 if (tail == NULL) { 2504 int delta = strlen(newname) - strlen(oldname); 2505 2506 /* if we're growing, validate child name lengths */ 2507 if (delta > 0) 2508 err = dmu_objset_find(oldname, dsl_valid_rename, 2509 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2510 2511 if (!err) 2512 err = dsl_dir_rename(dd, newname); 2513 dsl_dir_close(dd, FTAG); 2514 return (err); 2515 } 2516 if (tail[0] != '@') { 2517 /* the name ended in a nonexistant component */ 2518 dsl_dir_close(dd, FTAG); 2519 return (ENOENT); 2520 } 2521 2522 dsl_dir_close(dd, FTAG); 2523 2524 /* new name must be snapshot in same filesystem */ 2525 tail = strchr(newname, '@'); 2526 if (tail == NULL) 2527 return (EINVAL); 2528 tail++; 2529 if (strncmp(oldname, newname, tail - newname) != 0) 2530 return (EXDEV); 2531 2532 if (recursive) { 2533 err = dsl_recursive_rename(oldname, newname); 2534 } else { 2535 err = dsl_dataset_hold(oldname, FTAG, &ds); 2536 if (err) 2537 return (err); 2538 2539 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2540 dsl_dataset_snapshot_rename_check, 2541 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2542 2543 dsl_dataset_rele(ds, FTAG); 2544 } 2545 2546 return (err); 2547 } 2548 2549 struct promotenode { 2550 list_node_t link; 2551 dsl_dataset_t *ds; 2552 }; 2553 2554 struct promotearg 
{ 2555 list_t shared_snaps, origin_snaps, clone_snaps; 2556 dsl_dataset_t *origin_origin, *origin_head; 2557 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2558 }; 2559 2560 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2561 2562 /* ARGSUSED */ 2563 static int 2564 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2565 { 2566 dsl_dataset_t *hds = arg1; 2567 struct promotearg *pa = arg2; 2568 struct promotenode *snap = list_head(&pa->shared_snaps); 2569 dsl_dataset_t *origin_ds = snap->ds; 2570 int err; 2571 2572 /* Check that it is a real clone */ 2573 if (!dsl_dir_is_clone(hds->ds_dir)) 2574 return (EINVAL); 2575 2576 /* Since this is so expensive, don't do the preliminary check */ 2577 if (!dmu_tx_is_syncing(tx)) 2578 return (0); 2579 2580 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2581 return (EXDEV); 2582 2583 /* compute origin's new unique space */ 2584 snap = list_tail(&pa->clone_snaps); 2585 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2586 err = bplist_space_birthrange(&snap->ds->ds_deadlist, 2587 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); 2588 if (err) 2589 return (err); 2590 2591 /* 2592 * Walk the snapshots that we are moving 2593 * 2594 * Compute space to transfer. Consider the incremental changes 2595 * to used for each snapshot: 2596 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2597 * So each snapshot gave birth to: 2598 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2599 * So a sequence would look like: 2600 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2601 * Which simplifies to: 2602 * uN + kN + kN-1 + ... + k1 + k0 2603 * Note however, if we stop before we reach the ORIGIN we get: 2604 * uN + kN + kN-1 + ... 
+ kM - uM-1 2605 */ 2606 pa->used = origin_ds->ds_phys->ds_used_bytes; 2607 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2608 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2609 for (snap = list_head(&pa->shared_snaps); snap; 2610 snap = list_next(&pa->shared_snaps, snap)) { 2611 uint64_t val, dlused, dlcomp, dluncomp; 2612 dsl_dataset_t *ds = snap->ds; 2613 2614 /* Check that the snapshot name does not conflict */ 2615 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2616 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2617 if (err == 0) 2618 return (EEXIST); 2619 if (err != ENOENT) 2620 return (err); 2621 2622 /* The very first snapshot does not have a deadlist */ 2623 if (ds->ds_phys->ds_prev_snap_obj == 0) 2624 continue; 2625 2626 if (err = bplist_space(&ds->ds_deadlist, 2627 &dlused, &dlcomp, &dluncomp)) 2628 return (err); 2629 pa->used += dlused; 2630 pa->comp += dlcomp; 2631 pa->uncomp += dluncomp; 2632 } 2633 2634 /* 2635 * If we are a clone of a clone then we never reached ORIGIN, 2636 * so we need to subtract out the clone origin's used space. 2637 */ 2638 if (pa->origin_origin) { 2639 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; 2640 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2641 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2642 } 2643 2644 /* Check that there is enough space here */ 2645 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2646 pa->used); 2647 if (err) 2648 return (err); 2649 2650 /* 2651 * Compute the amounts of space that will be used by snapshots 2652 * after the promotion (for both origin and clone). For each, 2653 * it is the amount of space that will be on all of their 2654 * deadlists (that was not born before their new origin). 
2655 */ 2656 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2657 uint64_t space; 2658 2659 /* 2660 * Note, typically this will not be a clone of a clone, 2661 * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so 2662 * these snaplist_space() -> bplist_space_birthrange() 2663 * calls will be fast because they do not have to 2664 * iterate over all bps. 2665 */ 2666 snap = list_head(&pa->origin_snaps); 2667 err = snaplist_space(&pa->shared_snaps, 2668 snap->ds->ds_origin_txg, &pa->cloneusedsnap); 2669 if (err) 2670 return (err); 2671 2672 err = snaplist_space(&pa->clone_snaps, 2673 snap->ds->ds_origin_txg, &space); 2674 if (err) 2675 return (err); 2676 pa->cloneusedsnap += space; 2677 } 2678 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2679 err = snaplist_space(&pa->origin_snaps, 2680 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2681 if (err) 2682 return (err); 2683 } 2684 2685 return (0); 2686 } 2687 2688 static void 2689 dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 2690 { 2691 dsl_dataset_t *hds = arg1; 2692 struct promotearg *pa = arg2; 2693 struct promotenode *snap = list_head(&pa->shared_snaps); 2694 dsl_dataset_t *origin_ds = snap->ds; 2695 dsl_dataset_t *origin_head; 2696 dsl_dir_t *dd = hds->ds_dir; 2697 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2698 dsl_dir_t *odd = NULL; 2699 uint64_t oldnext_obj; 2700 int64_t delta; 2701 2702 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2703 2704 snap = list_head(&pa->origin_snaps); 2705 origin_head = snap->ds; 2706 2707 /* 2708 * We need to explicitly open odd, since origin_ds's dd will be 2709 * changing. 
*/
	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	/*
	 * change origin's next snap: it becomes the tail of clone_snaps,
	 * whose previous snapshot is asserted to be the origin.  The old
	 * value is kept for the next_clones update below.
	 */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
	snap = list_tail(&pa->clone_snaps);
	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

	/*
	 * change the origin's next clone: in the next_clones ZAP, the new
	 * next-snap object is removed and the old one is added.
	 */
	if (origin_ds->ds_phys->ds_next_clones_obj) {
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    origin_ds->ds_phys->ds_next_clones_obj,
		    origin_ds->ds_phys->ds_next_snap_obj, tx));
		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    origin_ds->ds_phys->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/*
	 * change origin: the promoted dir (dd) takes over the origin of the
	 * old origin dir (odd), and odd's origin becomes origin_ds.  The
	 * in-core ds_origin_txg values are updated to match.
	 */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
	hds->ds_origin_txg = origin_head->ds_origin_txg;
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
	origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;

	/* move snapshots to this dir */
	for (snap = list_head(&pa->shared_snaps); snap;
	    snap = list_next(&pa->shared_snaps, snap)) {
		dsl_dataset_t *ds = snap->ds;

		/* unregister props as dsl_dir is changing */
		if (ds->ds_user_ptr) {
			ds->ds_user_evict_func(ds, ds->ds_user_ptr);
			ds->ds_user_ptr = NULL;
		}
		/*
		 * move snap name entry from the origin head's snapshot
		 * names to the promoted head's snapnames ZAP
		 */
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		VERIFY(0 == dsl_dataset_snap_remove(origin_head,
		    ds->ds_snapname, tx));
		VERIFY(0 == zap_add(dp->dp_meta_objset,
		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));
		/*
		 * change containing dsl_dir, on disk (ds_dir_obj) and in
		 * core (close the old ds_dir, open the new one)
		 */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
		ds->ds_phys->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		dsl_dir_close(ds->ds_dir, ds);
		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
	}

	/*
	 * Change space accounting.
	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
	 * both be valid, or both be 0 (resulting in delta == 0). This
	 * is true for each of {clone,origin} independently.
	 */

	/* the promoted dir gains pa->used, split between SNAP and HEAD */
	delta = pa->cloneusedsnap -
	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, >=, 0);
	ASSERT3U(pa->used, >=, delta);
	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    pa->used - delta, pa->comp, pa->uncomp, tx);

	/* the old origin dir loses the same amounts (delta is <= 0 here) */
	delta = pa->originusedsnap -
	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, <=, 0);
	ASSERT3U(pa->used, >=, -delta);
	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(odd, DD_USED_HEAD,
	    -pa->used - delta, -pa->comp, -pa->uncomp, tx);

	/* value computed by dsl_dataset_promote_check() */
	origin_ds->ds_phys->ds_unique_bytes = pa->unique;

	/* log history record */
	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
	    cr, "dataset = %llu", hds->ds_object);

	/* drop the explicit hold on odd taken at the top of this function */
	dsl_dir_close(odd, FTAG);
}

/* hold/ownership tag used for every dataset placed on a snaplist */
static char *snaplist_tag = "snaplist";
/*
 * Make a list of dsl_dataset_t's for the snapshots between first_obj
 * (exclusive) and last_obj (inclusive). The list will be in reverse
 * order (last_obj will be the list_head()). If first_obj == 0, do all
 * snapshots back to this dataset's origin.
2807 */ 2808 static int 2809 snaplist_make(dsl_pool_t *dp, boolean_t own, 2810 uint64_t first_obj, uint64_t last_obj, list_t *l) 2811 { 2812 uint64_t obj = last_obj; 2813 2814 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 2815 2816 list_create(l, sizeof (struct promotenode), 2817 offsetof(struct promotenode, link)); 2818 2819 while (obj != first_obj) { 2820 dsl_dataset_t *ds; 2821 struct promotenode *snap; 2822 int err; 2823 2824 if (own) { 2825 err = dsl_dataset_own_obj(dp, obj, 2826 0, snaplist_tag, &ds); 2827 if (err == 0) 2828 dsl_dataset_make_exclusive(ds, snaplist_tag); 2829 } else { 2830 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 2831 } 2832 if (err == ENOENT) { 2833 /* lost race with snapshot destroy */ 2834 struct promotenode *last = list_tail(l); 2835 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 2836 obj = last->ds->ds_phys->ds_prev_snap_obj; 2837 continue; 2838 } else if (err) { 2839 return (err); 2840 } 2841 2842 if (first_obj == 0) 2843 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 2844 2845 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 2846 snap->ds = ds; 2847 list_insert_tail(l, snap); 2848 obj = ds->ds_phys->ds_prev_snap_obj; 2849 } 2850 2851 return (0); 2852 } 2853 2854 static int 2855 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 2856 { 2857 struct promotenode *snap; 2858 2859 *spacep = 0; 2860 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 2861 uint64_t used; 2862 int err = bplist_space_birthrange(&snap->ds->ds_deadlist, 2863 mintxg, UINT64_MAX, &used); 2864 if (err) 2865 return (err); 2866 *spacep += used; 2867 } 2868 return (0); 2869 } 2870 2871 static void 2872 snaplist_destroy(list_t *l, boolean_t own) 2873 { 2874 struct promotenode *snap; 2875 2876 if (!l || !list_link_active(&l->list_head)) 2877 return; 2878 2879 while ((snap = list_tail(l)) != NULL) { 2880 list_remove(l, snap); 2881 if (own) 2882 dsl_dataset_disown(snap->ds, snaplist_tag); 2883 else 2884 
dsl_dataset_rele(snap->ds, snaplist_tag); 2885 kmem_free(snap, sizeof (struct promotenode)); 2886 } 2887 list_destroy(l); 2888 } 2889 2890 /* 2891 * Promote a clone. Nomenclature note: 2892 * "clone" or "cds": the original clone which is being promoted 2893 * "origin" or "ods": the snapshot which is originally clone's origin 2894 * "origin head" or "ohds": the dataset which is the head 2895 * (filesystem/volume) for the origin 2896 * "origin origin": the origin of the origin's filesystem (typically 2897 * NULL, indicating that the clone is not a clone of a clone). 2898 */ 2899 int 2900 dsl_dataset_promote(const char *name) 2901 { 2902 dsl_dataset_t *ds; 2903 dsl_dir_t *dd; 2904 dsl_pool_t *dp; 2905 dmu_object_info_t doi; 2906 struct promotearg pa = { 0 }; 2907 struct promotenode *snap; 2908 int err; 2909 2910 err = dsl_dataset_hold(name, FTAG, &ds); 2911 if (err) 2912 return (err); 2913 dd = ds->ds_dir; 2914 dp = dd->dd_pool; 2915 2916 err = dmu_object_info(dp->dp_meta_objset, 2917 ds->ds_phys->ds_snapnames_zapobj, &doi); 2918 if (err) { 2919 dsl_dataset_rele(ds, FTAG); 2920 return (err); 2921 } 2922 2923 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 2924 dsl_dataset_rele(ds, FTAG); 2925 return (EINVAL); 2926 } 2927 2928 /* 2929 * We are going to inherit all the snapshots taken before our 2930 * origin (i.e., our new origin will be our parent's origin). 2931 * Take ownership of them so that we can rename them into our 2932 * namespace. 
2933 */ 2934 rw_enter(&dp->dp_config_rwlock, RW_READER); 2935 2936 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 2937 &pa.shared_snaps); 2938 if (err != 0) 2939 goto out; 2940 2941 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 2942 if (err != 0) 2943 goto out; 2944 2945 snap = list_head(&pa.shared_snaps); 2946 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 2947 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 2948 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 2949 if (err != 0) 2950 goto out; 2951 2952 if (dsl_dir_is_clone(snap->ds->ds_dir)) { 2953 err = dsl_dataset_own_obj(dp, 2954 snap->ds->ds_dir->dd_phys->dd_origin_obj, 2955 0, FTAG, &pa.origin_origin); 2956 if (err != 0) 2957 goto out; 2958 } 2959 2960 out: 2961 rw_exit(&dp->dp_config_rwlock); 2962 2963 /* 2964 * Add in 128x the snapnames zapobj size, since we will be moving 2965 * a bunch of snapnames to the promoted ds, and dirtying their 2966 * bonus buffers. 
2967 */ 2968 if (err == 0) { 2969 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 2970 dsl_dataset_promote_sync, ds, &pa, 2971 2 + 2 * doi.doi_physical_blks); 2972 } 2973 2974 snaplist_destroy(&pa.shared_snaps, B_TRUE); 2975 snaplist_destroy(&pa.clone_snaps, B_FALSE); 2976 snaplist_destroy(&pa.origin_snaps, B_FALSE); 2977 if (pa.origin_origin) 2978 dsl_dataset_disown(pa.origin_origin, FTAG); 2979 dsl_dataset_rele(ds, FTAG); 2980 return (err); 2981 } 2982 2983 struct cloneswaparg { 2984 dsl_dataset_t *cds; /* clone dataset */ 2985 dsl_dataset_t *ohds; /* origin's head dataset */ 2986 boolean_t force; 2987 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 2988 }; 2989 2990 /* ARGSUSED */ 2991 static int 2992 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 2993 { 2994 struct cloneswaparg *csa = arg1; 2995 2996 /* they should both be heads */ 2997 if (dsl_dataset_is_snapshot(csa->cds) || 2998 dsl_dataset_is_snapshot(csa->ohds)) 2999 return (EINVAL); 3000 3001 /* the branch point should be just before them */ 3002 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3003 return (EINVAL); 3004 3005 /* cds should be the clone */ 3006 if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != 3007 csa->ohds->ds_object) 3008 return (EINVAL); 3009 3010 /* the clone should be a child of the origin */ 3011 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3012 return (EINVAL); 3013 3014 /* ohds shouldn't be modified unless 'force' */ 3015 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3016 return (ETXTBSY); 3017 3018 /* adjust amount of any unconsumed refreservation */ 3019 csa->unused_refres_delta = 3020 (int64_t)MIN(csa->ohds->ds_reserved, 3021 csa->ohds->ds_phys->ds_unique_bytes) - 3022 (int64_t)MIN(csa->ohds->ds_reserved, 3023 csa->cds->ds_phys->ds_unique_bytes); 3024 3025 if (csa->unused_refres_delta > 0 && 3026 csa->unused_refres_delta > 3027 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3028 return 
(ENOSPC);

	return (0);
}

/*
 * Sync-task body for the clone swap.  arg1 is the struct cloneswaparg that
 * dsl_dataset_clone_swap_check() validated.  The block pointers, the
 * ds_*_bytes counters and the deadlists of the clone (cds) and the origin
 * head (ohds) are exchanged, and the dsl_dir space accounting is adjusted
 * to match.
 */
/* ARGSUSED */
static void
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	struct cloneswaparg *csa = arg1;
	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;

	ASSERT(csa->cds->ds_reserved == 0);
	ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);

	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);

	/* evict whatever is attached via ds_user_ptr on either dataset */
	if (csa->cds->ds_user_ptr != NULL) {
		csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
		csa->cds->ds_user_ptr = NULL;
	}

	if (csa->ohds->ds_user_ptr != NULL) {
		csa->ohds->ds_user_evict_func(csa->ohds,
		    csa->ohds->ds_user_ptr);
		csa->ohds->ds_user_ptr = NULL;
	}

	/*
	 * reset origin's unique bytes, computed from the clone's deadlist
	 * (which becomes the head's deadlist further down)
	 */
	VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
	    csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
	    &csa->cds->ds_prev->ds_phys->ds_unique_bytes));

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = csa->ohds->ds_phys->ds_bp;
		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
		csa->cds->ds_phys->ds_bp = tmp;
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(csa->cds->ds_dir->dd_phys->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
		    &cdl_comp, &cdl_uncomp));
		VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
		    &odl_comp, &odl_uncomp));

		/* (clone's used + deadlist) minus (head's used + deadlist) */
		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

		/* the head's dir gains the difference, the clone's loses it */
		dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
		 */
		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
		    csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
		VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
		    csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

/* exchange two uint64_t lvalues; each argument is evaluated twice */
#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

	/* swap ds_*_bytes */
	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
	    csa->cds->ds_phys->ds_used_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
	    csa->cds->ds_phys->ds_compressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
	    csa->cds->ds_phys->ds_uncompressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
	    csa->cds->ds_phys->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
	    csa->unused_refres_delta, 0, 0, tx);

	/*
	 * swap deadlists: both are closed, the on-disk object numbers are
	 * exchanged, and both are reopened on their new objects
	 */
	bplist_close(&csa->cds->ds_deadlist);
	bplist_close(&csa->ohds->ds_deadlist);
	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
	    csa->cds->ds_phys->ds_deadlist_obj);
	VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
	    csa->cds->ds_phys->ds_deadlist_obj));
	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
	    csa->ohds->ds_phys->ds_deadlist_obj));

	dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
}

/*
 * Swap 'clone' with its origin head file system. Used at the end
 * of "online recv" to swizzle the file system to the new version.
 *
 * Both datasets must be owned by the caller.  Returns the result of the
 * clone-swap sync task.
 */
int
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
    boolean_t force)
{
	struct cloneswaparg csa;
	int error;

	ASSERT(clone->ds_owner);
	ASSERT(origin_head->ds_owner);
retry:
	/*
	 * Need exclusive access for the swap.  Both ds_rwlocks are taken
	 * as writer: block on one and try the other; if the try fails,
	 * release, block on the other and try the first; if that fails
	 * too, release and start over.  No path blocks on the second lock
	 * while holding the first.
	 *
	 * NOTE(review): neither lock is released in this function;
	 * presumably they are dropped when the caller disowns the
	 * datasets -- confirm against dsl_dataset_disown().
	 */
	rw_enter(&clone->ds_rwlock, RW_WRITER);
	if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
		rw_exit(&clone->ds_rwlock);
		rw_enter(&origin_head->ds_rwlock, RW_WRITER);
		if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
			rw_exit(&origin_head->ds_rwlock);
			goto retry;
		}
	}
	csa.cds = clone;
	csa.ohds = origin_head;
	csa.force = force;
	error = dsl_sync_task_do(clone->ds_dir->dd_pool,
	    dsl_dataset_clone_swap_check,
	    dsl_dataset_clone_swap_sync, &csa, NULL, 9);
	return (error);
}

/*
 * Given a pool name and a dataset object number in that pool,
 * return the name of that dataset.
3182 */ 3183 int 3184 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3185 { 3186 spa_t *spa; 3187 dsl_pool_t *dp; 3188 dsl_dataset_t *ds; 3189 int error; 3190 3191 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3192 return (error); 3193 dp = spa_get_dsl(spa); 3194 rw_enter(&dp->dp_config_rwlock, RW_READER); 3195 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3196 dsl_dataset_name(ds, buf); 3197 dsl_dataset_rele(ds, FTAG); 3198 } 3199 rw_exit(&dp->dp_config_rwlock); 3200 spa_close(spa, FTAG); 3201 3202 return (error); 3203 } 3204 3205 int 3206 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3207 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3208 { 3209 int error = 0; 3210 3211 ASSERT3S(asize, >, 0); 3212 3213 /* 3214 * *ref_rsrv is the portion of asize that will come from any 3215 * unconsumed refreservation space. 3216 */ 3217 *ref_rsrv = 0; 3218 3219 mutex_enter(&ds->ds_lock); 3220 /* 3221 * Make a space adjustment for reserved bytes. 3222 */ 3223 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { 3224 ASSERT3U(*used, >=, 3225 ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3226 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3227 *ref_rsrv = 3228 asize - MIN(asize, parent_delta(ds, asize + inflight)); 3229 } 3230 3231 if (!check_quota || ds->ds_quota == 0) { 3232 mutex_exit(&ds->ds_lock); 3233 return (0); 3234 } 3235 /* 3236 * If they are requesting more space, and our current estimate 3237 * is over quota, they get to try again unless the actual 3238 * on-disk is over quota and there are no pending changes (which 3239 * may free up space for us). 
3240 */ 3241 if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { 3242 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) 3243 error = ERESTART; 3244 else 3245 error = EDQUOT; 3246 } 3247 mutex_exit(&ds->ds_lock); 3248 3249 return (error); 3250 } 3251 3252 /* ARGSUSED */ 3253 static int 3254 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) 3255 { 3256 dsl_dataset_t *ds = arg1; 3257 uint64_t *quotap = arg2; 3258 uint64_t new_quota = *quotap; 3259 3260 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) 3261 return (ENOTSUP); 3262 3263 if (new_quota == 0) 3264 return (0); 3265 3266 if (new_quota < ds->ds_phys->ds_used_bytes || 3267 new_quota < ds->ds_reserved) 3268 return (ENOSPC); 3269 3270 return (0); 3271 } 3272 3273 /* ARGSUSED */ 3274 void 3275 dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3276 { 3277 dsl_dataset_t *ds = arg1; 3278 uint64_t *quotap = arg2; 3279 uint64_t new_quota = *quotap; 3280 3281 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3282 3283 ds->ds_quota = new_quota; 3284 3285 dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); 3286 3287 spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, 3288 tx, cr, "%lld dataset = %llu ", 3289 (longlong_t)new_quota, ds->ds_object); 3290 } 3291 3292 int 3293 dsl_dataset_set_quota(const char *dsname, uint64_t quota) 3294 { 3295 dsl_dataset_t *ds; 3296 int err; 3297 3298 err = dsl_dataset_hold(dsname, FTAG, &ds); 3299 if (err) 3300 return (err); 3301 3302 if (quota != ds->ds_quota) { 3303 /* 3304 * If someone removes a file, then tries to set the quota, we 3305 * want to make sure the file freeing takes effect. 
3306 */ 3307 txg_wait_open(ds->ds_dir->dd_pool, 0); 3308 3309 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3310 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, 3311 ds, "a, 0); 3312 } 3313 dsl_dataset_rele(ds, FTAG); 3314 return (err); 3315 } 3316 3317 static int 3318 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) 3319 { 3320 dsl_dataset_t *ds = arg1; 3321 uint64_t *reservationp = arg2; 3322 uint64_t new_reservation = *reservationp; 3323 uint64_t unique; 3324 3325 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 3326 SPA_VERSION_REFRESERVATION) 3327 return (ENOTSUP); 3328 3329 if (dsl_dataset_is_snapshot(ds)) 3330 return (EINVAL); 3331 3332 /* 3333 * If we are doing the preliminary check in open context, the 3334 * space estimates may be inaccurate. 3335 */ 3336 if (!dmu_tx_is_syncing(tx)) 3337 return (0); 3338 3339 mutex_enter(&ds->ds_lock); 3340 unique = dsl_dataset_unique(ds); 3341 mutex_exit(&ds->ds_lock); 3342 3343 if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) { 3344 uint64_t delta = MAX(unique, new_reservation) - 3345 MAX(unique, ds->ds_reserved); 3346 3347 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3348 return (ENOSPC); 3349 if (ds->ds_quota > 0 && 3350 new_reservation > ds->ds_quota) 3351 return (ENOSPC); 3352 } 3353 3354 return (0); 3355 } 3356 3357 /* ARGSUSED */ 3358 static void 3359 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, 3360 dmu_tx_t *tx) 3361 { 3362 dsl_dataset_t *ds = arg1; 3363 uint64_t *reservationp = arg2; 3364 uint64_t new_reservation = *reservationp; 3365 uint64_t unique; 3366 int64_t delta; 3367 3368 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3369 3370 mutex_enter(&ds->ds_dir->dd_lock); 3371 mutex_enter(&ds->ds_lock); 3372 unique = dsl_dataset_unique(ds); 3373 delta = MAX(0, (int64_t)(new_reservation - unique)) - 3374 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3375 ds->ds_reserved = new_reservation; 3376 mutex_exit(&ds->ds_lock); 3377 3378 
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3379 mutex_exit(&ds->ds_dir->dd_lock); 3380 dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refreservation", 3381 new_reservation, cr, tx); 3382 3383 spa_history_internal_log(LOG_DS_REFRESERV, 3384 ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", 3385 (longlong_t)new_reservation, ds->ds_object); 3386 } 3387 3388 int 3389 dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) 3390 { 3391 dsl_dataset_t *ds; 3392 int err; 3393 3394 err = dsl_dataset_hold(dsname, FTAG, &ds); 3395 if (err) 3396 return (err); 3397 3398 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3399 dsl_dataset_set_reservation_check, 3400 dsl_dataset_set_reservation_sync, ds, &reservation, 0); 3401 dsl_dataset_rele(ds, FTAG); 3402 return (err); 3403 } 3404 3405 static int 3406 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) 3407 { 3408 dsl_dataset_t *ds = arg1; 3409 char *htag = arg2; 3410 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3411 int error = 0; 3412 3413 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3414 return (ENOTSUP); 3415 3416 if (!dsl_dataset_is_snapshot(ds)) 3417 return (EINVAL); 3418 3419 if (strlen(htag) >= ZAP_MAXNAMELEN) 3420 return (ENAMETOOLONG); 3421 3422 /* tags must be unique */ 3423 mutex_enter(&ds->ds_lock); 3424 if (ds->ds_phys->ds_userrefs_obj) { 3425 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 3426 8, 1, tx); 3427 if (error == 0) 3428 error = EEXIST; 3429 else if (error == ENOENT) 3430 error = 0; 3431 } 3432 mutex_exit(&ds->ds_lock); 3433 3434 return (error); 3435 } 3436 3437 static void 3438 dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3439 { 3440 dsl_dataset_t *ds = arg1; 3441 char *htag = arg2; 3442 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3443 time_t now = gethrestime_sec(); 3444 uint64_t zapobj; 3445 3446 mutex_enter(&ds->ds_lock); 3447 if (ds->ds_phys->ds_userrefs_obj == 0) 
{ 3448 /* 3449 * This is the first user hold for this dataset. Create 3450 * the userrefs zap object. 3451 */ 3452 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3453 zapobj = ds->ds_phys->ds_userrefs_obj = 3454 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); 3455 } else { 3456 zapobj = ds->ds_phys->ds_userrefs_obj; 3457 } 3458 ds->ds_userrefs++; 3459 mutex_exit(&ds->ds_lock); 3460 3461 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); 3462 3463 spa_history_internal_log(LOG_DS_USER_HOLD, 3464 ds->ds_dir->dd_pool->dp_spa, tx, cr, "<%s> dataset = %llu", 3465 htag, ds->ds_object); 3466 } 3467 3468 struct dsl_ds_holdarg { 3469 dsl_sync_task_group_t *dstg; 3470 char *htag; 3471 char *snapname; 3472 boolean_t recursive; 3473 char failed[MAXPATHLEN]; 3474 }; 3475 3476 static int 3477 dsl_dataset_user_hold_one(char *dsname, void *arg) 3478 { 3479 struct dsl_ds_holdarg *ha = arg; 3480 dsl_dataset_t *ds; 3481 int error; 3482 char *name; 3483 size_t buflen; 3484 3485 /* alloc a buffer to hold dsname@snapname plus terminating NULL */ 3486 buflen = strlen(dsname) + strlen(ha->snapname) + 2; 3487 name = kmem_alloc(buflen, KM_SLEEP); 3488 (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname); 3489 error = dsl_dataset_hold(name, ha->dstg, &ds); 3490 kmem_free(name, buflen); 3491 if (error == 0) { 3492 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, 3493 dsl_dataset_user_hold_sync, ds, ha->htag, 0); 3494 } else if (error == ENOENT && ha->recursive) { 3495 error = 0; 3496 } else { 3497 (void) strcpy(ha->failed, dsname); 3498 } 3499 return (error); 3500 } 3501 3502 int 3503 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, 3504 boolean_t recursive) 3505 { 3506 struct dsl_ds_holdarg *ha; 3507 dsl_sync_task_t *dst; 3508 spa_t *spa; 3509 int error; 3510 3511 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3512 3513 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3514 3515 error = spa_open(dsname, &spa, FTAG); 3516 if (error) { 
3517 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3518 return (error); 3519 } 3520 3521 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3522 ha->htag = htag; 3523 ha->snapname = snapname; 3524 ha->recursive = recursive; 3525 if (recursive) { 3526 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, 3527 ha, DS_FIND_CHILDREN); 3528 } else { 3529 error = dsl_dataset_user_hold_one(dsname, ha); 3530 } 3531 if (error == 0) 3532 error = dsl_sync_task_group_wait(ha->dstg); 3533 3534 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3535 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3536 dsl_dataset_t *ds = dst->dst_arg1; 3537 3538 if (dst->dst_err) { 3539 dsl_dataset_name(ds, ha->failed); 3540 *strchr(ha->failed, '@') = '\0'; 3541 } 3542 dsl_dataset_rele(ds, ha->dstg); 3543 } 3544 3545 if (error) 3546 (void) strcpy(dsname, ha->failed); 3547 3548 dsl_sync_task_group_destroy(ha->dstg); 3549 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3550 spa_close(spa, FTAG); 3551 return (error); 3552 } 3553 3554 struct dsl_ds_releasearg { 3555 dsl_dataset_t *ds; 3556 const char *htag; 3557 boolean_t own; /* do we own or just hold ds? 
*/ 3558 }; 3559 3560 static int 3561 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, 3562 boolean_t *might_destroy) 3563 { 3564 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3565 uint64_t zapobj; 3566 uint64_t tmp; 3567 int error; 3568 3569 *might_destroy = B_FALSE; 3570 3571 mutex_enter(&ds->ds_lock); 3572 zapobj = ds->ds_phys->ds_userrefs_obj; 3573 if (zapobj == 0) { 3574 /* The tag can't possibly exist */ 3575 mutex_exit(&ds->ds_lock); 3576 return (ESRCH); 3577 } 3578 3579 /* Make sure the tag exists */ 3580 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); 3581 if (error) { 3582 mutex_exit(&ds->ds_lock); 3583 if (error == ENOENT) 3584 error = ESRCH; 3585 return (error); 3586 } 3587 3588 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && 3589 DS_IS_DEFER_DESTROY(ds)) 3590 *might_destroy = B_TRUE; 3591 3592 mutex_exit(&ds->ds_lock); 3593 return (0); 3594 } 3595 3596 static int 3597 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) 3598 { 3599 struct dsl_ds_releasearg *ra = arg1; 3600 dsl_dataset_t *ds = ra->ds; 3601 boolean_t might_destroy; 3602 int error; 3603 3604 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3605 return (ENOTSUP); 3606 3607 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); 3608 if (error) 3609 return (error); 3610 3611 if (might_destroy) { 3612 struct dsl_ds_destroyarg dsda = {0}; 3613 3614 if (dmu_tx_is_syncing(tx)) { 3615 /* 3616 * If we're not prepared to remove the snapshot, 3617 * we can't allow the release to happen right now. 
3618 */ 3619 if (!ra->own) 3620 return (EBUSY); 3621 if (ds->ds_user_ptr) { 3622 ds->ds_user_evict_func(ds, ds->ds_user_ptr); 3623 ds->ds_user_ptr = NULL; 3624 } 3625 } 3626 dsda.ds = ds; 3627 dsda.releasing = B_TRUE; 3628 return (dsl_dataset_destroy_check(&dsda, tag, tx)); 3629 } 3630 3631 return (0); 3632 } 3633 3634 static void 3635 dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) 3636 { 3637 struct dsl_ds_releasearg *ra = arg1; 3638 dsl_dataset_t *ds = ra->ds; 3639 spa_t *spa = ds->ds_dir->dd_pool->dp_spa; 3640 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3641 uint64_t zapobj; 3642 uint64_t dsobj = ds->ds_object; 3643 uint64_t refs; 3644 3645 mutex_enter(&ds->ds_lock); 3646 ds->ds_userrefs--; 3647 refs = ds->ds_userrefs; 3648 mutex_exit(&ds->ds_lock); 3649 zapobj = ds->ds_phys->ds_userrefs_obj; 3650 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); 3651 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && 3652 DS_IS_DEFER_DESTROY(ds)) { 3653 struct dsl_ds_destroyarg dsda = {0}; 3654 3655 ASSERT(ra->own); 3656 dsda.ds = ds; 3657 dsda.releasing = B_TRUE; 3658 /* We already did the destroy_check */ 3659 dsl_dataset_destroy_sync(&dsda, tag, cr, tx); 3660 } 3661 3662 spa_history_internal_log(LOG_DS_USER_RELEASE, 3663 spa, tx, cr, "<%s> %lld dataset = %llu", 3664 ra->htag, (longlong_t)refs, dsobj); 3665 } 3666 3667 static int 3668 dsl_dataset_user_release_one(char *dsname, void *arg) 3669 { 3670 struct dsl_ds_holdarg *ha = arg; 3671 struct dsl_ds_releasearg *ra; 3672 dsl_dataset_t *ds; 3673 int error; 3674 void *dtag = ha->dstg; 3675 char *name; 3676 size_t buflen; 3677 boolean_t own = B_FALSE; 3678 boolean_t might_destroy; 3679 3680 if (strlen(ha->htag) >= ZAP_MAXNAMELEN) 3681 return (ENAMETOOLONG); 3682 3683 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ 3684 buflen = strlen(dsname) + strlen(ha->snapname) + 2; 3685 name = kmem_alloc(buflen, KM_SLEEP); 3686 (void) snprintf(name, buflen, 
"%s@%s", dsname, ha->snapname); 3687 error = dsl_dataset_hold(name, dtag, &ds); 3688 kmem_free(name, buflen); 3689 if (error == ENOENT && ha->recursive) 3690 return (0); 3691 (void) strcpy(ha->failed, dsname); 3692 if (error) 3693 return (error); 3694 3695 ASSERT(dsl_dataset_is_snapshot(ds)); 3696 3697 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); 3698 if (error) { 3699 dsl_dataset_rele(ds, dtag); 3700 return (error); 3701 } 3702 3703 if (might_destroy) { 3704 #ifdef _KERNEL 3705 error = zfs_unmount_snap(name, NULL); 3706 if (error) { 3707 dsl_dataset_rele(ds, dtag); 3708 return (error); 3709 } 3710 error = dsl_dataset_zvol_cleanup(ds, name); 3711 if (error) { 3712 dsl_dataset_rele(ds, dtag); 3713 return (error); 3714 } 3715 #endif 3716 if (!dsl_dataset_tryown(ds, 3717 DS_MODE_READONLY | DS_MODE_INCONSISTENT, dtag)) { 3718 dsl_dataset_rele(ds, dtag); 3719 return (EBUSY); 3720 } else { 3721 own = B_TRUE; 3722 dsl_dataset_make_exclusive(ds, dtag); 3723 } 3724 } 3725 3726 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); 3727 ra->ds = ds; 3728 ra->htag = ha->htag; 3729 ra->own = own; 3730 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, 3731 dsl_dataset_user_release_sync, ra, dtag, 0); 3732 3733 return (0); 3734 } 3735 3736 int 3737 dsl_dataset_user_release(char *dsname, char *snapname, char *htag, 3738 boolean_t recursive) 3739 { 3740 struct dsl_ds_holdarg *ha; 3741 dsl_sync_task_t *dst; 3742 spa_t *spa; 3743 int error; 3744 3745 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3746 3747 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3748 3749 error = spa_open(dsname, &spa, FTAG); 3750 if (error) { 3751 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3752 return (error); 3753 } 3754 3755 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3756 ha->htag = htag; 3757 ha->snapname = snapname; 3758 ha->recursive = recursive; 3759 if (recursive) { 3760 error = dmu_objset_find(dsname, 
dsl_dataset_user_release_one, 3761 ha, DS_FIND_CHILDREN); 3762 } else { 3763 error = dsl_dataset_user_release_one(dsname, ha); 3764 } 3765 if (error == 0) 3766 error = dsl_sync_task_group_wait(ha->dstg); 3767 3768 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3769 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3770 struct dsl_ds_releasearg *ra = dst->dst_arg1; 3771 dsl_dataset_t *ds = ra->ds; 3772 3773 if (dst->dst_err) 3774 dsl_dataset_name(ds, ha->failed); 3775 3776 if (ra->own) 3777 dsl_dataset_disown(ds, ha->dstg); 3778 else 3779 dsl_dataset_rele(ds, ha->dstg); 3780 3781 kmem_free(ra, sizeof (struct dsl_ds_releasearg)); 3782 } 3783 3784 if (error) 3785 (void) strcpy(dsname, ha->failed); 3786 3787 dsl_sync_task_group_destroy(ha->dstg); 3788 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3789 spa_close(spa, FTAG); 3790 return (error); 3791 } 3792 3793 int 3794 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) 3795 { 3796 dsl_dataset_t *ds; 3797 int err; 3798 3799 err = dsl_dataset_hold(dsname, FTAG, &ds); 3800 if (err) 3801 return (err); 3802 3803 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); 3804 if (ds->ds_phys->ds_userrefs_obj != 0) { 3805 zap_attribute_t *za; 3806 zap_cursor_t zc; 3807 3808 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 3809 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, 3810 ds->ds_phys->ds_userrefs_obj); 3811 zap_cursor_retrieve(&zc, za) == 0; 3812 zap_cursor_advance(&zc)) { 3813 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, 3814 za->za_first_integer)); 3815 } 3816 zap_cursor_fini(&zc); 3817 kmem_free(za, sizeof (zap_attribute_t)); 3818 } 3819 dsl_dataset_rele(ds, FTAG); 3820 return (0); 3821 } 3822