/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/sunddi.h>

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_checkfunc_t dsl_dataset_rollback_check;
static dsl_syncfunc_t dsl_dataset_rollback_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)


/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer.  If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}
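
/*
 * Illustrative note (added): with a refreservation, the reserved space
 * is already charged to our ancestors, so only growth beyond the
 * reservation propagates up.  For example, with ds_reserved = 15 and
 * ds_unique_bytes = 10, a delta of +2 yields old = MAX(10, 15) = 15 and
 * new = MAX(12, 15) = 15, so the parent sees a delta of 0.  With
 * ds_unique_bytes = 14, the same +2 delta yields new = MAX(16, 15) = 16,
 * so the parent is charged only +1.
 */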

void
dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "born, ds=%p\n", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dsl_dir.
		 */
		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
		    used, compressed, uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_used_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx);
}

int
dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
    dmu_tx_t *tx)
{
	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(dmu_tx_is_syncing(tx));
	/* No block pointer => nothing to free */
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(used > 0);
	if (ds == NULL) {
		int err;
		/*
		 * Account for the meta-objset space in its placeholder
		 * dataset.
		 */
		err = dsl_free(pio, tx->tx_pool,
		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT : ARC_WAIT);
		ASSERT(err == 0);

		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
		    -used, -compressed, -uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int err;
		int64_t delta;

		dprintf_bp(bp, "freeing: %s", "");
		err = dsl_free(pio, tx->tx_pool,
		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT : ARC_WAIT);
		ASSERT(err == 0);

		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir,
		    delta, -compressed, -uncompressed, tx);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
	ds->ds_phys->ds_used_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}
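
/*
 * Explanatory note (added): dsl_dataset_block_kill() distinguishes two
 * cases by birth time.  A block born after ds_prev_snap_txg is not
 * referenced by any snapshot, so it is freed immediately and deducted
 * from ds_unique_bytes.  A block born at or before that txg is still
 * referenced by the most recent snapshot, so it cannot be freed yet;
 * instead it is appended to this dataset's deadlist, to be reclaimed
 * when the snapshots that reference it are destroyed.
 */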

uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase.  So this should only be used as a guess.
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

int
dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
{
	return (blk_birth > dsl_dataset_prev_snap_txg(ds));
}

/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	dprintf_ds(ds, "evicting %s\n", "");

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_user_ptr != NULL)
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);

	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_close(&ds->ds_deadlist);
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_opening_lock);
	mutex_destroy(&ds->ds_deadlist.bpl_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);
	return (err);
}
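
/*
 * Note (added): for case-insensitive (DS_FLAG_CI_DATASET) datasets the
 * snapshot-name ZAP is searched with MT_FIRST, which matches on the
 * name's normalized form.  zap_lookup_norm()/zap_remove_norm() return
 * ENOTSUP when the ZAP object does not support normalized matching, so
 * the helpers above fall back to a plain exact-match operation in that
 * case.
 */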

static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);
	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
		    NULL);
		rw_init(&ds->ds_rwlock, 0, 0, 0);
		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);

		err = bplist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);
		if (err == 0) {
			err = dsl_dir_open_obj(dp,
			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
		}
		if (err) {
			/*
			 * we don't really need to close the bplist if we
			 * just opened it.
			 */
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_opening_lock);
			mutex_destroy(&ds->ds_deadlist.bpl_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				err = dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}
		} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
			err = dsl_dataset_get_snapname(ds);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			/*
			 * In sync context, we're called with either no lock
			 * or with the write lock.  If we're not syncing,
			 * we're always called with the read lock held.
			 */
			boolean_t need_lock =
			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
			    dsl_pool_sync_context(dp);

			if (need_lock)
				rw_enter(&dp->dp_config_rwlock, RW_READER);

			err = dsl_prop_get_ds_locked(ds->ds_dir,
			    "refreservation", sizeof (uint64_t), 1,
			    &ds->ds_reserved, NULL);
			if (err == 0) {
				err = dsl_prop_get_ds_locked(ds->ds_dir,
				    "refquota", sizeof (uint64_t), 1,
				    &ds->ds_quota, NULL);
			}

			if (need_lock)
				rw_exit(&dp->dp_config_rwlock);
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err == 0) {
			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
			    dsl_dataset_evict);
		}
		if (err || winner) {
			bplist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_drop_ref(ds->ds_prev, ds);
			dsl_dir_close(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_opening_lock);
			mutex_destroy(&ds->ds_deadlist.bpl_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	mutex_enter(&ds->ds_lock);
	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
		mutex_exit(&ds->ds_lock);
		dmu_buf_rele(ds->ds_dbuf, tag);
		return (ENOENT);
	}
	mutex_exit(&ds->ds_lock);
	*dsp = ds;
	return (0);
}
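
/*
 * Note (added): dmu_buf_set_user_ie() attaches our dsl_dataset_t to the
 * bonus buffer atomically; if another thread raced us and attached its
 * own copy first, the call returns that existing user pointer (the
 * "winner").  The loser above tears down the structure it built and
 * proceeds with the winner's, so there is exactly one in-core
 * dsl_dataset_t per dataset object.
 */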

static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/*
	 * In syncing context we don't want the rwlock: there
	 * may be an existing writer waiting for sync phase to
	 * finish.  We don't need to worry about such writers, since
	 * sync phase is single-threaded, so the writer can't be
	 * doing anything while we are active.
	 */
	if (dsl_pool_sync_context(dp)) {
		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
		return (0);
	}

	/*
	 * Normal users will hold the ds_rwlock as a READER until they
	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
	 * drop their READER lock after they set the ds_owner field.
	 *
	 * If the dataset is being destroyed, the destroy thread will
	 * obtain a WRITER lock for exclusive access after it's done its
	 * open-context work and then change the ds_owner to
	 * dsl_reaper once destruction is assured.  So threads
	 * may block here temporarily, until the "destructability" of
	 * the dataset is determined.
	 */
	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
	mutex_enter(&ds->ds_lock);
	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
		rw_exit(&dp->dp_config_rwlock);
		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
		if (DSL_DATASET_IS_DESTROYED(ds)) {
			mutex_exit(&ds->ds_lock);
			dsl_dataset_drop_ref(ds, tag);
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			return (ENOENT);
		}
		rw_enter(&dp->dp_config_rwlock, RW_READER);
	}
	mutex_exit(&ds->ds_lock);
	return (0);
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);

	if (err)
		return (err);
	return (dsl_dataset_hold_ref(*dsp, tag));
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
    dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp);

	ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER);

	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
		dsl_dataset_rele(*dsp, owner);
		return (EBUSY);
	}
	return (0);
}
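
/*
 * Note (added): there are two levels of access here.  A "hold"
 * (dsl_dataset_hold*) is a shared reference: the bonus-buffer hold plus
 * ds_rwlock as READER, released by dsl_dataset_rele().  "Owning" a
 * dataset (dsl_dataset_own*) additionally claims the ds_owner field,
 * giving the caller exclusive rights to operations such as destroy and
 * rollback; it is released by dsl_dataset_disown().
 */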

int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj)
		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
	else
		err = ENOENT;
	if (err)
		goto out;

	err = dsl_dataset_hold_ref(*dsp, tag);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds = NULL;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			err = ENOENT;
			goto out;
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
		dsl_dataset_rele(*dsp, tag);

		ASSERT3U((err == 0), ==, (ds != NULL));

		if (ds) {
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			err = dsl_dataset_hold_ref(ds, tag);
			*dsp = err ? NULL : ds;
		}
	}
out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);
	return (err);
}

int
dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold(name, owner, dsp);
	if (err)
		return (err);
	if ((*dsp)->ds_phys->ds_num_children > 0 &&
	    !DS_MODE_IS_READONLY(flags)) {
		dsl_dataset_rele(*dsp, owner);
		return (EROFS);
	}
	if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
		dsl_dataset_rele(*dsp, owner);
		return (EBUSY);
	}
	return (0);
}

void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
	if (ds == NULL) {
		(void) strcpy(name, "mos");
	} else {
		dsl_dir_name(ds->ds_dir, name);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			(void) strcat(name, "@");
			/*
			 * We use a "recursive" mutex so that we
			 * can call dprintf_ds() with ds_lock held.
			 */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				(void) strcat(name, ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				(void) strcat(name, ds->ds_snapname);
			}
		}
	}
}

static int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
	int result;

	if (ds == NULL) {
		result = 3;	/* "mos" */
	} else {
		result = dsl_dir_namelen(ds->ds_dir);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			++result;	/* adding one for the @-sign */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				result += strlen(ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				result += strlen(ds->ds_snapname);
			}
		}
	}

	return (result);
}

void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
		rw_exit(&ds->ds_rwlock);
	}
	dsl_dataset_drop_ref(ds, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *owner)
{
	ASSERT((ds->ds_owner == owner && ds->ds_dbuf) ||
	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
		rw_exit(&ds->ds_rwlock);
		cv_broadcast(&ds->ds_exclusive_cv);
	}
	mutex_exit(&ds->ds_lock);
	if (ds->ds_dbuf)
		dsl_dataset_drop_ref(ds, owner);
	else
		dsl_dataset_evict(ds->ds_dbuf, ds);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL &&
	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
		ds->ds_owner = owner;
		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
			rw_exit(&ds->ds_rwlock);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}

void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
	ASSERT3P(owner, ==, ds->ds_owner);
	if (!RW_WRITE_HELD(&ds->ds_rwlock))
		rw_enter(&ds->ds_rwlock, RW_WRITER);
}

uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
	dsphys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);

	if (origin) {
		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_used_bytes =
		    origin->ds_phys->ds_used_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;
		dsphys->ds_flags |= origin->ds_phys->ds_flags;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			if (origin->ds_phys->ds_next_clones_obj == 0) {
				origin->ds_phys->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY(0 == zap_add_int(mos,
			    origin->ds_phys->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

	dsl_deleg_set_create_perms(dd, tx, cr);

	dsl_dir_close(dd, FTAG);

	return (dsobj);
}
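
/*
 * Note (added): when an origin snapshot is supplied (or defaulted to
 * dp_origin_snap), the new dataset is a clone: it starts with the
 * origin's root block pointer and space totals, and the origin's
 * ds_num_children count and next-clones ZAP are updated to record the
 * new dependent.  With no origin, the dataset starts out empty.
 */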

struct destroyarg {
	dsl_sync_task_group_t *dstg;
	char *snapname;
	char *failed;
};

static int
dsl_snapshot_destroy_one(char *name, void *arg)
{
	struct destroyarg *da = arg;
	dsl_dataset_t *ds;
	char *cp;
	int err;

	(void) strcat(name, "@");
	(void) strcat(name, da->snapname);
	err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
	    da->dstg, &ds);
	cp = strchr(name, '@');
	*cp = '\0';
	if (err == 0) {
		dsl_dataset_make_exclusive(ds, da->dstg);
		dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, ds, da->dstg, 0);
	} else if (err == ENOENT) {
		err = 0;
	} else {
		(void) strcpy(da->failed, name);
	}
	return (err);
}

/*
 * Destroy 'snapname' in all descendants of 'fsname'.
 */
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
dsl_snapshots_destroy(char *fsname, char *snapname)
{
	int err;
	struct destroyarg da;
	dsl_sync_task_t *dst;
	spa_t *spa;

	err = spa_open(fsname, &spa, FTAG);
	if (err)
		return (err);
	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	da.snapname = snapname;
	da.failed = fsname;

	err = dmu_objset_find(fsname,
	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);

	if (err == 0)
		err = dsl_sync_task_group_wait(da.dstg);

	for (dst = list_head(&da.dstg->dstg_tasks); dst;
	    dst = list_next(&da.dstg->dstg_tasks, dst)) {
		dsl_dataset_t *ds = dst->dst_arg1;
		/*
		 * Return the file system name that triggered the error
		 */
		if (dst->dst_err) {
			dsl_dataset_name(ds, fsname);
			*strchr(fsname, '@') = '\0';
		}
		dsl_dataset_disown(ds, da.dstg);
	}

	dsl_sync_task_group_destroy(da.dstg);
	spa_close(spa, FTAG);
	return (err);
}
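
/*
 * Note (added): grouping the per-dataset destroys into a single
 * dsl_sync_task_group makes the recursive snapshot destroy atomic at
 * the txg level: all of the check functions run first, and the sync
 * functions run, in the same transaction group, only if every check
 * succeeds.
 */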

/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		dsl_dataset_make_exclusive(ds, tag);
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    ds, tag, 0);
		goto out;
	}

	dd = ds->ds_dir;

	/*
	 * Check for errors and mark this ds as inconsistent, in
	 * case we crash while freeing the objects.
	 */
	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
	if (err)
		goto out;

	err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
	if (err)
		goto out;

	/*
	 * remove the objects in open context, so that we won't
	 * have too much to do in syncing context.
	 */
	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
	    ds->ds_phys->ds_prev_snap_txg)) {
		/*
		 * Ignore errors, if there is not enough disk space
		 * we will deal with it in dsl_dataset_destroy_sync().
		 */
		(void) dmu_free_object(os, obj);
	}

	dmu_objset_close(os);
	if (err != ESRCH)
		goto out;

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	if (ds->ds_user_ptr) {
		/*
		 * We need to sync out all in-flight IO before we try
		 * to evict (the dataset evict func is trying to clear
		 * the cached entries for this dataset in the ARC).
		 */
		txg_wait_synced(dd->dd_pool, 0);
	}

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dsl_dataset_make_exclusive(ds, tag);
	if (ds->ds_user_ptr) {
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
		ds->ds_user_ptr = NULL;
	}
	dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
	dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
	    dsl_dataset_destroy_sync, ds, tag, 0);
	dsl_sync_task_create(dstg, dsl_dir_destroy_check,
	    dsl_dir_destroy_sync, dd, FTAG, 0);
	err = dsl_sync_task_group_wait(dstg);
	dsl_sync_task_group_destroy(dstg);
	/* if it is successful, dsl_dir_destroy_sync will close the dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	dsl_dataset_disown(ds, tag);
	return (err);
}

int
dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
{
	ASSERT(ds->ds_owner);

	return (dsl_sync_task_do(ds->ds_dir->dd_pool,
	    dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
	    ds, &ost, 0));
}

void *
dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
    void *p, dsl_dataset_evict_func_t func)
{
	void *old;

	mutex_enter(&ds->ds_lock);
	old = ds->ds_user_ptr;
	if (old == NULL) {
		ds->ds_user_ptr = p;
		ds->ds_user_evict_func = func;
	}
	mutex_exit(&ds->ds_lock);
	return (old);
}

void *
dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
{
	return (ds->ds_user_ptr);
}
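
/*
 * Note (added): in practice ds_user_ptr holds the objset_impl_t for an
 * open dataset (see the cast in dsl_dataset_rollback_sync() below), and
 * ds_user_evict_func tears it down when the dataset is evicted or
 * destroyed.
 */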

blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&ds->ds_phys->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	/* If it's the meta-objset, set dp_meta_rootbp */
	if (ds == NULL) {
		tx->tx_pool->dp_meta_rootbp = *bp;
	} else {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_bp = *bp;
	}
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_user_ptr != NULL);

	if (ds->ds_phys->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot, that is still being used
 * in this file system, from the space currently in use.  To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);

	if (ds->ds_phys->ds_prev_snap_obj != 0)
		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
	else
		mrs_used = 0;

	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
	    &dluncomp));

	ASSERT3U(dlused, <=, mrs_used);
	ds->ds_phys->ds_unique_bytes =
	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);

	if (!DS_UNIQUE_IS_ACCURATE(ds) &&
	    spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}

static uint64_t
dsl_dataset_unique(dsl_dataset_t *ds)
{
	if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
		dsl_dataset_recalc_head_uniq(ds);

	return (ds->ds_phys->ds_unique_bytes);
}

struct killarg {
	int64_t *usedp;
	int64_t *compressedp;
	int64_t *uncompressedp;
	zio_t *zio;
	dmu_tx_t *tx;
};

static int
kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
	struct killarg *ka = arg;
	blkptr_t *bp = &bc->bc_blkptr;

	ASSERT3U(bc->bc_errno, ==, 0);

	/*
	 * Since this callback is not called concurrently, no lock is
	 * needed on the accounting values.
	 */
	*ka->usedp += bp_get_dasize(spa, bp);
	*ka->compressedp += BP_GET_PSIZE(bp);
	*ka->uncompressedp += BP_GET_UCSIZE(bp);
	/* XXX check for EIO? */
	(void) dsl_free(ka->zio, spa_get_dsl(spa), ka->tx->tx_txg,
	    bp, NULL, NULL, ARC_NOWAIT);
	return (0);
}
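
/*
 * Illustrative note (added), using made-up numbers: if the head
 * references 100M (ds_used_bytes), the most recent snapshot referenced
 * 80M (mrs_used), and 30M of blocks have been freed from the head since
 * that snapshot (and so sit on the head's deadlist, dlused), then the
 * snapshot still shares 80M - 30M = 50M with the head, and the head's
 * unique space is 100M - 50M = 50M.
 */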

/* ARGSUSED */
static int
dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dmu_objset_type_t *ost = arg2;

	/*
	 * We can only roll back to emptiness if it is a ZPL objset.
	 */
	if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
		return (EINVAL);

	/*
	 * This must not be a snapshot.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0)
		return (EINVAL);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dmu_objset_type_t *ost = arg2;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	/*
	 * Before the roll back destroy the zil.
	 */
	if (ds->ds_user_ptr != NULL) {
		zil_rollback_destroy(
		    ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);

		/*
		 * We need to make sure that the objset_impl_t is reopened
		 * after we do the rollback, otherwise it will have the
		 * wrong objset_phys_t.  Normally this would happen when
		 * this dataset-open is closed, thus causing the
		 * dataset to be immediately evicted.  But when doing "zfs
		 * recv -F", we reopen the objset before that, so that
		 * there is no window where the dataset is closed and
		 * inconsistent.
		 */
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
		ds->ds_user_ptr = NULL;
	}

	/* Zero out the deadlist. */
	bplist_close(&ds->ds_deadlist);
	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
	    ds->ds_phys->ds_deadlist_obj));

	{
		/* Free blkptrs that we gave birth to */
		zio_t *zio;
		int64_t used = 0, compressed = 0, uncompressed = 0;
		struct killarg ka;
		int64_t delta;

		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED);
		ka.usedp = &used;
		ka.compressedp = &compressed;
		ka.uncompressedp = &uncompressed;
		ka.zio = zio;
		ka.tx = tx;
		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    ADVANCE_POST, kill_blkptr, &ka);
		(void) zio_wait(zio);

		/* only deduct space beyond any refreservation */
		delta = parent_delta(ds, -used);
		dsl_dir_diduse_space(ds->ds_dir,
		    delta, -compressed, -uncompressed, tx);
	}

	if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
		/* Change our contents to that of the prev snapshot */
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
		ds->ds_phys->ds_used_bytes =
		    ds->ds_prev->ds_phys->ds_used_bytes;
		ds->ds_phys->ds_compressed_bytes =
		    ds->ds_prev->ds_phys->ds_compressed_bytes;
		ds->ds_phys->ds_uncompressed_bytes =
		    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
		ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
		ds->ds_phys->ds_unique_bytes = 0;

		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			ds->ds_prev->ds_phys->ds_unique_bytes = 0;
		}
	} else {
		objset_impl_t *osi;

		/* Zero out our contents, recreate objset */
		bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
		ds->ds_phys->ds_used_bytes = 0;
		ds->ds_phys->ds_compressed_bytes = 0;
		ds->ds_phys->ds_uncompressed_bytes = 0;
		ds->ds_phys->ds_flags = 0;
		ds->ds_phys->ds_unique_bytes = 0;
		osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
		    &ds->ds_phys->ds_bp, *ost, tx);
#ifdef _KERNEL
		zfs_create_fs(&osi->os, kcred, NULL, tx);
#endif
	}

	spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
	    tx, cr, "dataset = %llu", ds->ds_object);
}

/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EINVAL);

	/*
	 * This is really a dsl_dir thing, but check it here so that
	 * we'll be less likely to leave this dataset inconsistent &
	 * nearly destroyed.
	 */
	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
	    cr, "dataset = %llu", ds->ds_object);
}

/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;

	/* we have an owner hold, so no one else can destroy us */
	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

	/* Can't delete a branch point. */
	if (ds->ds_phys->ds_num_children > 1)
		return (EEXIST);

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EINVAL);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	/* XXX we should do some i/o error checking... */
	return (0);
}

struct refsarg {
	kmutex_t lock;
	boolean_t gone;
	kcondvar_t cv;
};

/* ARGSUSED */
static void
dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
{
	struct refsarg *arg = argv;

	mutex_enter(&arg->lock);
	arg->gone = TRUE;
	cv_signal(&arg->cv);
	mutex_exit(&arg->lock);
}

static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
	struct refsarg arg;

	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
	arg.gone = FALSE;
	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
	    dsl_dataset_refs_gone);
	dmu_buf_rele(ds->ds_dbuf, tag);
	mutex_enter(&arg.lock);
	while (!arg.gone)
		cv_wait(&arg.cv, &arg.lock);
	ASSERT(arg.gone);
	mutex_exit(&arg.lock);
	ds->ds_dbuf = NULL;
	ds->ds_phys = NULL;
	mutex_destroy(&arg.lock);
	cv_destroy(&arg.cv);
}
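
/*
 * Note (added): dsl_dataset_drain_refs() swaps the dbuf's user pointer
 * from the dataset to a stack-allocated refsarg, so that when the last
 * hold on the bonus buffer is released the eviction callback fires
 * dsl_dataset_refs_gone() instead of dsl_dataset_evict().  The caller
 * then blocks on the condvar until every remaining reference has
 * drained before the dataset object is freed in
 * dsl_dataset_destroy_sync().
 */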

void
dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	int64_t used = 0, compressed = 0, uncompressed = 0;
	zio_t *zio;
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	uint64_t obj;

	ASSERT(ds->ds_owner);
	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
	ASSERT(ds->ds_prev == NULL ||
	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);

	/* signal any waiters that this dataset is going away */
	mutex_enter(&ds->ds_lock);
	ds->ds_owner = dsl_reaper;
	cv_broadcast(&ds->ds_exclusive_cv);
	mutex_exit(&ds->ds_lock);

	/* Remove our reservation */
	if (ds->ds_reserved != 0) {
		uint64_t val = 0;
		dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
		ASSERT3U(ds->ds_reserved, ==, 0);
	}

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	dsl_pool_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		if (ds->ds_prev) {
			ds_prev = ds->ds_prev;
		} else {
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
		}
		after_branch_point =
		    (ds_prev->ds_phys->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
			VERIFY(0 == zap_remove_int(mos,
			    ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
			if (ds->ds_phys->ds_next_snap_obj != 0) {
				VERIFY(0 == zap_add_int(mos,
				    ds_prev->ds_phys->ds_next_clones_obj,
				    ds->ds_phys->ds_next_snap_obj, tx));
			}
		}
		if (after_branch_point &&
		    ds->ds_phys->ds_next_snap_obj == 0) {
			/* This clone is toast. */
			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
			ds_prev->ds_phys->ds_num_children--;
		} else if (!after_branch_point) {
			ds_prev->ds_phys->ds_next_snap_obj =
			    ds->ds_phys->ds_next_snap_obj;
		}
	}

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);

	if (ds->ds_phys->ds_next_snap_obj != 0) {
		blkptr_t bp;
		dsl_dataset_t *ds_next;
		uint64_t itor = 0;
		uint64_t old_unique;

		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

		old_unique = dsl_dataset_unique(ds_next);

		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
		ds_next->ds_phys->ds_prev_snap_obj =
		    ds->ds_phys->ds_prev_snap_obj;
		ds_next->ds_phys->ds_prev_snap_txg =
		    ds->ds_phys->ds_prev_snap_txg;
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

		/*
		 * Transfer to our deadlist (which will become next's
		 * new deadlist) any entries from next's current
		 * deadlist which were born before prev, and free the
		 * other entries.
		 *
		 * XXX we're doing this long task with the config lock held
		 */
		while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
				    &bp, tx));
				if (ds_prev && !after_branch_point &&
				    bp.blk_birth >
				    ds_prev->ds_phys->ds_prev_snap_txg) {
					ds_prev->ds_phys->ds_unique_bytes +=
					    bp_get_dasize(dp->dp_spa, &bp);
				}
			} else {
				used += bp_get_dasize(dp->dp_spa, &bp);
				compressed += BP_GET_PSIZE(&bp);
				uncompressed += BP_GET_UCSIZE(&bp);
				/* XXX check return value? */
				(void) dsl_free(zio, dp, tx->tx_txg,
				    &bp, NULL, NULL, ARC_NOWAIT);
			}
		}

		/* free next's deadlist */
		bplist_close(&ds_next->ds_deadlist);
		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);

		/* set next's deadlist to our deadlist */
		bplist_close(&ds->ds_deadlist);
		ds_next->ds_phys->ds_deadlist_obj =
		    ds->ds_phys->ds_deadlist_obj;
		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
		    ds_next->ds_phys->ds_deadlist_obj));
		ds->ds_phys->ds_deadlist_obj = 0;

		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
			/*
			 * Update next's unique to include blocks which
			 * were previously shared by only this snapshot
			 * and it.  Those blocks will be born after the
			 * prev snap and before this snap, and will have
			 * died after the next snap and before the one
			 * after that (i.e. be on the snap-after-next's
			 * deadlist).
			 *
			 * XXX we're doing this long task with the
			 * config lock held
			 */
			dsl_dataset_t *ds_after_next;

			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds_next->ds_phys->ds_next_snap_obj,
			    FTAG, &ds_after_next));
			itor = 0;
			while (bplist_iterate(&ds_after_next->ds_deadlist,
			    &itor, &bp) == 0) {
				if (bp.blk_birth >
				    ds->ds_phys->ds_prev_snap_txg &&
				    bp.blk_birth <=
				    ds->ds_phys->ds_creation_txg) {
					ds_next->ds_phys->ds_unique_bytes +=
					    bp_get_dasize(dp->dp_spa, &bp);
				}
			}

			dsl_dataset_rele(ds_after_next, FTAG);
			ASSERT3P(ds_next->ds_prev, ==, NULL);
		} else {
			ASSERT3P(ds_next->ds_prev, ==, ds);
			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
			ds_next->ds_prev = NULL;
			if (ds_prev) {
				VERIFY(0 == dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds_next, &ds_next->ds_prev));
			}

			dsl_dataset_recalc_head_uniq(ds_next);

			/*
			 * Reduce the amount of our unconsumed
			 * refreservation being charged to our parent by
			 * the amount of new unique data we have gained.
			 */
			if (old_unique < ds_next->ds_reserved) {
				int64_t mrsdelta;
				uint64_t new_unique =
				    ds_next->ds_phys->ds_unique_bytes;

				ASSERT(old_unique <= new_unique);
				mrsdelta = MIN(new_unique - old_unique,
				    ds_next->ds_reserved - old_unique);
				dsl_dir_diduse_space(ds->ds_dir, -mrsdelta,
				    0, 0, tx);
			}
		}
		dsl_dataset_rele(ds_next, FTAG);

		/*
		 * NB: unique_bytes might not be accurate for the head objset.
		 * Before SPA_VERSION 9, we didn't update its value when we
		 * deleted the most recent snapshot.
		 */
		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
	} else {
		/*
		 * There's no next snapshot, so this is a head dataset.
		 * Destroy the deadlist.  Unless it's a clone, the
		 * deadlist should be empty.  (If it's a clone, it's
		 * safe to ignore the deadlist contents.)
		 */
		struct killarg ka;

		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
		bplist_close(&ds->ds_deadlist);
		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
		ds->ds_phys->ds_deadlist_obj = 0;

		/*
		 * Free everything that we point to (that's born after
		 * the previous snapshot, if we are a clone)
		 *
		 * XXX we're doing this long task with the config lock held
		 */
		ka.usedp = &used;
		ka.compressedp = &compressed;
		ka.uncompressedp = &uncompressed;
		ka.zio = zio;
		ka.tx = tx;
		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    ADVANCE_POST, kill_blkptr, &ka);
		ASSERT3U(err, ==, 0);
		ASSERT(spa_version(dp->dp_spa) <
		    SPA_VERSION_UNIQUE_ACCURATE ||
		    used == ds->ds_phys->ds_unique_bytes);
	}

	err = zio_wait(zio);
	ASSERT3U(err, ==, 0);

	dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);

	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
		/* Erase the link in the dir */
		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
		ASSERT(err == 0);
	} else {
		/* remove from snapshot namespace */
		dsl_dataset_t *ds_head;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
		VERIFY(0 == dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
		{
			uint64_t val;

			err = dsl_dataset_snap_lookup(ds_head,
			    ds->ds_snapname, &val);
			ASSERT3U(err, ==, 0);
			ASSERT3U(val, ==, obj);
		}
#endif
		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
		ASSERT(err == 0);
		dsl_dataset_rele(ds_head, FTAG);
	}

	if (ds_prev && ds->ds_prev != ds_prev)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
	spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
	    cr, "dataset = %llu", ds->ds_object);

	if (ds->ds_phys->ds_next_clones_obj != 0) {
		uint64_t count;
		ASSERT(0 == zap_count(mos,
		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
		VERIFY(0 == dmu_object_free(mos,
		    ds->ds_phys->ds_next_clones_obj, tx));
	}
	dsl_dir_close(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dsl_dataset_drain_refs(ds, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));
}

static int
dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t asize;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * If there's an fs-only reservation, any blocks that might become
	 * owned by the snapshot dataset must be accommodated by space
	 * outside of the reservation.
	 */
	asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
		return (ENOSPC);

	/*
	 * Propagate any reserved space for this snapshot to other
	 * snapshot checks in this sync group.
	 */
	if (asize > 0)
		dsl_dir_willuse_space(ds->ds_dir, asize, tx);

	return (0);
}
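
/*
 * Illustrative note (added), with made-up numbers: if a dataset has
 * refreservation=10G and 4G of unique data, snapshotting it moves up to
 * MIN(4G, 10G) = 4G of blocks into the snapshot's ownership while the
 * head must still be able to dirty its full 10G reservation, so that 4G
 * must be available outside the reservation for the snapshot to be
 * allowed.
 */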
1693 */ 1694 if (asize > 0) 1695 dsl_dir_willuse_space(ds->ds_dir, asize, tx); 1696 1697 return (0); 1698 } 1699 1700 /* ARGSUSED */ 1701 int 1702 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) 1703 { 1704 dsl_dataset_t *ds = arg1; 1705 const char *snapname = arg2; 1706 int err; 1707 uint64_t value; 1708 1709 /* 1710 * We don't allow multiple snapshots of the same txg. If there 1711 * is already one, try again. 1712 */ 1713 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) 1714 return (EAGAIN); 1715 1716 /* 1717 * Check for conflicting name snapshot name. 1718 */ 1719 err = dsl_dataset_snap_lookup(ds, snapname, &value); 1720 if (err == 0) 1721 return (EEXIST); 1722 if (err != ENOENT) 1723 return (err); 1724 1725 /* 1726 * Check that the dataset's name is not too long. Name consists 1727 * of the dataset's length + 1 for the @-sign + snapshot name's length 1728 */ 1729 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 1730 return (ENAMETOOLONG); 1731 1732 err = dsl_dataset_snapshot_reserve_space(ds, tx); 1733 if (err) 1734 return (err); 1735 1736 ds->ds_trysnap_txg = tx->tx_txg; 1737 return (0); 1738 } 1739 1740 void 1741 dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 1742 { 1743 dsl_dataset_t *ds = arg1; 1744 const char *snapname = arg2; 1745 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1746 dmu_buf_t *dbuf; 1747 dsl_dataset_phys_t *dsphys; 1748 uint64_t dsobj, crtxg; 1749 objset_t *mos = dp->dp_meta_objset; 1750 int err; 1751 1752 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 1753 1754 /* 1755 * The origin's ds_creation_txg has to be < TXG_INITIAL 1756 */ 1757 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 1758 crtxg = 1; 1759 else 1760 crtxg = tx->tx_txg; 1761 1762 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 1763 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 1764 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 1765 dmu_buf_will_dirty(dbuf, tx); 1766 dsphys = dbuf->db_data; 1767 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 1768 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 1769 dsphys->ds_fsid_guid = unique_create(); 1770 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 1771 sizeof (dsphys->ds_guid)); 1772 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 1773 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 1774 dsphys->ds_next_snap_obj = ds->ds_object; 1775 dsphys->ds_num_children = 1; 1776 dsphys->ds_creation_time = gethrestime_sec(); 1777 dsphys->ds_creation_txg = crtxg; 1778 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 1779 dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; 1780 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 1781 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 1782 dsphys->ds_flags = ds->ds_phys->ds_flags; 1783 dsphys->ds_bp = ds->ds_phys->ds_bp; 1784 dmu_buf_rele(dbuf, FTAG); 1785 1786 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 1787 if (ds->ds_prev) { 1788 uint64_t next_clones_obj = 1789 ds->ds_prev->ds_phys->ds_next_clones_obj; 1790 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == 1791 ds->ds_object || 1792 ds->ds_prev->ds_phys->ds_num_children > 1); 1793 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 1794 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 1795 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 1796 ds->ds_prev->ds_phys->ds_creation_txg); 1797 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; 1798 } else if (next_clones_obj != 0) { 1799 VERIFY3U(0, ==, zap_remove_int(mos, 1800 

void
dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	const char *snapname = arg2;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj, crtxg;
	objset_t *mos = dp->dp_meta_objset;
	int err;

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	/*
	 * The origin's ds_creation_txg has to be < TXG_INITIAL
	 */
	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
		crtxg = 1;
	else
		crtxg = tx->tx_txg;

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
	dsphys->ds_next_snap_obj = ds->ds_object;
	dsphys->ds_num_children = 1;
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = crtxg;
	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
	dsphys->ds_flags = ds->ds_phys->ds_flags;
	dsphys->ds_bp = ds->ds_phys->ds_bp;
	dmu_buf_rele(dbuf, FTAG);

	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
	if (ds->ds_prev) {
		uint64_t next_clones_obj =
		    ds->ds_prev->ds_phys->ds_next_clones_obj;
		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object ||
		    ds->ds_prev->ds_phys->ds_num_children > 1);
		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
			    ds->ds_prev->ds_phys->ds_creation_txg);
			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
		} else if (next_clones_obj != 0) {
			VERIFY3U(0, ==, zap_remove_int(mos,
			    next_clones_obj, dsphys->ds_next_snap_obj, tx));
			VERIFY3U(0, ==, zap_add_int(mos,
			    next_clones_obj, dsobj, tx));
		}
	}

	/*
	 * If we have a reference-reservation on this dataset, we will
	 * need to increase the amount of refreservation being charged
	 * since our unique space is going to zero.
	 */
	if (ds->ds_reserved) {
		int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
		dsl_dir_diduse_space(ds->ds_dir, add, 0, 0, tx);
	}

	bplist_close(&ds->ds_deadlist);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
	ds->ds_phys->ds_prev_snap_obj = dsobj;
	ds->ds_phys->ds_prev_snap_txg = crtxg;
	ds->ds_phys->ds_unique_bytes = 0;
	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
	    ds->ds_phys->ds_deadlist_obj));

	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
	    snapname, 8, 1, &dsobj, tx);
	ASSERT(err == 0);

	if (ds->ds_prev)
		dsl_dataset_drop_ref(ds->ds_prev, ds);
	VERIFY(0 == dsl_dataset_get_ref(dp,
	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));

	dsl_pool_ds_snapshotted(ds, tx);

	spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
	    "dataset = %llu", dsobj);
}
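
/*
 * Note (added): the new snapshot is spliced in between the previous
 * snapshot and the head: it inherits the head's block pointer, space
 * totals, and deadlist, while the head gets a fresh empty deadlist and
 * has its ds_unique_bytes reset to zero (everything the head references
 * is now shared with the snapshot just taken).
 */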

void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(ds->ds_user_ptr != NULL);
	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);

	/*
	 * in case we had to change ds_fsid_guid when we opened it,
	 * sync it out now.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;

	dsl_dir_dirty(ds->ds_dir, tx);
	dmu_objset_sync(ds->ds_user_ptr, zio, tx);
}

void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
	uint64_t refd, avail, uobjs, aobjs;

	dsl_dir_stats(ds->ds_dir, nv);

	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
	    ds->ds_phys->ds_creation_time);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
	    ds->ds_phys->ds_creation_txg);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
	    ds->ds_quota);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
	    ds->ds_reserved);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
	    ds->ds_phys->ds_guid);

	if (ds->ds_phys->ds_next_snap_obj) {
		/*
		 * This is a snapshot; override the dd's space used with
		 * our unique space and compression ratio.
		 */
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
		    ds->ds_phys->ds_unique_bytes);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
		    ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
		    (ds->ds_phys->ds_uncompressed_bytes * 100 /
		    ds->ds_phys->ds_compressed_bytes));
	}
}

void
dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
{
	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
	stat->dds_guid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_next_snap_obj) {
		stat->dds_is_snapshot = B_TRUE;
		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
	}

	/* clone origin is really a dsl_dir thing... */
	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
	if (dsl_dir_is_clone(ds->ds_dir)) {
		dsl_dataset_t *ods;

		VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
		dsl_dataset_name(ods, stat->dds_origin);
		dsl_dataset_drop_ref(ods, FTAG);
	}
	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
}

uint64_t
dsl_dataset_fsid_guid(dsl_dataset_t *ds)
{
	return (ds->ds_fsid_guid);
}

void
dsl_dataset_space(dsl_dataset_t *ds,
    uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	*refdbytesp = ds->ds_phys->ds_used_bytes;
	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
	if (ds->ds_quota != 0) {
		/*
		 * Adjust available bytes according to refquota
		 */
		if (*refdbytesp < ds->ds_quota)
			*availbytesp = MIN(*availbytesp,
			    ds->ds_quota - *refdbytesp);
		else
			*availbytesp = 0;
	}
	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}

boolean_t
dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));
	if (ds->ds_prev == NULL)
		return (B_FALSE);
	if (ds->ds_phys->ds_bp.blk_birth >
	    ds->ds_prev->ds_phys->ds_creation_txg)
		return (B_TRUE);
	return (B_FALSE);
}

/* ARGSUSED */
static int
dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	char *newsnapname = arg2;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_dataset_t *hds;
	uint64_t val;
	int err;

	err = dsl_dataset_hold_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
	if (err)
		return (err);

	/* new name better not be in use */
	err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
	dsl_dataset_rele(hds, FTAG);

	if (err == 0)
		err = EEXIST;
	else if (err == ENOENT)
		err = 0;

	/* dataset name + 1 for the "@" + the new snapshot name must fit */
	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
		err = ENAMETOOLONG;

	return (err);
}
boolean_t
dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));
	if (ds->ds_prev == NULL)
		return (B_FALSE);
	if (ds->ds_phys->ds_bp.blk_birth >
	    ds->ds_prev->ds_phys->ds_creation_txg)
		return (B_TRUE);
	return (B_FALSE);
}

/* ARGSUSED */
static int
dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	char *newsnapname = arg2;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_dataset_t *hds;
	uint64_t val;
	int err;

	err = dsl_dataset_hold_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
	if (err)
		return (err);

	/* new name better not be in use */
	err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
	dsl_dataset_rele(hds, FTAG);

	if (err == 0)
		err = EEXIST;
	else if (err == ENOENT)
		err = 0;

	/* dataset name + 1 for the "@" + the new snapshot name must fit */
	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
		err = ENAMETOOLONG;

	return (err);
}

static void
dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
    cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	const char *newsnapname = arg2;
	dsl_dir_t *dd = ds->ds_dir;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dsl_dataset_t *hds;
	int err;

	ASSERT(ds->ds_phys->ds_next_snap_obj != 0);

	VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));

	VERIFY(0 == dsl_dataset_get_snapname(ds));
	err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
	ASSERT3U(err, ==, 0);
	mutex_enter(&ds->ds_lock);
	(void) strcpy(ds->ds_snapname, newsnapname);
	mutex_exit(&ds->ds_lock);
	err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
	ASSERT3U(err, ==, 0);

	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
	    cr, "dataset = %llu", ds->ds_object);
	dsl_dataset_rele(hds, FTAG);
}

struct renamesnaparg {
	dsl_sync_task_group_t *dstg;
	char failed[MAXPATHLEN];
	char *oldsnap;
	char *newsnap;
};

static int
dsl_snapshot_rename_one(char *name, void *arg)
{
	struct renamesnaparg *ra = arg;
	dsl_dataset_t *ds = NULL;
	char *cp;
	int err;

	cp = name + strlen(name);
	*cp = '@';
	(void) strcpy(cp + 1, ra->oldsnap);

	/*
	 * For recursive snapshot renames the parent won't be changing,
	 * so we just pass name for both the to/from arguments.
	 */
	if (err = zfs_secpolicy_rename_perms(name, name, CRED())) {
		(void) strcpy(ra->failed, name);
		return (err);
	}

#ifdef _KERNEL
	/*
	 * Each filesystem undergoing rename must first be unmounted.
	 */
	(void) zfs_unmount_snap(name, NULL);
#endif
	err = dsl_dataset_hold(name, ra->dstg, &ds);
	*cp = '\0';
	if (err == ENOENT) {
		return (0);
	} else if (err) {
		(void) strcpy(ra->failed, name);
		return (err);
	}

	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);

	return (0);
}

static int
dsl_recursive_rename(char *oldname, const char *newname)
{
	int err;
	struct renamesnaparg *ra;
	dsl_sync_task_t *dst;
	spa_t *spa;
	char *cp, *fsname = spa_strdup(oldname);
	int len = strlen(oldname);

	/* truncate the snapshot name to get the fsname */
	cp = strchr(fsname, '@');
	*cp = '\0';

	err = spa_open(fsname, &spa, FTAG);
	if (err) {
		kmem_free(fsname, len + 1);
		return (err);
	}
	ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

	ra->oldsnap = strchr(oldname, '@') + 1;
	ra->newsnap = strchr(newname, '@') + 1;
	*ra->failed = '\0';

	err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
	    DS_FIND_CHILDREN);
	kmem_free(fsname, len + 1);

	if (err == 0) {
		err = dsl_sync_task_group_wait(ra->dstg);
	}

	for (dst = list_head(&ra->dstg->dstg_tasks); dst;
	    dst = list_next(&ra->dstg->dstg_tasks, dst)) {
		dsl_dataset_t *ds = dst->dst_arg1;
		if (dst->dst_err) {
			dsl_dir_name(ds->ds_dir, ra->failed);
			(void) strcat(ra->failed, "@");
			(void) strcat(ra->failed, ra->newsnap);
		}
		dsl_dataset_rele(ds, ra->dstg);
	}

	if (err)
		(void) strcpy(oldname, ra->failed);

	dsl_sync_task_group_destroy(ra->dstg);
	kmem_free(ra, sizeof (struct renamesnaparg));
	spa_close(spa, FTAG);
	return (err);
}

static int
dsl_valid_rename(char *oldname, void *arg)
{
	int delta = *(int *)arg;

	if (strlen(oldname) + delta >= MAXNAMELEN)
		return (ENAMETOOLONG);

	return (0);
}
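/*
 * Illustrative note (not part of the original source): dmu_objset_find()
 * hands dsl_snapshot_rename_one() just the filesystem portion of the
 * name, and the callback splices the snapshot component on in place,
 * e.g. for oldsnap "yesterday":
 *
 *	name: "tank/home\0..."  ->  "tank/home@yesterday\0"
 *
 * It then restores the '\0' at the '@' once the hold has been taken, so
 * the same buffer can be reused for the next child.  The buffer must
 * therefore have room for the longest possible snapshot suffix.
 */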
#pragma weak dmu_objset_rename = dsl_dataset_rename
int
dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
{
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	const char *tail;
	int err;

	err = dsl_dir_open(oldname, FTAG, &dd, &tail);
	if (err)
		return (err);
	if (tail == NULL) {
		int delta = strlen(newname) - strlen(oldname);

		/* if we're growing, validate child name lengths */
		if (delta > 0)
			err = dmu_objset_find(oldname, dsl_valid_rename,
			    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);

		if (!err)
			err = dsl_dir_rename(dd, newname);
		dsl_dir_close(dd, FTAG);
		return (err);
	}
	if (tail[0] != '@') {
		/* the name ended in a nonexistent component */
		dsl_dir_close(dd, FTAG);
		return (ENOENT);
	}

	dsl_dir_close(dd, FTAG);

	/* the new name must be a snapshot in the same filesystem */
	tail = strchr(newname, '@');
	if (tail == NULL)
		return (EINVAL);
	tail++;
	if (strncmp(oldname, newname, tail - newname) != 0)
		return (EXDEV);

	if (recursive) {
		err = dsl_recursive_rename(oldname, newname);
	} else {
		err = dsl_dataset_hold(oldname, FTAG, &ds);
		if (err)
			return (err);

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_snapshot_rename_check,
		    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);

		dsl_dataset_rele(ds, FTAG);
	}

	return (err);
}
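/*
 * Illustrative sketch (not part of the original source): renaming a
 * snapshot with the function above.  Both names must refer to the same
 * filesystem; only the component after '@' may differ, otherwise the
 * call fails with EXDEV.  The dataset names here are hypothetical.
 */
#if 0	/* example only; not compiled */
static void
example_rename_snapshot(void)
{
	char oldname[MAXNAMELEN] = "tank/home@monday";

	/* renames tank/home@monday to tank/home@tuesday */
	(void) dsl_dataset_rename(oldname, "tank/home@tuesday", B_FALSE);

	/* fails with EXDEV: the new name is in a different filesystem */
	(void) dsl_dataset_rename(oldname, "tank/other@tuesday", B_FALSE);
}
#endif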
struct promotenode {
	list_node_t link;
	dsl_dataset_t *ds;
};

struct promotearg {
	list_t snap_list;
	dsl_dataset_t *clone_origin, *old_head;
	uint64_t used, comp, uncomp, unique;
	uint64_t newnext_obj;
};

/* ARGSUSED */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *hds = arg1;
	struct promotearg *pa = arg2;
	struct promotenode *snap = list_head(&pa->snap_list);
	dsl_pool_t *dp = hds->ds_dir->dd_pool;
	dsl_dataset_t *origin_ds = snap->ds;
	dsl_dataset_t *newnext_ds;
	char *name;
	uint64_t itor = 0;
	blkptr_t bp;
	int err;

	/* Check that it is a real clone */
	if (!dsl_dir_is_clone(hds->ds_dir))
		return (EINVAL);

	/* Since this is so expensive, don't do the preliminary check */
	if (!dmu_tx_is_syncing(tx))
		return (0);

	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
		return (EXDEV);

	/* find origin's new next ds */
	newnext_ds = hds;
	while (newnext_ds->ds_phys->ds_prev_snap_obj != origin_ds->ds_object) {
		dsl_dataset_t *prev;

		err = dsl_dataset_hold_obj(dp,
		    newnext_ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
		if (newnext_ds != hds)
			dsl_dataset_rele(newnext_ds, FTAG);
		if (err)
			return (err);
		newnext_ds = prev;
	}
	pa->newnext_obj = newnext_ds->ds_object;

	/* compute origin's new unique space */
	pa->unique = 0;
	while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
	    &itor, &bp)) == 0) {
		if (bp.blk_birth > origin_ds->ds_phys->ds_prev_snap_txg)
			pa->unique += bp_get_dasize(dp->dp_spa, &bp);
	}
	if (newnext_ds != hds)
		dsl_dataset_rele(newnext_ds, FTAG);
	if (err != ENOENT)
		return (err);

	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/*
	 * Walk the snapshots that we are moving.
	 *
	 * Compute space to transfer.  Each snapshot gave birth to:
	 * (my used) - (prev's used) + (deadlist's used)
	 * So a sequence would look like:
	 * uN - u(N-1) + dN + ... + u1 - u0 + d1 + u0 - 0 + d0
	 * Which simplifies to:
	 * uN + dN + ... + d1 + d0
	 * Note however, if we stop before we reach the ORIGIN we get:
	 * uN + dN + ... + dM - u(M-1)
	 */
	pa->used = origin_ds->ds_phys->ds_used_bytes;
	pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
	pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
	do {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *ds = snap->ds;

		/* Check that the snapshot name does not conflict */
		dsl_dataset_name(ds, name);
		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
		if (err == 0)
			err = EEXIST;
		if (err != ENOENT)
			break;
		err = 0;

		/* The very first snapshot does not have a deadlist */
		if (ds->ds_phys->ds_prev_snap_obj != 0) {
			if (err = bplist_space(&ds->ds_deadlist,
			    &dlused, &dlcomp, &dluncomp))
				break;
			pa->used += dlused;
			pa->comp += dlcomp;
			pa->uncomp += dluncomp;
		}
	} while (snap = list_next(&pa->snap_list, snap));

	/*
	 * If we are a clone of a clone then we never reached ORIGIN,
	 * so we need to subtract out the clone origin's used space.
	 */
	if (pa->clone_origin) {
		pa->used -= pa->clone_origin->ds_phys->ds_used_bytes;
		pa->comp -= pa->clone_origin->ds_phys->ds_compressed_bytes;
		pa->uncomp -= pa->clone_origin->ds_phys->ds_uncompressed_bytes;
	}

	kmem_free(name, MAXPATHLEN);

	/* Check that there is enough space here */
	if (err == 0) {
		dsl_dir_t *odd = origin_ds->ds_dir;
		err = dsl_dir_transfer_possible(odd, hds->ds_dir, pa->used);
	}

	return (err);
}
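/*
 * Illustrative worked example (not part of the original source) of the
 * space-transfer arithmetic above.  Suppose three snapshots s0, s1, s2
 * are being moved, where uK is snapshot K's ds_used_bytes and dK its
 * deadlist space (in MB):
 *
 *	u0 = 100, d0 = 0	(first snapshot; no deadlist)
 *	u1 = 120, d1 = 30
 *	u2 = 150, d2 = 20	(origin_ds, the branch point)
 *
 * Each snapshot gave birth to (my used) - (prev's used) + (deadlist):
 * s0 born 100, s1 born 120 - 100 + 30 = 50, s2 born 150 - 120 + 20 = 50,
 * for a total of 200.  The telescoped form computed by the loop agrees:
 *
 *	pa->used = u2 + d2 + d1 + d0 = 150 + 20 + 30 + 0 = 200MB
 *
 * which is the space that must move from the origin's dsl_dir to the
 * promoted clone's dsl_dir.
 */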
static void
dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *hds = arg1;
	struct promotearg *pa = arg2;
	struct promotenode *snap = list_head(&pa->snap_list);
	dsl_dataset_t *origin_ds = snap->ds;
	dsl_dir_t *dd = hds->ds_dir;
	dsl_pool_t *dp = hds->ds_dir->dd_pool;
	dsl_dir_t *odd = NULL;
	char *name;
	uint64_t oldnext_obj;

	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));

	/*
	 * We need to explicitly open odd, since origin_ds's dd will be
	 * changing.
	 */
	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	/* change origin's next snap */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
	origin_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;

	/* change the origin's next clone */
	if (origin_ds->ds_phys->ds_next_clones_obj) {
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    origin_ds->ds_phys->ds_next_clones_obj,
		    pa->newnext_obj, tx));
		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    origin_ds->ds_phys->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/* change origin */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;

	/* move snapshots to this dir */
	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	do {
		dsl_dataset_t *ds = snap->ds;

		/* move snap name entry */
		dsl_dataset_name(ds, name);
		VERIFY(0 == dsl_dataset_snap_remove(pa->old_head,
		    ds->ds_snapname, tx));
		VERIFY(0 == zap_add(dp->dp_meta_objset,
		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));

		/* change containing dsl_dir */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
		ds->ds_phys->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		dsl_dir_close(ds->ds_dir, ds);
		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
	} while (snap = list_next(&pa->snap_list, snap));

	/* change space accounting */
	dsl_dir_diduse_space(odd, -pa->used, -pa->comp, -pa->uncomp, tx);
	dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
	origin_ds->ds_phys->ds_unique_bytes = pa->unique;

	/* log history record */
	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
	    cr, "dataset = %llu", hds->ds_object);

	dsl_dir_close(odd, FTAG);
	kmem_free(name, MAXPATHLEN);
}
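/*
 * Illustrative note (not part of the original source): the origin swap
 * above inverts the clone relationship.  Before promotion (names are
 * hypothetical):
 *
 *	tank/fs ---- snap ---- tank/clone	(clone's origin is snap;
 *						 snap belongs to tank/fs)
 *
 * After promotion, the snapshots up to the branch point belong to the
 * promoted clone, and the old head becomes a clone of the same branch
 * point:
 *
 *	tank/clone ---- snap ---- tank/fs
 */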
int
dsl_dataset_promote(const char *name)
{
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	dmu_object_info_t doi;
	struct promotearg pa;
	struct promotenode *snap;
	uint64_t snap_obj;
	uint64_t last_snap = 0;
	int err;

	err = dsl_dataset_hold(name, FTAG, &ds);
	if (err)
		return (err);
	dd = ds->ds_dir;
	dp = dd->dd_pool;

	err = dmu_object_info(dp->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, &doi);
	if (err) {
		dsl_dataset_rele(ds, FTAG);
		return (err);
	}

	/*
	 * We are going to inherit all the snapshots taken before our
	 * origin (i.e., our new origin will be our parent's origin).
	 * Take ownership of them so that we can rename them into our
	 * namespace.
	 */
	pa.clone_origin = NULL;
	list_create(&pa.snap_list,
	    sizeof (struct promotenode), offsetof(struct promotenode, link));
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	ASSERT(dd->dd_phys->dd_origin_obj != 0);
	snap_obj = dd->dd_phys->dd_origin_obj;
	while (snap_obj) {
		dsl_dataset_t *snapds;

		/*
		 * NB: this would be handled by the below check for
		 * clone of a clone, but then we'd always own_obj() the
		 * $ORIGIN, thus causing unnecessary EBUSYs.  We don't
		 * need to set pa.clone_origin because the $ORIGIN has
		 * no data to account for.
		 */
		if (dp->dp_origin_snap &&
		    snap_obj == dp->dp_origin_snap->ds_object)
			break;

		err = dsl_dataset_own_obj(dp, snap_obj, 0, FTAG, &snapds);
		if (err == ENOENT) {
			/* lost race with snapshot destroy */
			struct promotenode *last = list_tail(&pa.snap_list);
			ASSERT(snap_obj != last->ds->ds_phys->ds_prev_snap_obj);
			snap_obj = last->ds->ds_phys->ds_prev_snap_obj;
			continue;
		} else if (err) {
			rw_exit(&dp->dp_config_rwlock);
			goto out;
		}

		/*
		 * We could be a clone of a clone.  If we reach our
		 * parent's branch point, we're done.
		 */
		if (last_snap &&
		    snapds->ds_phys->ds_next_snap_obj != last_snap) {
			pa.clone_origin = snapds;
			break;
		}

		snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
		snap->ds = snapds;
		list_insert_tail(&pa.snap_list, snap);
		last_snap = snap_obj;
		snap_obj = snap->ds->ds_phys->ds_prev_snap_obj;
	}
	snap = list_head(&pa.snap_list);
	ASSERT(snap != NULL);
	err = dsl_dataset_hold_obj(dp,
	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &pa.old_head);
	rw_exit(&dp->dp_config_rwlock);

	if (err)
		goto out;

	/*
	 * Add in 128x the snapnames zapobj size, since we will be moving
	 * a bunch of snapnames to the promoted ds, and dirtying their
	 * bonus buffers.
	 */
	err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
	    dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);

	dsl_dataset_rele(pa.old_head, FTAG);
out:
	while ((snap = list_tail(&pa.snap_list)) != NULL) {
		list_remove(&pa.snap_list, snap);
		dsl_dataset_disown(snap->ds, FTAG);
		kmem_free(snap, sizeof (struct promotenode));
	}
	list_destroy(&pa.snap_list);
	if (pa.clone_origin)
		dsl_dataset_disown(pa.clone_origin, FTAG);
	dsl_dataset_rele(ds, FTAG);
	return (err);
}
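/*
 * Illustrative sketch (not part of the original source): promoting a
 * clone so it is no longer dependent on its origin.  This is the entry
 * point ultimately reached by "zfs promote"; the dataset name below is
 * hypothetical.
 */
#if 0	/* example only; not compiled */
static int
example_promote(void)
{
	/* after this succeeds, tank/clone owns the shared snapshots */
	return (dsl_dataset_promote("tank/clone"));
}
#endif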
struct cloneswaparg {
	dsl_dataset_t *cds; /* clone dataset */
	dsl_dataset_t *ohds; /* origin's head dataset */
	boolean_t force;
	int64_t unused_refres_delta; /* change in unconsumed refreservation */
};

/* ARGSUSED */
static int
dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct cloneswaparg *csa = arg1;

	/* they should both be heads */
	if (dsl_dataset_is_snapshot(csa->cds) ||
	    dsl_dataset_is_snapshot(csa->ohds))
		return (EINVAL);

	/* the branch point should be just before them */
	if (csa->cds->ds_prev != csa->ohds->ds_prev)
		return (EINVAL);

	/* cds should be the clone */
	if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
	    csa->ohds->ds_object)
		return (EINVAL);

	/* the clone should be a child of the origin */
	if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
		return (EINVAL);

	/* ohds shouldn't be modified unless 'force' */
	if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
		return (ETXTBSY);

	/* adjust amount of any unconsumed refreservation */
	csa->unused_refres_delta =
	    (int64_t)MIN(csa->ohds->ds_reserved,
	    csa->ohds->ds_phys->ds_unique_bytes) -
	    (int64_t)MIN(csa->ohds->ds_reserved,
	    csa->cds->ds_phys->ds_unique_bytes);

	if (csa->unused_refres_delta > 0 &&
	    csa->unused_refres_delta >
	    dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
		return (ENOSPC);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	struct cloneswaparg *csa = arg1;
	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
	uint64_t itor = 0;
	blkptr_t bp;
	uint64_t unique = 0;
	int err;

	ASSERT(csa->cds->ds_reserved == 0);
	ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);

	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);

	if (csa->cds->ds_user_ptr != NULL) {
		csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
		csa->cds->ds_user_ptr = NULL;
	}

	if (csa->ohds->ds_user_ptr != NULL) {
		csa->ohds->ds_user_evict_func(csa->ohds,
		    csa->ohds->ds_user_ptr);
		csa->ohds->ds_user_ptr = NULL;
	}

	/* compute unique space */
	while ((err = bplist_iterate(&csa->cds->ds_deadlist,
	    &itor, &bp)) == 0) {
		if (bp.blk_birth > csa->cds->ds_prev->ds_phys->ds_prev_snap_txg)
			unique += bp_get_dasize(dp->dp_spa, &bp);
	}
	VERIFY(err == ENOENT);

	/* reset origin's unique bytes */
	csa->cds->ds_prev->ds_phys->ds_unique_bytes = unique;

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = csa->ohds->ds_phys->ds_bp;
		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
		csa->cds->ds_phys->ds_bp = tmp;
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
		    &cdl_comp, &cdl_uncomp));
		VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
		    &odl_comp, &odl_uncomp));
		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

		dsl_dir_diduse_space(csa->ohds->ds_dir,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(csa->cds->ds_dir,
		    -dused, -dcomp, -duncomp, tx);
	}

#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

	/* swap ds_*_bytes */
	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
	    csa->cds->ds_phys->ds_used_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
	    csa->cds->ds_phys->ds_compressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
	    csa->cds->ds_phys->ds_uncompressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
	    csa->cds->ds_phys->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(csa->ohds->ds_dir, csa->unused_refres_delta,
	    0, 0, tx);

	/* swap deadlists */
	bplist_close(&csa->cds->ds_deadlist);
	bplist_close(&csa->ohds->ds_deadlist);
	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
	    csa->cds->ds_phys->ds_deadlist_obj);
	VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
	    csa->cds->ds_phys->ds_deadlist_obj));
	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
	    csa->ohds->ds_phys->ds_deadlist_obj));
}
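/*
 * Illustrative worked example (not part of the original source) of the
 * dd_*_bytes adjustment above.  If the clone plus its deadlist charge
 * 5GB + 1GB, and the origin head plus its deadlist charge 3GB + 1GB,
 * then dused = (5 + 1) - (3 + 1) = 2GB: the head's dsl_dir grows by
 * 2GB and the clone's shrinks by the same amount, so the pool's total
 * accounting is unchanged after the two datasets trade places.
 */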
/*
 * Swap 'clone' with its origin head file system.  Used at the end
 * of "online recv" to swizzle the file system to the new version.
 */
int
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
    boolean_t force)
{
	struct cloneswaparg csa;
	int error;

	ASSERT(clone->ds_owner);
	ASSERT(origin_head->ds_owner);
retry:
	/* Need exclusive access for the swap */
	rw_enter(&clone->ds_rwlock, RW_WRITER);
	if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
		rw_exit(&clone->ds_rwlock);
		rw_enter(&origin_head->ds_rwlock, RW_WRITER);
		if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
			rw_exit(&origin_head->ds_rwlock);
			goto retry;
		}
	}
	csa.cds = clone;
	csa.ohds = origin_head;
	csa.force = force;
	error = dsl_sync_task_do(clone->ds_dir->dd_pool,
	    dsl_dataset_clone_swap_check,
	    dsl_dataset_clone_swap_sync, &csa, NULL, 9);
	return (error);
}

/*
 * Given a pool name and a dataset object number in that pool,
 * return the name of that dataset.
 */
int
dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	if ((error = spa_open(pname, &spa, FTAG)) != 0)
		return (error);
	dp = spa_get_dsl(spa);
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
		dsl_dataset_name(ds, buf);
		dsl_dataset_rele(ds, FTAG);
	}
	rw_exit(&dp->dp_config_rwlock);
	spa_close(spa, FTAG);

	return (error);
}
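/*
 * Illustrative sketch (not part of the original source): resolving a
 * dataset object number back to its name with the function above.  The
 * pool name and object number are hypothetical; the buffer is assumed
 * to be large enough for a full dataset name, since dsl_dataset_name()
 * writes the complete name into it.
 */
#if 0	/* example only; not compiled */
static void
example_dsobj_to_dsname(void)
{
	char name[MAXNAMELEN];

	if (dsl_dsobj_to_dsname("tank", 42ULL, name) == 0)
		dprintf("object 42 is dataset %s\n", name);
}
#endif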
int
dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
{
	int error = 0;

	ASSERT3S(asize, >, 0);

	/*
	 * *ref_rsrv is the portion of asize that will come from any
	 * unconsumed refreservation space.
	 */
	*ref_rsrv = 0;

	mutex_enter(&ds->ds_lock);
	/*
	 * Make a space adjustment for reserved bytes.
	 */
	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
		ASSERT3U(*used, >=,
		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
		*ref_rsrv =
		    asize - MIN(asize, parent_delta(ds, asize + inflight));
	}

	if (!check_quota || ds->ds_quota == 0) {
		mutex_exit(&ds->ds_lock);
		return (0);
	}
	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk usage is over quota and there are no pending changes
	 * (which may free up space for us).
	 */
	if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
		if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
			error = ERESTART;
		else
			error = EDQUOT;
	}
	mutex_exit(&ds->ds_lock);

	return (error);
}

/* ARGSUSED */
static int
dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	uint64_t *quotap = arg2;
	uint64_t new_quota = *quotap;

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
		return (ENOTSUP);

	if (new_quota == 0)
		return (0);

	if (new_quota < ds->ds_phys->ds_used_bytes ||
	    new_quota < ds->ds_reserved)
		return (ENOSPC);

	return (0);
}

/* ARGSUSED */
void
dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	uint64_t *quotap = arg2;
	uint64_t new_quota = *quotap;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	ds->ds_quota = new_quota;

	dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);

	spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
	    tx, cr, "%lld dataset = %llu ",
	    (longlong_t)new_quota, ds->ds_object);
}

int
dsl_dataset_set_quota(const char *dsname, uint64_t quota)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	if (quota != ds->ds_quota) {
		/*
		 * If someone removes a file, then tries to set the quota, we
		 * want to make sure the file freeing takes effect.
		 */
		txg_wait_open(ds->ds_dir->dd_pool, 0);

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
		    ds, &quota, 0);
	}
	dsl_dataset_rele(ds, FTAG);
	return (err);
}
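/*
 * Illustrative worked example (not part of the original source) of the
 * quota check above.  With refquota = 10G, on-disk used = 9.5G, and a
 * request that would push the estimate past 10G:
 *
 *	inflight > 0 (or used < quota): return ERESTART -- pending
 *	    changes may still free space, so the caller should wait for
 *	    the next txg and retry;
 *	inflight == 0 and used >= quota: return EDQUOT -- the quota
 *	    really is exhausted.
 */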
static int
dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	uint64_t *reservationp = arg2;
	uint64_t new_reservation = *reservationp;
	int64_t delta;
	uint64_t unique;

	if (new_reservation > INT64_MAX)
		return (EOVERFLOW);

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
	    SPA_VERSION_REFRESERVATION)
		return (ENOTSUP);

	if (dsl_dataset_is_snapshot(ds))
		return (EINVAL);

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx))
		return (0);

	mutex_enter(&ds->ds_lock);
	unique = dsl_dataset_unique(ds);
	delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
	mutex_exit(&ds->ds_lock);

	if (delta > 0 &&
	    delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
		return (ENOSPC);
	if (delta > 0 && ds->ds_quota > 0 &&
	    new_reservation > ds->ds_quota)
		return (ENOSPC);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
    dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	uint64_t *reservationp = arg2;
	uint64_t new_reservation = *reservationp;
	uint64_t unique;
	int64_t delta;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	mutex_enter(&ds->ds_lock);
	unique = dsl_dataset_unique(ds);
	delta = MAX(0, (int64_t)(new_reservation - unique)) -
	    MAX(0, (int64_t)(ds->ds_reserved - unique));
	ds->ds_reserved = new_reservation;
	mutex_exit(&ds->ds_lock);

	dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
	    new_reservation, cr, tx);

	dsl_dir_diduse_space(ds->ds_dir, delta, 0, 0, tx);

	spa_history_internal_log(LOG_DS_REFRESERV,
	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
	    (longlong_t)new_reservation,
	    ds->ds_dir->dd_phys->dd_head_dataset_obj);
}

int
dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    dsl_dataset_set_reservation_check,
	    dsl_dataset_set_reservation_sync, ds, &reservation, 0);
	dsl_dataset_rele(ds, FTAG);
	return (err);
}
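/*
 * Illustrative worked example (not part of the original source) of the
 * reservation delta above.  Only the portion of a refreservation not
 * yet backed by unique bytes is charged to the parent.  Raising the
 * reservation from 2G to 5G on a dataset with 3G of unique data
 * charges MAX(0, 5G - 3G) - MAX(0, 2G - 3G) = 2G - 0 = 2G more to the
 * dsl_dir, which is why the check rejects the change with ENOSPC when
 * less than that amount is available.
 */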