/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 RackTop Systems.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>

/*
 * The SPA supports block sizes up to 16MB. However, very large blocks
 * can have an impact on i/o latency (e.g. tying up a spinning disk for
 * ~300ms), and also potentially on the memory allocator. Therefore,
 * we do not allow the recordsize to be set larger than zfs_max_recordsize
 * (default 1MB). Larger blocks can be created by changing this tunable,
 * and pools with larger blocks can always be imported and used, regardless
 * of this setting.
 */
int zfs_max_recordsize = 1 * 1024 * 1024;

#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

#define	DS_REF_MAX	(1ULL << 62)

extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds);

/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer. If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
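 *
 * For example, with ds_reserved == 10M and ds_unique_bytes == 8M, a 1M
 * write leaves MAX(ds_unique_bytes, ds_reserved) at 10M, so none of the
 * delta is passed up: that space was already charged to the
 * refreservation. Once ds_unique_bytes exceeds ds_reserved, the full
 * delta flows through to the dsl_dir layer.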
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	dsl_dataset_phys_t *ds_phys;
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	ds_phys = dsl_dataset_phys(ds);
	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}

void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "ds=%p", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
	if (ds == NULL) {
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    used, compressed, uncompressed);
		return;
	}

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
	dsl_dataset_phys(ds)->ds_unique_bytes += used;
	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
		ds->ds_need_large_blocks = B_TRUE;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
}

int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(bp->blk_birth <= tx->tx_txg);

	if (ds == NULL) {
		dsl_free(tx->tx_pool, tx->tx_txg, bp);
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    -used, -compressed, -uncompressed);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
		int64_t delta;

		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		mutex_enter(&ds->ds_lock);
		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		if (async) {
			/*
			 * We are here as part of zio's write done callback,
			 * which means we're a zio interrupt thread. We can't
			 * call dsl_deadlist_insert() now because it may block
			 * waiting for I/O. Instead, put bp on the deferred
			 * queue and let dsl_pool_sync() finish the job.
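			 * (dsl_pool_sync() drains ds_pending_deadlist into
			 * ds_deadlist from syncing context.)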
			 */
			bplist_append(&ds->ds_pending_deadlist, bp);
		} else {
			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}

uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase. So this should only be used as a guess.
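	 * (An overestimate here is harmless: it only makes
	 * dsl_dataset_block_freeable() below more conservative.)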
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t blk_birth)
{
	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
	    (bp != NULL && BP_IS_HOLE(bp)))
		return (B_FALSE);

	ddt_prefetch(dsl_dataset_get_spa(ds), bp);

	return (B_TRUE);
}

/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL);

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_objset != NULL)
		dmu_objset_evict(ds->ds_objset);

	if (ds->ds_prev) {
		dsl_dataset_rele(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_destroy(&ds->ds_pending_deadlist);
	if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0)
		dsl_deadlist_close(&ds->ds_deadlist);
	if (ds->ds_dir)
		dsl_dir_rele(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_opening_lock);
	mutex_destroy(&ds->ds_sendstream_lock);
	refcount_destroy(&ds->ds_longholds);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err != 0)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

int
dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
    boolean_t adj_cnt)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);

	if (err == 0 && adj_cnt)
		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

	return (err);
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err != 0)
		return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
		dmu_buf_rele(dbuf, tag);
		return (SET_ERROR(EINVAL));
	}

	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner = NULL;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
		refcount_create(&ds->ds_longholds);

		bplist_create(&ds->ds_pending_deadlist);
		dsl_deadlist_open(&ds->ds_deadlist,
		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);

		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
		    offsetof(dmu_sendarg_t, dsa_link));

		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
			err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
			if (err == 0)
				ds->ds_large_blocks = B_TRUE;
			else
				ASSERT3U(err, ==, ENOENT);
		}

		if (err == 0) {
			err = dsl_dir_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
			    &ds->ds_dir);
		}
		if (err != 0) {
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_opening_lock);
			mutex_destroy(&ds->ds_sendstream_lock);
			refcount_destroy(&ds->ds_longholds);
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			ds->ds_snapname[0] = '\0';
			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
				err = dsl_dataset_hold_obj(dp,
				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}
			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
				int zaperr = zap_lookup(mos, ds->ds_object,
				    DS_FIELD_BOOKMARK_NAMES,
				    sizeof (ds->ds_bookmarks), 1,
				    &ds->ds_bookmarks);
				if (zaperr != ENOENT)
					VERIFY0(zaperr);
			}
		} else {
			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
				err = dsl_dataset_get_snapname(ds);
			if (err == 0 &&
			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
				err = zap_count(
				    ds->ds_dir->dd_pool->dp_meta_objset,
				    dsl_dataset_phys(ds)->ds_userrefs_obj,
				    &ds->ds_userrefs);
			}
		}

		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
			err = dsl_prop_get_int_ds(ds,
			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
			    &ds->ds_reserved);
			if (err == 0) {
				err = dsl_prop_get_int_ds(ds,
				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
				    &ds->ds_quota);
			}
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
		    dsl_dataset_evict)) != NULL) {
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_rele(ds->ds_prev, ds);
			dsl_dir_rele(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_opening_lock);
			mutex_destroy(&ds->ds_sendstream_lock);
			refcount_destroy(&ds->ds_longholds);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err != 0) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
		}
	}
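
	/*
	 * At this point ds is fully constructed (either found via
	 * dmu_buf_get_user(), built by us, or taken from the winner of the
	 * dmu_buf_set_user_ie() race) and registered as the dbuf's user.
	 */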
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	*dsp = ds;
	return (0);
}

int
dsl_dataset_hold(dsl_pool_t *dp, const char *name,
    void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
	if (err != 0)
		return (err);

	ASSERT(dsl_pool_config_held(dp));
	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	if (obj != 0)
		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
	else
		err = SET_ERROR(ENOENT);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(ENOENT));
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
		dsl_dataset_rele(*dsp, tag);

		if (err == 0) {
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			*dsp = ds;
		}
	}

	dsl_dir_rele(dd, FTAG);
	return (err);
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
	if (err != 0)
		return (err);
	if (!dsl_dataset_tryown(*dsp, tag)) {
		dsl_dataset_rele(*dsp, tag);
		*dsp = NULL;
		return (SET_ERROR(EBUSY));
	}
	return (0);
}

int
dsl_dataset_own(dsl_pool_t *dp, const char *name,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold(dp, name, tag, dsp);
	if (err != 0)
		return (err);
	if (!dsl_dataset_tryown(*dsp, tag)) {
		dsl_dataset_rele(*dsp, tag);
		return (SET_ERROR(EBUSY));
	}
	return (0);
}

/*
 * See the comment above dsl_pool_hold() for details. In summary, a long
 * hold is used to prevent destruction of a dataset while the pool hold
 * is dropped, allowing other concurrent operations (e.g. spa_sync()).
 *
 * The dataset and pool must be held when this function is called. After it
 * is called, the pool hold may be released while the dataset is still held
 * and accessed.
 */
void
dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
{
	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
	(void) refcount_add(&ds->ds_longholds, tag);
}

void
dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
{
	(void) refcount_remove(&ds->ds_longholds, tag);
}

/* Return B_TRUE if there are any long holds on this dataset. */
boolean_t
dsl_dataset_long_held(dsl_dataset_t *ds)
{
	return (!refcount_is_zero(&ds->ds_longholds));
}

void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
	if (ds == NULL) {
		(void) strcpy(name, "mos");
	} else {
		dsl_dir_name(ds->ds_dir, name);
		VERIFY0(dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			(void) strcat(name, "@");
			/*
			 * We use a "recursive" mutex so that we
			 * can call dprintf_ds() with ds_lock held.
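			 * (In practice the MUTEX_HELD() check below serves
			 * that purpose: we only take ds_lock if the caller
			 * does not already hold it.)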
			 */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				(void) strcat(name, ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				(void) strcat(name, ds->ds_snapname);
			}
		}
	}
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
	ASSERT3P(ds->ds_owner, ==, tag);
	ASSERT(ds->ds_dbuf != NULL);

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	mutex_exit(&ds->ds_lock);
	dsl_dataset_long_rele(ds, tag);
	dsl_dataset_rele(ds, tag);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
		ds->ds_owner = tag;
		dsl_dataset_long_hold(ds, tag);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}

uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

	if (origin == NULL) {
		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
	} else {
		dsl_dataset_t *ohds; /* head of the origin snapshot */

		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    dsl_dataset_phys(origin)->ds_creation_txg;
		dsphys->ds_referenced_bytes =
		    dsl_dataset_phys(origin)->ds_referenced_bytes;
		dsphys->ds_compressed_bytes =
		    dsl_dataset_phys(origin)->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;

		/*
		 * Inherit flags that describe the dataset's contents
		 * (INCONSISTENT) or properties (Case Insensitive).
		 */
		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);

		if (origin->ds_large_blocks)
			dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		dsl_dataset_phys(origin)->ds_num_children++;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
		    FTAG, &ohds));
		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
		dsl_dataset_rele(ohds, FTAG);

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
				dsl_dataset_phys(origin)->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY0(zap_add_int(mos,
			    dsl_dataset_phys(origin)->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
				dsl_dir_phys(origin->ds_dir)->dd_clones =
				    zap_create(mos,
				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY0(zap_add_int(mos,
			    dsl_dir_phys(origin->ds_dir)->dd_clones,
			    dsobj, tx));
		}
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;

	return (dsobj);
}

static void
dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	objset_t *os;

	VERIFY0(dmu_objset_from_ds(ds, &os));
	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
	dsl_dataset_dirty(ds, tx);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin,
	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);

	dsl_deleg_set_create_perms(dd, tx, cr);

	/*
	 * Since we're creating a new node we know it's a leaf, so we can
	 * initialize the counts if the limit feature is active.
	 */
	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
		uint64_t cnt = 0;
		objset_t *os = dd->dd_pool->dp_meta_objset;

		dsl_dir_zapify(dd, tx);
		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
		    sizeof (cnt), 1, &cnt, tx));
		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
		    sizeof (cnt), 1, &cnt, tx));
	}

	dsl_dir_rele(dd, FTAG);

	/*
	 * If we are creating a clone, make sure we zero out any stale
	 * data from the origin snapshot's zil header.
	 */
	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		dsl_dataset_zero_zil(ds, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	return (dsobj);
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space in the most recent snapshot that is still being used in this
 * file system from the space currently in use. To figure out the space
 * in the most recent snapshot still in use, we need to take the total
 * space used in the snapshot and subtract out the space that has been
 * freed up since the snapshot was taken.
 */
void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(!dsl_dataset_is_snapshot(ds));

	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
	else
		mrs_used = 0;

	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);

	ASSERT3U(dlused, <=, mrs_used);
	dsl_dataset_phys(ds)->ds_unique_bytes =
	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}

void
dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
    dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
	    obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT. However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT)
		VERIFY0(err);
	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
}


blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&dsl_dataset_phys(ds)->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	/* If it's the meta-objset, set dp_meta_rootbp */
	if (ds == NULL) {
		tx->tx_pool->dp_meta_rootbp = *bp;
	} else {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_bp = *bp;
	}
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_objset != NULL);

	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}

boolean_t
dsl_dataset_is_dirty(dsl_dataset_t *ds)
{
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
		    ds, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

static int
dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t asize;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * If there's an fs-only reservation, any blocks that might become
	 * owned by the snapshot dataset must be accommodated by space
	 * outside of the reservation.
	 */
	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
		return (SET_ERROR(ENOSPC));

	/*
	 * Propagate any reserved space for this snapshot to other
	 * snapshot checks in this sync group.
	 */
	if (asize > 0)
		dsl_dir_willuse_space(ds->ds_dir, asize, tx);

	return (0);
}

typedef struct dsl_dataset_snapshot_arg {
	nvlist_t *ddsa_snaps;
	nvlist_t *ddsa_props;
	nvlist_t *ddsa_errors;
	cred_t *ddsa_cr;
} dsl_dataset_snapshot_arg_t;

int
dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
{
	int error;
	uint64_t value;

	ds->ds_trysnap_txg = tx->tx_txg;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * We don't allow multiple snapshots of the same txg. If there
	 * is already one, try again.
	 */
	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
		return (SET_ERROR(EAGAIN));

	/*
	 * Check for conflicting snapshot name.
	 */
	error = dsl_dataset_snap_lookup(ds, snapname, &value);
	if (error == 0)
		return (SET_ERROR(EEXIST));
	if (error != ENOENT)
		return (error);

	/*
	 * We don't allow taking snapshots of inconsistent datasets, such as
	 * those into which we are currently receiving.
	 * However, if we are creating this snapshot as part of a receive,
	 * this check will be executed atomically with respect to the
	 * completion of the receive itself but prior to the clearing of
	 * DS_FLAG_INCONSISTENT; in this case we ignore this check, knowing
	 * it will be fixed up for us shortly in dmu_recv_end_sync().
	 */
	if (!recv && DS_IS_INCONSISTENT(ds))
		return (SET_ERROR(EBUSY));

	/*
	 * Skip the check for temporary snapshots or if we have already checked
	 * the counts in dsl_dataset_snapshot_check. This means we really only
	 * check the count here when we're receiving a stream.
	 */
	if (cnt != 0 && cr != NULL) {
		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
		if (error != 0)
			return (error);
	}

	error = dsl_dataset_snapshot_reserve_space(ds, tx);
	if (error != 0)
		return (error);

	return (0);
}

static int
dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_snapshot_arg_t *ddsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	int rv = 0;

	/*
	 * Pre-compute how many total new snapshots will be created for each
	 * level in the tree and below. This is needed for validating the
	 * snapshot limit when either taking a recursive snapshot or when
	 * taking multiple snapshots.
	 *
	 * The problem is that the counts are not actually adjusted when
	 * we are checking, only when we finally sync. For a single snapshot,
	 * this is easy: the count will increase by 1 at each node up the
	 * tree. But it's more complicated for the recursive/multiple
	 * snapshot case.
	 *
	 * The dsl_fs_ss_limit_check function does recursively check the count
	 * at each level up the tree but since it is validating each snapshot
	 * independently we need to be sure that we are validating the complete
	 * count for the entire set of snapshots. We do this by rolling up the
	 * counts for each component of the name into an nvlist and then
	 * checking each of those cases with the aggregated count.
	 *
	 * This approach properly handles not only the recursive snapshot
	 * case (where we get all of those on the ddsa_snaps list) but also
	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
	 * validate the limit on 'a' using a count of 2).
	 *
	 * We validate the snapshot names in the third loop and only report
	 * name errors once.
	 */
	if (dmu_tx_is_syncing(tx)) {
		nvlist_t *cnt_track = NULL;
		cnt_track = fnvlist_alloc();

		/* Rollup aggregated counts into the cnt_track list */
		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
		    pair != NULL;
		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
			char *pdelim;
			uint64_t val;
			char nm[MAXPATHLEN];

			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
			pdelim = strchr(nm, '@');
			if (pdelim == NULL)
				continue;
			*pdelim = '\0';

			do {
				if (nvlist_lookup_uint64(cnt_track, nm,
				    &val) == 0) {
					/* update existing entry */
					fnvlist_add_uint64(cnt_track, nm,
					    val + 1);
				} else {
					/* add to list */
					fnvlist_add_uint64(cnt_track, nm, 1);
				}

				pdelim = strrchr(nm, '/');
				if (pdelim != NULL)
					*pdelim = '\0';
			} while (pdelim != NULL);
		}

		/* Check aggregated counts at each level */
		for (pair = nvlist_next_nvpair(cnt_track, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
			int error = 0;
			char *name;
			uint64_t cnt = 0;
			dsl_dataset_t *ds;

			name = nvpair_name(pair);
			cnt = fnvpair_value_uint64(pair);
			ASSERT(cnt > 0);

			error = dsl_dataset_hold(dp, name, FTAG, &ds);
			if (error == 0) {
				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
				    ddsa->ddsa_cr);
				dsl_dataset_rele(ds, FTAG);
			}

			if (error != 0) {
				if (ddsa->ddsa_errors != NULL)
					fnvlist_add_int32(ddsa->ddsa_errors,
					    name, error);
				rv = error;
				/* only report one error for this check */
				break;
			}
		}
		nvlist_free(cnt_track);
	}

	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
		int error = 0;
		dsl_dataset_t *ds;
		char *name, *atp;
		char dsname[MAXNAMELEN];

		name = nvpair_name(pair);
		if (strlen(name) >= MAXNAMELEN)
			error = SET_ERROR(ENAMETOOLONG);
		if (error == 0) {
			atp = strchr(name, '@');
			if (atp == NULL)
				error = SET_ERROR(EINVAL);
			if (error == 0)
				(void) strlcpy(dsname, name, atp - name + 1);
		}
		if (error == 0)
			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
		if (error == 0) {
			/* passing 0/NULL skips dsl_fs_ss_limit_check */
			error = dsl_dataset_snapshot_check_impl(ds,
			    atp + 1, tx, B_FALSE, 0, NULL);
			dsl_dataset_rele(ds, FTAG);
		}

		if (error != 0) {
			if (ddsa->ddsa_errors != NULL) {
				fnvlist_add_int32(ddsa->ddsa_errors,
				    name, error);
			}
			rv = error;
		}
	}

	return (rv);
}

void
dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
    dmu_tx_t *tx)
{
	static zil_header_t zero_zil;

	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj, crtxg;
	objset_t *mos = dp->dp_meta_objset;
	objset_t *os;

	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

	/*
	 * If we are on an old pool, the zil must not be active, in which
	 * case it will be zeroed. Usually zil_suspend() accomplishes this.
	 */
	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
	    dmu_objset_from_ds(ds, &os) != 0 ||
	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
	    sizeof (zero_zil)) == 0);

	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);

	/*
	 * The origin's ds_creation_txg has to be < TXG_INITIAL
	 */
	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
		crtxg = 1;
	else
		crtxg = tx->tx_txg;

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
	dsphys->ds_next_snap_obj = ds->ds_object;
	dsphys->ds_num_children = 1;
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = crtxg;
	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
	dsphys->ds_uncompressed_bytes =
	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
	dmu_buf_rele(dbuf, FTAG);

	if (ds->ds_large_blocks)
		dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);

	ASSERT3U(ds->ds_prev != 0, ==,
	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
	if (ds->ds_prev) {
		uint64_t next_clones_obj =
		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
		    ds->ds_object ||
		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
		    ds->ds_object) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
		} else if (next_clones_obj != 0) {
			dsl_dataset_remove_from_next_clones(ds->ds_prev,
			    dsphys->ds_next_snap_obj, tx);
			VERIFY0(zap_add_int(mos,
			    next_clones_obj, dsobj, tx));
		}
	}

	/*
	 * If we have a reference-reservation on this dataset, we will
	 * need to increase the amount of refreservation being charged
	 * since our unique space is going to zero.
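	 * (Taking the snapshot moves every byte that was unique to the
	 * head into the new snapshot, so ds_unique_bytes is reset to 0
	 * below.)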
	 */
	if (ds->ds_reserved) {
		int64_t delta;
		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
		    ds->ds_reserved);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
		    delta, 0, 0, tx);
	}

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_deadlist_obj =
	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_open(&ds->ds_deadlist, mos,
	    dsl_dataset_phys(ds)->ds_deadlist_obj);
	dsl_deadlist_add_key(&ds->ds_deadlist,
	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);

	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    snapname, 8, 1, &dsobj, tx));

	if (ds->ds_prev)
		dsl_dataset_rele(ds->ds_prev, ds);
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));

	dsl_scan_ds_snapshotted(ds, tx);

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
}

static void
dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_snapshot_arg_t *ddsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;

	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
		dsl_dataset_t *ds;
		char *name, *atp;
		char dsname[MAXNAMELEN];

		name = nvpair_name(pair);
		atp = strchr(name, '@');
		(void) strlcpy(dsname, name, atp - name + 1);
		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));

		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
		if (ddsa->ddsa_props != NULL) {
			dsl_props_set_sync_impl(ds->ds_prev,
			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
		}
		dsl_dataset_rele(ds, FTAG);
	}
}

/*
 * The snapshots must all be in the same pool.
 * All-or-nothing: if there are any failures, nothing will be modified.
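 *
 * snaps is an nvlist whose pair names are the full "fs@snap" names to
 * create; props (may be NULL) holds properties to set on each new
 * snapshot; errors (may be NULL) receives a per-name errno for each
 * snapshot that failed its checks.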
 */
int
dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
{
	dsl_dataset_snapshot_arg_t ddsa;
	nvpair_t *pair;
	boolean_t needsuspend;
	int error;
	spa_t *spa;
	char *firstname;
	nvlist_t *suspended = NULL;

	pair = nvlist_next_nvpair(snaps, NULL);
	if (pair == NULL)
		return (0);
	firstname = nvpair_name(pair);

	error = spa_open(firstname, &spa, FTAG);
	if (error != 0)
		return (error);
	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
	spa_close(spa, FTAG);

	if (needsuspend) {
		suspended = fnvlist_alloc();
		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(snaps, pair)) {
			char fsname[MAXNAMELEN];
			char *snapname = nvpair_name(pair);
			char *atp;
			void *cookie;

			atp = strchr(snapname, '@');
			if (atp == NULL) {
				error = SET_ERROR(EINVAL);
				break;
			}
			(void) strlcpy(fsname, snapname, atp - snapname + 1);

			error = zil_suspend(fsname, &cookie);
			if (error != 0)
				break;
			fnvlist_add_uint64(suspended, fsname,
			    (uintptr_t)cookie);
		}
	}

	ddsa.ddsa_snaps = snaps;
	ddsa.ddsa_props = props;
	ddsa.ddsa_errors = errors;
	ddsa.ddsa_cr = CRED();

	if (error == 0) {
		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
		    dsl_dataset_snapshot_sync, &ddsa,
		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
	}

	if (suspended != NULL) {
		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(suspended, pair)) {
			zil_resume((void *)(uintptr_t)
			    fnvpair_value_uint64(pair));
		}
		fnvlist_free(suspended);
	}

	return (error);
}

typedef struct dsl_dataset_snapshot_tmp_arg {
	const char *ddsta_fsname;
	const char *ddsta_snapname;
	minor_t ddsta_cleanup_minor;
	const char *ddsta_htag;
} dsl_dataset_snapshot_tmp_arg_t;

static int
dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;

	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
	if (error != 0)
		return (error);

	/* NULL cred means no limit check for tmp snapshot */
	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
	    tx, B_FALSE, 0, NULL);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
	    B_TRUE, tx);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));

	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);

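	/*
	 * The snapshot is created, tagged with the user hold, and
	 * immediately marked for deferred destroy, so it goes away
	 * once the hold is released.
	 */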
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
    minor_t cleanup_minor, const char *htag)
{
	dsl_dataset_snapshot_tmp_arg_t ddsta;
	int error;
	spa_t *spa;
	boolean_t needsuspend;
	void *cookie;

	ddsta.ddsta_fsname = fsname;
	ddsta.ddsta_snapname = snapname;
	ddsta.ddsta_cleanup_minor = cleanup_minor;
	ddsta.ddsta_htag = htag;

	error = spa_open(fsname, &spa, FTAG);
	if (error != 0)
		return (error);
	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
	spa_close(spa, FTAG);

	if (needsuspend) {
		error = zil_suspend(fsname, &cookie);
		if (error != 0)
			return (error);
	}

	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);

	if (needsuspend)
		zil_resume(cookie);
	return (error);
}


void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(ds->ds_objset != NULL);
	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);

	/*
	 * in case we had to change ds_fsid_guid when we opened it,
	 * sync it out now.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;

	dmu_objset_sync(ds->ds_objset, zio, tx);

	if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
		dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
		ds->ds_large_blocks = B_TRUE;
	}
}

static void
get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
{
	uint64_t count = 0;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	nvlist_t *propval = fnvlist_alloc();
	nvlist_t *val = fnvlist_alloc();

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/*
	 * There may be missing entries in ds_next_clones_obj
	 * due to a bug in a previous version of the code.
	 * Only trust it if it has the right number of entries.
	 */
	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
		    &count));
	}
	if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
		goto fail;
	for (zap_cursor_init(&zc, mos,
	    dsl_dataset_phys(ds)->ds_next_clones_obj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;
		char buf[ZFS_MAXNAMELEN];
		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		dsl_dir_name(clone->ds_dir, buf);
		fnvlist_add_boolean(val, buf);
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
fail:
	nvlist_free(val);
	nvlist_free(propval);
}

void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	uint64_t refd, avail, uobjs, aobjs, ratio;

	ASSERT(dsl_pool_config_held(dp));

	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
	    dsl_dataset_phys(ds)->ds_compressed_bytes);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);

	if (dsl_dataset_is_snapshot(ds)) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
		    dsl_dataset_phys(ds)->ds_unique_bytes);
		get_clones_stat(ds, nv);
	} else {
		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
			char buf[MAXNAMELEN];
			dsl_dataset_name(ds->ds_prev, buf);
			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
		}

		dsl_dir_stats(ds->ds_dir, nv);
	}

	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
	    dsl_dataset_phys(ds)->ds_creation_time);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
	    dsl_dataset_phys(ds)->ds_creation_txg);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
	    ds->ds_quota);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
	    ds->ds_reserved);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
	    dsl_dataset_phys(ds)->ds_guid);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
	    dsl_dataset_phys(ds)->ds_unique_bytes);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
	    ds->ds_object);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
	    ds->ds_userrefs);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);

	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		uint64_t written, comp, uncomp;
		dsl_pool_t *dp = ds->ds_dir->dd_pool;
		dsl_dataset_t *prev;

		int err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err == 0) {
			err = dsl_dataset_space_written(prev, ds, &written,
			    &comp, &uncomp);
			dsl_dataset_rele(prev, FTAG);
			if (err == 0) {
				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
				    written);
			}
		}
	}
}

void
dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	ASSERT(dsl_pool_config_held(dp));

	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
	stat->dds_inconsistent =
	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
	stat->dds_origin[0] = '\0';
	if (dsl_dataset_is_snapshot(ds)) {
		stat->dds_is_snapshot = B_TRUE;
		stat->dds_num_clones =
		    dsl_dataset_phys(ds)->ds_num_children - 1;
	} else {
		stat->dds_is_snapshot = B_FALSE;
		stat->dds_num_clones = 0;

		if (dsl_dir_is_clone(ds->ds_dir)) {
			dsl_dataset_t *ods;

			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
			    FTAG, &ods));
			dsl_dataset_name(ods, stat->dds_origin);
			dsl_dataset_rele(ods, FTAG);
		}
	}
}

uint64_t
dsl_dataset_fsid_guid(dsl_dataset_t *ds)
{
	return (ds->ds_fsid_guid);
}

void
dsl_dataset_space(dsl_dataset_t *ds,
    uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
		*availbytesp +=
		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
	if (ds->ds_quota != 0) {
		/*
		 * Adjust available bytes according to refquota
		 */
		if (*refdbytesp < ds->ds_quota)
			*availbytesp = MIN(*availbytesp,
			    ds->ds_quota - *refdbytesp);
		else
			*availbytesp = 0;
	}
	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}

boolean_t
dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	ASSERT(dsl_pool_config_held(dp));
	if (snap == NULL)
		return (B_FALSE);
	if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
	    dsl_dataset_phys(snap)->ds_creation_txg) {
		objset_t *os, *os_snap;
		/*
		 * It may be that only the ZIL differs, because it was
		 * reset in the head. Don't count that as being
		 * modified.
		 */
		if (dmu_objset_from_ds(ds, &os) != 0)
			return (B_TRUE);
		if (dmu_objset_from_ds(snap, &os_snap) != 0)
			return (B_TRUE);
		return (bcmp(&os->os_phys->os_meta_dnode,
		    &os_snap->os_phys->os_meta_dnode,
		    sizeof (os->os_phys->os_meta_dnode)) != 0);
	}
	return (B_FALSE);
}

typedef struct dsl_dataset_rename_snapshot_arg {
	const char *ddrsa_fsname;
	const char *ddrsa_oldsnapname;
	const char *ddrsa_newsnapname;
	boolean_t ddrsa_recursive;
	dmu_tx_t *ddrsa_tx;
} dsl_dataset_rename_snapshot_arg_t;

/* ARGSUSED */
static int
dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
    dsl_dataset_t *hds, void *arg)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	int error;
	uint64_t val;

	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
	if (error != 0) {
		/* ignore nonexistent snapshots */
		return (error == ENOENT ? 0 : error);
	}

	/* new name should not exist */
	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
	if (error == 0)
		error = SET_ERROR(EEXIST);
	else if (error == ENOENT)
		error = 0;

	/* dataset name + 1 for the "@" + the new snapshot name must fit */
	if (dsl_dir_namelen(hds->ds_dir) + 1 +
	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
		error = SET_ERROR(ENAMETOOLONG);

	return (error);
}

static int
dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	int error;

	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
	if (error != 0)
		return (error);

	if (ddrsa->ddrsa_recursive) {
		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
		    DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
	}
	dsl_dataset_rele(hds, FTAG);
	return (error);
}

static int
dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
    dsl_dataset_t *hds, void *arg)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_dataset_t *ds;
	uint64_t val;
	dmu_tx_t *tx = ddrsa->ddrsa_tx;
	int error;

	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
	ASSERT(error == 0 || error == ENOENT);
	if (error == ENOENT) {
		/* ignore nonexistent snapshots */
		return (0);
	}

	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));

	/* log before we change the name */
	spa_history_log_internal_ds(ds, "rename", tx,
	    "-> @%s", ddrsa->ddrsa_newsnapname);

	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
	    B_FALSE));
	mutex_enter(&ds->ds_lock);
	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
	mutex_exit(&ds->ds_lock);
	VERIFY0(zap_add(dp->dp_meta_objset,
	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;

	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
	ddrsa->ddrsa_tx = tx;
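	/* ddrsa_tx is consumed by dsl_dataset_rename_snapshot_sync_impl() */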
static int
dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
    dsl_dataset_t *hds, void *arg)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_dataset_t *ds;
	uint64_t val;
	dmu_tx_t *tx = ddrsa->ddrsa_tx;
	int error;

	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
	ASSERT(error == 0 || error == ENOENT);
	if (error == ENOENT) {
		/* ignore nonexistent snapshots */
		return (0);
	}

	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));

	/* log before we change the name */
	spa_history_log_internal_ds(ds, "rename", tx,
	    "-> @%s", ddrsa->ddrsa_newsnapname);

	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
	    B_FALSE));
	mutex_enter(&ds->ds_lock);
	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
	mutex_exit(&ds->ds_lock);
	VERIFY0(zap_add(dp->dp_meta_objset,
	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;

	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
	ddrsa->ddrsa_tx = tx;
	if (ddrsa->ddrsa_recursive) {
		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
		    DS_FIND_CHILDREN));
	} else {
		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
	}
	dsl_dataset_rele(hds, FTAG);
}

int
dsl_dataset_rename_snapshot(const char *fsname,
    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
{
	dsl_dataset_rename_snapshot_arg_t ddrsa;

	ddrsa.ddrsa_fsname = fsname;
	ddrsa.ddrsa_oldsnapname = oldsnapname;
	ddrsa.ddrsa_newsnapname = newsnapname;
	ddrsa.ddrsa_recursive = recursive;

	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
	    dsl_dataset_rename_snapshot_sync, &ddrsa,
	    1, ZFS_SPACE_CHECK_RESERVED));
}
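/*
 * Illustrative caller sketch (not part of this file; names are
 * hypothetical): a recursive rename such as
 * "zfs rename -r tank/fs@a tank/fs@b" reduces to
 *
 *	error = dsl_dataset_rename_snapshot("tank/fs", "a", "b", B_TRUE);
 *
 * with the check/sync callbacks above applied to tank/fs and each
 * descendant via dmu_objset_find_dp(..., DS_FIND_CHILDREN).
 */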
/*
 * If we're doing an ownership handoff, we need to make sure that there is
 * only one long hold on the dataset.  We're not allowed to change anything here
 * so we don't permanently release the long hold or regular hold here.  We want
 * to do this only when syncing to avoid the dataset unexpectedly going away
 * when we release the long hold.
 */
static int
dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
{
	boolean_t held;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	if (owner != NULL) {
		VERIFY3P(ds->ds_owner, ==, owner);
		dsl_dataset_long_rele(ds, owner);
	}

	held = dsl_dataset_long_held(ds);

	if (owner != NULL)
		dsl_dataset_long_hold(ds, owner);

	if (held)
		return (SET_ERROR(EBUSY));

	return (0);
}

typedef struct dsl_dataset_rollback_arg {
	const char *ddra_fsname;
	void *ddra_owner;
	nvlist_t *ddra_result;
} dsl_dataset_rollback_arg_t;

static int
dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rollback_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int64_t unused_refres_delta;
	int error;

	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
	if (error != 0)
		return (error);

	/* must not be a snapshot */
	if (dsl_dataset_is_snapshot(ds)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* must have a most recent snapshot */
	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* must not have any bookmarks after the most recent snapshot */
	nvlist_t *proprequest = fnvlist_alloc();
	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
	nvlist_t *bookmarks = fnvlist_alloc();
	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
	fnvlist_free(proprequest);
	if (error != 0) {
		/* don't leak the dataset hold or the bookmarks nvlist */
		fnvlist_free(bookmarks);
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}
	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
		nvlist_t *valuenv =
		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
			fnvlist_free(bookmarks);
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EEXIST));
		}
	}
	fnvlist_free(bookmarks);

	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * Check if the snap we are rolling back to uses more than
	 * the refquota.
	 */
	if (ds->ds_quota != 0 &&
	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * When we do the clone swap, we will temporarily use more space
	 * due to the refreservation (the head will no longer have any
	 * unique space, so the entire amount of the refreservation will need
	 * to be free).  We will immediately destroy the clone, freeing
	 * this space, but the freeing happens over many txg's.
	 */
	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
	    dsl_dataset_phys(ds)->ds_unique_bytes);

	if (unused_refres_delta > 0 &&
	    unused_refres_delta >
	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}
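/*
 * The rollback itself is implemented as a clone swap:
 *
 *	1. create a temporary, hidden clone ("%rollback") of the most
 *	   recent snapshot;
 *	2. swap the head's contents with that clone via
 *	   dsl_dataset_clone_swap_sync_impl() and zero the head's ZIL;
 *	3. destroy the clone, which now holds the pre-rollback contents.
 *
 * This keeps the rollback atomic within a single syncing txg, while the
 * actual freeing of the discarded blocks proceeds over later txgs.
 */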
static void
dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rollback_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds, *clone;
	uint64_t cloneobj;
	char namebuf[ZFS_MAXNAMELEN];

	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));

	dsl_dataset_name(ds->ds_prev, namebuf);
	fnvlist_add_string(ddra->ddra_result, "target", namebuf);

	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);

	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));

	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
	dsl_dataset_zero_zil(ds, tx);

	dsl_destroy_head_sync_impl(clone, tx);

	dsl_dataset_rele(clone, FTAG);
	dsl_dataset_rele(ds, FTAG);
}

/*
 * Rolls back the given filesystem or volume to the most recent snapshot.
 * The name of the most recent snapshot will be returned under key "target"
 * in the result nvlist.
 *
 * If owner != NULL:
 * - The existing dataset MUST be owned by the specified owner at entry
 * - Upon return, dataset will still be held by the same owner, whether we
 *   succeed or not.
 *
 * This mode is required any time the existing filesystem is mounted.  See
 * notes above zfs_suspend_fs() for further details.
 */
int
dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
{
	dsl_dataset_rollback_arg_t ddra;

	ddra.ddra_fsname = fsname;
	ddra.ddra_owner = owner;
	ddra.ddra_result = result;

	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
	    dsl_dataset_rollback_sync, &ddra,
	    1, ZFS_SPACE_CHECK_RESERVED));
}

struct promotenode {
	list_node_t link;
	dsl_dataset_t *ds;
};

typedef struct dsl_dataset_promote_arg {
	const char *ddpa_clonename;
	dsl_dataset_t *ddpa_clone;
	list_t shared_snaps, origin_snaps, clone_snaps;
	dsl_dataset_t *origin_origin; /* origin of the origin */
	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
	char *err_ds;
	cred_t *cr;
} dsl_dataset_promote_arg_t;

static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
    void *tag);
static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
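/*
 * To make the space accounting in the check function below concrete, a
 * hypothetical example: snapshots s0 (oldest), s1, s2 are shared, s2 is
 * the clone's origin, their "used" amounts are u0=5, u1=7, u2=6, and
 * their deadlists hold k0=0, k1=2, k2=4.  Summing the blocks each
 * snapshot gave birth to telescopes:
 *	(u2-u1+k2) + (u1-u0+k1) + (u0-0+k0) = u2+k2+k1+k0 = 12
 * which is exactly what the loop computes: start from the origin's
 * ds_referenced_bytes (u2 = 6) and add each shared snapshot's deadlist
 * space (4 + 2 + 0).
 */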
static int
dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	int err;
	uint64_t unused;
	uint64_t ss_mv_cnt;

	err = promote_hold(ddpa, dp, FTAG);
	if (err != 0)
		return (err);

	hds = ddpa->ddpa_clone;

	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
		promote_rele(ddpa, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Compute and check the amount of space to transfer.  Since this is
	 * so expensive, don't do the preliminary check.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		promote_rele(ddpa, FTAG);
		return (0);
	}

	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;

	/* compute origin's new unique space */
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
	    &ddpa->unique, &unused, &unused);

	/*
	 * Walk the snapshots that we are moving
	 *
	 * Compute space to transfer.  Consider the incremental changes
	 * to used by each snapshot:
	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
	 * So each snapshot gave birth to:
	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
	 * So a sequence would look like:
	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
	 * Which simplifies to:
	 * uN + kN + kN-1 + ... + k1 + k0
	 * Note however, if we stop before we reach the ORIGIN we get:
	 * uN + kN + kN-1 + ... + kM - uM-1
	 */
	ss_mv_cnt = 0;
	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *ds = snap->ds;

		ss_mv_cnt++;

		/*
		 * If there are long holds, we won't be able to evict
		 * the objset.
		 */
		if (dsl_dataset_long_held(ds)) {
			err = SET_ERROR(EBUSY);
			goto out;
		}

		/* Check that the snapshot name does not conflict */
		VERIFY0(dsl_dataset_get_snapname(ds));
		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
		if (err == 0) {
			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
			err = SET_ERROR(EEXIST);
			goto out;
		}
		if (err != ENOENT)
			goto out;

		/* The very first snapshot does not have a deadlist */
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
			continue;

		dsl_deadlist_space(&ds->ds_deadlist,
		    &dlused, &dlcomp, &dluncomp);
		ddpa->used += dlused;
		ddpa->comp += dlcomp;
		ddpa->uncomp += dluncomp;
	}

	/*
	 * If we are a clone of a clone then we never reached ORIGIN,
	 * so we need to subtract out the clone origin's used space.
	 */
	if (ddpa->origin_origin) {
		ddpa->used -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
		ddpa->comp -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
		ddpa->uncomp -=
		    dsl_dataset_phys(ddpa->origin_origin)->
		    ds_uncompressed_bytes;
	}

	/* Check that there is enough space and limit headroom here */
	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
	if (err != 0)
		goto out;

	/*
	 * Compute the amounts of space that will be used by snapshots
	 * after the promotion (for both origin and clone).  For each,
	 * it is the amount of space that will be on all of their
	 * deadlists (that was not born before their new origin).
	 */
	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		uint64_t space;

		/*
		 * Note, typically this will not be a clone of a clone,
		 * so dd_origin_txg will be < TXG_INITIAL, so
		 * these snaplist_space() -> dsl_deadlist_space_range()
		 * calls will be fast because they do not have to
		 * iterate over all bps.
		 */
		snap = list_head(&ddpa->origin_snaps);
		err = snaplist_space(&ddpa->shared_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
		if (err != 0)
			goto out;

		err = snaplist_space(&ddpa->clone_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &space);
		if (err != 0)
			goto out;
		ddpa->cloneusedsnap += space;
	}
	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
	    DD_FLAG_USED_BREAKDOWN) {
		err = snaplist_space(&ddpa->origin_snaps,
		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
		    &ddpa->originusedsnap);
		if (err != 0)
			goto out;
	}

out:
	promote_rele(ddpa, FTAG);
	return (err);
}
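/*
 * In outline, the sync task below performs the promotion as follows,
 * all within one syncing txg:
 *
 *	1. point the origin snapshot's next-snap at the clone's oldest
 *	   private snapshot and fix up the next-clones bookkeeping;
 *	2. swap origins: the promoted dir inherits the origin's origin,
 *	   and the old origin head becomes a clone of the promoted
 *	   dataset;
 *	3. move the shared snapshots (name ZAP entries, containing
 *	   dsl_dir, and any clone references) into the promoted dir;
 *	4. transfer the space accounting computed by the check function.
 */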
static void
dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	dsl_dataset_t *origin_head;
	dsl_dir_t *dd;
	dsl_dir_t *odd = NULL;
	uint64_t oldnext_obj;
	int64_t delta;

	VERIFY0(promote_hold(ddpa, dp, FTAG));
	hds = ddpa->ddpa_clone;

	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);

	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;
	dd = hds->ds_dir;

	snap = list_head(&ddpa->origin_snaps);
	origin_head = snap->ds;

	/*
	 * We need to explicitly open odd, since origin_ds's dd will be
	 * changing.
	 */
	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	/* change origin's next snap */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;

	/* change the origin's next clone */
	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
		dsl_dataset_remove_from_next_clones(origin_ds,
		    snap->ds->ds_object, tx);
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/* change origin */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
	origin_head->ds_dir->dd_origin_txg =
	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
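	/*
	 * Each dsl_dir's dd_clones ZAP tracks the heads cloned from its
	 * snapshots.  After the origin swap above, hds is (logically) a
	 * clone of origin_origin rather than of odd, and origin_head has
	 * become a clone of hds, so both move between dd_clones lists
	 * below.  (origin_origin should always be non-NULL in this
	 * branch: on pools at SPA_VERSION_DIR_CLONES, every head is
	 * expected to descend at least from the $ORIGIN snapshot.)
	 */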
	/* change dd_clone entries */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
		    hds->ds_object, tx));

		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
		    origin_head->ds_object, tx));
		if (dsl_dir_phys(dd)->dd_clones == 0) {
			dsl_dir_phys(dd)->dd_clones =
			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
			    DMU_OT_NONE, 0, tx);
		}
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
	}

	/* move snapshots to this dir */
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		dsl_dataset_t *ds = snap->ds;

		/*
		 * Property callbacks are registered to a particular
		 * dsl_dir.  Since ours is changing, evict the objset
		 * so that they will be unregistered from the old dsl_dir.
		 */
		if (ds->ds_objset) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}

		/* move snap name entry */
		VERIFY0(dsl_dataset_get_snapname(ds));
		VERIFY0(dsl_dataset_snap_remove(origin_head,
		    ds->ds_snapname, tx, B_TRUE));
		VERIFY0(zap_add(dp->dp_meta_objset,
		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));
		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

		/* change containing dsl_dir */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		dsl_dir_rele(ds->ds_dir, ds);
		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		/* move any clone references */
		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			zap_cursor_t zc;
			zap_attribute_t za;

			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    dsl_dataset_phys(ds)->ds_next_clones_obj);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				dsl_dataset_t *cnds;
				uint64_t o;

				if (za.za_first_integer == oldnext_obj) {
					/*
					 * We've already moved the
					 * origin's reference.
					 */
					continue;
				}

				VERIFY0(dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &cnds));
				o = dsl_dir_phys(cnds->ds_dir)->
				    dd_head_dataset_obj;

				VERIFY0(zap_remove_int(dp->dp_meta_objset,
				    dsl_dir_phys(odd)->dd_clones, o, tx));
				VERIFY0(zap_add_int(dp->dp_meta_objset,
				    dsl_dir_phys(dd)->dd_clones, o, tx));
				dsl_dataset_rele(cnds, FTAG);
			}
			zap_cursor_fini(&zc);
		}

		ASSERT(!dsl_prop_hascb(ds));
	}

	/*
	 * Change space accounting.
	 * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
	 * both be valid, or both be 0 (resulting in delta == 0).  This
	 * is true for each of {clone,origin} independently.
	 */

	delta = ddpa->cloneusedsnap -
	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, >=, 0);
	ASSERT3U(ddpa->used, >=, delta);
	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);

	delta = ddpa->originusedsnap -
	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, <=, 0);
	ASSERT3U(ddpa->used, >=, -delta);
	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(odd, DD_USED_HEAD,
	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);

	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;

	/* log history record */
	spa_history_log_internal_ds(hds, "promote", tx, "");

	dsl_dir_rele(odd, FTAG);
	promote_rele(ddpa, FTAG);
}
/*
 * Make a list of dsl_dataset_t's for the snapshots between first_obj
 * (exclusive) and last_obj (inclusive).  The list will be in reverse
 * order (last_obj will be the list_head()).  If first_obj == 0, do all
 * snapshots back to this dataset's origin.
 */
static int
snaplist_make(dsl_pool_t *dp,
    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
{
	uint64_t obj = last_obj;

	list_create(l, sizeof (struct promotenode),
	    offsetof(struct promotenode, link));

	while (obj != first_obj) {
		dsl_dataset_t *ds;
		struct promotenode *snap;
		int err;

		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
		ASSERT(err != ENOENT);
		if (err != 0)
			return (err);

		if (first_obj == 0)
			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;

		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
		snap->ds = ds;
		list_insert_tail(l, snap);
		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	}

	return (0);
}

static int
snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
{
	struct promotenode *snap;

	*spacep = 0;
	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
		uint64_t used, comp, uncomp;
		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
		*spacep += used;
	}
	return (0);
}

static void
snaplist_destroy(list_t *l, void *tag)
{
	struct promotenode *snap;

	if (l == NULL || !list_link_active(&l->list_head))
		return;

	while ((snap = list_tail(l)) != NULL) {
		list_remove(l, snap);
		dsl_dataset_rele(snap->ds, tag);
		kmem_free(snap, sizeof (*snap));
	}
	list_destroy(l);
}

static int
promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
{
	int error;
	dsl_dir_t *dd;
	struct promotenode *snap;

	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
	    &ddpa->ddpa_clone);
	if (error != 0)
		return (error);
	dd = ddpa->ddpa_clone->ds_dir;

	if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
	    !dsl_dir_is_clone(dd)) {
		dsl_dataset_rele(ddpa->ddpa_clone, tag);
		return (SET_ERROR(EINVAL));
	}

	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
	    &ddpa->shared_snaps, tag);
	if (error != 0)
		goto out;

	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
	    &ddpa->clone_snaps, tag);
	if (error != 0)
		goto out;

	snap = list_head(&ddpa->shared_snaps);
	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
	    &ddpa->origin_snaps, tag);
	if (error != 0)
		goto out;

	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
		error = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
		    tag, &ddpa->origin_origin);
		if (error != 0)
			goto out;
	}
out:
	if (error != 0)
		promote_rele(ddpa, tag);
	return (error);
}

static void
promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
{
	snaplist_destroy(&ddpa->shared_snaps, tag);
	snaplist_destroy(&ddpa->clone_snaps, tag);
	snaplist_destroy(&ddpa->origin_snaps, tag);
	if (ddpa->origin_origin != NULL)
		dsl_dataset_rele(ddpa->origin_origin, tag);
	dsl_dataset_rele(ddpa->ddpa_clone, tag);
}

/*
 * Promote a clone.
 *
 * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
 * in with the name.  (It must be at least MAXNAMELEN bytes long.)
 */
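/*
 * Illustrative caller sketch (hypothetical names): "zfs promote
 * tank/clone" amounts to
 *
 *	char conflsnap[MAXNAMELEN];
 *	error = dsl_dataset_promote("tank/clone", conflsnap);
 *
 * where EEXIST indicates a snapshot-name collision, with the offending
 * name copied into conflsnap.
 */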
int
dsl_dataset_promote(const char *name, char *conflsnap)
{
	dsl_dataset_promote_arg_t ddpa = { 0 };
	uint64_t numsnaps;
	int error;
	objset_t *os;

	/*
	 * We will modify space proportional to the number of
	 * snapshots.  Compute numsnaps.
	 */
	error = dmu_objset_hold(name, FTAG, &os);
	if (error != 0)
		return (error);
	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
	    &numsnaps);
	dmu_objset_rele(os, FTAG);
	if (error != 0)
		return (error);

	ddpa.ddpa_clonename = name;
	ddpa.err_ds = conflsnap;
	ddpa.cr = CRED();

	return (dsl_sync_task(name, dsl_dataset_promote_check,
	    dsl_dataset_promote_sync, &ddpa,
	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
}
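/*
 * Clone swap exchanges the on-disk contents of a clone with its origin
 * head in a single txg.  It is the primitive underlying rollback (see
 * dsl_dataset_rollback_sync() above) and the final step of a zfs
 * receive into an existing filesystem, which swaps in the temporary
 * "%recv" clone.
 */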
int
dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
{
	int64_t unused_refres_delta;

	/* they should both be heads */
	if (dsl_dataset_is_snapshot(clone) ||
	    dsl_dataset_is_snapshot(origin_head))
		return (SET_ERROR(EINVAL));

	/* if we are not forcing, the branch point should be just before them */
	if (!force && clone->ds_prev != origin_head->ds_prev)
		return (SET_ERROR(EINVAL));

	/* clone should be the clone (unless they are unrelated) */
	if (clone->ds_prev != NULL &&
	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
	    origin_head->ds_dir != clone->ds_prev->ds_dir)
		return (SET_ERROR(EINVAL));

	/* the clone should be a child of the origin */
	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
		return (SET_ERROR(EINVAL));

	/* origin_head shouldn't be modified unless 'force' */
	if (!force &&
	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
		return (SET_ERROR(ETXTBSY));

	/* origin_head should have no long holds (e.g. is not mounted) */
	if (dsl_dataset_handoff_check(origin_head, owner, tx))
		return (SET_ERROR(EBUSY));

	/* check amount of any unconsumed refreservation */
	unused_refres_delta =
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	if (unused_refres_delta > 0 &&
	    unused_refres_delta >
	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
		return (SET_ERROR(ENOSPC));

	/* clone can't be over the head's refquota */
	if (origin_head->ds_quota != 0 &&
	    dsl_dataset_phys(clone)->ds_referenced_bytes >
	    origin_head->ds_quota)
		return (SET_ERROR(EDQUOT));

	return (0);
}
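/*
 * A hypothetical example of unused_refres_delta: with a 10G
 * refreservation, 2G unique on origin_head and 7G unique on the clone,
 * the delta is MIN(10G, 2G) - MIN(10G, 7G) = -5G; swapping in the clone
 * consumes 5G more of the refreservation, so no additional free space
 * is needed.  With the uniques reversed, the delta would be +5G, which
 * must be available in the dsl_dir before the swap is allowed.
 */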
void
dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
    dsl_dataset_t *origin_head, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int64_t unused_refres_delta;

	ASSERT(clone->ds_reserved == 0);
	ASSERT(origin_head->ds_quota == 0 ||
	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);

	dmu_buf_will_dirty(clone->ds_dbuf, tx);
	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);

	if (clone->ds_objset != NULL) {
		dmu_objset_evict(clone->ds_objset);
		clone->ds_objset = NULL;
	}

	if (origin_head->ds_objset != NULL) {
		dmu_objset_evict(origin_head->ds_objset);
		origin_head->ds_objset = NULL;
	}

	unused_refres_delta =
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	/*
	 * Reset origin's unique bytes, if it exists.
	 */
	if (clone->ds_prev) {
		dsl_dataset_t *origin = clone->ds_prev;
		uint64_t comp, uncomp;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
	}

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = dsl_dataset_phys(origin_head)->ds_bp;
		dsl_dataset_phys(origin_head)->ds_bp =
		    dsl_dataset_phys(clone)->ds_bp;
		dsl_dataset_phys(clone)->ds_bp = tmp;
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		dsl_deadlist_space(&clone->ds_deadlist,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space(&origin_head->ds_deadlist,
		    &odl_used, &odl_comp, &odl_uncomp);

		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
		    cdl_used -
		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
		    odl_used);
		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
		    cdl_comp -
		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
		    odl_comp);
		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
		    odl_uncomp);

		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
		 */
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space_range(&origin_head->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &odl_used, &odl_comp, &odl_uncomp);
		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

	/* swap ds_*_bytes */
	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
	    dsl_dataset_phys(clone)->ds_referenced_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
	    dsl_dataset_phys(clone)->ds_compressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
	    unused_refres_delta, 0, 0, tx);
	/*
	 * Swap deadlists.
	 */
	dsl_deadlist_close(&clone->ds_deadlist);
	dsl_deadlist_close(&origin_head->ds_deadlist);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
	    dsl_dataset_phys(clone)->ds_deadlist_obj);
	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
	    dsl_dataset_phys(clone)->ds_deadlist_obj);
	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);

	dsl_scan_ds_clone_swapped(origin_head, clone, tx);

	spa_history_log_internal_ds(clone, "clone swap", tx,
	    "parent=%s", origin_head->ds_dir->dd_myname);
}

/*
 * Given a pool name and a dataset object number in that pool,
 * return the name of that dataset.
 */
int
dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	error = dsl_pool_hold(pname, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
	if (error == 0) {
		dsl_dataset_name(ds, buf);
		dsl_dataset_rele(ds, FTAG);
	}
	dsl_pool_rele(dp, FTAG);

	return (error);
}
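/*
 * Quota enforcement sketch for the function below: a write of asize
 * bytes is charged against the refquota.  If the current estimate
 * (referenced + in-flight) is over quota but the on-disk referenced
 * amount is still under it, or other writes are in flight that may yet
 * free space, the caller gets ERESTART (retry in a later txg) rather
 * than a hard EDQUOT.  For example (hypothetical numbers), with a 10G
 * refquota, 9.9G referenced and 200M in flight, the write is retried;
 * with 10.1G on disk and nothing in flight, it fails with EDQUOT.
 */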
int
dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
{
	int error = 0;

	ASSERT3S(asize, >, 0);

	/*
	 * *ref_rsrv is the portion of asize that will come from any
	 * unconsumed refreservation space.
	 */
	*ref_rsrv = 0;

	mutex_enter(&ds->ds_lock);
	/*
	 * Make a space adjustment for reserved bytes.
	 */
	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
		ASSERT3U(*used, >=,
		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
		*used -=
		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
		*ref_rsrv =
		    asize - MIN(asize, parent_delta(ds, asize + inflight));
	}

	if (!check_quota || ds->ds_quota == 0) {
		mutex_exit(&ds->ds_lock);
		return (0);
	}
	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes (which
	 * may free up space for us).
	 */
	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
	    ds->ds_quota) {
		if (inflight > 0 ||
		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
			error = SET_ERROR(ERESTART);
		else
			error = SET_ERROR(EDQUOT);
	}
	mutex_exit(&ds->ds_lock);

	return (error);
}

typedef struct dsl_dataset_set_qr_arg {
	const char *ddsqra_name;
	zprop_source_t ddsqra_source;
	uint64_t ddsqra_value;
} dsl_dataset_set_qr_arg_t;

/* ARGSUSED */
static int
dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;
	uint64_t newval;

	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
		return (SET_ERROR(ENOTSUP));

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);

	if (dsl_dataset_is_snapshot(ds)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EINVAL));
	}

	error = dsl_prop_predict(ds->ds_dir,
	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	if (newval == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
	    newval < ds->ds_reserved) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	dsl_prop_set_sync_impl(ds,
	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
	    &ddsqra->ddsqra_value, tx);

	VERIFY0(dsl_prop_get_int_ds(ds,
	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));

	if (ds->ds_quota != newval) {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_quota = newval;
	}
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
    uint64_t refquota)
{
	dsl_dataset_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = dsname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = refquota;

	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
}
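/*
 * The refreservation checks below hinge on the charged amount
 * MAX(unique, reserved).  A hypothetical example: with 3G unique and
 * the refreservation being raised from 2G to 8G, the charge grows from
 * MAX(3G, 2G) = 3G to MAX(3G, 8G) = 8G, so 5G of headroom (and a
 * refquota of at least 8G, if one is set) is required; lowering it back
 * shrinks the charge by the same 5G.
 */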
static int
dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;
	uint64_t newval, unique;

	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
		return (SET_ERROR(ENOTSUP));

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);

	if (dsl_dataset_is_snapshot(ds)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EINVAL));
	}

	error = dsl_prop_predict(ds->ds_dir,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	mutex_enter(&ds->ds_lock);
	if (!DS_UNIQUE_IS_ACCURATE(ds))
		dsl_dataset_recalc_head_uniq(ds);
	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
	mutex_exit(&ds->ds_lock);

	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
		uint64_t delta = MAX(unique, newval) -
		    MAX(unique, ds->ds_reserved);

		if (delta >
		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(ENOSPC));
		}
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

void
dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
{
	uint64_t newval;
	uint64_t unique;
	int64_t delta;

	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
	    source, sizeof (value), 1, &value, tx);

	VERIFY0(dsl_prop_get_int_ds(ds,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
	delta = MAX(0, (int64_t)(newval - unique)) -
	    MAX(0, (int64_t)(ds->ds_reserved - unique));
	ds->ds_reserved = newval;
	mutex_exit(&ds->ds_lock);

	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

static void
dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
	dsl_dataset_set_refreservation_sync_impl(ds,
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
    uint64_t refreservation)
{
	dsl_dataset_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = dsname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = refreservation;

	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
	    dsl_dataset_set_refreservation_sync, &ddsqra,
	    0, ZFS_SPACE_CHECK_NONE));
}
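/*
 * A hypothetical example of the "written" computation implemented by
 * dsl_dataset_space_written() below: if oldsnap references 10G, new
 * references 12G, and 3G that were live at oldsnap have since been
 * freed (i.e. they appear on the deadlists of new or of the snapshots
 * between the two, with birth txg <= oldsnap's creation txg), then
 * written = 12G - 10G + 3G = 5G.
 */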
/*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap.  New may be a snapshot or the head.  Old must be
 * a snapshot before new, in new's filesystem (or its origin).  If not then
 * fail and return EINVAL.
 *
 * The written space is calculated by considering two components: First, we
 * ignore any freed space, and calculate the written as new's used space
 * minus old's used space.  Next, we add in the amount of space that was freed
 * between the two snapshots, thus reducing new's used space relative to old's.
 * Specifically, this is the space that was born before old->ds_creation_txg,
 * and freed before new (ie. on new's deadlist or a previous deadlist).
 *
 * space freed                         [---------------------]
 * snapshots                       ---O-------O--------O-------O------
 *                                         oldsnap            new
 */
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = new->ds_dir->dd_pool;

	ASSERT(dsl_pool_config_held(dp));

	*usedp = 0;
	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;

	*compp = 0;
	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;

	*uncompp = 0;
	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;

	snapobj = new->ds_object;
	while (snapobj != oldsnap->ds_object) {
		dsl_dataset_t *snap;
		uint64_t used, comp, uncomp;

		if (snapobj == new->ds_object) {
			snap = new;
		} else {
			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
			if (err != 0)
				break;
		}

		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
			/*
			 * The blocks in the deadlist cannot be born after
			 * ds_prev_snap_txg, so get the whole deadlist space,
			 * which is more efficient (especially for old-format
			 * deadlists).  Unfortunately the deadlist code
			 * doesn't have enough information to make this
			 * optimization itself.
			 */
			dsl_deadlist_space(&snap->ds_deadlist,
			    &used, &comp, &uncomp);
		} else {
			dsl_deadlist_space_range(&snap->ds_deadlist,
			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
			    &used, &comp, &uncomp);
		}
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		/*
		 * If we get to the beginning of the chain of snapshots
		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
		 * was not a snapshot of/before new.
		 */
		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
		if (snap != new)
			dsl_dataset_rele(snap, FTAG);
		if (snapobj == 0) {
			err = SET_ERROR(EINVAL);
			break;
		}
	}
	return (err);
}

/*
 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
 * lastsnap, and all snapshots in between are deleted.
 *
 * blocks that would be freed            [---------------------------]
 * snapshots                       ---O-------O--------O-------O--------O
 *                                        firstsnap        lastsnap
 *
 * This is the set of blocks that were born after the snap before firstsnap,
 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
 * last snap (ie. is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
 * We calculate this by iterating over the relevant deadlists (from the snap
 * after lastsnap, backward to the snap after firstsnap), summing up the
 * space on the deadlist that was born after the snap before firstsnap.
 */
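/*
 * A hypothetical example for dsl_dataset_space_wouldfree() below:
 * deleting firstsnap..lastsnap frees exactly the blocks that were born
 * after the snapshot preceding firstsnap and that died before the
 * snapshot following lastsnap.  If the deadlists walked below hold 2G
 * and 1G of such blocks, 3G is reported; blocks still referenced by the
 * head or by surviving snapshots are not counted.
 */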
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
    dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

	ASSERT(dsl_dataset_is_snapshot(firstsnap));
	ASSERT(dsl_dataset_is_snapshot(lastsnap));

	/*
	 * Check that the snapshots are in the same dsl_dir, and firstsnap
	 * is before lastsnap.
	 */
	if (firstsnap->ds_dir != lastsnap->ds_dir ||
	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
		return (SET_ERROR(EINVAL));

	*usedp = *compp = *uncompp = 0;

	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
	while (snapobj != firstsnap->ds_object) {
		dsl_dataset_t *ds;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
		if (err != 0)
			break;

		dsl_deadlist_space_range(&ds->ds_deadlist,
		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		ASSERT3U(snapobj, !=, 0);
		dsl_dataset_rele(ds, FTAG);
	}
	return (err);
}

static int
dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
{
	const char *dsname = arg;
	dsl_dataset_t *ds;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error = 0;

	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SET_ERROR(ENOTSUP));

	ASSERT(spa_feature_is_enabled(dp->dp_spa,
	    SPA_FEATURE_EXTENSIBLE_DATASET));

	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
	if (error != 0)
		return (error);

	if (ds->ds_large_blocks)
		error = EALREADY;
	dsl_dataset_rele(ds, FTAG);

	return (error);
}

void
dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
	uint64_t zero = 0;

	spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);

	VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
	    sizeof (zero), 1, &zero, tx));
}

static void
dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
{
	const char *dsname = arg;
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));

	dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
	ASSERT(!ds->ds_large_blocks);
	ds->ds_large_blocks = B_TRUE;
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dataset_activate_large_blocks(const char *dsname)
{
	int error;

	error = dsl_sync_task(dsname,
	    dsl_dataset_activate_large_blocks_check,
	    dsl_dataset_activate_large_blocks_sync, (void *)dsname,
	    1, ZFS_SPACE_CHECK_RESERVED);

	/*
	 * EALREADY indicates that this dataset already supports large blocks.
	 */
	if (error == EALREADY)
		error = 0;
	return (error);
}
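/*
 * A hypothetical example for dsl_dataset_is_before() below: given
 * pool/a@s1 and a clone pool/b created from it, s1 is "before" pool/b
 * (it is the clone's origin), and so is any earlier snapshot of pool/a;
 * the walk recurses through dd_origin_obj until it either reaches
 * earlier's dsl_dir or runs out of origins.
 */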
/*
 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
 * For example, they could both be snapshots of the same filesystem, and
 * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
 * 'later's filesystem.  Or 'earlier' could be an older snapshot in the
 * origin's filesystem.  Or 'earlier' could be the origin's origin.
 *
 * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
 */
boolean_t
dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
    uint64_t earlier_txg)
{
	dsl_pool_t *dp = later->ds_dir->dd_pool;
	int error;
	boolean_t ret;

	ASSERT(dsl_pool_config_held(dp));
	ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0);

	if (earlier_txg == 0)
		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;

	if (dsl_dataset_is_snapshot(later) &&
	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
		return (B_FALSE);

	if (later->ds_dir == earlier->ds_dir)
		return (B_TRUE);
	if (!dsl_dir_is_clone(later->ds_dir))
		return (B_FALSE);

	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
		return (B_TRUE);
	dsl_dataset_t *origin;
	error = dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
	if (error != 0)
		return (B_FALSE);
	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
	dsl_dataset_rele(origin, FTAG);
	return (ret);
}

void
dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
}