1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/dsl_pool.h> 27 #include <sys/dsl_dataset.h> 28 #include <sys/dsl_dir.h> 29 #include <sys/dsl_synctask.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/dmu_objset.h> 32 #include <sys/arc.h> 33 #include <sys/zap.h> 34 #include <sys/zio.h> 35 #include <sys/zfs_context.h> 36 #include <sys/fs/zfs.h> 37 #include <sys/zfs_znode.h> 38 #include <sys/spa_impl.h> 39 40 int zfs_no_write_throttle = 0; 41 int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ 42 int zfs_txg_synctime = 5; /* target secs to sync a txg */ 43 44 uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ 45 uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ 46 uint64_t zfs_write_limit_inflated = 0; 47 uint64_t zfs_write_limit_override = 0; 48 49 kmutex_t zfs_write_limit_lock; 50 51 static pgcnt_t old_physmem = 0; 52 53 static int 54 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) 55 { 56 uint64_t obj; 57 int err; 58 59 err = zap_lookup(dp->dp_meta_objset, 60 dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, 61 name, sizeof (obj), 1, &obj); 62 if (err) 63 return (err); 64 65 return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); 66 } 67 68 static dsl_pool_t * 69 dsl_pool_open_impl(spa_t *spa, uint64_t txg) 70 { 71 dsl_pool_t *dp; 72 blkptr_t *bp = spa_get_rootblkptr(spa); 73 74 dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); 75 dp->dp_spa = spa; 76 dp->dp_meta_rootbp = *bp; 77 rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); 78 dp->dp_write_limit = zfs_write_limit_min; 79 txg_init(dp, txg); 80 81 txg_list_create(&dp->dp_dirty_datasets, 82 offsetof(dsl_dataset_t, ds_dirty_link)); 83 txg_list_create(&dp->dp_dirty_dirs, 84 offsetof(dsl_dir_t, dd_dirty_link)); 85 txg_list_create(&dp->dp_sync_tasks, 86 offsetof(dsl_sync_task_group_t, dstg_node)); 87 list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), 88 offsetof(dsl_dataset_t, ds_synced_link)); 89 90 mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); 91 mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); 92 93 dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 94 1, 4, 0); 95 96 return (dp); 97 } 98 99 int 100 dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) 101 { 102 int err; 103 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 104 dsl_dir_t *dd; 105 dsl_dataset_t *ds; 106 107 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 108 err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, 109 &dp->dp_meta_objset); 110 if (err) 111 goto out; 112 113 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 114 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, 115 &dp->dp_root_dir_obj); 116 if (err) 117 goto out; 118 119 err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, 120 NULL, dp, &dp->dp_root_dir); 121 if (err) 122 goto out; 123 124 err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); 125 if (err) 126 goto out; 127 128 if (spa_version(spa) >= SPA_VERSION_ORIGIN) { 129 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); 130 if (err) 131 goto out; 132 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, 133 FTAG, &ds); 134 if (err == 0) { 135 err = dsl_dataset_hold_obj(dp, 136 ds->ds_phys->ds_prev_snap_obj, dp, 137 &dp->dp_origin_snap); 138 dsl_dataset_rele(ds, FTAG); 139 } 140 dsl_dir_close(dd, dp); 141 if (err) 142 goto out; 143 } 144 145 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 146 DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, 147 &dp->dp_tmp_userrefs_obj); 148 if (err == ENOENT) 149 err = 0; 150 if (err) 151 goto out; 152 153 /* get scrub status */ 154 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 155 DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, 156 &dp->dp_scrub_func); 157 if (err == 0) { 158 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 159 DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, 160 &dp->dp_scrub_queue_obj); 161 if (err) 162 goto out; 163 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 164 DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, 165 &dp->dp_scrub_min_txg); 166 if (err) 167 goto out; 168 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 169 DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, 170 &dp->dp_scrub_max_txg); 171 if (err) 172 goto out; 173 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 174 DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, 175 &dp->dp_scrub_bookmark); 176 if (err) 177 goto out; 178 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 179 DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, 180 &spa->spa_scrub_errors); 181 if (err) 182 goto out; 183 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { 184 /* 185 * A new-type scrub was in progress on an old 186 * pool. Restart from the beginning, since the 187 * old software may have changed the pool in the 188 * meantime. 189 */ 190 dsl_pool_scrub_restart(dp); 191 } 192 } else { 193 /* 194 * It's OK if there is no scrub in progress (and if 195 * there was an I/O error, ignore it). 196 */ 197 err = 0; 198 } 199 200 out: 201 rw_exit(&dp->dp_config_rwlock); 202 if (err) 203 dsl_pool_close(dp); 204 else 205 *dpp = dp; 206 207 return (err); 208 } 209 210 void 211 dsl_pool_close(dsl_pool_t *dp) 212 { 213 /* drop our references from dsl_pool_open() */ 214 215 /* 216 * Since we held the origin_snap from "syncing" context (which 217 * includes pool-opening context), it actually only got a "ref" 218 * and not a hold, so just drop that here. 219 */ 220 if (dp->dp_origin_snap) 221 dsl_dataset_drop_ref(dp->dp_origin_snap, dp); 222 if (dp->dp_mos_dir) 223 dsl_dir_close(dp->dp_mos_dir, dp); 224 if (dp->dp_root_dir) 225 dsl_dir_close(dp->dp_root_dir, dp); 226 227 /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ 228 if (dp->dp_meta_objset) 229 dmu_objset_evict(dp->dp_meta_objset); 230 231 txg_list_destroy(&dp->dp_dirty_datasets); 232 txg_list_destroy(&dp->dp_dirty_dirs); 233 list_destroy(&dp->dp_synced_datasets); 234 235 arc_flush(dp->dp_spa); 236 txg_fini(dp); 237 rw_destroy(&dp->dp_config_rwlock); 238 mutex_destroy(&dp->dp_lock); 239 mutex_destroy(&dp->dp_scrub_cancel_lock); 240 taskq_destroy(dp->dp_vnrele_taskq); 241 if (dp->dp_blkstats) 242 kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 243 kmem_free(dp, sizeof (dsl_pool_t)); 244 } 245 246 dsl_pool_t * 247 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) 248 { 249 int err; 250 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 251 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 252 objset_t *os; 253 dsl_dataset_t *ds; 254 uint64_t dsobj; 255 256 /* create and open the MOS (meta-objset) */ 257 dp->dp_meta_objset = dmu_objset_create_impl(spa, 258 NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); 259 260 /* create the pool directory */ 261 err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 262 DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); 263 ASSERT3U(err, ==, 0); 264 265 /* create and open the root dir */ 266 dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); 267 VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, 268 NULL, dp, &dp->dp_root_dir)); 269 270 /* create and open the meta-objset dir */ 271 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); 272 VERIFY(0 == dsl_pool_open_special_dir(dp, 273 MOS_DIR_NAME, &dp->dp_mos_dir)); 274 275 if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) 276 dsl_pool_create_origin(dp, tx); 277 278 /* create the root dataset */ 279 dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); 280 281 /* create the root objset */ 282 VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 283 os = dmu_objset_create_impl(dp->dp_spa, ds, 284 dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); 285 #ifdef _KERNEL 286 zfs_create_fs(os, kcred, zplprops, tx); 287 #endif 288 dsl_dataset_rele(ds, FTAG); 289 290 dmu_tx_commit(tx); 291 292 return (dp); 293 } 294 295 void 296 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) 297 { 298 zio_t *zio; 299 dmu_tx_t *tx; 300 dsl_dir_t *dd; 301 dsl_dataset_t *ds; 302 dsl_sync_task_group_t *dstg; 303 objset_t *mos = dp->dp_meta_objset; 304 hrtime_t start, write_time; 305 uint64_t data_written; 306 int err; 307 308 tx = dmu_tx_create_assigned(dp, txg); 309 310 dp->dp_read_overhead = 0; 311 start = gethrtime(); 312 313 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 314 while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 315 /* 316 * We must not sync any non-MOS datasets twice, because 317 * we may have taken a snapshot of them. However, we 318 * may sync newly-created datasets on pass 2. 319 */ 320 ASSERT(!list_link_active(&ds->ds_synced_link)); 321 list_insert_tail(&dp->dp_synced_datasets, ds); 322 dsl_dataset_sync(ds, zio, tx); 323 } 324 DTRACE_PROBE(pool_sync__1setup); 325 err = zio_wait(zio); 326 327 write_time = gethrtime() - start; 328 ASSERT(err == 0); 329 DTRACE_PROBE(pool_sync__2rootzio); 330 331 for (ds = list_head(&dp->dp_synced_datasets); ds; 332 ds = list_next(&dp->dp_synced_datasets, ds)) 333 dmu_objset_do_userquota_callbacks(ds->ds_objset, tx); 334 335 /* 336 * Sync the datasets again to push out the changes due to 337 * userquota updates. This must be done before we process the 338 * sync tasks, because that could cause a snapshot of a dataset 339 * whose ds_bp will be rewritten when we do this 2nd sync. 340 */ 341 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 342 while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 343 ASSERT(list_link_active(&ds->ds_synced_link)); 344 dmu_buf_rele(ds->ds_dbuf, ds); 345 dsl_dataset_sync(ds, zio, tx); 346 } 347 err = zio_wait(zio); 348 349 while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { 350 /* 351 * No more sync tasks should have been added while we 352 * were syncing. 353 */ 354 ASSERT(spa_sync_pass(dp->dp_spa) == 1); 355 dsl_sync_task_group_sync(dstg, tx); 356 } 357 DTRACE_PROBE(pool_sync__3task); 358 359 start = gethrtime(); 360 while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) 361 dsl_dir_sync(dd, tx); 362 write_time += gethrtime() - start; 363 364 if (spa_sync_pass(dp->dp_spa) == 1) 365 dsl_pool_scrub_sync(dp, tx); 366 367 start = gethrtime(); 368 if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || 369 list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { 370 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 371 dmu_objset_sync(mos, zio, tx); 372 err = zio_wait(zio); 373 ASSERT(err == 0); 374 dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); 375 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 376 } 377 write_time += gethrtime() - start; 378 DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, 379 hrtime_t, dp->dp_read_overhead); 380 write_time -= dp->dp_read_overhead; 381 382 dmu_tx_commit(tx); 383 384 data_written = dp->dp_space_towrite[txg & TXG_MASK]; 385 dp->dp_space_towrite[txg & TXG_MASK] = 0; 386 ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); 387 388 /* 389 * If the write limit max has not been explicitly set, set it 390 * to a fraction of available physical memory (default 1/8th). 391 * Note that we must inflate the limit because the spa 392 * inflates write sizes to account for data replication. 393 * Check this each sync phase to catch changing memory size. 394 */ 395 if (physmem != old_physmem && zfs_write_limit_shift) { 396 mutex_enter(&zfs_write_limit_lock); 397 old_physmem = physmem; 398 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 399 zfs_write_limit_inflated = MAX(zfs_write_limit_min, 400 spa_get_asize(dp->dp_spa, zfs_write_limit_max)); 401 mutex_exit(&zfs_write_limit_lock); 402 } 403 404 /* 405 * Attempt to keep the sync time consistent by adjusting the 406 * amount of write traffic allowed into each transaction group. 407 * Weight the throughput calculation towards the current value: 408 * thru = 3/4 old_thru + 1/4 new_thru 409 */ 410 ASSERT(zfs_write_limit_min > 0); 411 if (data_written > zfs_write_limit_min / 8 && write_time > 0) { 412 uint64_t throughput = (data_written * NANOSEC) / write_time; 413 if (dp->dp_throughput) 414 dp->dp_throughput = throughput / 4 + 415 3 * dp->dp_throughput / 4; 416 else 417 dp->dp_throughput = throughput; 418 dp->dp_write_limit = MIN(zfs_write_limit_inflated, 419 MAX(zfs_write_limit_min, 420 dp->dp_throughput * zfs_txg_synctime)); 421 } 422 } 423 424 void 425 dsl_pool_zil_clean(dsl_pool_t *dp) 426 { 427 dsl_dataset_t *ds; 428 429 while (ds = list_head(&dp->dp_synced_datasets)) { 430 list_remove(&dp->dp_synced_datasets, ds); 431 ASSERT(ds->ds_objset != NULL); 432 zil_clean(ds->ds_objset->os_zil); 433 dmu_buf_rele(ds->ds_dbuf, ds); 434 } 435 } 436 437 /* 438 * TRUE if the current thread is the tx_sync_thread or if we 439 * are being called from SPA context during pool initialization. 440 */ 441 int 442 dsl_pool_sync_context(dsl_pool_t *dp) 443 { 444 return (curthread == dp->dp_tx.tx_sync_thread || 445 spa_get_dsl(dp->dp_spa) == NULL); 446 } 447 448 uint64_t 449 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) 450 { 451 uint64_t space, resv; 452 453 /* 454 * Reserve about 1.6% (1/64), or at least 32MB, for allocation 455 * efficiency. 456 * XXX The intent log is not accounted for, so it must fit 457 * within this slop. 458 * 459 * If we're trying to assess whether it's OK to do a free, 460 * cut the reservation in half to allow forward progress 461 * (e.g. make it possible to rm(1) files from a full pool). 462 */ 463 space = spa_get_dspace(dp->dp_spa); 464 resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); 465 if (netfree) 466 resv >>= 1; 467 468 return (space - resv); 469 } 470 471 int 472 dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) 473 { 474 uint64_t reserved = 0; 475 uint64_t write_limit = (zfs_write_limit_override ? 476 zfs_write_limit_override : dp->dp_write_limit); 477 478 if (zfs_no_write_throttle) { 479 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], 480 space); 481 return (0); 482 } 483 484 /* 485 * Check to see if we have exceeded the maximum allowed IO for 486 * this transaction group. We can do this without locks since 487 * a little slop here is ok. Note that we do the reserved check 488 * with only half the requested reserve: this is because the 489 * reserve requests are worst-case, and we really don't want to 490 * throttle based off of worst-case estimates. 491 */ 492 if (write_limit > 0) { 493 reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] 494 + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; 495 496 if (reserved && reserved > write_limit) 497 return (ERESTART); 498 } 499 500 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); 501 502 /* 503 * If this transaction group is over 7/8ths capacity, delay 504 * the caller 1 clock tick. This will slow down the "fill" 505 * rate until the sync process can catch up with us. 506 */ 507 if (reserved && reserved > (write_limit - (write_limit >> 3))) 508 txg_delay(dp, tx->tx_txg, 1); 509 510 return (0); 511 } 512 513 void 514 dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 515 { 516 ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); 517 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); 518 } 519 520 void 521 dsl_pool_memory_pressure(dsl_pool_t *dp) 522 { 523 uint64_t space_inuse = 0; 524 int i; 525 526 if (dp->dp_write_limit == zfs_write_limit_min) 527 return; 528 529 for (i = 0; i < TXG_SIZE; i++) { 530 space_inuse += dp->dp_space_towrite[i]; 531 space_inuse += dp->dp_tempreserved[i]; 532 } 533 dp->dp_write_limit = MAX(zfs_write_limit_min, 534 MIN(dp->dp_write_limit, space_inuse / 4)); 535 } 536 537 void 538 dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 539 { 540 if (space > 0) { 541 mutex_enter(&dp->dp_lock); 542 dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; 543 mutex_exit(&dp->dp_lock); 544 } 545 } 546 547 /* ARGSUSED */ 548 static int 549 upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 550 { 551 dmu_tx_t *tx = arg; 552 dsl_dataset_t *ds, *prev = NULL; 553 int err; 554 dsl_pool_t *dp = spa_get_dsl(spa); 555 556 err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); 557 if (err) 558 return (err); 559 560 while (ds->ds_phys->ds_prev_snap_obj != 0) { 561 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 562 FTAG, &prev); 563 if (err) { 564 dsl_dataset_rele(ds, FTAG); 565 return (err); 566 } 567 568 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) 569 break; 570 dsl_dataset_rele(ds, FTAG); 571 ds = prev; 572 prev = NULL; 573 } 574 575 if (prev == NULL) { 576 prev = dp->dp_origin_snap; 577 578 /* 579 * The $ORIGIN can't have any data, or the accounting 580 * will be wrong. 581 */ 582 ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); 583 584 /* The origin doesn't get attached to itself */ 585 if (ds->ds_object == prev->ds_object) { 586 dsl_dataset_rele(ds, FTAG); 587 return (0); 588 } 589 590 dmu_buf_will_dirty(ds->ds_dbuf, tx); 591 ds->ds_phys->ds_prev_snap_obj = prev->ds_object; 592 ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; 593 594 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 595 ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; 596 597 dmu_buf_will_dirty(prev->ds_dbuf, tx); 598 prev->ds_phys->ds_num_children++; 599 600 if (ds->ds_phys->ds_next_snap_obj == 0) { 601 ASSERT(ds->ds_prev == NULL); 602 VERIFY(0 == dsl_dataset_hold_obj(dp, 603 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 604 } 605 } 606 607 ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); 608 ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); 609 610 if (prev->ds_phys->ds_next_clones_obj == 0) { 611 prev->ds_phys->ds_next_clones_obj = 612 zap_create(dp->dp_meta_objset, 613 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 614 } 615 VERIFY(0 == zap_add_int(dp->dp_meta_objset, 616 prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); 617 618 dsl_dataset_rele(ds, FTAG); 619 if (prev != dp->dp_origin_snap) 620 dsl_dataset_rele(prev, FTAG); 621 return (0); 622 } 623 624 void 625 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) 626 { 627 ASSERT(dmu_tx_is_syncing(tx)); 628 ASSERT(dp->dp_origin_snap != NULL); 629 630 (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, 631 tx, DS_FIND_CHILDREN); 632 } 633 634 void 635 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) 636 { 637 uint64_t dsobj; 638 dsl_dataset_t *ds; 639 640 ASSERT(dmu_tx_is_syncing(tx)); 641 ASSERT(dp->dp_origin_snap == NULL); 642 643 /* create the origin dir, ds, & snap-ds */ 644 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 645 dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, 646 NULL, 0, kcred, tx); 647 VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 648 dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); 649 VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 650 dp, &dp->dp_origin_snap)); 651 dsl_dataset_rele(ds, FTAG); 652 rw_exit(&dp->dp_config_rwlock); 653 } 654 655 taskq_t * 656 dsl_pool_vnrele_taskq(dsl_pool_t *dp) 657 { 658 return (dp->dp_vnrele_taskq); 659 } 660 661 /* 662 * Walk through the pool-wide zap object of temporary snapshot user holds 663 * and release them. 664 */ 665 void 666 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) 667 { 668 zap_attribute_t za; 669 zap_cursor_t zc; 670 objset_t *mos = dp->dp_meta_objset; 671 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 672 673 if (zapobj == 0) 674 return; 675 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 676 677 for (zap_cursor_init(&zc, mos, zapobj); 678 zap_cursor_retrieve(&zc, &za) == 0; 679 zap_cursor_advance(&zc)) { 680 char *htag; 681 uint64_t dsobj; 682 683 htag = strchr(za.za_name, '-'); 684 *htag = '\0'; 685 ++htag; 686 dsobj = strtonum(za.za_name, NULL); 687 (void) dsl_dataset_user_release_tmp(dp, dsobj, htag); 688 } 689 zap_cursor_fini(&zc); 690 } 691 692 /* 693 * Create the pool-wide zap object for storing temporary snapshot holds. 694 */ 695 void 696 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) 697 { 698 objset_t *mos = dp->dp_meta_objset; 699 700 ASSERT(dp->dp_tmp_userrefs_obj == 0); 701 ASSERT(dmu_tx_is_syncing(tx)); 702 703 dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, 704 DMU_OT_NONE, 0, tx); 705 706 VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, 707 sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); 708 } 709 710 static int 711 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, 712 const char *tag, time_t *t, dmu_tx_t *tx, boolean_t holding) 713 { 714 objset_t *mos = dp->dp_meta_objset; 715 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 716 char *name; 717 int error; 718 719 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 720 ASSERT(dmu_tx_is_syncing(tx)); 721 722 /* 723 * If the pool was created prior to SPA_VERSION_USERREFS, the 724 * zap object for temporary holds might not exist yet. 725 */ 726 if (zapobj == 0) { 727 if (holding) { 728 dsl_pool_user_hold_create_obj(dp, tx); 729 zapobj = dp->dp_tmp_userrefs_obj; 730 } else { 731 return (ENOENT); 732 } 733 } 734 735 name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); 736 if (holding) 737 error = zap_add(mos, zapobj, name, 8, 1, t, tx); 738 else 739 error = zap_remove(mos, zapobj, name, tx); 740 strfree(name); 741 742 return (error); 743 } 744 745 /* 746 * Add a temporary hold for the given dataset object and tag. 747 */ 748 int 749 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 750 time_t *t, dmu_tx_t *tx) 751 { 752 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, t, tx, B_TRUE)); 753 } 754 755 /* 756 * Release a temporary hold for the given dataset object and tag. 757 */ 758 int 759 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 760 dmu_tx_t *tx) 761 { 762 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, 763 tx, B_FALSE)); 764 } 765