1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/dsl_pool.h> 27 #include <sys/dsl_dataset.h> 28 #include <sys/dsl_prop.h> 29 #include <sys/dsl_dir.h> 30 #include <sys/dsl_synctask.h> 31 #include <sys/dsl_scan.h> 32 #include <sys/dnode.h> 33 #include <sys/dmu_tx.h> 34 #include <sys/dmu_objset.h> 35 #include <sys/arc.h> 36 #include <sys/zap.h> 37 #include <sys/zio.h> 38 #include <sys/zfs_context.h> 39 #include <sys/fs/zfs.h> 40 #include <sys/zfs_znode.h> 41 #include <sys/spa_impl.h> 42 #include <sys/dsl_deadlist.h> 43 #include <sys/bptree.h> 44 #include <sys/zfeature.h> 45 #include <sys/zil_impl.h> 46 #include <sys/dsl_userhold.h> 47 48 int zfs_no_write_throttle = 0; 49 int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ 50 int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ 51 52 uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ 53 uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ 54 uint64_t zfs_write_limit_inflated = 0; 55 uint64_t zfs_write_limit_override = 0; 56 57 kmutex_t zfs_write_limit_lock; 58 59 static pgcnt_t old_physmem = 0; 60 61 hrtime_t zfs_throttle_delay = MSEC2NSEC(10); 62 hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); 63 64 int 65 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) 66 { 67 uint64_t obj; 68 int err; 69 70 err = zap_lookup(dp->dp_meta_objset, 71 dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, 72 name, sizeof (obj), 1, &obj); 73 if (err) 74 return (err); 75 76 return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); 77 } 78 79 static dsl_pool_t * 80 dsl_pool_open_impl(spa_t *spa, uint64_t txg) 81 { 82 dsl_pool_t *dp; 83 blkptr_t *bp = spa_get_rootblkptr(spa); 84 85 dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); 86 dp->dp_spa = spa; 87 dp->dp_meta_rootbp = *bp; 88 rrw_init(&dp->dp_config_rwlock, B_TRUE); 89 dp->dp_write_limit = zfs_write_limit_min; 90 txg_init(dp, txg); 91 92 txg_list_create(&dp->dp_dirty_datasets, 93 offsetof(dsl_dataset_t, ds_dirty_link)); 94 txg_list_create(&dp->dp_dirty_zilogs, 95 offsetof(zilog_t, zl_dirty_link)); 96 txg_list_create(&dp->dp_dirty_dirs, 97 offsetof(dsl_dir_t, dd_dirty_link)); 98 txg_list_create(&dp->dp_sync_tasks, 99 offsetof(dsl_sync_task_t, dst_node)); 100 101 mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); 102 103 dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 104 1, 4, 0); 105 106 return (dp); 107 } 108 109 int 110 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) 111 { 112 int err; 113 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 114 115 err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, 116 &dp->dp_meta_objset); 117 if (err != 0) 118 dsl_pool_close(dp); 119 else 120 *dpp = dp; 121 122 return (err); 123 } 124 125 int 126 dsl_pool_open(dsl_pool_t *dp) 127 { 128 int err; 129 dsl_dir_t *dd; 130 dsl_dataset_t *ds; 131 uint64_t obj; 132 133 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 134 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 135 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, 136 &dp->dp_root_dir_obj); 137 if (err) 138 goto out; 139 140 err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, 141 NULL, dp, &dp->dp_root_dir); 142 if (err) 143 goto out; 144 145 err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); 146 if (err) 147 goto out; 148 149 if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { 150 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); 151 if (err) 152 goto out; 153 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, 154 FTAG, &ds); 155 if (err == 0) { 156 err = dsl_dataset_hold_obj(dp, 157 ds->ds_phys->ds_prev_snap_obj, dp, 158 &dp->dp_origin_snap); 159 dsl_dataset_rele(ds, FTAG); 160 } 161 dsl_dir_rele(dd, dp); 162 if (err) 163 goto out; 164 } 165 166 if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { 167 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, 168 &dp->dp_free_dir); 169 if (err) 170 goto out; 171 172 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 173 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); 174 if (err) 175 goto out; 176 VERIFY0(bpobj_open(&dp->dp_free_bpobj, 177 dp->dp_meta_objset, obj)); 178 } 179 180 if (spa_feature_is_active(dp->dp_spa, 181 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { 182 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 183 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, 184 &dp->dp_bptree_obj); 185 if (err != 0) 186 goto out; 187 } 188 189 if (spa_feature_is_active(dp->dp_spa, 190 &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) { 191 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 192 DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, 193 &dp->dp_empty_bpobj); 194 if (err != 0) 195 goto out; 196 } 197 198 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 199 DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, 200 &dp->dp_tmp_userrefs_obj); 201 if (err == ENOENT) 202 err = 0; 203 if (err) 204 goto out; 205 206 err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); 207 208 out: 209 rrw_exit(&dp->dp_config_rwlock, FTAG); 210 return (err); 211 } 212 213 void 214 dsl_pool_close(dsl_pool_t *dp) 215 { 216 /* drop our references from dsl_pool_open() */ 217 218 /* 219 * Since we held the origin_snap from "syncing" context (which 220 * includes pool-opening context), it actually only got a "ref" 221 * and not a hold, so just drop that here. 222 */ 223 if (dp->dp_origin_snap) 224 dsl_dataset_rele(dp->dp_origin_snap, dp); 225 if (dp->dp_mos_dir) 226 dsl_dir_rele(dp->dp_mos_dir, dp); 227 if (dp->dp_free_dir) 228 dsl_dir_rele(dp->dp_free_dir, dp); 229 if (dp->dp_root_dir) 230 dsl_dir_rele(dp->dp_root_dir, dp); 231 232 bpobj_close(&dp->dp_free_bpobj); 233 234 /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ 235 if (dp->dp_meta_objset) 236 dmu_objset_evict(dp->dp_meta_objset); 237 238 txg_list_destroy(&dp->dp_dirty_datasets); 239 txg_list_destroy(&dp->dp_dirty_zilogs); 240 txg_list_destroy(&dp->dp_sync_tasks); 241 txg_list_destroy(&dp->dp_dirty_dirs); 242 243 arc_flush(dp->dp_spa); 244 txg_fini(dp); 245 dsl_scan_fini(dp); 246 rrw_destroy(&dp->dp_config_rwlock); 247 mutex_destroy(&dp->dp_lock); 248 taskq_destroy(dp->dp_vnrele_taskq); 249 if (dp->dp_blkstats) 250 kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 251 kmem_free(dp, sizeof (dsl_pool_t)); 252 } 253 254 dsl_pool_t * 255 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) 256 { 257 int err; 258 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 259 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 260 objset_t *os; 261 dsl_dataset_t *ds; 262 uint64_t obj; 263 264 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 265 266 /* create and open the MOS (meta-objset) */ 267 dp->dp_meta_objset = dmu_objset_create_impl(spa, 268 NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); 269 270 /* create the pool directory */ 271 err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 272 DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); 273 ASSERT0(err); 274 275 /* Initialize scan structures */ 276 VERIFY0(dsl_scan_init(dp, txg)); 277 278 /* create and open the root dir */ 279 dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); 280 VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, 281 NULL, dp, &dp->dp_root_dir)); 282 283 /* create and open the meta-objset dir */ 284 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); 285 VERIFY0(dsl_pool_open_special_dir(dp, 286 MOS_DIR_NAME, &dp->dp_mos_dir)); 287 288 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 289 /* create and open the free dir */ 290 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, 291 FREE_DIR_NAME, tx); 292 VERIFY0(dsl_pool_open_special_dir(dp, 293 FREE_DIR_NAME, &dp->dp_free_dir)); 294 295 /* create and open the free_bplist */ 296 obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); 297 VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 298 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); 299 VERIFY0(bpobj_open(&dp->dp_free_bpobj, 300 dp->dp_meta_objset, obj)); 301 } 302 303 if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) 304 dsl_pool_create_origin(dp, tx); 305 306 /* create the root dataset */ 307 obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); 308 309 /* create the root objset */ 310 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); 311 os = dmu_objset_create_impl(dp->dp_spa, ds, 312 dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); 313 #ifdef _KERNEL 314 zfs_create_fs(os, kcred, zplprops, tx); 315 #endif 316 dsl_dataset_rele(ds, FTAG); 317 318 dmu_tx_commit(tx); 319 320 rrw_exit(&dp->dp_config_rwlock, FTAG); 321 322 return (dp); 323 } 324 325 /* 326 * Account for the meta-objset space in its placeholder dsl_dir. 327 */ 328 void 329 dsl_pool_mos_diduse_space(dsl_pool_t *dp, 330 int64_t used, int64_t comp, int64_t uncomp) 331 { 332 ASSERT3U(comp, ==, uncomp); /* it's all metadata */ 333 mutex_enter(&dp->dp_lock); 334 dp->dp_mos_used_delta += used; 335 dp->dp_mos_compressed_delta += comp; 336 dp->dp_mos_uncompressed_delta += uncomp; 337 mutex_exit(&dp->dp_lock); 338 } 339 340 static int 341 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 342 { 343 dsl_deadlist_t *dl = arg; 344 dsl_deadlist_insert(dl, bp, tx); 345 return (0); 346 } 347 348 void 349 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) 350 { 351 zio_t *zio; 352 dmu_tx_t *tx; 353 dsl_dir_t *dd; 354 dsl_dataset_t *ds; 355 objset_t *mos = dp->dp_meta_objset; 356 hrtime_t start, write_time; 357 uint64_t data_written; 358 int err; 359 list_t synced_datasets; 360 361 list_create(&synced_datasets, sizeof (dsl_dataset_t), 362 offsetof(dsl_dataset_t, ds_synced_link)); 363 364 /* 365 * We need to copy dp_space_towrite() before doing 366 * dsl_sync_task_sync(), because 367 * dsl_dataset_snapshot_reserve_space() will increase 368 * dp_space_towrite but not actually write anything. 369 */ 370 data_written = dp->dp_space_towrite[txg & TXG_MASK]; 371 372 tx = dmu_tx_create_assigned(dp, txg); 373 374 dp->dp_read_overhead = 0; 375 start = gethrtime(); 376 377 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 378 while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 379 /* 380 * We must not sync any non-MOS datasets twice, because 381 * we may have taken a snapshot of them. However, we 382 * may sync newly-created datasets on pass 2. 383 */ 384 ASSERT(!list_link_active(&ds->ds_synced_link)); 385 list_insert_tail(&synced_datasets, ds); 386 dsl_dataset_sync(ds, zio, tx); 387 } 388 DTRACE_PROBE(pool_sync__1setup); 389 err = zio_wait(zio); 390 391 write_time = gethrtime() - start; 392 ASSERT(err == 0); 393 DTRACE_PROBE(pool_sync__2rootzio); 394 395 /* 396 * After the data blocks have been written (ensured by the zio_wait() 397 * above), update the user/group space accounting. 398 */ 399 for (ds = list_head(&synced_datasets); ds; 400 ds = list_next(&synced_datasets, ds)) 401 dmu_objset_do_userquota_updates(ds->ds_objset, tx); 402 403 /* 404 * Sync the datasets again to push out the changes due to 405 * userspace updates. This must be done before we process the 406 * sync tasks, so that any snapshots will have the correct 407 * user accounting information (and we won't get confused 408 * about which blocks are part of the snapshot). 409 */ 410 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 411 while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 412 ASSERT(list_link_active(&ds->ds_synced_link)); 413 dmu_buf_rele(ds->ds_dbuf, ds); 414 dsl_dataset_sync(ds, zio, tx); 415 } 416 err = zio_wait(zio); 417 418 /* 419 * Now that the datasets have been completely synced, we can 420 * clean up our in-memory structures accumulated while syncing: 421 * 422 * - move dead blocks from the pending deadlist to the on-disk deadlist 423 * - release hold from dsl_dataset_dirty() 424 */ 425 while (ds = list_remove_head(&synced_datasets)) { 426 objset_t *os = ds->ds_objset; 427 bplist_iterate(&ds->ds_pending_deadlist, 428 deadlist_enqueue_cb, &ds->ds_deadlist, tx); 429 ASSERT(!dmu_objset_is_dirty(os, txg)); 430 dmu_buf_rele(ds->ds_dbuf, ds); 431 } 432 433 start = gethrtime(); 434 while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) 435 dsl_dir_sync(dd, tx); 436 write_time += gethrtime() - start; 437 438 /* 439 * The MOS's space is accounted for in the pool/$MOS 440 * (dp_mos_dir). We can't modify the mos while we're syncing 441 * it, so we remember the deltas and apply them here. 442 */ 443 if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || 444 dp->dp_mos_uncompressed_delta != 0) { 445 dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, 446 dp->dp_mos_used_delta, 447 dp->dp_mos_compressed_delta, 448 dp->dp_mos_uncompressed_delta, tx); 449 dp->dp_mos_used_delta = 0; 450 dp->dp_mos_compressed_delta = 0; 451 dp->dp_mos_uncompressed_delta = 0; 452 } 453 454 start = gethrtime(); 455 if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || 456 list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { 457 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 458 dmu_objset_sync(mos, zio, tx); 459 err = zio_wait(zio); 460 ASSERT(err == 0); 461 dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); 462 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 463 } 464 write_time += gethrtime() - start; 465 DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, 466 hrtime_t, dp->dp_read_overhead); 467 write_time -= dp->dp_read_overhead; 468 469 /* 470 * If we modify a dataset in the same txg that we want to destroy it, 471 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. 472 * dsl_dir_destroy_check() will fail if there are unexpected holds. 473 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf 474 * and clearing the hold on it) before we process the sync_tasks. 475 * The MOS data dirtied by the sync_tasks will be synced on the next 476 * pass. 477 */ 478 DTRACE_PROBE(pool_sync__3task); 479 if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { 480 dsl_sync_task_t *dst; 481 /* 482 * No more sync tasks should have been added while we 483 * were syncing. 484 */ 485 ASSERT(spa_sync_pass(dp->dp_spa) == 1); 486 while (dst = txg_list_remove(&dp->dp_sync_tasks, txg)) 487 dsl_sync_task_sync(dst, tx); 488 } 489 490 dmu_tx_commit(tx); 491 492 dp->dp_space_towrite[txg & TXG_MASK] = 0; 493 ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); 494 495 /* 496 * If the write limit max has not been explicitly set, set it 497 * to a fraction of available physical memory (default 1/8th). 498 * Note that we must inflate the limit because the spa 499 * inflates write sizes to account for data replication. 500 * Check this each sync phase to catch changing memory size. 501 */ 502 if (physmem != old_physmem && zfs_write_limit_shift) { 503 mutex_enter(&zfs_write_limit_lock); 504 old_physmem = physmem; 505 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 506 zfs_write_limit_inflated = MAX(zfs_write_limit_min, 507 spa_get_asize(dp->dp_spa, zfs_write_limit_max)); 508 mutex_exit(&zfs_write_limit_lock); 509 } 510 511 /* 512 * Attempt to keep the sync time consistent by adjusting the 513 * amount of write traffic allowed into each transaction group. 514 * Weight the throughput calculation towards the current value: 515 * thru = 3/4 old_thru + 1/4 new_thru 516 * 517 * Note: write_time is in nanosecs while dp_throughput is expressed in 518 * bytes per millisecond. 519 */ 520 ASSERT(zfs_write_limit_min > 0); 521 if (data_written > zfs_write_limit_min / 8 && 522 write_time > MSEC2NSEC(1)) { 523 uint64_t throughput = data_written / NSEC2MSEC(write_time); 524 525 if (dp->dp_throughput) 526 dp->dp_throughput = throughput / 4 + 527 3 * dp->dp_throughput / 4; 528 else 529 dp->dp_throughput = throughput; 530 dp->dp_write_limit = MIN(zfs_write_limit_inflated, 531 MAX(zfs_write_limit_min, 532 dp->dp_throughput * zfs_txg_synctime_ms)); 533 } 534 } 535 536 void 537 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) 538 { 539 zilog_t *zilog; 540 dsl_dataset_t *ds; 541 542 while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) { 543 ds = dmu_objset_ds(zilog->zl_os); 544 zil_clean(zilog, txg); 545 ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); 546 dmu_buf_rele(ds->ds_dbuf, zilog); 547 } 548 ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); 549 } 550 551 /* 552 * TRUE if the current thread is the tx_sync_thread or if we 553 * are being called from SPA context during pool initialization. 554 */ 555 int 556 dsl_pool_sync_context(dsl_pool_t *dp) 557 { 558 return (curthread == dp->dp_tx.tx_sync_thread || 559 spa_is_initializing(dp->dp_spa)); 560 } 561 562 uint64_t 563 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) 564 { 565 uint64_t space, resv; 566 567 /* 568 * Reserve about 1.6% (1/64), or at least 32MB, for allocation 569 * efficiency. 570 * XXX The intent log is not accounted for, so it must fit 571 * within this slop. 572 * 573 * If we're trying to assess whether it's OK to do a free, 574 * cut the reservation in half to allow forward progress 575 * (e.g. make it possible to rm(1) files from a full pool). 576 */ 577 space = spa_get_dspace(dp->dp_spa); 578 resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); 579 if (netfree) 580 resv >>= 1; 581 582 return (space - resv); 583 } 584 585 int 586 dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) 587 { 588 uint64_t reserved = 0; 589 uint64_t write_limit = (zfs_write_limit_override ? 590 zfs_write_limit_override : dp->dp_write_limit); 591 592 if (zfs_no_write_throttle) { 593 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], 594 space); 595 return (0); 596 } 597 598 /* 599 * Check to see if we have exceeded the maximum allowed IO for 600 * this transaction group. We can do this without locks since 601 * a little slop here is ok. Note that we do the reserved check 602 * with only half the requested reserve: this is because the 603 * reserve requests are worst-case, and we really don't want to 604 * throttle based off of worst-case estimates. 605 */ 606 if (write_limit > 0) { 607 reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] 608 + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; 609 610 if (reserved && reserved > write_limit) 611 return (SET_ERROR(ERESTART)); 612 } 613 614 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); 615 616 /* 617 * If this transaction group is over 7/8ths capacity, delay 618 * the caller 1 clock tick. This will slow down the "fill" 619 * rate until the sync process can catch up with us. 620 */ 621 if (reserved && reserved > (write_limit - (write_limit >> 3))) { 622 txg_delay(dp, tx->tx_txg, zfs_throttle_delay, 623 zfs_throttle_resolution); 624 } 625 626 return (0); 627 } 628 629 void 630 dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 631 { 632 ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); 633 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); 634 } 635 636 void 637 dsl_pool_memory_pressure(dsl_pool_t *dp) 638 { 639 uint64_t space_inuse = 0; 640 int i; 641 642 if (dp->dp_write_limit == zfs_write_limit_min) 643 return; 644 645 for (i = 0; i < TXG_SIZE; i++) { 646 space_inuse += dp->dp_space_towrite[i]; 647 space_inuse += dp->dp_tempreserved[i]; 648 } 649 dp->dp_write_limit = MAX(zfs_write_limit_min, 650 MIN(dp->dp_write_limit, space_inuse / 4)); 651 } 652 653 void 654 dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 655 { 656 if (space > 0) { 657 mutex_enter(&dp->dp_lock); 658 dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; 659 mutex_exit(&dp->dp_lock); 660 } 661 } 662 663 /* ARGSUSED */ 664 static int 665 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 666 { 667 dmu_tx_t *tx = arg; 668 dsl_dataset_t *ds, *prev = NULL; 669 int err; 670 671 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 672 if (err) 673 return (err); 674 675 while (ds->ds_phys->ds_prev_snap_obj != 0) { 676 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 677 FTAG, &prev); 678 if (err) { 679 dsl_dataset_rele(ds, FTAG); 680 return (err); 681 } 682 683 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) 684 break; 685 dsl_dataset_rele(ds, FTAG); 686 ds = prev; 687 prev = NULL; 688 } 689 690 if (prev == NULL) { 691 prev = dp->dp_origin_snap; 692 693 /* 694 * The $ORIGIN can't have any data, or the accounting 695 * will be wrong. 696 */ 697 ASSERT0(prev->ds_phys->ds_bp.blk_birth); 698 699 /* The origin doesn't get attached to itself */ 700 if (ds->ds_object == prev->ds_object) { 701 dsl_dataset_rele(ds, FTAG); 702 return (0); 703 } 704 705 dmu_buf_will_dirty(ds->ds_dbuf, tx); 706 ds->ds_phys->ds_prev_snap_obj = prev->ds_object; 707 ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; 708 709 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 710 ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; 711 712 dmu_buf_will_dirty(prev->ds_dbuf, tx); 713 prev->ds_phys->ds_num_children++; 714 715 if (ds->ds_phys->ds_next_snap_obj == 0) { 716 ASSERT(ds->ds_prev == NULL); 717 VERIFY0(dsl_dataset_hold_obj(dp, 718 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 719 } 720 } 721 722 ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object); 723 ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object); 724 725 if (prev->ds_phys->ds_next_clones_obj == 0) { 726 dmu_buf_will_dirty(prev->ds_dbuf, tx); 727 prev->ds_phys->ds_next_clones_obj = 728 zap_create(dp->dp_meta_objset, 729 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 730 } 731 VERIFY0(zap_add_int(dp->dp_meta_objset, 732 prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); 733 734 dsl_dataset_rele(ds, FTAG); 735 if (prev != dp->dp_origin_snap) 736 dsl_dataset_rele(prev, FTAG); 737 return (0); 738 } 739 740 void 741 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) 742 { 743 ASSERT(dmu_tx_is_syncing(tx)); 744 ASSERT(dp->dp_origin_snap != NULL); 745 746 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, 747 tx, DS_FIND_CHILDREN)); 748 } 749 750 /* ARGSUSED */ 751 static int 752 upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 753 { 754 dmu_tx_t *tx = arg; 755 objset_t *mos = dp->dp_meta_objset; 756 757 if (ds->ds_dir->dd_phys->dd_origin_obj != 0) { 758 dsl_dataset_t *origin; 759 760 VERIFY0(dsl_dataset_hold_obj(dp, 761 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); 762 763 if (origin->ds_dir->dd_phys->dd_clones == 0) { 764 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); 765 origin->ds_dir->dd_phys->dd_clones = zap_create(mos, 766 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 767 } 768 769 VERIFY0(zap_add_int(dp->dp_meta_objset, 770 origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); 771 772 dsl_dataset_rele(origin, FTAG); 773 } 774 return (0); 775 } 776 777 void 778 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) 779 { 780 ASSERT(dmu_tx_is_syncing(tx)); 781 uint64_t obj; 782 783 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); 784 VERIFY0(dsl_pool_open_special_dir(dp, 785 FREE_DIR_NAME, &dp->dp_free_dir)); 786 787 /* 788 * We can't use bpobj_alloc(), because spa_version() still 789 * returns the old version, and we need a new-version bpobj with 790 * subobj support. So call dmu_object_alloc() directly. 791 */ 792 obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, 793 SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); 794 VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 795 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); 796 VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); 797 798 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 799 upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); 800 } 801 802 void 803 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) 804 { 805 uint64_t dsobj; 806 dsl_dataset_t *ds; 807 808 ASSERT(dmu_tx_is_syncing(tx)); 809 ASSERT(dp->dp_origin_snap == NULL); 810 ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); 811 812 /* create the origin dir, ds, & snap-ds */ 813 dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, 814 NULL, 0, kcred, tx); 815 VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 816 dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); 817 VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 818 dp, &dp->dp_origin_snap)); 819 dsl_dataset_rele(ds, FTAG); 820 } 821 822 taskq_t * 823 dsl_pool_vnrele_taskq(dsl_pool_t *dp) 824 { 825 return (dp->dp_vnrele_taskq); 826 } 827 828 /* 829 * Walk through the pool-wide zap object of temporary snapshot user holds 830 * and release them. 831 */ 832 void 833 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) 834 { 835 zap_attribute_t za; 836 zap_cursor_t zc; 837 objset_t *mos = dp->dp_meta_objset; 838 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 839 840 if (zapobj == 0) 841 return; 842 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 843 844 for (zap_cursor_init(&zc, mos, zapobj); 845 zap_cursor_retrieve(&zc, &za) == 0; 846 zap_cursor_advance(&zc)) { 847 char *htag; 848 uint64_t dsobj; 849 850 htag = strchr(za.za_name, '-'); 851 *htag = '\0'; 852 ++htag; 853 dsobj = strtonum(za.za_name, NULL); 854 dsl_dataset_user_release_tmp(dp, dsobj, htag); 855 } 856 zap_cursor_fini(&zc); 857 } 858 859 /* 860 * Create the pool-wide zap object for storing temporary snapshot holds. 861 */ 862 void 863 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) 864 { 865 objset_t *mos = dp->dp_meta_objset; 866 867 ASSERT(dp->dp_tmp_userrefs_obj == 0); 868 ASSERT(dmu_tx_is_syncing(tx)); 869 870 dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, 871 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); 872 } 873 874 static int 875 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, 876 const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) 877 { 878 objset_t *mos = dp->dp_meta_objset; 879 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 880 char *name; 881 int error; 882 883 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 884 ASSERT(dmu_tx_is_syncing(tx)); 885 886 /* 887 * If the pool was created prior to SPA_VERSION_USERREFS, the 888 * zap object for temporary holds might not exist yet. 889 */ 890 if (zapobj == 0) { 891 if (holding) { 892 dsl_pool_user_hold_create_obj(dp, tx); 893 zapobj = dp->dp_tmp_userrefs_obj; 894 } else { 895 return (SET_ERROR(ENOENT)); 896 } 897 } 898 899 name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); 900 if (holding) 901 error = zap_add(mos, zapobj, name, 8, 1, &now, tx); 902 else 903 error = zap_remove(mos, zapobj, name, tx); 904 strfree(name); 905 906 return (error); 907 } 908 909 /* 910 * Add a temporary hold for the given dataset object and tag. 911 */ 912 int 913 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 914 uint64_t now, dmu_tx_t *tx) 915 { 916 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); 917 } 918 919 /* 920 * Release a temporary hold for the given dataset object and tag. 921 */ 922 int 923 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 924 dmu_tx_t *tx) 925 { 926 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, 927 tx, B_FALSE)); 928 } 929 930 /* 931 * DSL Pool Configuration Lock 932 * 933 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset 934 * creation / destruction / rename / property setting). It must be held for 935 * read to hold a dataset or dsl_dir. I.e. you must call 936 * dsl_pool_config_enter() or dsl_pool_hold() before calling 937 * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock 938 * must be held continuously until all datasets and dsl_dirs are released. 939 * 940 * The only exception to this rule is that if a "long hold" is placed on 941 * a dataset, then the dp_config_rwlock may be dropped while the dataset 942 * is still held. The long hold will prevent the dataset from being 943 * destroyed -- the destroy will fail with EBUSY. A long hold can be 944 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset 945 * (by calling dsl_{dataset,objset}_{try}own{_obj}). 946 * 947 * Legitimate long-holders (including owners) should be long-running, cancelable 948 * tasks that should cause "zfs destroy" to fail. This includes DMU 949 * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), 950 * "zfs send", and "zfs diff". There are several other long-holders whose 951 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). 952 * 953 * The usual formula for long-holding would be: 954 * dsl_pool_hold() 955 * dsl_dataset_hold() 956 * ... perform checks ... 957 * dsl_dataset_long_hold() 958 * dsl_pool_rele() 959 * ... perform long-running task ... 960 * dsl_dataset_long_rele() 961 * dsl_dataset_rele() 962 * 963 * Note that when the long hold is released, the dataset is still held but 964 * the pool is not held. The dataset may change arbitrarily during this time 965 * (e.g. it could be destroyed). Therefore you shouldn't do anything to the 966 * dataset except release it. 967 * 968 * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only 969 * or modifying operations. 970 * 971 * Modifying operations should generally use dsl_sync_task(). The synctask 972 * infrastructure enforces proper locking strategy with respect to the 973 * dp_config_rwlock. See the comment above dsl_sync_task() for details. 974 * 975 * Read-only operations will manually hold the pool, then the dataset, obtain 976 * information from the dataset, then release the pool and dataset. 977 * dmu_objset_{hold,rele}() are convenience routines that also do the pool 978 * hold/rele. 979 */ 980 981 int 982 dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) 983 { 984 spa_t *spa; 985 int error; 986 987 error = spa_open(name, &spa, tag); 988 if (error == 0) { 989 *dp = spa_get_dsl(spa); 990 dsl_pool_config_enter(*dp, tag); 991 } 992 return (error); 993 } 994 995 void 996 dsl_pool_rele(dsl_pool_t *dp, void *tag) 997 { 998 dsl_pool_config_exit(dp, tag); 999 spa_close(dp->dp_spa, tag); 1000 } 1001 1002 void 1003 dsl_pool_config_enter(dsl_pool_t *dp, void *tag) 1004 { 1005 /* 1006 * We use a "reentrant" reader-writer lock, but not reentrantly. 1007 * 1008 * The rrwlock can (with the track_all flag) track all reading threads, 1009 * which is very useful for debugging which code path failed to release 1010 * the lock, and for verifying that the *current* thread does hold 1011 * the lock. 1012 * 1013 * (Unlike a rwlock, which knows that N threads hold it for 1014 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE 1015 * if any thread holds it for read, even if this thread doesn't). 1016 */ 1017 ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); 1018 rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); 1019 } 1020 1021 void 1022 dsl_pool_config_exit(dsl_pool_t *dp, void *tag) 1023 { 1024 rrw_exit(&dp->dp_config_rwlock, tag); 1025 } 1026 1027 boolean_t 1028 dsl_pool_config_held(dsl_pool_t *dp) 1029 { 1030 return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); 1031 } 1032