/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>

int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3;		/* 1/8th of physical memory */
int zfs_txg_synctime_ms = 1000;		/* target millisecs to sync a txg */

uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
uint64_t zfs_write_limit_inflated = 0;
uint64_t zfs_write_limit_override = 0;

kmutex_t zfs_write_limit_lock;

static pgcnt_t old_physmem = 0;

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
	dp->dp_write_limit = zfs_write_limit_min;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_group_t, dstg_node));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);

	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
	    1, 4, 0);

	return (dp);
}

int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	ASSERT(!dmu_objset_is_dirty_anywhere(dp->dp_meta_objset));

	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_close(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_feature_is_active(dp->dp_spa,
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our references from dsl_pool_open() */

	/*
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_close(dp->dp_free_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_close(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	arc_flush(dp->dp_spa);
	txg_fini(dp);
	dsl_scan_fini(dp);
	rw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* Initialize scan structures */
	VERIFY3U(0, ==, dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY(0 == dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

static int
deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_pool_t *dp = dmu_objset_pool(dl->dl_os);
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	dsl_deadlist_insert(dl, bp, tx);
	rw_exit(&dp->dp_config_rwlock);
	return (0);
}

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	hrtime_t start, write_time;
	uint64_t data_written;
	int err;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	/*
	 * We need to copy dp_space_towrite() before doing
	 * dsl_sync_task_group_sync(), because
	 * dsl_dataset_snapshot_reserve_space() will increase
	 * dp_space_towrite but not actually write anything.
	 */
	data_written = dp->dp_space_towrite[txg & TXG_MASK];

	tx = dmu_tx_create_assigned(dp, txg);

	dp->dp_read_overhead = 0;
	start = gethrtime();

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them.  However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	DTRACE_PROBE(pool_sync__1setup);
	err = zio_wait(zio);

	write_time = gethrtime() - start;
	ASSERT(err == 0);
	DTRACE_PROBE(pool_sync__2rootzio);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.
	 */
	for (ds = list_head(&synced_datasets); ds;
	    ds = list_next(&synced_datasets, ds))
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates.  This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	err = zio_wait(zio);

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk
	 *    deadlist
	 *  - clean up zil records
	 *  - release hold from dsl_dataset_dirty()
	 */
	while (ds = list_remove_head(&synced_datasets)) {
		objset_t *os = ds->ds_objset;
		bplist_iterate(&ds->ds_pending_deadlist,
		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
		ASSERT(!dmu_objset_is_dirty(os, txg));
		dmu_buf_rele(ds->ds_dbuf, ds);
	}

	start = gethrtime();
	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
		dsl_dir_sync(dd, tx);
	write_time += gethrtime() - start;

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir).  We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	start = gethrtime();
	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(mos, zio, tx);
		err = zio_wait(zio);
		ASSERT(err == 0);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}
	write_time += gethrtime() - start;
	DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
	    hrtime_t, dp->dp_read_overhead);
	write_time -= dp->dp_read_overhead;

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	DTRACE_PROBE(pool_sync__3task);
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_group_t *dstg;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
		while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
			dsl_sync_task_group_sync(dstg, tx);
	}

	dmu_tx_commit(tx);

	dp->dp_space_towrite[txg & TXG_MASK] = 0;
	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);

	/*
	 * If the write limit max has not been explicitly set, set it
	 * to a fraction of available physical memory (default 1/8th).
	 * Note that we must inflate the limit because the spa
	 * inflates write sizes to account for data replication.
	 * Check this each sync phase to catch changing memory size.
	 */
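	/*
	 * For illustration (assumed numbers, not from the code above):
	 * with the default zfs_write_limit_shift of 3, a machine with
	 * 8GB of physical memory gets a zfs_write_limit_max of 1GB,
	 * which spa_get_asize() then inflates for replication to form
	 * the zfs_write_limit_inflated ceiling (never below
	 * zfs_write_limit_min).
	 */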
	if (physmem != old_physmem && zfs_write_limit_shift) {
		mutex_enter(&zfs_write_limit_lock);
		old_physmem = physmem;
		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
		zfs_write_limit_inflated = MAX(zfs_write_limit_min,
		    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
		mutex_exit(&zfs_write_limit_lock);
	}

	/*
	 * Attempt to keep the sync time consistent by adjusting the
	 * amount of write traffic allowed into each transaction group.
	 * Weight the throughput calculation towards the current value:
	 *	thru = 3/4 old_thru + 1/4 new_thru
	 *
	 * Note: write_time is in nanosecs, so write_time/MICROSEC
	 * yields millisecs
	 */
	ASSERT(zfs_write_limit_min > 0);
	if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
		uint64_t throughput = data_written / (write_time / MICROSEC);

		if (dp->dp_throughput)
			dp->dp_throughput = throughput / 4 +
			    3 * dp->dp_throughput / 4;
		else
			dp->dp_throughput = throughput;
		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
		    MAX(zfs_write_limit_min,
		    dp->dp_throughput * zfs_txg_synctime_ms));
	}
}

void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;
	dsl_dataset_t *ds;

	while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
		ds = dmu_objset_ds(zilog->zl_os);
		zil_clean(zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa));
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
	 * efficiency.
	 * XXX The intent log is not accounted for, so it must fit
	 * within this slop.
	 *
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

int
dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
{
	uint64_t reserved = 0;
	uint64_t write_limit = (zfs_write_limit_override ?
	    zfs_write_limit_override : dp->dp_write_limit);

	if (zfs_no_write_throttle) {
		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
		    space);
		return (0);
	}

	/*
	 * Check to see if we have exceeded the maximum allowed IO for
	 * this transaction group.  We can do this without locks since
	 * a little slop here is ok.  Note that we do the reserved check
	 * with only half the requested reserve: this is because the
	 * reserve requests are worst-case, and we really don't want to
	 * throttle based off of worst-case estimates.
	 */
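	/*
	 * For illustration (hypothetical numbers): with a 1GB write
	 * limit, 700MB already slated to be written and 800MB of
	 * temporary reserves, the check below computes 700MB +
	 * 800MB/2 = 1100MB > 1GB and returns ERESTART; otherwise,
	 * callers start getting a one-tick txg_delay() once that sum
	 * exceeds 7/8 of the limit (896MB in this example).
	 */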
	if (write_limit > 0) {
		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;

		if (reserved && reserved > write_limit)
			return (ERESTART);
	}

	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);

	/*
	 * If this transaction group is over 7/8ths capacity, delay
	 * the caller 1 clock tick.  This will slow down the "fill"
	 * rate until the sync process can catch up with us.
	 */
	if (reserved && reserved > (write_limit - (write_limit >> 3)))
		txg_delay(dp, tx->tx_txg, 1);

	return (0);
}

void
dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
}

void
dsl_pool_memory_pressure(dsl_pool_t *dp)
{
	uint64_t space_inuse = 0;
	int i;

	if (dp->dp_write_limit == zfs_write_limit_min)
		return;

	for (i = 0; i < TXG_SIZE; i++) {
		space_inuse += dp->dp_space_towrite[i];
		space_inuse += dp->dp_tempreserved[i];
	}
	dp->dp_write_limit = MAX(zfs_write_limit_min,
	    MIN(dp->dp_write_limit, space_inuse / 4));
}

void
dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
		mutex_exit(&dp->dp_lock);
	}
}

/* ARGSUSED */
static int
upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_num_children++;

		if (ds->ds_phys->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
		}
	}

	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);

	if (prev->ds_phys->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN));
}

/* ARGSUSED */
static int
upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	if (ds->ds_dir->dd_phys->dd_origin_obj) {
		dsl_dataset_t *origin;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));

		if (origin->ds_dir->dd_phys->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
		}

		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));

		dsl_dataset_rele(origin, FTAG);
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support.  So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
	    dp->dp_meta_objset, obj));

	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);

	/* create the origin dir, ds, & snap-ds */
	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
	rw_exit(&dp->dp_config_rwlock);
}

taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_vnrele_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		uint64_t dsobj;

		htag = strchr(za.za_name, '-');
		*htag = '\0';
		++htag;
		dsobj = strtonum(za.za_name, NULL);
		(void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
	}
	zap_cursor_fini(&zc);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(dp->dp_tmp_userrefs_obj == 0);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (ENOENT);
		}
	}

	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	strfree(name);

	return (error);
}
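
/*
 * For example (hypothetical object number and tag), holding dataset
 * object 0x36 with tag "mytag" adds a ZAP entry named "36-mytag" whose
 * 8-byte value is the time passed in by the caller;
 * dsl_pool_clean_tmp_userrefs() later splits such names at the '-' to
 * recover the dataset object number and tag.
 */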

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t *now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
	    tx, B_FALSE));
}