/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>

int zfs_no_write_throttle = 0;
uint64_t zfs_write_limit_override = 0;


static int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);
	extern uint64_t zfs_write_limit_min;

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
	dp->dp_write_limit = zfs_write_limit_min;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_group_t, dstg_node));
	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);

	return (dp);
}

int
dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_impl_t *osi;

	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
	if (err)
		goto out;
	dp->dp_meta_objset = &osi->os;

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

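	/*
	 * If this pool version has a $ORIGIN dataset, take a long-term
	 * reference (tagged with dp) on its lone snapshot so it stays
	 * around for the lifetime of this dsl_pool_t; it is dropped in
	 * dsl_pool_close().
	 */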
	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
		    FTAG, &ds);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    dp, &dp->dp_origin_snap);
		if (err)
			goto out;
		dsl_dataset_rele(ds, FTAG);
		dsl_dir_close(dd, dp);
	}

	/* get scrub status */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
	    &dp->dp_scrub_func);
	if (err == 0) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
		    &dp->dp_scrub_queue_obj);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
		    &dp->dp_scrub_min_txg);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
		    &dp->dp_scrub_max_txg);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
		    &dp->dp_scrub_bookmark);
		if (err)
			goto out;
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
		    &spa->spa_scrub_errors);
		if (err)
			goto out;
		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
			/*
			 * A new-type scrub was in progress on an old
			 * pool.  Restart from the beginning, since the
			 * old software may have changed the pool in the
			 * meantime.
			 */
			dsl_pool_scrub_restart(dp);
		}
	} else {
		/*
		 * It's OK if there is no scrub in progress (and if
		 * there was an I/O error, ignore it).
		 */
		err = 0;
	}

out:
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

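/*
 * Tear down a dsl_pool_t, releasing everything that dsl_pool_open() or
 * dsl_pool_create() set up.  Also used to unwind a partially
 * constructed pool when dsl_pool_open() fails.
 */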
void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our references from dsl_pool_open() */

	/*
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_close(dp->dp_root_dir, dp);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(NULL, dp->dp_meta_objset->os);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_dirs);
	list_destroy(&dp->dp_synced_datasets);

	arc_flush(dp->dp_spa);
	txg_fini(dp);
	rw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	mutex_destroy(&dp->dp_scrub_cancel_lock);
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_impl_t *osip;
	dsl_dataset_t *ds;
	uint64_t dsobj;

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	osip = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
	zfs_create_fs(&osip->os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	return (dp);
}

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	dsl_sync_task_group_t *dstg;
	objset_impl_t *mosi = dp->dp_meta_objset->os;
	int err;

	tx = dmu_tx_create_assigned(dp, txg);

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		if (!list_link_active(&ds->ds_synced_link))
			list_insert_tail(&dp->dp_synced_datasets, ds);
		else
			dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	err = zio_wait(zio);
	ASSERT(err == 0);

	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
		dsl_sync_task_group_sync(dstg, tx);
	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
		dsl_dir_sync(dd, tx);

	if (spa_sync_pass(dp->dp_spa) == 1)
		dsl_pool_scrub_sync(dp, tx);

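	/*
	 * If anything in the MOS was dirtied or freed this txg, sync
	 * the MOS itself and hand its new root block pointer to the
	 * SPA so it can be written into the uberblock.
	 */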
	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(mosi, zio, tx);
		err = zio_wait(zio);
		ASSERT(err == 0);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}

	dmu_tx_commit(tx);
}

void
dsl_pool_zil_clean(dsl_pool_t *dp)
{
	dsl_dataset_t *ds;

	while (ds = list_head(&dp->dp_synced_datasets)) {
		list_remove(&dp->dp_synced_datasets, ds);
		ASSERT(ds->ds_user_ptr != NULL);
		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
		dmu_buf_rele(ds->ds_dbuf, ds);
	}
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_get_dsl(dp->dp_spa) == NULL);
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
	 * efficiency.
	 * XXX The intent log is not accounted for, so it must fit
	 * within this slop.
	 *
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

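/*
 * Charge "space" against the write throttle for tx's txg.  Returns
 * ERESTART once the txg's estimated writes exceed the write limit,
 * which pushes the caller into a later txg; when the txg is over
 * 7/8ths full, the caller is instead delayed one clock tick so that
 * writers cannot outrun the sync thread.
 */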
int
dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
{
	uint64_t reserved = 0;
	uint64_t write_limit = (zfs_write_limit_override ?
	    zfs_write_limit_override : dp->dp_write_limit);

	if (zfs_no_write_throttle) {
		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
		    space);
		return (0);
	}

	/*
	 * Check to see if we have exceeded the maximum allowed IO for
	 * this transaction group.  We can do this without locks since
	 * a little slop here is ok.  Note that we do the reserved check
	 * with only half the requested reserve: this is because the
	 * reserve requests are worst-case, and we really don't want to
	 * throttle based off of worst-case estimates.
	 */
	if (write_limit > 0) {
		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;

		if (reserved && reserved > write_limit)
			return (ERESTART);
	}

	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);

	/*
	 * If this transaction group is over 7/8ths capacity, delay
	 * the caller 1 clock tick.  This will slow down the "fill"
	 * rate until the sync process can catch up with us.
	 */
	if (reserved && reserved > (write_limit - (write_limit >> 3)))
		txg_delay(dp, tx->tx_txg, 1);

	return (0);
}

void
dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
}

void
dsl_pool_memory_pressure(dsl_pool_t *dp)
{
	extern uint64_t zfs_write_limit_min;
	uint64_t space_inuse = 0;
	int i;

	if (dp->dp_write_limit == zfs_write_limit_min)
		return;

	for (i = 0; i < TXG_SIZE; i++) {
		space_inuse += dp->dp_space_towrite[i];
		space_inuse += dp->dp_tempreserved[i];
	}
	dp->dp_write_limit = MAX(zfs_write_limit_min,
	    MIN(dp->dp_write_limit, space_inuse / 4));
}

void
dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
		mutex_exit(&dp->dp_lock);
	}
}

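/*
 * Callback for dmu_objset_find_spa(): walk a dataset back through its
 * snapshot chain to find its origin.  Filesystems that predate
 * SPA_VERSION_ORIGIN have no origin, so they are attached to the
 * $ORIGIN snapshot; every clone is then recorded in its origin's
 * next-clones zap, which older pool versions lacked.
 */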
/* ARGSUSED */
static int
upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_num_children++;

		if (ds->ds_phys->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
		}
	}

	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);

	if (prev->ds_phys->ds_next_clones_obj == 0) {
		prev->ds_phys->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	(void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN);
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);

	/* create the origin dir, ds, & snap-ds */
	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
	rw_exit(&dp->dp_config_rwlock);
}