1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/dmu_objset.h> 52 #include <sys/unique.h> 53 #include <sys/dsl_pool.h> 54 #include <sys/dsl_dataset.h> 55 #include <sys/dsl_dir.h> 56 #include <sys/dsl_prop.h> 57 #include <sys/dsl_synctask.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/callb.h> 60 61 int zio_taskq_threads = 8; 62 63 /* 64 * ========================================================================== 65 * SPA state manipulation (open/create/destroy/import/export) 66 * ========================================================================== 67 */ 68 69 static int 70 spa_error_entry_compare(const void *a, const void *b) 71 { 72 spa_error_entry_t *sa = (spa_error_entry_t *)a; 73 spa_error_entry_t *sb = (spa_error_entry_t *)b; 74 int ret; 75 76 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 77 sizeof (zbookmark_t)); 78 79 if (ret < 0) 80 return (-1); 81 else if (ret > 0) 82 return (1); 83 else 84 return (0); 85 } 86 87 /* 88 * Utility function which retrieves copies of the current logs and 89 * re-initializes them in the process. 90 */ 91 void 92 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 93 { 94 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 95 96 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 97 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 98 99 avl_create(&spa->spa_errlist_scrub, 100 spa_error_entry_compare, sizeof (spa_error_entry_t), 101 offsetof(spa_error_entry_t, se_avl)); 102 avl_create(&spa->spa_errlist_last, 103 spa_error_entry_compare, sizeof (spa_error_entry_t), 104 offsetof(spa_error_entry_t, se_avl)); 105 } 106 107 /* 108 * Activate an uninitialized pool. 
109 */ 110 static void 111 spa_activate(spa_t *spa) 112 { 113 int t; 114 115 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 116 117 spa->spa_state = POOL_STATE_ACTIVE; 118 119 spa->spa_normal_class = metaslab_class_create(); 120 121 for (t = 0; t < ZIO_TYPES; t++) { 122 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 123 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 124 TASKQ_PREPOPULATE); 125 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 126 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 127 TASKQ_PREPOPULATE); 128 } 129 130 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 131 132 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 133 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 134 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 135 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 136 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 137 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 140 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 141 142 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 143 offsetof(vdev_t, vdev_dirty_node)); 144 145 txg_list_create(&spa->spa_vdev_txg_list, 146 offsetof(struct vdev, vdev_txg_node)); 147 148 avl_create(&spa->spa_errlist_scrub, 149 spa_error_entry_compare, sizeof (spa_error_entry_t), 150 offsetof(spa_error_entry_t, se_avl)); 151 avl_create(&spa->spa_errlist_last, 152 spa_error_entry_compare, sizeof (spa_error_entry_t), 153 offsetof(spa_error_entry_t, se_avl)); 154 } 155 156 /* 157 * Opposite of spa_activate(). 158 */ 159 static void 160 spa_deactivate(spa_t *spa) 161 { 162 int t; 163 164 ASSERT(spa->spa_sync_on == B_FALSE); 165 ASSERT(spa->spa_dsl_pool == NULL); 166 ASSERT(spa->spa_root_vdev == NULL); 167 168 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 169 170 txg_list_destroy(&spa->spa_vdev_txg_list); 171 172 list_destroy(&spa->spa_dirty_list); 173 174 rw_destroy(&spa->spa_traverse_lock); 175 176 for (t = 0; t < ZIO_TYPES; t++) { 177 taskq_destroy(spa->spa_zio_issue_taskq[t]); 178 taskq_destroy(spa->spa_zio_intr_taskq[t]); 179 spa->spa_zio_issue_taskq[t] = NULL; 180 spa->spa_zio_intr_taskq[t] = NULL; 181 } 182 183 metaslab_class_destroy(spa->spa_normal_class); 184 spa->spa_normal_class = NULL; 185 186 /* 187 * If this was part of an import or the open otherwise failed, we may 188 * still have errors left in the queues. Empty them just in case. 189 */ 190 spa_errlog_drain(spa); 191 192 avl_destroy(&spa->spa_errlist_scrub); 193 avl_destroy(&spa->spa_errlist_last); 194 195 spa->spa_state = POOL_STATE_UNINITIALIZED; 196 } 197 198 /* 199 * Verify a pool configuration, and construct the vdev tree appropriately. This 200 * will create all the necessary vdevs in the appropriate layout, with each vdev 201 * in the CLOSED state. This will prep the pool before open/creation/import. 202 * All vdev validation is done by the vdev_alloc() routine. 
203 */ 204 static int 205 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 206 uint_t id, int atype) 207 { 208 nvlist_t **child; 209 uint_t c, children; 210 int error; 211 212 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 213 return (error); 214 215 if ((*vdp)->vdev_ops->vdev_op_leaf) 216 return (0); 217 218 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 219 &child, &children) != 0) { 220 vdev_free(*vdp); 221 *vdp = NULL; 222 return (EINVAL); 223 } 224 225 for (c = 0; c < children; c++) { 226 vdev_t *vd; 227 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 228 atype)) != 0) { 229 vdev_free(*vdp); 230 *vdp = NULL; 231 return (error); 232 } 233 } 234 235 ASSERT(*vdp != NULL); 236 237 return (0); 238 } 239 240 /* 241 * Opposite of spa_load(). 242 */ 243 static void 244 spa_unload(spa_t *spa) 245 { 246 int i; 247 248 /* 249 * Stop async tasks. 250 */ 251 spa_async_suspend(spa); 252 253 /* 254 * Stop syncing. 255 */ 256 if (spa->spa_sync_on) { 257 txg_sync_stop(spa->spa_dsl_pool); 258 spa->spa_sync_on = B_FALSE; 259 } 260 261 /* 262 * Wait for any outstanding prefetch I/O to complete. 263 */ 264 spa_config_enter(spa, RW_WRITER, FTAG); 265 spa_config_exit(spa, FTAG); 266 267 /* 268 * Close the dsl pool. 269 */ 270 if (spa->spa_dsl_pool) { 271 dsl_pool_close(spa->spa_dsl_pool); 272 spa->spa_dsl_pool = NULL; 273 } 274 275 /* 276 * Close all vdevs. 277 */ 278 if (spa->spa_root_vdev) 279 vdev_free(spa->spa_root_vdev); 280 ASSERT(spa->spa_root_vdev == NULL); 281 282 for (i = 0; i < spa->spa_nspares; i++) 283 vdev_free(spa->spa_spares[i]); 284 if (spa->spa_spares) { 285 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 286 spa->spa_spares = NULL; 287 } 288 if (spa->spa_sparelist) { 289 nvlist_free(spa->spa_sparelist); 290 spa->spa_sparelist = NULL; 291 } 292 293 spa->spa_async_suspended = 0; 294 } 295 296 /* 297 * Load (or re-load) the current list of vdevs describing the active spares for 298 * this pool. When this is called, we have some form of basic information in 299 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 300 * re-generate a more complete list including status information. 301 */ 302 static void 303 spa_load_spares(spa_t *spa) 304 { 305 nvlist_t **spares; 306 uint_t nspares; 307 int i; 308 vdev_t *vd, *tvd; 309 310 /* 311 * First, close and free any existing spare vdevs. 312 */ 313 for (i = 0; i < spa->spa_nspares; i++) { 314 vd = spa->spa_spares[i]; 315 316 /* Undo the call to spa_activate() below */ 317 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 318 tvd->vdev_isspare) 319 spa_spare_remove(tvd); 320 vdev_close(vd); 321 vdev_free(vd); 322 } 323 324 if (spa->spa_spares) 325 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 326 327 if (spa->spa_sparelist == NULL) 328 nspares = 0; 329 else 330 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 331 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 332 333 spa->spa_nspares = (int)nspares; 334 spa->spa_spares = NULL; 335 336 if (nspares == 0) 337 return; 338 339 /* 340 * Construct the array of vdevs, opening them to get status in the 341 * process. For each spare, there is potentially two different vdev_t 342 * structures associated with it: one in the list of spares (used only 343 * for basic validation purposes) and one in the active vdev 344 * configuration (if it's spared in). During this phase we open and 345 * validate each vdev on the spare list. 
If the vdev also exists in the 346 * active configuration, then we also mark this vdev as an active spare. 347 */ 348 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 349 for (i = 0; i < spa->spa_nspares; i++) { 350 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 351 VDEV_ALLOC_SPARE) == 0); 352 ASSERT(vd != NULL); 353 354 spa->spa_spares[i] = vd; 355 356 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 357 if (!tvd->vdev_isspare) 358 spa_spare_add(tvd); 359 360 /* 361 * We only mark the spare active if we were successfully 362 * able to load the vdev. Otherwise, importing a pool 363 * with a bad active spare would result in strange 364 * behavior, because multiple pool would think the spare 365 * is actively in use. 366 * 367 * There is a vulnerability here to an equally bizarre 368 * circumstance, where a dead active spare is later 369 * brought back to life (onlined or otherwise). Given 370 * the rarity of this scenario, and the extra complexity 371 * it adds, we ignore the possibility. 372 */ 373 if (!vdev_is_dead(tvd)) 374 spa_spare_activate(tvd); 375 } 376 377 if (vdev_open(vd) != 0) 378 continue; 379 380 vd->vdev_top = vd; 381 (void) vdev_validate_spare(vd); 382 } 383 384 /* 385 * Recompute the stashed list of spares, with status information 386 * this time. 387 */ 388 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 389 DATA_TYPE_NVLIST_ARRAY) == 0); 390 391 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 392 for (i = 0; i < spa->spa_nspares; i++) 393 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 394 B_TRUE, B_TRUE); 395 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 396 spares, spa->spa_nspares) == 0); 397 for (i = 0; i < spa->spa_nspares; i++) 398 nvlist_free(spares[i]); 399 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 400 } 401 402 static int 403 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 404 { 405 dmu_buf_t *db; 406 char *packed = NULL; 407 size_t nvsize = 0; 408 int error; 409 *value = NULL; 410 411 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 412 nvsize = *(uint64_t *)db->db_data; 413 dmu_buf_rele(db, FTAG); 414 415 packed = kmem_alloc(nvsize, KM_SLEEP); 416 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 417 if (error == 0) 418 error = nvlist_unpack(packed, nvsize, value, 0); 419 kmem_free(packed, nvsize); 420 421 return (error); 422 } 423 424 /* 425 * Load an existing storage pool, using the pool's builtin spa_config as a 426 * source of configuration information. 427 */ 428 static int 429 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 430 { 431 int error = 0; 432 nvlist_t *nvroot = NULL; 433 vdev_t *rvd; 434 uberblock_t *ub = &spa->spa_uberblock; 435 uint64_t config_cache_txg = spa->spa_config_txg; 436 uint64_t pool_guid; 437 uint64_t version; 438 zio_t *zio; 439 440 spa->spa_load_state = state; 441 442 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 443 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 444 error = EINVAL; 445 goto out; 446 } 447 448 /* 449 * Versioning wasn't explicitly added to the label until later, so if 450 * it's not present treat it as the initial version. 
451 */ 452 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 453 version = ZFS_VERSION_INITIAL; 454 455 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 456 &spa->spa_config_txg); 457 458 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 459 spa_guid_exists(pool_guid, 0)) { 460 error = EEXIST; 461 goto out; 462 } 463 464 spa->spa_load_guid = pool_guid; 465 466 /* 467 * Parse the configuration into a vdev tree. We explicitly set the 468 * value that will be returned by spa_version() since parsing the 469 * configuration requires knowing the version number. 470 */ 471 spa_config_enter(spa, RW_WRITER, FTAG); 472 spa->spa_ubsync.ub_version = version; 473 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 474 spa_config_exit(spa, FTAG); 475 476 if (error != 0) 477 goto out; 478 479 ASSERT(spa->spa_root_vdev == rvd); 480 ASSERT(spa_guid(spa) == pool_guid); 481 482 /* 483 * Try to open all vdevs, loading each label in the process. 484 */ 485 if (vdev_open(rvd) != 0) { 486 error = ENXIO; 487 goto out; 488 } 489 490 /* 491 * Validate the labels for all leaf vdevs. We need to grab the config 492 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 493 * flag. 494 */ 495 spa_config_enter(spa, RW_READER, FTAG); 496 error = vdev_validate(rvd); 497 spa_config_exit(spa, FTAG); 498 499 if (error != 0) { 500 error = EBADF; 501 goto out; 502 } 503 504 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 505 error = ENXIO; 506 goto out; 507 } 508 509 /* 510 * Find the best uberblock. 511 */ 512 bzero(ub, sizeof (uberblock_t)); 513 514 zio = zio_root(spa, NULL, NULL, 515 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 516 vdev_uberblock_load(zio, rvd, ub); 517 error = zio_wait(zio); 518 519 /* 520 * If we weren't able to find a single valid uberblock, return failure. 521 */ 522 if (ub->ub_txg == 0) { 523 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 524 VDEV_AUX_CORRUPT_DATA); 525 error = ENXIO; 526 goto out; 527 } 528 529 /* 530 * If the pool is newer than the code, we can't open it. 531 */ 532 if (ub->ub_version > ZFS_VERSION) { 533 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 534 VDEV_AUX_VERSION_NEWER); 535 error = ENOTSUP; 536 goto out; 537 } 538 539 /* 540 * If the vdev guid sum doesn't match the uberblock, we have an 541 * incomplete configuration. 542 */ 543 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 544 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 545 VDEV_AUX_BAD_GUID_SUM); 546 error = ENXIO; 547 goto out; 548 } 549 550 /* 551 * Initialize internal SPA structures. 
552 */ 553 spa->spa_state = POOL_STATE_ACTIVE; 554 spa->spa_ubsync = spa->spa_uberblock; 555 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 556 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 557 if (error) { 558 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 559 VDEV_AUX_CORRUPT_DATA); 560 goto out; 561 } 562 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 563 564 if (zap_lookup(spa->spa_meta_objset, 565 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 566 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 567 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 568 VDEV_AUX_CORRUPT_DATA); 569 error = EIO; 570 goto out; 571 } 572 573 if (!mosconfig) { 574 nvlist_t *newconfig; 575 576 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 577 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 578 VDEV_AUX_CORRUPT_DATA); 579 error = EIO; 580 goto out; 581 } 582 583 spa_config_set(spa, newconfig); 584 spa_unload(spa); 585 spa_deactivate(spa); 586 spa_activate(spa); 587 588 return (spa_load(spa, newconfig, state, B_TRUE)); 589 } 590 591 if (zap_lookup(spa->spa_meta_objset, 592 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 593 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 594 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 595 VDEV_AUX_CORRUPT_DATA); 596 error = EIO; 597 goto out; 598 } 599 600 /* 601 * Load the bit that tells us to use the new accounting function 602 * (raid-z deflation). If we have an older pool, this will not 603 * be present. 604 */ 605 error = zap_lookup(spa->spa_meta_objset, 606 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 607 sizeof (uint64_t), 1, &spa->spa_deflate); 608 if (error != 0 && error != ENOENT) { 609 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 610 VDEV_AUX_CORRUPT_DATA); 611 error = EIO; 612 goto out; 613 } 614 615 /* 616 * Load the persistent error log. If we have an older pool, this will 617 * not be present. 618 */ 619 error = zap_lookup(spa->spa_meta_objset, 620 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 621 sizeof (uint64_t), 1, &spa->spa_errlog_last); 622 if (error != 0 && error != ENOENT) { 623 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 624 VDEV_AUX_CORRUPT_DATA); 625 error = EIO; 626 goto out; 627 } 628 629 error = zap_lookup(spa->spa_meta_objset, 630 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 631 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 632 if (error != 0 && error != ENOENT) { 633 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 634 VDEV_AUX_CORRUPT_DATA); 635 error = EIO; 636 goto out; 637 } 638 639 /* 640 * Load the history object. If we have an older pool, this 641 * will not be present. 642 */ 643 error = zap_lookup(spa->spa_meta_objset, 644 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 645 sizeof (uint64_t), 1, &spa->spa_history); 646 if (error != 0 && error != ENOENT) { 647 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 648 VDEV_AUX_CORRUPT_DATA); 649 error = EIO; 650 goto out; 651 } 652 653 /* 654 * Load any hot spares for this pool. 
655 */ 656 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 657 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 658 if (error != 0 && error != ENOENT) { 659 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 660 VDEV_AUX_CORRUPT_DATA); 661 error = EIO; 662 goto out; 663 } 664 if (error == 0) { 665 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 666 if (load_nvlist(spa, spa->spa_spares_object, 667 &spa->spa_sparelist) != 0) { 668 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 669 VDEV_AUX_CORRUPT_DATA); 670 error = EIO; 671 goto out; 672 } 673 674 spa_config_enter(spa, RW_WRITER, FTAG); 675 spa_load_spares(spa); 676 spa_config_exit(spa, FTAG); 677 } 678 679 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 680 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 681 682 if (error && error != ENOENT) { 683 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 684 VDEV_AUX_CORRUPT_DATA); 685 error = EIO; 686 goto out; 687 } 688 689 if (error == 0) { 690 (void) zap_lookup(spa->spa_meta_objset, 691 spa->spa_pool_props_object, 692 zpool_prop_to_name(ZFS_PROP_BOOTFS), 693 sizeof (uint64_t), 1, &spa->spa_bootfs); 694 } 695 696 /* 697 * Load the vdev state for all toplevel vdevs. 698 */ 699 vdev_load(rvd); 700 701 /* 702 * Propagate the leaf DTLs we just loaded all the way up the tree. 703 */ 704 spa_config_enter(spa, RW_WRITER, FTAG); 705 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 706 spa_config_exit(spa, FTAG); 707 708 /* 709 * Check the state of the root vdev. If it can't be opened, it 710 * indicates one or more toplevel vdevs are faulted. 711 */ 712 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 713 error = ENXIO; 714 goto out; 715 } 716 717 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 718 dmu_tx_t *tx; 719 int need_update = B_FALSE; 720 int c; 721 722 /* 723 * Claim log blocks that haven't been committed yet. 724 * This must all happen in a single txg. 725 */ 726 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 727 spa_first_txg(spa)); 728 (void) dmu_objset_find(spa->spa_name, 729 zil_claim, tx, DS_FIND_CHILDREN); 730 dmu_tx_commit(tx); 731 732 spa->spa_sync_on = B_TRUE; 733 txg_sync_start(spa->spa_dsl_pool); 734 735 /* 736 * Wait for all claims to sync. 737 */ 738 txg_wait_synced(spa->spa_dsl_pool, 0); 739 740 /* 741 * If the config cache is stale, or we have uninitialized 742 * metaslabs (see spa_vdev_add()), then update the config. 743 */ 744 if (config_cache_txg != spa->spa_config_txg || 745 state == SPA_LOAD_IMPORT) 746 need_update = B_TRUE; 747 748 for (c = 0; c < rvd->vdev_children; c++) 749 if (rvd->vdev_child[c]->vdev_ms_array == 0) 750 need_update = B_TRUE; 751 752 /* 753 * Update the config cache asychronously in case we're the 754 * root pool, in which case the config cache isn't writable yet. 755 */ 756 if (need_update) 757 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 758 } 759 760 error = 0; 761 out: 762 if (error && error != EBADF) 763 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 764 spa->spa_load_state = SPA_LOAD_NONE; 765 spa->spa_ena = 0; 766 767 return (error); 768 } 769 770 /* 771 * Pool Open/Import 772 * 773 * The import case is identical to an open except that the configuration is sent 774 * down from userland, instead of grabbed from the configuration cache. For the 775 * case of an open, the pool configuration will exist in the 776 * POOL_STATE_UNITIALIZED state. 
777 * 778 * The stats information (gen/count/ustats) is used to gather vdev statistics at 779 * the same time open the pool, without having to keep around the spa_t in some 780 * ambiguous state. 781 */ 782 static int 783 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 784 { 785 spa_t *spa; 786 int error; 787 int loaded = B_FALSE; 788 int locked = B_FALSE; 789 790 *spapp = NULL; 791 792 /* 793 * As disgusting as this is, we need to support recursive calls to this 794 * function because dsl_dir_open() is called during spa_load(), and ends 795 * up calling spa_open() again. The real fix is to figure out how to 796 * avoid dsl_dir_open() calling this in the first place. 797 */ 798 if (mutex_owner(&spa_namespace_lock) != curthread) { 799 mutex_enter(&spa_namespace_lock); 800 locked = B_TRUE; 801 } 802 803 if ((spa = spa_lookup(pool)) == NULL) { 804 if (locked) 805 mutex_exit(&spa_namespace_lock); 806 return (ENOENT); 807 } 808 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 809 810 spa_activate(spa); 811 812 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 813 814 if (error == EBADF) { 815 /* 816 * If vdev_validate() returns failure (indicated by 817 * EBADF), it indicates that one of the vdevs indicates 818 * that the pool has been exported or destroyed. If 819 * this is the case, the config cache is out of sync and 820 * we should remove the pool from the namespace. 821 */ 822 zfs_post_ok(spa, NULL); 823 spa_unload(spa); 824 spa_deactivate(spa); 825 spa_remove(spa); 826 spa_config_sync(); 827 if (locked) 828 mutex_exit(&spa_namespace_lock); 829 return (ENOENT); 830 } 831 832 if (error) { 833 /* 834 * We can't open the pool, but we still have useful 835 * information: the state of each vdev after the 836 * attempted vdev_open(). Return this to the user. 837 */ 838 if (config != NULL && spa->spa_root_vdev != NULL) { 839 spa_config_enter(spa, RW_READER, FTAG); 840 *config = spa_config_generate(spa, NULL, -1ULL, 841 B_TRUE); 842 spa_config_exit(spa, FTAG); 843 } 844 spa_unload(spa); 845 spa_deactivate(spa); 846 spa->spa_last_open_failed = B_TRUE; 847 if (locked) 848 mutex_exit(&spa_namespace_lock); 849 *spapp = NULL; 850 return (error); 851 } else { 852 zfs_post_ok(spa, NULL); 853 spa->spa_last_open_failed = B_FALSE; 854 } 855 856 loaded = B_TRUE; 857 } 858 859 spa_open_ref(spa, tag); 860 if (locked) 861 mutex_exit(&spa_namespace_lock); 862 863 *spapp = spa; 864 865 if (config != NULL) { 866 spa_config_enter(spa, RW_READER, FTAG); 867 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 868 spa_config_exit(spa, FTAG); 869 } 870 871 /* 872 * If we just loaded the pool, resilver anything that's out of date. 873 */ 874 if (loaded && (spa_mode & FWRITE)) 875 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 876 877 return (0); 878 } 879 880 int 881 spa_open(const char *name, spa_t **spapp, void *tag) 882 { 883 return (spa_open_common(name, spapp, tag, NULL)); 884 } 885 886 /* 887 * Lookup the given spa_t, incrementing the inject count in the process, 888 * preventing it from being exported or destroyed. 
889 */ 890 spa_t * 891 spa_inject_addref(char *name) 892 { 893 spa_t *spa; 894 895 mutex_enter(&spa_namespace_lock); 896 if ((spa = spa_lookup(name)) == NULL) { 897 mutex_exit(&spa_namespace_lock); 898 return (NULL); 899 } 900 spa->spa_inject_ref++; 901 mutex_exit(&spa_namespace_lock); 902 903 return (spa); 904 } 905 906 void 907 spa_inject_delref(spa_t *spa) 908 { 909 mutex_enter(&spa_namespace_lock); 910 spa->spa_inject_ref--; 911 mutex_exit(&spa_namespace_lock); 912 } 913 914 static void 915 spa_add_spares(spa_t *spa, nvlist_t *config) 916 { 917 nvlist_t **spares; 918 uint_t i, nspares; 919 nvlist_t *nvroot; 920 uint64_t guid; 921 vdev_stat_t *vs; 922 uint_t vsc; 923 uint64_t pool; 924 925 if (spa->spa_nspares == 0) 926 return; 927 928 VERIFY(nvlist_lookup_nvlist(config, 929 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 930 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 931 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 932 if (nspares != 0) { 933 VERIFY(nvlist_add_nvlist_array(nvroot, 934 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 935 VERIFY(nvlist_lookup_nvlist_array(nvroot, 936 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 937 938 /* 939 * Go through and find any spares which have since been 940 * repurposed as an active spare. If this is the case, update 941 * their status appropriately. 942 */ 943 for (i = 0; i < nspares; i++) { 944 VERIFY(nvlist_lookup_uint64(spares[i], 945 ZPOOL_CONFIG_GUID, &guid) == 0); 946 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 947 VERIFY(nvlist_lookup_uint64_array( 948 spares[i], ZPOOL_CONFIG_STATS, 949 (uint64_t **)&vs, &vsc) == 0); 950 vs->vs_state = VDEV_STATE_CANT_OPEN; 951 vs->vs_aux = VDEV_AUX_SPARED; 952 } 953 } 954 } 955 } 956 957 int 958 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 959 { 960 int error; 961 spa_t *spa; 962 963 *config = NULL; 964 error = spa_open_common(name, &spa, FTAG, config); 965 966 if (spa && *config != NULL) { 967 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 968 spa_get_errlog_size(spa)) == 0); 969 970 spa_add_spares(spa, *config); 971 } 972 973 /* 974 * We want to get the alternate root even for faulted pools, so we cheat 975 * and call spa_lookup() directly. 976 */ 977 if (altroot) { 978 if (spa == NULL) { 979 mutex_enter(&spa_namespace_lock); 980 spa = spa_lookup(name); 981 if (spa) 982 spa_altroot(spa, altroot, buflen); 983 else 984 altroot[0] = '\0'; 985 spa = NULL; 986 mutex_exit(&spa_namespace_lock); 987 } else { 988 spa_altroot(spa, altroot, buflen); 989 } 990 } 991 992 if (spa != NULL) 993 spa_close(spa, FTAG); 994 995 return (error); 996 } 997 998 /* 999 * Validate that the 'spares' array is well formed. We must have an array of 1000 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1001 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1002 * as they are well-formed. 1003 */ 1004 static int 1005 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1006 { 1007 nvlist_t **spares; 1008 uint_t i, nspares; 1009 vdev_t *vd; 1010 int error; 1011 1012 /* 1013 * It's acceptable to have no spares specified. 1014 */ 1015 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1016 &spares, &nspares) != 0) 1017 return (0); 1018 1019 if (nspares == 0) 1020 return (EINVAL); 1021 1022 /* 1023 * Make sure the pool is formatted with a version that supports hot 1024 * spares. 
1025 */ 1026 if (spa_version(spa) < ZFS_VERSION_SPARES) 1027 return (ENOTSUP); 1028 1029 /* 1030 * Set the pending spare list so we correctly handle device in-use 1031 * checking. 1032 */ 1033 spa->spa_pending_spares = spares; 1034 spa->spa_pending_nspares = nspares; 1035 1036 for (i = 0; i < nspares; i++) { 1037 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1038 mode)) != 0) 1039 goto out; 1040 1041 if (!vd->vdev_ops->vdev_op_leaf) { 1042 vdev_free(vd); 1043 error = EINVAL; 1044 goto out; 1045 } 1046 1047 vd->vdev_top = vd; 1048 1049 if ((error = vdev_open(vd)) == 0 && 1050 (error = vdev_label_init(vd, crtxg, 1051 VDEV_LABEL_SPARE)) == 0) { 1052 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1053 vd->vdev_guid) == 0); 1054 } 1055 1056 vdev_free(vd); 1057 1058 if (error && mode != VDEV_ALLOC_SPARE) 1059 goto out; 1060 else 1061 error = 0; 1062 } 1063 1064 out: 1065 spa->spa_pending_spares = NULL; 1066 spa->spa_pending_nspares = 0; 1067 return (error); 1068 } 1069 1070 /* 1071 * Pool Creation 1072 */ 1073 int 1074 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1075 { 1076 spa_t *spa; 1077 vdev_t *rvd; 1078 dsl_pool_t *dp; 1079 dmu_tx_t *tx; 1080 int c, error = 0; 1081 uint64_t txg = TXG_INITIAL; 1082 nvlist_t **spares; 1083 uint_t nspares; 1084 1085 /* 1086 * If this pool already exists, return failure. 1087 */ 1088 mutex_enter(&spa_namespace_lock); 1089 if (spa_lookup(pool) != NULL) { 1090 mutex_exit(&spa_namespace_lock); 1091 return (EEXIST); 1092 } 1093 1094 /* 1095 * Allocate a new spa_t structure. 1096 */ 1097 spa = spa_add(pool, altroot); 1098 spa_activate(spa); 1099 1100 spa->spa_uberblock.ub_txg = txg - 1; 1101 spa->spa_uberblock.ub_version = ZFS_VERSION; 1102 spa->spa_ubsync = spa->spa_uberblock; 1103 1104 /* 1105 * Create the root vdev. 1106 */ 1107 spa_config_enter(spa, RW_WRITER, FTAG); 1108 1109 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1110 1111 ASSERT(error != 0 || rvd != NULL); 1112 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1113 1114 if (error == 0 && rvd->vdev_children == 0) 1115 error = EINVAL; 1116 1117 if (error == 0 && 1118 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1119 (error = spa_validate_spares(spa, nvroot, txg, 1120 VDEV_ALLOC_ADD)) == 0) { 1121 for (c = 0; c < rvd->vdev_children; c++) 1122 vdev_init(rvd->vdev_child[c], txg); 1123 vdev_config_dirty(rvd); 1124 } 1125 1126 spa_config_exit(spa, FTAG); 1127 1128 if (error != 0) { 1129 spa_unload(spa); 1130 spa_deactivate(spa); 1131 spa_remove(spa); 1132 mutex_exit(&spa_namespace_lock); 1133 return (error); 1134 } 1135 1136 /* 1137 * Get the list of spares, if specified. 1138 */ 1139 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1140 &spares, &nspares) == 0) { 1141 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1142 KM_SLEEP) == 0); 1143 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1144 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1145 spa_config_enter(spa, RW_WRITER, FTAG); 1146 spa_load_spares(spa); 1147 spa_config_exit(spa, FTAG); 1148 spa->spa_sync_spares = B_TRUE; 1149 } 1150 1151 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1152 spa->spa_meta_objset = dp->dp_meta_objset; 1153 1154 tx = dmu_tx_create_assigned(dp, txg); 1155 1156 /* 1157 * Create the pool config object. 
1158 */ 1159 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1160 DMU_OT_PACKED_NVLIST, 1 << 14, 1161 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1162 1163 if (zap_add(spa->spa_meta_objset, 1164 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1165 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1166 cmn_err(CE_PANIC, "failed to add pool config"); 1167 } 1168 1169 /* Newly created pools are always deflated. */ 1170 spa->spa_deflate = TRUE; 1171 if (zap_add(spa->spa_meta_objset, 1172 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1173 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1174 cmn_err(CE_PANIC, "failed to add deflate"); 1175 } 1176 1177 /* 1178 * Create the deferred-free bplist object. Turn off compression 1179 * because sync-to-convergence takes longer if the blocksize 1180 * keeps changing. 1181 */ 1182 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1183 1 << 14, tx); 1184 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1185 ZIO_COMPRESS_OFF, tx); 1186 1187 if (zap_add(spa->spa_meta_objset, 1188 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1189 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1190 cmn_err(CE_PANIC, "failed to add bplist"); 1191 } 1192 1193 /* 1194 * Create the pool's history object. 1195 */ 1196 spa_history_create_obj(spa, tx); 1197 1198 dmu_tx_commit(tx); 1199 1200 spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1201 spa->spa_sync_on = B_TRUE; 1202 txg_sync_start(spa->spa_dsl_pool); 1203 1204 /* 1205 * We explicitly wait for the first transaction to complete so that our 1206 * bean counters are appropriately updated. 1207 */ 1208 txg_wait_synced(spa->spa_dsl_pool, txg); 1209 1210 spa_config_sync(); 1211 1212 mutex_exit(&spa_namespace_lock); 1213 1214 return (0); 1215 } 1216 1217 /* 1218 * Import the given pool into the system. We set up the necessary spa_t and 1219 * then call spa_load() to do the dirty work. 1220 */ 1221 int 1222 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1223 { 1224 spa_t *spa; 1225 int error; 1226 nvlist_t *nvroot; 1227 nvlist_t **spares; 1228 uint_t nspares; 1229 1230 if (!(spa_mode & FWRITE)) 1231 return (EROFS); 1232 1233 /* 1234 * If a pool with this name exists, return failure. 1235 */ 1236 mutex_enter(&spa_namespace_lock); 1237 if (spa_lookup(pool) != NULL) { 1238 mutex_exit(&spa_namespace_lock); 1239 return (EEXIST); 1240 } 1241 1242 /* 1243 * Create and initialize the spa structure. 1244 */ 1245 spa = spa_add(pool, altroot); 1246 spa_activate(spa); 1247 1248 /* 1249 * Pass off the heavy lifting to spa_load(). 1250 * Pass TRUE for mosconfig because the user-supplied config 1251 * is actually the one to trust when doing an import. 1252 */ 1253 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1254 1255 spa_config_enter(spa, RW_WRITER, FTAG); 1256 /* 1257 * Toss any existing sparelist, as it doesn't have any validity anymore, 1258 * and conflicts with spa_has_spare(). 
1259 */ 1260 if (spa->spa_sparelist) { 1261 nvlist_free(spa->spa_sparelist); 1262 spa->spa_sparelist = NULL; 1263 spa_load_spares(spa); 1264 } 1265 1266 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1267 &nvroot) == 0); 1268 if (error == 0) 1269 error = spa_validate_spares(spa, nvroot, -1ULL, 1270 VDEV_ALLOC_SPARE); 1271 spa_config_exit(spa, FTAG); 1272 1273 if (error != 0) { 1274 spa_unload(spa); 1275 spa_deactivate(spa); 1276 spa_remove(spa); 1277 mutex_exit(&spa_namespace_lock); 1278 return (error); 1279 } 1280 1281 /* 1282 * Override any spares as specified by the user, as these may have 1283 * correct device names/devids, etc. 1284 */ 1285 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1286 &spares, &nspares) == 0) { 1287 if (spa->spa_sparelist) 1288 VERIFY(nvlist_remove(spa->spa_sparelist, 1289 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1290 else 1291 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1292 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1293 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1294 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1295 spa_config_enter(spa, RW_WRITER, FTAG); 1296 spa_load_spares(spa); 1297 spa_config_exit(spa, FTAG); 1298 spa->spa_sync_spares = B_TRUE; 1299 } 1300 1301 /* 1302 * Update the config cache to include the newly-imported pool. 1303 */ 1304 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1305 1306 mutex_exit(&spa_namespace_lock); 1307 1308 /* 1309 * Resilver anything that's out of date. 1310 */ 1311 if (spa_mode & FWRITE) 1312 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1313 1314 return (0); 1315 } 1316 1317 /* 1318 * This (illegal) pool name is used when temporarily importing a spa_t in order 1319 * to get the vdev stats associated with the imported devices. 1320 */ 1321 #define TRYIMPORT_NAME "$import" 1322 1323 nvlist_t * 1324 spa_tryimport(nvlist_t *tryconfig) 1325 { 1326 nvlist_t *config = NULL; 1327 char *poolname; 1328 spa_t *spa; 1329 uint64_t state; 1330 1331 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1332 return (NULL); 1333 1334 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1335 return (NULL); 1336 1337 /* 1338 * Create and initialize the spa structure. 1339 */ 1340 mutex_enter(&spa_namespace_lock); 1341 spa = spa_add(TRYIMPORT_NAME, NULL); 1342 spa_activate(spa); 1343 1344 /* 1345 * Pass off the heavy lifting to spa_load(). 1346 * Pass TRUE for mosconfig because the user-supplied config 1347 * is actually the one to trust when doing an import. 1348 */ 1349 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1350 1351 /* 1352 * If 'tryconfig' was at least parsable, return the current config. 1353 */ 1354 if (spa->spa_root_vdev != NULL) { 1355 spa_config_enter(spa, RW_READER, FTAG); 1356 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1357 spa_config_exit(spa, FTAG); 1358 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1359 poolname) == 0); 1360 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1361 state) == 0); 1362 1363 /* 1364 * Add the list of hot spares. 1365 */ 1366 spa_add_spares(spa, config); 1367 } 1368 1369 spa_unload(spa); 1370 spa_deactivate(spa); 1371 spa_remove(spa); 1372 mutex_exit(&spa_namespace_lock); 1373 1374 return (config); 1375 } 1376 1377 /* 1378 * Pool export/destroy 1379 * 1380 * The act of destroying or exporting a pool is very simple. We make sure there 1381 * is no more pending I/O and any references to the pool are gone. 
Then, we 1382 * update the pool state and sync all the labels to disk, removing the 1383 * configuration from the cache afterwards. 1384 */ 1385 static int 1386 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1387 { 1388 spa_t *spa; 1389 1390 if (oldconfig) 1391 *oldconfig = NULL; 1392 1393 if (!(spa_mode & FWRITE)) 1394 return (EROFS); 1395 1396 mutex_enter(&spa_namespace_lock); 1397 if ((spa = spa_lookup(pool)) == NULL) { 1398 mutex_exit(&spa_namespace_lock); 1399 return (ENOENT); 1400 } 1401 1402 /* 1403 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1404 * reacquire the namespace lock, and see if we can export. 1405 */ 1406 spa_open_ref(spa, FTAG); 1407 mutex_exit(&spa_namespace_lock); 1408 spa_async_suspend(spa); 1409 mutex_enter(&spa_namespace_lock); 1410 spa_close(spa, FTAG); 1411 1412 /* 1413 * The pool will be in core if it's openable, 1414 * in which case we can modify its state. 1415 */ 1416 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1417 /* 1418 * Objsets may be open only because they're dirty, so we 1419 * have to force it to sync before checking spa_refcnt. 1420 */ 1421 spa_scrub_suspend(spa); 1422 txg_wait_synced(spa->spa_dsl_pool, 0); 1423 1424 /* 1425 * A pool cannot be exported or destroyed if there are active 1426 * references. If we are resetting a pool, allow references by 1427 * fault injection handlers. 1428 */ 1429 if (!spa_refcount_zero(spa) || 1430 (spa->spa_inject_ref != 0 && 1431 new_state != POOL_STATE_UNINITIALIZED)) { 1432 spa_scrub_resume(spa); 1433 spa_async_resume(spa); 1434 mutex_exit(&spa_namespace_lock); 1435 return (EBUSY); 1436 } 1437 1438 spa_scrub_resume(spa); 1439 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1440 1441 /* 1442 * We want this to be reflected on every label, 1443 * so mark them all dirty. spa_unload() will do the 1444 * final sync that pushes these changes out. 1445 */ 1446 if (new_state != POOL_STATE_UNINITIALIZED) { 1447 spa_config_enter(spa, RW_WRITER, FTAG); 1448 spa->spa_state = new_state; 1449 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1450 vdev_config_dirty(spa->spa_root_vdev); 1451 spa_config_exit(spa, FTAG); 1452 } 1453 } 1454 1455 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1456 spa_unload(spa); 1457 spa_deactivate(spa); 1458 } 1459 1460 if (oldconfig && spa->spa_config) 1461 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1462 1463 if (new_state != POOL_STATE_UNINITIALIZED) { 1464 spa_remove(spa); 1465 spa_config_sync(); 1466 } 1467 mutex_exit(&spa_namespace_lock); 1468 1469 return (0); 1470 } 1471 1472 /* 1473 * Destroy a storage pool. 1474 */ 1475 int 1476 spa_destroy(char *pool) 1477 { 1478 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1479 } 1480 1481 /* 1482 * Export a storage pool. 1483 */ 1484 int 1485 spa_export(char *pool, nvlist_t **oldconfig) 1486 { 1487 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1488 } 1489 1490 /* 1491 * Similar to spa_export(), this unloads the spa_t without actually removing it 1492 * from the namespace in any way. 1493 */ 1494 int 1495 spa_reset(char *pool) 1496 { 1497 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1498 } 1499 1500 1501 /* 1502 * ========================================================================== 1503 * Device manipulation 1504 * ========================================================================== 1505 */ 1506 1507 /* 1508 * Add capacity to a storage pool. 
1509 */ 1510 int 1511 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1512 { 1513 uint64_t txg; 1514 int c, error; 1515 vdev_t *rvd = spa->spa_root_vdev; 1516 vdev_t *vd, *tvd; 1517 nvlist_t **spares; 1518 uint_t i, nspares; 1519 1520 txg = spa_vdev_enter(spa); 1521 1522 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1523 VDEV_ALLOC_ADD)) != 0) 1524 return (spa_vdev_exit(spa, NULL, txg, error)); 1525 1526 spa->spa_pending_vdev = vd; 1527 1528 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1529 &spares, &nspares) != 0) 1530 nspares = 0; 1531 1532 if (vd->vdev_children == 0 && nspares == 0) { 1533 spa->spa_pending_vdev = NULL; 1534 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1535 } 1536 1537 if (vd->vdev_children != 0) { 1538 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1539 spa->spa_pending_vdev = NULL; 1540 return (spa_vdev_exit(spa, vd, txg, error)); 1541 } 1542 } 1543 1544 /* 1545 * We must validate the spares after checking the children. Otherwise, 1546 * vdev_inuse() will blindly overwrite the spare. 1547 */ 1548 if ((error = spa_validate_spares(spa, nvroot, txg, 1549 VDEV_ALLOC_ADD)) != 0) { 1550 spa->spa_pending_vdev = NULL; 1551 return (spa_vdev_exit(spa, vd, txg, error)); 1552 } 1553 1554 spa->spa_pending_vdev = NULL; 1555 1556 /* 1557 * Transfer each new top-level vdev from vd to rvd. 1558 */ 1559 for (c = 0; c < vd->vdev_children; c++) { 1560 tvd = vd->vdev_child[c]; 1561 vdev_remove_child(vd, tvd); 1562 tvd->vdev_id = rvd->vdev_children; 1563 vdev_add_child(rvd, tvd); 1564 vdev_config_dirty(tvd); 1565 } 1566 1567 if (nspares != 0) { 1568 if (spa->spa_sparelist != NULL) { 1569 nvlist_t **oldspares; 1570 uint_t oldnspares; 1571 nvlist_t **newspares; 1572 1573 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1574 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1575 1576 newspares = kmem_alloc(sizeof (void *) * 1577 (nspares + oldnspares), KM_SLEEP); 1578 for (i = 0; i < oldnspares; i++) 1579 VERIFY(nvlist_dup(oldspares[i], 1580 &newspares[i], KM_SLEEP) == 0); 1581 for (i = 0; i < nspares; i++) 1582 VERIFY(nvlist_dup(spares[i], 1583 &newspares[i + oldnspares], 1584 KM_SLEEP) == 0); 1585 1586 VERIFY(nvlist_remove(spa->spa_sparelist, 1587 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1588 1589 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1590 ZPOOL_CONFIG_SPARES, newspares, 1591 nspares + oldnspares) == 0); 1592 for (i = 0; i < oldnspares + nspares; i++) 1593 nvlist_free(newspares[i]); 1594 kmem_free(newspares, (oldnspares + nspares) * 1595 sizeof (void *)); 1596 } else { 1597 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1598 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1599 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1600 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1601 } 1602 1603 spa_load_spares(spa); 1604 spa->spa_sync_spares = B_TRUE; 1605 } 1606 1607 /* 1608 * We have to be careful when adding new vdevs to an existing pool. 1609 * If other threads start allocating from these vdevs before we 1610 * sync the config cache, and we lose power, then upon reboot we may 1611 * fail to open the pool because there are DVAs that the config cache 1612 * can't translate. Therefore, we first add the vdevs without 1613 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1614 * and then let spa_config_update() initialize the new metaslabs. 
1615 * 1616 * spa_load() checks for added-but-not-initialized vdevs, so that 1617 * if we lose power at any point in this sequence, the remaining 1618 * steps will be completed the next time we load the pool. 1619 */ 1620 (void) spa_vdev_exit(spa, vd, txg, 0); 1621 1622 mutex_enter(&spa_namespace_lock); 1623 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1624 mutex_exit(&spa_namespace_lock); 1625 1626 return (0); 1627 } 1628 1629 /* 1630 * Attach a device to a mirror. The arguments are the path to any device 1631 * in the mirror, and the nvroot for the new device. If the path specifies 1632 * a device that is not mirrored, we automatically insert the mirror vdev. 1633 * 1634 * If 'replacing' is specified, the new device is intended to replace the 1635 * existing device; in this case the two devices are made into their own 1636 * mirror using the 'replacing' vdev, which is functionally idendical to 1637 * the mirror vdev (it actually reuses all the same ops) but has a few 1638 * extra rules: you can't attach to it after it's been created, and upon 1639 * completion of resilvering, the first disk (the one being replaced) 1640 * is automatically detached. 1641 */ 1642 int 1643 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1644 { 1645 uint64_t txg, open_txg; 1646 int error; 1647 vdev_t *rvd = spa->spa_root_vdev; 1648 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1649 vdev_ops_t *pvops; 1650 1651 txg = spa_vdev_enter(spa); 1652 1653 oldvd = vdev_lookup_by_guid(rvd, guid); 1654 1655 if (oldvd == NULL) 1656 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1657 1658 if (!oldvd->vdev_ops->vdev_op_leaf) 1659 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1660 1661 pvd = oldvd->vdev_parent; 1662 1663 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1664 VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1665 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1666 1667 newvd = newrootvd->vdev_child[0]; 1668 1669 if (!newvd->vdev_ops->vdev_op_leaf) 1670 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1671 1672 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1673 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1674 1675 if (!replacing) { 1676 /* 1677 * For attach, the only allowable parent is a mirror or the root 1678 * vdev. 1679 */ 1680 if (pvd->vdev_ops != &vdev_mirror_ops && 1681 pvd->vdev_ops != &vdev_root_ops) 1682 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1683 1684 pvops = &vdev_mirror_ops; 1685 } else { 1686 /* 1687 * Active hot spares can only be replaced by inactive hot 1688 * spares. 1689 */ 1690 if (pvd->vdev_ops == &vdev_spare_ops && 1691 pvd->vdev_child[1] == oldvd && 1692 !spa_has_spare(spa, newvd->vdev_guid)) 1693 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1694 1695 /* 1696 * If the source is a hot spare, and the parent isn't already a 1697 * spare, then we want to create a new hot spare. Otherwise, we 1698 * want to create a replacing vdev. The user is not allowed to 1699 * attach to a spared vdev child unless the 'isspare' state is 1700 * the same (spare replaces spare, non-spare replaces 1701 * non-spare). 
1702 */ 1703 if (pvd->vdev_ops == &vdev_replacing_ops) 1704 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1705 else if (pvd->vdev_ops == &vdev_spare_ops && 1706 newvd->vdev_isspare != oldvd->vdev_isspare) 1707 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1708 else if (pvd->vdev_ops != &vdev_spare_ops && 1709 newvd->vdev_isspare) 1710 pvops = &vdev_spare_ops; 1711 else 1712 pvops = &vdev_replacing_ops; 1713 } 1714 1715 /* 1716 * Compare the new device size with the replaceable/attachable 1717 * device size. 1718 */ 1719 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1720 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1721 1722 /* 1723 * The new device cannot have a higher alignment requirement 1724 * than the top-level vdev. 1725 */ 1726 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1727 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1728 1729 /* 1730 * If this is an in-place replacement, update oldvd's path and devid 1731 * to make it distinguishable from newvd, and unopenable from now on. 1732 */ 1733 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1734 spa_strfree(oldvd->vdev_path); 1735 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1736 KM_SLEEP); 1737 (void) sprintf(oldvd->vdev_path, "%s/%s", 1738 newvd->vdev_path, "old"); 1739 if (oldvd->vdev_devid != NULL) { 1740 spa_strfree(oldvd->vdev_devid); 1741 oldvd->vdev_devid = NULL; 1742 } 1743 } 1744 1745 /* 1746 * If the parent is not a mirror, or if we're replacing, insert the new 1747 * mirror/replacing/spare vdev above oldvd. 1748 */ 1749 if (pvd->vdev_ops != pvops) 1750 pvd = vdev_add_parent(oldvd, pvops); 1751 1752 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1753 ASSERT(pvd->vdev_ops == pvops); 1754 ASSERT(oldvd->vdev_parent == pvd); 1755 1756 /* 1757 * Extract the new device from its root and add it to pvd. 1758 */ 1759 vdev_remove_child(newrootvd, newvd); 1760 newvd->vdev_id = pvd->vdev_children; 1761 vdev_add_child(pvd, newvd); 1762 1763 /* 1764 * If newvd is smaller than oldvd, but larger than its rsize, 1765 * the addition of newvd may have decreased our parent's asize. 1766 */ 1767 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1768 1769 tvd = newvd->vdev_top; 1770 ASSERT(pvd->vdev_top == tvd); 1771 ASSERT(tvd->vdev_parent == rvd); 1772 1773 vdev_config_dirty(tvd); 1774 1775 /* 1776 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1777 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1778 */ 1779 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1780 1781 mutex_enter(&newvd->vdev_dtl_lock); 1782 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1783 open_txg - TXG_INITIAL + 1); 1784 mutex_exit(&newvd->vdev_dtl_lock); 1785 1786 if (newvd->vdev_isspare) 1787 spa_spare_activate(newvd); 1788 1789 /* 1790 * Mark newvd's DTL dirty in this txg. 1791 */ 1792 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1793 1794 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1795 1796 /* 1797 * Kick off a resilver to update newvd. 1798 */ 1799 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1800 1801 return (0); 1802 } 1803 1804 /* 1805 * Detach a device from a mirror or replacing vdev. 1806 * If 'replace_done' is specified, only detach if the parent 1807 * is a replacing vdev. 
1808 */ 1809 int 1810 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1811 { 1812 uint64_t txg; 1813 int c, t, error; 1814 vdev_t *rvd = spa->spa_root_vdev; 1815 vdev_t *vd, *pvd, *cvd, *tvd; 1816 boolean_t unspare = B_FALSE; 1817 uint64_t unspare_guid; 1818 1819 txg = spa_vdev_enter(spa); 1820 1821 vd = vdev_lookup_by_guid(rvd, guid); 1822 1823 if (vd == NULL) 1824 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1825 1826 if (!vd->vdev_ops->vdev_op_leaf) 1827 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1828 1829 pvd = vd->vdev_parent; 1830 1831 /* 1832 * If replace_done is specified, only remove this device if it's 1833 * the first child of a replacing vdev. For the 'spare' vdev, either 1834 * disk can be removed. 1835 */ 1836 if (replace_done) { 1837 if (pvd->vdev_ops == &vdev_replacing_ops) { 1838 if (vd->vdev_id != 0) 1839 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1840 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1841 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1842 } 1843 } 1844 1845 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1846 spa_version(spa) >= ZFS_VERSION_SPARES); 1847 1848 /* 1849 * Only mirror, replacing, and spare vdevs support detach. 1850 */ 1851 if (pvd->vdev_ops != &vdev_replacing_ops && 1852 pvd->vdev_ops != &vdev_mirror_ops && 1853 pvd->vdev_ops != &vdev_spare_ops) 1854 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1855 1856 /* 1857 * If there's only one replica, you can't detach it. 1858 */ 1859 if (pvd->vdev_children <= 1) 1860 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1861 1862 /* 1863 * If all siblings have non-empty DTLs, this device may have the only 1864 * valid copy of the data, which means we cannot safely detach it. 1865 * 1866 * XXX -- as in the vdev_offline() case, we really want a more 1867 * precise DTL check. 1868 */ 1869 for (c = 0; c < pvd->vdev_children; c++) { 1870 uint64_t dirty; 1871 1872 cvd = pvd->vdev_child[c]; 1873 if (cvd == vd) 1874 continue; 1875 if (vdev_is_dead(cvd)) 1876 continue; 1877 mutex_enter(&cvd->vdev_dtl_lock); 1878 dirty = cvd->vdev_dtl_map.sm_space | 1879 cvd->vdev_dtl_scrub.sm_space; 1880 mutex_exit(&cvd->vdev_dtl_lock); 1881 if (!dirty) 1882 break; 1883 } 1884 1885 /* 1886 * If we are a replacing or spare vdev, then we can always detach the 1887 * latter child, as that is how one cancels the operation. 1888 */ 1889 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1890 c == pvd->vdev_children) 1891 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1892 1893 /* 1894 * If we are detaching the original disk from a spare, then it implies 1895 * that the spare should become a real disk, and be removed from the 1896 * active spare list for the pool. 1897 */ 1898 if (pvd->vdev_ops == &vdev_spare_ops && 1899 vd->vdev_id == 0) 1900 unspare = B_TRUE; 1901 1902 /* 1903 * Erase the disk labels so the disk can be used for other things. 1904 * This must be done after all other error cases are handled, 1905 * but before we disembowel vd (so we can still do I/O to it). 1906 * But if we can't do it, don't treat the error as fatal -- 1907 * it may be that the unwritability of the disk is the reason 1908 * it's being detached! 1909 */ 1910 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1911 1912 /* 1913 * Remove vd from its parent and compact the parent's children. 1914 */ 1915 vdev_remove_child(pvd, vd); 1916 vdev_compact_children(pvd); 1917 1918 /* 1919 * Remember one of the remaining children so we can get tvd below. 
1920 */ 1921 cvd = pvd->vdev_child[0]; 1922 1923 /* 1924 * If we need to remove the remaining child from the list of hot spares, 1925 * do it now, marking the vdev as no longer a spare in the process. We 1926 * must do this before vdev_remove_parent(), because that can change the 1927 * GUID if it creates a new toplevel GUID. 1928 */ 1929 if (unspare) { 1930 ASSERT(cvd->vdev_isspare); 1931 spa_spare_remove(cvd); 1932 unspare_guid = cvd->vdev_guid; 1933 } 1934 1935 /* 1936 * If the parent mirror/replacing vdev only has one child, 1937 * the parent is no longer needed. Remove it from the tree. 1938 */ 1939 if (pvd->vdev_children == 1) 1940 vdev_remove_parent(cvd); 1941 1942 /* 1943 * We don't set tvd until now because the parent we just removed 1944 * may have been the previous top-level vdev. 1945 */ 1946 tvd = cvd->vdev_top; 1947 ASSERT(tvd->vdev_parent == rvd); 1948 1949 /* 1950 * Reevaluate the parent vdev state. 1951 */ 1952 vdev_propagate_state(cvd->vdev_parent); 1953 1954 /* 1955 * If the device we just detached was smaller than the others, it may be 1956 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1957 * can't fail because the existing metaslabs are already in core, so 1958 * there's nothing to read from disk. 1959 */ 1960 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1961 1962 vdev_config_dirty(tvd); 1963 1964 /* 1965 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 1966 * vd->vdev_detached is set and free vd's DTL object in syncing context. 1967 * But first make sure we're not on any *other* txg's DTL list, to 1968 * prevent vd from being accessed after it's freed. 1969 */ 1970 for (t = 0; t < TXG_SIZE; t++) 1971 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1972 vd->vdev_detached = B_TRUE; 1973 vdev_dirty(tvd, VDD_DTL, vd, txg); 1974 1975 error = spa_vdev_exit(spa, vd, txg, 0); 1976 1977 /* 1978 * If this was the removal of the original device in a hot spare vdev, 1979 * then we want to go through and remove the device from the hot spare 1980 * list of every other pool. 1981 */ 1982 if (unspare) { 1983 spa = NULL; 1984 mutex_enter(&spa_namespace_lock); 1985 while ((spa = spa_next(spa)) != NULL) { 1986 if (spa->spa_state != POOL_STATE_ACTIVE) 1987 continue; 1988 1989 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1990 } 1991 mutex_exit(&spa_namespace_lock); 1992 } 1993 1994 return (error); 1995 } 1996 1997 /* 1998 * Remove a device from the pool. Currently, this supports removing only hot 1999 * spares. 2000 */ 2001 int 2002 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2003 { 2004 vdev_t *vd; 2005 nvlist_t **spares, *nv, **newspares; 2006 uint_t i, j, nspares; 2007 int ret = 0; 2008 2009 spa_config_enter(spa, RW_WRITER, FTAG); 2010 2011 vd = spa_lookup_by_guid(spa, guid); 2012 2013 nv = NULL; 2014 if (spa->spa_spares != NULL && 2015 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2016 &spares, &nspares) == 0) { 2017 for (i = 0; i < nspares; i++) { 2018 uint64_t theguid; 2019 2020 VERIFY(nvlist_lookup_uint64(spares[i], 2021 ZPOOL_CONFIG_GUID, &theguid) == 0); 2022 if (theguid == guid) { 2023 nv = spares[i]; 2024 break; 2025 } 2026 } 2027 } 2028 2029 /* 2030 * We only support removing a hot spare, and only if it's not currently 2031 * in use in this pool. 
2032 */ 2033 if (nv == NULL && vd == NULL) { 2034 ret = ENOENT; 2035 goto out; 2036 } 2037 2038 if (nv == NULL && vd != NULL) { 2039 ret = ENOTSUP; 2040 goto out; 2041 } 2042 2043 if (!unspare && nv != NULL && vd != NULL) { 2044 ret = EBUSY; 2045 goto out; 2046 } 2047 2048 if (nspares == 1) { 2049 newspares = NULL; 2050 } else { 2051 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2052 KM_SLEEP); 2053 for (i = 0, j = 0; i < nspares; i++) { 2054 if (spares[i] != nv) 2055 VERIFY(nvlist_dup(spares[i], 2056 &newspares[j++], KM_SLEEP) == 0); 2057 } 2058 } 2059 2060 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2061 DATA_TYPE_NVLIST_ARRAY) == 0); 2062 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2063 newspares, nspares - 1) == 0); 2064 for (i = 0; i < nspares - 1; i++) 2065 nvlist_free(newspares[i]); 2066 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2067 spa_load_spares(spa); 2068 spa->spa_sync_spares = B_TRUE; 2069 2070 out: 2071 spa_config_exit(spa, FTAG); 2072 2073 return (ret); 2074 } 2075 2076 /* 2077 * Find any device that's done replacing, so we can detach it. 2078 */ 2079 static vdev_t * 2080 spa_vdev_replace_done_hunt(vdev_t *vd) 2081 { 2082 vdev_t *newvd, *oldvd; 2083 int c; 2084 2085 for (c = 0; c < vd->vdev_children; c++) { 2086 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2087 if (oldvd != NULL) 2088 return (oldvd); 2089 } 2090 2091 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2092 oldvd = vd->vdev_child[0]; 2093 newvd = vd->vdev_child[1]; 2094 2095 mutex_enter(&newvd->vdev_dtl_lock); 2096 if (newvd->vdev_dtl_map.sm_space == 0 && 2097 newvd->vdev_dtl_scrub.sm_space == 0) { 2098 mutex_exit(&newvd->vdev_dtl_lock); 2099 return (oldvd); 2100 } 2101 mutex_exit(&newvd->vdev_dtl_lock); 2102 } 2103 2104 return (NULL); 2105 } 2106 2107 static void 2108 spa_vdev_replace_done(spa_t *spa) 2109 { 2110 vdev_t *vd; 2111 vdev_t *pvd; 2112 uint64_t guid; 2113 uint64_t pguid = 0; 2114 2115 spa_config_enter(spa, RW_READER, FTAG); 2116 2117 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2118 guid = vd->vdev_guid; 2119 /* 2120 * If we have just finished replacing a hot spared device, then 2121 * we need to detach the parent's first child (the original hot 2122 * spare) as well. 2123 */ 2124 pvd = vd->vdev_parent; 2125 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2126 pvd->vdev_id == 0) { 2127 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2128 ASSERT(pvd->vdev_parent->vdev_children == 2); 2129 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2130 } 2131 spa_config_exit(spa, FTAG); 2132 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2133 return; 2134 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2135 return; 2136 spa_config_enter(spa, RW_READER, FTAG); 2137 } 2138 2139 spa_config_exit(spa, FTAG); 2140 } 2141 2142 /* 2143 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2144 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2145 */ 2146 int 2147 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2148 { 2149 vdev_t *rvd, *vd; 2150 uint64_t txg; 2151 2152 rvd = spa->spa_root_vdev; 2153 2154 txg = spa_vdev_enter(spa); 2155 2156 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2157 /* 2158 * Determine if this is a reference to a hot spare. In that 2159 * case, update the path as stored in the spare list. 
2160 */ 2161 nvlist_t **spares; 2162 uint_t i, nspares; 2163 if (spa->spa_sparelist != NULL) { 2164 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2165 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2166 for (i = 0; i < nspares; i++) { 2167 uint64_t theguid; 2168 VERIFY(nvlist_lookup_uint64(spares[i], 2169 ZPOOL_CONFIG_GUID, &theguid) == 0); 2170 if (theguid == guid) 2171 break; 2172 } 2173 2174 if (i == nspares) 2175 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2176 2177 VERIFY(nvlist_add_string(spares[i], 2178 ZPOOL_CONFIG_PATH, newpath) == 0); 2179 spa_load_spares(spa); 2180 spa->spa_sync_spares = B_TRUE; 2181 return (spa_vdev_exit(spa, NULL, txg, 0)); 2182 } else { 2183 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2184 } 2185 } 2186 2187 if (!vd->vdev_ops->vdev_op_leaf) 2188 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2189 2190 spa_strfree(vd->vdev_path); 2191 vd->vdev_path = spa_strdup(newpath); 2192 2193 vdev_config_dirty(vd->vdev_top); 2194 2195 return (spa_vdev_exit(spa, NULL, txg, 0)); 2196 } 2197 2198 /* 2199 * ========================================================================== 2200 * SPA Scrubbing 2201 * ========================================================================== 2202 */ 2203 2204 static void 2205 spa_scrub_io_done(zio_t *zio) 2206 { 2207 spa_t *spa = zio->io_spa; 2208 2209 zio_data_buf_free(zio->io_data, zio->io_size); 2210 2211 mutex_enter(&spa->spa_scrub_lock); 2212 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2213 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2214 spa->spa_scrub_errors++; 2215 mutex_enter(&vd->vdev_stat_lock); 2216 vd->vdev_stat.vs_scrub_errors++; 2217 mutex_exit(&vd->vdev_stat_lock); 2218 } 2219 2220 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2221 cv_broadcast(&spa->spa_scrub_io_cv); 2222 2223 ASSERT(spa->spa_scrub_inflight >= 0); 2224 2225 mutex_exit(&spa->spa_scrub_lock); 2226 } 2227 2228 static void 2229 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2230 zbookmark_t *zb) 2231 { 2232 size_t size = BP_GET_LSIZE(bp); 2233 void *data; 2234 2235 mutex_enter(&spa->spa_scrub_lock); 2236 /* 2237 * Do not give too much work to vdev(s). 2238 */ 2239 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2240 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2241 } 2242 spa->spa_scrub_inflight++; 2243 mutex_exit(&spa->spa_scrub_lock); 2244 2245 data = zio_data_buf_alloc(size); 2246 2247 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2248 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2249 2250 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2251 2252 zio_nowait(zio_read(NULL, spa, bp, data, size, 2253 spa_scrub_io_done, NULL, priority, flags, zb)); 2254 } 2255 2256 /* ARGSUSED */ 2257 static int 2258 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2259 { 2260 blkptr_t *bp = &bc->bc_blkptr; 2261 vdev_t *vd = spa->spa_root_vdev; 2262 dva_t *dva = bp->blk_dva; 2263 int needs_resilver = B_FALSE; 2264 int d; 2265 2266 if (bc->bc_errno) { 2267 /* 2268 * We can't scrub this block, but we can continue to scrub 2269 * the rest of the pool. Note the error and move along. 
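 *
 * (Editor's note, added for clarity.)  vd still refers to the root
 * vdev at this point, so the error below is charged pool-wide; when an
 * individual read issued by spa_scrub_io_start() fails, the per-device
 * counter is updated in spa_scrub_io_done() instead.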
2270 */ 2271 mutex_enter(&spa->spa_scrub_lock); 2272 spa->spa_scrub_errors++; 2273 mutex_exit(&spa->spa_scrub_lock); 2274 2275 mutex_enter(&vd->vdev_stat_lock); 2276 vd->vdev_stat.vs_scrub_errors++; 2277 mutex_exit(&vd->vdev_stat_lock); 2278 2279 return (ERESTART); 2280 } 2281 2282 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2283 2284 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2285 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2286 2287 ASSERT(vd != NULL); 2288 2289 /* 2290 * Keep track of how much data we've examined so that 2291 * zpool(1M) status can make useful progress reports. 2292 */ 2293 mutex_enter(&vd->vdev_stat_lock); 2294 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2295 mutex_exit(&vd->vdev_stat_lock); 2296 2297 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2298 if (DVA_GET_GANG(&dva[d])) { 2299 /* 2300 * Gang members may be spread across multiple 2301 * vdevs, so the best we can do is look at the 2302 * pool-wide DTL. 2303 * XXX -- it would be better to change our 2304 * allocation policy to ensure that this can't 2305 * happen. 2306 */ 2307 vd = spa->spa_root_vdev; 2308 } 2309 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2310 bp->blk_birth, 1)) 2311 needs_resilver = B_TRUE; 2312 } 2313 } 2314 2315 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2316 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2317 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2318 else if (needs_resilver) 2319 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2320 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2321 2322 return (0); 2323 } 2324 2325 static void 2326 spa_scrub_thread(spa_t *spa) 2327 { 2328 callb_cpr_t cprinfo; 2329 traverse_handle_t *th = spa->spa_scrub_th; 2330 vdev_t *rvd = spa->spa_root_vdev; 2331 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2332 int error = 0; 2333 boolean_t complete; 2334 2335 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2336 2337 /* 2338 * If we're restarting due to a snapshot create/delete, 2339 * wait for that to complete. 2340 */ 2341 txg_wait_synced(spa_get_dsl(spa), 0); 2342 2343 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2344 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2345 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2346 2347 spa_config_enter(spa, RW_WRITER, FTAG); 2348 vdev_reopen(rvd); /* purge all vdev caches */ 2349 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2350 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2351 spa_config_exit(spa, FTAG); 2352 2353 mutex_enter(&spa->spa_scrub_lock); 2354 spa->spa_scrub_errors = 0; 2355 spa->spa_scrub_active = 1; 2356 ASSERT(spa->spa_scrub_inflight == 0); 2357 2358 while (!spa->spa_scrub_stop) { 2359 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2360 while (spa->spa_scrub_suspended) { 2361 spa->spa_scrub_active = 0; 2362 cv_broadcast(&spa->spa_scrub_cv); 2363 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2364 spa->spa_scrub_active = 1; 2365 } 2366 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2367 2368 if (spa->spa_scrub_restart_txg != 0) 2369 break; 2370 2371 mutex_exit(&spa->spa_scrub_lock); 2372 error = traverse_more(th); 2373 mutex_enter(&spa->spa_scrub_lock); 2374 if (error != EAGAIN) 2375 break; 2376 } 2377 2378 while (spa->spa_scrub_inflight) 2379 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2380 2381 spa->spa_scrub_active = 0; 2382 cv_broadcast(&spa->spa_scrub_cv); 2383 2384 mutex_exit(&spa->spa_scrub_lock); 2385 2386 spa_config_enter(spa, RW_WRITER, FTAG); 2387 2388 mutex_enter(&spa->spa_scrub_lock); 2389 2390 /* 2391 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2392 * AND the spa config lock to synchronize with any config changes 2393 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2394 */ 2395 if (spa->spa_scrub_restart_txg != 0) 2396 error = ERESTART; 2397 2398 if (spa->spa_scrub_stop) 2399 error = EINTR; 2400 2401 /* 2402 * Even if there were uncorrectable errors, we consider the scrub 2403 * completed. The downside is that if there is a transient error during 2404 * a resilver, we won't resilver the data properly to the target. But 2405 * if the damage is permanent (more likely) we will resilver forever, 2406 * which isn't really acceptable. Since there is enough information for 2407 * the user to know what has failed and why, this seems like a more 2408 * tractable approach. 2409 */ 2410 complete = (error == 0); 2411 2412 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2413 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2414 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2415 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2416 2417 mutex_exit(&spa->spa_scrub_lock); 2418 2419 /* 2420 * If the scrub/resilver completed, update all DTLs to reflect this. 2421 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2422 */ 2423 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2424 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2425 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2426 spa_errlog_rotate(spa); 2427 2428 spa_config_exit(spa, FTAG); 2429 2430 mutex_enter(&spa->spa_scrub_lock); 2431 2432 /* 2433 * We may have finished replacing a device. 2434 * Let the async thread assess this and handle the detach. 2435 */ 2436 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2437 2438 /* 2439 * If we were told to restart, our final act is to start a new scrub. 2440 */ 2441 if (error == ERESTART) 2442 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2443 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2444 2445 spa->spa_scrub_type = POOL_SCRUB_NONE; 2446 spa->spa_scrub_active = 0; 2447 spa->spa_scrub_thread = NULL; 2448 cv_broadcast(&spa->spa_scrub_cv); 2449 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2450 thread_exit(); 2451 } 2452 2453 void 2454 spa_scrub_suspend(spa_t *spa) 2455 { 2456 mutex_enter(&spa->spa_scrub_lock); 2457 spa->spa_scrub_suspended++; 2458 while (spa->spa_scrub_active) { 2459 cv_broadcast(&spa->spa_scrub_cv); 2460 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2461 } 2462 while (spa->spa_scrub_inflight) 2463 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2464 mutex_exit(&spa->spa_scrub_lock); 2465 } 2466 2467 void 2468 spa_scrub_resume(spa_t *spa) 2469 { 2470 mutex_enter(&spa->spa_scrub_lock); 2471 ASSERT(spa->spa_scrub_suspended != 0); 2472 if (--spa->spa_scrub_suspended == 0) 2473 cv_broadcast(&spa->spa_scrub_cv); 2474 mutex_exit(&spa->spa_scrub_lock); 2475 } 2476 2477 void 2478 spa_scrub_restart(spa_t *spa, uint64_t txg) 2479 { 2480 /* 2481 * Something happened (e.g. snapshot create/delete) that means 2482 * we must restart any in-progress scrubs. The itinerary will 2483 * fix this properly. 2484 */ 2485 mutex_enter(&spa->spa_scrub_lock); 2486 spa->spa_scrub_restart_txg = txg; 2487 mutex_exit(&spa->spa_scrub_lock); 2488 } 2489 2490 int 2491 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2492 { 2493 space_seg_t *ss; 2494 uint64_t mintxg, maxtxg; 2495 vdev_t *rvd = spa->spa_root_vdev; 2496 2497 if ((uint_t)type >= POOL_SCRUB_TYPES) 2498 return (ENOTSUP); 2499 2500 mutex_enter(&spa->spa_scrub_lock); 2501 2502 /* 2503 * If there's a scrub or resilver already in progress, stop it. 2504 */ 2505 while (spa->spa_scrub_thread != NULL) { 2506 /* 2507 * Don't stop a resilver unless forced. 2508 */ 2509 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2510 mutex_exit(&spa->spa_scrub_lock); 2511 return (EBUSY); 2512 } 2513 spa->spa_scrub_stop = 1; 2514 cv_broadcast(&spa->spa_scrub_cv); 2515 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2516 } 2517 2518 /* 2519 * Terminate the previous traverse. 2520 */ 2521 if (spa->spa_scrub_th != NULL) { 2522 traverse_fini(spa->spa_scrub_th); 2523 spa->spa_scrub_th = NULL; 2524 } 2525 2526 if (rvd == NULL) { 2527 ASSERT(spa->spa_scrub_stop == 0); 2528 ASSERT(spa->spa_scrub_type == type); 2529 ASSERT(spa->spa_scrub_restart_txg == 0); 2530 mutex_exit(&spa->spa_scrub_lock); 2531 return (0); 2532 } 2533 2534 mintxg = TXG_INITIAL - 1; 2535 maxtxg = spa_last_synced_txg(spa) + 1; 2536 2537 mutex_enter(&rvd->vdev_dtl_lock); 2538 2539 if (rvd->vdev_dtl_map.sm_space == 0) { 2540 /* 2541 * The pool-wide DTL is empty. 2542 * If this is a resilver, there's nothing to do except 2543 * check whether any in-progress replacements have completed. 2544 */ 2545 if (type == POOL_SCRUB_RESILVER) { 2546 type = POOL_SCRUB_NONE; 2547 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2548 } 2549 } else { 2550 /* 2551 * The pool-wide DTL is non-empty. 2552 * If this is a normal scrub, upgrade to a resilver instead. 2553 */ 2554 if (type == POOL_SCRUB_EVERYTHING) 2555 type = POOL_SCRUB_RESILVER; 2556 } 2557 2558 if (type == POOL_SCRUB_RESILVER) { 2559 /* 2560 * Determine the resilvering boundaries. 2561 * 2562 * Note: (mintxg, maxtxg) is an open interval, 2563 * i.e. mintxg and maxtxg themselves are not included. 2564 * 2565 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2566 * so we don't claim to resilver a txg that's still changing. 
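 *
 * (Editor's worked example, not in the original source.)  If the first
 * DTL segment starts at txg 100 and the last one ends at txg 104, then
 * mintxg = 99 and maxtxg = MIN(104, spa_last_synced_txg(spa) + 1), and
 * because the interval is open, txg 100 is the first txg actually
 * resilvered and maxtxg itself is excluded.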
2567 */ 2568 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2569 mintxg = ss->ss_start - 1; 2570 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2571 maxtxg = MIN(ss->ss_end, maxtxg); 2572 } 2573 2574 mutex_exit(&rvd->vdev_dtl_lock); 2575 2576 spa->spa_scrub_stop = 0; 2577 spa->spa_scrub_type = type; 2578 spa->spa_scrub_restart_txg = 0; 2579 2580 if (type != POOL_SCRUB_NONE) { 2581 spa->spa_scrub_mintxg = mintxg; 2582 spa->spa_scrub_maxtxg = maxtxg; 2583 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2584 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2585 ZIO_FLAG_CANFAIL); 2586 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2587 spa->spa_scrub_thread = thread_create(NULL, 0, 2588 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2589 } 2590 2591 mutex_exit(&spa->spa_scrub_lock); 2592 2593 return (0); 2594 } 2595 2596 /* 2597 * ========================================================================== 2598 * SPA async task processing 2599 * ========================================================================== 2600 */ 2601 2602 static void 2603 spa_async_reopen(spa_t *spa) 2604 { 2605 vdev_t *rvd = spa->spa_root_vdev; 2606 vdev_t *tvd; 2607 int c; 2608 2609 spa_config_enter(spa, RW_WRITER, FTAG); 2610 2611 for (c = 0; c < rvd->vdev_children; c++) { 2612 tvd = rvd->vdev_child[c]; 2613 if (tvd->vdev_reopen_wanted) { 2614 tvd->vdev_reopen_wanted = 0; 2615 vdev_reopen(tvd); 2616 } 2617 } 2618 2619 spa_config_exit(spa, FTAG); 2620 } 2621 2622 static void 2623 spa_async_thread(spa_t *spa) 2624 { 2625 int tasks; 2626 2627 ASSERT(spa->spa_sync_on); 2628 2629 mutex_enter(&spa->spa_async_lock); 2630 tasks = spa->spa_async_tasks; 2631 spa->spa_async_tasks = 0; 2632 mutex_exit(&spa->spa_async_lock); 2633 2634 /* 2635 * See if the config needs to be updated. 2636 */ 2637 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2638 mutex_enter(&spa_namespace_lock); 2639 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2640 mutex_exit(&spa_namespace_lock); 2641 } 2642 2643 /* 2644 * See if any devices need to be reopened. 2645 */ 2646 if (tasks & SPA_ASYNC_REOPEN) 2647 spa_async_reopen(spa); 2648 2649 /* 2650 * If any devices are done replacing, detach them. 2651 */ 2652 if (tasks & SPA_ASYNC_REPLACE_DONE) 2653 spa_vdev_replace_done(spa); 2654 2655 /* 2656 * Kick off a scrub. 2657 */ 2658 if (tasks & SPA_ASYNC_SCRUB) 2659 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2660 2661 /* 2662 * Kick off a resilver. 2663 */ 2664 if (tasks & SPA_ASYNC_RESILVER) 2665 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2666 2667 /* 2668 * Let the world know that we're done. 
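 *
 * (Editor's note, added for clarity.)  "Done" here means clearing
 * spa_async_thread and broadcasting spa_async_cv, which is exactly what
 * spa_async_suspend() below waits for before returning.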
2669 */ 2670 mutex_enter(&spa->spa_async_lock); 2671 spa->spa_async_thread = NULL; 2672 cv_broadcast(&spa->spa_async_cv); 2673 mutex_exit(&spa->spa_async_lock); 2674 thread_exit(); 2675 } 2676 2677 void 2678 spa_async_suspend(spa_t *spa) 2679 { 2680 mutex_enter(&spa->spa_async_lock); 2681 spa->spa_async_suspended++; 2682 while (spa->spa_async_thread != NULL) 2683 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2684 mutex_exit(&spa->spa_async_lock); 2685 } 2686 2687 void 2688 spa_async_resume(spa_t *spa) 2689 { 2690 mutex_enter(&spa->spa_async_lock); 2691 ASSERT(spa->spa_async_suspended != 0); 2692 spa->spa_async_suspended--; 2693 mutex_exit(&spa->spa_async_lock); 2694 } 2695 2696 static void 2697 spa_async_dispatch(spa_t *spa) 2698 { 2699 mutex_enter(&spa->spa_async_lock); 2700 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2701 spa->spa_async_thread == NULL && 2702 rootdir != NULL && !vn_is_readonly(rootdir)) 2703 spa->spa_async_thread = thread_create(NULL, 0, 2704 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2705 mutex_exit(&spa->spa_async_lock); 2706 } 2707 2708 void 2709 spa_async_request(spa_t *spa, int task) 2710 { 2711 mutex_enter(&spa->spa_async_lock); 2712 spa->spa_async_tasks |= task; 2713 mutex_exit(&spa->spa_async_lock); 2714 } 2715 2716 /* 2717 * ========================================================================== 2718 * SPA syncing routines 2719 * ========================================================================== 2720 */ 2721 2722 static void 2723 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2724 { 2725 bplist_t *bpl = &spa->spa_sync_bplist; 2726 dmu_tx_t *tx; 2727 blkptr_t blk; 2728 uint64_t itor = 0; 2729 zio_t *zio; 2730 int error; 2731 uint8_t c = 1; 2732 2733 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2734 2735 while (bplist_iterate(bpl, &itor, &blk) == 0) 2736 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2737 2738 error = zio_wait(zio); 2739 ASSERT3U(error, ==, 0); 2740 2741 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2742 bplist_vacate(bpl, tx); 2743 2744 /* 2745 * Pre-dirty the first block so we sync to convergence faster. 2746 * (Usually only the first block is needed.) 2747 */ 2748 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2749 dmu_tx_commit(tx); 2750 } 2751 2752 static void 2753 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2754 { 2755 char *packed = NULL; 2756 size_t nvsize = 0; 2757 dmu_buf_t *db; 2758 2759 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2760 2761 packed = kmem_alloc(nvsize, KM_SLEEP); 2762 2763 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2764 KM_SLEEP) == 0); 2765 2766 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2767 2768 kmem_free(packed, nvsize); 2769 2770 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2771 dmu_buf_will_dirty(db, tx); 2772 *(uint64_t *)db->db_data = nvsize; 2773 dmu_buf_rele(db, FTAG); 2774 } 2775 2776 static void 2777 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2778 { 2779 nvlist_t *nvroot; 2780 nvlist_t **spares; 2781 int i; 2782 2783 if (!spa->spa_sync_spares) 2784 return; 2785 2786 /* 2787 * Update the MOS nvlist describing the list of available spares. 2788 * spa_validate_spares() will have already made sure this nvlist is 2789 * valid and the vdevs are labelled appropriately. 
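 *
 * (Editor's note, added for clarity.)  On disk this is a
 * DMU_OT_PACKED_NVLIST object, referenced from the pool directory ZAP
 * under DMU_POOL_SPARES, whose unpacked contents look roughly like
 *
 *	ZPOOL_CONFIG_SPARES = [ <vdev config>, <vdev config>, ... ]
 *
 * with one vdev_config_generate() entry per spare and the packed size
 * recorded in the object's bonus buffer by spa_sync_nvlist() above.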
2790 */ 2791 if (spa->spa_spares_object == 0) { 2792 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2793 DMU_OT_PACKED_NVLIST, 1 << 14, 2794 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2795 VERIFY(zap_update(spa->spa_meta_objset, 2796 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2797 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2798 } 2799 2800 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2801 if (spa->spa_nspares == 0) { 2802 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2803 NULL, 0) == 0); 2804 } else { 2805 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2806 KM_SLEEP); 2807 for (i = 0; i < spa->spa_nspares; i++) 2808 spares[i] = vdev_config_generate(spa, 2809 spa->spa_spares[i], B_FALSE, B_TRUE); 2810 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2811 spares, spa->spa_nspares) == 0); 2812 for (i = 0; i < spa->spa_nspares; i++) 2813 nvlist_free(spares[i]); 2814 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2815 } 2816 2817 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2818 nvlist_free(nvroot); 2819 2820 spa->spa_sync_spares = B_FALSE; 2821 } 2822 2823 static void 2824 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2825 { 2826 nvlist_t *config; 2827 2828 if (list_is_empty(&spa->spa_dirty_list)) 2829 return; 2830 2831 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2832 2833 if (spa->spa_config_syncing) 2834 nvlist_free(spa->spa_config_syncing); 2835 spa->spa_config_syncing = config; 2836 2837 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2838 } 2839 2840 static void 2841 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2842 { 2843 spa_t *spa = arg1; 2844 nvlist_t *nvp = arg2; 2845 nvpair_t *nvpair; 2846 objset_t *mos = spa->spa_meta_objset; 2847 uint64_t zapobj; 2848 2849 mutex_enter(&spa->spa_props_lock); 2850 if (spa->spa_pool_props_object == 0) { 2851 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2852 VERIFY(zapobj > 0); 2853 2854 spa->spa_pool_props_object = zapobj; 2855 2856 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2857 DMU_POOL_PROPS, 8, 1, 2858 &spa->spa_pool_props_object, tx) == 0); 2859 } 2860 mutex_exit(&spa->spa_props_lock); 2861 2862 nvpair = NULL; 2863 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2864 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2865 case ZFS_PROP_BOOTFS: 2866 VERIFY(nvlist_lookup_uint64(nvp, 2867 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2868 VERIFY(zap_update(mos, 2869 spa->spa_pool_props_object, 2870 zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 2871 &spa->spa_bootfs, tx) == 0); 2872 break; 2873 } 2874 } 2875 } 2876 2877 /* 2878 * Sync the specified transaction group. New blocks may be dirtied as 2879 * part of the process, so we iterate until it converges. 2880 */ 2881 void 2882 spa_sync(spa_t *spa, uint64_t txg) 2883 { 2884 dsl_pool_t *dp = spa->spa_dsl_pool; 2885 objset_t *mos = spa->spa_meta_objset; 2886 bplist_t *bpl = &spa->spa_sync_bplist; 2887 vdev_t *rvd = spa->spa_root_vdev; 2888 vdev_t *vd; 2889 dmu_tx_t *tx; 2890 int dirty_vdevs; 2891 2892 /* 2893 * Lock out configuration changes. 2894 */ 2895 spa_config_enter(spa, RW_READER, FTAG); 2896 2897 spa->spa_syncing_txg = txg; 2898 spa->spa_sync_pass = 0; 2899 2900 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2901 2902 tx = dmu_tx_create_assigned(dp, txg); 2903 2904 /* 2905 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2906 * set spa_deflate if we have no raid-z vdevs. 
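 *
 * (Editor's note, added for clarity.)  spa_ubsync holds the uberblock
 * of the last txg that made it to disk, while spa_uberblock is the one
 * being synced now; comparing their ub_version fields is how the code
 * detects the txg in which an upgrade (e.g. via spa_upgrade()) first
 * crosses the ZFS_VERSION_RAIDZ_DEFLATE boundary.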
2907 */ 2908 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2909 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2910 int i; 2911 2912 for (i = 0; i < rvd->vdev_children; i++) { 2913 vd = rvd->vdev_child[i]; 2914 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2915 break; 2916 } 2917 if (i == rvd->vdev_children) { 2918 spa->spa_deflate = TRUE; 2919 VERIFY(0 == zap_add(spa->spa_meta_objset, 2920 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2921 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2922 } 2923 } 2924 2925 /* 2926 * If anything has changed in this txg, push the deferred frees 2927 * from the previous txg. If not, leave them alone so that we 2928 * don't generate work on an otherwise idle system. 2929 */ 2930 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2931 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2932 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2933 spa_sync_deferred_frees(spa, txg); 2934 2935 /* 2936 * Iterate to convergence. 2937 */ 2938 do { 2939 spa->spa_sync_pass++; 2940 2941 spa_sync_config_object(spa, tx); 2942 spa_sync_spares(spa, tx); 2943 spa_errlog_sync(spa, txg); 2944 dsl_pool_sync(dp, txg); 2945 2946 dirty_vdevs = 0; 2947 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2948 vdev_sync(vd, txg); 2949 dirty_vdevs++; 2950 } 2951 2952 bplist_sync(bpl, tx); 2953 } while (dirty_vdevs); 2954 2955 bplist_close(bpl); 2956 2957 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2958 2959 /* 2960 * Rewrite the vdev configuration (which includes the uberblock) 2961 * to commit the transaction group. 2962 * 2963 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2964 * Otherwise, pick a random top-level vdev that's known to be 2965 * visible in the config cache (see spa_vdev_add() for details). 2966 * If the write fails, try the next vdev until we're tried them all. 2967 */ 2968 if (!list_is_empty(&spa->spa_dirty_list)) { 2969 VERIFY(vdev_config_sync(rvd, txg) == 0); 2970 } else { 2971 int children = rvd->vdev_children; 2972 int c0 = spa_get_random(children); 2973 int c; 2974 2975 for (c = 0; c < children; c++) { 2976 vd = rvd->vdev_child[(c0 + c) % children]; 2977 if (vd->vdev_ms_array == 0) 2978 continue; 2979 if (vdev_config_sync(vd, txg) == 0) 2980 break; 2981 } 2982 if (c == children) 2983 VERIFY(vdev_config_sync(rvd, txg) == 0); 2984 } 2985 2986 dmu_tx_commit(tx); 2987 2988 /* 2989 * Clear the dirty config list. 2990 */ 2991 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2992 vdev_config_clean(vd); 2993 2994 /* 2995 * Now that the new config has synced transactionally, 2996 * let it become visible to the config cache. 2997 */ 2998 if (spa->spa_config_syncing != NULL) { 2999 spa_config_set(spa, spa->spa_config_syncing); 3000 spa->spa_config_txg = txg; 3001 spa->spa_config_syncing = NULL; 3002 } 3003 3004 /* 3005 * Make a stable copy of the fully synced uberblock. 3006 * We use this as the root for pool traversals. 3007 */ 3008 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3009 3010 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3011 3012 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3013 spa->spa_traverse_wanted = 0; 3014 spa->spa_ubsync = spa->spa_uberblock; 3015 rw_exit(&spa->spa_traverse_lock); 3016 3017 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3018 3019 /* 3020 * Clean up the ZIL records for the synced txg. 3021 */ 3022 dsl_pool_zil_clean(dp); 3023 3024 /* 3025 * Update usable space statistics. 
3026 */ 3027 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3028 vdev_sync_done(vd, txg); 3029 3030 /* 3031 * It had better be the case that we didn't dirty anything 3032 * since vdev_config_sync(). 3033 */ 3034 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3035 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3036 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3037 ASSERT(bpl->bpl_queue == NULL); 3038 3039 spa_config_exit(spa, FTAG); 3040 3041 /* 3042 * If any async tasks have been requested, kick them off. 3043 */ 3044 spa_async_dispatch(spa); 3045 } 3046 3047 /* 3048 * Sync all pools. We don't want to hold the namespace lock across these 3049 * operations, so we take a reference on the spa_t and drop the lock during the 3050 * sync. 3051 */ 3052 void 3053 spa_sync_allpools(void) 3054 { 3055 spa_t *spa = NULL; 3056 mutex_enter(&spa_namespace_lock); 3057 while ((spa = spa_next(spa)) != NULL) { 3058 if (spa_state(spa) != POOL_STATE_ACTIVE) 3059 continue; 3060 spa_open_ref(spa, FTAG); 3061 mutex_exit(&spa_namespace_lock); 3062 txg_wait_synced(spa_get_dsl(spa), 0); 3063 mutex_enter(&spa_namespace_lock); 3064 spa_close(spa, FTAG); 3065 } 3066 mutex_exit(&spa_namespace_lock); 3067 } 3068 3069 /* 3070 * ========================================================================== 3071 * Miscellaneous routines 3072 * ========================================================================== 3073 */ 3074 3075 /* 3076 * Remove all pools in the system. 3077 */ 3078 void 3079 spa_evict_all(void) 3080 { 3081 spa_t *spa; 3082 3083 /* 3084 * Remove all cached state. All pools should be closed now, 3085 * so every spa in the AVL tree should be unreferenced. 3086 */ 3087 mutex_enter(&spa_namespace_lock); 3088 while ((spa = spa_next(NULL)) != NULL) { 3089 /* 3090 * Stop async tasks. The async thread may need to detach 3091 * a device that's been replaced, which requires grabbing 3092 * spa_namespace_lock, so we must drop it here. 3093 */ 3094 spa_open_ref(spa, FTAG); 3095 mutex_exit(&spa_namespace_lock); 3096 spa_async_suspend(spa); 3097 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3098 mutex_enter(&spa_namespace_lock); 3099 spa_close(spa, FTAG); 3100 3101 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3102 spa_unload(spa); 3103 spa_deactivate(spa); 3104 } 3105 spa_remove(spa); 3106 } 3107 mutex_exit(&spa_namespace_lock); 3108 } 3109 3110 vdev_t * 3111 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3112 { 3113 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3114 } 3115 3116 void 3117 spa_upgrade(spa_t *spa) 3118 { 3119 spa_config_enter(spa, RW_WRITER, FTAG); 3120 3121 /* 3122 * This should only be called for a non-faulted pool, and since a 3123 * future version would result in an unopenable pool, this shouldn't be 3124 * possible. 
3125 */ 3126 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3127 3128 spa->spa_uberblock.ub_version = ZFS_VERSION; 3129 vdev_config_dirty(spa->spa_root_vdev); 3130 3131 spa_config_exit(spa, FTAG); 3132 3133 txg_wait_synced(spa_get_dsl(spa), 0); 3134 } 3135 3136 boolean_t 3137 spa_has_spare(spa_t *spa, uint64_t guid) 3138 { 3139 int i; 3140 uint64_t spareguid; 3141 3142 for (i = 0; i < spa->spa_nspares; i++) 3143 if (spa->spa_spares[i]->vdev_guid == guid) 3144 return (B_TRUE); 3145 3146 for (i = 0; i < spa->spa_pending_nspares; i++) { 3147 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3148 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3149 spareguid == guid) 3150 return (B_TRUE); 3151 } 3152 3153 return (B_FALSE); 3154 } 3155 3156 int 3157 spa_set_props(spa_t *spa, nvlist_t *nvp) 3158 { 3159 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3160 spa, nvp, 3)); 3161 } 3162 3163 int 3164 spa_get_props(spa_t *spa, nvlist_t **nvp) 3165 { 3166 zap_cursor_t zc; 3167 zap_attribute_t za; 3168 objset_t *mos = spa->spa_meta_objset; 3169 zfs_source_t src; 3170 zfs_prop_t prop; 3171 nvlist_t *propval; 3172 uint64_t value; 3173 int err; 3174 3175 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3176 3177 mutex_enter(&spa->spa_props_lock); 3178 /* If no props object, then just return empty nvlist */ 3179 if (spa->spa_pool_props_object == 0) { 3180 mutex_exit(&spa->spa_props_lock); 3181 return (0); 3182 } 3183 3184 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3185 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3186 zap_cursor_advance(&zc)) { 3187 3188 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3189 continue; 3190 3191 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3192 switch (za.za_integer_length) { 3193 case 8: 3194 if (zfs_prop_default_numeric(prop) == 3195 za.za_first_integer) 3196 src = ZFS_SRC_DEFAULT; 3197 else 3198 src = ZFS_SRC_LOCAL; 3199 value = za.za_first_integer; 3200 3201 if (prop == ZFS_PROP_BOOTFS) { 3202 dsl_pool_t *dp; 3203 dsl_dataset_t *ds = NULL; 3204 char strval[MAXPATHLEN]; 3205 3206 dp = spa_get_dsl(spa); 3207 rw_enter(&dp->dp_config_rwlock, RW_READER); 3208 if ((err = dsl_dataset_open_obj(dp, 3209 za.za_first_integer, NULL, DS_MODE_NONE, 3210 FTAG, &ds)) != 0) { 3211 rw_exit(&dp->dp_config_rwlock); 3212 break; 3213 } 3214 dsl_dataset_name(ds, strval); 3215 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3216 rw_exit(&dp->dp_config_rwlock); 3217 3218 VERIFY(nvlist_add_uint64(propval, 3219 ZFS_PROP_SOURCE, src) == 0); 3220 VERIFY(nvlist_add_string(propval, 3221 ZFS_PROP_VALUE, strval) == 0); 3222 } else { 3223 VERIFY(nvlist_add_uint64(propval, 3224 ZFS_PROP_SOURCE, src) == 0); 3225 VERIFY(nvlist_add_uint64(propval, 3226 ZFS_PROP_VALUE, value) == 0); 3227 } 3228 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3229 propval) == 0); 3230 break; 3231 } 3232 nvlist_free(propval); 3233 } 3234 zap_cursor_fini(&zc); 3235 mutex_exit(&spa->spa_props_lock); 3236 if (err && err != ENOENT) { 3237 nvlist_free(*nvp); 3238 return (err); 3239 } 3240 3241 return (0); 3242 } 3243 3244 /* 3245 * If the bootfs property value is dsobj, clear it. 3246 */ 3247 void 3248 spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3249 { 3250 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3251 VERIFY(zap_remove(spa->spa_meta_objset, 3252 spa->spa_pool_props_object, 3253 zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3254 spa->spa_bootfs = 0; 3255 } 3256 } 3257
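
/*
 * Editor's usage sketch (not part of the original file): a minimal caller of
 * the scrubbing interface above, similar in shape to what a pool-scrub ioctl
 * handler would do.  The wrapper name is hypothetical and the function is
 * shown here for illustration only.
 *
 *	static int
 *	example_start_scrub(const char *pool, pool_scrub_type_t type)
 *	{
 *		spa_t *spa;
 *		int error;
 *
 *		if ((error = spa_open(pool, &spa, FTAG)) != 0)
 *			return (error);
 *
 *		error = spa_scrub(spa, type, B_FALSE);
 *
 *		spa_close(spa, FTAG);
 *		return (error);
 *	}
 */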