/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

int zio_taskq_threads = 8;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
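 * Activation sets up the in-core state the pool needs while it is open:
 * the normal metaslab class, one issue and one interrupt taskq per ZIO
 * type, the pool locks, the dirty vdev list, the per-txg vdev list, and
 * the two error-bookmark AVL trees.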
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
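 *
 * For reference, the ZPOOL_CONFIG_VDEV_TREE portion of a config for a simple
 * mirrored pool looks roughly like this (an illustrative sketch only; the
 * device paths are made up and the real nvlists carry additional fields):
 *
 *	type='root'
 *	    children[0]: type='mirror'
 *		children[0]: type='disk', path='/dev/dsk/c1t0d0s0'
 *		children[1]: type='disk', path='/dev/dsk/c1t1d0s0'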
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_spare_add() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list.
 * If the vdev also exists in the active configuration, then we also mark
	 * this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs. We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
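	 * From this point on, a failure to read pool metadata marks the root
	 * vdev VDEV_STATE_CANT_OPEN with VDEV_AUX_CORRUPT_DATA before bailing
	 * out, so the damage is visible in the returned vdev state.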
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if (hostid != 0 && myhostid != 0 &&
			    (unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
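		 * The update itself is performed by the async thread when it
		 * processes the SPA_ASYNC_CONFIG_UPDATE request queued below.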
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
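	 * This only happens when we actually performed the load above (rather
	 * than finding an already-active pool) and only if the pool is
	 * writable.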
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as active spares. If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed. We must have an array of
 * nvlists, each of which describes a valid leaf vdev. If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
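 *
 * 'crtxg' is the txg used when initializing new spare labels, and 'mode' is
 * the vdev allocation type: VDEV_ALLOC_ADD for pool creation and
 * spa_vdev_add(), VDEV_ALLOC_SPARE for import.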
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < ZFS_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools are always deflated. */
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}

	/*
	 * Create the deferred-free bplist object. Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	spa_history_create_obj(spa, tx);

	dmu_tx_commit(tx);

	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system. We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
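	 * Passing B_TRUE also keeps spa_load() from re-reading the config
	 * from the MOS and re-invoking itself, as it would for a cached open.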
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity anymore,
	 * and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references. If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty. spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
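 * The spa_t is left in the POOL_STATE_UNINITIALIZED state, so a subsequent
 * spa_open() of the same name will reload it from its cached config.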
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares;
	uint_t i, nspares;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		nspares = 0;

	if (vd->vdev_children == 0 && nspares == 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, EINVAL));
	}

	if (vd->vdev_children != 0) {
		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
			spa->spa_pending_vdev = NULL;
			return (spa_vdev_exit(spa, vd, txg, error));
		}
	}

	/*
	 * We must validate the spares after checking the children. Otherwise,
	 * vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) != 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	spa->spa_pending_vdev = NULL;

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		if (spa->spa_sparelist != NULL) {
			nvlist_t **oldspares;
			uint_t oldnspares;
			nvlist_t **newspares;

			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

			newspares = kmem_alloc(sizeof (void *) *
			    (nspares + oldnspares), KM_SLEEP);
			for (i = 0; i < oldnspares; i++)
				VERIFY(nvlist_dup(oldspares[i],
				    &newspares[i], KM_SLEEP) == 0);
			for (i = 0; i < nspares; i++)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[i + oldnspares],
				    KM_SLEEP) == 0);

			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, newspares,
			    nspares + oldnspares) == 0);
			for (i = 0; i < oldnspares + nspares; i++)
				nvlist_free(newspares[i]);
			kmem_free(newspares, (oldnspares + nspares) *
			    sizeof (void *));
		} else {
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		}

		spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate. Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror. The arguments are the guid of any device
 * in the mirror, and the nvroot for the new device. If the device to be
 * attached to is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare. Otherwise, we
		 * want to create a replacing vdev.
		 * The user is not allowed to attach to a spared vdev child
		 * unless the 'isspare' state is the same (spare replaces
		 * spare, non-spare replaces non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	if (newvd->vdev_isspare)
		spa_spare_activate(newvd);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
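 * (With 'replace_done' set, a spare parent is accepted as well; either
 * leaf of a spare vdev may be detached, as noted below.)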
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev. For the 'spare' vdev, either
	 * disk can be removed.
	 */
	if (replace_done) {
		if (pvd->vdev_ops == &vdev_replacing_ops) {
			if (vd->vdev_id != 0)
				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		} else if (pvd->vdev_ops != &vdev_spare_ops) {
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		}
	}

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= ZFS_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}

	/*
	 * If we are a replacing or spare vdev, then we can always detach the
	 * latter child, as that is how one cancels the operation.
	 */
	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
	    c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process. We
	 * must do this before vdev_remove_parent(), because that can change the
	 * GUID if it creates a new toplevel GUID.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed. Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd->vdev_parent);

	/*
	 * If the device we just detached was smaller than the others, it may be
	 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
	 * can't fail because the existing metaslabs are already in core, so
	 * there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	error = spa_vdev_exit(spa, vd, txg, 0);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa = NULL;
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa->spa_state != POOL_STATE_ACTIVE)
				continue;

			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * Remove a device from the pool. Currently, this supports removing only hot
 * spares.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, *nv, **newspares;
	uint_t i, j, nspares;
	int ret = 0;

	spa_config_enter(spa, RW_WRITER, FTAG);

	vd = spa_lookup_by_guid(spa, guid);

	nv = NULL;
	if (spa->spa_spares != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			uint64_t theguid;

			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &theguid) == 0);
			if (theguid == guid) {
				nv = spares[i];
				break;
			}
		}
	}

	/*
	 * We only support removing a hot spare, and only if it's not currently
	 * in use in this pool.
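	 * Concretely: ENOENT if the guid matches neither the spare list nor
	 * the vdev tree, ENOTSUP if it names an active vdev that is not a
	 * spare, and EBUSY if it names a spare that is currently spared in
	 * (unless 'unspare' is set).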
2056 */ 2057 if (nv == NULL && vd == NULL) { 2058 ret = ENOENT; 2059 goto out; 2060 } 2061 2062 if (nv == NULL && vd != NULL) { 2063 ret = ENOTSUP; 2064 goto out; 2065 } 2066 2067 if (!unspare && nv != NULL && vd != NULL) { 2068 ret = EBUSY; 2069 goto out; 2070 } 2071 2072 if (nspares == 1) { 2073 newspares = NULL; 2074 } else { 2075 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2076 KM_SLEEP); 2077 for (i = 0, j = 0; i < nspares; i++) { 2078 if (spares[i] != nv) 2079 VERIFY(nvlist_dup(spares[i], 2080 &newspares[j++], KM_SLEEP) == 0); 2081 } 2082 } 2083 2084 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2085 DATA_TYPE_NVLIST_ARRAY) == 0); 2086 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2087 newspares, nspares - 1) == 0); 2088 for (i = 0; i < nspares - 1; i++) 2089 nvlist_free(newspares[i]); 2090 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2091 spa_load_spares(spa); 2092 spa->spa_sync_spares = B_TRUE; 2093 2094 out: 2095 spa_config_exit(spa, FTAG); 2096 2097 return (ret); 2098 } 2099 2100 /* 2101 * Find any device that's done replacing, so we can detach it. 2102 */ 2103 static vdev_t * 2104 spa_vdev_replace_done_hunt(vdev_t *vd) 2105 { 2106 vdev_t *newvd, *oldvd; 2107 int c; 2108 2109 for (c = 0; c < vd->vdev_children; c++) { 2110 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2111 if (oldvd != NULL) 2112 return (oldvd); 2113 } 2114 2115 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2116 oldvd = vd->vdev_child[0]; 2117 newvd = vd->vdev_child[1]; 2118 2119 mutex_enter(&newvd->vdev_dtl_lock); 2120 if (newvd->vdev_dtl_map.sm_space == 0 && 2121 newvd->vdev_dtl_scrub.sm_space == 0) { 2122 mutex_exit(&newvd->vdev_dtl_lock); 2123 return (oldvd); 2124 } 2125 mutex_exit(&newvd->vdev_dtl_lock); 2126 } 2127 2128 return (NULL); 2129 } 2130 2131 static void 2132 spa_vdev_replace_done(spa_t *spa) 2133 { 2134 vdev_t *vd; 2135 vdev_t *pvd; 2136 uint64_t guid; 2137 uint64_t pguid = 0; 2138 2139 spa_config_enter(spa, RW_READER, FTAG); 2140 2141 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2142 guid = vd->vdev_guid; 2143 /* 2144 * If we have just finished replacing a hot spared device, then 2145 * we need to detach the parent's first child (the original hot 2146 * spare) as well. 2147 */ 2148 pvd = vd->vdev_parent; 2149 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2150 pvd->vdev_id == 0) { 2151 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2152 ASSERT(pvd->vdev_parent->vdev_children == 2); 2153 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2154 } 2155 spa_config_exit(spa, FTAG); 2156 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2157 return; 2158 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2159 return; 2160 spa_config_enter(spa, RW_READER, FTAG); 2161 } 2162 2163 spa_config_exit(spa, FTAG); 2164 } 2165 2166 /* 2167 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2168 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2169 */ 2170 int 2171 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2172 { 2173 vdev_t *rvd, *vd; 2174 uint64_t txg; 2175 2176 rvd = spa->spa_root_vdev; 2177 2178 txg = spa_vdev_enter(spa); 2179 2180 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2181 /* 2182 * Determine if this is a reference to a hot spare. In that 2183 * case, update the path as stored in the spare list. 
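 * A spare that isn't currently attached has no entry in the root vdev tree, so its path lives only in the ZPOOL_CONFIG_SPARES array of spa_sparelist; update it there and let spa_load_spares() rebuild the in-core spare vdevs.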
2184 */ 2185 nvlist_t **spares; 2186 uint_t i, nspares; 2187 if (spa->spa_sparelist != NULL) { 2188 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2189 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2190 for (i = 0; i < nspares; i++) { 2191 uint64_t theguid; 2192 VERIFY(nvlist_lookup_uint64(spares[i], 2193 ZPOOL_CONFIG_GUID, &theguid) == 0); 2194 if (theguid == guid) 2195 break; 2196 } 2197 2198 if (i == nspares) 2199 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2200 2201 VERIFY(nvlist_add_string(spares[i], 2202 ZPOOL_CONFIG_PATH, newpath) == 0); 2203 spa_load_spares(spa); 2204 spa->spa_sync_spares = B_TRUE; 2205 return (spa_vdev_exit(spa, NULL, txg, 0)); 2206 } else { 2207 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2208 } 2209 } 2210 2211 if (!vd->vdev_ops->vdev_op_leaf) 2212 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2213 2214 spa_strfree(vd->vdev_path); 2215 vd->vdev_path = spa_strdup(newpath); 2216 2217 vdev_config_dirty(vd->vdev_top); 2218 2219 return (spa_vdev_exit(spa, NULL, txg, 0)); 2220 } 2221 2222 /* 2223 * ========================================================================== 2224 * SPA Scrubbing 2225 * ========================================================================== 2226 */ 2227 2228 static void 2229 spa_scrub_io_done(zio_t *zio) 2230 { 2231 spa_t *spa = zio->io_spa; 2232 2233 zio_data_buf_free(zio->io_data, zio->io_size); 2234 2235 mutex_enter(&spa->spa_scrub_lock); 2236 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2237 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2238 spa->spa_scrub_errors++; 2239 mutex_enter(&vd->vdev_stat_lock); 2240 vd->vdev_stat.vs_scrub_errors++; 2241 mutex_exit(&vd->vdev_stat_lock); 2242 } 2243 2244 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2245 cv_broadcast(&spa->spa_scrub_io_cv); 2246 2247 ASSERT(spa->spa_scrub_inflight >= 0); 2248 2249 mutex_exit(&spa->spa_scrub_lock); 2250 } 2251 2252 static void 2253 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2254 zbookmark_t *zb) 2255 { 2256 size_t size = BP_GET_LSIZE(bp); 2257 void *data; 2258 2259 mutex_enter(&spa->spa_scrub_lock); 2260 /* 2261 * Do not give too much work to vdev(s). 2262 */ 2263 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2264 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2265 } 2266 spa->spa_scrub_inflight++; 2267 mutex_exit(&spa->spa_scrub_lock); 2268 2269 data = zio_data_buf_alloc(size); 2270 2271 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2272 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2273 2274 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2275 2276 zio_nowait(zio_read(NULL, spa, bp, data, size, 2277 spa_scrub_io_done, NULL, priority, flags, zb)); 2278 } 2279 2280 /* ARGSUSED */ 2281 static int 2282 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2283 { 2284 blkptr_t *bp = &bc->bc_blkptr; 2285 vdev_t *vd = spa->spa_root_vdev; 2286 dva_t *dva = bp->blk_dva; 2287 int needs_resilver = B_FALSE; 2288 int d; 2289 2290 if (bc->bc_errno) { 2291 /* 2292 * We can't scrub this block, but we can continue to scrub 2293 * the rest of the pool. Note the error and move along. 
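 * We charge the error to both the pool-wide count and the root vdev's scrub stats, since a block we can't even traverse can't be blamed on any particular leaf device.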
2294 */ 2295 mutex_enter(&spa->spa_scrub_lock); 2296 spa->spa_scrub_errors++; 2297 mutex_exit(&spa->spa_scrub_lock); 2298 2299 mutex_enter(&vd->vdev_stat_lock); 2300 vd->vdev_stat.vs_scrub_errors++; 2301 mutex_exit(&vd->vdev_stat_lock); 2302 2303 return (ERESTART); 2304 } 2305 2306 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2307 2308 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2309 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2310 2311 ASSERT(vd != NULL); 2312 2313 /* 2314 * Keep track of how much data we've examined so that 2315 * zpool(1M) status can make useful progress reports. 2316 */ 2317 mutex_enter(&vd->vdev_stat_lock); 2318 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2319 mutex_exit(&vd->vdev_stat_lock); 2320 2321 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2322 if (DVA_GET_GANG(&dva[d])) { 2323 /* 2324 * Gang members may be spread across multiple 2325 * vdevs, so the best we can do is look at the 2326 * pool-wide DTL. 2327 * XXX -- it would be better to change our 2328 * allocation policy to ensure that this can't 2329 * happen. 2330 */ 2331 vd = spa->spa_root_vdev; 2332 } 2333 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2334 bp->blk_birth, 1)) 2335 needs_resilver = B_TRUE; 2336 } 2337 } 2338 2339 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2340 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2341 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2342 else if (needs_resilver) 2343 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2344 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2345 2346 return (0); 2347 } 2348 2349 static void 2350 spa_scrub_thread(spa_t *spa) 2351 { 2352 callb_cpr_t cprinfo; 2353 traverse_handle_t *th = spa->spa_scrub_th; 2354 vdev_t *rvd = spa->spa_root_vdev; 2355 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2356 int error = 0; 2357 boolean_t complete; 2358 2359 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2360 2361 /* 2362 * If we're restarting due to a snapshot create/delete, 2363 * wait for that to complete. 2364 */ 2365 txg_wait_synced(spa_get_dsl(spa), 0); 2366 2367 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2368 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2369 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2370 2371 spa_config_enter(spa, RW_WRITER, FTAG); 2372 vdev_reopen(rvd); /* purge all vdev caches */ 2373 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2374 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2375 spa_config_exit(spa, FTAG); 2376 2377 mutex_enter(&spa->spa_scrub_lock); 2378 spa->spa_scrub_errors = 0; 2379 spa->spa_scrub_active = 1; 2380 ASSERT(spa->spa_scrub_inflight == 0); 2381 2382 while (!spa->spa_scrub_stop) { 2383 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2384 while (spa->spa_scrub_suspended) { 2385 spa->spa_scrub_active = 0; 2386 cv_broadcast(&spa->spa_scrub_cv); 2387 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2388 spa->spa_scrub_active = 1; 2389 } 2390 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2391 2392 if (spa->spa_scrub_restart_txg != 0) 2393 break; 2394 2395 mutex_exit(&spa->spa_scrub_lock); 2396 error = traverse_more(th); 2397 mutex_enter(&spa->spa_scrub_lock); 2398 if (error != EAGAIN) 2399 break; 2400 } 2401 2402 while (spa->spa_scrub_inflight) 2403 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2404 2405 spa->spa_scrub_active = 0; 2406 cv_broadcast(&spa->spa_scrub_cv); 2407 2408 mutex_exit(&spa->spa_scrub_lock); 2409 2410 spa_config_enter(spa, RW_WRITER, FTAG); 2411 2412 mutex_enter(&spa->spa_scrub_lock); 2413 2414 /* 2415 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2416 * AND the spa config lock to synchronize with any config changes 2417 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2418 */ 2419 if (spa->spa_scrub_restart_txg != 0) 2420 error = ERESTART; 2421 2422 if (spa->spa_scrub_stop) 2423 error = EINTR; 2424 2425 /* 2426 * Even if there were uncorrectable errors, we consider the scrub 2427 * completed. The downside is that if there is a transient error during 2428 * a resilver, we won't resilver the data properly to the target. But 2429 * if the damage is permanent (more likely) we will resilver forever, 2430 * which isn't really acceptable. Since there is enough information for 2431 * the user to know what has failed and why, this seems like a more 2432 * tractable approach. 2433 */ 2434 complete = (error == 0); 2435 2436 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2437 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2438 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2439 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2440 2441 mutex_exit(&spa->spa_scrub_lock); 2442 2443 /* 2444 * If the scrub/resilver completed, update all DTLs to reflect this. 2445 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2446 */ 2447 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2448 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2449 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2450 spa_errlog_rotate(spa); 2451 2452 spa_config_exit(spa, FTAG); 2453 2454 mutex_enter(&spa->spa_scrub_lock); 2455 2456 /* 2457 * We may have finished replacing a device. 2458 * Let the async thread assess this and handle the detach. 2459 */ 2460 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2461 2462 /* 2463 * If we were told to restart, our final act is to start a new scrub. 2464 */ 2465 if (error == ERESTART) 2466 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2467 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2468 2469 spa->spa_scrub_type = POOL_SCRUB_NONE; 2470 spa->spa_scrub_active = 0; 2471 spa->spa_scrub_thread = NULL; 2472 cv_broadcast(&spa->spa_scrub_cv); 2473 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2474 thread_exit(); 2475 } 2476 2477 void 2478 spa_scrub_suspend(spa_t *spa) 2479 { 2480 mutex_enter(&spa->spa_scrub_lock); 2481 spa->spa_scrub_suspended++; 2482 while (spa->spa_scrub_active) { 2483 cv_broadcast(&spa->spa_scrub_cv); 2484 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2485 } 2486 while (spa->spa_scrub_inflight) 2487 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2488 mutex_exit(&spa->spa_scrub_lock); 2489 } 2490 2491 void 2492 spa_scrub_resume(spa_t *spa) 2493 { 2494 mutex_enter(&spa->spa_scrub_lock); 2495 ASSERT(spa->spa_scrub_suspended != 0); 2496 if (--spa->spa_scrub_suspended == 0) 2497 cv_broadcast(&spa->spa_scrub_cv); 2498 mutex_exit(&spa->spa_scrub_lock); 2499 } 2500 2501 void 2502 spa_scrub_restart(spa_t *spa, uint64_t txg) 2503 { 2504 /* 2505 * Something happened (e.g. snapshot create/delete) that means 2506 * we must restart any in-progress scrubs. The itinerary will 2507 * fix this properly. 2508 */ 2509 mutex_enter(&spa->spa_scrub_lock); 2510 spa->spa_scrub_restart_txg = txg; 2511 mutex_exit(&spa->spa_scrub_lock); 2512 } 2513 2514 int 2515 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2516 { 2517 space_seg_t *ss; 2518 uint64_t mintxg, maxtxg; 2519 vdev_t *rvd = spa->spa_root_vdev; 2520 2521 if ((uint_t)type >= POOL_SCRUB_TYPES) 2522 return (ENOTSUP); 2523 2524 mutex_enter(&spa->spa_scrub_lock); 2525 2526 /* 2527 * If there's a scrub or resilver already in progress, stop it. 2528 */ 2529 while (spa->spa_scrub_thread != NULL) { 2530 /* 2531 * Don't stop a resilver unless forced. 2532 */ 2533 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2534 mutex_exit(&spa->spa_scrub_lock); 2535 return (EBUSY); 2536 } 2537 spa->spa_scrub_stop = 1; 2538 cv_broadcast(&spa->spa_scrub_cv); 2539 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2540 } 2541 2542 /* 2543 * Terminate the previous traverse. 2544 */ 2545 if (spa->spa_scrub_th != NULL) { 2546 traverse_fini(spa->spa_scrub_th); 2547 spa->spa_scrub_th = NULL; 2548 } 2549 2550 if (rvd == NULL) { 2551 ASSERT(spa->spa_scrub_stop == 0); 2552 ASSERT(spa->spa_scrub_type == type); 2553 ASSERT(spa->spa_scrub_restart_txg == 0); 2554 mutex_exit(&spa->spa_scrub_lock); 2555 return (0); 2556 } 2557 2558 mintxg = TXG_INITIAL - 1; 2559 maxtxg = spa_last_synced_txg(spa) + 1; 2560 2561 mutex_enter(&rvd->vdev_dtl_lock); 2562 2563 if (rvd->vdev_dtl_map.sm_space == 0) { 2564 /* 2565 * The pool-wide DTL is empty. 2566 * If this is a resilver, there's nothing to do except 2567 * check whether any in-progress replacements have completed. 2568 */ 2569 if (type == POOL_SCRUB_RESILVER) { 2570 type = POOL_SCRUB_NONE; 2571 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2572 } 2573 } else { 2574 /* 2575 * The pool-wide DTL is non-empty. 2576 * If this is a normal scrub, upgrade to a resilver instead. 2577 */ 2578 if (type == POOL_SCRUB_EVERYTHING) 2579 type = POOL_SCRUB_RESILVER; 2580 } 2581 2582 if (type == POOL_SCRUB_RESILVER) { 2583 /* 2584 * Determine the resilvering boundaries. 2585 * 2586 * Note: (mintxg, maxtxg) is an open interval, 2587 * i.e. mintxg and maxtxg themselves are not included. 2588 * 2589 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2590 * so we don't claim to resilver a txg that's still changing. 
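 * The DTL space map is a set of txg ranges, so the start of its first segment and the end of its last segment bound every txg that still needs to be resilvered.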
2591 */ 2592 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2593 mintxg = ss->ss_start - 1; 2594 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2595 maxtxg = MIN(ss->ss_end, maxtxg); 2596 } 2597 2598 mutex_exit(&rvd->vdev_dtl_lock); 2599 2600 spa->spa_scrub_stop = 0; 2601 spa->spa_scrub_type = type; 2602 spa->spa_scrub_restart_txg = 0; 2603 2604 if (type != POOL_SCRUB_NONE) { 2605 spa->spa_scrub_mintxg = mintxg; 2606 spa->spa_scrub_maxtxg = maxtxg; 2607 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2608 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2609 ZIO_FLAG_CANFAIL); 2610 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2611 spa->spa_scrub_thread = thread_create(NULL, 0, 2612 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2613 } 2614 2615 mutex_exit(&spa->spa_scrub_lock); 2616 2617 return (0); 2618 } 2619 2620 /* 2621 * ========================================================================== 2622 * SPA async task processing 2623 * ========================================================================== 2624 */ 2625 2626 static void 2627 spa_async_reopen(spa_t *spa) 2628 { 2629 vdev_t *rvd = spa->spa_root_vdev; 2630 vdev_t *tvd; 2631 int c; 2632 2633 spa_config_enter(spa, RW_WRITER, FTAG); 2634 2635 for (c = 0; c < rvd->vdev_children; c++) { 2636 tvd = rvd->vdev_child[c]; 2637 if (tvd->vdev_reopen_wanted) { 2638 tvd->vdev_reopen_wanted = 0; 2639 vdev_reopen(tvd); 2640 } 2641 } 2642 2643 spa_config_exit(spa, FTAG); 2644 } 2645 2646 static void 2647 spa_async_thread(spa_t *spa) 2648 { 2649 int tasks; 2650 2651 ASSERT(spa->spa_sync_on); 2652 2653 mutex_enter(&spa->spa_async_lock); 2654 tasks = spa->spa_async_tasks; 2655 spa->spa_async_tasks = 0; 2656 mutex_exit(&spa->spa_async_lock); 2657 2658 /* 2659 * See if the config needs to be updated. 2660 */ 2661 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2662 mutex_enter(&spa_namespace_lock); 2663 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2664 mutex_exit(&spa_namespace_lock); 2665 } 2666 2667 /* 2668 * See if any devices need to be reopened. 2669 */ 2670 if (tasks & SPA_ASYNC_REOPEN) 2671 spa_async_reopen(spa); 2672 2673 /* 2674 * If any devices are done replacing, detach them. 2675 */ 2676 if (tasks & SPA_ASYNC_REPLACE_DONE) 2677 spa_vdev_replace_done(spa); 2678 2679 /* 2680 * Kick off a scrub. 2681 */ 2682 if (tasks & SPA_ASYNC_SCRUB) 2683 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2684 2685 /* 2686 * Kick off a resilver. 2687 */ 2688 if (tasks & SPA_ASYNC_RESILVER) 2689 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2690 2691 /* 2692 * Let the world know that we're done. 
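 * spa_async_suspend() sleeps on spa_async_cv until spa_async_thread is NULL, so clear the pointer and broadcast while holding spa_async_lock before exiting.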
2693 */ 2694 mutex_enter(&spa->spa_async_lock); 2695 spa->spa_async_thread = NULL; 2696 cv_broadcast(&spa->spa_async_cv); 2697 mutex_exit(&spa->spa_async_lock); 2698 thread_exit(); 2699 } 2700 2701 void 2702 spa_async_suspend(spa_t *spa) 2703 { 2704 mutex_enter(&spa->spa_async_lock); 2705 spa->spa_async_suspended++; 2706 while (spa->spa_async_thread != NULL) 2707 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2708 mutex_exit(&spa->spa_async_lock); 2709 } 2710 2711 void 2712 spa_async_resume(spa_t *spa) 2713 { 2714 mutex_enter(&spa->spa_async_lock); 2715 ASSERT(spa->spa_async_suspended != 0); 2716 spa->spa_async_suspended--; 2717 mutex_exit(&spa->spa_async_lock); 2718 } 2719 2720 static void 2721 spa_async_dispatch(spa_t *spa) 2722 { 2723 mutex_enter(&spa->spa_async_lock); 2724 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2725 spa->spa_async_thread == NULL && 2726 rootdir != NULL && !vn_is_readonly(rootdir)) 2727 spa->spa_async_thread = thread_create(NULL, 0, 2728 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2729 mutex_exit(&spa->spa_async_lock); 2730 } 2731 2732 void 2733 spa_async_request(spa_t *spa, int task) 2734 { 2735 mutex_enter(&spa->spa_async_lock); 2736 spa->spa_async_tasks |= task; 2737 mutex_exit(&spa->spa_async_lock); 2738 } 2739 2740 /* 2741 * ========================================================================== 2742 * SPA syncing routines 2743 * ========================================================================== 2744 */ 2745 2746 static void 2747 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2748 { 2749 bplist_t *bpl = &spa->spa_sync_bplist; 2750 dmu_tx_t *tx; 2751 blkptr_t blk; 2752 uint64_t itor = 0; 2753 zio_t *zio; 2754 int error; 2755 uint8_t c = 1; 2756 2757 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2758 2759 while (bplist_iterate(bpl, &itor, &blk) == 0) 2760 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2761 2762 error = zio_wait(zio); 2763 ASSERT3U(error, ==, 0); 2764 2765 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2766 bplist_vacate(bpl, tx); 2767 2768 /* 2769 * Pre-dirty the first block so we sync to convergence faster. 2770 * (Usually only the first block is needed.) 2771 */ 2772 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2773 dmu_tx_commit(tx); 2774 } 2775 2776 static void 2777 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2778 { 2779 char *packed = NULL; 2780 size_t nvsize = 0; 2781 dmu_buf_t *db; 2782 2783 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2784 2785 packed = kmem_alloc(nvsize, KM_SLEEP); 2786 2787 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2788 KM_SLEEP) == 0); 2789 2790 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2791 2792 kmem_free(packed, nvsize); 2793 2794 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2795 dmu_buf_will_dirty(db, tx); 2796 *(uint64_t *)db->db_data = nvsize; 2797 dmu_buf_rele(db, FTAG); 2798 } 2799 2800 static void 2801 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2802 { 2803 nvlist_t *nvroot; 2804 nvlist_t **spares; 2805 int i; 2806 2807 if (!spa->spa_sync_spares) 2808 return; 2809 2810 /* 2811 * Update the MOS nvlist describing the list of available spares. 2812 * spa_validate_spares() will have already made sure this nvlist is 2813 * valid and the vdevs are labelled appropriately. 
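 * The nvlist is packed into a DMU_OT_PACKED_NVLIST object by spa_sync_nvlist(); the object is created on first use and recorded in the pool directory under DMU_POOL_SPARES.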
2814 */ 2815 if (spa->spa_spares_object == 0) { 2816 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2817 DMU_OT_PACKED_NVLIST, 1 << 14, 2818 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2819 VERIFY(zap_update(spa->spa_meta_objset, 2820 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2821 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2822 } 2823 2824 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2825 if (spa->spa_nspares == 0) { 2826 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2827 NULL, 0) == 0); 2828 } else { 2829 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2830 KM_SLEEP); 2831 for (i = 0; i < spa->spa_nspares; i++) 2832 spares[i] = vdev_config_generate(spa, 2833 spa->spa_spares[i], B_FALSE, B_TRUE); 2834 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2835 spares, spa->spa_nspares) == 0); 2836 for (i = 0; i < spa->spa_nspares; i++) 2837 nvlist_free(spares[i]); 2838 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2839 } 2840 2841 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2842 nvlist_free(nvroot); 2843 2844 spa->spa_sync_spares = B_FALSE; 2845 } 2846 2847 static void 2848 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2849 { 2850 nvlist_t *config; 2851 2852 if (list_is_empty(&spa->spa_dirty_list)) 2853 return; 2854 2855 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2856 2857 if (spa->spa_config_syncing) 2858 nvlist_free(spa->spa_config_syncing); 2859 spa->spa_config_syncing = config; 2860 2861 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2862 } 2863 2864 static void 2865 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2866 { 2867 spa_t *spa = arg1; 2868 nvlist_t *nvp = arg2; 2869 nvpair_t *nvpair; 2870 objset_t *mos = spa->spa_meta_objset; 2871 uint64_t zapobj; 2872 2873 mutex_enter(&spa->spa_props_lock); 2874 if (spa->spa_pool_props_object == 0) { 2875 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2876 VERIFY(zapobj > 0); 2877 2878 spa->spa_pool_props_object = zapobj; 2879 2880 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2881 DMU_POOL_PROPS, 8, 1, 2882 &spa->spa_pool_props_object, tx) == 0); 2883 } 2884 mutex_exit(&spa->spa_props_lock); 2885 2886 nvpair = NULL; 2887 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2888 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2889 case ZFS_PROP_BOOTFS: 2890 VERIFY(nvlist_lookup_uint64(nvp, 2891 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2892 VERIFY(zap_update(mos, 2893 spa->spa_pool_props_object, 2894 zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 2895 &spa->spa_bootfs, tx) == 0); 2896 break; 2897 } 2898 } 2899 } 2900 2901 /* 2902 * Sync the specified transaction group. New blocks may be dirtied as 2903 * part of the process, so we iterate until it converges. 2904 */ 2905 void 2906 spa_sync(spa_t *spa, uint64_t txg) 2907 { 2908 dsl_pool_t *dp = spa->spa_dsl_pool; 2909 objset_t *mos = spa->spa_meta_objset; 2910 bplist_t *bpl = &spa->spa_sync_bplist; 2911 vdev_t *rvd = spa->spa_root_vdev; 2912 vdev_t *vd; 2913 dmu_tx_t *tx; 2914 int dirty_vdevs; 2915 2916 /* 2917 * Lock out configuration changes. 2918 */ 2919 spa_config_enter(spa, RW_READER, FTAG); 2920 2921 spa->spa_syncing_txg = txg; 2922 spa->spa_sync_pass = 0; 2923 2924 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2925 2926 tx = dmu_tx_create_assigned(dp, txg); 2927 2928 /* 2929 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2930 * set spa_deflate if we have no raid-z vdevs. 
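 * A raid-z top-level vdev has a vdev_deflate_ratio other than SPA_MINBLOCKSIZE, which is what the loop below checks for.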
2931 */ 2932 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2933 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2934 int i; 2935 2936 for (i = 0; i < rvd->vdev_children; i++) { 2937 vd = rvd->vdev_child[i]; 2938 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2939 break; 2940 } 2941 if (i == rvd->vdev_children) { 2942 spa->spa_deflate = TRUE; 2943 VERIFY(0 == zap_add(spa->spa_meta_objset, 2944 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2945 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2946 } 2947 } 2948 2949 /* 2950 * If anything has changed in this txg, push the deferred frees 2951 * from the previous txg. If not, leave them alone so that we 2952 * don't generate work on an otherwise idle system. 2953 */ 2954 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2955 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2956 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2957 spa_sync_deferred_frees(spa, txg); 2958 2959 /* 2960 * Iterate to convergence. 2961 */ 2962 do { 2963 spa->spa_sync_pass++; 2964 2965 spa_sync_config_object(spa, tx); 2966 spa_sync_spares(spa, tx); 2967 spa_errlog_sync(spa, txg); 2968 dsl_pool_sync(dp, txg); 2969 2970 dirty_vdevs = 0; 2971 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2972 vdev_sync(vd, txg); 2973 dirty_vdevs++; 2974 } 2975 2976 bplist_sync(bpl, tx); 2977 } while (dirty_vdevs); 2978 2979 bplist_close(bpl); 2980 2981 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2982 2983 /* 2984 * Rewrite the vdev configuration (which includes the uberblock) 2985 * to commit the transaction group. 2986 * 2987 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2988 * Otherwise, pick a random top-level vdev that's known to be 2989 * visible in the config cache (see spa_vdev_add() for details). 2990 * If the write fails, try the next vdev until we've tried them all. 2991 */ 2992 if (!list_is_empty(&spa->spa_dirty_list)) { 2993 VERIFY(vdev_config_sync(rvd, txg) == 0); 2994 } else { 2995 int children = rvd->vdev_children; 2996 int c0 = spa_get_random(children); 2997 int c; 2998 2999 for (c = 0; c < children; c++) { 3000 vd = rvd->vdev_child[(c0 + c) % children]; 3001 if (vd->vdev_ms_array == 0) 3002 continue; 3003 if (vdev_config_sync(vd, txg) == 0) 3004 break; 3005 } 3006 if (c == children) 3007 VERIFY(vdev_config_sync(rvd, txg) == 0); 3008 } 3009 3010 dmu_tx_commit(tx); 3011 3012 /* 3013 * Clear the dirty config list. 3014 */ 3015 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3016 vdev_config_clean(vd); 3017 3018 /* 3019 * Now that the new config has synced transactionally, 3020 * let it become visible to the config cache. 3021 */ 3022 if (spa->spa_config_syncing != NULL) { 3023 spa_config_set(spa, spa->spa_config_syncing); 3024 spa->spa_config_txg = txg; 3025 spa->spa_config_syncing = NULL; 3026 } 3027 3028 /* 3029 * Make a stable copy of the fully synced uberblock. 3030 * We use this as the root for pool traversals. 3031 */ 3032 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3033 3034 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3035 3036 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3037 spa->spa_traverse_wanted = 0; 3038 spa->spa_ubsync = spa->spa_uberblock; 3039 rw_exit(&spa->spa_traverse_lock); 3040 3041 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3042 3043 /* 3044 * Clean up the ZIL records for the synced txg. 3045 */ 3046 dsl_pool_zil_clean(dp); 3047 3048 /* 3049 * Update usable space statistics.
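 * Each vdev that took part in this txg ends up on the TXG_CLEAN(txg) list; vdev_sync_done() lets its metaslabs finish their space accounting for the txg that just synced.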
3050 */ 3051 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3052 vdev_sync_done(vd, txg); 3053 3054 /* 3055 * It had better be the case that we didn't dirty anything 3056 * since vdev_config_sync(). 3057 */ 3058 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3059 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3060 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3061 ASSERT(bpl->bpl_queue == NULL); 3062 3063 spa_config_exit(spa, FTAG); 3064 3065 /* 3066 * If any async tasks have been requested, kick them off. 3067 */ 3068 spa_async_dispatch(spa); 3069 } 3070 3071 /* 3072 * Sync all pools. We don't want to hold the namespace lock across these 3073 * operations, so we take a reference on the spa_t and drop the lock during the 3074 * sync. 3075 */ 3076 void 3077 spa_sync_allpools(void) 3078 { 3079 spa_t *spa = NULL; 3080 mutex_enter(&spa_namespace_lock); 3081 while ((spa = spa_next(spa)) != NULL) { 3082 if (spa_state(spa) != POOL_STATE_ACTIVE) 3083 continue; 3084 spa_open_ref(spa, FTAG); 3085 mutex_exit(&spa_namespace_lock); 3086 txg_wait_synced(spa_get_dsl(spa), 0); 3087 mutex_enter(&spa_namespace_lock); 3088 spa_close(spa, FTAG); 3089 } 3090 mutex_exit(&spa_namespace_lock); 3091 } 3092 3093 /* 3094 * ========================================================================== 3095 * Miscellaneous routines 3096 * ========================================================================== 3097 */ 3098 3099 /* 3100 * Remove all pools in the system. 3101 */ 3102 void 3103 spa_evict_all(void) 3104 { 3105 spa_t *spa; 3106 3107 /* 3108 * Remove all cached state. All pools should be closed now, 3109 * so every spa in the AVL tree should be unreferenced. 3110 */ 3111 mutex_enter(&spa_namespace_lock); 3112 while ((spa = spa_next(NULL)) != NULL) { 3113 /* 3114 * Stop async tasks. The async thread may need to detach 3115 * a device that's been replaced, which requires grabbing 3116 * spa_namespace_lock, so we must drop it here. 3117 */ 3118 spa_open_ref(spa, FTAG); 3119 mutex_exit(&spa_namespace_lock); 3120 spa_async_suspend(spa); 3121 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3122 mutex_enter(&spa_namespace_lock); 3123 spa_close(spa, FTAG); 3124 3125 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3126 spa_unload(spa); 3127 spa_deactivate(spa); 3128 } 3129 spa_remove(spa); 3130 } 3131 mutex_exit(&spa_namespace_lock); 3132 } 3133 3134 vdev_t * 3135 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3136 { 3137 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3138 } 3139 3140 void 3141 spa_upgrade(spa_t *spa) 3142 { 3143 spa_config_enter(spa, RW_WRITER, FTAG); 3144 3145 /* 3146 * This should only be called for a non-faulted pool, and since a 3147 * future version would result in an unopenable pool, this shouldn't be 3148 * possible. 
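 * Hence the ASSERT below. The upgrade itself just bumps ub_version and dirties the root vdev config; the txg_wait_synced() at the end ensures the new version is on disk before we return.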
3149 */ 3150 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3151 3152 spa->spa_uberblock.ub_version = ZFS_VERSION; 3153 vdev_config_dirty(spa->spa_root_vdev); 3154 3155 spa_config_exit(spa, FTAG); 3156 3157 txg_wait_synced(spa_get_dsl(spa), 0); 3158 } 3159 3160 boolean_t 3161 spa_has_spare(spa_t *spa, uint64_t guid) 3162 { 3163 int i; 3164 uint64_t spareguid; 3165 3166 for (i = 0; i < spa->spa_nspares; i++) 3167 if (spa->spa_spares[i]->vdev_guid == guid) 3168 return (B_TRUE); 3169 3170 for (i = 0; i < spa->spa_pending_nspares; i++) { 3171 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3172 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3173 spareguid == guid) 3174 return (B_TRUE); 3175 } 3176 3177 return (B_FALSE); 3178 } 3179 3180 int 3181 spa_set_props(spa_t *spa, nvlist_t *nvp) 3182 { 3183 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3184 spa, nvp, 3)); 3185 } 3186 3187 int 3188 spa_get_props(spa_t *spa, nvlist_t **nvp) 3189 { 3190 zap_cursor_t zc; 3191 zap_attribute_t za; 3192 objset_t *mos = spa->spa_meta_objset; 3193 zfs_source_t src; 3194 zfs_prop_t prop; 3195 nvlist_t *propval; 3196 uint64_t value; 3197 int err; 3198 3199 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3200 3201 mutex_enter(&spa->spa_props_lock); 3202 /* If no props object, then just return empty nvlist */ 3203 if (spa->spa_pool_props_object == 0) { 3204 mutex_exit(&spa->spa_props_lock); 3205 return (0); 3206 } 3207 3208 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3209 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3210 zap_cursor_advance(&zc)) { 3211 3212 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3213 continue; 3214 3215 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3216 switch (za.za_integer_length) { 3217 case 8: 3218 if (zfs_prop_default_numeric(prop) == 3219 za.za_first_integer) 3220 src = ZFS_SRC_DEFAULT; 3221 else 3222 src = ZFS_SRC_LOCAL; 3223 value = za.za_first_integer; 3224 3225 if (prop == ZFS_PROP_BOOTFS) { 3226 dsl_pool_t *dp; 3227 dsl_dataset_t *ds = NULL; 3228 char strval[MAXPATHLEN]; 3229 3230 dp = spa_get_dsl(spa); 3231 rw_enter(&dp->dp_config_rwlock, RW_READER); 3232 if ((err = dsl_dataset_open_obj(dp, 3233 za.za_first_integer, NULL, DS_MODE_NONE, 3234 FTAG, &ds)) != 0) { 3235 rw_exit(&dp->dp_config_rwlock); 3236 break; 3237 } 3238 dsl_dataset_name(ds, strval); 3239 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3240 rw_exit(&dp->dp_config_rwlock); 3241 3242 VERIFY(nvlist_add_uint64(propval, 3243 ZFS_PROP_SOURCE, src) == 0); 3244 VERIFY(nvlist_add_string(propval, 3245 ZFS_PROP_VALUE, strval) == 0); 3246 } else { 3247 VERIFY(nvlist_add_uint64(propval, 3248 ZFS_PROP_SOURCE, src) == 0); 3249 VERIFY(nvlist_add_uint64(propval, 3250 ZFS_PROP_VALUE, value) == 0); 3251 } 3252 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3253 propval) == 0); 3254 break; 3255 } 3256 nvlist_free(propval); 3257 } 3258 zap_cursor_fini(&zc); 3259 mutex_exit(&spa->spa_props_lock); 3260 if (err && err != ENOENT) { 3261 nvlist_free(*nvp); 3262 return (err); 3263 } 3264 3265 return (0); 3266 } 3267 3268 /* 3269 * If the bootfs property value is dsobj, clear it. 3270 */ 3271 void 3272 spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3273 { 3274 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3275 VERIFY(zap_remove(spa->spa_meta_objset, 3276 spa->spa_pool_props_object, 3277 zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3278 spa->spa_bootfs = 0; 3279 } 3280 } 3281