/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

int zio_taskq_threads = 8;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
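 * Activation only sets up in-core state: the metaslab class, the
 * per-I/O-type issue/interrupt taskqs, the various locks and lists, and the
 * error-entry AVL trees. Nothing is read from or written to disk here.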
111 */ 112 static void 113 spa_activate(spa_t *spa) 114 { 115 int t; 116 117 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 118 119 spa->spa_state = POOL_STATE_ACTIVE; 120 121 spa->spa_normal_class = metaslab_class_create(); 122 123 for (t = 0; t < ZIO_TYPES; t++) { 124 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 125 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 126 TASKQ_PREPOPULATE); 127 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 128 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 129 TASKQ_PREPOPULATE); 130 } 131 132 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 133 134 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 135 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 136 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 137 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 140 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 141 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 142 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 143 144 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 145 offsetof(vdev_t, vdev_dirty_node)); 146 147 txg_list_create(&spa->spa_vdev_txg_list, 148 offsetof(struct vdev, vdev_txg_node)); 149 150 avl_create(&spa->spa_errlist_scrub, 151 spa_error_entry_compare, sizeof (spa_error_entry_t), 152 offsetof(spa_error_entry_t, se_avl)); 153 avl_create(&spa->spa_errlist_last, 154 spa_error_entry_compare, sizeof (spa_error_entry_t), 155 offsetof(spa_error_entry_t, se_avl)); 156 } 157 158 /* 159 * Opposite of spa_activate(). 160 */ 161 static void 162 spa_deactivate(spa_t *spa) 163 { 164 int t; 165 166 ASSERT(spa->spa_sync_on == B_FALSE); 167 ASSERT(spa->spa_dsl_pool == NULL); 168 ASSERT(spa->spa_root_vdev == NULL); 169 170 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 171 172 txg_list_destroy(&spa->spa_vdev_txg_list); 173 174 list_destroy(&spa->spa_dirty_list); 175 176 rw_destroy(&spa->spa_traverse_lock); 177 178 for (t = 0; t < ZIO_TYPES; t++) { 179 taskq_destroy(spa->spa_zio_issue_taskq[t]); 180 taskq_destroy(spa->spa_zio_intr_taskq[t]); 181 spa->spa_zio_issue_taskq[t] = NULL; 182 spa->spa_zio_intr_taskq[t] = NULL; 183 } 184 185 metaslab_class_destroy(spa->spa_normal_class); 186 spa->spa_normal_class = NULL; 187 188 /* 189 * If this was part of an import or the open otherwise failed, we may 190 * still have errors left in the queues. Empty them just in case. 191 */ 192 spa_errlog_drain(spa); 193 194 avl_destroy(&spa->spa_errlist_scrub); 195 avl_destroy(&spa->spa_errlist_last); 196 197 spa->spa_state = POOL_STATE_UNINITIALIZED; 198 } 199 200 /* 201 * Verify a pool configuration, and construct the vdev tree appropriately. This 202 * will create all the necessary vdevs in the appropriate layout, with each vdev 203 * in the CLOSED state. This will prep the pool before open/creation/import. 204 * All vdev validation is done by the vdev_alloc() routine. 
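 * On failure the partially constructed tree is torn down with vdev_free()
 * and *vdp is set to NULL, so callers never see a half-built hierarchy.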
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the spare
			 * is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
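	 * (A label written before versioning existed simply lacks the
	 * ZPOOL_CONFIG_VERSION pair, so the lookup below falls back to
	 * ZFS_VERSION_INITIAL.)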
453 */ 454 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 455 version = ZFS_VERSION_INITIAL; 456 457 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 458 &spa->spa_config_txg); 459 460 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 461 spa_guid_exists(pool_guid, 0)) { 462 error = EEXIST; 463 goto out; 464 } 465 466 spa->spa_load_guid = pool_guid; 467 468 /* 469 * Parse the configuration into a vdev tree. We explicitly set the 470 * value that will be returned by spa_version() since parsing the 471 * configuration requires knowing the version number. 472 */ 473 spa_config_enter(spa, RW_WRITER, FTAG); 474 spa->spa_ubsync.ub_version = version; 475 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 476 spa_config_exit(spa, FTAG); 477 478 if (error != 0) 479 goto out; 480 481 ASSERT(spa->spa_root_vdev == rvd); 482 ASSERT(spa_guid(spa) == pool_guid); 483 484 /* 485 * Try to open all vdevs, loading each label in the process. 486 */ 487 error = vdev_open(rvd); 488 if (error != 0) 489 goto out; 490 491 /* 492 * Validate the labels for all leaf vdevs. We need to grab the config 493 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 494 * flag. 495 */ 496 spa_config_enter(spa, RW_READER, FTAG); 497 error = vdev_validate(rvd); 498 spa_config_exit(spa, FTAG); 499 500 if (error != 0) 501 goto out; 502 503 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 504 error = ENXIO; 505 goto out; 506 } 507 508 /* 509 * Find the best uberblock. 510 */ 511 bzero(ub, sizeof (uberblock_t)); 512 513 zio = zio_root(spa, NULL, NULL, 514 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 515 vdev_uberblock_load(zio, rvd, ub); 516 error = zio_wait(zio); 517 518 /* 519 * If we weren't able to find a single valid uberblock, return failure. 520 */ 521 if (ub->ub_txg == 0) { 522 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 523 VDEV_AUX_CORRUPT_DATA); 524 error = ENXIO; 525 goto out; 526 } 527 528 /* 529 * If the pool is newer than the code, we can't open it. 530 */ 531 if (ub->ub_version > ZFS_VERSION) { 532 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 533 VDEV_AUX_VERSION_NEWER); 534 error = ENOTSUP; 535 goto out; 536 } 537 538 /* 539 * If the vdev guid sum doesn't match the uberblock, we have an 540 * incomplete configuration. 541 */ 542 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 543 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 544 VDEV_AUX_BAD_GUID_SUM); 545 error = ENXIO; 546 goto out; 547 } 548 549 /* 550 * Initialize internal SPA structures. 
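	 * Most failures from here on mark the root vdev CANT_OPEN with
	 * VDEV_AUX_CORRUPT_DATA before bailing out.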
551 */ 552 spa->spa_state = POOL_STATE_ACTIVE; 553 spa->spa_ubsync = spa->spa_uberblock; 554 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 555 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 556 if (error) { 557 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 558 VDEV_AUX_CORRUPT_DATA); 559 goto out; 560 } 561 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 562 563 if (zap_lookup(spa->spa_meta_objset, 564 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 565 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 566 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 567 VDEV_AUX_CORRUPT_DATA); 568 error = EIO; 569 goto out; 570 } 571 572 if (!mosconfig) { 573 nvlist_t *newconfig; 574 uint64_t hostid; 575 576 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 577 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 578 VDEV_AUX_CORRUPT_DATA); 579 error = EIO; 580 goto out; 581 } 582 583 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 584 &hostid) == 0) { 585 char *hostname; 586 unsigned long myhostid = 0; 587 588 VERIFY(nvlist_lookup_string(newconfig, 589 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 590 591 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 592 if ((unsigned long)hostid != myhostid) { 593 cmn_err(CE_WARN, "pool '%s' could not be " 594 "loaded as it was last accessed by " 595 "another system (host: %s hostid: 0x%lx). " 596 "See: http://www.sun.com/msg/ZFS-8000-EY", 597 spa->spa_name, hostname, 598 (unsigned long)hostid); 599 error = EBADF; 600 goto out; 601 } 602 } 603 604 spa_config_set(spa, newconfig); 605 spa_unload(spa); 606 spa_deactivate(spa); 607 spa_activate(spa); 608 609 return (spa_load(spa, newconfig, state, B_TRUE)); 610 } 611 612 if (zap_lookup(spa->spa_meta_objset, 613 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 614 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 615 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 616 VDEV_AUX_CORRUPT_DATA); 617 error = EIO; 618 goto out; 619 } 620 621 /* 622 * Load the bit that tells us to use the new accounting function 623 * (raid-z deflation). If we have an older pool, this will not 624 * be present. 625 */ 626 error = zap_lookup(spa->spa_meta_objset, 627 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 628 sizeof (uint64_t), 1, &spa->spa_deflate); 629 if (error != 0 && error != ENOENT) { 630 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 631 VDEV_AUX_CORRUPT_DATA); 632 error = EIO; 633 goto out; 634 } 635 636 /* 637 * Load the persistent error log. If we have an older pool, this will 638 * not be present. 639 */ 640 error = zap_lookup(spa->spa_meta_objset, 641 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 642 sizeof (uint64_t), 1, &spa->spa_errlog_last); 643 if (error != 0 && error != ENOENT) { 644 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 645 VDEV_AUX_CORRUPT_DATA); 646 error = EIO; 647 goto out; 648 } 649 650 error = zap_lookup(spa->spa_meta_objset, 651 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 652 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 653 if (error != 0 && error != ENOENT) { 654 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 655 VDEV_AUX_CORRUPT_DATA); 656 error = EIO; 657 goto out; 658 } 659 660 /* 661 * Load the history object. If we have an older pool, this 662 * will not be present. 
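	 * (ENOENT from the zap_lookup() below is therefore expected and is
	 * not treated as an error.)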
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
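		 * The update is requested via SPA_ASYNC_CONFIG_UPDATE below
		 * and handled by the async framework rather than being
		 * written synchronously here.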
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time as opening the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
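	 * This only happens when the pool is writable (spa_mode & FWRITE);
	 * a read-only open skips the resilver.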
894 */ 895 if (loaded && (spa_mode & FWRITE)) 896 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 897 898 return (0); 899 } 900 901 int 902 spa_open(const char *name, spa_t **spapp, void *tag) 903 { 904 return (spa_open_common(name, spapp, tag, NULL)); 905 } 906 907 /* 908 * Lookup the given spa_t, incrementing the inject count in the process, 909 * preventing it from being exported or destroyed. 910 */ 911 spa_t * 912 spa_inject_addref(char *name) 913 { 914 spa_t *spa; 915 916 mutex_enter(&spa_namespace_lock); 917 if ((spa = spa_lookup(name)) == NULL) { 918 mutex_exit(&spa_namespace_lock); 919 return (NULL); 920 } 921 spa->spa_inject_ref++; 922 mutex_exit(&spa_namespace_lock); 923 924 return (spa); 925 } 926 927 void 928 spa_inject_delref(spa_t *spa) 929 { 930 mutex_enter(&spa_namespace_lock); 931 spa->spa_inject_ref--; 932 mutex_exit(&spa_namespace_lock); 933 } 934 935 static void 936 spa_add_spares(spa_t *spa, nvlist_t *config) 937 { 938 nvlist_t **spares; 939 uint_t i, nspares; 940 nvlist_t *nvroot; 941 uint64_t guid; 942 vdev_stat_t *vs; 943 uint_t vsc; 944 uint64_t pool; 945 946 if (spa->spa_nspares == 0) 947 return; 948 949 VERIFY(nvlist_lookup_nvlist(config, 950 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 951 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 952 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 953 if (nspares != 0) { 954 VERIFY(nvlist_add_nvlist_array(nvroot, 955 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 956 VERIFY(nvlist_lookup_nvlist_array(nvroot, 957 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 958 959 /* 960 * Go through and find any spares which have since been 961 * repurposed as an active spare. If this is the case, update 962 * their status appropriately. 963 */ 964 for (i = 0; i < nspares; i++) { 965 VERIFY(nvlist_lookup_uint64(spares[i], 966 ZPOOL_CONFIG_GUID, &guid) == 0); 967 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 968 VERIFY(nvlist_lookup_uint64_array( 969 spares[i], ZPOOL_CONFIG_STATS, 970 (uint64_t **)&vs, &vsc) == 0); 971 vs->vs_state = VDEV_STATE_CANT_OPEN; 972 vs->vs_aux = VDEV_AUX_SPARED; 973 } 974 } 975 } 976 } 977 978 int 979 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 980 { 981 int error; 982 spa_t *spa; 983 984 *config = NULL; 985 error = spa_open_common(name, &spa, FTAG, config); 986 987 if (spa && *config != NULL) { 988 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 989 spa_get_errlog_size(spa)) == 0); 990 991 spa_add_spares(spa, *config); 992 } 993 994 /* 995 * We want to get the alternate root even for faulted pools, so we cheat 996 * and call spa_lookup() directly. 997 */ 998 if (altroot) { 999 if (spa == NULL) { 1000 mutex_enter(&spa_namespace_lock); 1001 spa = spa_lookup(name); 1002 if (spa) 1003 spa_altroot(spa, altroot, buflen); 1004 else 1005 altroot[0] = '\0'; 1006 spa = NULL; 1007 mutex_exit(&spa_namespace_lock); 1008 } else { 1009 spa_altroot(spa, altroot, buflen); 1010 } 1011 } 1012 1013 if (spa != NULL) 1014 spa_close(spa, FTAG); 1015 1016 return (error); 1017 } 1018 1019 /* 1020 * Validate that the 'spares' array is well formed. We must have an array of 1021 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1022 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1023 * as they are well-formed. 
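 * Each entry must parse as a leaf vdev; for a create or add it must also
 * open and label successfully (see the per-spare loop below). Roughly
 * speaking, a well-formed entry looks like a leaf from the regular vdev
 * tree (type, path, guid, and so on), though the exact set of pairs depends
 * on what userland or the label supplied.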
1024 */ 1025 static int 1026 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1027 { 1028 nvlist_t **spares; 1029 uint_t i, nspares; 1030 vdev_t *vd; 1031 int error; 1032 1033 /* 1034 * It's acceptable to have no spares specified. 1035 */ 1036 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1037 &spares, &nspares) != 0) 1038 return (0); 1039 1040 if (nspares == 0) 1041 return (EINVAL); 1042 1043 /* 1044 * Make sure the pool is formatted with a version that supports hot 1045 * spares. 1046 */ 1047 if (spa_version(spa) < ZFS_VERSION_SPARES) 1048 return (ENOTSUP); 1049 1050 /* 1051 * Set the pending spare list so we correctly handle device in-use 1052 * checking. 1053 */ 1054 spa->spa_pending_spares = spares; 1055 spa->spa_pending_nspares = nspares; 1056 1057 for (i = 0; i < nspares; i++) { 1058 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1059 mode)) != 0) 1060 goto out; 1061 1062 if (!vd->vdev_ops->vdev_op_leaf) { 1063 vdev_free(vd); 1064 error = EINVAL; 1065 goto out; 1066 } 1067 1068 vd->vdev_top = vd; 1069 1070 if ((error = vdev_open(vd)) == 0 && 1071 (error = vdev_label_init(vd, crtxg, 1072 VDEV_LABEL_SPARE)) == 0) { 1073 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1074 vd->vdev_guid) == 0); 1075 } 1076 1077 vdev_free(vd); 1078 1079 if (error && mode != VDEV_ALLOC_SPARE) 1080 goto out; 1081 else 1082 error = 0; 1083 } 1084 1085 out: 1086 spa->spa_pending_spares = NULL; 1087 spa->spa_pending_nspares = 0; 1088 return (error); 1089 } 1090 1091 /* 1092 * Pool Creation 1093 */ 1094 int 1095 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1096 { 1097 spa_t *spa; 1098 vdev_t *rvd; 1099 dsl_pool_t *dp; 1100 dmu_tx_t *tx; 1101 int c, error = 0; 1102 uint64_t txg = TXG_INITIAL; 1103 nvlist_t **spares; 1104 uint_t nspares; 1105 1106 /* 1107 * If this pool already exists, return failure. 1108 */ 1109 mutex_enter(&spa_namespace_lock); 1110 if (spa_lookup(pool) != NULL) { 1111 mutex_exit(&spa_namespace_lock); 1112 return (EEXIST); 1113 } 1114 1115 /* 1116 * Allocate a new spa_t structure. 1117 */ 1118 spa = spa_add(pool, altroot); 1119 spa_activate(spa); 1120 1121 spa->spa_uberblock.ub_txg = txg - 1; 1122 spa->spa_uberblock.ub_version = ZFS_VERSION; 1123 spa->spa_ubsync = spa->spa_uberblock; 1124 1125 /* 1126 * Create the root vdev. 1127 */ 1128 spa_config_enter(spa, RW_WRITER, FTAG); 1129 1130 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1131 1132 ASSERT(error != 0 || rvd != NULL); 1133 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1134 1135 if (error == 0 && rvd->vdev_children == 0) 1136 error = EINVAL; 1137 1138 if (error == 0 && 1139 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1140 (error = spa_validate_spares(spa, nvroot, txg, 1141 VDEV_ALLOC_ADD)) == 0) { 1142 for (c = 0; c < rvd->vdev_children; c++) 1143 vdev_init(rvd->vdev_child[c], txg); 1144 vdev_config_dirty(rvd); 1145 } 1146 1147 spa_config_exit(spa, FTAG); 1148 1149 if (error != 0) { 1150 spa_unload(spa); 1151 spa_deactivate(spa); 1152 spa_remove(spa); 1153 mutex_exit(&spa_namespace_lock); 1154 return (error); 1155 } 1156 1157 /* 1158 * Get the list of spares, if specified. 
1159 */ 1160 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1161 &spares, &nspares) == 0) { 1162 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1163 KM_SLEEP) == 0); 1164 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1165 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1166 spa_config_enter(spa, RW_WRITER, FTAG); 1167 spa_load_spares(spa); 1168 spa_config_exit(spa, FTAG); 1169 spa->spa_sync_spares = B_TRUE; 1170 } 1171 1172 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1173 spa->spa_meta_objset = dp->dp_meta_objset; 1174 1175 tx = dmu_tx_create_assigned(dp, txg); 1176 1177 /* 1178 * Create the pool config object. 1179 */ 1180 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1181 DMU_OT_PACKED_NVLIST, 1 << 14, 1182 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1183 1184 if (zap_add(spa->spa_meta_objset, 1185 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1186 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1187 cmn_err(CE_PANIC, "failed to add pool config"); 1188 } 1189 1190 /* Newly created pools are always deflated. */ 1191 spa->spa_deflate = TRUE; 1192 if (zap_add(spa->spa_meta_objset, 1193 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1194 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1195 cmn_err(CE_PANIC, "failed to add deflate"); 1196 } 1197 1198 /* 1199 * Create the deferred-free bplist object. Turn off compression 1200 * because sync-to-convergence takes longer if the blocksize 1201 * keeps changing. 1202 */ 1203 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1204 1 << 14, tx); 1205 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1206 ZIO_COMPRESS_OFF, tx); 1207 1208 if (zap_add(spa->spa_meta_objset, 1209 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1210 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1211 cmn_err(CE_PANIC, "failed to add bplist"); 1212 } 1213 1214 /* 1215 * Create the pool's history object. 1216 */ 1217 spa_history_create_obj(spa, tx); 1218 1219 dmu_tx_commit(tx); 1220 1221 spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1222 spa->spa_sync_on = B_TRUE; 1223 txg_sync_start(spa->spa_dsl_pool); 1224 1225 /* 1226 * We explicitly wait for the first transaction to complete so that our 1227 * bean counters are appropriately updated. 1228 */ 1229 txg_wait_synced(spa->spa_dsl_pool, txg); 1230 1231 spa_config_sync(); 1232 1233 mutex_exit(&spa_namespace_lock); 1234 1235 return (0); 1236 } 1237 1238 /* 1239 * Import the given pool into the system. We set up the necessary spa_t and 1240 * then call spa_load() to do the dirty work. 1241 */ 1242 int 1243 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1244 { 1245 spa_t *spa; 1246 int error; 1247 nvlist_t *nvroot; 1248 nvlist_t **spares; 1249 uint_t nspares; 1250 1251 if (!(spa_mode & FWRITE)) 1252 return (EROFS); 1253 1254 /* 1255 * If a pool with this name exists, return failure. 1256 */ 1257 mutex_enter(&spa_namespace_lock); 1258 if (spa_lookup(pool) != NULL) { 1259 mutex_exit(&spa_namespace_lock); 1260 return (EEXIST); 1261 } 1262 1263 /* 1264 * Create and initialize the spa structure. 1265 */ 1266 spa = spa_add(pool, altroot); 1267 spa_activate(spa); 1268 1269 /* 1270 * Pass off the heavy lifting to spa_load(). 1271 * Pass TRUE for mosconfig because the user-supplied config 1272 * is actually the one to trust when doing an import. 
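	 * (Contrast with spa_open_common(), which passes B_FALSE so that
	 * spa_load() re-reads the trusted copy of the config from the MOS.)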
1273 */ 1274 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1275 1276 spa_config_enter(spa, RW_WRITER, FTAG); 1277 /* 1278 * Toss any existing sparelist, as it doesn't have any validity anymore, 1279 * and conflicts with spa_has_spare(). 1280 */ 1281 if (spa->spa_sparelist) { 1282 nvlist_free(spa->spa_sparelist); 1283 spa->spa_sparelist = NULL; 1284 spa_load_spares(spa); 1285 } 1286 1287 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1288 &nvroot) == 0); 1289 if (error == 0) 1290 error = spa_validate_spares(spa, nvroot, -1ULL, 1291 VDEV_ALLOC_SPARE); 1292 spa_config_exit(spa, FTAG); 1293 1294 if (error != 0) { 1295 spa_unload(spa); 1296 spa_deactivate(spa); 1297 spa_remove(spa); 1298 mutex_exit(&spa_namespace_lock); 1299 return (error); 1300 } 1301 1302 /* 1303 * Override any spares as specified by the user, as these may have 1304 * correct device names/devids, etc. 1305 */ 1306 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1307 &spares, &nspares) == 0) { 1308 if (spa->spa_sparelist) 1309 VERIFY(nvlist_remove(spa->spa_sparelist, 1310 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1311 else 1312 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1313 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1314 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1315 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1316 spa_config_enter(spa, RW_WRITER, FTAG); 1317 spa_load_spares(spa); 1318 spa_config_exit(spa, FTAG); 1319 spa->spa_sync_spares = B_TRUE; 1320 } 1321 1322 /* 1323 * Update the config cache to include the newly-imported pool. 1324 */ 1325 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1326 1327 mutex_exit(&spa_namespace_lock); 1328 1329 /* 1330 * Resilver anything that's out of date. 1331 */ 1332 if (spa_mode & FWRITE) 1333 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1334 1335 return (0); 1336 } 1337 1338 /* 1339 * This (illegal) pool name is used when temporarily importing a spa_t in order 1340 * to get the vdev stats associated with the imported devices. 1341 */ 1342 #define TRYIMPORT_NAME "$import" 1343 1344 nvlist_t * 1345 spa_tryimport(nvlist_t *tryconfig) 1346 { 1347 nvlist_t *config = NULL; 1348 char *poolname; 1349 spa_t *spa; 1350 uint64_t state; 1351 1352 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1353 return (NULL); 1354 1355 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1356 return (NULL); 1357 1358 /* 1359 * Create and initialize the spa structure. 1360 */ 1361 mutex_enter(&spa_namespace_lock); 1362 spa = spa_add(TRYIMPORT_NAME, NULL); 1363 spa_activate(spa); 1364 1365 /* 1366 * Pass off the heavy lifting to spa_load(). 1367 * Pass TRUE for mosconfig because the user-supplied config 1368 * is actually the one to trust when doing an import. 1369 */ 1370 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1371 1372 /* 1373 * If 'tryconfig' was at least parsable, return the current config. 1374 */ 1375 if (spa->spa_root_vdev != NULL) { 1376 spa_config_enter(spa, RW_READER, FTAG); 1377 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1378 spa_config_exit(spa, FTAG); 1379 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1380 poolname) == 0); 1381 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1382 state) == 0); 1383 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1384 spa->spa_uberblock.ub_timestamp) == 0); 1385 1386 /* 1387 * Add the list of hot spares. 
1388 */ 1389 spa_add_spares(spa, config); 1390 } 1391 1392 spa_unload(spa); 1393 spa_deactivate(spa); 1394 spa_remove(spa); 1395 mutex_exit(&spa_namespace_lock); 1396 1397 return (config); 1398 } 1399 1400 /* 1401 * Pool export/destroy 1402 * 1403 * The act of destroying or exporting a pool is very simple. We make sure there 1404 * is no more pending I/O and any references to the pool are gone. Then, we 1405 * update the pool state and sync all the labels to disk, removing the 1406 * configuration from the cache afterwards. 1407 */ 1408 static int 1409 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1410 { 1411 spa_t *spa; 1412 1413 if (oldconfig) 1414 *oldconfig = NULL; 1415 1416 if (!(spa_mode & FWRITE)) 1417 return (EROFS); 1418 1419 mutex_enter(&spa_namespace_lock); 1420 if ((spa = spa_lookup(pool)) == NULL) { 1421 mutex_exit(&spa_namespace_lock); 1422 return (ENOENT); 1423 } 1424 1425 /* 1426 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1427 * reacquire the namespace lock, and see if we can export. 1428 */ 1429 spa_open_ref(spa, FTAG); 1430 mutex_exit(&spa_namespace_lock); 1431 spa_async_suspend(spa); 1432 mutex_enter(&spa_namespace_lock); 1433 spa_close(spa, FTAG); 1434 1435 /* 1436 * The pool will be in core if it's openable, 1437 * in which case we can modify its state. 1438 */ 1439 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1440 /* 1441 * Objsets may be open only because they're dirty, so we 1442 * have to force it to sync before checking spa_refcnt. 1443 */ 1444 spa_scrub_suspend(spa); 1445 txg_wait_synced(spa->spa_dsl_pool, 0); 1446 1447 /* 1448 * A pool cannot be exported or destroyed if there are active 1449 * references. If we are resetting a pool, allow references by 1450 * fault injection handlers. 1451 */ 1452 if (!spa_refcount_zero(spa) || 1453 (spa->spa_inject_ref != 0 && 1454 new_state != POOL_STATE_UNINITIALIZED)) { 1455 spa_scrub_resume(spa); 1456 spa_async_resume(spa); 1457 mutex_exit(&spa_namespace_lock); 1458 return (EBUSY); 1459 } 1460 1461 spa_scrub_resume(spa); 1462 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1463 1464 /* 1465 * We want this to be reflected on every label, 1466 * so mark them all dirty. spa_unload() will do the 1467 * final sync that pushes these changes out. 1468 */ 1469 if (new_state != POOL_STATE_UNINITIALIZED) { 1470 spa_config_enter(spa, RW_WRITER, FTAG); 1471 spa->spa_state = new_state; 1472 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1473 vdev_config_dirty(spa->spa_root_vdev); 1474 spa_config_exit(spa, FTAG); 1475 } 1476 } 1477 1478 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1479 spa_unload(spa); 1480 spa_deactivate(spa); 1481 } 1482 1483 if (oldconfig && spa->spa_config) 1484 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1485 1486 if (new_state != POOL_STATE_UNINITIALIZED) { 1487 spa_remove(spa); 1488 spa_config_sync(); 1489 } 1490 mutex_exit(&spa_namespace_lock); 1491 1492 return (0); 1493 } 1494 1495 /* 1496 * Destroy a storage pool. 1497 */ 1498 int 1499 spa_destroy(char *pool) 1500 { 1501 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1502 } 1503 1504 /* 1505 * Export a storage pool. 1506 */ 1507 int 1508 spa_export(char *pool, nvlist_t **oldconfig) 1509 { 1510 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1511 } 1512 1513 /* 1514 * Similar to spa_export(), this unloads the spa_t without actually removing it 1515 * from the namespace in any way. 
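 * The pool ends up back in the POOL_STATE_UNINITIALIZED state; references
 * held by fault injection handlers are tolerated while resetting.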
1516 */ 1517 int 1518 spa_reset(char *pool) 1519 { 1520 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1521 } 1522 1523 1524 /* 1525 * ========================================================================== 1526 * Device manipulation 1527 * ========================================================================== 1528 */ 1529 1530 /* 1531 * Add capacity to a storage pool. 1532 */ 1533 int 1534 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1535 { 1536 uint64_t txg; 1537 int c, error; 1538 vdev_t *rvd = spa->spa_root_vdev; 1539 vdev_t *vd, *tvd; 1540 nvlist_t **spares; 1541 uint_t i, nspares; 1542 1543 txg = spa_vdev_enter(spa); 1544 1545 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1546 VDEV_ALLOC_ADD)) != 0) 1547 return (spa_vdev_exit(spa, NULL, txg, error)); 1548 1549 spa->spa_pending_vdev = vd; 1550 1551 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1552 &spares, &nspares) != 0) 1553 nspares = 0; 1554 1555 if (vd->vdev_children == 0 && nspares == 0) { 1556 spa->spa_pending_vdev = NULL; 1557 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1558 } 1559 1560 if (vd->vdev_children != 0) { 1561 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1562 spa->spa_pending_vdev = NULL; 1563 return (spa_vdev_exit(spa, vd, txg, error)); 1564 } 1565 } 1566 1567 /* 1568 * We must validate the spares after checking the children. Otherwise, 1569 * vdev_inuse() will blindly overwrite the spare. 1570 */ 1571 if ((error = spa_validate_spares(spa, nvroot, txg, 1572 VDEV_ALLOC_ADD)) != 0) { 1573 spa->spa_pending_vdev = NULL; 1574 return (spa_vdev_exit(spa, vd, txg, error)); 1575 } 1576 1577 spa->spa_pending_vdev = NULL; 1578 1579 /* 1580 * Transfer each new top-level vdev from vd to rvd. 1581 */ 1582 for (c = 0; c < vd->vdev_children; c++) { 1583 tvd = vd->vdev_child[c]; 1584 vdev_remove_child(vd, tvd); 1585 tvd->vdev_id = rvd->vdev_children; 1586 vdev_add_child(rvd, tvd); 1587 vdev_config_dirty(tvd); 1588 } 1589 1590 if (nspares != 0) { 1591 if (spa->spa_sparelist != NULL) { 1592 nvlist_t **oldspares; 1593 uint_t oldnspares; 1594 nvlist_t **newspares; 1595 1596 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1597 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1598 1599 newspares = kmem_alloc(sizeof (void *) * 1600 (nspares + oldnspares), KM_SLEEP); 1601 for (i = 0; i < oldnspares; i++) 1602 VERIFY(nvlist_dup(oldspares[i], 1603 &newspares[i], KM_SLEEP) == 0); 1604 for (i = 0; i < nspares; i++) 1605 VERIFY(nvlist_dup(spares[i], 1606 &newspares[i + oldnspares], 1607 KM_SLEEP) == 0); 1608 1609 VERIFY(nvlist_remove(spa->spa_sparelist, 1610 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1611 1612 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1613 ZPOOL_CONFIG_SPARES, newspares, 1614 nspares + oldnspares) == 0); 1615 for (i = 0; i < oldnspares + nspares; i++) 1616 nvlist_free(newspares[i]); 1617 kmem_free(newspares, (oldnspares + nspares) * 1618 sizeof (void *)); 1619 } else { 1620 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1621 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1622 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1623 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1624 } 1625 1626 spa_load_spares(spa); 1627 spa->spa_sync_spares = B_TRUE; 1628 } 1629 1630 /* 1631 * We have to be careful when adding new vdevs to an existing pool. 
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate. Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror. The arguments are the path to any device
 * in the mirror, and the nvroot for the new device. If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare. Otherwise, we
		 * want to create a replacing vdev.
The user is not allowed to 1722 * attach to a spared vdev child unless the 'isspare' state is 1723 * the same (spare replaces spare, non-spare replaces 1724 * non-spare). 1725 */ 1726 if (pvd->vdev_ops == &vdev_replacing_ops) 1727 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1728 else if (pvd->vdev_ops == &vdev_spare_ops && 1729 newvd->vdev_isspare != oldvd->vdev_isspare) 1730 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1731 else if (pvd->vdev_ops != &vdev_spare_ops && 1732 newvd->vdev_isspare) 1733 pvops = &vdev_spare_ops; 1734 else 1735 pvops = &vdev_replacing_ops; 1736 } 1737 1738 /* 1739 * Compare the new device size with the replaceable/attachable 1740 * device size. 1741 */ 1742 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1743 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1744 1745 /* 1746 * The new device cannot have a higher alignment requirement 1747 * than the top-level vdev. 1748 */ 1749 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1750 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1751 1752 /* 1753 * If this is an in-place replacement, update oldvd's path and devid 1754 * to make it distinguishable from newvd, and unopenable from now on. 1755 */ 1756 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1757 spa_strfree(oldvd->vdev_path); 1758 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1759 KM_SLEEP); 1760 (void) sprintf(oldvd->vdev_path, "%s/%s", 1761 newvd->vdev_path, "old"); 1762 if (oldvd->vdev_devid != NULL) { 1763 spa_strfree(oldvd->vdev_devid); 1764 oldvd->vdev_devid = NULL; 1765 } 1766 } 1767 1768 /* 1769 * If the parent is not a mirror, or if we're replacing, insert the new 1770 * mirror/replacing/spare vdev above oldvd. 1771 */ 1772 if (pvd->vdev_ops != pvops) 1773 pvd = vdev_add_parent(oldvd, pvops); 1774 1775 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1776 ASSERT(pvd->vdev_ops == pvops); 1777 ASSERT(oldvd->vdev_parent == pvd); 1778 1779 /* 1780 * Extract the new device from its root and add it to pvd. 1781 */ 1782 vdev_remove_child(newrootvd, newvd); 1783 newvd->vdev_id = pvd->vdev_children; 1784 vdev_add_child(pvd, newvd); 1785 1786 /* 1787 * If newvd is smaller than oldvd, but larger than its rsize, 1788 * the addition of newvd may have decreased our parent's asize. 1789 */ 1790 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1791 1792 tvd = newvd->vdev_top; 1793 ASSERT(pvd->vdev_top == tvd); 1794 ASSERT(tvd->vdev_parent == rvd); 1795 1796 vdev_config_dirty(tvd); 1797 1798 /* 1799 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1800 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1801 */ 1802 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1803 1804 mutex_enter(&newvd->vdev_dtl_lock); 1805 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1806 open_txg - TXG_INITIAL + 1); 1807 mutex_exit(&newvd->vdev_dtl_lock); 1808 1809 if (newvd->vdev_isspare) 1810 spa_spare_activate(newvd); 1811 1812 /* 1813 * Mark newvd's DTL dirty in this txg. 1814 */ 1815 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1816 1817 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1818 1819 /* 1820 * Kick off a resilver to update newvd. 1821 */ 1822 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1823 1824 return (0); 1825 } 1826 1827 /* 1828 * Detach a device from a mirror or replacing vdev. 1829 * If 'replace_done' is specified, only detach if the parent 1830 * is a replacing vdev. 
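 * A 'spare' parent is also accepted; in that case either disk of the spare
 * pair may be detached.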
1831 */ 1832 int 1833 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1834 { 1835 uint64_t txg; 1836 int c, t, error; 1837 vdev_t *rvd = spa->spa_root_vdev; 1838 vdev_t *vd, *pvd, *cvd, *tvd; 1839 boolean_t unspare = B_FALSE; 1840 uint64_t unspare_guid; 1841 1842 txg = spa_vdev_enter(spa); 1843 1844 vd = vdev_lookup_by_guid(rvd, guid); 1845 1846 if (vd == NULL) 1847 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1848 1849 if (!vd->vdev_ops->vdev_op_leaf) 1850 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1851 1852 pvd = vd->vdev_parent; 1853 1854 /* 1855 * If replace_done is specified, only remove this device if it's 1856 * the first child of a replacing vdev. For the 'spare' vdev, either 1857 * disk can be removed. 1858 */ 1859 if (replace_done) { 1860 if (pvd->vdev_ops == &vdev_replacing_ops) { 1861 if (vd->vdev_id != 0) 1862 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1863 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1864 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1865 } 1866 } 1867 1868 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1869 spa_version(spa) >= ZFS_VERSION_SPARES); 1870 1871 /* 1872 * Only mirror, replacing, and spare vdevs support detach. 1873 */ 1874 if (pvd->vdev_ops != &vdev_replacing_ops && 1875 pvd->vdev_ops != &vdev_mirror_ops && 1876 pvd->vdev_ops != &vdev_spare_ops) 1877 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1878 1879 /* 1880 * If there's only one replica, you can't detach it. 1881 */ 1882 if (pvd->vdev_children <= 1) 1883 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1884 1885 /* 1886 * If all siblings have non-empty DTLs, this device may have the only 1887 * valid copy of the data, which means we cannot safely detach it. 1888 * 1889 * XXX -- as in the vdev_offline() case, we really want a more 1890 * precise DTL check. 1891 */ 1892 for (c = 0; c < pvd->vdev_children; c++) { 1893 uint64_t dirty; 1894 1895 cvd = pvd->vdev_child[c]; 1896 if (cvd == vd) 1897 continue; 1898 if (vdev_is_dead(cvd)) 1899 continue; 1900 mutex_enter(&cvd->vdev_dtl_lock); 1901 dirty = cvd->vdev_dtl_map.sm_space | 1902 cvd->vdev_dtl_scrub.sm_space; 1903 mutex_exit(&cvd->vdev_dtl_lock); 1904 if (!dirty) 1905 break; 1906 } 1907 1908 /* 1909 * If we are a replacing or spare vdev, then we can always detach the 1910 * latter child, as that is how one cancels the operation. 1911 */ 1912 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1913 c == pvd->vdev_children) 1914 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1915 1916 /* 1917 * If we are detaching the original disk from a spare, then it implies 1918 * that the spare should become a real disk, and be removed from the 1919 * active spare list for the pool. 1920 */ 1921 if (pvd->vdev_ops == &vdev_spare_ops && 1922 vd->vdev_id == 0) 1923 unspare = B_TRUE; 1924 1925 /* 1926 * Erase the disk labels so the disk can be used for other things. 1927 * This must be done after all other error cases are handled, 1928 * but before we disembowel vd (so we can still do I/O to it). 1929 * But if we can't do it, don't treat the error as fatal -- 1930 * it may be that the unwritability of the disk is the reason 1931 * it's being detached! 1932 */ 1933 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1934 1935 /* 1936 * Remove vd from its parent and compact the parent's children. 1937 */ 1938 vdev_remove_child(pvd, vd); 1939 vdev_compact_children(pvd); 1940 1941 /* 1942 * Remember one of the remaining children so we can get tvd below. 
1943 */ 1944 cvd = pvd->vdev_child[0]; 1945 1946 /* 1947 * If we need to remove the remaining child from the list of hot spares, 1948 * do it now, marking the vdev as no longer a spare in the process. We 1949 * must do this before vdev_remove_parent(), because that can change the 1950 * GUID if it creates a new toplevel GUID. 1951 */ 1952 if (unspare) { 1953 ASSERT(cvd->vdev_isspare); 1954 spa_spare_remove(cvd); 1955 unspare_guid = cvd->vdev_guid; 1956 } 1957 1958 /* 1959 * If the parent mirror/replacing vdev only has one child, 1960 * the parent is no longer needed. Remove it from the tree. 1961 */ 1962 if (pvd->vdev_children == 1) 1963 vdev_remove_parent(cvd); 1964 1965 /* 1966 * We don't set tvd until now because the parent we just removed 1967 * may have been the previous top-level vdev. 1968 */ 1969 tvd = cvd->vdev_top; 1970 ASSERT(tvd->vdev_parent == rvd); 1971 1972 /* 1973 * Reevaluate the parent vdev state. 1974 */ 1975 vdev_propagate_state(cvd->vdev_parent); 1976 1977 /* 1978 * If the device we just detached was smaller than the others, it may be 1979 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1980 * can't fail because the existing metaslabs are already in core, so 1981 * there's nothing to read from disk. 1982 */ 1983 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1984 1985 vdev_config_dirty(tvd); 1986 1987 /* 1988 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 1989 * vd->vdev_detached is set and free vd's DTL object in syncing context. 1990 * But first make sure we're not on any *other* txg's DTL list, to 1991 * prevent vd from being accessed after it's freed. 1992 */ 1993 for (t = 0; t < TXG_SIZE; t++) 1994 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1995 vd->vdev_detached = B_TRUE; 1996 vdev_dirty(tvd, VDD_DTL, vd, txg); 1997 1998 error = spa_vdev_exit(spa, vd, txg, 0); 1999 2000 /* 2001 * If this was the removal of the original device in a hot spare vdev, 2002 * then we want to go through and remove the device from the hot spare 2003 * list of every other pool. 2004 */ 2005 if (unspare) { 2006 spa = NULL; 2007 mutex_enter(&spa_namespace_lock); 2008 while ((spa = spa_next(spa)) != NULL) { 2009 if (spa->spa_state != POOL_STATE_ACTIVE) 2010 continue; 2011 2012 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2013 } 2014 mutex_exit(&spa_namespace_lock); 2015 } 2016 2017 return (error); 2018 } 2019 2020 /* 2021 * Remove a device from the pool. Currently, this supports removing only hot 2022 * spares. 2023 */ 2024 int 2025 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2026 { 2027 vdev_t *vd; 2028 nvlist_t **spares, *nv, **newspares; 2029 uint_t i, j, nspares; 2030 int ret = 0; 2031 2032 spa_config_enter(spa, RW_WRITER, FTAG); 2033 2034 vd = spa_lookup_by_guid(spa, guid); 2035 2036 nv = NULL; 2037 if (spa->spa_spares != NULL && 2038 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2039 &spares, &nspares) == 0) { 2040 for (i = 0; i < nspares; i++) { 2041 uint64_t theguid; 2042 2043 VERIFY(nvlist_lookup_uint64(spares[i], 2044 ZPOOL_CONFIG_GUID, &theguid) == 0); 2045 if (theguid == guid) { 2046 nv = spares[i]; 2047 break; 2048 } 2049 } 2050 } 2051 2052 /* 2053 * We only support removing a hot spare, and only if it's not currently 2054 * in use in this pool. 
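	 * The checks below distinguish an unknown guid (ENOENT), a device
	 * that isn't a spare (ENOTSUP), and a spare that is currently spared
	 * in (EBUSY).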
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, so we can detach it.
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		pvd = vd->vdev_parent;
		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(pvd->vdev_parent->vdev_children == 2);
			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		/*
		 * Determine if this is a reference to a hot spare.  In that
		 * case, update the path as stored in the spare list.
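		 * (A hot spare that is not attached anywhere has no leaf vdev
		 * in this pool's vdev tree, which is presumably why the lookup
		 * above failed; its path lives only in the spa_sparelist
		 * nvlist.)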
		 */
		nvlist_t **spares;
		uint_t i, nspares;
		if (spa->spa_sparelist != NULL) {
			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
			for (i = 0; i < nspares; i++) {
				uint64_t theguid;
				VERIFY(nvlist_lookup_uint64(spares[i],
				    ZPOOL_CONFIG_GUID, &theguid) == 0);
				if (theguid == guid)
					break;
			}

			if (i == nspares)
				return (spa_vdev_exit(spa, NULL, txg, ENOENT));

			VERIFY(nvlist_add_string(spares[i],
			    ZPOOL_CONFIG_PATH, newpath) == 0);
			spa_load_spares(spa);
			spa->spa_sync_spares = B_TRUE;
			return (spa_vdev_exit(spa, NULL, txg, 0));
		} else {
			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
		cv_broadcast(&spa->spa_scrub_io_cv);

	ASSERT(spa->spa_scrub_inflight >= 0);

	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data;

	mutex_enter(&spa->spa_scrub_lock);
	/*
	 * Do not give too much work to vdev(s).
	 */
	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	data = zio_data_buf_alloc(size);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
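		 * (The error is counted both pool-wide and against the root
		 * vdev's stats so that zpool(1M) status can report it.)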
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ?
"resilver" : "scrub", 2368 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2369 2370 spa_config_enter(spa, RW_WRITER, FTAG); 2371 vdev_reopen(rvd); /* purge all vdev caches */ 2372 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2373 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2374 spa_config_exit(spa, FTAG); 2375 2376 mutex_enter(&spa->spa_scrub_lock); 2377 spa->spa_scrub_errors = 0; 2378 spa->spa_scrub_active = 1; 2379 ASSERT(spa->spa_scrub_inflight == 0); 2380 2381 while (!spa->spa_scrub_stop) { 2382 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2383 while (spa->spa_scrub_suspended) { 2384 spa->spa_scrub_active = 0; 2385 cv_broadcast(&spa->spa_scrub_cv); 2386 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2387 spa->spa_scrub_active = 1; 2388 } 2389 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2390 2391 if (spa->spa_scrub_restart_txg != 0) 2392 break; 2393 2394 mutex_exit(&spa->spa_scrub_lock); 2395 error = traverse_more(th); 2396 mutex_enter(&spa->spa_scrub_lock); 2397 if (error != EAGAIN) 2398 break; 2399 } 2400 2401 while (spa->spa_scrub_inflight) 2402 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2403 2404 spa->spa_scrub_active = 0; 2405 cv_broadcast(&spa->spa_scrub_cv); 2406 2407 mutex_exit(&spa->spa_scrub_lock); 2408 2409 spa_config_enter(spa, RW_WRITER, FTAG); 2410 2411 mutex_enter(&spa->spa_scrub_lock); 2412 2413 /* 2414 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2415 * AND the spa config lock to synchronize with any config changes 2416 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2417 */ 2418 if (spa->spa_scrub_restart_txg != 0) 2419 error = ERESTART; 2420 2421 if (spa->spa_scrub_stop) 2422 error = EINTR; 2423 2424 /* 2425 * Even if there were uncorrectable errors, we consider the scrub 2426 * completed. The downside is that if there is a transient error during 2427 * a resilver, we won't resilver the data properly to the target. But 2428 * if the damage is permanent (more likely) we will resilver forever, 2429 * which isn't really acceptable. Since there is enough information for 2430 * the user to know what has failed and why, this seems like a more 2431 * tractable approach. 2432 */ 2433 complete = (error == 0); 2434 2435 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2436 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2437 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2438 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2439 2440 mutex_exit(&spa->spa_scrub_lock); 2441 2442 /* 2443 * If the scrub/resilver completed, update all DTLs to reflect this. 2444 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2445 */ 2446 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2447 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2448 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2449 spa_errlog_rotate(spa); 2450 2451 spa_config_exit(spa, FTAG); 2452 2453 mutex_enter(&spa->spa_scrub_lock); 2454 2455 /* 2456 * We may have finished replacing a device. 2457 * Let the async thread assess this and handle the detach. 2458 */ 2459 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2460 2461 /* 2462 * If we were told to restart, our final act is to start a new scrub. 2463 */ 2464 if (error == ERESTART) 2465 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
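		 *
		 * For example, if the first DTL segment starts at txg 37, the
		 * last one ends at txg 52, and spa_last_synced_txg() is 50,
		 * then mintxg = 36 and maxtxg = MIN(52, 51) = 51, so the
		 * traversal covers txgs 37 through 50.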
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
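	 * (spa_async_suspend() sleeps on spa_async_cv until spa_async_thread
	 * is NULL, so both the store and the broadcast below matter.)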
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *nvroot;
	nvlist_t **spares;
	int i;

	if (!spa->spa_sync_spares)
		return;

	/*
	 * Update the MOS nvlist describing the list of available spares.
	 * spa_validate_spares() will have already made sure this nvlist is
	 * valid and the vdevs are labelled appropriately.
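	 * The list is stored in the MOS as a packed XDR nvlist;
	 * spa_sync_nvlist() rewrites the packed contents and records the
	 * packed size in the object's bonus buffer.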
	 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
	nvlist_free(nvroot);

	spa->spa_sync_spares = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

static void
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	nvlist_t *nvp = arg2;
	nvpair_t *nvpair;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zapobj;

	mutex_enter(&spa->spa_props_lock);
	if (spa->spa_pool_props_object == 0) {
		zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
		VERIFY(zapobj > 0);

		spa->spa_pool_props_object = zapobj;

		VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_PROPS, 8, 1,
		    &spa->spa_pool_props_object, tx) == 0);
	}
	mutex_exit(&spa->spa_props_lock);

	nvpair = NULL;
	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
		case ZFS_PROP_BOOTFS:
			VERIFY(nvlist_lookup_uint64(nvp,
			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
			VERIFY(zap_update(mos,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
			    &spa->spa_bootfs, tx) == 0);
			break;
		}
	}
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
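	 * The loop below only sets spa_deflate (and records DMU_POOL_DEFLATE
	 * in the MOS directory) when every top-level vdev reports a deflate
	 * ratio of SPA_MINBLOCKSIZE.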
	 */
	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
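	 * The new version reaches disk via the dirtied vdev configuration;
	 * the txg_wait_synced() below ensures it is on stable storage before
	 * we return.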
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

int
spa_set_props(spa_t *spa, nvlist_t *nvp)
{
	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

int
spa_get_props(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	zfs_source_t src;
	zfs_prop_t prop;
	nvlist_t *propval;
	uint64_t value;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);
	/* If no props object, then just return empty nvlist */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {

		if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
			continue;

		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		switch (za.za_integer_length) {
		case 8:
			if (zfs_prop_default_numeric(prop) ==
			    za.za_first_integer)
				src = ZFS_SRC_DEFAULT;
			else
				src = ZFS_SRC_LOCAL;
			value = za.za_first_integer;

			if (prop == ZFS_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;
				char strval[MAXPATHLEN];

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if ((err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) != 0) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);

				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_string(propval,
				    ZFS_PROP_VALUE, strval) == 0);
			} else {
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_VALUE, value) == 0);
			}
			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
			    propval) == 0);
			break;
		}
		nvlist_free(propval);
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}