1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/dmu_objset.h> 52 #include <sys/unique.h> 53 #include <sys/dsl_pool.h> 54 #include <sys/dsl_dataset.h> 55 #include <sys/dsl_dir.h> 56 #include <sys/dsl_prop.h> 57 #include <sys/dsl_synctask.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/callb.h> 60 #include <sys/systeminfo.h> 61 #include <sys/sunddi.h> 62 63 int zio_taskq_threads = 8; 64 65 /* 66 * ========================================================================== 67 * SPA state manipulation (open/create/destroy/import/export) 68 * ========================================================================== 69 */ 70 71 static int 72 spa_error_entry_compare(const void *a, const void *b) 73 { 74 spa_error_entry_t *sa = (spa_error_entry_t *)a; 75 spa_error_entry_t *sb = (spa_error_entry_t *)b; 76 int ret; 77 78 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 79 sizeof (zbookmark_t)); 80 81 if (ret < 0) 82 return (-1); 83 else if (ret > 0) 84 return (1); 85 else 86 return (0); 87 } 88 89 /* 90 * Utility function which retrieves copies of the current logs and 91 * re-initializes them in the process. 92 */ 93 void 94 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 95 { 96 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 97 98 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 99 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 100 101 avl_create(&spa->spa_errlist_scrub, 102 spa_error_entry_compare, sizeof (spa_error_entry_t), 103 offsetof(spa_error_entry_t, se_avl)); 104 avl_create(&spa->spa_errlist_last, 105 spa_error_entry_compare, sizeof (spa_error_entry_t), 106 offsetof(spa_error_entry_t, se_avl)); 107 } 108 109 /* 110 * Activate an uninitialized pool. 
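 * This sets up the in-core machinery needed to do I/O: the metaslab class,
 * the per-zio-type issue and intr taskqs, the various locks, the dirty vdev
 * list, and the error-list AVL trees.  spa_deactivate() tears all of this
 * down again.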
111 */ 112 static void 113 spa_activate(spa_t *spa) 114 { 115 int t; 116 117 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 118 119 spa->spa_state = POOL_STATE_ACTIVE; 120 121 spa->spa_normal_class = metaslab_class_create(); 122 123 for (t = 0; t < ZIO_TYPES; t++) { 124 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 125 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 126 TASKQ_PREPOPULATE); 127 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 128 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 129 TASKQ_PREPOPULATE); 130 } 131 132 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 133 134 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 135 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 136 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 137 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 140 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 141 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 142 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 143 144 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 145 offsetof(vdev_t, vdev_dirty_node)); 146 147 txg_list_create(&spa->spa_vdev_txg_list, 148 offsetof(struct vdev, vdev_txg_node)); 149 150 avl_create(&spa->spa_errlist_scrub, 151 spa_error_entry_compare, sizeof (spa_error_entry_t), 152 offsetof(spa_error_entry_t, se_avl)); 153 avl_create(&spa->spa_errlist_last, 154 spa_error_entry_compare, sizeof (spa_error_entry_t), 155 offsetof(spa_error_entry_t, se_avl)); 156 } 157 158 /* 159 * Opposite of spa_activate(). 160 */ 161 static void 162 spa_deactivate(spa_t *spa) 163 { 164 int t; 165 166 ASSERT(spa->spa_sync_on == B_FALSE); 167 ASSERT(spa->spa_dsl_pool == NULL); 168 ASSERT(spa->spa_root_vdev == NULL); 169 170 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 171 172 txg_list_destroy(&spa->spa_vdev_txg_list); 173 174 list_destroy(&spa->spa_dirty_list); 175 176 rw_destroy(&spa->spa_traverse_lock); 177 178 for (t = 0; t < ZIO_TYPES; t++) { 179 taskq_destroy(spa->spa_zio_issue_taskq[t]); 180 taskq_destroy(spa->spa_zio_intr_taskq[t]); 181 spa->spa_zio_issue_taskq[t] = NULL; 182 spa->spa_zio_intr_taskq[t] = NULL; 183 } 184 185 metaslab_class_destroy(spa->spa_normal_class); 186 spa->spa_normal_class = NULL; 187 188 /* 189 * If this was part of an import or the open otherwise failed, we may 190 * still have errors left in the queues. Empty them just in case. 191 */ 192 spa_errlog_drain(spa); 193 194 avl_destroy(&spa->spa_errlist_scrub); 195 avl_destroy(&spa->spa_errlist_last); 196 197 spa->spa_state = POOL_STATE_UNINITIALIZED; 198 } 199 200 /* 201 * Verify a pool configuration, and construct the vdev tree appropriately. This 202 * will create all the necessary vdevs in the appropriate layout, with each vdev 203 * in the CLOSED state. This will prep the pool before open/creation/import. 204 * All vdev validation is done by the vdev_alloc() routine. 
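 * The parse recurses over the ZPOOL_CONFIG_CHILDREN array; if any child
 * fails to parse, the partially built tree is freed and *vdp is set to NULL.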
205 */ 206 static int 207 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 208 uint_t id, int atype) 209 { 210 nvlist_t **child; 211 uint_t c, children; 212 int error; 213 214 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 215 return (error); 216 217 if ((*vdp)->vdev_ops->vdev_op_leaf) 218 return (0); 219 220 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 221 &child, &children) != 0) { 222 vdev_free(*vdp); 223 *vdp = NULL; 224 return (EINVAL); 225 } 226 227 for (c = 0; c < children; c++) { 228 vdev_t *vd; 229 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 230 atype)) != 0) { 231 vdev_free(*vdp); 232 *vdp = NULL; 233 return (error); 234 } 235 } 236 237 ASSERT(*vdp != NULL); 238 239 return (0); 240 } 241 242 /* 243 * Opposite of spa_load(). 244 */ 245 static void 246 spa_unload(spa_t *spa) 247 { 248 int i; 249 250 /* 251 * Stop async tasks. 252 */ 253 spa_async_suspend(spa); 254 255 /* 256 * Stop syncing. 257 */ 258 if (spa->spa_sync_on) { 259 txg_sync_stop(spa->spa_dsl_pool); 260 spa->spa_sync_on = B_FALSE; 261 } 262 263 /* 264 * Wait for any outstanding prefetch I/O to complete. 265 */ 266 spa_config_enter(spa, RW_WRITER, FTAG); 267 spa_config_exit(spa, FTAG); 268 269 /* 270 * Close the dsl pool. 271 */ 272 if (spa->spa_dsl_pool) { 273 dsl_pool_close(spa->spa_dsl_pool); 274 spa->spa_dsl_pool = NULL; 275 } 276 277 /* 278 * Close all vdevs. 279 */ 280 if (spa->spa_root_vdev) 281 vdev_free(spa->spa_root_vdev); 282 ASSERT(spa->spa_root_vdev == NULL); 283 284 for (i = 0; i < spa->spa_nspares; i++) 285 vdev_free(spa->spa_spares[i]); 286 if (spa->spa_spares) { 287 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 288 spa->spa_spares = NULL; 289 } 290 if (spa->spa_sparelist) { 291 nvlist_free(spa->spa_sparelist); 292 spa->spa_sparelist = NULL; 293 } 294 295 spa->spa_async_suspended = 0; 296 } 297 298 /* 299 * Load (or re-load) the current list of vdevs describing the active spares for 300 * this pool. When this is called, we have some form of basic information in 301 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 302 * re-generate a more complete list including status information. 303 */ 304 static void 305 spa_load_spares(spa_t *spa) 306 { 307 nvlist_t **spares; 308 uint_t nspares; 309 int i; 310 vdev_t *vd, *tvd; 311 312 /* 313 * First, close and free any existing spare vdevs. 314 */ 315 for (i = 0; i < spa->spa_nspares; i++) { 316 vd = spa->spa_spares[i]; 317 318 /* Undo the call to spa_activate() below */ 319 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 320 tvd->vdev_isspare) 321 spa_spare_remove(tvd); 322 vdev_close(vd); 323 vdev_free(vd); 324 } 325 326 if (spa->spa_spares) 327 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 328 329 if (spa->spa_sparelist == NULL) 330 nspares = 0; 331 else 332 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 333 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 334 335 spa->spa_nspares = (int)nspares; 336 spa->spa_spares = NULL; 337 338 if (nspares == 0) 339 return; 340 341 /* 342 * Construct the array of vdevs, opening them to get status in the 343 * process. For each spare, there is potentially two different vdev_t 344 * structures associated with it: one in the list of spares (used only 345 * for basic validation purposes) and one in the active vdev 346 * configuration (if it's spared in). During this phase we open and 347 * validate each vdev on the spare list. 
If the vdev also exists in the 348 * active configuration, then we also mark this vdev as an active spare. 349 */ 350 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 351 for (i = 0; i < spa->spa_nspares; i++) { 352 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 353 VDEV_ALLOC_SPARE) == 0); 354 ASSERT(vd != NULL); 355 356 spa->spa_spares[i] = vd; 357 358 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 359 if (!tvd->vdev_isspare) 360 spa_spare_add(tvd); 361 362 /* 363 * We only mark the spare active if we were successfully 364 * able to load the vdev. Otherwise, importing a pool 365 * with a bad active spare would result in strange 366 * behavior, because multiple pool would think the spare 367 * is actively in use. 368 * 369 * There is a vulnerability here to an equally bizarre 370 * circumstance, where a dead active spare is later 371 * brought back to life (onlined or otherwise). Given 372 * the rarity of this scenario, and the extra complexity 373 * it adds, we ignore the possibility. 374 */ 375 if (!vdev_is_dead(tvd)) 376 spa_spare_activate(tvd); 377 } 378 379 if (vdev_open(vd) != 0) 380 continue; 381 382 vd->vdev_top = vd; 383 (void) vdev_validate_spare(vd); 384 } 385 386 /* 387 * Recompute the stashed list of spares, with status information 388 * this time. 389 */ 390 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 391 DATA_TYPE_NVLIST_ARRAY) == 0); 392 393 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 394 for (i = 0; i < spa->spa_nspares; i++) 395 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 396 B_TRUE, B_TRUE); 397 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 398 spares, spa->spa_nspares) == 0); 399 for (i = 0; i < spa->spa_nspares; i++) 400 nvlist_free(spares[i]); 401 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 402 } 403 404 static int 405 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 406 { 407 dmu_buf_t *db; 408 char *packed = NULL; 409 size_t nvsize = 0; 410 int error; 411 *value = NULL; 412 413 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 414 nvsize = *(uint64_t *)db->db_data; 415 dmu_buf_rele(db, FTAG); 416 417 packed = kmem_alloc(nvsize, KM_SLEEP); 418 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 419 if (error == 0) 420 error = nvlist_unpack(packed, nvsize, value, 0); 421 kmem_free(packed, nvsize); 422 423 return (error); 424 } 425 426 /* 427 * Load an existing storage pool, using the pool's builtin spa_config as a 428 * source of configuration information. 429 */ 430 static int 431 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 432 { 433 int error = 0; 434 nvlist_t *nvroot = NULL; 435 vdev_t *rvd; 436 uberblock_t *ub = &spa->spa_uberblock; 437 uint64_t config_cache_txg = spa->spa_config_txg; 438 uint64_t pool_guid; 439 uint64_t version; 440 zio_t *zio; 441 442 spa->spa_load_state = state; 443 444 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 445 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 446 error = EINVAL; 447 goto out; 448 } 449 450 /* 451 * Versioning wasn't explicitly added to the label until later, so if 452 * it's not present treat it as the initial version. 
453 */ 454 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 455 version = ZFS_VERSION_INITIAL; 456 457 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 458 &spa->spa_config_txg); 459 460 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 461 spa_guid_exists(pool_guid, 0)) { 462 error = EEXIST; 463 goto out; 464 } 465 466 spa->spa_load_guid = pool_guid; 467 468 /* 469 * Parse the configuration into a vdev tree. We explicitly set the 470 * value that will be returned by spa_version() since parsing the 471 * configuration requires knowing the version number. 472 */ 473 spa_config_enter(spa, RW_WRITER, FTAG); 474 spa->spa_ubsync.ub_version = version; 475 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 476 spa_config_exit(spa, FTAG); 477 478 if (error != 0) 479 goto out; 480 481 ASSERT(spa->spa_root_vdev == rvd); 482 ASSERT(spa_guid(spa) == pool_guid); 483 484 /* 485 * Try to open all vdevs, loading each label in the process. 486 */ 487 if (vdev_open(rvd) != 0) { 488 error = ENXIO; 489 goto out; 490 } 491 492 /* 493 * Validate the labels for all leaf vdevs. We need to grab the config 494 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 495 * flag. 496 */ 497 spa_config_enter(spa, RW_READER, FTAG); 498 error = vdev_validate(rvd); 499 spa_config_exit(spa, FTAG); 500 501 if (error != 0) { 502 error = EBADF; 503 goto out; 504 } 505 506 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 507 error = ENXIO; 508 goto out; 509 } 510 511 /* 512 * Find the best uberblock. 513 */ 514 bzero(ub, sizeof (uberblock_t)); 515 516 zio = zio_root(spa, NULL, NULL, 517 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 518 vdev_uberblock_load(zio, rvd, ub); 519 error = zio_wait(zio); 520 521 /* 522 * If we weren't able to find a single valid uberblock, return failure. 523 */ 524 if (ub->ub_txg == 0) { 525 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 526 VDEV_AUX_CORRUPT_DATA); 527 error = ENXIO; 528 goto out; 529 } 530 531 /* 532 * If the pool is newer than the code, we can't open it. 533 */ 534 if (ub->ub_version > ZFS_VERSION) { 535 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 536 VDEV_AUX_VERSION_NEWER); 537 error = ENOTSUP; 538 goto out; 539 } 540 541 /* 542 * If the vdev guid sum doesn't match the uberblock, we have an 543 * incomplete configuration. 544 */ 545 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 546 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 547 VDEV_AUX_BAD_GUID_SUM); 548 error = ENXIO; 549 goto out; 550 } 551 552 /* 553 * Initialize internal SPA structures. 
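	 * Once dsl_pool_open() succeeds, the MOS is reachable through
	 * spa_meta_objset and the remaining pool objects are looked up in it.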
554 */ 555 spa->spa_state = POOL_STATE_ACTIVE; 556 spa->spa_ubsync = spa->spa_uberblock; 557 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 558 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 559 if (error) { 560 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 561 VDEV_AUX_CORRUPT_DATA); 562 goto out; 563 } 564 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 565 566 if (zap_lookup(spa->spa_meta_objset, 567 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 568 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 569 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 570 VDEV_AUX_CORRUPT_DATA); 571 error = EIO; 572 goto out; 573 } 574 575 if (!mosconfig) { 576 nvlist_t *newconfig; 577 uint64_t hostid; 578 579 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 580 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 581 VDEV_AUX_CORRUPT_DATA); 582 error = EIO; 583 goto out; 584 } 585 586 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 587 &hostid) == 0) { 588 char *hostname; 589 unsigned long myhostid = 0; 590 591 VERIFY(nvlist_lookup_string(newconfig, 592 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 593 594 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 595 if ((unsigned long)hostid != myhostid) { 596 cmn_err(CE_WARN, "pool '%s' could not be " 597 "loaded as it was last accessed by " 598 "another system (host: %s hostid: 0x%lx). " 599 "See: http://www.sun.com/msg/ZFS-8000-EY", 600 spa->spa_name, hostname, 601 (unsigned long)hostid); 602 error = EBADF; 603 goto out; 604 } 605 } 606 607 spa_config_set(spa, newconfig); 608 spa_unload(spa); 609 spa_deactivate(spa); 610 spa_activate(spa); 611 612 return (spa_load(spa, newconfig, state, B_TRUE)); 613 } 614 615 if (zap_lookup(spa->spa_meta_objset, 616 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 617 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 618 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 619 VDEV_AUX_CORRUPT_DATA); 620 error = EIO; 621 goto out; 622 } 623 624 /* 625 * Load the bit that tells us to use the new accounting function 626 * (raid-z deflation). If we have an older pool, this will not 627 * be present. 628 */ 629 error = zap_lookup(spa->spa_meta_objset, 630 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 631 sizeof (uint64_t), 1, &spa->spa_deflate); 632 if (error != 0 && error != ENOENT) { 633 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 634 VDEV_AUX_CORRUPT_DATA); 635 error = EIO; 636 goto out; 637 } 638 639 /* 640 * Load the persistent error log. If we have an older pool, this will 641 * not be present. 642 */ 643 error = zap_lookup(spa->spa_meta_objset, 644 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 645 sizeof (uint64_t), 1, &spa->spa_errlog_last); 646 if (error != 0 && error != ENOENT) { 647 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 648 VDEV_AUX_CORRUPT_DATA); 649 error = EIO; 650 goto out; 651 } 652 653 error = zap_lookup(spa->spa_meta_objset, 654 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 655 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 656 if (error != 0 && error != ENOENT) { 657 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 658 VDEV_AUX_CORRUPT_DATA); 659 error = EIO; 660 goto out; 661 } 662 663 /* 664 * Load the history object. If we have an older pool, this 665 * will not be present. 
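	 * As with the error logs above, ENOENT is expected for older pools
	 * and is not treated as a failure.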
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
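		 * The update itself happens later, when the
		 * SPA_ASYNC_CONFIG_UPDATE request posted below is serviced.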
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
897 */ 898 if (loaded && (spa_mode & FWRITE)) 899 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 900 901 return (0); 902 } 903 904 int 905 spa_open(const char *name, spa_t **spapp, void *tag) 906 { 907 return (spa_open_common(name, spapp, tag, NULL)); 908 } 909 910 /* 911 * Lookup the given spa_t, incrementing the inject count in the process, 912 * preventing it from being exported or destroyed. 913 */ 914 spa_t * 915 spa_inject_addref(char *name) 916 { 917 spa_t *spa; 918 919 mutex_enter(&spa_namespace_lock); 920 if ((spa = spa_lookup(name)) == NULL) { 921 mutex_exit(&spa_namespace_lock); 922 return (NULL); 923 } 924 spa->spa_inject_ref++; 925 mutex_exit(&spa_namespace_lock); 926 927 return (spa); 928 } 929 930 void 931 spa_inject_delref(spa_t *spa) 932 { 933 mutex_enter(&spa_namespace_lock); 934 spa->spa_inject_ref--; 935 mutex_exit(&spa_namespace_lock); 936 } 937 938 static void 939 spa_add_spares(spa_t *spa, nvlist_t *config) 940 { 941 nvlist_t **spares; 942 uint_t i, nspares; 943 nvlist_t *nvroot; 944 uint64_t guid; 945 vdev_stat_t *vs; 946 uint_t vsc; 947 uint64_t pool; 948 949 if (spa->spa_nspares == 0) 950 return; 951 952 VERIFY(nvlist_lookup_nvlist(config, 953 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 954 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 955 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 956 if (nspares != 0) { 957 VERIFY(nvlist_add_nvlist_array(nvroot, 958 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 959 VERIFY(nvlist_lookup_nvlist_array(nvroot, 960 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 961 962 /* 963 * Go through and find any spares which have since been 964 * repurposed as an active spare. If this is the case, update 965 * their status appropriately. 966 */ 967 for (i = 0; i < nspares; i++) { 968 VERIFY(nvlist_lookup_uint64(spares[i], 969 ZPOOL_CONFIG_GUID, &guid) == 0); 970 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 971 VERIFY(nvlist_lookup_uint64_array( 972 spares[i], ZPOOL_CONFIG_STATS, 973 (uint64_t **)&vs, &vsc) == 0); 974 vs->vs_state = VDEV_STATE_CANT_OPEN; 975 vs->vs_aux = VDEV_AUX_SPARED; 976 } 977 } 978 } 979 } 980 981 int 982 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 983 { 984 int error; 985 spa_t *spa; 986 987 *config = NULL; 988 error = spa_open_common(name, &spa, FTAG, config); 989 990 if (spa && *config != NULL) { 991 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 992 spa_get_errlog_size(spa)) == 0); 993 994 spa_add_spares(spa, *config); 995 } 996 997 /* 998 * We want to get the alternate root even for faulted pools, so we cheat 999 * and call spa_lookup() directly. 1000 */ 1001 if (altroot) { 1002 if (spa == NULL) { 1003 mutex_enter(&spa_namespace_lock); 1004 spa = spa_lookup(name); 1005 if (spa) 1006 spa_altroot(spa, altroot, buflen); 1007 else 1008 altroot[0] = '\0'; 1009 spa = NULL; 1010 mutex_exit(&spa_namespace_lock); 1011 } else { 1012 spa_altroot(spa, altroot, buflen); 1013 } 1014 } 1015 1016 if (spa != NULL) 1017 spa_close(spa, FTAG); 1018 1019 return (error); 1020 } 1021 1022 /* 1023 * Validate that the 'spares' array is well formed. We must have an array of 1024 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1025 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1026 * as they are well-formed. 
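 * For any other mode (e.g. VDEV_ALLOC_ADD), a failure to open or label a
 * spare is fatal to the whole operation.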
1027 */ 1028 static int 1029 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1030 { 1031 nvlist_t **spares; 1032 uint_t i, nspares; 1033 vdev_t *vd; 1034 int error; 1035 1036 /* 1037 * It's acceptable to have no spares specified. 1038 */ 1039 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1040 &spares, &nspares) != 0) 1041 return (0); 1042 1043 if (nspares == 0) 1044 return (EINVAL); 1045 1046 /* 1047 * Make sure the pool is formatted with a version that supports hot 1048 * spares. 1049 */ 1050 if (spa_version(spa) < ZFS_VERSION_SPARES) 1051 return (ENOTSUP); 1052 1053 /* 1054 * Set the pending spare list so we correctly handle device in-use 1055 * checking. 1056 */ 1057 spa->spa_pending_spares = spares; 1058 spa->spa_pending_nspares = nspares; 1059 1060 for (i = 0; i < nspares; i++) { 1061 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1062 mode)) != 0) 1063 goto out; 1064 1065 if (!vd->vdev_ops->vdev_op_leaf) { 1066 vdev_free(vd); 1067 error = EINVAL; 1068 goto out; 1069 } 1070 1071 vd->vdev_top = vd; 1072 1073 if ((error = vdev_open(vd)) == 0 && 1074 (error = vdev_label_init(vd, crtxg, 1075 VDEV_LABEL_SPARE)) == 0) { 1076 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1077 vd->vdev_guid) == 0); 1078 } 1079 1080 vdev_free(vd); 1081 1082 if (error && mode != VDEV_ALLOC_SPARE) 1083 goto out; 1084 else 1085 error = 0; 1086 } 1087 1088 out: 1089 spa->spa_pending_spares = NULL; 1090 spa->spa_pending_nspares = 0; 1091 return (error); 1092 } 1093 1094 /* 1095 * Pool Creation 1096 */ 1097 int 1098 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1099 { 1100 spa_t *spa; 1101 vdev_t *rvd; 1102 dsl_pool_t *dp; 1103 dmu_tx_t *tx; 1104 int c, error = 0; 1105 uint64_t txg = TXG_INITIAL; 1106 nvlist_t **spares; 1107 uint_t nspares; 1108 1109 /* 1110 * If this pool already exists, return failure. 1111 */ 1112 mutex_enter(&spa_namespace_lock); 1113 if (spa_lookup(pool) != NULL) { 1114 mutex_exit(&spa_namespace_lock); 1115 return (EEXIST); 1116 } 1117 1118 /* 1119 * Allocate a new spa_t structure. 1120 */ 1121 spa = spa_add(pool, altroot); 1122 spa_activate(spa); 1123 1124 spa->spa_uberblock.ub_txg = txg - 1; 1125 spa->spa_uberblock.ub_version = ZFS_VERSION; 1126 spa->spa_ubsync = spa->spa_uberblock; 1127 1128 /* 1129 * Create the root vdev. 1130 */ 1131 spa_config_enter(spa, RW_WRITER, FTAG); 1132 1133 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1134 1135 ASSERT(error != 0 || rvd != NULL); 1136 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1137 1138 if (error == 0 && rvd->vdev_children == 0) 1139 error = EINVAL; 1140 1141 if (error == 0 && 1142 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1143 (error = spa_validate_spares(spa, nvroot, txg, 1144 VDEV_ALLOC_ADD)) == 0) { 1145 for (c = 0; c < rvd->vdev_children; c++) 1146 vdev_init(rvd->vdev_child[c], txg); 1147 vdev_config_dirty(rvd); 1148 } 1149 1150 spa_config_exit(spa, FTAG); 1151 1152 if (error != 0) { 1153 spa_unload(spa); 1154 spa_deactivate(spa); 1155 spa_remove(spa); 1156 mutex_exit(&spa_namespace_lock); 1157 return (error); 1158 } 1159 1160 /* 1161 * Get the list of spares, if specified. 
1162 */ 1163 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1164 &spares, &nspares) == 0) { 1165 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1166 KM_SLEEP) == 0); 1167 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1168 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1169 spa_config_enter(spa, RW_WRITER, FTAG); 1170 spa_load_spares(spa); 1171 spa_config_exit(spa, FTAG); 1172 spa->spa_sync_spares = B_TRUE; 1173 } 1174 1175 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1176 spa->spa_meta_objset = dp->dp_meta_objset; 1177 1178 tx = dmu_tx_create_assigned(dp, txg); 1179 1180 /* 1181 * Create the pool config object. 1182 */ 1183 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1184 DMU_OT_PACKED_NVLIST, 1 << 14, 1185 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1186 1187 if (zap_add(spa->spa_meta_objset, 1188 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1189 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1190 cmn_err(CE_PANIC, "failed to add pool config"); 1191 } 1192 1193 /* Newly created pools are always deflated. */ 1194 spa->spa_deflate = TRUE; 1195 if (zap_add(spa->spa_meta_objset, 1196 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1197 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1198 cmn_err(CE_PANIC, "failed to add deflate"); 1199 } 1200 1201 /* 1202 * Create the deferred-free bplist object. Turn off compression 1203 * because sync-to-convergence takes longer if the blocksize 1204 * keeps changing. 1205 */ 1206 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1207 1 << 14, tx); 1208 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1209 ZIO_COMPRESS_OFF, tx); 1210 1211 if (zap_add(spa->spa_meta_objset, 1212 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1213 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1214 cmn_err(CE_PANIC, "failed to add bplist"); 1215 } 1216 1217 /* 1218 * Create the pool's history object. 1219 */ 1220 spa_history_create_obj(spa, tx); 1221 1222 dmu_tx_commit(tx); 1223 1224 spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1225 spa->spa_sync_on = B_TRUE; 1226 txg_sync_start(spa->spa_dsl_pool); 1227 1228 /* 1229 * We explicitly wait for the first transaction to complete so that our 1230 * bean counters are appropriately updated. 1231 */ 1232 txg_wait_synced(spa->spa_dsl_pool, txg); 1233 1234 spa_config_sync(); 1235 1236 mutex_exit(&spa_namespace_lock); 1237 1238 return (0); 1239 } 1240 1241 /* 1242 * Import the given pool into the system. We set up the necessary spa_t and 1243 * then call spa_load() to do the dirty work. 1244 */ 1245 int 1246 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1247 { 1248 spa_t *spa; 1249 int error; 1250 nvlist_t *nvroot; 1251 nvlist_t **spares; 1252 uint_t nspares; 1253 1254 if (!(spa_mode & FWRITE)) 1255 return (EROFS); 1256 1257 /* 1258 * If a pool with this name exists, return failure. 1259 */ 1260 mutex_enter(&spa_namespace_lock); 1261 if (spa_lookup(pool) != NULL) { 1262 mutex_exit(&spa_namespace_lock); 1263 return (EEXIST); 1264 } 1265 1266 /* 1267 * Create and initialize the spa structure. 1268 */ 1269 spa = spa_add(pool, altroot); 1270 spa_activate(spa); 1271 1272 /* 1273 * Pass off the heavy lifting to spa_load(). 1274 * Pass TRUE for mosconfig because the user-supplied config 1275 * is actually the one to trust when doing an import. 
1276 */ 1277 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1278 1279 spa_config_enter(spa, RW_WRITER, FTAG); 1280 /* 1281 * Toss any existing sparelist, as it doesn't have any validity anymore, 1282 * and conflicts with spa_has_spare(). 1283 */ 1284 if (spa->spa_sparelist) { 1285 nvlist_free(spa->spa_sparelist); 1286 spa->spa_sparelist = NULL; 1287 spa_load_spares(spa); 1288 } 1289 1290 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1291 &nvroot) == 0); 1292 if (error == 0) 1293 error = spa_validate_spares(spa, nvroot, -1ULL, 1294 VDEV_ALLOC_SPARE); 1295 spa_config_exit(spa, FTAG); 1296 1297 if (error != 0) { 1298 spa_unload(spa); 1299 spa_deactivate(spa); 1300 spa_remove(spa); 1301 mutex_exit(&spa_namespace_lock); 1302 return (error); 1303 } 1304 1305 /* 1306 * Override any spares as specified by the user, as these may have 1307 * correct device names/devids, etc. 1308 */ 1309 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1310 &spares, &nspares) == 0) { 1311 if (spa->spa_sparelist) 1312 VERIFY(nvlist_remove(spa->spa_sparelist, 1313 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1314 else 1315 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1316 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1317 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1318 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1319 spa_config_enter(spa, RW_WRITER, FTAG); 1320 spa_load_spares(spa); 1321 spa_config_exit(spa, FTAG); 1322 spa->spa_sync_spares = B_TRUE; 1323 } 1324 1325 /* 1326 * Update the config cache to include the newly-imported pool. 1327 */ 1328 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1329 1330 mutex_exit(&spa_namespace_lock); 1331 1332 /* 1333 * Resilver anything that's out of date. 1334 */ 1335 if (spa_mode & FWRITE) 1336 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1337 1338 return (0); 1339 } 1340 1341 /* 1342 * This (illegal) pool name is used when temporarily importing a spa_t in order 1343 * to get the vdev stats associated with the imported devices. 1344 */ 1345 #define TRYIMPORT_NAME "$import" 1346 1347 nvlist_t * 1348 spa_tryimport(nvlist_t *tryconfig) 1349 { 1350 nvlist_t *config = NULL; 1351 char *poolname; 1352 spa_t *spa; 1353 uint64_t state; 1354 1355 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1356 return (NULL); 1357 1358 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1359 return (NULL); 1360 1361 /* 1362 * Create and initialize the spa structure. 1363 */ 1364 mutex_enter(&spa_namespace_lock); 1365 spa = spa_add(TRYIMPORT_NAME, NULL); 1366 spa_activate(spa); 1367 1368 /* 1369 * Pass off the heavy lifting to spa_load(). 1370 * Pass TRUE for mosconfig because the user-supplied config 1371 * is actually the one to trust when doing an import. 1372 */ 1373 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1374 1375 /* 1376 * If 'tryconfig' was at least parsable, return the current config. 1377 */ 1378 if (spa->spa_root_vdev != NULL) { 1379 spa_config_enter(spa, RW_READER, FTAG); 1380 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1381 spa_config_exit(spa, FTAG); 1382 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1383 poolname) == 0); 1384 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1385 state) == 0); 1386 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1387 spa->spa_uberblock.ub_timestamp) == 0); 1388 1389 /* 1390 * Add the list of hot spares. 
1391 */ 1392 spa_add_spares(spa, config); 1393 } 1394 1395 spa_unload(spa); 1396 spa_deactivate(spa); 1397 spa_remove(spa); 1398 mutex_exit(&spa_namespace_lock); 1399 1400 return (config); 1401 } 1402 1403 /* 1404 * Pool export/destroy 1405 * 1406 * The act of destroying or exporting a pool is very simple. We make sure there 1407 * is no more pending I/O and any references to the pool are gone. Then, we 1408 * update the pool state and sync all the labels to disk, removing the 1409 * configuration from the cache afterwards. 1410 */ 1411 static int 1412 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1413 { 1414 spa_t *spa; 1415 1416 if (oldconfig) 1417 *oldconfig = NULL; 1418 1419 if (!(spa_mode & FWRITE)) 1420 return (EROFS); 1421 1422 mutex_enter(&spa_namespace_lock); 1423 if ((spa = spa_lookup(pool)) == NULL) { 1424 mutex_exit(&spa_namespace_lock); 1425 return (ENOENT); 1426 } 1427 1428 /* 1429 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1430 * reacquire the namespace lock, and see if we can export. 1431 */ 1432 spa_open_ref(spa, FTAG); 1433 mutex_exit(&spa_namespace_lock); 1434 spa_async_suspend(spa); 1435 mutex_enter(&spa_namespace_lock); 1436 spa_close(spa, FTAG); 1437 1438 /* 1439 * The pool will be in core if it's openable, 1440 * in which case we can modify its state. 1441 */ 1442 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1443 /* 1444 * Objsets may be open only because they're dirty, so we 1445 * have to force it to sync before checking spa_refcnt. 1446 */ 1447 spa_scrub_suspend(spa); 1448 txg_wait_synced(spa->spa_dsl_pool, 0); 1449 1450 /* 1451 * A pool cannot be exported or destroyed if there are active 1452 * references. If we are resetting a pool, allow references by 1453 * fault injection handlers. 1454 */ 1455 if (!spa_refcount_zero(spa) || 1456 (spa->spa_inject_ref != 0 && 1457 new_state != POOL_STATE_UNINITIALIZED)) { 1458 spa_scrub_resume(spa); 1459 spa_async_resume(spa); 1460 mutex_exit(&spa_namespace_lock); 1461 return (EBUSY); 1462 } 1463 1464 spa_scrub_resume(spa); 1465 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1466 1467 /* 1468 * We want this to be reflected on every label, 1469 * so mark them all dirty. spa_unload() will do the 1470 * final sync that pushes these changes out. 1471 */ 1472 if (new_state != POOL_STATE_UNINITIALIZED) { 1473 spa_config_enter(spa, RW_WRITER, FTAG); 1474 spa->spa_state = new_state; 1475 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1476 vdev_config_dirty(spa->spa_root_vdev); 1477 spa_config_exit(spa, FTAG); 1478 } 1479 } 1480 1481 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1482 spa_unload(spa); 1483 spa_deactivate(spa); 1484 } 1485 1486 if (oldconfig && spa->spa_config) 1487 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1488 1489 if (new_state != POOL_STATE_UNINITIALIZED) { 1490 spa_remove(spa); 1491 spa_config_sync(); 1492 } 1493 mutex_exit(&spa_namespace_lock); 1494 1495 return (0); 1496 } 1497 1498 /* 1499 * Destroy a storage pool. 1500 */ 1501 int 1502 spa_destroy(char *pool) 1503 { 1504 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1505 } 1506 1507 /* 1508 * Export a storage pool. 1509 */ 1510 int 1511 spa_export(char *pool, nvlist_t **oldconfig) 1512 { 1513 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1514 } 1515 1516 /* 1517 * Similar to spa_export(), this unloads the spa_t without actually removing it 1518 * from the namespace in any way. 
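 * It does so by passing POOL_STATE_UNINITIALIZED to spa_export_common(),
 * which skips the final spa_remove() and spa_config_sync() steps.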
1519 */ 1520 int 1521 spa_reset(char *pool) 1522 { 1523 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1524 } 1525 1526 1527 /* 1528 * ========================================================================== 1529 * Device manipulation 1530 * ========================================================================== 1531 */ 1532 1533 /* 1534 * Add capacity to a storage pool. 1535 */ 1536 int 1537 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1538 { 1539 uint64_t txg; 1540 int c, error; 1541 vdev_t *rvd = spa->spa_root_vdev; 1542 vdev_t *vd, *tvd; 1543 nvlist_t **spares; 1544 uint_t i, nspares; 1545 1546 txg = spa_vdev_enter(spa); 1547 1548 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1549 VDEV_ALLOC_ADD)) != 0) 1550 return (spa_vdev_exit(spa, NULL, txg, error)); 1551 1552 spa->spa_pending_vdev = vd; 1553 1554 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1555 &spares, &nspares) != 0) 1556 nspares = 0; 1557 1558 if (vd->vdev_children == 0 && nspares == 0) { 1559 spa->spa_pending_vdev = NULL; 1560 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1561 } 1562 1563 if (vd->vdev_children != 0) { 1564 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1565 spa->spa_pending_vdev = NULL; 1566 return (spa_vdev_exit(spa, vd, txg, error)); 1567 } 1568 } 1569 1570 /* 1571 * We must validate the spares after checking the children. Otherwise, 1572 * vdev_inuse() will blindly overwrite the spare. 1573 */ 1574 if ((error = spa_validate_spares(spa, nvroot, txg, 1575 VDEV_ALLOC_ADD)) != 0) { 1576 spa->spa_pending_vdev = NULL; 1577 return (spa_vdev_exit(spa, vd, txg, error)); 1578 } 1579 1580 spa->spa_pending_vdev = NULL; 1581 1582 /* 1583 * Transfer each new top-level vdev from vd to rvd. 1584 */ 1585 for (c = 0; c < vd->vdev_children; c++) { 1586 tvd = vd->vdev_child[c]; 1587 vdev_remove_child(vd, tvd); 1588 tvd->vdev_id = rvd->vdev_children; 1589 vdev_add_child(rvd, tvd); 1590 vdev_config_dirty(tvd); 1591 } 1592 1593 if (nspares != 0) { 1594 if (spa->spa_sparelist != NULL) { 1595 nvlist_t **oldspares; 1596 uint_t oldnspares; 1597 nvlist_t **newspares; 1598 1599 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1600 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1601 1602 newspares = kmem_alloc(sizeof (void *) * 1603 (nspares + oldnspares), KM_SLEEP); 1604 for (i = 0; i < oldnspares; i++) 1605 VERIFY(nvlist_dup(oldspares[i], 1606 &newspares[i], KM_SLEEP) == 0); 1607 for (i = 0; i < nspares; i++) 1608 VERIFY(nvlist_dup(spares[i], 1609 &newspares[i + oldnspares], 1610 KM_SLEEP) == 0); 1611 1612 VERIFY(nvlist_remove(spa->spa_sparelist, 1613 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1614 1615 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1616 ZPOOL_CONFIG_SPARES, newspares, 1617 nspares + oldnspares) == 0); 1618 for (i = 0; i < oldnspares + nspares; i++) 1619 nvlist_free(newspares[i]); 1620 kmem_free(newspares, (oldnspares + nspares) * 1621 sizeof (void *)); 1622 } else { 1623 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1624 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1625 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1626 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1627 } 1628 1629 spa_load_spares(spa); 1630 spa->spa_sync_spares = B_TRUE; 1631 } 1632 1633 /* 1634 * We have to be careful when adding new vdevs to an existing pool. 
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate. Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror. The arguments are the path to any device
 * in the mirror, and the nvroot for the new device. If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare. Otherwise, we
		 * want to create a replacing vdev.
The user is not allowed to 1725 * attach to a spared vdev child unless the 'isspare' state is 1726 * the same (spare replaces spare, non-spare replaces 1727 * non-spare). 1728 */ 1729 if (pvd->vdev_ops == &vdev_replacing_ops) 1730 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1731 else if (pvd->vdev_ops == &vdev_spare_ops && 1732 newvd->vdev_isspare != oldvd->vdev_isspare) 1733 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1734 else if (pvd->vdev_ops != &vdev_spare_ops && 1735 newvd->vdev_isspare) 1736 pvops = &vdev_spare_ops; 1737 else 1738 pvops = &vdev_replacing_ops; 1739 } 1740 1741 /* 1742 * Compare the new device size with the replaceable/attachable 1743 * device size. 1744 */ 1745 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1746 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1747 1748 /* 1749 * The new device cannot have a higher alignment requirement 1750 * than the top-level vdev. 1751 */ 1752 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1753 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1754 1755 /* 1756 * If this is an in-place replacement, update oldvd's path and devid 1757 * to make it distinguishable from newvd, and unopenable from now on. 1758 */ 1759 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1760 spa_strfree(oldvd->vdev_path); 1761 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1762 KM_SLEEP); 1763 (void) sprintf(oldvd->vdev_path, "%s/%s", 1764 newvd->vdev_path, "old"); 1765 if (oldvd->vdev_devid != NULL) { 1766 spa_strfree(oldvd->vdev_devid); 1767 oldvd->vdev_devid = NULL; 1768 } 1769 } 1770 1771 /* 1772 * If the parent is not a mirror, or if we're replacing, insert the new 1773 * mirror/replacing/spare vdev above oldvd. 1774 */ 1775 if (pvd->vdev_ops != pvops) 1776 pvd = vdev_add_parent(oldvd, pvops); 1777 1778 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1779 ASSERT(pvd->vdev_ops == pvops); 1780 ASSERT(oldvd->vdev_parent == pvd); 1781 1782 /* 1783 * Extract the new device from its root and add it to pvd. 1784 */ 1785 vdev_remove_child(newrootvd, newvd); 1786 newvd->vdev_id = pvd->vdev_children; 1787 vdev_add_child(pvd, newvd); 1788 1789 /* 1790 * If newvd is smaller than oldvd, but larger than its rsize, 1791 * the addition of newvd may have decreased our parent's asize. 1792 */ 1793 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1794 1795 tvd = newvd->vdev_top; 1796 ASSERT(pvd->vdev_top == tvd); 1797 ASSERT(tvd->vdev_parent == rvd); 1798 1799 vdev_config_dirty(tvd); 1800 1801 /* 1802 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1803 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1804 */ 1805 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1806 1807 mutex_enter(&newvd->vdev_dtl_lock); 1808 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1809 open_txg - TXG_INITIAL + 1); 1810 mutex_exit(&newvd->vdev_dtl_lock); 1811 1812 if (newvd->vdev_isspare) 1813 spa_spare_activate(newvd); 1814 1815 /* 1816 * Mark newvd's DTL dirty in this txg. 1817 */ 1818 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1819 1820 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1821 1822 /* 1823 * Kick off a resilver to update newvd. 1824 */ 1825 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1826 1827 return (0); 1828 } 1829 1830 /* 1831 * Detach a device from a mirror or replacing vdev. 1832 * If 'replace_done' is specified, only detach if the parent 1833 * is a replacing vdev. 
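 * When the parent is a 'spare' vdev, replace_done allows either child to be
 * detached; see the replace_done checks below.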
1834 */ 1835 int 1836 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1837 { 1838 uint64_t txg; 1839 int c, t, error; 1840 vdev_t *rvd = spa->spa_root_vdev; 1841 vdev_t *vd, *pvd, *cvd, *tvd; 1842 boolean_t unspare = B_FALSE; 1843 uint64_t unspare_guid; 1844 1845 txg = spa_vdev_enter(spa); 1846 1847 vd = vdev_lookup_by_guid(rvd, guid); 1848 1849 if (vd == NULL) 1850 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1851 1852 if (!vd->vdev_ops->vdev_op_leaf) 1853 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1854 1855 pvd = vd->vdev_parent; 1856 1857 /* 1858 * If replace_done is specified, only remove this device if it's 1859 * the first child of a replacing vdev. For the 'spare' vdev, either 1860 * disk can be removed. 1861 */ 1862 if (replace_done) { 1863 if (pvd->vdev_ops == &vdev_replacing_ops) { 1864 if (vd->vdev_id != 0) 1865 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1866 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1867 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1868 } 1869 } 1870 1871 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1872 spa_version(spa) >= ZFS_VERSION_SPARES); 1873 1874 /* 1875 * Only mirror, replacing, and spare vdevs support detach. 1876 */ 1877 if (pvd->vdev_ops != &vdev_replacing_ops && 1878 pvd->vdev_ops != &vdev_mirror_ops && 1879 pvd->vdev_ops != &vdev_spare_ops) 1880 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1881 1882 /* 1883 * If there's only one replica, you can't detach it. 1884 */ 1885 if (pvd->vdev_children <= 1) 1886 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1887 1888 /* 1889 * If all siblings have non-empty DTLs, this device may have the only 1890 * valid copy of the data, which means we cannot safely detach it. 1891 * 1892 * XXX -- as in the vdev_offline() case, we really want a more 1893 * precise DTL check. 1894 */ 1895 for (c = 0; c < pvd->vdev_children; c++) { 1896 uint64_t dirty; 1897 1898 cvd = pvd->vdev_child[c]; 1899 if (cvd == vd) 1900 continue; 1901 if (vdev_is_dead(cvd)) 1902 continue; 1903 mutex_enter(&cvd->vdev_dtl_lock); 1904 dirty = cvd->vdev_dtl_map.sm_space | 1905 cvd->vdev_dtl_scrub.sm_space; 1906 mutex_exit(&cvd->vdev_dtl_lock); 1907 if (!dirty) 1908 break; 1909 } 1910 1911 /* 1912 * If we are a replacing or spare vdev, then we can always detach the 1913 * latter child, as that is how one cancels the operation. 1914 */ 1915 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1916 c == pvd->vdev_children) 1917 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1918 1919 /* 1920 * If we are detaching the original disk from a spare, then it implies 1921 * that the spare should become a real disk, and be removed from the 1922 * active spare list for the pool. 1923 */ 1924 if (pvd->vdev_ops == &vdev_spare_ops && 1925 vd->vdev_id == 0) 1926 unspare = B_TRUE; 1927 1928 /* 1929 * Erase the disk labels so the disk can be used for other things. 1930 * This must be done after all other error cases are handled, 1931 * but before we disembowel vd (so we can still do I/O to it). 1932 * But if we can't do it, don't treat the error as fatal -- 1933 * it may be that the unwritability of the disk is the reason 1934 * it's being detached! 1935 */ 1936 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1937 1938 /* 1939 * Remove vd from its parent and compact the parent's children. 1940 */ 1941 vdev_remove_child(pvd, vd); 1942 vdev_compact_children(pvd); 1943 1944 /* 1945 * Remember one of the remaining children so we can get tvd below. 
1946 */ 1947 cvd = pvd->vdev_child[0]; 1948 1949 /* 1950 * If we need to remove the remaining child from the list of hot spares, 1951 * do it now, marking the vdev as no longer a spare in the process. We 1952 * must do this before vdev_remove_parent(), because that can change the 1953 * GUID if it creates a new toplevel GUID. 1954 */ 1955 if (unspare) { 1956 ASSERT(cvd->vdev_isspare); 1957 spa_spare_remove(cvd); 1958 unspare_guid = cvd->vdev_guid; 1959 } 1960 1961 /* 1962 * If the parent mirror/replacing vdev only has one child, 1963 * the parent is no longer needed. Remove it from the tree. 1964 */ 1965 if (pvd->vdev_children == 1) 1966 vdev_remove_parent(cvd); 1967 1968 /* 1969 * We don't set tvd until now because the parent we just removed 1970 * may have been the previous top-level vdev. 1971 */ 1972 tvd = cvd->vdev_top; 1973 ASSERT(tvd->vdev_parent == rvd); 1974 1975 /* 1976 * Reevaluate the parent vdev state. 1977 */ 1978 vdev_propagate_state(cvd->vdev_parent); 1979 1980 /* 1981 * If the device we just detached was smaller than the others, it may be 1982 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1983 * can't fail because the existing metaslabs are already in core, so 1984 * there's nothing to read from disk. 1985 */ 1986 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1987 1988 vdev_config_dirty(tvd); 1989 1990 /* 1991 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 1992 * vd->vdev_detached is set and free vd's DTL object in syncing context. 1993 * But first make sure we're not on any *other* txg's DTL list, to 1994 * prevent vd from being accessed after it's freed. 1995 */ 1996 for (t = 0; t < TXG_SIZE; t++) 1997 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1998 vd->vdev_detached = B_TRUE; 1999 vdev_dirty(tvd, VDD_DTL, vd, txg); 2000 2001 error = spa_vdev_exit(spa, vd, txg, 0); 2002 2003 /* 2004 * If this was the removal of the original device in a hot spare vdev, 2005 * then we want to go through and remove the device from the hot spare 2006 * list of every other pool. 2007 */ 2008 if (unspare) { 2009 spa = NULL; 2010 mutex_enter(&spa_namespace_lock); 2011 while ((spa = spa_next(spa)) != NULL) { 2012 if (spa->spa_state != POOL_STATE_ACTIVE) 2013 continue; 2014 2015 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2016 } 2017 mutex_exit(&spa_namespace_lock); 2018 } 2019 2020 return (error); 2021 } 2022 2023 /* 2024 * Remove a device from the pool. Currently, this supports removing only hot 2025 * spares. 2026 */ 2027 int 2028 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2029 { 2030 vdev_t *vd; 2031 nvlist_t **spares, *nv, **newspares; 2032 uint_t i, j, nspares; 2033 int ret = 0; 2034 2035 spa_config_enter(spa, RW_WRITER, FTAG); 2036 2037 vd = spa_lookup_by_guid(spa, guid); 2038 2039 nv = NULL; 2040 if (spa->spa_spares != NULL && 2041 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2042 &spares, &nspares) == 0) { 2043 for (i = 0; i < nspares; i++) { 2044 uint64_t theguid; 2045 2046 VERIFY(nvlist_lookup_uint64(spares[i], 2047 ZPOOL_CONFIG_GUID, &theguid) == 0); 2048 if (theguid == guid) { 2049 nv = spares[i]; 2050 break; 2051 } 2052 } 2053 } 2054 2055 /* 2056 * We only support removing a hot spare, and only if it's not currently 2057 * in use in this pool. 
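	 * Here 'nv' is the spare's entry in the on-disk spare list (if any)
	 * and 'vd' is the matching vdev in this pool (if any); the
	 * combination of the two determines which error we return.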
2058 */ 2059 if (nv == NULL && vd == NULL) { 2060 ret = ENOENT; 2061 goto out; 2062 } 2063 2064 if (nv == NULL && vd != NULL) { 2065 ret = ENOTSUP; 2066 goto out; 2067 } 2068 2069 if (!unspare && nv != NULL && vd != NULL) { 2070 ret = EBUSY; 2071 goto out; 2072 } 2073 2074 if (nspares == 1) { 2075 newspares = NULL; 2076 } else { 2077 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2078 KM_SLEEP); 2079 for (i = 0, j = 0; i < nspares; i++) { 2080 if (spares[i] != nv) 2081 VERIFY(nvlist_dup(spares[i], 2082 &newspares[j++], KM_SLEEP) == 0); 2083 } 2084 } 2085 2086 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2087 DATA_TYPE_NVLIST_ARRAY) == 0); 2088 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2089 newspares, nspares - 1) == 0); 2090 for (i = 0; i < nspares - 1; i++) 2091 nvlist_free(newspares[i]); 2092 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2093 spa_load_spares(spa); 2094 spa->spa_sync_spares = B_TRUE; 2095 2096 out: 2097 spa_config_exit(spa, FTAG); 2098 2099 return (ret); 2100 } 2101 2102 /* 2103 * Find any device that's done replacing, so we can detach it. 2104 */ 2105 static vdev_t * 2106 spa_vdev_replace_done_hunt(vdev_t *vd) 2107 { 2108 vdev_t *newvd, *oldvd; 2109 int c; 2110 2111 for (c = 0; c < vd->vdev_children; c++) { 2112 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2113 if (oldvd != NULL) 2114 return (oldvd); 2115 } 2116 2117 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2118 oldvd = vd->vdev_child[0]; 2119 newvd = vd->vdev_child[1]; 2120 2121 mutex_enter(&newvd->vdev_dtl_lock); 2122 if (newvd->vdev_dtl_map.sm_space == 0 && 2123 newvd->vdev_dtl_scrub.sm_space == 0) { 2124 mutex_exit(&newvd->vdev_dtl_lock); 2125 return (oldvd); 2126 } 2127 mutex_exit(&newvd->vdev_dtl_lock); 2128 } 2129 2130 return (NULL); 2131 } 2132 2133 static void 2134 spa_vdev_replace_done(spa_t *spa) 2135 { 2136 vdev_t *vd; 2137 vdev_t *pvd; 2138 uint64_t guid; 2139 uint64_t pguid = 0; 2140 2141 spa_config_enter(spa, RW_READER, FTAG); 2142 2143 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2144 guid = vd->vdev_guid; 2145 /* 2146 * If we have just finished replacing a hot spared device, then 2147 * we need to detach the parent's first child (the original hot 2148 * spare) as well. 2149 */ 2150 pvd = vd->vdev_parent; 2151 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2152 pvd->vdev_id == 0) { 2153 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2154 ASSERT(pvd->vdev_parent->vdev_children == 2); 2155 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2156 } 2157 spa_config_exit(spa, FTAG); 2158 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2159 return; 2160 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2161 return; 2162 spa_config_enter(spa, RW_READER, FTAG); 2163 } 2164 2165 spa_config_exit(spa, FTAG); 2166 } 2167 2168 /* 2169 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2170 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2171 */ 2172 int 2173 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2174 { 2175 vdev_t *rvd, *vd; 2176 uint64_t txg; 2177 2178 rvd = spa->spa_root_vdev; 2179 2180 txg = spa_vdev_enter(spa); 2181 2182 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2183 /* 2184 * Determine if this is a reference to a hot spare. In that 2185 * case, update the path as stored in the spare list. 
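		 *
		 * (Illustrative call, with a made-up device path:
		 *
		 *	error = spa_vdev_setpath(spa, guid,
		 *	    "/dev/dsk/c3t1d0s0");
		 *
		 * This succeeds whether 'guid' names a leaf vdev or a
		 * configured hot spare.)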
2186 */ 2187 nvlist_t **spares; 2188 uint_t i, nspares; 2189 if (spa->spa_sparelist != NULL) { 2190 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2191 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2192 for (i = 0; i < nspares; i++) { 2193 uint64_t theguid; 2194 VERIFY(nvlist_lookup_uint64(spares[i], 2195 ZPOOL_CONFIG_GUID, &theguid) == 0); 2196 if (theguid == guid) 2197 break; 2198 } 2199 2200 if (i == nspares) 2201 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2202 2203 VERIFY(nvlist_add_string(spares[i], 2204 ZPOOL_CONFIG_PATH, newpath) == 0); 2205 spa_load_spares(spa); 2206 spa->spa_sync_spares = B_TRUE; 2207 return (spa_vdev_exit(spa, NULL, txg, 0)); 2208 } else { 2209 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2210 } 2211 } 2212 2213 if (!vd->vdev_ops->vdev_op_leaf) 2214 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2215 2216 spa_strfree(vd->vdev_path); 2217 vd->vdev_path = spa_strdup(newpath); 2218 2219 vdev_config_dirty(vd->vdev_top); 2220 2221 return (spa_vdev_exit(spa, NULL, txg, 0)); 2222 } 2223 2224 /* 2225 * ========================================================================== 2226 * SPA Scrubbing 2227 * ========================================================================== 2228 */ 2229 2230 static void 2231 spa_scrub_io_done(zio_t *zio) 2232 { 2233 spa_t *spa = zio->io_spa; 2234 2235 zio_data_buf_free(zio->io_data, zio->io_size); 2236 2237 mutex_enter(&spa->spa_scrub_lock); 2238 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2239 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2240 spa->spa_scrub_errors++; 2241 mutex_enter(&vd->vdev_stat_lock); 2242 vd->vdev_stat.vs_scrub_errors++; 2243 mutex_exit(&vd->vdev_stat_lock); 2244 } 2245 2246 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2247 cv_broadcast(&spa->spa_scrub_io_cv); 2248 2249 ASSERT(spa->spa_scrub_inflight >= 0); 2250 2251 mutex_exit(&spa->spa_scrub_lock); 2252 } 2253 2254 static void 2255 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2256 zbookmark_t *zb) 2257 { 2258 size_t size = BP_GET_LSIZE(bp); 2259 void *data; 2260 2261 mutex_enter(&spa->spa_scrub_lock); 2262 /* 2263 * Do not give too much work to vdev(s). 2264 */ 2265 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2266 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2267 } 2268 spa->spa_scrub_inflight++; 2269 mutex_exit(&spa->spa_scrub_lock); 2270 2271 data = zio_data_buf_alloc(size); 2272 2273 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2274 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2275 2276 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2277 2278 zio_nowait(zio_read(NULL, spa, bp, data, size, 2279 spa_scrub_io_done, NULL, priority, flags, zb)); 2280 } 2281 2282 /* ARGSUSED */ 2283 static int 2284 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2285 { 2286 blkptr_t *bp = &bc->bc_blkptr; 2287 vdev_t *vd = spa->spa_root_vdev; 2288 dva_t *dva = bp->blk_dva; 2289 int needs_resilver = B_FALSE; 2290 int d; 2291 2292 if (bc->bc_errno) { 2293 /* 2294 * We can't scrub this block, but we can continue to scrub 2295 * the rest of the pool. Note the error and move along. 
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ?
"resilver" : "scrub", 2371 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2372 2373 spa_config_enter(spa, RW_WRITER, FTAG); 2374 vdev_reopen(rvd); /* purge all vdev caches */ 2375 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2376 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2377 spa_config_exit(spa, FTAG); 2378 2379 mutex_enter(&spa->spa_scrub_lock); 2380 spa->spa_scrub_errors = 0; 2381 spa->spa_scrub_active = 1; 2382 ASSERT(spa->spa_scrub_inflight == 0); 2383 2384 while (!spa->spa_scrub_stop) { 2385 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2386 while (spa->spa_scrub_suspended) { 2387 spa->spa_scrub_active = 0; 2388 cv_broadcast(&spa->spa_scrub_cv); 2389 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2390 spa->spa_scrub_active = 1; 2391 } 2392 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2393 2394 if (spa->spa_scrub_restart_txg != 0) 2395 break; 2396 2397 mutex_exit(&spa->spa_scrub_lock); 2398 error = traverse_more(th); 2399 mutex_enter(&spa->spa_scrub_lock); 2400 if (error != EAGAIN) 2401 break; 2402 } 2403 2404 while (spa->spa_scrub_inflight) 2405 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2406 2407 spa->spa_scrub_active = 0; 2408 cv_broadcast(&spa->spa_scrub_cv); 2409 2410 mutex_exit(&spa->spa_scrub_lock); 2411 2412 spa_config_enter(spa, RW_WRITER, FTAG); 2413 2414 mutex_enter(&spa->spa_scrub_lock); 2415 2416 /* 2417 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2418 * AND the spa config lock to synchronize with any config changes 2419 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2420 */ 2421 if (spa->spa_scrub_restart_txg != 0) 2422 error = ERESTART; 2423 2424 if (spa->spa_scrub_stop) 2425 error = EINTR; 2426 2427 /* 2428 * Even if there were uncorrectable errors, we consider the scrub 2429 * completed. The downside is that if there is a transient error during 2430 * a resilver, we won't resilver the data properly to the target. But 2431 * if the damage is permanent (more likely) we will resilver forever, 2432 * which isn't really acceptable. Since there is enough information for 2433 * the user to know what has failed and why, this seems like a more 2434 * tractable approach. 2435 */ 2436 complete = (error == 0); 2437 2438 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2439 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2440 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2441 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2442 2443 mutex_exit(&spa->spa_scrub_lock); 2444 2445 /* 2446 * If the scrub/resilver completed, update all DTLs to reflect this. 2447 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2448 */ 2449 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2450 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2451 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2452 spa_errlog_rotate(spa); 2453 2454 spa_config_exit(spa, FTAG); 2455 2456 mutex_enter(&spa->spa_scrub_lock); 2457 2458 /* 2459 * We may have finished replacing a device. 2460 * Let the async thread assess this and handle the detach. 2461 */ 2462 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2463 2464 /* 2465 * If we were told to restart, our final act is to start a new scrub. 2466 */ 2467 if (error == ERESTART) 2468 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2469 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2470 2471 spa->spa_scrub_type = POOL_SCRUB_NONE; 2472 spa->spa_scrub_active = 0; 2473 spa->spa_scrub_thread = NULL; 2474 cv_broadcast(&spa->spa_scrub_cv); 2475 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2476 thread_exit(); 2477 } 2478 2479 void 2480 spa_scrub_suspend(spa_t *spa) 2481 { 2482 mutex_enter(&spa->spa_scrub_lock); 2483 spa->spa_scrub_suspended++; 2484 while (spa->spa_scrub_active) { 2485 cv_broadcast(&spa->spa_scrub_cv); 2486 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2487 } 2488 while (spa->spa_scrub_inflight) 2489 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2490 mutex_exit(&spa->spa_scrub_lock); 2491 } 2492 2493 void 2494 spa_scrub_resume(spa_t *spa) 2495 { 2496 mutex_enter(&spa->spa_scrub_lock); 2497 ASSERT(spa->spa_scrub_suspended != 0); 2498 if (--spa->spa_scrub_suspended == 0) 2499 cv_broadcast(&spa->spa_scrub_cv); 2500 mutex_exit(&spa->spa_scrub_lock); 2501 } 2502 2503 void 2504 spa_scrub_restart(spa_t *spa, uint64_t txg) 2505 { 2506 /* 2507 * Something happened (e.g. snapshot create/delete) that means 2508 * we must restart any in-progress scrubs. The itinerary will 2509 * fix this properly. 2510 */ 2511 mutex_enter(&spa->spa_scrub_lock); 2512 spa->spa_scrub_restart_txg = txg; 2513 mutex_exit(&spa->spa_scrub_lock); 2514 } 2515 2516 int 2517 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2518 { 2519 space_seg_t *ss; 2520 uint64_t mintxg, maxtxg; 2521 vdev_t *rvd = spa->spa_root_vdev; 2522 2523 if ((uint_t)type >= POOL_SCRUB_TYPES) 2524 return (ENOTSUP); 2525 2526 mutex_enter(&spa->spa_scrub_lock); 2527 2528 /* 2529 * If there's a scrub or resilver already in progress, stop it. 2530 */ 2531 while (spa->spa_scrub_thread != NULL) { 2532 /* 2533 * Don't stop a resilver unless forced. 2534 */ 2535 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2536 mutex_exit(&spa->spa_scrub_lock); 2537 return (EBUSY); 2538 } 2539 spa->spa_scrub_stop = 1; 2540 cv_broadcast(&spa->spa_scrub_cv); 2541 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2542 } 2543 2544 /* 2545 * Terminate the previous traverse. 2546 */ 2547 if (spa->spa_scrub_th != NULL) { 2548 traverse_fini(spa->spa_scrub_th); 2549 spa->spa_scrub_th = NULL; 2550 } 2551 2552 if (rvd == NULL) { 2553 ASSERT(spa->spa_scrub_stop == 0); 2554 ASSERT(spa->spa_scrub_type == type); 2555 ASSERT(spa->spa_scrub_restart_txg == 0); 2556 mutex_exit(&spa->spa_scrub_lock); 2557 return (0); 2558 } 2559 2560 mintxg = TXG_INITIAL - 1; 2561 maxtxg = spa_last_synced_txg(spa) + 1; 2562 2563 mutex_enter(&rvd->vdev_dtl_lock); 2564 2565 if (rvd->vdev_dtl_map.sm_space == 0) { 2566 /* 2567 * The pool-wide DTL is empty. 2568 * If this is a resilver, there's nothing to do except 2569 * check whether any in-progress replacements have completed. 2570 */ 2571 if (type == POOL_SCRUB_RESILVER) { 2572 type = POOL_SCRUB_NONE; 2573 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2574 } 2575 } else { 2576 /* 2577 * The pool-wide DTL is non-empty. 2578 * If this is a normal scrub, upgrade to a resilver instead. 2579 */ 2580 if (type == POOL_SCRUB_EVERYTHING) 2581 type = POOL_SCRUB_RESILVER; 2582 } 2583 2584 if (type == POOL_SCRUB_RESILVER) { 2585 /* 2586 * Determine the resilvering boundaries. 2587 * 2588 * Note: (mintxg, maxtxg) is an open interval, 2589 * i.e. mintxg and maxtxg themselves are not included. 2590 * 2591 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2592 * so we don't claim to resilver a txg that's still changing. 
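		 *
		 * A worked example with made-up numbers: if the pool-wide
		 * DTL holds the single segment [100, 120) and
		 * spa_last_synced_txg() is 110, the code below computes
		 *
		 *	mintxg = 100 - 1 = 99
		 *	maxtxg = MIN(120, 110 + 1) = 111
		 *
		 * so the traverse visits only blocks born in txgs 100
		 * through 110.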
2593 */ 2594 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2595 mintxg = ss->ss_start - 1; 2596 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2597 maxtxg = MIN(ss->ss_end, maxtxg); 2598 } 2599 2600 mutex_exit(&rvd->vdev_dtl_lock); 2601 2602 spa->spa_scrub_stop = 0; 2603 spa->spa_scrub_type = type; 2604 spa->spa_scrub_restart_txg = 0; 2605 2606 if (type != POOL_SCRUB_NONE) { 2607 spa->spa_scrub_mintxg = mintxg; 2608 spa->spa_scrub_maxtxg = maxtxg; 2609 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2610 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2611 ZIO_FLAG_CANFAIL); 2612 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2613 spa->spa_scrub_thread = thread_create(NULL, 0, 2614 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2615 } 2616 2617 mutex_exit(&spa->spa_scrub_lock); 2618 2619 return (0); 2620 } 2621 2622 /* 2623 * ========================================================================== 2624 * SPA async task processing 2625 * ========================================================================== 2626 */ 2627 2628 static void 2629 spa_async_reopen(spa_t *spa) 2630 { 2631 vdev_t *rvd = spa->spa_root_vdev; 2632 vdev_t *tvd; 2633 int c; 2634 2635 spa_config_enter(spa, RW_WRITER, FTAG); 2636 2637 for (c = 0; c < rvd->vdev_children; c++) { 2638 tvd = rvd->vdev_child[c]; 2639 if (tvd->vdev_reopen_wanted) { 2640 tvd->vdev_reopen_wanted = 0; 2641 vdev_reopen(tvd); 2642 } 2643 } 2644 2645 spa_config_exit(spa, FTAG); 2646 } 2647 2648 static void 2649 spa_async_thread(spa_t *spa) 2650 { 2651 int tasks; 2652 2653 ASSERT(spa->spa_sync_on); 2654 2655 mutex_enter(&spa->spa_async_lock); 2656 tasks = spa->spa_async_tasks; 2657 spa->spa_async_tasks = 0; 2658 mutex_exit(&spa->spa_async_lock); 2659 2660 /* 2661 * See if the config needs to be updated. 2662 */ 2663 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2664 mutex_enter(&spa_namespace_lock); 2665 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2666 mutex_exit(&spa_namespace_lock); 2667 } 2668 2669 /* 2670 * See if any devices need to be reopened. 2671 */ 2672 if (tasks & SPA_ASYNC_REOPEN) 2673 spa_async_reopen(spa); 2674 2675 /* 2676 * If any devices are done replacing, detach them. 2677 */ 2678 if (tasks & SPA_ASYNC_REPLACE_DONE) 2679 spa_vdev_replace_done(spa); 2680 2681 /* 2682 * Kick off a scrub. 2683 */ 2684 if (tasks & SPA_ASYNC_SCRUB) 2685 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2686 2687 /* 2688 * Kick off a resilver. 2689 */ 2690 if (tasks & SPA_ASYNC_RESILVER) 2691 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2692 2693 /* 2694 * Let the world know that we're done. 
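	 *
	 * (Usage sketch, not from the original source: producers simply OR
	 * a bit into the task mask, e.g.
	 *
	 *	spa_async_request(spa, SPA_ASYNC_REOPEN);
	 *
	 * and the request is picked up the next time spa_async_dispatch()
	 * runs at the end of spa_sync().)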
2695 */ 2696 mutex_enter(&spa->spa_async_lock); 2697 spa->spa_async_thread = NULL; 2698 cv_broadcast(&spa->spa_async_cv); 2699 mutex_exit(&spa->spa_async_lock); 2700 thread_exit(); 2701 } 2702 2703 void 2704 spa_async_suspend(spa_t *spa) 2705 { 2706 mutex_enter(&spa->spa_async_lock); 2707 spa->spa_async_suspended++; 2708 while (spa->spa_async_thread != NULL) 2709 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2710 mutex_exit(&spa->spa_async_lock); 2711 } 2712 2713 void 2714 spa_async_resume(spa_t *spa) 2715 { 2716 mutex_enter(&spa->spa_async_lock); 2717 ASSERT(spa->spa_async_suspended != 0); 2718 spa->spa_async_suspended--; 2719 mutex_exit(&spa->spa_async_lock); 2720 } 2721 2722 static void 2723 spa_async_dispatch(spa_t *spa) 2724 { 2725 mutex_enter(&spa->spa_async_lock); 2726 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2727 spa->spa_async_thread == NULL && 2728 rootdir != NULL && !vn_is_readonly(rootdir)) 2729 spa->spa_async_thread = thread_create(NULL, 0, 2730 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2731 mutex_exit(&spa->spa_async_lock); 2732 } 2733 2734 void 2735 spa_async_request(spa_t *spa, int task) 2736 { 2737 mutex_enter(&spa->spa_async_lock); 2738 spa->spa_async_tasks |= task; 2739 mutex_exit(&spa->spa_async_lock); 2740 } 2741 2742 /* 2743 * ========================================================================== 2744 * SPA syncing routines 2745 * ========================================================================== 2746 */ 2747 2748 static void 2749 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2750 { 2751 bplist_t *bpl = &spa->spa_sync_bplist; 2752 dmu_tx_t *tx; 2753 blkptr_t blk; 2754 uint64_t itor = 0; 2755 zio_t *zio; 2756 int error; 2757 uint8_t c = 1; 2758 2759 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2760 2761 while (bplist_iterate(bpl, &itor, &blk) == 0) 2762 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2763 2764 error = zio_wait(zio); 2765 ASSERT3U(error, ==, 0); 2766 2767 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2768 bplist_vacate(bpl, tx); 2769 2770 /* 2771 * Pre-dirty the first block so we sync to convergence faster. 2772 * (Usually only the first block is needed.) 2773 */ 2774 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2775 dmu_tx_commit(tx); 2776 } 2777 2778 static void 2779 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2780 { 2781 char *packed = NULL; 2782 size_t nvsize = 0; 2783 dmu_buf_t *db; 2784 2785 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2786 2787 packed = kmem_alloc(nvsize, KM_SLEEP); 2788 2789 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2790 KM_SLEEP) == 0); 2791 2792 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2793 2794 kmem_free(packed, nvsize); 2795 2796 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2797 dmu_buf_will_dirty(db, tx); 2798 *(uint64_t *)db->db_data = nvsize; 2799 dmu_buf_rele(db, FTAG); 2800 } 2801 2802 static void 2803 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2804 { 2805 nvlist_t *nvroot; 2806 nvlist_t **spares; 2807 int i; 2808 2809 if (!spa->spa_sync_spares) 2810 return; 2811 2812 /* 2813 * Update the MOS nvlist describing the list of available spares. 2814 * spa_validate_spares() will have already made sure this nvlist is 2815 * valid and the vdevs are labelled appropriately. 
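	 *
	 * (For illustration only -- the path and guid here are made up --
	 * each entry in the nvlist handed to spa_sync_nvlist() below has
	 * roughly this shape:
	 *
	 *	spares[0]:
	 *		type = "disk"
	 *		path = "/dev/dsk/c2t0d0s0"
	 *		guid = 7377731602337629464)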
2816 */ 2817 if (spa->spa_spares_object == 0) { 2818 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2819 DMU_OT_PACKED_NVLIST, 1 << 14, 2820 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2821 VERIFY(zap_update(spa->spa_meta_objset, 2822 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2823 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2824 } 2825 2826 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2827 if (spa->spa_nspares == 0) { 2828 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2829 NULL, 0) == 0); 2830 } else { 2831 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2832 KM_SLEEP); 2833 for (i = 0; i < spa->spa_nspares; i++) 2834 spares[i] = vdev_config_generate(spa, 2835 spa->spa_spares[i], B_FALSE, B_TRUE); 2836 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2837 spares, spa->spa_nspares) == 0); 2838 for (i = 0; i < spa->spa_nspares; i++) 2839 nvlist_free(spares[i]); 2840 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2841 } 2842 2843 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2844 nvlist_free(nvroot); 2845 2846 spa->spa_sync_spares = B_FALSE; 2847 } 2848 2849 static void 2850 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2851 { 2852 nvlist_t *config; 2853 2854 if (list_is_empty(&spa->spa_dirty_list)) 2855 return; 2856 2857 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2858 2859 if (spa->spa_config_syncing) 2860 nvlist_free(spa->spa_config_syncing); 2861 spa->spa_config_syncing = config; 2862 2863 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2864 } 2865 2866 static void 2867 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2868 { 2869 spa_t *spa = arg1; 2870 nvlist_t *nvp = arg2; 2871 nvpair_t *nvpair; 2872 objset_t *mos = spa->spa_meta_objset; 2873 uint64_t zapobj; 2874 2875 mutex_enter(&spa->spa_props_lock); 2876 if (spa->spa_pool_props_object == 0) { 2877 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2878 VERIFY(zapobj > 0); 2879 2880 spa->spa_pool_props_object = zapobj; 2881 2882 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2883 DMU_POOL_PROPS, 8, 1, 2884 &spa->spa_pool_props_object, tx) == 0); 2885 } 2886 mutex_exit(&spa->spa_props_lock); 2887 2888 nvpair = NULL; 2889 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2890 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2891 case ZFS_PROP_BOOTFS: 2892 VERIFY(nvlist_lookup_uint64(nvp, 2893 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2894 VERIFY(zap_update(mos, 2895 spa->spa_pool_props_object, 2896 zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 2897 &spa->spa_bootfs, tx) == 0); 2898 break; 2899 } 2900 } 2901 } 2902 2903 /* 2904 * Sync the specified transaction group. New blocks may be dirtied as 2905 * part of the process, so we iterate until it converges. 2906 */ 2907 void 2908 spa_sync(spa_t *spa, uint64_t txg) 2909 { 2910 dsl_pool_t *dp = spa->spa_dsl_pool; 2911 objset_t *mos = spa->spa_meta_objset; 2912 bplist_t *bpl = &spa->spa_sync_bplist; 2913 vdev_t *rvd = spa->spa_root_vdev; 2914 vdev_t *vd; 2915 dmu_tx_t *tx; 2916 int dirty_vdevs; 2917 2918 /* 2919 * Lock out configuration changes. 2920 */ 2921 spa_config_enter(spa, RW_READER, FTAG); 2922 2923 spa->spa_syncing_txg = txg; 2924 spa->spa_sync_pass = 0; 2925 2926 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2927 2928 tx = dmu_tx_create_assigned(dp, txg); 2929 2930 /* 2931 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2932 * set spa_deflate if we have no raid-z vdevs. 
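	 *
	 * (In other words, the loop below only sets spa_deflate once every
	 * top-level vdev reports vdev_deflate_ratio == SPA_MINBLOCKSIZE;
	 * a single remaining raid-z top-level vdev keeps the old space
	 * accounting.)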
	 */
	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
3052 */ 3053 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3054 vdev_sync_done(vd, txg); 3055 3056 /* 3057 * It had better be the case that we didn't dirty anything 3058 * since vdev_config_sync(). 3059 */ 3060 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3061 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3062 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3063 ASSERT(bpl->bpl_queue == NULL); 3064 3065 spa_config_exit(spa, FTAG); 3066 3067 /* 3068 * If any async tasks have been requested, kick them off. 3069 */ 3070 spa_async_dispatch(spa); 3071 } 3072 3073 /* 3074 * Sync all pools. We don't want to hold the namespace lock across these 3075 * operations, so we take a reference on the spa_t and drop the lock during the 3076 * sync. 3077 */ 3078 void 3079 spa_sync_allpools(void) 3080 { 3081 spa_t *spa = NULL; 3082 mutex_enter(&spa_namespace_lock); 3083 while ((spa = spa_next(spa)) != NULL) { 3084 if (spa_state(spa) != POOL_STATE_ACTIVE) 3085 continue; 3086 spa_open_ref(spa, FTAG); 3087 mutex_exit(&spa_namespace_lock); 3088 txg_wait_synced(spa_get_dsl(spa), 0); 3089 mutex_enter(&spa_namespace_lock); 3090 spa_close(spa, FTAG); 3091 } 3092 mutex_exit(&spa_namespace_lock); 3093 } 3094 3095 /* 3096 * ========================================================================== 3097 * Miscellaneous routines 3098 * ========================================================================== 3099 */ 3100 3101 /* 3102 * Remove all pools in the system. 3103 */ 3104 void 3105 spa_evict_all(void) 3106 { 3107 spa_t *spa; 3108 3109 /* 3110 * Remove all cached state. All pools should be closed now, 3111 * so every spa in the AVL tree should be unreferenced. 3112 */ 3113 mutex_enter(&spa_namespace_lock); 3114 while ((spa = spa_next(NULL)) != NULL) { 3115 /* 3116 * Stop async tasks. The async thread may need to detach 3117 * a device that's been replaced, which requires grabbing 3118 * spa_namespace_lock, so we must drop it here. 3119 */ 3120 spa_open_ref(spa, FTAG); 3121 mutex_exit(&spa_namespace_lock); 3122 spa_async_suspend(spa); 3123 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3124 mutex_enter(&spa_namespace_lock); 3125 spa_close(spa, FTAG); 3126 3127 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3128 spa_unload(spa); 3129 spa_deactivate(spa); 3130 } 3131 spa_remove(spa); 3132 } 3133 mutex_exit(&spa_namespace_lock); 3134 } 3135 3136 vdev_t * 3137 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3138 { 3139 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3140 } 3141 3142 void 3143 spa_upgrade(spa_t *spa) 3144 { 3145 spa_config_enter(spa, RW_WRITER, FTAG); 3146 3147 /* 3148 * This should only be called for a non-faulted pool, and since a 3149 * future version would result in an unopenable pool, this shouldn't be 3150 * possible. 
3151 */ 3152 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3153 3154 spa->spa_uberblock.ub_version = ZFS_VERSION; 3155 vdev_config_dirty(spa->spa_root_vdev); 3156 3157 spa_config_exit(spa, FTAG); 3158 3159 txg_wait_synced(spa_get_dsl(spa), 0); 3160 } 3161 3162 boolean_t 3163 spa_has_spare(spa_t *spa, uint64_t guid) 3164 { 3165 int i; 3166 uint64_t spareguid; 3167 3168 for (i = 0; i < spa->spa_nspares; i++) 3169 if (spa->spa_spares[i]->vdev_guid == guid) 3170 return (B_TRUE); 3171 3172 for (i = 0; i < spa->spa_pending_nspares; i++) { 3173 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3174 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3175 spareguid == guid) 3176 return (B_TRUE); 3177 } 3178 3179 return (B_FALSE); 3180 } 3181 3182 int 3183 spa_set_props(spa_t *spa, nvlist_t *nvp) 3184 { 3185 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3186 spa, nvp, 3)); 3187 } 3188 3189 int 3190 spa_get_props(spa_t *spa, nvlist_t **nvp) 3191 { 3192 zap_cursor_t zc; 3193 zap_attribute_t za; 3194 objset_t *mos = spa->spa_meta_objset; 3195 zfs_source_t src; 3196 zfs_prop_t prop; 3197 nvlist_t *propval; 3198 uint64_t value; 3199 int err; 3200 3201 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3202 3203 mutex_enter(&spa->spa_props_lock); 3204 /* If no props object, then just return empty nvlist */ 3205 if (spa->spa_pool_props_object == 0) { 3206 mutex_exit(&spa->spa_props_lock); 3207 return (0); 3208 } 3209 3210 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3211 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3212 zap_cursor_advance(&zc)) { 3213 3214 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3215 continue; 3216 3217 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3218 switch (za.za_integer_length) { 3219 case 8: 3220 if (zfs_prop_default_numeric(prop) == 3221 za.za_first_integer) 3222 src = ZFS_SRC_DEFAULT; 3223 else 3224 src = ZFS_SRC_LOCAL; 3225 value = za.za_first_integer; 3226 3227 if (prop == ZFS_PROP_BOOTFS) { 3228 dsl_pool_t *dp; 3229 dsl_dataset_t *ds = NULL; 3230 char strval[MAXPATHLEN]; 3231 3232 dp = spa_get_dsl(spa); 3233 rw_enter(&dp->dp_config_rwlock, RW_READER); 3234 if ((err = dsl_dataset_open_obj(dp, 3235 za.za_first_integer, NULL, DS_MODE_NONE, 3236 FTAG, &ds)) != 0) { 3237 rw_exit(&dp->dp_config_rwlock); 3238 break; 3239 } 3240 dsl_dataset_name(ds, strval); 3241 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3242 rw_exit(&dp->dp_config_rwlock); 3243 3244 VERIFY(nvlist_add_uint64(propval, 3245 ZFS_PROP_SOURCE, src) == 0); 3246 VERIFY(nvlist_add_string(propval, 3247 ZFS_PROP_VALUE, strval) == 0); 3248 } else { 3249 VERIFY(nvlist_add_uint64(propval, 3250 ZFS_PROP_SOURCE, src) == 0); 3251 VERIFY(nvlist_add_uint64(propval, 3252 ZFS_PROP_VALUE, value) == 0); 3253 } 3254 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3255 propval) == 0); 3256 break; 3257 } 3258 nvlist_free(propval); 3259 } 3260 zap_cursor_fini(&zc); 3261 mutex_exit(&spa->spa_props_lock); 3262 if (err && err != ENOENT) { 3263 nvlist_free(*nvp); 3264 return (err); 3265 } 3266 3267 return (0); 3268 } 3269 3270 /* 3271 * If the bootfs property value is dsobj, clear it. 3272 */ 3273 void 3274 spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3275 { 3276 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3277 VERIFY(zap_remove(spa->spa_meta_objset, 3278 spa->spa_pool_props_object, 3279 zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3280 spa->spa_bootfs = 0; 3281 } 3282 } 3283
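
/*
 * Usage sketch for the property interfaces above (illustrative only, not
 * part of the original file): on success, spa_get_props() hands back an
 * nvlist the caller owns and must free, e.g.
 *
 *	nvlist_t *props = NULL;
 *	nvpair_t *elem = NULL;
 *
 *	if (spa_get_props(spa, &props) == 0) {
 *		while ((elem = nvlist_next_nvpair(props, elem)) != NULL)
 *			cmn_err(CE_NOTE, "pool prop: %s", nvpair_name(elem));
 *		nvlist_free(props);
 *	}
 *
 * On failure the routine frees the partially built nvlist itself.
 */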