/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

int zio_taskq_threads = 8;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
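 *
 * As a rough sketch only (error handling omitted; see the real callers
 * below for the exact sequences), the activate/deactivate lifecycle used
 * by the create, import, and tryimport paths looks like:
 *
 *	mutex_enter(&spa_namespace_lock);
 *	spa = spa_add(pool, altroot);	/* allocate an uninitialized spa_t */
 *	spa_activate(spa);		/* taskqs, locks, lists, classes */
 *	...				/* spa_load() or dsl_pool_create() */
 *	spa_unload(spa);		/* undo spa_load() */
 *	spa_deactivate(spa);		/* undo spa_activate() */
 *	spa_remove(spa);
 *	mutex_exit(&spa_namespace_lock);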
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
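 *
 * For reference, the nvlist handed to this routine describes the tree
 * directly; a two-way mirror, for example, looks roughly like the
 * following (keys abbreviated, device paths purely illustrative):
 *
 *	type: "root"
 *	children[0]:
 *		type: "mirror"
 *		children[0]: { type: "disk", path: "/dev/dsk/c1t0d0s0", ... }
 *		children[1]: { type: "disk", path: "/dev/dsk/c1t1d0s0", ... }
 *
 * Interior nodes are handled by recursing on the ZPOOL_CONFIG_CHILDREN
 * array; leaf vdevs terminate the recursion.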
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different
	 * vdev_t structures associated with it: one in the list of spares
	 * (used only for basic validation purposes) and one in the active
	 * vdev configuration (if it's spared in).  During this phase we open
	 * and validate each vdev on the spare list.
	 * If the vdev also exists in the active configuration, then we also
	 * mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
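 *
 * In rough outline (details and error handling below), loading a pool means:
 * parse the config into a vdev tree, open the vdevs and validate their
 * labels, pick the best uberblock, open the DSL pool rooted at that
 * uberblock, pull the spares/error-log/history/property objects out of the
 * MOS, and finally (for writable opens) claim any intent-log blocks and
 * start the sync thread.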
451 */ 452 static int 453 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 454 { 455 int error = 0; 456 nvlist_t *nvroot = NULL; 457 vdev_t *rvd; 458 uberblock_t *ub = &spa->spa_uberblock; 459 uint64_t config_cache_txg = spa->spa_config_txg; 460 uint64_t pool_guid; 461 uint64_t version; 462 zio_t *zio; 463 uint64_t autoreplace = 0; 464 465 spa->spa_load_state = state; 466 467 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 468 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 469 error = EINVAL; 470 goto out; 471 } 472 473 /* 474 * Versioning wasn't explicitly added to the label until later, so if 475 * it's not present treat it as the initial version. 476 */ 477 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 478 version = SPA_VERSION_INITIAL; 479 480 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 481 &spa->spa_config_txg); 482 483 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 484 spa_guid_exists(pool_guid, 0)) { 485 error = EEXIST; 486 goto out; 487 } 488 489 spa->spa_load_guid = pool_guid; 490 491 /* 492 * Parse the configuration into a vdev tree. We explicitly set the 493 * value that will be returned by spa_version() since parsing the 494 * configuration requires knowing the version number. 495 */ 496 spa_config_enter(spa, RW_WRITER, FTAG); 497 spa->spa_ubsync.ub_version = version; 498 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 499 spa_config_exit(spa, FTAG); 500 501 if (error != 0) 502 goto out; 503 504 ASSERT(spa->spa_root_vdev == rvd); 505 ASSERT(spa_guid(spa) == pool_guid); 506 507 /* 508 * Try to open all vdevs, loading each label in the process. 509 */ 510 error = vdev_open(rvd); 511 if (error != 0) 512 goto out; 513 514 /* 515 * Validate the labels for all leaf vdevs. We need to grab the config 516 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 517 * flag. 518 */ 519 spa_config_enter(spa, RW_READER, FTAG); 520 error = vdev_validate(rvd); 521 spa_config_exit(spa, FTAG); 522 523 if (error != 0) 524 goto out; 525 526 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 527 error = ENXIO; 528 goto out; 529 } 530 531 /* 532 * Find the best uberblock. 533 */ 534 bzero(ub, sizeof (uberblock_t)); 535 536 zio = zio_root(spa, NULL, NULL, 537 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 538 vdev_uberblock_load(zio, rvd, ub); 539 error = zio_wait(zio); 540 541 /* 542 * If we weren't able to find a single valid uberblock, return failure. 543 */ 544 if (ub->ub_txg == 0) { 545 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 546 VDEV_AUX_CORRUPT_DATA); 547 error = ENXIO; 548 goto out; 549 } 550 551 /* 552 * If the pool is newer than the code, we can't open it. 553 */ 554 if (ub->ub_version > SPA_VERSION) { 555 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 556 VDEV_AUX_VERSION_NEWER); 557 error = ENOTSUP; 558 goto out; 559 } 560 561 /* 562 * If the vdev guid sum doesn't match the uberblock, we have an 563 * incomplete configuration. 564 */ 565 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 566 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 567 VDEV_AUX_BAD_GUID_SUM); 568 error = ENXIO; 569 goto out; 570 } 571 572 /* 573 * Initialize internal SPA structures. 
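	 * This marks the pool ACTIVE, records the first txg we may write to
	 * (one past the last synced txg in the chosen uberblock), and opens
	 * the DSL pool so that spa_meta_objset points at the MOS.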
574 */ 575 spa->spa_state = POOL_STATE_ACTIVE; 576 spa->spa_ubsync = spa->spa_uberblock; 577 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 578 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 579 if (error) { 580 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 581 VDEV_AUX_CORRUPT_DATA); 582 goto out; 583 } 584 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 585 586 if (zap_lookup(spa->spa_meta_objset, 587 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 588 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 589 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 590 VDEV_AUX_CORRUPT_DATA); 591 error = EIO; 592 goto out; 593 } 594 595 if (!mosconfig) { 596 nvlist_t *newconfig; 597 uint64_t hostid; 598 599 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 600 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 601 VDEV_AUX_CORRUPT_DATA); 602 error = EIO; 603 goto out; 604 } 605 606 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 607 &hostid) == 0) { 608 char *hostname; 609 unsigned long myhostid = 0; 610 611 VERIFY(nvlist_lookup_string(newconfig, 612 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 613 614 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 615 if (hostid != 0 && myhostid != 0 && 616 (unsigned long)hostid != myhostid) { 617 cmn_err(CE_WARN, "pool '%s' could not be " 618 "loaded as it was last accessed by " 619 "another system (host: %s hostid: 0x%lx). " 620 "See: http://www.sun.com/msg/ZFS-8000-EY", 621 spa->spa_name, hostname, 622 (unsigned long)hostid); 623 error = EBADF; 624 goto out; 625 } 626 } 627 628 spa_config_set(spa, newconfig); 629 spa_unload(spa); 630 spa_deactivate(spa); 631 spa_activate(spa); 632 633 return (spa_load(spa, newconfig, state, B_TRUE)); 634 } 635 636 if (zap_lookup(spa->spa_meta_objset, 637 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 638 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 639 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 640 VDEV_AUX_CORRUPT_DATA); 641 error = EIO; 642 goto out; 643 } 644 645 /* 646 * Load the bit that tells us to use the new accounting function 647 * (raid-z deflation). If we have an older pool, this will not 648 * be present. 649 */ 650 error = zap_lookup(spa->spa_meta_objset, 651 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 652 sizeof (uint64_t), 1, &spa->spa_deflate); 653 if (error != 0 && error != ENOENT) { 654 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 655 VDEV_AUX_CORRUPT_DATA); 656 error = EIO; 657 goto out; 658 } 659 660 /* 661 * Load the persistent error log. If we have an older pool, this will 662 * not be present. 663 */ 664 error = zap_lookup(spa->spa_meta_objset, 665 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 666 sizeof (uint64_t), 1, &spa->spa_errlog_last); 667 if (error != 0 && error != ENOENT) { 668 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 669 VDEV_AUX_CORRUPT_DATA); 670 error = EIO; 671 goto out; 672 } 673 674 error = zap_lookup(spa->spa_meta_objset, 675 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 676 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 677 if (error != 0 && error != ENOENT) { 678 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 679 VDEV_AUX_CORRUPT_DATA); 680 error = EIO; 681 goto out; 682 } 683 684 /* 685 * Load the history object. If we have an older pool, this 686 * will not be present. 
687 */ 688 error = zap_lookup(spa->spa_meta_objset, 689 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 690 sizeof (uint64_t), 1, &spa->spa_history); 691 if (error != 0 && error != ENOENT) { 692 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 693 VDEV_AUX_CORRUPT_DATA); 694 error = EIO; 695 goto out; 696 } 697 698 /* 699 * Load any hot spares for this pool. 700 */ 701 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 702 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 703 if (error != 0 && error != ENOENT) { 704 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 705 VDEV_AUX_CORRUPT_DATA); 706 error = EIO; 707 goto out; 708 } 709 if (error == 0) { 710 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 711 if (load_nvlist(spa, spa->spa_spares_object, 712 &spa->spa_sparelist) != 0) { 713 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 714 VDEV_AUX_CORRUPT_DATA); 715 error = EIO; 716 goto out; 717 } 718 719 spa_config_enter(spa, RW_WRITER, FTAG); 720 spa_load_spares(spa); 721 spa_config_exit(spa, FTAG); 722 } 723 724 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 725 726 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 727 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 728 729 if (error && error != ENOENT) { 730 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 731 VDEV_AUX_CORRUPT_DATA); 732 error = EIO; 733 goto out; 734 } 735 736 if (error == 0) { 737 (void) zap_lookup(spa->spa_meta_objset, 738 spa->spa_pool_props_object, 739 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 740 sizeof (uint64_t), 1, &spa->spa_bootfs); 741 (void) zap_lookup(spa->spa_meta_objset, 742 spa->spa_pool_props_object, 743 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 744 sizeof (uint64_t), 1, &autoreplace); 745 (void) zap_lookup(spa->spa_meta_objset, 746 spa->spa_pool_props_object, 747 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 748 sizeof (uint64_t), 1, &spa->spa_delegation); 749 } 750 751 /* 752 * If the 'autoreplace' property is set, then post a resource notifying 753 * the ZFS DE that it should not issue any faults for unopenable 754 * devices. We also iterate over the vdevs, and post a sysevent for any 755 * unopenable vdevs so that the normal autoreplace handler can take 756 * over. 757 */ 758 if (autoreplace) 759 spa_check_removed(spa->spa_root_vdev); 760 761 /* 762 * Load the vdev state for all toplevel vdevs. 763 */ 764 vdev_load(rvd); 765 766 /* 767 * Propagate the leaf DTLs we just loaded all the way up the tree. 768 */ 769 spa_config_enter(spa, RW_WRITER, FTAG); 770 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 771 spa_config_exit(spa, FTAG); 772 773 /* 774 * Check the state of the root vdev. If it can't be opened, it 775 * indicates one or more toplevel vdevs are faulted. 776 */ 777 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 778 error = ENXIO; 779 goto out; 780 } 781 782 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 783 dmu_tx_t *tx; 784 int need_update = B_FALSE; 785 int c; 786 787 /* 788 * Claim log blocks that haven't been committed yet. 789 * This must all happen in a single txg. 790 */ 791 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 792 spa_first_txg(spa)); 793 (void) dmu_objset_find(spa->spa_name, 794 zil_claim, tx, DS_FIND_CHILDREN); 795 dmu_tx_commit(tx); 796 797 spa->spa_sync_on = B_TRUE; 798 txg_sync_start(spa->spa_dsl_pool); 799 800 /* 801 * Wait for all claims to sync. 
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
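			 * (This is, for example, what lets spa_get_stats()
			 * and the zpool(1M) status/import paths report
			 * per-vdev failure detail for a pool that cannot
			 * actually be opened.)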
902 */ 903 if (config != NULL && spa->spa_root_vdev != NULL) { 904 spa_config_enter(spa, RW_READER, FTAG); 905 *config = spa_config_generate(spa, NULL, -1ULL, 906 B_TRUE); 907 spa_config_exit(spa, FTAG); 908 } 909 spa_unload(spa); 910 spa_deactivate(spa); 911 spa->spa_last_open_failed = B_TRUE; 912 if (locked) 913 mutex_exit(&spa_namespace_lock); 914 *spapp = NULL; 915 return (error); 916 } else { 917 zfs_post_ok(spa, NULL); 918 spa->spa_last_open_failed = B_FALSE; 919 } 920 921 loaded = B_TRUE; 922 } 923 924 spa_open_ref(spa, tag); 925 926 /* 927 * If we just loaded the pool, resilver anything that's out of date. 928 */ 929 if (loaded && (spa_mode & FWRITE)) 930 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 931 932 if (locked) 933 mutex_exit(&spa_namespace_lock); 934 935 *spapp = spa; 936 937 if (config != NULL) { 938 spa_config_enter(spa, RW_READER, FTAG); 939 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 940 spa_config_exit(spa, FTAG); 941 } 942 943 return (0); 944 } 945 946 int 947 spa_open(const char *name, spa_t **spapp, void *tag) 948 { 949 return (spa_open_common(name, spapp, tag, NULL)); 950 } 951 952 /* 953 * Lookup the given spa_t, incrementing the inject count in the process, 954 * preventing it from being exported or destroyed. 955 */ 956 spa_t * 957 spa_inject_addref(char *name) 958 { 959 spa_t *spa; 960 961 mutex_enter(&spa_namespace_lock); 962 if ((spa = spa_lookup(name)) == NULL) { 963 mutex_exit(&spa_namespace_lock); 964 return (NULL); 965 } 966 spa->spa_inject_ref++; 967 mutex_exit(&spa_namespace_lock); 968 969 return (spa); 970 } 971 972 void 973 spa_inject_delref(spa_t *spa) 974 { 975 mutex_enter(&spa_namespace_lock); 976 spa->spa_inject_ref--; 977 mutex_exit(&spa_namespace_lock); 978 } 979 980 static void 981 spa_add_spares(spa_t *spa, nvlist_t *config) 982 { 983 nvlist_t **spares; 984 uint_t i, nspares; 985 nvlist_t *nvroot; 986 uint64_t guid; 987 vdev_stat_t *vs; 988 uint_t vsc; 989 uint64_t pool; 990 991 if (spa->spa_nspares == 0) 992 return; 993 994 VERIFY(nvlist_lookup_nvlist(config, 995 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 996 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 997 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 998 if (nspares != 0) { 999 VERIFY(nvlist_add_nvlist_array(nvroot, 1000 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1001 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1002 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1003 1004 /* 1005 * Go through and find any spares which have since been 1006 * repurposed as an active spare. If this is the case, update 1007 * their status appropriately. 1008 */ 1009 for (i = 0; i < nspares; i++) { 1010 VERIFY(nvlist_lookup_uint64(spares[i], 1011 ZPOOL_CONFIG_GUID, &guid) == 0); 1012 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 1013 VERIFY(nvlist_lookup_uint64_array( 1014 spares[i], ZPOOL_CONFIG_STATS, 1015 (uint64_t **)&vs, &vsc) == 0); 1016 vs->vs_state = VDEV_STATE_CANT_OPEN; 1017 vs->vs_aux = VDEV_AUX_SPARED; 1018 } 1019 } 1020 } 1021 } 1022 1023 int 1024 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1025 { 1026 int error; 1027 spa_t *spa; 1028 1029 *config = NULL; 1030 error = spa_open_common(name, &spa, FTAG, config); 1031 1032 if (spa && *config != NULL) { 1033 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1034 spa_get_errlog_size(spa)) == 0); 1035 1036 spa_add_spares(spa, *config); 1037 } 1038 1039 /* 1040 * We want to get the alternate root even for faulted pools, so we cheat 1041 * and call spa_lookup() directly. 
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.  If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < SPA_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = SPA_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
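	 * (Roughly: parse the caller-supplied nvroot, have vdev_create()
	 * write the initial labels, validate any requested spares,
	 * initialize the top-level metaslab arrays via vdev_init(), and
	 * dirty the config so the first sync writes it out.)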
1172 */ 1173 spa_config_enter(spa, RW_WRITER, FTAG); 1174 1175 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1176 1177 ASSERT(error != 0 || rvd != NULL); 1178 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1179 1180 if (error == 0 && rvd->vdev_children == 0) 1181 error = EINVAL; 1182 1183 if (error == 0 && 1184 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1185 (error = spa_validate_spares(spa, nvroot, txg, 1186 VDEV_ALLOC_ADD)) == 0) { 1187 for (c = 0; c < rvd->vdev_children; c++) 1188 vdev_init(rvd->vdev_child[c], txg); 1189 vdev_config_dirty(rvd); 1190 } 1191 1192 spa_config_exit(spa, FTAG); 1193 1194 if (error != 0) { 1195 spa_unload(spa); 1196 spa_deactivate(spa); 1197 spa_remove(spa); 1198 mutex_exit(&spa_namespace_lock); 1199 return (error); 1200 } 1201 1202 /* 1203 * Get the list of spares, if specified. 1204 */ 1205 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1206 &spares, &nspares) == 0) { 1207 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1208 KM_SLEEP) == 0); 1209 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1210 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1211 spa_config_enter(spa, RW_WRITER, FTAG); 1212 spa_load_spares(spa); 1213 spa_config_exit(spa, FTAG); 1214 spa->spa_sync_spares = B_TRUE; 1215 } 1216 1217 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1218 spa->spa_meta_objset = dp->dp_meta_objset; 1219 1220 tx = dmu_tx_create_assigned(dp, txg); 1221 1222 /* 1223 * Create the pool config object. 1224 */ 1225 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1226 DMU_OT_PACKED_NVLIST, 1 << 14, 1227 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1228 1229 if (zap_add(spa->spa_meta_objset, 1230 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1231 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1232 cmn_err(CE_PANIC, "failed to add pool config"); 1233 } 1234 1235 /* Newly created pools are always deflated. */ 1236 spa->spa_deflate = TRUE; 1237 if (zap_add(spa->spa_meta_objset, 1238 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1239 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1240 cmn_err(CE_PANIC, "failed to add deflate"); 1241 } 1242 1243 /* 1244 * Create the deferred-free bplist object. Turn off compression 1245 * because sync-to-convergence takes longer if the blocksize 1246 * keeps changing. 1247 */ 1248 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1249 1 << 14, tx); 1250 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1251 ZIO_COMPRESS_OFF, tx); 1252 1253 if (zap_add(spa->spa_meta_objset, 1254 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1255 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1256 cmn_err(CE_PANIC, "failed to add bplist"); 1257 } 1258 1259 /* 1260 * Create the pool's history object. 1261 */ 1262 spa_history_create_obj(spa, tx); 1263 1264 dmu_tx_commit(tx); 1265 1266 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 1267 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1268 spa->spa_sync_on = B_TRUE; 1269 txg_sync_start(spa->spa_dsl_pool); 1270 1271 /* 1272 * We explicitly wait for the first transaction to complete so that our 1273 * bean counters are appropriately updated. 1274 */ 1275 txg_wait_synced(spa->spa_dsl_pool, txg); 1276 1277 spa_config_sync(); 1278 1279 mutex_exit(&spa_namespace_lock); 1280 1281 return (0); 1282 } 1283 1284 /* 1285 * Import the given pool into the system. 
We set up the necessary spa_t and 1286 * then call spa_load() to do the dirty work. 1287 */ 1288 int 1289 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1290 { 1291 spa_t *spa; 1292 int error; 1293 nvlist_t *nvroot; 1294 nvlist_t **spares; 1295 uint_t nspares; 1296 1297 /* 1298 * If a pool with this name exists, return failure. 1299 */ 1300 mutex_enter(&spa_namespace_lock); 1301 if (spa_lookup(pool) != NULL) { 1302 mutex_exit(&spa_namespace_lock); 1303 return (EEXIST); 1304 } 1305 1306 /* 1307 * Create and initialize the spa structure. 1308 */ 1309 spa = spa_add(pool, altroot); 1310 spa_activate(spa); 1311 1312 /* 1313 * Pass off the heavy lifting to spa_load(). 1314 * Pass TRUE for mosconfig because the user-supplied config 1315 * is actually the one to trust when doing an import. 1316 */ 1317 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1318 1319 spa_config_enter(spa, RW_WRITER, FTAG); 1320 /* 1321 * Toss any existing sparelist, as it doesn't have any validity anymore, 1322 * and conflicts with spa_has_spare(). 1323 */ 1324 if (spa->spa_sparelist) { 1325 nvlist_free(spa->spa_sparelist); 1326 spa->spa_sparelist = NULL; 1327 spa_load_spares(spa); 1328 } 1329 1330 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1331 &nvroot) == 0); 1332 if (error == 0) 1333 error = spa_validate_spares(spa, nvroot, -1ULL, 1334 VDEV_ALLOC_SPARE); 1335 spa_config_exit(spa, FTAG); 1336 1337 if (error != 0) { 1338 spa_unload(spa); 1339 spa_deactivate(spa); 1340 spa_remove(spa); 1341 mutex_exit(&spa_namespace_lock); 1342 return (error); 1343 } 1344 1345 /* 1346 * Override any spares as specified by the user, as these may have 1347 * correct device names/devids, etc. 1348 */ 1349 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1350 &spares, &nspares) == 0) { 1351 if (spa->spa_sparelist) 1352 VERIFY(nvlist_remove(spa->spa_sparelist, 1353 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1354 else 1355 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1356 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1357 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1358 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1359 spa_config_enter(spa, RW_WRITER, FTAG); 1360 spa_load_spares(spa); 1361 spa_config_exit(spa, FTAG); 1362 spa->spa_sync_spares = B_TRUE; 1363 } 1364 1365 /* 1366 * Update the config cache to include the newly-imported pool. 1367 */ 1368 if (spa_mode & FWRITE) 1369 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1370 1371 /* 1372 * Resilver anything that's out of date. 1373 */ 1374 if (spa_mode & FWRITE) 1375 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1376 1377 mutex_exit(&spa_namespace_lock); 1378 1379 return (0); 1380 } 1381 1382 /* 1383 * This (illegal) pool name is used when temporarily importing a spa_t in order 1384 * to get the vdev stats associated with the imported devices. 1385 */ 1386 #define TRYIMPORT_NAME "$import" 1387 1388 nvlist_t * 1389 spa_tryimport(nvlist_t *tryconfig) 1390 { 1391 nvlist_t *config = NULL; 1392 char *poolname; 1393 spa_t *spa; 1394 uint64_t state; 1395 1396 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1397 return (NULL); 1398 1399 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1400 return (NULL); 1401 1402 /* 1403 * Create and initialize the spa structure. 1404 */ 1405 mutex_enter(&spa_namespace_lock); 1406 spa = spa_add(TRYIMPORT_NAME, NULL); 1407 spa_activate(spa); 1408 1409 /* 1410 * Pass off the heavy lifting to spa_load(). 
1411 * Pass TRUE for mosconfig because the user-supplied config 1412 * is actually the one to trust when doing an import. 1413 */ 1414 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1415 1416 /* 1417 * If 'tryconfig' was at least parsable, return the current config. 1418 */ 1419 if (spa->spa_root_vdev != NULL) { 1420 spa_config_enter(spa, RW_READER, FTAG); 1421 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1422 spa_config_exit(spa, FTAG); 1423 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1424 poolname) == 0); 1425 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1426 state) == 0); 1427 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1428 spa->spa_uberblock.ub_timestamp) == 0); 1429 1430 /* 1431 * Add the list of hot spares. 1432 */ 1433 spa_add_spares(spa, config); 1434 } 1435 1436 spa_unload(spa); 1437 spa_deactivate(spa); 1438 spa_remove(spa); 1439 mutex_exit(&spa_namespace_lock); 1440 1441 return (config); 1442 } 1443 1444 /* 1445 * Pool export/destroy 1446 * 1447 * The act of destroying or exporting a pool is very simple. We make sure there 1448 * is no more pending I/O and any references to the pool are gone. Then, we 1449 * update the pool state and sync all the labels to disk, removing the 1450 * configuration from the cache afterwards. 1451 */ 1452 static int 1453 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1454 { 1455 spa_t *spa; 1456 1457 if (oldconfig) 1458 *oldconfig = NULL; 1459 1460 if (!(spa_mode & FWRITE)) 1461 return (EROFS); 1462 1463 mutex_enter(&spa_namespace_lock); 1464 if ((spa = spa_lookup(pool)) == NULL) { 1465 mutex_exit(&spa_namespace_lock); 1466 return (ENOENT); 1467 } 1468 1469 /* 1470 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1471 * reacquire the namespace lock, and see if we can export. 1472 */ 1473 spa_open_ref(spa, FTAG); 1474 mutex_exit(&spa_namespace_lock); 1475 spa_async_suspend(spa); 1476 mutex_enter(&spa_namespace_lock); 1477 spa_close(spa, FTAG); 1478 1479 /* 1480 * The pool will be in core if it's openable, 1481 * in which case we can modify its state. 1482 */ 1483 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1484 /* 1485 * Objsets may be open only because they're dirty, so we 1486 * have to force it to sync before checking spa_refcnt. 1487 */ 1488 spa_scrub_suspend(spa); 1489 txg_wait_synced(spa->spa_dsl_pool, 0); 1490 1491 /* 1492 * A pool cannot be exported or destroyed if there are active 1493 * references. If we are resetting a pool, allow references by 1494 * fault injection handlers. 1495 */ 1496 if (!spa_refcount_zero(spa) || 1497 (spa->spa_inject_ref != 0 && 1498 new_state != POOL_STATE_UNINITIALIZED)) { 1499 spa_scrub_resume(spa); 1500 spa_async_resume(spa); 1501 mutex_exit(&spa_namespace_lock); 1502 return (EBUSY); 1503 } 1504 1505 spa_scrub_resume(spa); 1506 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1507 1508 /* 1509 * We want this to be reflected on every label, 1510 * so mark them all dirty. spa_unload() will do the 1511 * final sync that pushes these changes out. 
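		 * (The state written here is what a later 'zpool import'
		 * reads back from the labels to tell a cleanly exported or
		 * destroyed pool from one that may still be in use.)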
1512 */ 1513 if (new_state != POOL_STATE_UNINITIALIZED) { 1514 spa_config_enter(spa, RW_WRITER, FTAG); 1515 spa->spa_state = new_state; 1516 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1517 vdev_config_dirty(spa->spa_root_vdev); 1518 spa_config_exit(spa, FTAG); 1519 } 1520 } 1521 1522 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 1523 1524 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1525 spa_unload(spa); 1526 spa_deactivate(spa); 1527 } 1528 1529 if (oldconfig && spa->spa_config) 1530 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1531 1532 if (new_state != POOL_STATE_UNINITIALIZED) { 1533 spa_remove(spa); 1534 spa_config_sync(); 1535 } 1536 mutex_exit(&spa_namespace_lock); 1537 1538 return (0); 1539 } 1540 1541 /* 1542 * Destroy a storage pool. 1543 */ 1544 int 1545 spa_destroy(char *pool) 1546 { 1547 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1548 } 1549 1550 /* 1551 * Export a storage pool. 1552 */ 1553 int 1554 spa_export(char *pool, nvlist_t **oldconfig) 1555 { 1556 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1557 } 1558 1559 /* 1560 * Similar to spa_export(), this unloads the spa_t without actually removing it 1561 * from the namespace in any way. 1562 */ 1563 int 1564 spa_reset(char *pool) 1565 { 1566 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1567 } 1568 1569 1570 /* 1571 * ========================================================================== 1572 * Device manipulation 1573 * ========================================================================== 1574 */ 1575 1576 /* 1577 * Add a device to a storage pool. 1578 */ 1579 int 1580 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1581 { 1582 uint64_t txg; 1583 int c, error; 1584 vdev_t *rvd = spa->spa_root_vdev; 1585 vdev_t *vd, *tvd; 1586 nvlist_t **spares; 1587 uint_t i, nspares; 1588 1589 txg = spa_vdev_enter(spa); 1590 1591 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1592 VDEV_ALLOC_ADD)) != 0) 1593 return (spa_vdev_exit(spa, NULL, txg, error)); 1594 1595 spa->spa_pending_vdev = vd; 1596 1597 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1598 &spares, &nspares) != 0) 1599 nspares = 0; 1600 1601 if (vd->vdev_children == 0 && nspares == 0) { 1602 spa->spa_pending_vdev = NULL; 1603 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1604 } 1605 1606 if (vd->vdev_children != 0) { 1607 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1608 spa->spa_pending_vdev = NULL; 1609 return (spa_vdev_exit(spa, vd, txg, error)); 1610 } 1611 } 1612 1613 /* 1614 * We must validate the spares after checking the children. Otherwise, 1615 * vdev_inuse() will blindly overwrite the spare. 1616 */ 1617 if ((error = spa_validate_spares(spa, nvroot, txg, 1618 VDEV_ALLOC_ADD)) != 0) { 1619 spa->spa_pending_vdev = NULL; 1620 return (spa_vdev_exit(spa, vd, txg, error)); 1621 } 1622 1623 spa->spa_pending_vdev = NULL; 1624 1625 /* 1626 * Transfer each new top-level vdev from vd to rvd. 
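	 * Each child is reparented under the root vdev and takes the next
	 * available top-level id (rvd->vdev_children at the time it is
	 * added), then is marked dirty so the new layout is synced out.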
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		if (spa->spa_sparelist != NULL) {
			nvlist_t **oldspares;
			uint_t oldnspares;
			nvlist_t **newspares;

			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

			newspares = kmem_alloc(sizeof (void *) *
			    (nspares + oldnspares), KM_SLEEP);
			for (i = 0; i < oldnspares; i++)
				VERIFY(nvlist_dup(oldspares[i],
				    &newspares[i], KM_SLEEP) == 0);
			for (i = 0; i < nspares; i++)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[i + oldnspares],
				    KM_SLEEP) == 0);

			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, newspares,
			    nspares + oldnspares) == 0);
			for (i = 0; i < oldnspares + nspares; i++)
				nvlist_free(newspares[i]);
			kmem_free(newspares, (oldnspares + nspares) *
			    sizeof (void *));
		} else {
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		}

		spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
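 *
 * As an illustrative sketch (device names made up), attaching c3d0 to a
 * previously unmirrored disk c2d0 turns
 *
 *	root: disk c2d0
 * into
 *	root: mirror(c2d0, c3d0)
 *
 * whereas a replacing attach inserts replacing(c2d0, c3d0) instead, and
 * c2d0 is detached once the resilver of c3d0 completes.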
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	int is_log;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	is_log = oldvd->vdev_islog;
	if (is_log && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.  The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
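	 * For example (path purely illustrative), replacing /dev/dsk/c2t0d0s0
	 * with itself renames the old vdev's path to /dev/dsk/c2t0d0s0/old
	 * and discards its devid, so only the new vdev can be opened by that
	 * path from here on.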
1812 */ 1813 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1814 spa_strfree(oldvd->vdev_path); 1815 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1816 KM_SLEEP); 1817 (void) sprintf(oldvd->vdev_path, "%s/%s", 1818 newvd->vdev_path, "old"); 1819 if (oldvd->vdev_devid != NULL) { 1820 spa_strfree(oldvd->vdev_devid); 1821 oldvd->vdev_devid = NULL; 1822 } 1823 } 1824 1825 /* 1826 * If the parent is not a mirror, or if we're replacing, insert the new 1827 * mirror/replacing/spare vdev above oldvd. 1828 */ 1829 if (pvd->vdev_ops != pvops) 1830 pvd = vdev_add_parent(oldvd, pvops); 1831 1832 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1833 ASSERT(pvd->vdev_ops == pvops); 1834 ASSERT(oldvd->vdev_parent == pvd); 1835 1836 /* 1837 * Extract the new device from its root and add it to pvd. 1838 */ 1839 vdev_remove_child(newrootvd, newvd); 1840 newvd->vdev_id = pvd->vdev_children; 1841 vdev_add_child(pvd, newvd); 1842 1843 /* 1844 * If newvd is smaller than oldvd, but larger than its rsize, 1845 * the addition of newvd may have decreased our parent's asize. 1846 */ 1847 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1848 1849 tvd = newvd->vdev_top; 1850 ASSERT(pvd->vdev_top == tvd); 1851 ASSERT(tvd->vdev_parent == rvd); 1852 1853 vdev_config_dirty(tvd); 1854 1855 /* 1856 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1857 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1858 */ 1859 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1860 1861 mutex_enter(&newvd->vdev_dtl_lock); 1862 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1863 open_txg - TXG_INITIAL + 1); 1864 mutex_exit(&newvd->vdev_dtl_lock); 1865 1866 if (newvd->vdev_isspare) 1867 spa_spare_activate(newvd); 1868 1869 /* 1870 * Mark newvd's DTL dirty in this txg. 1871 */ 1872 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1873 1874 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1875 1876 /* 1877 * Kick off a resilver to update newvd. We need to grab the namespace 1878 * lock because spa_scrub() needs to post a sysevent with the pool name. 1879 */ 1880 mutex_enter(&spa_namespace_lock); 1881 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1882 mutex_exit(&spa_namespace_lock); 1883 1884 return (0); 1885 } 1886 1887 /* 1888 * Detach a device from a mirror or replacing vdev. 1889 * If 'replace_done' is specified, only detach if the parent 1890 * is a replacing vdev. 1891 */ 1892 int 1893 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1894 { 1895 uint64_t txg; 1896 int c, t, error; 1897 vdev_t *rvd = spa->spa_root_vdev; 1898 vdev_t *vd, *pvd, *cvd, *tvd; 1899 boolean_t unspare = B_FALSE; 1900 uint64_t unspare_guid; 1901 1902 txg = spa_vdev_enter(spa); 1903 1904 vd = vdev_lookup_by_guid(rvd, guid); 1905 1906 if (vd == NULL) 1907 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1908 1909 if (!vd->vdev_ops->vdev_op_leaf) 1910 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1911 1912 pvd = vd->vdev_parent; 1913 1914 /* 1915 * If replace_done is specified, only remove this device if it's 1916 * the first child of a replacing vdev. For the 'spare' vdev, either 1917 * disk can be removed. 
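	 * (This is the mode used by spa_vdev_resilver_done() below once a
	 * replacement or spare has finished resilvering.)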
1918 */ 1919 if (replace_done) { 1920 if (pvd->vdev_ops == &vdev_replacing_ops) { 1921 if (vd->vdev_id != 0) 1922 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1923 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1924 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1925 } 1926 } 1927 1928 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1929 spa_version(spa) >= SPA_VERSION_SPARES); 1930 1931 /* 1932 * Only mirror, replacing, and spare vdevs support detach. 1933 */ 1934 if (pvd->vdev_ops != &vdev_replacing_ops && 1935 pvd->vdev_ops != &vdev_mirror_ops && 1936 pvd->vdev_ops != &vdev_spare_ops) 1937 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1938 1939 /* 1940 * If there's only one replica, you can't detach it. 1941 */ 1942 if (pvd->vdev_children <= 1) 1943 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1944 1945 /* 1946 * If all siblings have non-empty DTLs, this device may have the only 1947 * valid copy of the data, which means we cannot safely detach it. 1948 * 1949 * XXX -- as in the vdev_offline() case, we really want a more 1950 * precise DTL check. 1951 */ 1952 for (c = 0; c < pvd->vdev_children; c++) { 1953 uint64_t dirty; 1954 1955 cvd = pvd->vdev_child[c]; 1956 if (cvd == vd) 1957 continue; 1958 if (vdev_is_dead(cvd)) 1959 continue; 1960 mutex_enter(&cvd->vdev_dtl_lock); 1961 dirty = cvd->vdev_dtl_map.sm_space | 1962 cvd->vdev_dtl_scrub.sm_space; 1963 mutex_exit(&cvd->vdev_dtl_lock); 1964 if (!dirty) 1965 break; 1966 } 1967 1968 /* 1969 * If we are a replacing or spare vdev, then we can always detach the 1970 * latter child, as that is how one cancels the operation. 1971 */ 1972 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1973 c == pvd->vdev_children) 1974 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1975 1976 /* 1977 * If we are detaching the original disk from a spare, then it implies 1978 * that the spare should become a real disk, and be removed from the 1979 * active spare list for the pool. 1980 */ 1981 if (pvd->vdev_ops == &vdev_spare_ops && 1982 vd->vdev_id == 0) 1983 unspare = B_TRUE; 1984 1985 /* 1986 * Erase the disk labels so the disk can be used for other things. 1987 * This must be done after all other error cases are handled, 1988 * but before we disembowel vd (so we can still do I/O to it). 1989 * But if we can't do it, don't treat the error as fatal -- 1990 * it may be that the unwritability of the disk is the reason 1991 * it's being detached! 1992 */ 1993 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1994 1995 /* 1996 * Remove vd from its parent and compact the parent's children. 1997 */ 1998 vdev_remove_child(pvd, vd); 1999 vdev_compact_children(pvd); 2000 2001 /* 2002 * Remember one of the remaining children so we can get tvd below. 2003 */ 2004 cvd = pvd->vdev_child[0]; 2005 2006 /* 2007 * If we need to remove the remaining child from the list of hot spares, 2008 * do it now, marking the vdev as no longer a spare in the process. We 2009 * must do this before vdev_remove_parent(), because that can change the 2010 * GUID if it creates a new toplevel GUID. 2011 */ 2012 if (unspare) { 2013 ASSERT(cvd->vdev_isspare); 2014 spa_spare_remove(cvd); 2015 unspare_guid = cvd->vdev_guid; 2016 } 2017 2018 /* 2019 * If the parent mirror/replacing vdev only has one child, 2020 * the parent is no longer needed. Remove it from the tree. 2021 */ 2022 if (pvd->vdev_children == 1) 2023 vdev_remove_parent(cvd); 2024 2025 /* 2026 * We don't set tvd until now because the parent we just removed 2027 * may have been the previous top-level vdev. 
2028 */ 2029 tvd = cvd->vdev_top; 2030 ASSERT(tvd->vdev_parent == rvd); 2031 2032 /* 2033 * Reevaluate the parent vdev state. 2034 */ 2035 vdev_propagate_state(cvd); 2036 2037 /* 2038 * If the device we just detached was smaller than the others, it may be 2039 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2040 * can't fail because the existing metaslabs are already in core, so 2041 * there's nothing to read from disk. 2042 */ 2043 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2044 2045 vdev_config_dirty(tvd); 2046 2047 /* 2048 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2049 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2050 * But first make sure we're not on any *other* txg's DTL list, to 2051 * prevent vd from being accessed after it's freed. 2052 */ 2053 for (t = 0; t < TXG_SIZE; t++) 2054 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2055 vd->vdev_detached = B_TRUE; 2056 vdev_dirty(tvd, VDD_DTL, vd, txg); 2057 2058 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 2059 2060 error = spa_vdev_exit(spa, vd, txg, 0); 2061 2062 /* 2063 * If this was the removal of the original device in a hot spare vdev, 2064 * then we want to go through and remove the device from the hot spare 2065 * list of every other pool. 2066 */ 2067 if (unspare) { 2068 spa = NULL; 2069 mutex_enter(&spa_namespace_lock); 2070 while ((spa = spa_next(spa)) != NULL) { 2071 if (spa->spa_state != POOL_STATE_ACTIVE) 2072 continue; 2073 2074 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2075 } 2076 mutex_exit(&spa_namespace_lock); 2077 } 2078 2079 return (error); 2080 } 2081 2082 /* 2083 * Remove a device from the pool. Currently, this supports removing only hot 2084 * spares. 2085 */ 2086 int 2087 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2088 { 2089 vdev_t *vd; 2090 nvlist_t **spares, *nv, **newspares; 2091 uint_t i, j, nspares; 2092 int ret = 0; 2093 2094 spa_config_enter(spa, RW_WRITER, FTAG); 2095 2096 vd = spa_lookup_by_guid(spa, guid); 2097 2098 nv = NULL; 2099 if (spa->spa_spares != NULL && 2100 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2101 &spares, &nspares) == 0) { 2102 for (i = 0; i < nspares; i++) { 2103 uint64_t theguid; 2104 2105 VERIFY(nvlist_lookup_uint64(spares[i], 2106 ZPOOL_CONFIG_GUID, &theguid) == 0); 2107 if (theguid == guid) { 2108 nv = spares[i]; 2109 break; 2110 } 2111 } 2112 } 2113 2114 /* 2115 * We only support removing a hot spare, and only if it's not currently 2116 * in use in this pool. 
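	 *
	 * Roughly, the checks below map out as: no matching spare entry and
	 * no matching vdev means ENOENT; a matching vdev with no spare
	 * entry means ENOTSUP (it's not a spare); and a spare that is
	 * currently attached to the pool means EBUSY, unless 'unspare' was
	 * explicitly requested.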
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_unspare &&
		    newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			newvd->vdev_unspare = 0;
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
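		 *
		 * At that point the relevant subtree looks roughly like:
		 *
		 *	spare
		 *	    replacing		(pvd, child 0 of the spare)
		 *		old device	(vd, detached first)
		 *		new device
		 *	    hot spare disk	(pguid, detached afterwards)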
2230 */ 2231 pvd = vd->vdev_parent; 2232 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2233 pvd->vdev_id == 0) { 2234 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2235 ASSERT(pvd->vdev_parent->vdev_children == 2); 2236 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2237 } 2238 spa_config_exit(spa, FTAG); 2239 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2240 return; 2241 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2242 return; 2243 spa_config_enter(spa, RW_READER, FTAG); 2244 } 2245 2246 spa_config_exit(spa, FTAG); 2247 } 2248 2249 /* 2250 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2251 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2252 */ 2253 int 2254 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2255 { 2256 vdev_t *rvd, *vd; 2257 uint64_t txg; 2258 2259 rvd = spa->spa_root_vdev; 2260 2261 txg = spa_vdev_enter(spa); 2262 2263 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2264 /* 2265 * Determine if this is a reference to a hot spare. In that 2266 * case, update the path as stored in the spare list. 2267 */ 2268 nvlist_t **spares; 2269 uint_t i, nspares; 2270 if (spa->spa_sparelist != NULL) { 2271 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2272 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2273 for (i = 0; i < nspares; i++) { 2274 uint64_t theguid; 2275 VERIFY(nvlist_lookup_uint64(spares[i], 2276 ZPOOL_CONFIG_GUID, &theguid) == 0); 2277 if (theguid == guid) 2278 break; 2279 } 2280 2281 if (i == nspares) 2282 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2283 2284 VERIFY(nvlist_add_string(spares[i], 2285 ZPOOL_CONFIG_PATH, newpath) == 0); 2286 spa_load_spares(spa); 2287 spa->spa_sync_spares = B_TRUE; 2288 return (spa_vdev_exit(spa, NULL, txg, 0)); 2289 } else { 2290 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2291 } 2292 } 2293 2294 if (!vd->vdev_ops->vdev_op_leaf) 2295 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2296 2297 spa_strfree(vd->vdev_path); 2298 vd->vdev_path = spa_strdup(newpath); 2299 2300 vdev_config_dirty(vd->vdev_top); 2301 2302 return (spa_vdev_exit(spa, NULL, txg, 0)); 2303 } 2304 2305 /* 2306 * ========================================================================== 2307 * SPA Scrubbing 2308 * ========================================================================== 2309 */ 2310 2311 static void 2312 spa_scrub_io_done(zio_t *zio) 2313 { 2314 spa_t *spa = zio->io_spa; 2315 2316 arc_data_buf_free(zio->io_data, zio->io_size); 2317 2318 mutex_enter(&spa->spa_scrub_lock); 2319 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2320 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2321 spa->spa_scrub_errors++; 2322 mutex_enter(&vd->vdev_stat_lock); 2323 vd->vdev_stat.vs_scrub_errors++; 2324 mutex_exit(&vd->vdev_stat_lock); 2325 } 2326 2327 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2328 cv_broadcast(&spa->spa_scrub_io_cv); 2329 2330 ASSERT(spa->spa_scrub_inflight >= 0); 2331 2332 mutex_exit(&spa->spa_scrub_lock); 2333 } 2334 2335 static void 2336 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2337 zbookmark_t *zb) 2338 { 2339 size_t size = BP_GET_LSIZE(bp); 2340 void *data; 2341 2342 mutex_enter(&spa->spa_scrub_lock); 2343 /* 2344 * Do not give too much work to vdev(s). 
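	 * spa_scrub_maxinflight bounds the number of outstanding scrub
	 * reads; spa_scrub_io_done() broadcasts spa_scrub_io_cv as the
	 * in-flight count drops back below that limit, which is what the
	 * loop below waits for.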
2345 */ 2346 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2347 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2348 } 2349 spa->spa_scrub_inflight++; 2350 mutex_exit(&spa->spa_scrub_lock); 2351 2352 data = arc_data_buf_alloc(size); 2353 2354 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2355 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2356 2357 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2358 2359 zio_nowait(zio_read(NULL, spa, bp, data, size, 2360 spa_scrub_io_done, NULL, priority, flags, zb)); 2361 } 2362 2363 /* ARGSUSED */ 2364 static int 2365 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2366 { 2367 blkptr_t *bp = &bc->bc_blkptr; 2368 vdev_t *vd = spa->spa_root_vdev; 2369 dva_t *dva = bp->blk_dva; 2370 int needs_resilver = B_FALSE; 2371 int d; 2372 2373 if (bc->bc_errno) { 2374 /* 2375 * We can't scrub this block, but we can continue to scrub 2376 * the rest of the pool. Note the error and move along. 2377 */ 2378 mutex_enter(&spa->spa_scrub_lock); 2379 spa->spa_scrub_errors++; 2380 mutex_exit(&spa->spa_scrub_lock); 2381 2382 mutex_enter(&vd->vdev_stat_lock); 2383 vd->vdev_stat.vs_scrub_errors++; 2384 mutex_exit(&vd->vdev_stat_lock); 2385 2386 return (ERESTART); 2387 } 2388 2389 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2390 2391 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2392 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2393 2394 ASSERT(vd != NULL); 2395 2396 /* 2397 * Keep track of how much data we've examined so that 2398 * zpool(1M) status can make useful progress reports. 2399 */ 2400 mutex_enter(&vd->vdev_stat_lock); 2401 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2402 mutex_exit(&vd->vdev_stat_lock); 2403 2404 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2405 if (DVA_GET_GANG(&dva[d])) { 2406 /* 2407 * Gang members may be spread across multiple 2408 * vdevs, so the best we can do is look at the 2409 * pool-wide DTL. 2410 * XXX -- it would be better to change our 2411 * allocation policy to ensure that this can't 2412 * happen. 2413 */ 2414 vd = spa->spa_root_vdev; 2415 } 2416 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2417 bp->blk_birth, 1)) 2418 needs_resilver = B_TRUE; 2419 } 2420 } 2421 2422 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2423 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2424 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2425 else if (needs_resilver) 2426 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2427 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2428 2429 return (0); 2430 } 2431 2432 static void 2433 spa_scrub_thread(spa_t *spa) 2434 { 2435 callb_cpr_t cprinfo; 2436 traverse_handle_t *th = spa->spa_scrub_th; 2437 vdev_t *rvd = spa->spa_root_vdev; 2438 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2439 int error = 0; 2440 boolean_t complete; 2441 2442 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2443 2444 /* 2445 * If we're restarting due to a snapshot create/delete, 2446 * wait for that to complete. 2447 */ 2448 txg_wait_synced(spa_get_dsl(spa), 0); 2449 2450 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2451 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2452 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2453 2454 spa_config_enter(spa, RW_WRITER, FTAG); 2455 vdev_reopen(rvd); /* purge all vdev caches */ 2456 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2457 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2458 spa_config_exit(spa, FTAG); 2459 2460 mutex_enter(&spa->spa_scrub_lock); 2461 spa->spa_scrub_errors = 0; 2462 spa->spa_scrub_active = 1; 2463 ASSERT(spa->spa_scrub_inflight == 0); 2464 2465 while (!spa->spa_scrub_stop) { 2466 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2467 while (spa->spa_scrub_suspended) { 2468 spa->spa_scrub_active = 0; 2469 cv_broadcast(&spa->spa_scrub_cv); 2470 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2471 spa->spa_scrub_active = 1; 2472 } 2473 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2474 2475 if (spa->spa_scrub_restart_txg != 0) 2476 break; 2477 2478 mutex_exit(&spa->spa_scrub_lock); 2479 error = traverse_more(th); 2480 mutex_enter(&spa->spa_scrub_lock); 2481 if (error != EAGAIN) 2482 break; 2483 } 2484 2485 while (spa->spa_scrub_inflight) 2486 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2487 2488 spa->spa_scrub_active = 0; 2489 cv_broadcast(&spa->spa_scrub_cv); 2490 2491 mutex_exit(&spa->spa_scrub_lock); 2492 2493 spa_config_enter(spa, RW_WRITER, FTAG); 2494 2495 mutex_enter(&spa->spa_scrub_lock); 2496 2497 /* 2498 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2499 * AND the spa config lock to synchronize with any config changes 2500 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2501 */ 2502 if (spa->spa_scrub_restart_txg != 0) 2503 error = ERESTART; 2504 2505 if (spa->spa_scrub_stop) 2506 error = EINTR; 2507 2508 /* 2509 * Even if there were uncorrectable errors, we consider the scrub 2510 * completed. The downside is that if there is a transient error during 2511 * a resilver, we won't resilver the data properly to the target. But 2512 * if the damage is permanent (more likely) we will resilver forever, 2513 * which isn't really acceptable. Since there is enough information for 2514 * the user to know what has failed and why, this seems like a more 2515 * tractable approach. 2516 */ 2517 complete = (error == 0); 2518 2519 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2520 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2521 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2522 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2523 2524 mutex_exit(&spa->spa_scrub_lock); 2525 2526 /* 2527 * If the scrub/resilver completed, update all DTLs to reflect this. 2528 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2529 */ 2530 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2531 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2532 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2533 spa_errlog_rotate(spa); 2534 2535 if (scrub_type == POOL_SCRUB_RESILVER && complete) 2536 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 2537 2538 spa_config_exit(spa, FTAG); 2539 2540 mutex_enter(&spa->spa_scrub_lock); 2541 2542 /* 2543 * We may have finished replacing a device. 2544 * Let the async thread assess this and handle the detach. 2545 */ 2546 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2547 2548 /* 2549 * If we were told to restart, our final act is to start a new scrub. 2550 */ 2551 if (error == ERESTART) 2552 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2553 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2554 2555 spa->spa_scrub_type = POOL_SCRUB_NONE; 2556 spa->spa_scrub_active = 0; 2557 spa->spa_scrub_thread = NULL; 2558 cv_broadcast(&spa->spa_scrub_cv); 2559 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2560 thread_exit(); 2561 } 2562 2563 void 2564 spa_scrub_suspend(spa_t *spa) 2565 { 2566 mutex_enter(&spa->spa_scrub_lock); 2567 spa->spa_scrub_suspended++; 2568 while (spa->spa_scrub_active) { 2569 cv_broadcast(&spa->spa_scrub_cv); 2570 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2571 } 2572 while (spa->spa_scrub_inflight) 2573 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2574 mutex_exit(&spa->spa_scrub_lock); 2575 } 2576 2577 void 2578 spa_scrub_resume(spa_t *spa) 2579 { 2580 mutex_enter(&spa->spa_scrub_lock); 2581 ASSERT(spa->spa_scrub_suspended != 0); 2582 if (--spa->spa_scrub_suspended == 0) 2583 cv_broadcast(&spa->spa_scrub_cv); 2584 mutex_exit(&spa->spa_scrub_lock); 2585 } 2586 2587 void 2588 spa_scrub_restart(spa_t *spa, uint64_t txg) 2589 { 2590 /* 2591 * Something happened (e.g. snapshot create/delete) that means 2592 * we must restart any in-progress scrubs. The itinerary will 2593 * fix this properly. 2594 */ 2595 mutex_enter(&spa->spa_scrub_lock); 2596 spa->spa_scrub_restart_txg = txg; 2597 mutex_exit(&spa->spa_scrub_lock); 2598 } 2599 2600 int 2601 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2602 { 2603 space_seg_t *ss; 2604 uint64_t mintxg, maxtxg; 2605 vdev_t *rvd = spa->spa_root_vdev; 2606 2607 if ((uint_t)type >= POOL_SCRUB_TYPES) 2608 return (ENOTSUP); 2609 2610 mutex_enter(&spa->spa_scrub_lock); 2611 2612 /* 2613 * If there's a scrub or resilver already in progress, stop it. 2614 */ 2615 while (spa->spa_scrub_thread != NULL) { 2616 /* 2617 * Don't stop a resilver unless forced. 2618 */ 2619 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2620 mutex_exit(&spa->spa_scrub_lock); 2621 return (EBUSY); 2622 } 2623 spa->spa_scrub_stop = 1; 2624 cv_broadcast(&spa->spa_scrub_cv); 2625 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2626 } 2627 2628 /* 2629 * Terminate the previous traverse. 2630 */ 2631 if (spa->spa_scrub_th != NULL) { 2632 traverse_fini(spa->spa_scrub_th); 2633 spa->spa_scrub_th = NULL; 2634 } 2635 2636 if (rvd == NULL) { 2637 ASSERT(spa->spa_scrub_stop == 0); 2638 ASSERT(spa->spa_scrub_type == type); 2639 ASSERT(spa->spa_scrub_restart_txg == 0); 2640 mutex_exit(&spa->spa_scrub_lock); 2641 return (0); 2642 } 2643 2644 mintxg = TXG_INITIAL - 1; 2645 maxtxg = spa_last_synced_txg(spa) + 1; 2646 2647 mutex_enter(&rvd->vdev_dtl_lock); 2648 2649 if (rvd->vdev_dtl_map.sm_space == 0) { 2650 /* 2651 * The pool-wide DTL is empty. 2652 * If this is a resilver, there's nothing to do except 2653 * check whether any in-progress replacements have completed. 2654 */ 2655 if (type == POOL_SCRUB_RESILVER) { 2656 type = POOL_SCRUB_NONE; 2657 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2658 } 2659 } else { 2660 /* 2661 * The pool-wide DTL is non-empty. 2662 * If this is a normal scrub, upgrade to a resilver instead. 2663 */ 2664 if (type == POOL_SCRUB_EVERYTHING) 2665 type = POOL_SCRUB_RESILVER; 2666 } 2667 2668 if (type == POOL_SCRUB_RESILVER) { 2669 /* 2670 * Determine the resilvering boundaries. 2671 * 2672 * Note: (mintxg, maxtxg) is an open interval, 2673 * i.e. mintxg and maxtxg themselves are not included. 2674 * 2675 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2676 * so we don't claim to resilver a txg that's still changing. 
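		 *
		 * For example, if the pool-wide DTL says txgs 100 through 200
		 * are missing and txg 250 has already synced, we get
		 * mintxg = 99 and maxtxg = 201, i.e. the open interval
		 * (99, 201) covering exactly txgs 100-200.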
2677 */ 2678 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2679 mintxg = ss->ss_start - 1; 2680 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2681 maxtxg = MIN(ss->ss_end, maxtxg); 2682 2683 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 2684 } 2685 2686 mutex_exit(&rvd->vdev_dtl_lock); 2687 2688 spa->spa_scrub_stop = 0; 2689 spa->spa_scrub_type = type; 2690 spa->spa_scrub_restart_txg = 0; 2691 2692 if (type != POOL_SCRUB_NONE) { 2693 spa->spa_scrub_mintxg = mintxg; 2694 spa->spa_scrub_maxtxg = maxtxg; 2695 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2696 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2697 ZIO_FLAG_CANFAIL); 2698 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2699 spa->spa_scrub_thread = thread_create(NULL, 0, 2700 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2701 } 2702 2703 mutex_exit(&spa->spa_scrub_lock); 2704 2705 return (0); 2706 } 2707 2708 /* 2709 * ========================================================================== 2710 * SPA async task processing 2711 * ========================================================================== 2712 */ 2713 2714 static void 2715 spa_async_remove(spa_t *spa, vdev_t *vd) 2716 { 2717 vdev_t *tvd; 2718 int c; 2719 2720 for (c = 0; c < vd->vdev_children; c++) { 2721 tvd = vd->vdev_child[c]; 2722 if (tvd->vdev_remove_wanted) { 2723 tvd->vdev_remove_wanted = 0; 2724 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 2725 VDEV_AUX_NONE); 2726 vdev_clear(spa, tvd); 2727 vdev_config_dirty(tvd->vdev_top); 2728 } 2729 spa_async_remove(spa, tvd); 2730 } 2731 } 2732 2733 static void 2734 spa_async_thread(spa_t *spa) 2735 { 2736 int tasks; 2737 uint64_t txg; 2738 2739 ASSERT(spa->spa_sync_on); 2740 2741 mutex_enter(&spa->spa_async_lock); 2742 tasks = spa->spa_async_tasks; 2743 spa->spa_async_tasks = 0; 2744 mutex_exit(&spa->spa_async_lock); 2745 2746 /* 2747 * See if the config needs to be updated. 2748 */ 2749 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2750 mutex_enter(&spa_namespace_lock); 2751 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2752 mutex_exit(&spa_namespace_lock); 2753 } 2754 2755 /* 2756 * See if any devices need to be marked REMOVED. 2757 */ 2758 if (tasks & SPA_ASYNC_REMOVE) { 2759 txg = spa_vdev_enter(spa); 2760 spa_async_remove(spa, spa->spa_root_vdev); 2761 (void) spa_vdev_exit(spa, NULL, txg, 0); 2762 } 2763 2764 /* 2765 * If any devices are done replacing, detach them. 2766 */ 2767 if (tasks & SPA_ASYNC_RESILVER_DONE) 2768 spa_vdev_resilver_done(spa); 2769 2770 /* 2771 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 2772 * scrub which can become a resilver), we need to hold 2773 * spa_namespace_lock() because the sysevent we post via 2774 * spa_event_notify() needs to get the name of the pool. 2775 */ 2776 if (tasks & SPA_ASYNC_SCRUB) { 2777 mutex_enter(&spa_namespace_lock); 2778 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2779 mutex_exit(&spa_namespace_lock); 2780 } 2781 2782 /* 2783 * Kick off a resilver. 2784 */ 2785 if (tasks & SPA_ASYNC_RESILVER) { 2786 mutex_enter(&spa_namespace_lock); 2787 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2788 mutex_exit(&spa_namespace_lock); 2789 } 2790 2791 /* 2792 * Let the world know that we're done. 
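	 * Clearing spa_async_thread under spa_async_lock and broadcasting
	 * spa_async_cv is what spa_async_suspend() waits on, and it also
	 * lets spa_async_dispatch() know it may create a new worker for
	 * any tasks requested after this point.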
2793 */ 2794 mutex_enter(&spa->spa_async_lock); 2795 spa->spa_async_thread = NULL; 2796 cv_broadcast(&spa->spa_async_cv); 2797 mutex_exit(&spa->spa_async_lock); 2798 thread_exit(); 2799 } 2800 2801 void 2802 spa_async_suspend(spa_t *spa) 2803 { 2804 mutex_enter(&spa->spa_async_lock); 2805 spa->spa_async_suspended++; 2806 while (spa->spa_async_thread != NULL) 2807 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2808 mutex_exit(&spa->spa_async_lock); 2809 } 2810 2811 void 2812 spa_async_resume(spa_t *spa) 2813 { 2814 mutex_enter(&spa->spa_async_lock); 2815 ASSERT(spa->spa_async_suspended != 0); 2816 spa->spa_async_suspended--; 2817 mutex_exit(&spa->spa_async_lock); 2818 } 2819 2820 static void 2821 spa_async_dispatch(spa_t *spa) 2822 { 2823 mutex_enter(&spa->spa_async_lock); 2824 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2825 spa->spa_async_thread == NULL && 2826 rootdir != NULL && !vn_is_readonly(rootdir)) 2827 spa->spa_async_thread = thread_create(NULL, 0, 2828 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2829 mutex_exit(&spa->spa_async_lock); 2830 } 2831 2832 void 2833 spa_async_request(spa_t *spa, int task) 2834 { 2835 mutex_enter(&spa->spa_async_lock); 2836 spa->spa_async_tasks |= task; 2837 mutex_exit(&spa->spa_async_lock); 2838 } 2839 2840 /* 2841 * ========================================================================== 2842 * SPA syncing routines 2843 * ========================================================================== 2844 */ 2845 2846 static void 2847 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2848 { 2849 bplist_t *bpl = &spa->spa_sync_bplist; 2850 dmu_tx_t *tx; 2851 blkptr_t blk; 2852 uint64_t itor = 0; 2853 zio_t *zio; 2854 int error; 2855 uint8_t c = 1; 2856 2857 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2858 2859 while (bplist_iterate(bpl, &itor, &blk) == 0) 2860 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2861 2862 error = zio_wait(zio); 2863 ASSERT3U(error, ==, 0); 2864 2865 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2866 bplist_vacate(bpl, tx); 2867 2868 /* 2869 * Pre-dirty the first block so we sync to convergence faster. 2870 * (Usually only the first block is needed.) 2871 */ 2872 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2873 dmu_tx_commit(tx); 2874 } 2875 2876 static void 2877 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2878 { 2879 char *packed = NULL; 2880 size_t nvsize = 0; 2881 dmu_buf_t *db; 2882 2883 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2884 2885 packed = kmem_alloc(nvsize, KM_SLEEP); 2886 2887 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2888 KM_SLEEP) == 0); 2889 2890 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2891 2892 kmem_free(packed, nvsize); 2893 2894 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2895 dmu_buf_will_dirty(db, tx); 2896 *(uint64_t *)db->db_data = nvsize; 2897 dmu_buf_rele(db, FTAG); 2898 } 2899 2900 static void 2901 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2902 { 2903 nvlist_t *nvroot; 2904 nvlist_t **spares; 2905 int i; 2906 2907 if (!spa->spa_sync_spares) 2908 return; 2909 2910 /* 2911 * Update the MOS nvlist describing the list of available spares. 2912 * spa_validate_spares() will have already made sure this nvlist is 2913 * valid and the vdevs are labeled appropriately. 
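	 * The list lives in a DMU_OT_PACKED_NVLIST object whose object
	 * number is recorded in the pool directory under DMU_POOL_SPARES;
	 * spa_sync_nvlist() below packs the nvlist and stores its size in
	 * the object's bonus buffer.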
2914 */ 2915 if (spa->spa_spares_object == 0) { 2916 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2917 DMU_OT_PACKED_NVLIST, 1 << 14, 2918 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2919 VERIFY(zap_update(spa->spa_meta_objset, 2920 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2921 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2922 } 2923 2924 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2925 if (spa->spa_nspares == 0) { 2926 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2927 NULL, 0) == 0); 2928 } else { 2929 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2930 KM_SLEEP); 2931 for (i = 0; i < spa->spa_nspares; i++) 2932 spares[i] = vdev_config_generate(spa, 2933 spa->spa_spares[i], B_FALSE, B_TRUE); 2934 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2935 spares, spa->spa_nspares) == 0); 2936 for (i = 0; i < spa->spa_nspares; i++) 2937 nvlist_free(spares[i]); 2938 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2939 } 2940 2941 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2942 nvlist_free(nvroot); 2943 2944 spa->spa_sync_spares = B_FALSE; 2945 } 2946 2947 static void 2948 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2949 { 2950 nvlist_t *config; 2951 2952 if (list_is_empty(&spa->spa_dirty_list)) 2953 return; 2954 2955 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2956 2957 if (spa->spa_config_syncing) 2958 nvlist_free(spa->spa_config_syncing); 2959 spa->spa_config_syncing = config; 2960 2961 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2962 } 2963 2964 static void 2965 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 2966 { 2967 spa_t *spa = arg1; 2968 nvlist_t *nvp = arg2; 2969 nvpair_t *nvpair; 2970 objset_t *mos = spa->spa_meta_objset; 2971 uint64_t zapobj; 2972 uint64_t intval; 2973 2974 mutex_enter(&spa->spa_props_lock); 2975 if (spa->spa_pool_props_object == 0) { 2976 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2977 VERIFY(zapobj > 0); 2978 2979 spa->spa_pool_props_object = zapobj; 2980 2981 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2982 DMU_POOL_PROPS, 8, 1, 2983 &spa->spa_pool_props_object, tx) == 0); 2984 } 2985 mutex_exit(&spa->spa_props_lock); 2986 2987 nvpair = NULL; 2988 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2989 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2990 case ZPOOL_PROP_DELEGATION: 2991 VERIFY(nvlist_lookup_uint64(nvp, 2992 nvpair_name(nvpair), &intval) == 0); 2993 VERIFY(zap_update(mos, 2994 spa->spa_pool_props_object, 2995 nvpair_name(nvpair), 8, 1, 2996 &intval, tx) == 0); 2997 spa->spa_delegation = intval; 2998 break; 2999 case ZPOOL_PROP_BOOTFS: 3000 VERIFY(nvlist_lookup_uint64(nvp, 3001 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 3002 intval = spa->spa_bootfs; 3003 VERIFY(zap_update(mos, 3004 spa->spa_pool_props_object, 3005 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1, 3006 &intval, tx) == 0); 3007 break; 3008 3009 case ZPOOL_PROP_AUTOREPLACE: 3010 VERIFY(nvlist_lookup_uint64(nvp, 3011 nvpair_name(nvpair), &intval) == 0); 3012 VERIFY(zap_update(mos, 3013 spa->spa_pool_props_object, 3014 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1, 3015 &intval, tx) == 0); 3016 break; 3017 } 3018 spa_history_internal_log(LOG_POOL_PROPSET, 3019 spa, tx, cr, "%s %lld %s", 3020 nvpair_name(nvpair), intval, 3021 spa->spa_name); 3022 } 3023 } 3024 3025 /* 3026 * Sync the specified transaction group. 
New blocks may be dirtied as 3027 * part of the process, so we iterate until it converges. 3028 */ 3029 void 3030 spa_sync(spa_t *spa, uint64_t txg) 3031 { 3032 dsl_pool_t *dp = spa->spa_dsl_pool; 3033 objset_t *mos = spa->spa_meta_objset; 3034 bplist_t *bpl = &spa->spa_sync_bplist; 3035 vdev_t *rvd = spa->spa_root_vdev; 3036 vdev_t *vd; 3037 dmu_tx_t *tx; 3038 int dirty_vdevs; 3039 3040 /* 3041 * Lock out configuration changes. 3042 */ 3043 spa_config_enter(spa, RW_READER, FTAG); 3044 3045 spa->spa_syncing_txg = txg; 3046 spa->spa_sync_pass = 0; 3047 3048 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3049 3050 tx = dmu_tx_create_assigned(dp, txg); 3051 3052 /* 3053 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 3054 * set spa_deflate if we have no raid-z vdevs. 3055 */ 3056 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 3057 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 3058 int i; 3059 3060 for (i = 0; i < rvd->vdev_children; i++) { 3061 vd = rvd->vdev_child[i]; 3062 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 3063 break; 3064 } 3065 if (i == rvd->vdev_children) { 3066 spa->spa_deflate = TRUE; 3067 VERIFY(0 == zap_add(spa->spa_meta_objset, 3068 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3069 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 3070 } 3071 } 3072 3073 /* 3074 * If anything has changed in this txg, push the deferred frees 3075 * from the previous txg. If not, leave them alone so that we 3076 * don't generate work on an otherwise idle system. 3077 */ 3078 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 3079 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 3080 !txg_list_empty(&dp->dp_sync_tasks, txg)) 3081 spa_sync_deferred_frees(spa, txg); 3082 3083 /* 3084 * Iterate to convergence. 3085 */ 3086 do { 3087 spa->spa_sync_pass++; 3088 3089 spa_sync_config_object(spa, tx); 3090 spa_sync_spares(spa, tx); 3091 spa_errlog_sync(spa, txg); 3092 dsl_pool_sync(dp, txg); 3093 3094 dirty_vdevs = 0; 3095 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 3096 vdev_sync(vd, txg); 3097 dirty_vdevs++; 3098 } 3099 3100 bplist_sync(bpl, tx); 3101 } while (dirty_vdevs); 3102 3103 bplist_close(bpl); 3104 3105 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 3106 3107 /* 3108 * Rewrite the vdev configuration (which includes the uberblock) 3109 * to commit the transaction group. 3110 * 3111 * If there are any dirty vdevs, sync the uberblock to all vdevs. 3112 * Otherwise, pick a random top-level vdev that's known to be 3113 * visible in the config cache (see spa_vdev_add() for details). 3114 * If the write fails, try the next vdev until we're tried them all. 3115 */ 3116 if (!list_is_empty(&spa->spa_dirty_list)) { 3117 VERIFY(vdev_config_sync(rvd, txg) == 0); 3118 } else { 3119 int children = rvd->vdev_children; 3120 int c0 = spa_get_random(children); 3121 int c; 3122 3123 for (c = 0; c < children; c++) { 3124 vd = rvd->vdev_child[(c0 + c) % children]; 3125 if (vd->vdev_ms_array == 0) 3126 continue; 3127 if (vdev_config_sync(vd, txg) == 0) 3128 break; 3129 } 3130 if (c == children) 3131 VERIFY(vdev_config_sync(rvd, txg) == 0); 3132 } 3133 3134 dmu_tx_commit(tx); 3135 3136 /* 3137 * Clear the dirty config list. 3138 */ 3139 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3140 vdev_config_clean(vd); 3141 3142 /* 3143 * Now that the new config has synced transactionally, 3144 * let it become visible to the config cache. 
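	 * spa_config_syncing was staged by spa_sync_config_object() earlier
	 * in this sync; only after the uberblock referencing it is safely
	 * on disk do we make it the cached config and note the txg.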
3145 */ 3146 if (spa->spa_config_syncing != NULL) { 3147 spa_config_set(spa, spa->spa_config_syncing); 3148 spa->spa_config_txg = txg; 3149 spa->spa_config_syncing = NULL; 3150 } 3151 3152 /* 3153 * Make a stable copy of the fully synced uberblock. 3154 * We use this as the root for pool traversals. 3155 */ 3156 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3157 3158 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3159 3160 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3161 spa->spa_traverse_wanted = 0; 3162 spa->spa_ubsync = spa->spa_uberblock; 3163 rw_exit(&spa->spa_traverse_lock); 3164 3165 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3166 3167 /* 3168 * Clean up the ZIL records for the synced txg. 3169 */ 3170 dsl_pool_zil_clean(dp); 3171 3172 /* 3173 * Update usable space statistics. 3174 */ 3175 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3176 vdev_sync_done(vd, txg); 3177 3178 /* 3179 * It had better be the case that we didn't dirty anything 3180 * since vdev_config_sync(). 3181 */ 3182 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3183 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3184 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3185 ASSERT(bpl->bpl_queue == NULL); 3186 3187 spa_config_exit(spa, FTAG); 3188 3189 /* 3190 * If any async tasks have been requested, kick them off. 3191 */ 3192 spa_async_dispatch(spa); 3193 } 3194 3195 /* 3196 * Sync all pools. We don't want to hold the namespace lock across these 3197 * operations, so we take a reference on the spa_t and drop the lock during the 3198 * sync. 3199 */ 3200 void 3201 spa_sync_allpools(void) 3202 { 3203 spa_t *spa = NULL; 3204 mutex_enter(&spa_namespace_lock); 3205 while ((spa = spa_next(spa)) != NULL) { 3206 if (spa_state(spa) != POOL_STATE_ACTIVE) 3207 continue; 3208 spa_open_ref(spa, FTAG); 3209 mutex_exit(&spa_namespace_lock); 3210 txg_wait_synced(spa_get_dsl(spa), 0); 3211 mutex_enter(&spa_namespace_lock); 3212 spa_close(spa, FTAG); 3213 } 3214 mutex_exit(&spa_namespace_lock); 3215 } 3216 3217 /* 3218 * ========================================================================== 3219 * Miscellaneous routines 3220 * ========================================================================== 3221 */ 3222 3223 /* 3224 * Remove all pools in the system. 3225 */ 3226 void 3227 spa_evict_all(void) 3228 { 3229 spa_t *spa; 3230 3231 /* 3232 * Remove all cached state. All pools should be closed now, 3233 * so every spa in the AVL tree should be unreferenced. 3234 */ 3235 mutex_enter(&spa_namespace_lock); 3236 while ((spa = spa_next(NULL)) != NULL) { 3237 /* 3238 * Stop async tasks. The async thread may need to detach 3239 * a device that's been replaced, which requires grabbing 3240 * spa_namespace_lock, so we must drop it here. 
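		 * spa_open_ref() keeps the spa_t alive while the namespace
		 * lock is dropped; spa_async_suspend() then waits for any
		 * running async thread to exit before we reacquire the lock
		 * and tear the pool down.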
3241 */ 3242 spa_open_ref(spa, FTAG); 3243 mutex_exit(&spa_namespace_lock); 3244 spa_async_suspend(spa); 3245 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3246 mutex_enter(&spa_namespace_lock); 3247 spa_close(spa, FTAG); 3248 3249 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3250 spa_unload(spa); 3251 spa_deactivate(spa); 3252 } 3253 spa_remove(spa); 3254 } 3255 mutex_exit(&spa_namespace_lock); 3256 } 3257 3258 vdev_t * 3259 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3260 { 3261 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3262 } 3263 3264 void 3265 spa_upgrade(spa_t *spa) 3266 { 3267 spa_config_enter(spa, RW_WRITER, FTAG); 3268 3269 /* 3270 * This should only be called for a non-faulted pool, and since a 3271 * future version would result in an unopenable pool, this shouldn't be 3272 * possible. 3273 */ 3274 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 3275 3276 spa->spa_uberblock.ub_version = SPA_VERSION; 3277 vdev_config_dirty(spa->spa_root_vdev); 3278 3279 spa_config_exit(spa, FTAG); 3280 3281 txg_wait_synced(spa_get_dsl(spa), 0); 3282 } 3283 3284 boolean_t 3285 spa_has_spare(spa_t *spa, uint64_t guid) 3286 { 3287 int i; 3288 uint64_t spareguid; 3289 3290 for (i = 0; i < spa->spa_nspares; i++) 3291 if (spa->spa_spares[i]->vdev_guid == guid) 3292 return (B_TRUE); 3293 3294 for (i = 0; i < spa->spa_pending_nspares; i++) { 3295 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3296 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3297 spareguid == guid) 3298 return (B_TRUE); 3299 } 3300 3301 return (B_FALSE); 3302 } 3303 3304 int 3305 spa_set_props(spa_t *spa, nvlist_t *nvp) 3306 { 3307 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3308 spa, nvp, 3)); 3309 } 3310 3311 int 3312 spa_get_props(spa_t *spa, nvlist_t **nvp) 3313 { 3314 zap_cursor_t zc; 3315 zap_attribute_t za; 3316 objset_t *mos = spa->spa_meta_objset; 3317 zfs_source_t src; 3318 zpool_prop_t prop; 3319 nvlist_t *propval; 3320 uint64_t value; 3321 int err; 3322 3323 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3324 3325 mutex_enter(&spa->spa_props_lock); 3326 /* If no props object, then just return empty nvlist */ 3327 if (spa->spa_pool_props_object == 0) { 3328 mutex_exit(&spa->spa_props_lock); 3329 return (0); 3330 } 3331 3332 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3333 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3334 zap_cursor_advance(&zc)) { 3335 3336 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3337 continue; 3338 3339 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3340 switch (za.za_integer_length) { 3341 case 8: 3342 if (zpool_prop_default_numeric(prop) == 3343 za.za_first_integer) 3344 src = ZFS_SRC_DEFAULT; 3345 else 3346 src = ZFS_SRC_LOCAL; 3347 value = za.za_first_integer; 3348 3349 if (prop == ZPOOL_PROP_BOOTFS) { 3350 dsl_pool_t *dp; 3351 dsl_dataset_t *ds = NULL; 3352 char strval[MAXPATHLEN]; 3353 3354 dp = spa_get_dsl(spa); 3355 rw_enter(&dp->dp_config_rwlock, RW_READER); 3356 if ((err = dsl_dataset_open_obj(dp, 3357 za.za_first_integer, NULL, DS_MODE_NONE, 3358 FTAG, &ds)) != 0) { 3359 rw_exit(&dp->dp_config_rwlock); 3360 break; 3361 } 3362 dsl_dataset_name(ds, strval); 3363 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3364 rw_exit(&dp->dp_config_rwlock); 3365 3366 VERIFY(nvlist_add_uint64(propval, 3367 ZFS_PROP_SOURCE, src) == 0); 3368 VERIFY(nvlist_add_string(propval, 3369 ZFS_PROP_VALUE, strval) == 0); 3370 } else { 3371 VERIFY(nvlist_add_uint64(propval, 3372 ZFS_PROP_SOURCE, src) == 0); 3373 
VERIFY(nvlist_add_uint64(propval, 3374 ZFS_PROP_VALUE, value) == 0); 3375 } 3376 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3377 propval) == 0); 3378 break; 3379 } 3380 nvlist_free(propval); 3381 } 3382 zap_cursor_fini(&zc); 3383 mutex_exit(&spa->spa_props_lock); 3384 if (err && err != ENOENT) { 3385 nvlist_free(*nvp); 3386 return (err); 3387 } 3388 3389 return (0); 3390 } 3391 3392 /* 3393 * If the bootfs property value is dsobj, clear it. 3394 */ 3395 void 3396 spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3397 { 3398 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3399 VERIFY(zap_remove(spa->spa_meta_objset, 3400 spa->spa_pool_props_object, 3401 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 3402 spa->spa_bootfs = 0; 3403 } 3404 } 3405 3406 /* 3407 * Post a sysevent corresponding to the given event. The 'name' must be one of 3408 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 3409 * filled in from the spa and (optionally) the vdev. This doesn't do anything 3410 * in the userland libzpool, as we don't want consumers to misinterpret ztest 3411 * or zdb as real changes. 3412 */ 3413 void 3414 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 3415 { 3416 #ifdef _KERNEL 3417 sysevent_t *ev; 3418 sysevent_attr_list_t *attr = NULL; 3419 sysevent_value_t value; 3420 sysevent_id_t eid; 3421 3422 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 3423 SE_SLEEP); 3424 3425 value.value_type = SE_DATA_TYPE_STRING; 3426 value.value.sv_string = spa_name(spa); 3427 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 3428 goto done; 3429 3430 value.value_type = SE_DATA_TYPE_UINT64; 3431 value.value.sv_uint64 = spa_guid(spa); 3432 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 3433 goto done; 3434 3435 if (vd) { 3436 value.value_type = SE_DATA_TYPE_UINT64; 3437 value.value.sv_uint64 = vd->vdev_guid; 3438 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 3439 SE_SLEEP) != 0) 3440 goto done; 3441 3442 if (vd->vdev_path) { 3443 value.value_type = SE_DATA_TYPE_STRING; 3444 value.value.sv_string = vd->vdev_path; 3445 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 3446 &value, SE_SLEEP) != 0) 3447 goto done; 3448 } 3449 } 3450 3451 (void) log_sysevent(ev, SE_SLEEP, &eid); 3452 3453 done: 3454 if (attr) 3455 sysevent_free_attr(attr); 3456 sysevent_free(ev); 3457 #endif 3458 } 3459
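
/*
 * For example, the spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE) call in
 * spa_vdev_detach() above posts an EC_ZFS class sysevent whose attribute
 * list carries ZFS_EV_POOL_NAME and ZFS_EV_POOL_GUID, plus ZFS_EV_VDEV_GUID
 * and (when the vdev has a path) ZFS_EV_VDEV_PATH, as assembled here.
 */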