1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/dmu_objset.h> 52 #include <sys/unique.h> 53 #include <sys/dsl_pool.h> 54 #include <sys/dsl_dataset.h> 55 #include <sys/dsl_dir.h> 56 #include <sys/dsl_prop.h> 57 #include <sys/dsl_synctask.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/callb.h> 60 #include <sys/systeminfo.h> 61 #include <sys/sunddi.h> 62 63 int zio_taskq_threads = 8; 64 65 /* 66 * ========================================================================== 67 * SPA state manipulation (open/create/destroy/import/export) 68 * ========================================================================== 69 */ 70 71 static int 72 spa_error_entry_compare(const void *a, const void *b) 73 { 74 spa_error_entry_t *sa = (spa_error_entry_t *)a; 75 spa_error_entry_t *sb = (spa_error_entry_t *)b; 76 int ret; 77 78 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 79 sizeof (zbookmark_t)); 80 81 if (ret < 0) 82 return (-1); 83 else if (ret > 0) 84 return (1); 85 else 86 return (0); 87 } 88 89 /* 90 * Utility function which retrieves copies of the current logs and 91 * re-initializes them in the process. 92 */ 93 void 94 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 95 { 96 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 97 98 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 99 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 100 101 avl_create(&spa->spa_errlist_scrub, 102 spa_error_entry_compare, sizeof (spa_error_entry_t), 103 offsetof(spa_error_entry_t, se_avl)); 104 avl_create(&spa->spa_errlist_last, 105 spa_error_entry_compare, sizeof (spa_error_entry_t), 106 offsetof(spa_error_entry_t, se_avl)); 107 } 108 109 /* 110 * Activate an uninitialized pool. 
111 */ 112 static void 113 spa_activate(spa_t *spa) 114 { 115 int t; 116 117 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 118 119 spa->spa_state = POOL_STATE_ACTIVE; 120 121 spa->spa_normal_class = metaslab_class_create(); 122 spa->spa_log_class = metaslab_class_create(); 123 124 for (t = 0; t < ZIO_TYPES; t++) { 125 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 126 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 127 TASKQ_PREPOPULATE); 128 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 129 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 130 TASKQ_PREPOPULATE); 131 } 132 133 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 134 135 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 136 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 137 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 140 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 141 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 142 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 143 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 144 145 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 146 offsetof(vdev_t, vdev_dirty_node)); 147 148 txg_list_create(&spa->spa_vdev_txg_list, 149 offsetof(struct vdev, vdev_txg_node)); 150 151 avl_create(&spa->spa_errlist_scrub, 152 spa_error_entry_compare, sizeof (spa_error_entry_t), 153 offsetof(spa_error_entry_t, se_avl)); 154 avl_create(&spa->spa_errlist_last, 155 spa_error_entry_compare, sizeof (spa_error_entry_t), 156 offsetof(spa_error_entry_t, se_avl)); 157 } 158 159 /* 160 * Opposite of spa_activate(). 161 */ 162 static void 163 spa_deactivate(spa_t *spa) 164 { 165 int t; 166 167 ASSERT(spa->spa_sync_on == B_FALSE); 168 ASSERT(spa->spa_dsl_pool == NULL); 169 ASSERT(spa->spa_root_vdev == NULL); 170 171 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 172 173 txg_list_destroy(&spa->spa_vdev_txg_list); 174 175 list_destroy(&spa->spa_dirty_list); 176 177 rw_destroy(&spa->spa_traverse_lock); 178 179 for (t = 0; t < ZIO_TYPES; t++) { 180 taskq_destroy(spa->spa_zio_issue_taskq[t]); 181 taskq_destroy(spa->spa_zio_intr_taskq[t]); 182 spa->spa_zio_issue_taskq[t] = NULL; 183 spa->spa_zio_intr_taskq[t] = NULL; 184 } 185 186 metaslab_class_destroy(spa->spa_normal_class); 187 spa->spa_normal_class = NULL; 188 189 metaslab_class_destroy(spa->spa_log_class); 190 spa->spa_log_class = NULL; 191 192 /* 193 * If this was part of an import or the open otherwise failed, we may 194 * still have errors left in the queues. Empty them just in case. 195 */ 196 spa_errlog_drain(spa); 197 198 avl_destroy(&spa->spa_errlist_scrub); 199 avl_destroy(&spa->spa_errlist_last); 200 201 spa->spa_state = POOL_STATE_UNINITIALIZED; 202 } 203 204 /* 205 * Verify a pool configuration, and construct the vdev tree appropriately. This 206 * will create all the necessary vdevs in the appropriate layout, with each vdev 207 * in the CLOSED state. This will prep the pool before open/creation/import. 208 * All vdev validation is done by the vdev_alloc() routine. 
209 */ 210 static int 211 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 212 uint_t id, int atype) 213 { 214 nvlist_t **child; 215 uint_t c, children; 216 int error; 217 218 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 219 return (error); 220 221 if ((*vdp)->vdev_ops->vdev_op_leaf) 222 return (0); 223 224 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 225 &child, &children) != 0) { 226 vdev_free(*vdp); 227 *vdp = NULL; 228 return (EINVAL); 229 } 230 231 for (c = 0; c < children; c++) { 232 vdev_t *vd; 233 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 234 atype)) != 0) { 235 vdev_free(*vdp); 236 *vdp = NULL; 237 return (error); 238 } 239 } 240 241 ASSERT(*vdp != NULL); 242 243 return (0); 244 } 245 246 /* 247 * Opposite of spa_load(). 248 */ 249 static void 250 spa_unload(spa_t *spa) 251 { 252 int i; 253 254 /* 255 * Stop async tasks. 256 */ 257 spa_async_suspend(spa); 258 259 /* 260 * Stop syncing. 261 */ 262 if (spa->spa_sync_on) { 263 txg_sync_stop(spa->spa_dsl_pool); 264 spa->spa_sync_on = B_FALSE; 265 } 266 267 /* 268 * Wait for any outstanding prefetch I/O to complete. 269 */ 270 spa_config_enter(spa, RW_WRITER, FTAG); 271 spa_config_exit(spa, FTAG); 272 273 /* 274 * Close the dsl pool. 275 */ 276 if (spa->spa_dsl_pool) { 277 dsl_pool_close(spa->spa_dsl_pool); 278 spa->spa_dsl_pool = NULL; 279 } 280 281 /* 282 * Close all vdevs. 283 */ 284 if (spa->spa_root_vdev) 285 vdev_free(spa->spa_root_vdev); 286 ASSERT(spa->spa_root_vdev == NULL); 287 288 for (i = 0; i < spa->spa_nspares; i++) 289 vdev_free(spa->spa_spares[i]); 290 if (spa->spa_spares) { 291 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 292 spa->spa_spares = NULL; 293 } 294 if (spa->spa_sparelist) { 295 nvlist_free(spa->spa_sparelist); 296 spa->spa_sparelist = NULL; 297 } 298 299 spa->spa_async_suspended = 0; 300 } 301 302 /* 303 * Load (or re-load) the current list of vdevs describing the active spares for 304 * this pool. When this is called, we have some form of basic information in 305 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 306 * re-generate a more complete list including status information. 307 */ 308 static void 309 spa_load_spares(spa_t *spa) 310 { 311 nvlist_t **spares; 312 uint_t nspares; 313 int i; 314 vdev_t *vd, *tvd; 315 316 /* 317 * First, close and free any existing spare vdevs. 318 */ 319 for (i = 0; i < spa->spa_nspares; i++) { 320 vd = spa->spa_spares[i]; 321 322 /* Undo the call to spa_activate() below */ 323 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 324 tvd->vdev_isspare) 325 spa_spare_remove(tvd); 326 vdev_close(vd); 327 vdev_free(vd); 328 } 329 330 if (spa->spa_spares) 331 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 332 333 if (spa->spa_sparelist == NULL) 334 nspares = 0; 335 else 336 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 337 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 338 339 spa->spa_nspares = (int)nspares; 340 spa->spa_spares = NULL; 341 342 if (nspares == 0) 343 return; 344 345 /* 346 * Construct the array of vdevs, opening them to get status in the 347 * process. For each spare, there is potentially two different vdev_t 348 * structures associated with it: one in the list of spares (used only 349 * for basic validation purposes) and one in the active vdev 350 * configuration (if it's spared in). During this phase we open and 351 * validate each vdev on the spare list. 
If the vdev also exists in the 352 * active configuration, then we also mark this vdev as an active spare. 353 */ 354 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 355 for (i = 0; i < spa->spa_nspares; i++) { 356 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 357 VDEV_ALLOC_SPARE) == 0); 358 ASSERT(vd != NULL); 359 360 spa->spa_spares[i] = vd; 361 362 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 363 if (!tvd->vdev_isspare) 364 spa_spare_add(tvd); 365 366 /* 367 * We only mark the spare active if we were successfully 368 * able to load the vdev. Otherwise, importing a pool 369 * with a bad active spare would result in strange 370 * behavior, because multiple pool would think the spare 371 * is actively in use. 372 * 373 * There is a vulnerability here to an equally bizarre 374 * circumstance, where a dead active spare is later 375 * brought back to life (onlined or otherwise). Given 376 * the rarity of this scenario, and the extra complexity 377 * it adds, we ignore the possibility. 378 */ 379 if (!vdev_is_dead(tvd)) 380 spa_spare_activate(tvd); 381 } 382 383 if (vdev_open(vd) != 0) 384 continue; 385 386 vd->vdev_top = vd; 387 (void) vdev_validate_spare(vd); 388 } 389 390 /* 391 * Recompute the stashed list of spares, with status information 392 * this time. 393 */ 394 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 395 DATA_TYPE_NVLIST_ARRAY) == 0); 396 397 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 398 for (i = 0; i < spa->spa_nspares; i++) 399 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 400 B_TRUE, B_TRUE); 401 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 402 spares, spa->spa_nspares) == 0); 403 for (i = 0; i < spa->spa_nspares; i++) 404 nvlist_free(spares[i]); 405 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 406 } 407 408 static int 409 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 410 { 411 dmu_buf_t *db; 412 char *packed = NULL; 413 size_t nvsize = 0; 414 int error; 415 *value = NULL; 416 417 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 418 nvsize = *(uint64_t *)db->db_data; 419 dmu_buf_rele(db, FTAG); 420 421 packed = kmem_alloc(nvsize, KM_SLEEP); 422 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 423 if (error == 0) 424 error = nvlist_unpack(packed, nvsize, value, 0); 425 kmem_free(packed, nvsize); 426 427 return (error); 428 } 429 430 /* 431 * Checks to see if the given vdev could not be opened, in which case we post a 432 * sysevent to notify the autoreplace code that the device has been removed. 433 */ 434 static void 435 spa_check_removed(vdev_t *vd) 436 { 437 int c; 438 439 for (c = 0; c < vd->vdev_children; c++) 440 spa_check_removed(vd->vdev_child[c]); 441 442 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 443 zfs_post_autoreplace(vd->vdev_spa, vd); 444 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 445 } 446 } 447 448 /* 449 * Load an existing storage pool, using the pool's builtin spa_config as a 450 * source of configuration information. 
451 */ 452 static int 453 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 454 { 455 int error = 0; 456 nvlist_t *nvroot = NULL; 457 vdev_t *rvd; 458 uberblock_t *ub = &spa->spa_uberblock; 459 uint64_t config_cache_txg = spa->spa_config_txg; 460 uint64_t pool_guid; 461 uint64_t version; 462 zio_t *zio; 463 uint64_t autoreplace = 0; 464 465 spa->spa_load_state = state; 466 467 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 468 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 469 error = EINVAL; 470 goto out; 471 } 472 473 /* 474 * Versioning wasn't explicitly added to the label until later, so if 475 * it's not present treat it as the initial version. 476 */ 477 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 478 version = ZFS_VERSION_INITIAL; 479 480 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 481 &spa->spa_config_txg); 482 483 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 484 spa_guid_exists(pool_guid, 0)) { 485 error = EEXIST; 486 goto out; 487 } 488 489 spa->spa_load_guid = pool_guid; 490 491 /* 492 * Parse the configuration into a vdev tree. We explicitly set the 493 * value that will be returned by spa_version() since parsing the 494 * configuration requires knowing the version number. 495 */ 496 spa_config_enter(spa, RW_WRITER, FTAG); 497 spa->spa_ubsync.ub_version = version; 498 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 499 spa_config_exit(spa, FTAG); 500 501 if (error != 0) 502 goto out; 503 504 ASSERT(spa->spa_root_vdev == rvd); 505 ASSERT(spa_guid(spa) == pool_guid); 506 507 /* 508 * Try to open all vdevs, loading each label in the process. 509 */ 510 error = vdev_open(rvd); 511 if (error != 0) 512 goto out; 513 514 /* 515 * Validate the labels for all leaf vdevs. We need to grab the config 516 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 517 * flag. 518 */ 519 spa_config_enter(spa, RW_READER, FTAG); 520 error = vdev_validate(rvd); 521 spa_config_exit(spa, FTAG); 522 523 if (error != 0) 524 goto out; 525 526 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 527 error = ENXIO; 528 goto out; 529 } 530 531 /* 532 * Find the best uberblock. 533 */ 534 bzero(ub, sizeof (uberblock_t)); 535 536 zio = zio_root(spa, NULL, NULL, 537 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 538 vdev_uberblock_load(zio, rvd, ub); 539 error = zio_wait(zio); 540 541 /* 542 * If we weren't able to find a single valid uberblock, return failure. 543 */ 544 if (ub->ub_txg == 0) { 545 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 546 VDEV_AUX_CORRUPT_DATA); 547 error = ENXIO; 548 goto out; 549 } 550 551 /* 552 * If the pool is newer than the code, we can't open it. 553 */ 554 if (ub->ub_version > ZFS_VERSION) { 555 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 556 VDEV_AUX_VERSION_NEWER); 557 error = ENOTSUP; 558 goto out; 559 } 560 561 /* 562 * If the vdev guid sum doesn't match the uberblock, we have an 563 * incomplete configuration. 564 */ 565 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 566 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 567 VDEV_AUX_BAD_GUID_SUM); 568 error = ENXIO; 569 goto out; 570 } 571 572 /* 573 * Initialize internal SPA structures. 
574 */ 575 spa->spa_state = POOL_STATE_ACTIVE; 576 spa->spa_ubsync = spa->spa_uberblock; 577 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 578 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 579 if (error) { 580 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 581 VDEV_AUX_CORRUPT_DATA); 582 goto out; 583 } 584 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 585 586 if (zap_lookup(spa->spa_meta_objset, 587 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 588 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 589 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 590 VDEV_AUX_CORRUPT_DATA); 591 error = EIO; 592 goto out; 593 } 594 595 if (!mosconfig) { 596 nvlist_t *newconfig; 597 uint64_t hostid; 598 599 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 600 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 601 VDEV_AUX_CORRUPT_DATA); 602 error = EIO; 603 goto out; 604 } 605 606 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 607 &hostid) == 0) { 608 char *hostname; 609 unsigned long myhostid = 0; 610 611 VERIFY(nvlist_lookup_string(newconfig, 612 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 613 614 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 615 if (hostid != 0 && myhostid != 0 && 616 (unsigned long)hostid != myhostid) { 617 cmn_err(CE_WARN, "pool '%s' could not be " 618 "loaded as it was last accessed by " 619 "another system (host: %s hostid: 0x%lx). " 620 "See: http://www.sun.com/msg/ZFS-8000-EY", 621 spa->spa_name, hostname, 622 (unsigned long)hostid); 623 error = EBADF; 624 goto out; 625 } 626 } 627 628 spa_config_set(spa, newconfig); 629 spa_unload(spa); 630 spa_deactivate(spa); 631 spa_activate(spa); 632 633 return (spa_load(spa, newconfig, state, B_TRUE)); 634 } 635 636 if (zap_lookup(spa->spa_meta_objset, 637 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 638 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 639 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 640 VDEV_AUX_CORRUPT_DATA); 641 error = EIO; 642 goto out; 643 } 644 645 /* 646 * Load the bit that tells us to use the new accounting function 647 * (raid-z deflation). If we have an older pool, this will not 648 * be present. 649 */ 650 error = zap_lookup(spa->spa_meta_objset, 651 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 652 sizeof (uint64_t), 1, &spa->spa_deflate); 653 if (error != 0 && error != ENOENT) { 654 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 655 VDEV_AUX_CORRUPT_DATA); 656 error = EIO; 657 goto out; 658 } 659 660 /* 661 * Load the persistent error log. If we have an older pool, this will 662 * not be present. 663 */ 664 error = zap_lookup(spa->spa_meta_objset, 665 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 666 sizeof (uint64_t), 1, &spa->spa_errlog_last); 667 if (error != 0 && error != ENOENT) { 668 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 669 VDEV_AUX_CORRUPT_DATA); 670 error = EIO; 671 goto out; 672 } 673 674 error = zap_lookup(spa->spa_meta_objset, 675 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 676 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 677 if (error != 0 && error != ENOENT) { 678 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 679 VDEV_AUX_CORRUPT_DATA); 680 error = EIO; 681 goto out; 682 } 683 684 /* 685 * Load the history object. If we have an older pool, this 686 * will not be present. 
687 */ 688 error = zap_lookup(spa->spa_meta_objset, 689 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 690 sizeof (uint64_t), 1, &spa->spa_history); 691 if (error != 0 && error != ENOENT) { 692 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 693 VDEV_AUX_CORRUPT_DATA); 694 error = EIO; 695 goto out; 696 } 697 698 /* 699 * Load any hot spares for this pool. 700 */ 701 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 702 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 703 if (error != 0 && error != ENOENT) { 704 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 705 VDEV_AUX_CORRUPT_DATA); 706 error = EIO; 707 goto out; 708 } 709 if (error == 0) { 710 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 711 if (load_nvlist(spa, spa->spa_spares_object, 712 &spa->spa_sparelist) != 0) { 713 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 714 VDEV_AUX_CORRUPT_DATA); 715 error = EIO; 716 goto out; 717 } 718 719 spa_config_enter(spa, RW_WRITER, FTAG); 720 spa_load_spares(spa); 721 spa_config_exit(spa, FTAG); 722 } 723 724 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 725 726 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 727 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 728 729 if (error && error != ENOENT) { 730 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 731 VDEV_AUX_CORRUPT_DATA); 732 error = EIO; 733 goto out; 734 } 735 736 if (error == 0) { 737 (void) zap_lookup(spa->spa_meta_objset, 738 spa->spa_pool_props_object, 739 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 740 sizeof (uint64_t), 1, &spa->spa_bootfs); 741 (void) zap_lookup(spa->spa_meta_objset, 742 spa->spa_pool_props_object, 743 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 744 sizeof (uint64_t), 1, &autoreplace); 745 (void) zap_lookup(spa->spa_meta_objset, 746 spa->spa_pool_props_object, 747 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 748 sizeof (uint64_t), 1, &spa->spa_delegation); 749 } 750 751 /* 752 * If the 'autoreplace' property is set, then post a resource notifying 753 * the ZFS DE that it should not issue any faults for unopenable 754 * devices. We also iterate over the vdevs, and post a sysevent for any 755 * unopenable vdevs so that the normal autoreplace handler can take 756 * over. 757 */ 758 if (autoreplace) 759 spa_check_removed(spa->spa_root_vdev); 760 761 /* 762 * Load the vdev state for all toplevel vdevs. 763 */ 764 vdev_load(rvd); 765 766 /* 767 * Propagate the leaf DTLs we just loaded all the way up the tree. 768 */ 769 spa_config_enter(spa, RW_WRITER, FTAG); 770 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 771 spa_config_exit(spa, FTAG); 772 773 /* 774 * Check the state of the root vdev. If it can't be opened, it 775 * indicates one or more toplevel vdevs are faulted. 776 */ 777 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 778 error = ENXIO; 779 goto out; 780 } 781 782 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 783 dmu_tx_t *tx; 784 int need_update = B_FALSE; 785 int c; 786 787 /* 788 * Claim log blocks that haven't been committed yet. 789 * This must all happen in a single txg. 790 */ 791 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 792 spa_first_txg(spa)); 793 (void) dmu_objset_find(spa->spa_name, 794 zil_claim, tx, DS_FIND_CHILDREN); 795 dmu_tx_commit(tx); 796 797 spa->spa_sync_on = B_TRUE; 798 txg_sync_start(spa->spa_dsl_pool); 799 800 /* 801 * Wait for all claims to sync. 
802 */ 803 txg_wait_synced(spa->spa_dsl_pool, 0); 804 805 /* 806 * If the config cache is stale, or we have uninitialized 807 * metaslabs (see spa_vdev_add()), then update the config. 808 */ 809 if (config_cache_txg != spa->spa_config_txg || 810 state == SPA_LOAD_IMPORT) 811 need_update = B_TRUE; 812 813 for (c = 0; c < rvd->vdev_children; c++) 814 if (rvd->vdev_child[c]->vdev_ms_array == 0) 815 need_update = B_TRUE; 816 817 /* 818 * Update the config cache asychronously in case we're the 819 * root pool, in which case the config cache isn't writable yet. 820 */ 821 if (need_update) 822 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 823 } 824 825 error = 0; 826 out: 827 if (error && error != EBADF) 828 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 829 spa->spa_load_state = SPA_LOAD_NONE; 830 spa->spa_ena = 0; 831 832 return (error); 833 } 834 835 /* 836 * Pool Open/Import 837 * 838 * The import case is identical to an open except that the configuration is sent 839 * down from userland, instead of grabbed from the configuration cache. For the 840 * case of an open, the pool configuration will exist in the 841 * POOL_STATE_UNINITIALIZED state. 842 * 843 * The stats information (gen/count/ustats) is used to gather vdev statistics at 844 * the same time open the pool, without having to keep around the spa_t in some 845 * ambiguous state. 846 */ 847 static int 848 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 849 { 850 spa_t *spa; 851 int error; 852 int loaded = B_FALSE; 853 int locked = B_FALSE; 854 855 *spapp = NULL; 856 857 /* 858 * As disgusting as this is, we need to support recursive calls to this 859 * function because dsl_dir_open() is called during spa_load(), and ends 860 * up calling spa_open() again. The real fix is to figure out how to 861 * avoid dsl_dir_open() calling this in the first place. 862 */ 863 if (mutex_owner(&spa_namespace_lock) != curthread) { 864 mutex_enter(&spa_namespace_lock); 865 locked = B_TRUE; 866 } 867 868 if ((spa = spa_lookup(pool)) == NULL) { 869 if (locked) 870 mutex_exit(&spa_namespace_lock); 871 return (ENOENT); 872 } 873 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 874 875 spa_activate(spa); 876 877 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 878 879 if (error == EBADF) { 880 /* 881 * If vdev_validate() returns failure (indicated by 882 * EBADF), it indicates that one of the vdevs indicates 883 * that the pool has been exported or destroyed. If 884 * this is the case, the config cache is out of sync and 885 * we should remove the pool from the namespace. 886 */ 887 zfs_post_ok(spa, NULL); 888 spa_unload(spa); 889 spa_deactivate(spa); 890 spa_remove(spa); 891 spa_config_sync(); 892 if (locked) 893 mutex_exit(&spa_namespace_lock); 894 return (ENOENT); 895 } 896 897 if (error) { 898 /* 899 * We can't open the pool, but we still have useful 900 * information: the state of each vdev after the 901 * attempted vdev_open(). Return this to the user. 
902 */ 903 if (config != NULL && spa->spa_root_vdev != NULL) { 904 spa_config_enter(spa, RW_READER, FTAG); 905 *config = spa_config_generate(spa, NULL, -1ULL, 906 B_TRUE); 907 spa_config_exit(spa, FTAG); 908 } 909 spa_unload(spa); 910 spa_deactivate(spa); 911 spa->spa_last_open_failed = B_TRUE; 912 if (locked) 913 mutex_exit(&spa_namespace_lock); 914 *spapp = NULL; 915 return (error); 916 } else { 917 zfs_post_ok(spa, NULL); 918 spa->spa_last_open_failed = B_FALSE; 919 } 920 921 loaded = B_TRUE; 922 } 923 924 spa_open_ref(spa, tag); 925 926 /* 927 * If we just loaded the pool, resilver anything that's out of date. 928 */ 929 if (loaded && (spa_mode & FWRITE)) 930 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 931 932 if (locked) 933 mutex_exit(&spa_namespace_lock); 934 935 *spapp = spa; 936 937 if (config != NULL) { 938 spa_config_enter(spa, RW_READER, FTAG); 939 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 940 spa_config_exit(spa, FTAG); 941 } 942 943 return (0); 944 } 945 946 int 947 spa_open(const char *name, spa_t **spapp, void *tag) 948 { 949 return (spa_open_common(name, spapp, tag, NULL)); 950 } 951 952 /* 953 * Lookup the given spa_t, incrementing the inject count in the process, 954 * preventing it from being exported or destroyed. 955 */ 956 spa_t * 957 spa_inject_addref(char *name) 958 { 959 spa_t *spa; 960 961 mutex_enter(&spa_namespace_lock); 962 if ((spa = spa_lookup(name)) == NULL) { 963 mutex_exit(&spa_namespace_lock); 964 return (NULL); 965 } 966 spa->spa_inject_ref++; 967 mutex_exit(&spa_namespace_lock); 968 969 return (spa); 970 } 971 972 void 973 spa_inject_delref(spa_t *spa) 974 { 975 mutex_enter(&spa_namespace_lock); 976 spa->spa_inject_ref--; 977 mutex_exit(&spa_namespace_lock); 978 } 979 980 static void 981 spa_add_spares(spa_t *spa, nvlist_t *config) 982 { 983 nvlist_t **spares; 984 uint_t i, nspares; 985 nvlist_t *nvroot; 986 uint64_t guid; 987 vdev_stat_t *vs; 988 uint_t vsc; 989 uint64_t pool; 990 991 if (spa->spa_nspares == 0) 992 return; 993 994 VERIFY(nvlist_lookup_nvlist(config, 995 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 996 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 997 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 998 if (nspares != 0) { 999 VERIFY(nvlist_add_nvlist_array(nvroot, 1000 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1001 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1002 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1003 1004 /* 1005 * Go through and find any spares which have since been 1006 * repurposed as an active spare. If this is the case, update 1007 * their status appropriately. 1008 */ 1009 for (i = 0; i < nspares; i++) { 1010 VERIFY(nvlist_lookup_uint64(spares[i], 1011 ZPOOL_CONFIG_GUID, &guid) == 0); 1012 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 1013 VERIFY(nvlist_lookup_uint64_array( 1014 spares[i], ZPOOL_CONFIG_STATS, 1015 (uint64_t **)&vs, &vsc) == 0); 1016 vs->vs_state = VDEV_STATE_CANT_OPEN; 1017 vs->vs_aux = VDEV_AUX_SPARED; 1018 } 1019 } 1020 } 1021 } 1022 1023 int 1024 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1025 { 1026 int error; 1027 spa_t *spa; 1028 1029 *config = NULL; 1030 error = spa_open_common(name, &spa, FTAG, config); 1031 1032 if (spa && *config != NULL) { 1033 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1034 spa_get_errlog_size(spa)) == 0); 1035 1036 spa_add_spares(spa, *config); 1037 } 1038 1039 /* 1040 * We want to get the alternate root even for faulted pools, so we cheat 1041 * and call spa_lookup() directly. 
1042 */ 1043 if (altroot) { 1044 if (spa == NULL) { 1045 mutex_enter(&spa_namespace_lock); 1046 spa = spa_lookup(name); 1047 if (spa) 1048 spa_altroot(spa, altroot, buflen); 1049 else 1050 altroot[0] = '\0'; 1051 spa = NULL; 1052 mutex_exit(&spa_namespace_lock); 1053 } else { 1054 spa_altroot(spa, altroot, buflen); 1055 } 1056 } 1057 1058 if (spa != NULL) 1059 spa_close(spa, FTAG); 1060 1061 return (error); 1062 } 1063 1064 /* 1065 * Validate that the 'spares' array is well formed. We must have an array of 1066 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1067 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1068 * as they are well-formed. 1069 */ 1070 static int 1071 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1072 { 1073 nvlist_t **spares; 1074 uint_t i, nspares; 1075 vdev_t *vd; 1076 int error; 1077 1078 /* 1079 * It's acceptable to have no spares specified. 1080 */ 1081 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1082 &spares, &nspares) != 0) 1083 return (0); 1084 1085 if (nspares == 0) 1086 return (EINVAL); 1087 1088 /* 1089 * Make sure the pool is formatted with a version that supports hot 1090 * spares. 1091 */ 1092 if (spa_version(spa) < ZFS_VERSION_SPARES) 1093 return (ENOTSUP); 1094 1095 /* 1096 * Set the pending spare list so we correctly handle device in-use 1097 * checking. 1098 */ 1099 spa->spa_pending_spares = spares; 1100 spa->spa_pending_nspares = nspares; 1101 1102 for (i = 0; i < nspares; i++) { 1103 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1104 mode)) != 0) 1105 goto out; 1106 1107 if (!vd->vdev_ops->vdev_op_leaf) { 1108 vdev_free(vd); 1109 error = EINVAL; 1110 goto out; 1111 } 1112 1113 vd->vdev_top = vd; 1114 1115 if ((error = vdev_open(vd)) == 0 && 1116 (error = vdev_label_init(vd, crtxg, 1117 VDEV_LABEL_SPARE)) == 0) { 1118 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1119 vd->vdev_guid) == 0); 1120 } 1121 1122 vdev_free(vd); 1123 1124 if (error && mode != VDEV_ALLOC_SPARE) 1125 goto out; 1126 else 1127 error = 0; 1128 } 1129 1130 out: 1131 spa->spa_pending_spares = NULL; 1132 spa->spa_pending_nspares = 0; 1133 return (error); 1134 } 1135 1136 /* 1137 * Pool Creation 1138 */ 1139 int 1140 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1141 { 1142 spa_t *spa; 1143 vdev_t *rvd; 1144 dsl_pool_t *dp; 1145 dmu_tx_t *tx; 1146 int c, error = 0; 1147 uint64_t txg = TXG_INITIAL; 1148 nvlist_t **spares; 1149 uint_t nspares; 1150 1151 /* 1152 * If this pool already exists, return failure. 1153 */ 1154 mutex_enter(&spa_namespace_lock); 1155 if (spa_lookup(pool) != NULL) { 1156 mutex_exit(&spa_namespace_lock); 1157 return (EEXIST); 1158 } 1159 1160 /* 1161 * Allocate a new spa_t structure. 1162 */ 1163 spa = spa_add(pool, altroot); 1164 spa_activate(spa); 1165 1166 spa->spa_uberblock.ub_txg = txg - 1; 1167 spa->spa_uberblock.ub_version = ZFS_VERSION; 1168 spa->spa_ubsync = spa->spa_uberblock; 1169 1170 /* 1171 * Create the root vdev. 
1172 */ 1173 spa_config_enter(spa, RW_WRITER, FTAG); 1174 1175 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1176 1177 ASSERT(error != 0 || rvd != NULL); 1178 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1179 1180 if (error == 0 && rvd->vdev_children == 0) 1181 error = EINVAL; 1182 1183 if (error == 0 && 1184 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1185 (error = spa_validate_spares(spa, nvroot, txg, 1186 VDEV_ALLOC_ADD)) == 0) { 1187 for (c = 0; c < rvd->vdev_children; c++) 1188 vdev_init(rvd->vdev_child[c], txg); 1189 vdev_config_dirty(rvd); 1190 } 1191 1192 spa_config_exit(spa, FTAG); 1193 1194 if (error != 0) { 1195 spa_unload(spa); 1196 spa_deactivate(spa); 1197 spa_remove(spa); 1198 mutex_exit(&spa_namespace_lock); 1199 return (error); 1200 } 1201 1202 /* 1203 * Get the list of spares, if specified. 1204 */ 1205 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1206 &spares, &nspares) == 0) { 1207 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1208 KM_SLEEP) == 0); 1209 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1210 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1211 spa_config_enter(spa, RW_WRITER, FTAG); 1212 spa_load_spares(spa); 1213 spa_config_exit(spa, FTAG); 1214 spa->spa_sync_spares = B_TRUE; 1215 } 1216 1217 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1218 spa->spa_meta_objset = dp->dp_meta_objset; 1219 1220 tx = dmu_tx_create_assigned(dp, txg); 1221 1222 /* 1223 * Create the pool config object. 1224 */ 1225 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1226 DMU_OT_PACKED_NVLIST, 1 << 14, 1227 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1228 1229 if (zap_add(spa->spa_meta_objset, 1230 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1231 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1232 cmn_err(CE_PANIC, "failed to add pool config"); 1233 } 1234 1235 /* Newly created pools are always deflated. */ 1236 spa->spa_deflate = TRUE; 1237 if (zap_add(spa->spa_meta_objset, 1238 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1239 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1240 cmn_err(CE_PANIC, "failed to add deflate"); 1241 } 1242 1243 /* 1244 * Create the deferred-free bplist object. Turn off compression 1245 * because sync-to-convergence takes longer if the blocksize 1246 * keeps changing. 1247 */ 1248 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1249 1 << 14, tx); 1250 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1251 ZIO_COMPRESS_OFF, tx); 1252 1253 if (zap_add(spa->spa_meta_objset, 1254 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1255 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1256 cmn_err(CE_PANIC, "failed to add bplist"); 1257 } 1258 1259 /* 1260 * Create the pool's history object. 1261 */ 1262 spa_history_create_obj(spa, tx); 1263 1264 dmu_tx_commit(tx); 1265 1266 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 1267 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1268 spa->spa_sync_on = B_TRUE; 1269 txg_sync_start(spa->spa_dsl_pool); 1270 1271 /* 1272 * We explicitly wait for the first transaction to complete so that our 1273 * bean counters are appropriately updated. 1274 */ 1275 txg_wait_synced(spa->spa_dsl_pool, txg); 1276 1277 spa_config_sync(); 1278 1279 mutex_exit(&spa_namespace_lock); 1280 1281 return (0); 1282 } 1283 1284 /* 1285 * Import the given pool into the system. 
We set up the necessary spa_t and 1286 * then call spa_load() to do the dirty work. 1287 */ 1288 int 1289 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1290 { 1291 spa_t *spa; 1292 int error; 1293 nvlist_t *nvroot; 1294 nvlist_t **spares; 1295 uint_t nspares; 1296 1297 if (!(spa_mode & FWRITE)) 1298 return (EROFS); 1299 1300 /* 1301 * If a pool with this name exists, return failure. 1302 */ 1303 mutex_enter(&spa_namespace_lock); 1304 if (spa_lookup(pool) != NULL) { 1305 mutex_exit(&spa_namespace_lock); 1306 return (EEXIST); 1307 } 1308 1309 /* 1310 * Create and initialize the spa structure. 1311 */ 1312 spa = spa_add(pool, altroot); 1313 spa_activate(spa); 1314 1315 /* 1316 * Pass off the heavy lifting to spa_load(). 1317 * Pass TRUE for mosconfig because the user-supplied config 1318 * is actually the one to trust when doing an import. 1319 */ 1320 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1321 1322 spa_config_enter(spa, RW_WRITER, FTAG); 1323 /* 1324 * Toss any existing sparelist, as it doesn't have any validity anymore, 1325 * and conflicts with spa_has_spare(). 1326 */ 1327 if (spa->spa_sparelist) { 1328 nvlist_free(spa->spa_sparelist); 1329 spa->spa_sparelist = NULL; 1330 spa_load_spares(spa); 1331 } 1332 1333 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1334 &nvroot) == 0); 1335 if (error == 0) 1336 error = spa_validate_spares(spa, nvroot, -1ULL, 1337 VDEV_ALLOC_SPARE); 1338 spa_config_exit(spa, FTAG); 1339 1340 if (error != 0) { 1341 spa_unload(spa); 1342 spa_deactivate(spa); 1343 spa_remove(spa); 1344 mutex_exit(&spa_namespace_lock); 1345 return (error); 1346 } 1347 1348 /* 1349 * Override any spares as specified by the user, as these may have 1350 * correct device names/devids, etc. 1351 */ 1352 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1353 &spares, &nspares) == 0) { 1354 if (spa->spa_sparelist) 1355 VERIFY(nvlist_remove(spa->spa_sparelist, 1356 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1357 else 1358 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1359 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1360 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1361 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1362 spa_config_enter(spa, RW_WRITER, FTAG); 1363 spa_load_spares(spa); 1364 spa_config_exit(spa, FTAG); 1365 spa->spa_sync_spares = B_TRUE; 1366 } 1367 1368 /* 1369 * Update the config cache to include the newly-imported pool. 1370 */ 1371 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1372 1373 /* 1374 * Resilver anything that's out of date. 1375 */ 1376 if (spa_mode & FWRITE) 1377 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1378 1379 mutex_exit(&spa_namespace_lock); 1380 1381 return (0); 1382 } 1383 1384 /* 1385 * This (illegal) pool name is used when temporarily importing a spa_t in order 1386 * to get the vdev stats associated with the imported devices. 1387 */ 1388 #define TRYIMPORT_NAME "$import" 1389 1390 nvlist_t * 1391 spa_tryimport(nvlist_t *tryconfig) 1392 { 1393 nvlist_t *config = NULL; 1394 char *poolname; 1395 spa_t *spa; 1396 uint64_t state; 1397 1398 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1399 return (NULL); 1400 1401 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1402 return (NULL); 1403 1404 /* 1405 * Create and initialize the spa structure. 1406 */ 1407 mutex_enter(&spa_namespace_lock); 1408 spa = spa_add(TRYIMPORT_NAME, NULL); 1409 spa_activate(spa); 1410 1411 /* 1412 * Pass off the heavy lifting to spa_load(). 
1413 * Pass TRUE for mosconfig because the user-supplied config 1414 * is actually the one to trust when doing an import. 1415 */ 1416 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1417 1418 /* 1419 * If 'tryconfig' was at least parsable, return the current config. 1420 */ 1421 if (spa->spa_root_vdev != NULL) { 1422 spa_config_enter(spa, RW_READER, FTAG); 1423 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1424 spa_config_exit(spa, FTAG); 1425 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1426 poolname) == 0); 1427 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1428 state) == 0); 1429 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1430 spa->spa_uberblock.ub_timestamp) == 0); 1431 1432 /* 1433 * Add the list of hot spares. 1434 */ 1435 spa_add_spares(spa, config); 1436 } 1437 1438 spa_unload(spa); 1439 spa_deactivate(spa); 1440 spa_remove(spa); 1441 mutex_exit(&spa_namespace_lock); 1442 1443 return (config); 1444 } 1445 1446 /* 1447 * Pool export/destroy 1448 * 1449 * The act of destroying or exporting a pool is very simple. We make sure there 1450 * is no more pending I/O and any references to the pool are gone. Then, we 1451 * update the pool state and sync all the labels to disk, removing the 1452 * configuration from the cache afterwards. 1453 */ 1454 static int 1455 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1456 { 1457 spa_t *spa; 1458 1459 if (oldconfig) 1460 *oldconfig = NULL; 1461 1462 if (!(spa_mode & FWRITE)) 1463 return (EROFS); 1464 1465 mutex_enter(&spa_namespace_lock); 1466 if ((spa = spa_lookup(pool)) == NULL) { 1467 mutex_exit(&spa_namespace_lock); 1468 return (ENOENT); 1469 } 1470 1471 /* 1472 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1473 * reacquire the namespace lock, and see if we can export. 1474 */ 1475 spa_open_ref(spa, FTAG); 1476 mutex_exit(&spa_namespace_lock); 1477 spa_async_suspend(spa); 1478 mutex_enter(&spa_namespace_lock); 1479 spa_close(spa, FTAG); 1480 1481 /* 1482 * The pool will be in core if it's openable, 1483 * in which case we can modify its state. 1484 */ 1485 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1486 /* 1487 * Objsets may be open only because they're dirty, so we 1488 * have to force it to sync before checking spa_refcnt. 1489 */ 1490 spa_scrub_suspend(spa); 1491 txg_wait_synced(spa->spa_dsl_pool, 0); 1492 1493 /* 1494 * A pool cannot be exported or destroyed if there are active 1495 * references. If we are resetting a pool, allow references by 1496 * fault injection handlers. 1497 */ 1498 if (!spa_refcount_zero(spa) || 1499 (spa->spa_inject_ref != 0 && 1500 new_state != POOL_STATE_UNINITIALIZED)) { 1501 spa_scrub_resume(spa); 1502 spa_async_resume(spa); 1503 mutex_exit(&spa_namespace_lock); 1504 return (EBUSY); 1505 } 1506 1507 spa_scrub_resume(spa); 1508 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1509 1510 /* 1511 * We want this to be reflected on every label, 1512 * so mark them all dirty. spa_unload() will do the 1513 * final sync that pushes these changes out. 
1514 */ 1515 if (new_state != POOL_STATE_UNINITIALIZED) { 1516 spa_config_enter(spa, RW_WRITER, FTAG); 1517 spa->spa_state = new_state; 1518 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1519 vdev_config_dirty(spa->spa_root_vdev); 1520 spa_config_exit(spa, FTAG); 1521 } 1522 } 1523 1524 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 1525 1526 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1527 spa_unload(spa); 1528 spa_deactivate(spa); 1529 } 1530 1531 if (oldconfig && spa->spa_config) 1532 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1533 1534 if (new_state != POOL_STATE_UNINITIALIZED) { 1535 spa_remove(spa); 1536 spa_config_sync(); 1537 } 1538 mutex_exit(&spa_namespace_lock); 1539 1540 return (0); 1541 } 1542 1543 /* 1544 * Destroy a storage pool. 1545 */ 1546 int 1547 spa_destroy(char *pool) 1548 { 1549 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1550 } 1551 1552 /* 1553 * Export a storage pool. 1554 */ 1555 int 1556 spa_export(char *pool, nvlist_t **oldconfig) 1557 { 1558 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1559 } 1560 1561 /* 1562 * Similar to spa_export(), this unloads the spa_t without actually removing it 1563 * from the namespace in any way. 1564 */ 1565 int 1566 spa_reset(char *pool) 1567 { 1568 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1569 } 1570 1571 1572 /* 1573 * ========================================================================== 1574 * Device manipulation 1575 * ========================================================================== 1576 */ 1577 1578 /* 1579 * Add a device to a storage pool. 1580 */ 1581 int 1582 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1583 { 1584 uint64_t txg; 1585 int c, error; 1586 vdev_t *rvd = spa->spa_root_vdev; 1587 vdev_t *vd, *tvd; 1588 nvlist_t **spares; 1589 uint_t i, nspares; 1590 1591 txg = spa_vdev_enter(spa); 1592 1593 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1594 VDEV_ALLOC_ADD)) != 0) 1595 return (spa_vdev_exit(spa, NULL, txg, error)); 1596 1597 spa->spa_pending_vdev = vd; 1598 1599 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1600 &spares, &nspares) != 0) 1601 nspares = 0; 1602 1603 if (vd->vdev_children == 0 && nspares == 0) { 1604 spa->spa_pending_vdev = NULL; 1605 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1606 } 1607 1608 if (vd->vdev_children != 0) { 1609 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1610 spa->spa_pending_vdev = NULL; 1611 return (spa_vdev_exit(spa, vd, txg, error)); 1612 } 1613 } 1614 1615 /* 1616 * We must validate the spares after checking the children. Otherwise, 1617 * vdev_inuse() will blindly overwrite the spare. 1618 */ 1619 if ((error = spa_validate_spares(spa, nvroot, txg, 1620 VDEV_ALLOC_ADD)) != 0) { 1621 spa->spa_pending_vdev = NULL; 1622 return (spa_vdev_exit(spa, vd, txg, error)); 1623 } 1624 1625 spa->spa_pending_vdev = NULL; 1626 1627 /* 1628 * Transfer each new top-level vdev from vd to rvd. 
1629 */ 1630 for (c = 0; c < vd->vdev_children; c++) { 1631 tvd = vd->vdev_child[c]; 1632 vdev_remove_child(vd, tvd); 1633 tvd->vdev_id = rvd->vdev_children; 1634 vdev_add_child(rvd, tvd); 1635 vdev_config_dirty(tvd); 1636 } 1637 1638 if (nspares != 0) { 1639 if (spa->spa_sparelist != NULL) { 1640 nvlist_t **oldspares; 1641 uint_t oldnspares; 1642 nvlist_t **newspares; 1643 1644 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1645 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1646 1647 newspares = kmem_alloc(sizeof (void *) * 1648 (nspares + oldnspares), KM_SLEEP); 1649 for (i = 0; i < oldnspares; i++) 1650 VERIFY(nvlist_dup(oldspares[i], 1651 &newspares[i], KM_SLEEP) == 0); 1652 for (i = 0; i < nspares; i++) 1653 VERIFY(nvlist_dup(spares[i], 1654 &newspares[i + oldnspares], 1655 KM_SLEEP) == 0); 1656 1657 VERIFY(nvlist_remove(spa->spa_sparelist, 1658 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1659 1660 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1661 ZPOOL_CONFIG_SPARES, newspares, 1662 nspares + oldnspares) == 0); 1663 for (i = 0; i < oldnspares + nspares; i++) 1664 nvlist_free(newspares[i]); 1665 kmem_free(newspares, (oldnspares + nspares) * 1666 sizeof (void *)); 1667 } else { 1668 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1669 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1670 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1671 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1672 } 1673 1674 spa_load_spares(spa); 1675 spa->spa_sync_spares = B_TRUE; 1676 } 1677 1678 /* 1679 * We have to be careful when adding new vdevs to an existing pool. 1680 * If other threads start allocating from these vdevs before we 1681 * sync the config cache, and we lose power, then upon reboot we may 1682 * fail to open the pool because there are DVAs that the config cache 1683 * can't translate. Therefore, we first add the vdevs without 1684 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1685 * and then let spa_config_update() initialize the new metaslabs. 1686 * 1687 * spa_load() checks for added-but-not-initialized vdevs, so that 1688 * if we lose power at any point in this sequence, the remaining 1689 * steps will be completed the next time we load the pool. 1690 */ 1691 (void) spa_vdev_exit(spa, vd, txg, 0); 1692 1693 mutex_enter(&spa_namespace_lock); 1694 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1695 mutex_exit(&spa_namespace_lock); 1696 1697 return (0); 1698 } 1699 1700 /* 1701 * Attach a device to a mirror. The arguments are the path to any device 1702 * in the mirror, and the nvroot for the new device. If the path specifies 1703 * a device that is not mirrored, we automatically insert the mirror vdev. 1704 * 1705 * If 'replacing' is specified, the new device is intended to replace the 1706 * existing device; in this case the two devices are made into their own 1707 * mirror using the 'replacing' vdev, which is functionally identical to 1708 * the mirror vdev (it actually reuses all the same ops) but has a few 1709 * extra rules: you can't attach to it after it's been created, and upon 1710 * completion of resilvering, the first disk (the one being replaced) 1711 * is automatically detached. 
1712 */ 1713 int 1714 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1715 { 1716 uint64_t txg, open_txg; 1717 int error; 1718 vdev_t *rvd = spa->spa_root_vdev; 1719 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1720 vdev_ops_t *pvops; 1721 int is_log; 1722 1723 txg = spa_vdev_enter(spa); 1724 1725 oldvd = vdev_lookup_by_guid(rvd, guid); 1726 1727 if (oldvd == NULL) 1728 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1729 1730 if (!oldvd->vdev_ops->vdev_op_leaf) 1731 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1732 1733 pvd = oldvd->vdev_parent; 1734 1735 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1736 VDEV_ALLOC_ADD)) != 0) 1737 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 1738 1739 if (newrootvd->vdev_children != 1) 1740 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1741 1742 newvd = newrootvd->vdev_child[0]; 1743 1744 if (!newvd->vdev_ops->vdev_op_leaf) 1745 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1746 1747 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1748 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1749 1750 /* 1751 * Spares can't replace logs 1752 */ 1753 is_log = oldvd->vdev_islog; 1754 if (is_log && newvd->vdev_isspare) 1755 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1756 1757 if (!replacing) { 1758 /* 1759 * For attach, the only allowable parent is a mirror or the root 1760 * vdev. 1761 */ 1762 if (pvd->vdev_ops != &vdev_mirror_ops && 1763 pvd->vdev_ops != &vdev_root_ops) 1764 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1765 1766 pvops = &vdev_mirror_ops; 1767 } else { 1768 /* 1769 * Active hot spares can only be replaced by inactive hot 1770 * spares. 1771 */ 1772 if (pvd->vdev_ops == &vdev_spare_ops && 1773 pvd->vdev_child[1] == oldvd && 1774 !spa_has_spare(spa, newvd->vdev_guid)) 1775 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1776 1777 /* 1778 * If the source is a hot spare, and the parent isn't already a 1779 * spare, then we want to create a new hot spare. Otherwise, we 1780 * want to create a replacing vdev. The user is not allowed to 1781 * attach to a spared vdev child unless the 'isspare' state is 1782 * the same (spare replaces spare, non-spare replaces 1783 * non-spare). 1784 */ 1785 if (pvd->vdev_ops == &vdev_replacing_ops) 1786 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1787 else if (pvd->vdev_ops == &vdev_spare_ops && 1788 newvd->vdev_isspare != oldvd->vdev_isspare) 1789 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1790 else if (pvd->vdev_ops != &vdev_spare_ops && 1791 newvd->vdev_isspare) 1792 pvops = &vdev_spare_ops; 1793 else 1794 pvops = &vdev_replacing_ops; 1795 } 1796 1797 /* 1798 * Compare the new device size with the replaceable/attachable 1799 * device size. 1800 */ 1801 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1802 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1803 1804 /* 1805 * The new device cannot have a higher alignment requirement 1806 * than the top-level vdev. 1807 */ 1808 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1809 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1810 1811 /* 1812 * If this is an in-place replacement, update oldvd's path and devid 1813 * to make it distinguishable from newvd, and unopenable from now on. 
1814 */ 1815 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1816 spa_strfree(oldvd->vdev_path); 1817 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1818 KM_SLEEP); 1819 (void) sprintf(oldvd->vdev_path, "%s/%s", 1820 newvd->vdev_path, "old"); 1821 if (oldvd->vdev_devid != NULL) { 1822 spa_strfree(oldvd->vdev_devid); 1823 oldvd->vdev_devid = NULL; 1824 } 1825 } 1826 1827 /* 1828 * If the parent is not a mirror, or if we're replacing, insert the new 1829 * mirror/replacing/spare vdev above oldvd. 1830 */ 1831 if (pvd->vdev_ops != pvops) 1832 pvd = vdev_add_parent(oldvd, pvops); 1833 1834 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1835 ASSERT(pvd->vdev_ops == pvops); 1836 ASSERT(oldvd->vdev_parent == pvd); 1837 1838 /* 1839 * Extract the new device from its root and add it to pvd. 1840 */ 1841 vdev_remove_child(newrootvd, newvd); 1842 newvd->vdev_id = pvd->vdev_children; 1843 vdev_add_child(pvd, newvd); 1844 1845 /* 1846 * If newvd is smaller than oldvd, but larger than its rsize, 1847 * the addition of newvd may have decreased our parent's asize. 1848 */ 1849 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1850 1851 tvd = newvd->vdev_top; 1852 ASSERT(pvd->vdev_top == tvd); 1853 ASSERT(tvd->vdev_parent == rvd); 1854 1855 vdev_config_dirty(tvd); 1856 1857 /* 1858 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1859 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1860 */ 1861 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1862 1863 mutex_enter(&newvd->vdev_dtl_lock); 1864 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1865 open_txg - TXG_INITIAL + 1); 1866 mutex_exit(&newvd->vdev_dtl_lock); 1867 1868 if (newvd->vdev_isspare) 1869 spa_spare_activate(newvd); 1870 1871 /* 1872 * Mark newvd's DTL dirty in this txg. 1873 */ 1874 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1875 1876 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1877 1878 /* 1879 * Kick off a resilver to update newvd. We need to grab the namespace 1880 * lock because spa_scrub() needs to post a sysevent with the pool name. 1881 */ 1882 mutex_enter(&spa_namespace_lock); 1883 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1884 mutex_exit(&spa_namespace_lock); 1885 1886 return (0); 1887 } 1888 1889 /* 1890 * Detach a device from a mirror or replacing vdev. 1891 * If 'replace_done' is specified, only detach if the parent 1892 * is a replacing vdev. 1893 */ 1894 int 1895 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1896 { 1897 uint64_t txg; 1898 int c, t, error; 1899 vdev_t *rvd = spa->spa_root_vdev; 1900 vdev_t *vd, *pvd, *cvd, *tvd; 1901 boolean_t unspare = B_FALSE; 1902 uint64_t unspare_guid; 1903 1904 txg = spa_vdev_enter(spa); 1905 1906 vd = vdev_lookup_by_guid(rvd, guid); 1907 1908 if (vd == NULL) 1909 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1910 1911 if (!vd->vdev_ops->vdev_op_leaf) 1912 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1913 1914 pvd = vd->vdev_parent; 1915 1916 /* 1917 * If replace_done is specified, only remove this device if it's 1918 * the first child of a replacing vdev. For the 'spare' vdev, either 1919 * disk can be removed. 
1920 */ 1921 if (replace_done) { 1922 if (pvd->vdev_ops == &vdev_replacing_ops) { 1923 if (vd->vdev_id != 0) 1924 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1925 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1926 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1927 } 1928 } 1929 1930 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1931 spa_version(spa) >= ZFS_VERSION_SPARES); 1932 1933 /* 1934 * Only mirror, replacing, and spare vdevs support detach. 1935 */ 1936 if (pvd->vdev_ops != &vdev_replacing_ops && 1937 pvd->vdev_ops != &vdev_mirror_ops && 1938 pvd->vdev_ops != &vdev_spare_ops) 1939 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1940 1941 /* 1942 * If there's only one replica, you can't detach it. 1943 */ 1944 if (pvd->vdev_children <= 1) 1945 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1946 1947 /* 1948 * If all siblings have non-empty DTLs, this device may have the only 1949 * valid copy of the data, which means we cannot safely detach it. 1950 * 1951 * XXX -- as in the vdev_offline() case, we really want a more 1952 * precise DTL check. 1953 */ 1954 for (c = 0; c < pvd->vdev_children; c++) { 1955 uint64_t dirty; 1956 1957 cvd = pvd->vdev_child[c]; 1958 if (cvd == vd) 1959 continue; 1960 if (vdev_is_dead(cvd)) 1961 continue; 1962 mutex_enter(&cvd->vdev_dtl_lock); 1963 dirty = cvd->vdev_dtl_map.sm_space | 1964 cvd->vdev_dtl_scrub.sm_space; 1965 mutex_exit(&cvd->vdev_dtl_lock); 1966 if (!dirty) 1967 break; 1968 } 1969 1970 /* 1971 * If we are a replacing or spare vdev, then we can always detach the 1972 * latter child, as that is how one cancels the operation. 1973 */ 1974 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1975 c == pvd->vdev_children) 1976 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1977 1978 /* 1979 * If we are detaching the original disk from a spare, then it implies 1980 * that the spare should become a real disk, and be removed from the 1981 * active spare list for the pool. 1982 */ 1983 if (pvd->vdev_ops == &vdev_spare_ops && 1984 vd->vdev_id == 0) 1985 unspare = B_TRUE; 1986 1987 /* 1988 * Erase the disk labels so the disk can be used for other things. 1989 * This must be done after all other error cases are handled, 1990 * but before we disembowel vd (so we can still do I/O to it). 1991 * But if we can't do it, don't treat the error as fatal -- 1992 * it may be that the unwritability of the disk is the reason 1993 * it's being detached! 1994 */ 1995 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1996 1997 /* 1998 * Remove vd from its parent and compact the parent's children. 1999 */ 2000 vdev_remove_child(pvd, vd); 2001 vdev_compact_children(pvd); 2002 2003 /* 2004 * Remember one of the remaining children so we can get tvd below. 2005 */ 2006 cvd = pvd->vdev_child[0]; 2007 2008 /* 2009 * If we need to remove the remaining child from the list of hot spares, 2010 * do it now, marking the vdev as no longer a spare in the process. We 2011 * must do this before vdev_remove_parent(), because that can change the 2012 * GUID if it creates a new toplevel GUID. 2013 */ 2014 if (unspare) { 2015 ASSERT(cvd->vdev_isspare); 2016 spa_spare_remove(cvd); 2017 unspare_guid = cvd->vdev_guid; 2018 } 2019 2020 /* 2021 * If the parent mirror/replacing vdev only has one child, 2022 * the parent is no longer needed. Remove it from the tree. 2023 */ 2024 if (pvd->vdev_children == 1) 2025 vdev_remove_parent(cvd); 2026 2027 /* 2028 * We don't set tvd until now because the parent we just removed 2029 * may have been the previous top-level vdev. 
2030 */ 2031 tvd = cvd->vdev_top; 2032 ASSERT(tvd->vdev_parent == rvd); 2033 2034 /* 2035 * Reevaluate the parent vdev state. 2036 */ 2037 vdev_propagate_state(cvd); 2038 2039 /* 2040 * If the device we just detached was smaller than the others, it may be 2041 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2042 * can't fail because the existing metaslabs are already in core, so 2043 * there's nothing to read from disk. 2044 */ 2045 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2046 2047 vdev_config_dirty(tvd); 2048 2049 /* 2050 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2051 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2052 * But first make sure we're not on any *other* txg's DTL list, to 2053 * prevent vd from being accessed after it's freed. 2054 */ 2055 for (t = 0; t < TXG_SIZE; t++) 2056 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2057 vd->vdev_detached = B_TRUE; 2058 vdev_dirty(tvd, VDD_DTL, vd, txg); 2059 2060 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 2061 2062 error = spa_vdev_exit(spa, vd, txg, 0); 2063 2064 /* 2065 * If this was the removal of the original device in a hot spare vdev, 2066 * then we want to go through and remove the device from the hot spare 2067 * list of every other pool. 2068 */ 2069 if (unspare) { 2070 spa = NULL; 2071 mutex_enter(&spa_namespace_lock); 2072 while ((spa = spa_next(spa)) != NULL) { 2073 if (spa->spa_state != POOL_STATE_ACTIVE) 2074 continue; 2075 2076 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2077 } 2078 mutex_exit(&spa_namespace_lock); 2079 } 2080 2081 return (error); 2082 } 2083 2084 /* 2085 * Remove a device from the pool. Currently, this supports removing only hot 2086 * spares. 2087 */ 2088 int 2089 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2090 { 2091 vdev_t *vd; 2092 nvlist_t **spares, *nv, **newspares; 2093 uint_t i, j, nspares; 2094 int ret = 0; 2095 2096 spa_config_enter(spa, RW_WRITER, FTAG); 2097 2098 vd = spa_lookup_by_guid(spa, guid); 2099 2100 nv = NULL; 2101 if (spa->spa_spares != NULL && 2102 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2103 &spares, &nspares) == 0) { 2104 for (i = 0; i < nspares; i++) { 2105 uint64_t theguid; 2106 2107 VERIFY(nvlist_lookup_uint64(spares[i], 2108 ZPOOL_CONFIG_GUID, &theguid) == 0); 2109 if (theguid == guid) { 2110 nv = spares[i]; 2111 break; 2112 } 2113 } 2114 } 2115 2116 /* 2117 * We only support removing a hot spare, and only if it's not currently 2118 * in use in this pool. 
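 * Concretely: a guid that matches neither the spare list (nv) nor a vdev
 * in the pool (vd) doesn't exist; a vdev that isn't a listed spare can't
 * be removed this way; and a listed spare that is also attached to the
 * pool is busy unless the caller passed 'unspare'.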
2119 */ 2120 if (nv == NULL && vd == NULL) { 2121 ret = ENOENT; 2122 goto out; 2123 } 2124 2125 if (nv == NULL && vd != NULL) { 2126 ret = ENOTSUP; 2127 goto out; 2128 } 2129 2130 if (!unspare && nv != NULL && vd != NULL) { 2131 ret = EBUSY; 2132 goto out; 2133 } 2134 2135 if (nspares == 1) { 2136 newspares = NULL; 2137 } else { 2138 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2139 KM_SLEEP); 2140 for (i = 0, j = 0; i < nspares; i++) { 2141 if (spares[i] != nv) 2142 VERIFY(nvlist_dup(spares[i], 2143 &newspares[j++], KM_SLEEP) == 0); 2144 } 2145 } 2146 2147 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2148 DATA_TYPE_NVLIST_ARRAY) == 0); 2149 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2150 newspares, nspares - 1) == 0); 2151 for (i = 0; i < nspares - 1; i++) 2152 nvlist_free(newspares[i]); 2153 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2154 spa_load_spares(spa); 2155 spa->spa_sync_spares = B_TRUE; 2156 2157 out: 2158 spa_config_exit(spa, FTAG); 2159 2160 return (ret); 2161 } 2162 2163 /* 2164 * Find any device that's done replacing, or a vdev marked 'unspare' that's 2165 * current spared, so we can detach it. 2166 */ 2167 static vdev_t * 2168 spa_vdev_resilver_done_hunt(vdev_t *vd) 2169 { 2170 vdev_t *newvd, *oldvd; 2171 int c; 2172 2173 for (c = 0; c < vd->vdev_children; c++) { 2174 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 2175 if (oldvd != NULL) 2176 return (oldvd); 2177 } 2178 2179 /* 2180 * Check for a completed replacement. 2181 */ 2182 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2183 oldvd = vd->vdev_child[0]; 2184 newvd = vd->vdev_child[1]; 2185 2186 mutex_enter(&newvd->vdev_dtl_lock); 2187 if (newvd->vdev_dtl_map.sm_space == 0 && 2188 newvd->vdev_dtl_scrub.sm_space == 0) { 2189 mutex_exit(&newvd->vdev_dtl_lock); 2190 return (oldvd); 2191 } 2192 mutex_exit(&newvd->vdev_dtl_lock); 2193 } 2194 2195 /* 2196 * Check for a completed resilver with the 'unspare' flag set. 2197 */ 2198 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 2199 newvd = vd->vdev_child[0]; 2200 oldvd = vd->vdev_child[1]; 2201 2202 mutex_enter(&newvd->vdev_dtl_lock); 2203 if (newvd->vdev_unspare && 2204 newvd->vdev_dtl_map.sm_space == 0 && 2205 newvd->vdev_dtl_scrub.sm_space == 0) { 2206 newvd->vdev_unspare = 0; 2207 mutex_exit(&newvd->vdev_dtl_lock); 2208 return (oldvd); 2209 } 2210 mutex_exit(&newvd->vdev_dtl_lock); 2211 } 2212 2213 return (NULL); 2214 } 2215 2216 static void 2217 spa_vdev_resilver_done(spa_t *spa) 2218 { 2219 vdev_t *vd; 2220 vdev_t *pvd; 2221 uint64_t guid; 2222 uint64_t pguid = 0; 2223 2224 spa_config_enter(spa, RW_READER, FTAG); 2225 2226 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 2227 guid = vd->vdev_guid; 2228 /* 2229 * If we have just finished replacing a hot spared device, then 2230 * we need to detach the parent's first child (the original hot 2231 * spare) as well. 
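 * We record the spare's guid in pguid here because both detaches must be
 * issued after dropping the config lock; spa_vdev_detach() reacquires it
 * through spa_vdev_enter().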
2232 */ 2233 pvd = vd->vdev_parent; 2234 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2235 pvd->vdev_id == 0) { 2236 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2237 ASSERT(pvd->vdev_parent->vdev_children == 2); 2238 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2239 } 2240 spa_config_exit(spa, FTAG); 2241 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2242 return; 2243 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2244 return; 2245 spa_config_enter(spa, RW_READER, FTAG); 2246 } 2247 2248 spa_config_exit(spa, FTAG); 2249 } 2250 2251 /* 2252 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2253 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2254 */ 2255 int 2256 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2257 { 2258 vdev_t *rvd, *vd; 2259 uint64_t txg; 2260 2261 rvd = spa->spa_root_vdev; 2262 2263 txg = spa_vdev_enter(spa); 2264 2265 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2266 /* 2267 * Determine if this is a reference to a hot spare. In that 2268 * case, update the path as stored in the spare list. 2269 */ 2270 nvlist_t **spares; 2271 uint_t i, nspares; 2272 if (spa->spa_sparelist != NULL) { 2273 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2274 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2275 for (i = 0; i < nspares; i++) { 2276 uint64_t theguid; 2277 VERIFY(nvlist_lookup_uint64(spares[i], 2278 ZPOOL_CONFIG_GUID, &theguid) == 0); 2279 if (theguid == guid) 2280 break; 2281 } 2282 2283 if (i == nspares) 2284 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2285 2286 VERIFY(nvlist_add_string(spares[i], 2287 ZPOOL_CONFIG_PATH, newpath) == 0); 2288 spa_load_spares(spa); 2289 spa->spa_sync_spares = B_TRUE; 2290 return (spa_vdev_exit(spa, NULL, txg, 0)); 2291 } else { 2292 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2293 } 2294 } 2295 2296 if (!vd->vdev_ops->vdev_op_leaf) 2297 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2298 2299 spa_strfree(vd->vdev_path); 2300 vd->vdev_path = spa_strdup(newpath); 2301 2302 vdev_config_dirty(vd->vdev_top); 2303 2304 return (spa_vdev_exit(spa, NULL, txg, 0)); 2305 } 2306 2307 /* 2308 * ========================================================================== 2309 * SPA Scrubbing 2310 * ========================================================================== 2311 */ 2312 2313 static void 2314 spa_scrub_io_done(zio_t *zio) 2315 { 2316 spa_t *spa = zio->io_spa; 2317 2318 arc_data_buf_free(zio->io_data, zio->io_size); 2319 2320 mutex_enter(&spa->spa_scrub_lock); 2321 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2322 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2323 spa->spa_scrub_errors++; 2324 mutex_enter(&vd->vdev_stat_lock); 2325 vd->vdev_stat.vs_scrub_errors++; 2326 mutex_exit(&vd->vdev_stat_lock); 2327 } 2328 2329 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2330 cv_broadcast(&spa->spa_scrub_io_cv); 2331 2332 ASSERT(spa->spa_scrub_inflight >= 0); 2333 2334 mutex_exit(&spa->spa_scrub_lock); 2335 } 2336 2337 static void 2338 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2339 zbookmark_t *zb) 2340 { 2341 size_t size = BP_GET_LSIZE(bp); 2342 void *data; 2343 2344 mutex_enter(&spa->spa_scrub_lock); 2345 /* 2346 * Do not give too much work to vdev(s). 
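 * We block here until spa_scrub_inflight drops below
 * spa_scrub_maxinflight; spa_scrub_io_done() broadcasts
 * spa_scrub_io_cv as scrub I/Os complete.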
2347 */ 2348 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2349 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2350 } 2351 spa->spa_scrub_inflight++; 2352 mutex_exit(&spa->spa_scrub_lock); 2353 2354 data = arc_data_buf_alloc(size); 2355 2356 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2357 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2358 2359 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2360 2361 zio_nowait(zio_read(NULL, spa, bp, data, size, 2362 spa_scrub_io_done, NULL, priority, flags, zb)); 2363 } 2364 2365 /* ARGSUSED */ 2366 static int 2367 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2368 { 2369 blkptr_t *bp = &bc->bc_blkptr; 2370 vdev_t *vd = spa->spa_root_vdev; 2371 dva_t *dva = bp->blk_dva; 2372 int needs_resilver = B_FALSE; 2373 int d; 2374 2375 if (bc->bc_errno) { 2376 /* 2377 * We can't scrub this block, but we can continue to scrub 2378 * the rest of the pool. Note the error and move along. 2379 */ 2380 mutex_enter(&spa->spa_scrub_lock); 2381 spa->spa_scrub_errors++; 2382 mutex_exit(&spa->spa_scrub_lock); 2383 2384 mutex_enter(&vd->vdev_stat_lock); 2385 vd->vdev_stat.vs_scrub_errors++; 2386 mutex_exit(&vd->vdev_stat_lock); 2387 2388 return (ERESTART); 2389 } 2390 2391 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2392 2393 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2394 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2395 2396 ASSERT(vd != NULL); 2397 2398 /* 2399 * Keep track of how much data we've examined so that 2400 * zpool(1M) status can make useful progress reports. 2401 */ 2402 mutex_enter(&vd->vdev_stat_lock); 2403 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2404 mutex_exit(&vd->vdev_stat_lock); 2405 2406 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2407 if (DVA_GET_GANG(&dva[d])) { 2408 /* 2409 * Gang members may be spread across multiple 2410 * vdevs, so the best we can do is look at the 2411 * pool-wide DTL. 2412 * XXX -- it would be better to change our 2413 * allocation policy to ensure that this can't 2414 * happen. 2415 */ 2416 vd = spa->spa_root_vdev; 2417 } 2418 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2419 bp->blk_birth, 1)) 2420 needs_resilver = B_TRUE; 2421 } 2422 } 2423 2424 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2425 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2426 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2427 else if (needs_resilver) 2428 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2429 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2430 2431 return (0); 2432 } 2433 2434 static void 2435 spa_scrub_thread(spa_t *spa) 2436 { 2437 callb_cpr_t cprinfo; 2438 traverse_handle_t *th = spa->spa_scrub_th; 2439 vdev_t *rvd = spa->spa_root_vdev; 2440 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2441 int error = 0; 2442 boolean_t complete; 2443 2444 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2445 2446 /* 2447 * If we're restarting due to a snapshot create/delete, 2448 * wait for that to complete. 2449 */ 2450 txg_wait_synced(spa_get_dsl(spa), 0); 2451 2452 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2453 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2454 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2455 2456 spa_config_enter(spa, RW_WRITER, FTAG); 2457 vdev_reopen(rvd); /* purge all vdev caches */ 2458 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2459 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2460 spa_config_exit(spa, FTAG); 2461 2462 mutex_enter(&spa->spa_scrub_lock); 2463 spa->spa_scrub_errors = 0; 2464 spa->spa_scrub_active = 1; 2465 ASSERT(spa->spa_scrub_inflight == 0); 2466 2467 while (!spa->spa_scrub_stop) { 2468 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2469 while (spa->spa_scrub_suspended) { 2470 spa->spa_scrub_active = 0; 2471 cv_broadcast(&spa->spa_scrub_cv); 2472 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2473 spa->spa_scrub_active = 1; 2474 } 2475 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2476 2477 if (spa->spa_scrub_restart_txg != 0) 2478 break; 2479 2480 mutex_exit(&spa->spa_scrub_lock); 2481 error = traverse_more(th); 2482 mutex_enter(&spa->spa_scrub_lock); 2483 if (error != EAGAIN) 2484 break; 2485 } 2486 2487 while (spa->spa_scrub_inflight) 2488 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2489 2490 spa->spa_scrub_active = 0; 2491 cv_broadcast(&spa->spa_scrub_cv); 2492 2493 mutex_exit(&spa->spa_scrub_lock); 2494 2495 spa_config_enter(spa, RW_WRITER, FTAG); 2496 2497 mutex_enter(&spa->spa_scrub_lock); 2498 2499 /* 2500 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2501 * AND the spa config lock to synchronize with any config changes 2502 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2503 */ 2504 if (spa->spa_scrub_restart_txg != 0) 2505 error = ERESTART; 2506 2507 if (spa->spa_scrub_stop) 2508 error = EINTR; 2509 2510 /* 2511 * Even if there were uncorrectable errors, we consider the scrub 2512 * completed. The downside is that if there is a transient error during 2513 * a resilver, we won't resilver the data properly to the target. But 2514 * if the damage is permanent (more likely) we will resilver forever, 2515 * which isn't really acceptable. Since there is enough information for 2516 * the user to know what has failed and why, this seems like a more 2517 * tractable approach. 2518 */ 2519 complete = (error == 0); 2520 2521 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2522 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2523 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2524 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2525 2526 mutex_exit(&spa->spa_scrub_lock); 2527 2528 /* 2529 * If the scrub/resilver completed, update all DTLs to reflect this. 2530 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2531 */ 2532 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2533 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2534 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2535 spa_errlog_rotate(spa); 2536 2537 if (scrub_type == POOL_SCRUB_RESILVER && complete) 2538 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 2539 2540 spa_config_exit(spa, FTAG); 2541 2542 mutex_enter(&spa->spa_scrub_lock); 2543 2544 /* 2545 * We may have finished replacing a device. 2546 * Let the async thread assess this and handle the detach. 2547 */ 2548 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2549 2550 /* 2551 * If we were told to restart, our final act is to start a new scrub. 2552 */ 2553 if (error == ERESTART) 2554 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2555 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2556 2557 spa->spa_scrub_type = POOL_SCRUB_NONE; 2558 spa->spa_scrub_active = 0; 2559 spa->spa_scrub_thread = NULL; 2560 cv_broadcast(&spa->spa_scrub_cv); 2561 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2562 thread_exit(); 2563 } 2564 2565 void 2566 spa_scrub_suspend(spa_t *spa) 2567 { 2568 mutex_enter(&spa->spa_scrub_lock); 2569 spa->spa_scrub_suspended++; 2570 while (spa->spa_scrub_active) { 2571 cv_broadcast(&spa->spa_scrub_cv); 2572 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2573 } 2574 while (spa->spa_scrub_inflight) 2575 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2576 mutex_exit(&spa->spa_scrub_lock); 2577 } 2578 2579 void 2580 spa_scrub_resume(spa_t *spa) 2581 { 2582 mutex_enter(&spa->spa_scrub_lock); 2583 ASSERT(spa->spa_scrub_suspended != 0); 2584 if (--spa->spa_scrub_suspended == 0) 2585 cv_broadcast(&spa->spa_scrub_cv); 2586 mutex_exit(&spa->spa_scrub_lock); 2587 } 2588 2589 void 2590 spa_scrub_restart(spa_t *spa, uint64_t txg) 2591 { 2592 /* 2593 * Something happened (e.g. snapshot create/delete) that means 2594 * we must restart any in-progress scrubs. The itinerary will 2595 * fix this properly. 2596 */ 2597 mutex_enter(&spa->spa_scrub_lock); 2598 spa->spa_scrub_restart_txg = txg; 2599 mutex_exit(&spa->spa_scrub_lock); 2600 } 2601 2602 int 2603 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2604 { 2605 space_seg_t *ss; 2606 uint64_t mintxg, maxtxg; 2607 vdev_t *rvd = spa->spa_root_vdev; 2608 2609 if ((uint_t)type >= POOL_SCRUB_TYPES) 2610 return (ENOTSUP); 2611 2612 mutex_enter(&spa->spa_scrub_lock); 2613 2614 /* 2615 * If there's a scrub or resilver already in progress, stop it. 2616 */ 2617 while (spa->spa_scrub_thread != NULL) { 2618 /* 2619 * Don't stop a resilver unless forced. 2620 */ 2621 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2622 mutex_exit(&spa->spa_scrub_lock); 2623 return (EBUSY); 2624 } 2625 spa->spa_scrub_stop = 1; 2626 cv_broadcast(&spa->spa_scrub_cv); 2627 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2628 } 2629 2630 /* 2631 * Terminate the previous traverse. 2632 */ 2633 if (spa->spa_scrub_th != NULL) { 2634 traverse_fini(spa->spa_scrub_th); 2635 spa->spa_scrub_th = NULL; 2636 } 2637 2638 if (rvd == NULL) { 2639 ASSERT(spa->spa_scrub_stop == 0); 2640 ASSERT(spa->spa_scrub_type == type); 2641 ASSERT(spa->spa_scrub_restart_txg == 0); 2642 mutex_exit(&spa->spa_scrub_lock); 2643 return (0); 2644 } 2645 2646 mintxg = TXG_INITIAL - 1; 2647 maxtxg = spa_last_synced_txg(spa) + 1; 2648 2649 mutex_enter(&rvd->vdev_dtl_lock); 2650 2651 if (rvd->vdev_dtl_map.sm_space == 0) { 2652 /* 2653 * The pool-wide DTL is empty. 2654 * If this is a resilver, there's nothing to do except 2655 * check whether any in-progress replacements have completed. 2656 */ 2657 if (type == POOL_SCRUB_RESILVER) { 2658 type = POOL_SCRUB_NONE; 2659 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2660 } 2661 } else { 2662 /* 2663 * The pool-wide DTL is non-empty. 2664 * If this is a normal scrub, upgrade to a resilver instead. 2665 */ 2666 if (type == POOL_SCRUB_EVERYTHING) 2667 type = POOL_SCRUB_RESILVER; 2668 } 2669 2670 if (type == POOL_SCRUB_RESILVER) { 2671 /* 2672 * Determine the resilvering boundaries. 2673 * 2674 * Note: (mintxg, maxtxg) is an open interval, 2675 * i.e. mintxg and maxtxg themselves are not included. 2676 * 2677 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2678 * so we don't claim to resilver a txg that's still changing. 
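 * mintxg comes from the start of the first segment in the pool-wide DTL
 * (minus one) and maxtxg from the end of the last segment, so the
 * traverse only visits txgs that some vdev may actually be missing.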
2679 */ 2680 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2681 mintxg = ss->ss_start - 1; 2682 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2683 maxtxg = MIN(ss->ss_end, maxtxg); 2684 2685 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 2686 } 2687 2688 mutex_exit(&rvd->vdev_dtl_lock); 2689 2690 spa->spa_scrub_stop = 0; 2691 spa->spa_scrub_type = type; 2692 spa->spa_scrub_restart_txg = 0; 2693 2694 if (type != POOL_SCRUB_NONE) { 2695 spa->spa_scrub_mintxg = mintxg; 2696 spa->spa_scrub_maxtxg = maxtxg; 2697 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2698 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2699 ZIO_FLAG_CANFAIL); 2700 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2701 spa->spa_scrub_thread = thread_create(NULL, 0, 2702 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2703 } 2704 2705 mutex_exit(&spa->spa_scrub_lock); 2706 2707 return (0); 2708 } 2709 2710 /* 2711 * ========================================================================== 2712 * SPA async task processing 2713 * ========================================================================== 2714 */ 2715 2716 static void 2717 spa_async_remove(spa_t *spa, vdev_t *vd) 2718 { 2719 vdev_t *tvd; 2720 int c; 2721 2722 for (c = 0; c < vd->vdev_children; c++) { 2723 tvd = vd->vdev_child[c]; 2724 if (tvd->vdev_remove_wanted) { 2725 tvd->vdev_remove_wanted = 0; 2726 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 2727 VDEV_AUX_NONE); 2728 vdev_clear(spa, tvd); 2729 vdev_config_dirty(tvd->vdev_top); 2730 } 2731 spa_async_remove(spa, tvd); 2732 } 2733 } 2734 2735 static void 2736 spa_async_thread(spa_t *spa) 2737 { 2738 int tasks; 2739 uint64_t txg; 2740 2741 ASSERT(spa->spa_sync_on); 2742 2743 mutex_enter(&spa->spa_async_lock); 2744 tasks = spa->spa_async_tasks; 2745 spa->spa_async_tasks = 0; 2746 mutex_exit(&spa->spa_async_lock); 2747 2748 /* 2749 * See if the config needs to be updated. 2750 */ 2751 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2752 mutex_enter(&spa_namespace_lock); 2753 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2754 mutex_exit(&spa_namespace_lock); 2755 } 2756 2757 /* 2758 * See if any devices need to be marked REMOVED. 2759 */ 2760 if (tasks & SPA_ASYNC_REMOVE) { 2761 txg = spa_vdev_enter(spa); 2762 spa_async_remove(spa, spa->spa_root_vdev); 2763 (void) spa_vdev_exit(spa, NULL, txg, 0); 2764 } 2765 2766 /* 2767 * If any devices are done replacing, detach them. 2768 */ 2769 if (tasks & SPA_ASYNC_RESILVER_DONE) 2770 spa_vdev_resilver_done(spa); 2771 2772 /* 2773 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 2774 * scrub which can become a resilver), we need to hold 2775 * spa_namespace_lock() because the sysevent we post via 2776 * spa_event_notify() needs to get the name of the pool. 2777 */ 2778 if (tasks & SPA_ASYNC_SCRUB) { 2779 mutex_enter(&spa_namespace_lock); 2780 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2781 mutex_exit(&spa_namespace_lock); 2782 } 2783 2784 /* 2785 * Kick off a resilver. 2786 */ 2787 if (tasks & SPA_ASYNC_RESILVER) { 2788 mutex_enter(&spa_namespace_lock); 2789 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2790 mutex_exit(&spa_namespace_lock); 2791 } 2792 2793 /* 2794 * Let the world know that we're done. 
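 * Clearing spa_async_thread and broadcasting spa_async_cv lets any
 * spa_async_suspend() callers waiting on this thread proceed.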
2795 */ 2796 mutex_enter(&spa->spa_async_lock); 2797 spa->spa_async_thread = NULL; 2798 cv_broadcast(&spa->spa_async_cv); 2799 mutex_exit(&spa->spa_async_lock); 2800 thread_exit(); 2801 } 2802 2803 void 2804 spa_async_suspend(spa_t *spa) 2805 { 2806 mutex_enter(&spa->spa_async_lock); 2807 spa->spa_async_suspended++; 2808 while (spa->spa_async_thread != NULL) 2809 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2810 mutex_exit(&spa->spa_async_lock); 2811 } 2812 2813 void 2814 spa_async_resume(spa_t *spa) 2815 { 2816 mutex_enter(&spa->spa_async_lock); 2817 ASSERT(spa->spa_async_suspended != 0); 2818 spa->spa_async_suspended--; 2819 mutex_exit(&spa->spa_async_lock); 2820 } 2821 2822 static void 2823 spa_async_dispatch(spa_t *spa) 2824 { 2825 mutex_enter(&spa->spa_async_lock); 2826 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2827 spa->spa_async_thread == NULL && 2828 rootdir != NULL && !vn_is_readonly(rootdir)) 2829 spa->spa_async_thread = thread_create(NULL, 0, 2830 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2831 mutex_exit(&spa->spa_async_lock); 2832 } 2833 2834 void 2835 spa_async_request(spa_t *spa, int task) 2836 { 2837 mutex_enter(&spa->spa_async_lock); 2838 spa->spa_async_tasks |= task; 2839 mutex_exit(&spa->spa_async_lock); 2840 } 2841 2842 /* 2843 * ========================================================================== 2844 * SPA syncing routines 2845 * ========================================================================== 2846 */ 2847 2848 static void 2849 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2850 { 2851 bplist_t *bpl = &spa->spa_sync_bplist; 2852 dmu_tx_t *tx; 2853 blkptr_t blk; 2854 uint64_t itor = 0; 2855 zio_t *zio; 2856 int error; 2857 uint8_t c = 1; 2858 2859 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2860 2861 while (bplist_iterate(bpl, &itor, &blk) == 0) 2862 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2863 2864 error = zio_wait(zio); 2865 ASSERT3U(error, ==, 0); 2866 2867 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2868 bplist_vacate(bpl, tx); 2869 2870 /* 2871 * Pre-dirty the first block so we sync to convergence faster. 2872 * (Usually only the first block is needed.) 2873 */ 2874 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2875 dmu_tx_commit(tx); 2876 } 2877 2878 static void 2879 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2880 { 2881 char *packed = NULL; 2882 size_t nvsize = 0; 2883 dmu_buf_t *db; 2884 2885 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2886 2887 packed = kmem_alloc(nvsize, KM_SLEEP); 2888 2889 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2890 KM_SLEEP) == 0); 2891 2892 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2893 2894 kmem_free(packed, nvsize); 2895 2896 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2897 dmu_buf_will_dirty(db, tx); 2898 *(uint64_t *)db->db_data = nvsize; 2899 dmu_buf_rele(db, FTAG); 2900 } 2901 2902 static void 2903 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2904 { 2905 nvlist_t *nvroot; 2906 nvlist_t **spares; 2907 int i; 2908 2909 if (!spa->spa_sync_spares) 2910 return; 2911 2912 /* 2913 * Update the MOS nvlist describing the list of available spares. 2914 * spa_validate_spares() will have already made sure this nvlist is 2915 * valid and the vdevs are labeled appropriately. 
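 * The list is stored in the MOS as a packed nvlist object; if it doesn't
 * exist yet we create it below and hang it off the pool directory under
 * DMU_POOL_SPARES, then rewrite its contents via spa_sync_nvlist().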
2916 */ 2917 if (spa->spa_spares_object == 0) { 2918 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2919 DMU_OT_PACKED_NVLIST, 1 << 14, 2920 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2921 VERIFY(zap_update(spa->spa_meta_objset, 2922 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2923 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2924 } 2925 2926 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2927 if (spa->spa_nspares == 0) { 2928 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2929 NULL, 0) == 0); 2930 } else { 2931 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2932 KM_SLEEP); 2933 for (i = 0; i < spa->spa_nspares; i++) 2934 spares[i] = vdev_config_generate(spa, 2935 spa->spa_spares[i], B_FALSE, B_TRUE); 2936 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2937 spares, spa->spa_nspares) == 0); 2938 for (i = 0; i < spa->spa_nspares; i++) 2939 nvlist_free(spares[i]); 2940 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2941 } 2942 2943 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2944 nvlist_free(nvroot); 2945 2946 spa->spa_sync_spares = B_FALSE; 2947 } 2948 2949 static void 2950 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2951 { 2952 nvlist_t *config; 2953 2954 if (list_is_empty(&spa->spa_dirty_list)) 2955 return; 2956 2957 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2958 2959 if (spa->spa_config_syncing) 2960 nvlist_free(spa->spa_config_syncing); 2961 spa->spa_config_syncing = config; 2962 2963 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2964 } 2965 2966 static void 2967 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 2968 { 2969 spa_t *spa = arg1; 2970 nvlist_t *nvp = arg2; 2971 nvpair_t *nvpair; 2972 objset_t *mos = spa->spa_meta_objset; 2973 uint64_t zapobj; 2974 uint64_t intval; 2975 2976 mutex_enter(&spa->spa_props_lock); 2977 if (spa->spa_pool_props_object == 0) { 2978 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2979 VERIFY(zapobj > 0); 2980 2981 spa->spa_pool_props_object = zapobj; 2982 2983 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2984 DMU_POOL_PROPS, 8, 1, 2985 &spa->spa_pool_props_object, tx) == 0); 2986 } 2987 mutex_exit(&spa->spa_props_lock); 2988 2989 nvpair = NULL; 2990 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2991 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2992 case ZPOOL_PROP_DELEGATION: 2993 VERIFY(nvlist_lookup_uint64(nvp, 2994 nvpair_name(nvpair), &intval) == 0); 2995 VERIFY(zap_update(mos, 2996 spa->spa_pool_props_object, 2997 nvpair_name(nvpair), 8, 1, 2998 &intval, tx) == 0); 2999 spa->spa_delegation = intval; 3000 break; 3001 case ZPOOL_PROP_BOOTFS: 3002 VERIFY(nvlist_lookup_uint64(nvp, 3003 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 3004 intval = spa->spa_bootfs; 3005 VERIFY(zap_update(mos, 3006 spa->spa_pool_props_object, 3007 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1, 3008 &intval, tx) == 0); 3009 break; 3010 3011 case ZPOOL_PROP_AUTOREPLACE: 3012 VERIFY(nvlist_lookup_uint64(nvp, 3013 nvpair_name(nvpair), &intval) == 0); 3014 VERIFY(zap_update(mos, 3015 spa->spa_pool_props_object, 3016 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1, 3017 &intval, tx) == 0); 3018 break; 3019 } 3020 spa_history_internal_log(LOG_POOL_PROPSET, 3021 spa, tx, cr, "%s %lld %s", 3022 nvpair_name(nvpair), intval, 3023 spa->spa_name); 3024 } 3025 } 3026 3027 /* 3028 * Sync the specified transaction group. 
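 * This is the heart of SPA syncing: the config, the spare list, the error
 * logs, the datasets, and any dirty vdevs are all written out here, and
 * the labels/uberblock are rewritten to commit the group.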
New blocks may be dirtied as 3029 * part of the process, so we iterate until it converges. 3030 */ 3031 void 3032 spa_sync(spa_t *spa, uint64_t txg) 3033 { 3034 dsl_pool_t *dp = spa->spa_dsl_pool; 3035 objset_t *mos = spa->spa_meta_objset; 3036 bplist_t *bpl = &spa->spa_sync_bplist; 3037 vdev_t *rvd = spa->spa_root_vdev; 3038 vdev_t *vd; 3039 dmu_tx_t *tx; 3040 int dirty_vdevs; 3041 3042 /* 3043 * Lock out configuration changes. 3044 */ 3045 spa_config_enter(spa, RW_READER, FTAG); 3046 3047 spa->spa_syncing_txg = txg; 3048 spa->spa_sync_pass = 0; 3049 3050 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3051 3052 tx = dmu_tx_create_assigned(dp, txg); 3053 3054 /* 3055 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 3056 * set spa_deflate if we have no raid-z vdevs. 3057 */ 3058 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 3059 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 3060 int i; 3061 3062 for (i = 0; i < rvd->vdev_children; i++) { 3063 vd = rvd->vdev_child[i]; 3064 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 3065 break; 3066 } 3067 if (i == rvd->vdev_children) { 3068 spa->spa_deflate = TRUE; 3069 VERIFY(0 == zap_add(spa->spa_meta_objset, 3070 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3071 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 3072 } 3073 } 3074 3075 /* 3076 * If anything has changed in this txg, push the deferred frees 3077 * from the previous txg. If not, leave them alone so that we 3078 * don't generate work on an otherwise idle system. 3079 */ 3080 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 3081 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 3082 !txg_list_empty(&dp->dp_sync_tasks, txg)) 3083 spa_sync_deferred_frees(spa, txg); 3084 3085 /* 3086 * Iterate to convergence. 3087 */ 3088 do { 3089 spa->spa_sync_pass++; 3090 3091 spa_sync_config_object(spa, tx); 3092 spa_sync_spares(spa, tx); 3093 spa_errlog_sync(spa, txg); 3094 dsl_pool_sync(dp, txg); 3095 3096 dirty_vdevs = 0; 3097 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 3098 vdev_sync(vd, txg); 3099 dirty_vdevs++; 3100 } 3101 3102 bplist_sync(bpl, tx); 3103 } while (dirty_vdevs); 3104 3105 bplist_close(bpl); 3106 3107 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 3108 3109 /* 3110 * Rewrite the vdev configuration (which includes the uberblock) 3111 * to commit the transaction group. 3112 * 3113 * If there are any dirty vdevs, sync the uberblock to all vdevs. 3114 * Otherwise, pick a random top-level vdev that's known to be 3115 * visible in the config cache (see spa_vdev_add() for details). 3116 * If the write fails, try the next vdev until we're tried them all. 3117 */ 3118 if (!list_is_empty(&spa->spa_dirty_list)) { 3119 VERIFY(vdev_config_sync(rvd, txg) == 0); 3120 } else { 3121 int children = rvd->vdev_children; 3122 int c0 = spa_get_random(children); 3123 int c; 3124 3125 for (c = 0; c < children; c++) { 3126 vd = rvd->vdev_child[(c0 + c) % children]; 3127 if (vd->vdev_ms_array == 0) 3128 continue; 3129 if (vdev_config_sync(vd, txg) == 0) 3130 break; 3131 } 3132 if (c == children) 3133 VERIFY(vdev_config_sync(rvd, txg) == 0); 3134 } 3135 3136 dmu_tx_commit(tx); 3137 3138 /* 3139 * Clear the dirty config list. 3140 */ 3141 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3142 vdev_config_clean(vd); 3143 3144 /* 3145 * Now that the new config has synced transactionally, 3146 * let it become visible to the config cache. 
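 * spa_config_syncing was staged by spa_sync_config_object(); once the
 * label/uberblock writes above have committed it, promote it with
 * spa_config_set() and remember the txg it belongs to.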
3147 */ 3148 if (spa->spa_config_syncing != NULL) { 3149 spa_config_set(spa, spa->spa_config_syncing); 3150 spa->spa_config_txg = txg; 3151 spa->spa_config_syncing = NULL; 3152 } 3153 3154 /* 3155 * Make a stable copy of the fully synced uberblock. 3156 * We use this as the root for pool traversals. 3157 */ 3158 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3159 3160 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3161 3162 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3163 spa->spa_traverse_wanted = 0; 3164 spa->spa_ubsync = spa->spa_uberblock; 3165 rw_exit(&spa->spa_traverse_lock); 3166 3167 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3168 3169 /* 3170 * Clean up the ZIL records for the synced txg. 3171 */ 3172 dsl_pool_zil_clean(dp); 3173 3174 /* 3175 * Update usable space statistics. 3176 */ 3177 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3178 vdev_sync_done(vd, txg); 3179 3180 /* 3181 * It had better be the case that we didn't dirty anything 3182 * since vdev_config_sync(). 3183 */ 3184 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3185 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3186 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3187 ASSERT(bpl->bpl_queue == NULL); 3188 3189 spa_config_exit(spa, FTAG); 3190 3191 /* 3192 * If any async tasks have been requested, kick them off. 3193 */ 3194 spa_async_dispatch(spa); 3195 } 3196 3197 /* 3198 * Sync all pools. We don't want to hold the namespace lock across these 3199 * operations, so we take a reference on the spa_t and drop the lock during the 3200 * sync. 3201 */ 3202 void 3203 spa_sync_allpools(void) 3204 { 3205 spa_t *spa = NULL; 3206 mutex_enter(&spa_namespace_lock); 3207 while ((spa = spa_next(spa)) != NULL) { 3208 if (spa_state(spa) != POOL_STATE_ACTIVE) 3209 continue; 3210 spa_open_ref(spa, FTAG); 3211 mutex_exit(&spa_namespace_lock); 3212 txg_wait_synced(spa_get_dsl(spa), 0); 3213 mutex_enter(&spa_namespace_lock); 3214 spa_close(spa, FTAG); 3215 } 3216 mutex_exit(&spa_namespace_lock); 3217 } 3218 3219 /* 3220 * ========================================================================== 3221 * Miscellaneous routines 3222 * ========================================================================== 3223 */ 3224 3225 /* 3226 * Remove all pools in the system. 3227 */ 3228 void 3229 spa_evict_all(void) 3230 { 3231 spa_t *spa; 3232 3233 /* 3234 * Remove all cached state. All pools should be closed now, 3235 * so every spa in the AVL tree should be unreferenced. 3236 */ 3237 mutex_enter(&spa_namespace_lock); 3238 while ((spa = spa_next(NULL)) != NULL) { 3239 /* 3240 * Stop async tasks. The async thread may need to detach 3241 * a device that's been replaced, which requires grabbing 3242 * spa_namespace_lock, so we must drop it here. 
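 * We take a reference with spa_open_ref() first so the spa_t cannot be
 * removed out from under us while the namespace lock is dropped.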
3243 */ 3244 spa_open_ref(spa, FTAG); 3245 mutex_exit(&spa_namespace_lock); 3246 spa_async_suspend(spa); 3247 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3248 mutex_enter(&spa_namespace_lock); 3249 spa_close(spa, FTAG); 3250 3251 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3252 spa_unload(spa); 3253 spa_deactivate(spa); 3254 } 3255 spa_remove(spa); 3256 } 3257 mutex_exit(&spa_namespace_lock); 3258 } 3259 3260 vdev_t * 3261 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3262 { 3263 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3264 } 3265 3266 void 3267 spa_upgrade(spa_t *spa) 3268 { 3269 spa_config_enter(spa, RW_WRITER, FTAG); 3270 3271 /* 3272 * This should only be called for a non-faulted pool, and since a 3273 * future version would result in an unopenable pool, this shouldn't be 3274 * possible. 3275 */ 3276 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3277 3278 spa->spa_uberblock.ub_version = ZFS_VERSION; 3279 vdev_config_dirty(spa->spa_root_vdev); 3280 3281 spa_config_exit(spa, FTAG); 3282 3283 txg_wait_synced(spa_get_dsl(spa), 0); 3284 } 3285 3286 boolean_t 3287 spa_has_spare(spa_t *spa, uint64_t guid) 3288 { 3289 int i; 3290 uint64_t spareguid; 3291 3292 for (i = 0; i < spa->spa_nspares; i++) 3293 if (spa->spa_spares[i]->vdev_guid == guid) 3294 return (B_TRUE); 3295 3296 for (i = 0; i < spa->spa_pending_nspares; i++) { 3297 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3298 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3299 spareguid == guid) 3300 return (B_TRUE); 3301 } 3302 3303 return (B_FALSE); 3304 } 3305 3306 int 3307 spa_set_props(spa_t *spa, nvlist_t *nvp) 3308 { 3309 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3310 spa, nvp, 3)); 3311 } 3312 3313 int 3314 spa_get_props(spa_t *spa, nvlist_t **nvp) 3315 { 3316 zap_cursor_t zc; 3317 zap_attribute_t za; 3318 objset_t *mos = spa->spa_meta_objset; 3319 zfs_source_t src; 3320 zpool_prop_t prop; 3321 nvlist_t *propval; 3322 uint64_t value; 3323 int err; 3324 3325 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3326 3327 mutex_enter(&spa->spa_props_lock); 3328 /* If no props object, then just return empty nvlist */ 3329 if (spa->spa_pool_props_object == 0) { 3330 mutex_exit(&spa->spa_props_lock); 3331 return (0); 3332 } 3333 3334 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3335 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3336 zap_cursor_advance(&zc)) { 3337 3338 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3339 continue; 3340 3341 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3342 switch (za.za_integer_length) { 3343 case 8: 3344 if (zpool_prop_default_numeric(prop) == 3345 za.za_first_integer) 3346 src = ZFS_SRC_DEFAULT; 3347 else 3348 src = ZFS_SRC_LOCAL; 3349 value = za.za_first_integer; 3350 3351 if (prop == ZPOOL_PROP_BOOTFS) { 3352 dsl_pool_t *dp; 3353 dsl_dataset_t *ds = NULL; 3354 char strval[MAXPATHLEN]; 3355 3356 dp = spa_get_dsl(spa); 3357 rw_enter(&dp->dp_config_rwlock, RW_READER); 3358 if ((err = dsl_dataset_open_obj(dp, 3359 za.za_first_integer, NULL, DS_MODE_NONE, 3360 FTAG, &ds)) != 0) { 3361 rw_exit(&dp->dp_config_rwlock); 3362 break; 3363 } 3364 dsl_dataset_name(ds, strval); 3365 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3366 rw_exit(&dp->dp_config_rwlock); 3367 3368 VERIFY(nvlist_add_uint64(propval, 3369 ZFS_PROP_SOURCE, src) == 0); 3370 VERIFY(nvlist_add_string(propval, 3371 ZFS_PROP_VALUE, strval) == 0); 3372 } else { 3373 VERIFY(nvlist_add_uint64(propval, 3374 ZFS_PROP_SOURCE, src) == 0); 3375 
VERIFY(nvlist_add_uint64(propval, 3376 ZFS_PROP_VALUE, value) == 0); 3377 } 3378 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3379 propval) == 0); 3380 break; 3381 } 3382 nvlist_free(propval); 3383 } 3384 zap_cursor_fini(&zc); 3385 mutex_exit(&spa->spa_props_lock); 3386 if (err && err != ENOENT) { 3387 nvlist_free(*nvp); 3388 return (err); 3389 } 3390 3391 return (0); 3392 } 3393 3394 /* 3395 * If the bootfs property value is dsobj, clear it. 3396 */ 3397 void 3398 spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3399 { 3400 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3401 VERIFY(zap_remove(spa->spa_meta_objset, 3402 spa->spa_pool_props_object, 3403 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 3404 spa->spa_bootfs = 0; 3405 } 3406 } 3407 3408 /* 3409 * Post a sysevent corresponding to the given event. The 'name' must be one of 3410 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 3411 * filled in from the spa and (optionally) the vdev. This doesn't do anything 3412 * in the userland libzpool, as we don't want consumers to misinterpret ztest 3413 * or zdb as real changes. 3414 */ 3415 void 3416 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 3417 { 3418 #ifdef _KERNEL 3419 sysevent_t *ev; 3420 sysevent_attr_list_t *attr = NULL; 3421 sysevent_value_t value; 3422 sysevent_id_t eid; 3423 3424 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 3425 SE_SLEEP); 3426 3427 value.value_type = SE_DATA_TYPE_STRING; 3428 value.value.sv_string = spa_name(spa); 3429 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 3430 goto done; 3431 3432 value.value_type = SE_DATA_TYPE_UINT64; 3433 value.value.sv_uint64 = spa_guid(spa); 3434 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 3435 goto done; 3436 3437 if (vd) { 3438 value.value_type = SE_DATA_TYPE_UINT64; 3439 value.value.sv_uint64 = vd->vdev_guid; 3440 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 3441 SE_SLEEP) != 0) 3442 goto done; 3443 3444 if (vd->vdev_path) { 3445 value.value_type = SE_DATA_TYPE_STRING; 3446 value.value.sv_string = vd->vdev_path; 3447 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 3448 &value, SE_SLEEP) != 0) 3449 goto done; 3450 } 3451 } 3452 3453 (void) log_sysevent(ev, SE_SLEEP, &eid); 3454 3455 done: 3456 if (attr) 3457 sysevent_free_attr(attr); 3458 sysevent_free(ev); 3459 #endif 3460 } 3461
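/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * helper showing how a caller might drive the scrub interface above.  The
 * function name and the #ifdef guard are assumptions for illustration; it
 * mirrors the pattern used internally by spa_vdev_attach() and
 * spa_async_thread(), which take spa_namespace_lock around spa_scrub() so
 * the sysevent posted by spa_event_notify() can resolve the pool name.
 */
#ifdef SPA_SCRUB_EXAMPLE
static int
spa_scrub_example(const char *poolname, pool_scrub_type_t type)
{
	spa_t *spa;
	int error;

	/* Look up and hold the pool by name. */
	if ((error = spa_open(poolname, &spa, FTAG)) != 0)
		return (error);

	/*
	 * Hold the namespace lock across spa_scrub(), as the resilver
	 * kick-off paths in this file do.  With force == B_FALSE this
	 * returns EBUSY rather than interrupting an active resilver.
	 */
	mutex_enter(&spa_namespace_lock);
	error = spa_scrub(spa, type, B_FALSE);
	mutex_exit(&spa_namespace_lock);

	spa_close(spa, FTAG);
	return (error);
}
#endif	/* SPA_SCRUB_EXAMPLE */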