1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/dmu_objset.h> 52 #include <sys/unique.h> 53 #include <sys/dsl_pool.h> 54 #include <sys/dsl_dataset.h> 55 #include <sys/dsl_dir.h> 56 #include <sys/dsl_prop.h> 57 #include <sys/dsl_synctask.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/callb.h> 60 #include <sys/systeminfo.h> 61 #include <sys/sunddi.h> 62 63 int zio_taskq_threads = 8; 64 65 /* 66 * ========================================================================== 67 * SPA state manipulation (open/create/destroy/import/export) 68 * ========================================================================== 69 */ 70 71 static int 72 spa_error_entry_compare(const void *a, const void *b) 73 { 74 spa_error_entry_t *sa = (spa_error_entry_t *)a; 75 spa_error_entry_t *sb = (spa_error_entry_t *)b; 76 int ret; 77 78 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 79 sizeof (zbookmark_t)); 80 81 if (ret < 0) 82 return (-1); 83 else if (ret > 0) 84 return (1); 85 else 86 return (0); 87 } 88 89 /* 90 * Utility function which retrieves copies of the current logs and 91 * re-initializes them in the process. 92 */ 93 void 94 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 95 { 96 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 97 98 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 99 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 100 101 avl_create(&spa->spa_errlist_scrub, 102 spa_error_entry_compare, sizeof (spa_error_entry_t), 103 offsetof(spa_error_entry_t, se_avl)); 104 avl_create(&spa->spa_errlist_last, 105 spa_error_entry_compare, sizeof (spa_error_entry_t), 106 offsetof(spa_error_entry_t, se_avl)); 107 } 108 109 /* 110 * Activate an uninitialized pool. 
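 * Activation is the counterpart of spa_deactivate() below: it creates the
 * normal and log metaslab classes, an issue and an intr taskq for each ZIO
 * type (zio_taskq_threads threads apiece), the spa locks and lists, and the
 * two error-bookmark AVL trees.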
111 */ 112 static void 113 spa_activate(spa_t *spa) 114 { 115 int t; 116 117 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 118 119 spa->spa_state = POOL_STATE_ACTIVE; 120 121 spa->spa_normal_class = metaslab_class_create(); 122 spa->spa_log_class = metaslab_class_create(); 123 124 for (t = 0; t < ZIO_TYPES; t++) { 125 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 126 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 127 TASKQ_PREPOPULATE); 128 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 129 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 130 TASKQ_PREPOPULATE); 131 } 132 133 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 134 135 rprw_init(&spa->spa_config_lock); 136 137 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 140 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 141 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 142 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 143 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 144 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 145 146 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 147 offsetof(vdev_t, vdev_dirty_node)); 148 149 txg_list_create(&spa->spa_vdev_txg_list, 150 offsetof(struct vdev, vdev_txg_node)); 151 152 avl_create(&spa->spa_errlist_scrub, 153 spa_error_entry_compare, sizeof (spa_error_entry_t), 154 offsetof(spa_error_entry_t, se_avl)); 155 avl_create(&spa->spa_errlist_last, 156 spa_error_entry_compare, sizeof (spa_error_entry_t), 157 offsetof(spa_error_entry_t, se_avl)); 158 } 159 160 /* 161 * Opposite of spa_activate(). 162 */ 163 static void 164 spa_deactivate(spa_t *spa) 165 { 166 int t; 167 168 ASSERT(spa->spa_sync_on == B_FALSE); 169 ASSERT(spa->spa_dsl_pool == NULL); 170 ASSERT(spa->spa_root_vdev == NULL); 171 172 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 173 174 txg_list_destroy(&spa->spa_vdev_txg_list); 175 176 list_destroy(&spa->spa_dirty_list); 177 178 rw_destroy(&spa->spa_traverse_lock); 179 180 for (t = 0; t < ZIO_TYPES; t++) { 181 taskq_destroy(spa->spa_zio_issue_taskq[t]); 182 taskq_destroy(spa->spa_zio_intr_taskq[t]); 183 spa->spa_zio_issue_taskq[t] = NULL; 184 spa->spa_zio_intr_taskq[t] = NULL; 185 } 186 187 metaslab_class_destroy(spa->spa_normal_class); 188 spa->spa_normal_class = NULL; 189 190 metaslab_class_destroy(spa->spa_log_class); 191 spa->spa_log_class = NULL; 192 193 /* 194 * If this was part of an import or the open otherwise failed, we may 195 * still have errors left in the queues. Empty them just in case. 196 */ 197 spa_errlog_drain(spa); 198 199 avl_destroy(&spa->spa_errlist_scrub); 200 avl_destroy(&spa->spa_errlist_last); 201 202 spa->spa_state = POOL_STATE_UNINITIALIZED; 203 } 204 205 /* 206 * Verify a pool configuration, and construct the vdev tree appropriately. This 207 * will create all the necessary vdevs in the appropriate layout, with each vdev 208 * in the CLOSED state. This will prep the pool before open/creation/import. 209 * All vdev validation is done by the vdev_alloc() routine. 
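 *
 * As a sketch of the expected input, a two-way mirror arrives here as a
 * nested nvlist along these lines (keys are the ZPOOL_CONFIG_* names):
 *
 *	root
 *	  children[0]: mirror
 *	    children[0]: disk, path=/dev/dsk/...
 *	    children[1]: disk, path=/dev/dsk/...
 *
 * spa_config_parse() allocates a vdev_t for the root, then recurses over
 * ZPOOL_CONFIG_CHILDREN to build the interior and leaf vdevs beneath it.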
210 */ 211 static int 212 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 213 uint_t id, int atype) 214 { 215 nvlist_t **child; 216 uint_t c, children; 217 int error; 218 219 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 220 return (error); 221 222 if ((*vdp)->vdev_ops->vdev_op_leaf) 223 return (0); 224 225 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 226 &child, &children) != 0) { 227 vdev_free(*vdp); 228 *vdp = NULL; 229 return (EINVAL); 230 } 231 232 for (c = 0; c < children; c++) { 233 vdev_t *vd; 234 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 235 atype)) != 0) { 236 vdev_free(*vdp); 237 *vdp = NULL; 238 return (error); 239 } 240 } 241 242 ASSERT(*vdp != NULL); 243 244 return (0); 245 } 246 247 /* 248 * Opposite of spa_load(). 249 */ 250 static void 251 spa_unload(spa_t *spa) 252 { 253 int i; 254 255 /* 256 * Stop async tasks. 257 */ 258 spa_async_suspend(spa); 259 260 /* 261 * Stop syncing. 262 */ 263 if (spa->spa_sync_on) { 264 txg_sync_stop(spa->spa_dsl_pool); 265 spa->spa_sync_on = B_FALSE; 266 } 267 268 /* 269 * Wait for any outstanding prefetch I/O to complete. 270 */ 271 spa_config_enter(spa, RW_WRITER, FTAG); 272 spa_config_exit(spa, FTAG); 273 274 /* 275 * Close the dsl pool. 276 */ 277 if (spa->spa_dsl_pool) { 278 dsl_pool_close(spa->spa_dsl_pool); 279 spa->spa_dsl_pool = NULL; 280 } 281 282 /* 283 * Close all vdevs. 284 */ 285 if (spa->spa_root_vdev) 286 vdev_free(spa->spa_root_vdev); 287 ASSERT(spa->spa_root_vdev == NULL); 288 289 for (i = 0; i < spa->spa_nspares; i++) 290 vdev_free(spa->spa_spares[i]); 291 if (spa->spa_spares) { 292 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 293 spa->spa_spares = NULL; 294 } 295 if (spa->spa_sparelist) { 296 nvlist_free(spa->spa_sparelist); 297 spa->spa_sparelist = NULL; 298 } 299 300 spa->spa_async_suspended = 0; 301 } 302 303 /* 304 * Load (or re-load) the current list of vdevs describing the active spares for 305 * this pool. When this is called, we have some form of basic information in 306 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 307 * re-generate a more complete list including status information. 308 */ 309 static void 310 spa_load_spares(spa_t *spa) 311 { 312 nvlist_t **spares; 313 uint_t nspares; 314 int i; 315 vdev_t *vd, *tvd; 316 317 /* 318 * First, close and free any existing spare vdevs. 319 */ 320 for (i = 0; i < spa->spa_nspares; i++) { 321 vd = spa->spa_spares[i]; 322 323 /* Undo the call to spa_activate() below */ 324 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 325 tvd->vdev_isspare) 326 spa_spare_remove(tvd); 327 vdev_close(vd); 328 vdev_free(vd); 329 } 330 331 if (spa->spa_spares) 332 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 333 334 if (spa->spa_sparelist == NULL) 335 nspares = 0; 336 else 337 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 338 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 339 340 spa->spa_nspares = (int)nspares; 341 spa->spa_spares = NULL; 342 343 if (nspares == 0) 344 return; 345 346 /* 347 * Construct the array of vdevs, opening them to get status in the 348 * process. For each spare, there is potentially two different vdev_t 349 * structures associated with it: one in the list of spares (used only 350 * for basic validation purposes) and one in the active vdev 351 * configuration (if it's spared in). During this phase we open and 352 * validate each vdev on the spare list. 
         * If the vdev also exists in the
         * active configuration, then we also mark this vdev as an active
         * spare.
         */
        spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
        for (i = 0; i < spa->spa_nspares; i++) {
                VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    VDEV_ALLOC_SPARE) == 0);
                ASSERT(vd != NULL);

                spa->spa_spares[i] = vd;

                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
                        if (!tvd->vdev_isspare)
                                spa_spare_add(tvd);

                        /*
                         * We only mark the spare active if we were successfully
                         * able to load the vdev.  Otherwise, importing a pool
                         * with a bad active spare would result in strange
                         * behavior, because multiple pools would think the
                         * spare is actively in use.
                         *
                         * There is a vulnerability here to an equally bizarre
                         * circumstance, where a dead active spare is later
                         * brought back to life (onlined or otherwise).  Given
                         * the rarity of this scenario, and the extra complexity
                         * it adds, we ignore the possibility.
                         */
                        if (!vdev_is_dead(tvd))
                                spa_spare_activate(tvd);
                }

                if (vdev_open(vd) != 0)
                        continue;

                vd->vdev_top = vd;
                (void) vdev_validate_spare(vd);
        }

        /*
         * Recompute the stashed list of spares, with status information
         * this time.
         */
        VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
            DATA_TYPE_NVLIST_ARRAY) == 0);

        spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
        for (i = 0; i < spa->spa_nspares; i++)
                spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
                    B_TRUE, B_TRUE);
        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
            spares, spa->spa_nspares) == 0);
        for (i = 0; i < spa->spa_nspares; i++)
                nvlist_free(spares[i]);
        kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
        dmu_buf_t *db;
        char *packed = NULL;
        size_t nvsize = 0;
        int error;
        *value = NULL;

        VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
        nvsize = *(uint64_t *)db->db_data;
        dmu_buf_rele(db, FTAG);

        packed = kmem_alloc(nvsize, KM_SLEEP);
        error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
        if (error == 0)
                error = nvlist_unpack(packed, nvsize, value, 0);
        kmem_free(packed, nvsize);

        return (error);
}

/*
 * Check to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
        int c;

        for (c = 0; c < vd->vdev_children; c++)
                spa_check_removed(vd->vdev_child[c]);

        if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
                zfs_post_autoreplace(vd->vdev_spa, vd);
                spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
        }
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
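 *
 * In outline: parse 'config' into a vdev tree, open and validate the vdevs,
 * select the best uberblock, open the DSL pool, and then read the trusted
 * copy of the config out of the MOS.  When 'mosconfig' is false, we re-enter
 * spa_load() with that MOS copy once it has been read.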
452 */ 453 static int 454 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 455 { 456 int error = 0; 457 nvlist_t *nvroot = NULL; 458 vdev_t *rvd; 459 uberblock_t *ub = &spa->spa_uberblock; 460 uint64_t config_cache_txg = spa->spa_config_txg; 461 uint64_t pool_guid; 462 uint64_t version; 463 zio_t *zio; 464 uint64_t autoreplace = 0; 465 466 spa->spa_load_state = state; 467 468 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 469 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 470 error = EINVAL; 471 goto out; 472 } 473 474 /* 475 * Versioning wasn't explicitly added to the label until later, so if 476 * it's not present treat it as the initial version. 477 */ 478 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 479 version = SPA_VERSION_INITIAL; 480 481 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 482 &spa->spa_config_txg); 483 484 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 485 spa_guid_exists(pool_guid, 0)) { 486 error = EEXIST; 487 goto out; 488 } 489 490 spa->spa_load_guid = pool_guid; 491 492 /* 493 * Parse the configuration into a vdev tree. We explicitly set the 494 * value that will be returned by spa_version() since parsing the 495 * configuration requires knowing the version number. 496 */ 497 spa_config_enter(spa, RW_WRITER, FTAG); 498 spa->spa_ubsync.ub_version = version; 499 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 500 spa_config_exit(spa, FTAG); 501 502 if (error != 0) 503 goto out; 504 505 ASSERT(spa->spa_root_vdev == rvd); 506 ASSERT(spa_guid(spa) == pool_guid); 507 508 /* 509 * Try to open all vdevs, loading each label in the process. 510 */ 511 error = vdev_open(rvd); 512 if (error != 0) 513 goto out; 514 515 /* 516 * Validate the labels for all leaf vdevs. We need to grab the config 517 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 518 * flag. 519 */ 520 spa_config_enter(spa, RW_READER, FTAG); 521 error = vdev_validate(rvd); 522 spa_config_exit(spa, FTAG); 523 524 if (error != 0) 525 goto out; 526 527 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 528 error = ENXIO; 529 goto out; 530 } 531 532 /* 533 * Find the best uberblock. 534 */ 535 bzero(ub, sizeof (uberblock_t)); 536 537 zio = zio_root(spa, NULL, NULL, 538 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 539 vdev_uberblock_load(zio, rvd, ub); 540 error = zio_wait(zio); 541 542 /* 543 * If we weren't able to find a single valid uberblock, return failure. 544 */ 545 if (ub->ub_txg == 0) { 546 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 547 VDEV_AUX_CORRUPT_DATA); 548 error = ENXIO; 549 goto out; 550 } 551 552 /* 553 * If the pool is newer than the code, we can't open it. 554 */ 555 if (ub->ub_version > SPA_VERSION) { 556 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 557 VDEV_AUX_VERSION_NEWER); 558 error = ENOTSUP; 559 goto out; 560 } 561 562 /* 563 * If the vdev guid sum doesn't match the uberblock, we have an 564 * incomplete configuration. 565 */ 566 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 567 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 568 VDEV_AUX_BAD_GUID_SUM); 569 error = ENXIO; 570 goto out; 571 } 572 573 /* 574 * Initialize internal SPA structures. 
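         * spa_first_txg becomes the last synced txg plus one, dsl_pool_open()
         * hands back the meta-objset (MOS), and the remainder of this function
         * reads pool state out of the MOS via named entries in the pool
         * directory object.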
575 */ 576 spa->spa_state = POOL_STATE_ACTIVE; 577 spa->spa_ubsync = spa->spa_uberblock; 578 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 579 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 580 if (error) { 581 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 582 VDEV_AUX_CORRUPT_DATA); 583 goto out; 584 } 585 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 586 587 if (zap_lookup(spa->spa_meta_objset, 588 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 589 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 590 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 591 VDEV_AUX_CORRUPT_DATA); 592 error = EIO; 593 goto out; 594 } 595 596 if (!mosconfig) { 597 nvlist_t *newconfig; 598 uint64_t hostid; 599 600 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 601 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 602 VDEV_AUX_CORRUPT_DATA); 603 error = EIO; 604 goto out; 605 } 606 607 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 608 &hostid) == 0) { 609 char *hostname; 610 unsigned long myhostid = 0; 611 612 VERIFY(nvlist_lookup_string(newconfig, 613 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 614 615 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 616 if (hostid != 0 && myhostid != 0 && 617 (unsigned long)hostid != myhostid) { 618 cmn_err(CE_WARN, "pool '%s' could not be " 619 "loaded as it was last accessed by " 620 "another system (host: %s hostid: 0x%lx). " 621 "See: http://www.sun.com/msg/ZFS-8000-EY", 622 spa->spa_name, hostname, 623 (unsigned long)hostid); 624 error = EBADF; 625 goto out; 626 } 627 } 628 629 spa_config_set(spa, newconfig); 630 spa_unload(spa); 631 spa_deactivate(spa); 632 spa_activate(spa); 633 634 return (spa_load(spa, newconfig, state, B_TRUE)); 635 } 636 637 if (zap_lookup(spa->spa_meta_objset, 638 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 639 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 640 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 641 VDEV_AUX_CORRUPT_DATA); 642 error = EIO; 643 goto out; 644 } 645 646 /* 647 * Load the bit that tells us to use the new accounting function 648 * (raid-z deflation). If we have an older pool, this will not 649 * be present. 650 */ 651 error = zap_lookup(spa->spa_meta_objset, 652 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 653 sizeof (uint64_t), 1, &spa->spa_deflate); 654 if (error != 0 && error != ENOENT) { 655 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 656 VDEV_AUX_CORRUPT_DATA); 657 error = EIO; 658 goto out; 659 } 660 661 /* 662 * Load the persistent error log. If we have an older pool, this will 663 * not be present. 664 */ 665 error = zap_lookup(spa->spa_meta_objset, 666 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 667 sizeof (uint64_t), 1, &spa->spa_errlog_last); 668 if (error != 0 && error != ENOENT) { 669 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 670 VDEV_AUX_CORRUPT_DATA); 671 error = EIO; 672 goto out; 673 } 674 675 error = zap_lookup(spa->spa_meta_objset, 676 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 677 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 678 if (error != 0 && error != ENOENT) { 679 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 680 VDEV_AUX_CORRUPT_DATA); 681 error = EIO; 682 goto out; 683 } 684 685 /* 686 * Load the history object. If we have an older pool, this 687 * will not be present. 
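         * As with the other optional directory entries above, ENOENT from
         * zap_lookup() just means the pool predates the feature; any other
         * error is treated as MOS corruption.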
688 */ 689 error = zap_lookup(spa->spa_meta_objset, 690 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 691 sizeof (uint64_t), 1, &spa->spa_history); 692 if (error != 0 && error != ENOENT) { 693 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 694 VDEV_AUX_CORRUPT_DATA); 695 error = EIO; 696 goto out; 697 } 698 699 /* 700 * Load any hot spares for this pool. 701 */ 702 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 703 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 704 if (error != 0 && error != ENOENT) { 705 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 706 VDEV_AUX_CORRUPT_DATA); 707 error = EIO; 708 goto out; 709 } 710 if (error == 0) { 711 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 712 if (load_nvlist(spa, spa->spa_spares_object, 713 &spa->spa_sparelist) != 0) { 714 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 715 VDEV_AUX_CORRUPT_DATA); 716 error = EIO; 717 goto out; 718 } 719 720 spa_config_enter(spa, RW_WRITER, FTAG); 721 spa_load_spares(spa); 722 spa_config_exit(spa, FTAG); 723 } 724 725 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 726 727 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 728 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 729 730 if (error && error != ENOENT) { 731 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 732 VDEV_AUX_CORRUPT_DATA); 733 error = EIO; 734 goto out; 735 } 736 737 if (error == 0) { 738 (void) zap_lookup(spa->spa_meta_objset, 739 spa->spa_pool_props_object, 740 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 741 sizeof (uint64_t), 1, &spa->spa_bootfs); 742 (void) zap_lookup(spa->spa_meta_objset, 743 spa->spa_pool_props_object, 744 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 745 sizeof (uint64_t), 1, &autoreplace); 746 (void) zap_lookup(spa->spa_meta_objset, 747 spa->spa_pool_props_object, 748 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 749 sizeof (uint64_t), 1, &spa->spa_delegation); 750 } 751 752 /* 753 * If the 'autoreplace' property is set, then post a resource notifying 754 * the ZFS DE that it should not issue any faults for unopenable 755 * devices. We also iterate over the vdevs, and post a sysevent for any 756 * unopenable vdevs so that the normal autoreplace handler can take 757 * over. 758 */ 759 if (autoreplace) 760 spa_check_removed(spa->spa_root_vdev); 761 762 /* 763 * Load the vdev state for all toplevel vdevs. 764 */ 765 vdev_load(rvd); 766 767 /* 768 * Propagate the leaf DTLs we just loaded all the way up the tree. 769 */ 770 spa_config_enter(spa, RW_WRITER, FTAG); 771 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 772 spa_config_exit(spa, FTAG); 773 774 /* 775 * Check the state of the root vdev. If it can't be opened, it 776 * indicates one or more toplevel vdevs are faulted. 777 */ 778 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 779 error = ENXIO; 780 goto out; 781 } 782 783 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 784 dmu_tx_t *tx; 785 int need_update = B_FALSE; 786 int c; 787 788 /* 789 * Claim log blocks that haven't been committed yet. 790 * This must all happen in a single txg. 791 */ 792 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 793 spa_first_txg(spa)); 794 (void) dmu_objset_find(spa->spa_name, 795 zil_claim, tx, DS_FIND_CHILDREN); 796 dmu_tx_commit(tx); 797 798 spa->spa_sync_on = B_TRUE; 799 txg_sync_start(spa->spa_dsl_pool); 800 801 /* 802 * Wait for all claims to sync. 
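                 * Passing 0 to txg_wait_synced() waits for the currently open
                 * txg, so the zil_claim() work assigned above is on stable
                 * storage before the load is considered complete.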
                 */
                txg_wait_synced(spa->spa_dsl_pool, 0);

                /*
                 * If the config cache is stale, or we have uninitialized
                 * metaslabs (see spa_vdev_add()), then update the config.
                 */
                if (config_cache_txg != spa->spa_config_txg ||
                    state == SPA_LOAD_IMPORT)
                        need_update = B_TRUE;

                for (c = 0; c < rvd->vdev_children; c++)
                        if (rvd->vdev_child[c]->vdev_ms_array == 0)
                                need_update = B_TRUE;

                /*
                 * Update the config cache asynchronously in case we're the
                 * root pool, in which case the config cache isn't writable yet.
                 */
                if (need_update)
                        spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
        }

        error = 0;
out:
        if (error && error != EBADF)
                zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
        spa->spa_load_state = SPA_LOAD_NONE;
        spa->spa_ena = 0;

        return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
        spa_t *spa;
        int error;
        int loaded = B_FALSE;
        int locked = B_FALSE;

        *spapp = NULL;

        /*
         * As disgusting as this is, we need to support recursive calls to this
         * function because dsl_dir_open() is called during spa_load(), and
         * ends up calling spa_open() again.  The real fix is to figure out how
         * to avoid dsl_dir_open() calling this in the first place.
         */
        if (mutex_owner(&spa_namespace_lock) != curthread) {
                mutex_enter(&spa_namespace_lock);
                locked = B_TRUE;
        }

        if ((spa = spa_lookup(pool)) == NULL) {
                if (locked)
                        mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

                spa_activate(spa);

                error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

                if (error == EBADF) {
                        /*
                         * If vdev_validate() returns failure (indicated by
                         * EBADF), it means one of the vdevs indicates that the
                         * pool has been exported or destroyed.  If this is the
                         * case, the config cache is out of sync and we should
                         * remove the pool from the namespace.
                         */
                        zfs_post_ok(spa, NULL);
                        spa_unload(spa);
                        spa_deactivate(spa);
                        spa_remove(spa);
                        spa_config_sync();
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        return (ENOENT);
                }

                if (error) {
                        /*
                         * We can't open the pool, but we still have useful
                         * information: the state of each vdev after the
                         * attempted vdev_open().  Return this to the user.
903 */ 904 if (config != NULL && spa->spa_root_vdev != NULL) { 905 spa_config_enter(spa, RW_READER, FTAG); 906 *config = spa_config_generate(spa, NULL, -1ULL, 907 B_TRUE); 908 spa_config_exit(spa, FTAG); 909 } 910 spa_unload(spa); 911 spa_deactivate(spa); 912 spa->spa_last_open_failed = B_TRUE; 913 if (locked) 914 mutex_exit(&spa_namespace_lock); 915 *spapp = NULL; 916 return (error); 917 } else { 918 zfs_post_ok(spa, NULL); 919 spa->spa_last_open_failed = B_FALSE; 920 } 921 922 loaded = B_TRUE; 923 } 924 925 spa_open_ref(spa, tag); 926 927 /* 928 * If we just loaded the pool, resilver anything that's out of date. 929 */ 930 if (loaded && (spa_mode & FWRITE)) 931 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 932 933 if (locked) 934 mutex_exit(&spa_namespace_lock); 935 936 *spapp = spa; 937 938 if (config != NULL) { 939 spa_config_enter(spa, RW_READER, FTAG); 940 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 941 spa_config_exit(spa, FTAG); 942 } 943 944 return (0); 945 } 946 947 int 948 spa_open(const char *name, spa_t **spapp, void *tag) 949 { 950 return (spa_open_common(name, spapp, tag, NULL)); 951 } 952 953 /* 954 * Lookup the given spa_t, incrementing the inject count in the process, 955 * preventing it from being exported or destroyed. 956 */ 957 spa_t * 958 spa_inject_addref(char *name) 959 { 960 spa_t *spa; 961 962 mutex_enter(&spa_namespace_lock); 963 if ((spa = spa_lookup(name)) == NULL) { 964 mutex_exit(&spa_namespace_lock); 965 return (NULL); 966 } 967 spa->spa_inject_ref++; 968 mutex_exit(&spa_namespace_lock); 969 970 return (spa); 971 } 972 973 void 974 spa_inject_delref(spa_t *spa) 975 { 976 mutex_enter(&spa_namespace_lock); 977 spa->spa_inject_ref--; 978 mutex_exit(&spa_namespace_lock); 979 } 980 981 static void 982 spa_add_spares(spa_t *spa, nvlist_t *config) 983 { 984 nvlist_t **spares; 985 uint_t i, nspares; 986 nvlist_t *nvroot; 987 uint64_t guid; 988 vdev_stat_t *vs; 989 uint_t vsc; 990 uint64_t pool; 991 992 if (spa->spa_nspares == 0) 993 return; 994 995 VERIFY(nvlist_lookup_nvlist(config, 996 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 997 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 998 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 999 if (nspares != 0) { 1000 VERIFY(nvlist_add_nvlist_array(nvroot, 1001 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1002 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1003 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1004 1005 /* 1006 * Go through and find any spares which have since been 1007 * repurposed as an active spare. If this is the case, update 1008 * their status appropriately. 1009 */ 1010 for (i = 0; i < nspares; i++) { 1011 VERIFY(nvlist_lookup_uint64(spares[i], 1012 ZPOOL_CONFIG_GUID, &guid) == 0); 1013 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 1014 VERIFY(nvlist_lookup_uint64_array( 1015 spares[i], ZPOOL_CONFIG_STATS, 1016 (uint64_t **)&vs, &vsc) == 0); 1017 vs->vs_state = VDEV_STATE_CANT_OPEN; 1018 vs->vs_aux = VDEV_AUX_SPARED; 1019 } 1020 } 1021 } 1022 } 1023 1024 int 1025 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1026 { 1027 int error; 1028 spa_t *spa; 1029 1030 *config = NULL; 1031 error = spa_open_common(name, &spa, FTAG, config); 1032 1033 if (spa && *config != NULL) { 1034 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1035 spa_get_errlog_size(spa)) == 0); 1036 1037 spa_add_spares(spa, *config); 1038 } 1039 1040 /* 1041 * We want to get the alternate root even for faulted pools, so we cheat 1042 * and call spa_lookup() directly. 
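         * If even spa_lookup() finds nothing, the pool is entirely unknown and
         * we hand back an empty string rather than leaving 'altroot'
         * untouched.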
1043 */ 1044 if (altroot) { 1045 if (spa == NULL) { 1046 mutex_enter(&spa_namespace_lock); 1047 spa = spa_lookup(name); 1048 if (spa) 1049 spa_altroot(spa, altroot, buflen); 1050 else 1051 altroot[0] = '\0'; 1052 spa = NULL; 1053 mutex_exit(&spa_namespace_lock); 1054 } else { 1055 spa_altroot(spa, altroot, buflen); 1056 } 1057 } 1058 1059 if (spa != NULL) 1060 spa_close(spa, FTAG); 1061 1062 return (error); 1063 } 1064 1065 /* 1066 * Validate that the 'spares' array is well formed. We must have an array of 1067 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1068 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1069 * as they are well-formed. 1070 */ 1071 static int 1072 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1073 { 1074 nvlist_t **spares; 1075 uint_t i, nspares; 1076 vdev_t *vd; 1077 int error; 1078 1079 /* 1080 * It's acceptable to have no spares specified. 1081 */ 1082 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1083 &spares, &nspares) != 0) 1084 return (0); 1085 1086 if (nspares == 0) 1087 return (EINVAL); 1088 1089 /* 1090 * Make sure the pool is formatted with a version that supports hot 1091 * spares. 1092 */ 1093 if (spa_version(spa) < SPA_VERSION_SPARES) 1094 return (ENOTSUP); 1095 1096 /* 1097 * Set the pending spare list so we correctly handle device in-use 1098 * checking. 1099 */ 1100 spa->spa_pending_spares = spares; 1101 spa->spa_pending_nspares = nspares; 1102 1103 for (i = 0; i < nspares; i++) { 1104 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1105 mode)) != 0) 1106 goto out; 1107 1108 if (!vd->vdev_ops->vdev_op_leaf) { 1109 vdev_free(vd); 1110 error = EINVAL; 1111 goto out; 1112 } 1113 1114 vd->vdev_top = vd; 1115 1116 if ((error = vdev_open(vd)) == 0 && 1117 (error = vdev_label_init(vd, crtxg, 1118 VDEV_LABEL_SPARE)) == 0) { 1119 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1120 vd->vdev_guid) == 0); 1121 } 1122 1123 vdev_free(vd); 1124 1125 if (error && mode != VDEV_ALLOC_SPARE) 1126 goto out; 1127 else 1128 error = 0; 1129 } 1130 1131 out: 1132 spa->spa_pending_spares = NULL; 1133 spa->spa_pending_nspares = 0; 1134 return (error); 1135 } 1136 1137 /* 1138 * Pool Creation 1139 */ 1140 int 1141 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot, 1142 const char *history_str) 1143 { 1144 spa_t *spa; 1145 vdev_t *rvd; 1146 dsl_pool_t *dp; 1147 dmu_tx_t *tx; 1148 int c, error = 0; 1149 uint64_t txg = TXG_INITIAL; 1150 nvlist_t **spares; 1151 uint_t nspares; 1152 1153 /* 1154 * If this pool already exists, return failure. 1155 */ 1156 mutex_enter(&spa_namespace_lock); 1157 if (spa_lookup(pool) != NULL) { 1158 mutex_exit(&spa_namespace_lock); 1159 return (EEXIST); 1160 } 1161 1162 /* 1163 * Allocate a new spa_t structure. 1164 */ 1165 spa = spa_add(pool, altroot); 1166 spa_activate(spa); 1167 1168 spa->spa_uberblock.ub_txg = txg - 1; 1169 spa->spa_uberblock.ub_version = SPA_VERSION; 1170 spa->spa_ubsync = spa->spa_uberblock; 1171 1172 /* 1173 * Create the root vdev. 
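         * The caller-supplied nvroot is parsed with VDEV_ALLOC_ADD, any spares
         * it names are validated against it, and each top-level child is
         * initialized (vdev_init()) with the tree marked config-dirty so the
         * new labels are written as part of 'txg'.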
1174 */ 1175 spa_config_enter(spa, RW_WRITER, FTAG); 1176 1177 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1178 1179 ASSERT(error != 0 || rvd != NULL); 1180 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1181 1182 if (error == 0 && rvd->vdev_children == 0) 1183 error = EINVAL; 1184 1185 if (error == 0 && 1186 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1187 (error = spa_validate_spares(spa, nvroot, txg, 1188 VDEV_ALLOC_ADD)) == 0) { 1189 for (c = 0; c < rvd->vdev_children; c++) 1190 vdev_init(rvd->vdev_child[c], txg); 1191 vdev_config_dirty(rvd); 1192 } 1193 1194 spa_config_exit(spa, FTAG); 1195 1196 if (error != 0) { 1197 spa_unload(spa); 1198 spa_deactivate(spa); 1199 spa_remove(spa); 1200 mutex_exit(&spa_namespace_lock); 1201 return (error); 1202 } 1203 1204 /* 1205 * Get the list of spares, if specified. 1206 */ 1207 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1208 &spares, &nspares) == 0) { 1209 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1210 KM_SLEEP) == 0); 1211 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1212 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1213 spa_config_enter(spa, RW_WRITER, FTAG); 1214 spa_load_spares(spa); 1215 spa_config_exit(spa, FTAG); 1216 spa->spa_sync_spares = B_TRUE; 1217 } 1218 1219 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1220 spa->spa_meta_objset = dp->dp_meta_objset; 1221 1222 tx = dmu_tx_create_assigned(dp, txg); 1223 1224 /* 1225 * Create the pool config object. 1226 */ 1227 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1228 DMU_OT_PACKED_NVLIST, 1 << 14, 1229 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1230 1231 if (zap_add(spa->spa_meta_objset, 1232 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1233 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1234 cmn_err(CE_PANIC, "failed to add pool config"); 1235 } 1236 1237 /* Newly created pools are always deflated. */ 1238 spa->spa_deflate = TRUE; 1239 if (zap_add(spa->spa_meta_objset, 1240 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1241 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1242 cmn_err(CE_PANIC, "failed to add deflate"); 1243 } 1244 1245 /* 1246 * Create the deferred-free bplist object. Turn off compression 1247 * because sync-to-convergence takes longer if the blocksize 1248 * keeps changing. 1249 */ 1250 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1251 1 << 14, tx); 1252 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1253 ZIO_COMPRESS_OFF, tx); 1254 1255 if (zap_add(spa->spa_meta_objset, 1256 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1257 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1258 cmn_err(CE_PANIC, "failed to add bplist"); 1259 } 1260 1261 /* 1262 * Create the pool's history object. 1263 */ 1264 spa_history_create_obj(spa, tx); 1265 1266 dmu_tx_commit(tx); 1267 1268 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 1269 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1270 spa->spa_sync_on = B_TRUE; 1271 txg_sync_start(spa->spa_dsl_pool); 1272 1273 /* 1274 * We explicitly wait for the first transaction to complete so that our 1275 * bean counters are appropriately updated. 
1276 */ 1277 txg_wait_synced(spa->spa_dsl_pool, txg); 1278 1279 spa_config_sync(); 1280 1281 if (history_str != NULL) 1282 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 1283 1284 mutex_exit(&spa_namespace_lock); 1285 1286 return (0); 1287 } 1288 1289 /* 1290 * Import the given pool into the system. We set up the necessary spa_t and 1291 * then call spa_load() to do the dirty work. 1292 */ 1293 int 1294 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1295 { 1296 spa_t *spa; 1297 int error; 1298 nvlist_t *nvroot; 1299 nvlist_t **spares; 1300 uint_t nspares; 1301 1302 /* 1303 * If a pool with this name exists, return failure. 1304 */ 1305 mutex_enter(&spa_namespace_lock); 1306 if (spa_lookup(pool) != NULL) { 1307 mutex_exit(&spa_namespace_lock); 1308 return (EEXIST); 1309 } 1310 1311 /* 1312 * Create and initialize the spa structure. 1313 */ 1314 spa = spa_add(pool, altroot); 1315 spa_activate(spa); 1316 1317 /* 1318 * Pass off the heavy lifting to spa_load(). 1319 * Pass TRUE for mosconfig because the user-supplied config 1320 * is actually the one to trust when doing an import. 1321 */ 1322 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1323 1324 spa_config_enter(spa, RW_WRITER, FTAG); 1325 /* 1326 * Toss any existing sparelist, as it doesn't have any validity anymore, 1327 * and conflicts with spa_has_spare(). 1328 */ 1329 if (spa->spa_sparelist) { 1330 nvlist_free(spa->spa_sparelist); 1331 spa->spa_sparelist = NULL; 1332 spa_load_spares(spa); 1333 } 1334 1335 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1336 &nvroot) == 0); 1337 if (error == 0) 1338 error = spa_validate_spares(spa, nvroot, -1ULL, 1339 VDEV_ALLOC_SPARE); 1340 spa_config_exit(spa, FTAG); 1341 1342 if (error != 0) { 1343 spa_unload(spa); 1344 spa_deactivate(spa); 1345 spa_remove(spa); 1346 mutex_exit(&spa_namespace_lock); 1347 return (error); 1348 } 1349 1350 /* 1351 * Override any spares as specified by the user, as these may have 1352 * correct device names/devids, etc. 1353 */ 1354 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1355 &spares, &nspares) == 0) { 1356 if (spa->spa_sparelist) 1357 VERIFY(nvlist_remove(spa->spa_sparelist, 1358 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1359 else 1360 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1361 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1362 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1363 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1364 spa_config_enter(spa, RW_WRITER, FTAG); 1365 spa_load_spares(spa); 1366 spa_config_exit(spa, FTAG); 1367 spa->spa_sync_spares = B_TRUE; 1368 } 1369 1370 /* 1371 * Update the config cache to include the newly-imported pool. 1372 */ 1373 if (spa_mode & FWRITE) 1374 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1375 1376 /* 1377 * Resilver anything that's out of date. 1378 */ 1379 if (spa_mode & FWRITE) 1380 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1381 1382 mutex_exit(&spa_namespace_lock); 1383 1384 return (0); 1385 } 1386 1387 /* 1388 * This (illegal) pool name is used when temporarily importing a spa_t in order 1389 * to get the vdev stats associated with the imported devices. 
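 * Because '$' is not a legal character in a pool name, "$import" can never
 * collide with a real pool in the namespace.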
1390 */ 1391 #define TRYIMPORT_NAME "$import" 1392 1393 nvlist_t * 1394 spa_tryimport(nvlist_t *tryconfig) 1395 { 1396 nvlist_t *config = NULL; 1397 char *poolname; 1398 spa_t *spa; 1399 uint64_t state; 1400 1401 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1402 return (NULL); 1403 1404 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1405 return (NULL); 1406 1407 /* 1408 * Create and initialize the spa structure. 1409 */ 1410 mutex_enter(&spa_namespace_lock); 1411 spa = spa_add(TRYIMPORT_NAME, NULL); 1412 spa_activate(spa); 1413 1414 /* 1415 * Pass off the heavy lifting to spa_load(). 1416 * Pass TRUE for mosconfig because the user-supplied config 1417 * is actually the one to trust when doing an import. 1418 */ 1419 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1420 1421 /* 1422 * If 'tryconfig' was at least parsable, return the current config. 1423 */ 1424 if (spa->spa_root_vdev != NULL) { 1425 spa_config_enter(spa, RW_READER, FTAG); 1426 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1427 spa_config_exit(spa, FTAG); 1428 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1429 poolname) == 0); 1430 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1431 state) == 0); 1432 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1433 spa->spa_uberblock.ub_timestamp) == 0); 1434 1435 /* 1436 * Add the list of hot spares. 1437 */ 1438 spa_add_spares(spa, config); 1439 } 1440 1441 spa_unload(spa); 1442 spa_deactivate(spa); 1443 spa_remove(spa); 1444 mutex_exit(&spa_namespace_lock); 1445 1446 return (config); 1447 } 1448 1449 /* 1450 * Pool export/destroy 1451 * 1452 * The act of destroying or exporting a pool is very simple. We make sure there 1453 * is no more pending I/O and any references to the pool are gone. Then, we 1454 * update the pool state and sync all the labels to disk, removing the 1455 * configuration from the cache afterwards. 1456 */ 1457 static int 1458 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1459 { 1460 spa_t *spa; 1461 1462 if (oldconfig) 1463 *oldconfig = NULL; 1464 1465 if (!(spa_mode & FWRITE)) 1466 return (EROFS); 1467 1468 mutex_enter(&spa_namespace_lock); 1469 if ((spa = spa_lookup(pool)) == NULL) { 1470 mutex_exit(&spa_namespace_lock); 1471 return (ENOENT); 1472 } 1473 1474 /* 1475 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1476 * reacquire the namespace lock, and see if we can export. 1477 */ 1478 spa_open_ref(spa, FTAG); 1479 mutex_exit(&spa_namespace_lock); 1480 spa_async_suspend(spa); 1481 mutex_enter(&spa_namespace_lock); 1482 spa_close(spa, FTAG); 1483 1484 /* 1485 * The pool will be in core if it's openable, 1486 * in which case we can modify its state. 1487 */ 1488 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1489 /* 1490 * Objsets may be open only because they're dirty, so we 1491 * have to force it to sync before checking spa_refcnt. 1492 */ 1493 spa_scrub_suspend(spa); 1494 txg_wait_synced(spa->spa_dsl_pool, 0); 1495 1496 /* 1497 * A pool cannot be exported or destroyed if there are active 1498 * references. If we are resetting a pool, allow references by 1499 * fault injection handlers. 
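                 * (A reset is the new_state == POOL_STATE_UNINITIALIZED case
                 * below: it tolerates fault-injection references, while export
                 * and destroy do not.)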
1500 */ 1501 if (!spa_refcount_zero(spa) || 1502 (spa->spa_inject_ref != 0 && 1503 new_state != POOL_STATE_UNINITIALIZED)) { 1504 spa_scrub_resume(spa); 1505 spa_async_resume(spa); 1506 mutex_exit(&spa_namespace_lock); 1507 return (EBUSY); 1508 } 1509 1510 spa_scrub_resume(spa); 1511 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1512 1513 /* 1514 * We want this to be reflected on every label, 1515 * so mark them all dirty. spa_unload() will do the 1516 * final sync that pushes these changes out. 1517 */ 1518 if (new_state != POOL_STATE_UNINITIALIZED) { 1519 spa_config_enter(spa, RW_WRITER, FTAG); 1520 spa->spa_state = new_state; 1521 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1522 vdev_config_dirty(spa->spa_root_vdev); 1523 spa_config_exit(spa, FTAG); 1524 } 1525 } 1526 1527 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 1528 1529 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1530 spa_unload(spa); 1531 spa_deactivate(spa); 1532 } 1533 1534 if (oldconfig && spa->spa_config) 1535 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1536 1537 if (new_state != POOL_STATE_UNINITIALIZED) { 1538 spa_remove(spa); 1539 spa_config_sync(); 1540 } 1541 mutex_exit(&spa_namespace_lock); 1542 1543 return (0); 1544 } 1545 1546 /* 1547 * Destroy a storage pool. 1548 */ 1549 int 1550 spa_destroy(char *pool) 1551 { 1552 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1553 } 1554 1555 /* 1556 * Export a storage pool. 1557 */ 1558 int 1559 spa_export(char *pool, nvlist_t **oldconfig) 1560 { 1561 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1562 } 1563 1564 /* 1565 * Similar to spa_export(), this unloads the spa_t without actually removing it 1566 * from the namespace in any way. 1567 */ 1568 int 1569 spa_reset(char *pool) 1570 { 1571 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1572 } 1573 1574 1575 /* 1576 * ========================================================================== 1577 * Device manipulation 1578 * ========================================================================== 1579 */ 1580 1581 /* 1582 * Add a device to a storage pool. 1583 */ 1584 int 1585 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1586 { 1587 uint64_t txg; 1588 int c, error; 1589 vdev_t *rvd = spa->spa_root_vdev; 1590 vdev_t *vd, *tvd; 1591 nvlist_t **spares; 1592 uint_t i, nspares; 1593 1594 txg = spa_vdev_enter(spa); 1595 1596 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1597 VDEV_ALLOC_ADD)) != 0) 1598 return (spa_vdev_exit(spa, NULL, txg, error)); 1599 1600 spa->spa_pending_vdev = vd; 1601 1602 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1603 &spares, &nspares) != 0) 1604 nspares = 0; 1605 1606 if (vd->vdev_children == 0 && nspares == 0) { 1607 spa->spa_pending_vdev = NULL; 1608 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1609 } 1610 1611 if (vd->vdev_children != 0) { 1612 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1613 spa->spa_pending_vdev = NULL; 1614 return (spa_vdev_exit(spa, vd, txg, error)); 1615 } 1616 } 1617 1618 /* 1619 * We must validate the spares after checking the children. Otherwise, 1620 * vdev_inuse() will blindly overwrite the spare. 1621 */ 1622 if ((error = spa_validate_spares(spa, nvroot, txg, 1623 VDEV_ALLOC_ADD)) != 0) { 1624 spa->spa_pending_vdev = NULL; 1625 return (spa_vdev_exit(spa, vd, txg, error)); 1626 } 1627 1628 spa->spa_pending_vdev = NULL; 1629 1630 /* 1631 * Transfer each new top-level vdev from vd to rvd. 
1632 */ 1633 for (c = 0; c < vd->vdev_children; c++) { 1634 tvd = vd->vdev_child[c]; 1635 vdev_remove_child(vd, tvd); 1636 tvd->vdev_id = rvd->vdev_children; 1637 vdev_add_child(rvd, tvd); 1638 vdev_config_dirty(tvd); 1639 } 1640 1641 if (nspares != 0) { 1642 if (spa->spa_sparelist != NULL) { 1643 nvlist_t **oldspares; 1644 uint_t oldnspares; 1645 nvlist_t **newspares; 1646 1647 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1648 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1649 1650 newspares = kmem_alloc(sizeof (void *) * 1651 (nspares + oldnspares), KM_SLEEP); 1652 for (i = 0; i < oldnspares; i++) 1653 VERIFY(nvlist_dup(oldspares[i], 1654 &newspares[i], KM_SLEEP) == 0); 1655 for (i = 0; i < nspares; i++) 1656 VERIFY(nvlist_dup(spares[i], 1657 &newspares[i + oldnspares], 1658 KM_SLEEP) == 0); 1659 1660 VERIFY(nvlist_remove(spa->spa_sparelist, 1661 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1662 1663 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1664 ZPOOL_CONFIG_SPARES, newspares, 1665 nspares + oldnspares) == 0); 1666 for (i = 0; i < oldnspares + nspares; i++) 1667 nvlist_free(newspares[i]); 1668 kmem_free(newspares, (oldnspares + nspares) * 1669 sizeof (void *)); 1670 } else { 1671 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1672 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1673 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1674 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1675 } 1676 1677 spa_load_spares(spa); 1678 spa->spa_sync_spares = B_TRUE; 1679 } 1680 1681 /* 1682 * We have to be careful when adding new vdevs to an existing pool. 1683 * If other threads start allocating from these vdevs before we 1684 * sync the config cache, and we lose power, then upon reboot we may 1685 * fail to open the pool because there are DVAs that the config cache 1686 * can't translate. Therefore, we first add the vdevs without 1687 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1688 * and then let spa_config_update() initialize the new metaslabs. 1689 * 1690 * spa_load() checks for added-but-not-initialized vdevs, so that 1691 * if we lose power at any point in this sequence, the remaining 1692 * steps will be completed the next time we load the pool. 1693 */ 1694 (void) spa_vdev_exit(spa, vd, txg, 0); 1695 1696 mutex_enter(&spa_namespace_lock); 1697 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1698 mutex_exit(&spa_namespace_lock); 1699 1700 return (0); 1701 } 1702 1703 /* 1704 * Attach a device to a mirror. The arguments are the path to any device 1705 * in the mirror, and the nvroot for the new device. If the path specifies 1706 * a device that is not mirrored, we automatically insert the mirror vdev. 1707 * 1708 * If 'replacing' is specified, the new device is intended to replace the 1709 * existing device; in this case the two devices are made into their own 1710 * mirror using the 'replacing' vdev, which is functionally identical to 1711 * the mirror vdev (it actually reuses all the same ops) but has a few 1712 * extra rules: you can't attach to it after it's been created, and upon 1713 * completion of resilvering, the first disk (the one being replaced) 1714 * is automatically detached. 
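 *
 * As a sketch: attaching disk B to a plain disk A produces
 *
 *	mirror(A, B)
 *
 * whereas replacing A with B produces
 *
 *	replacing(A, B)
 *
 * and A is detached automatically once B has finished resilvering.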
1715 */ 1716 int 1717 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1718 { 1719 uint64_t txg, open_txg; 1720 int error; 1721 vdev_t *rvd = spa->spa_root_vdev; 1722 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1723 vdev_ops_t *pvops; 1724 int is_log; 1725 1726 txg = spa_vdev_enter(spa); 1727 1728 oldvd = vdev_lookup_by_guid(rvd, guid); 1729 1730 if (oldvd == NULL) 1731 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1732 1733 if (!oldvd->vdev_ops->vdev_op_leaf) 1734 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1735 1736 pvd = oldvd->vdev_parent; 1737 1738 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1739 VDEV_ALLOC_ADD)) != 0) 1740 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 1741 1742 if (newrootvd->vdev_children != 1) 1743 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1744 1745 newvd = newrootvd->vdev_child[0]; 1746 1747 if (!newvd->vdev_ops->vdev_op_leaf) 1748 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1749 1750 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1751 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1752 1753 /* 1754 * Spares can't replace logs 1755 */ 1756 is_log = oldvd->vdev_islog; 1757 if (is_log && newvd->vdev_isspare) 1758 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1759 1760 if (!replacing) { 1761 /* 1762 * For attach, the only allowable parent is a mirror or the root 1763 * vdev. 1764 */ 1765 if (pvd->vdev_ops != &vdev_mirror_ops && 1766 pvd->vdev_ops != &vdev_root_ops) 1767 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1768 1769 pvops = &vdev_mirror_ops; 1770 } else { 1771 /* 1772 * Active hot spares can only be replaced by inactive hot 1773 * spares. 1774 */ 1775 if (pvd->vdev_ops == &vdev_spare_ops && 1776 pvd->vdev_child[1] == oldvd && 1777 !spa_has_spare(spa, newvd->vdev_guid)) 1778 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1779 1780 /* 1781 * If the source is a hot spare, and the parent isn't already a 1782 * spare, then we want to create a new hot spare. Otherwise, we 1783 * want to create a replacing vdev. The user is not allowed to 1784 * attach to a spared vdev child unless the 'isspare' state is 1785 * the same (spare replaces spare, non-spare replaces 1786 * non-spare). 1787 */ 1788 if (pvd->vdev_ops == &vdev_replacing_ops) 1789 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1790 else if (pvd->vdev_ops == &vdev_spare_ops && 1791 newvd->vdev_isspare != oldvd->vdev_isspare) 1792 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1793 else if (pvd->vdev_ops != &vdev_spare_ops && 1794 newvd->vdev_isspare) 1795 pvops = &vdev_spare_ops; 1796 else 1797 pvops = &vdev_replacing_ops; 1798 } 1799 1800 /* 1801 * Compare the new device size with the replaceable/attachable 1802 * device size. 1803 */ 1804 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1805 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1806 1807 /* 1808 * The new device cannot have a higher alignment requirement 1809 * than the top-level vdev. 1810 */ 1811 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1812 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1813 1814 /* 1815 * If this is an in-place replacement, update oldvd's path and devid 1816 * to make it distinguishable from newvd, and unopenable from now on. 
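         * (The old path simply has "/old" appended to it, e.g.
         * /dev/dsk/c0t0d0s0 becomes /dev/dsk/c0t0d0s0/old, and the stale devid
         * is discarded.)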
1817 */ 1818 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1819 spa_strfree(oldvd->vdev_path); 1820 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1821 KM_SLEEP); 1822 (void) sprintf(oldvd->vdev_path, "%s/%s", 1823 newvd->vdev_path, "old"); 1824 if (oldvd->vdev_devid != NULL) { 1825 spa_strfree(oldvd->vdev_devid); 1826 oldvd->vdev_devid = NULL; 1827 } 1828 } 1829 1830 /* 1831 * If the parent is not a mirror, or if we're replacing, insert the new 1832 * mirror/replacing/spare vdev above oldvd. 1833 */ 1834 if (pvd->vdev_ops != pvops) 1835 pvd = vdev_add_parent(oldvd, pvops); 1836 1837 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1838 ASSERT(pvd->vdev_ops == pvops); 1839 ASSERT(oldvd->vdev_parent == pvd); 1840 1841 /* 1842 * Extract the new device from its root and add it to pvd. 1843 */ 1844 vdev_remove_child(newrootvd, newvd); 1845 newvd->vdev_id = pvd->vdev_children; 1846 vdev_add_child(pvd, newvd); 1847 1848 /* 1849 * If newvd is smaller than oldvd, but larger than its rsize, 1850 * the addition of newvd may have decreased our parent's asize. 1851 */ 1852 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1853 1854 tvd = newvd->vdev_top; 1855 ASSERT(pvd->vdev_top == tvd); 1856 ASSERT(tvd->vdev_parent == rvd); 1857 1858 vdev_config_dirty(tvd); 1859 1860 /* 1861 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1862 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1863 */ 1864 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1865 1866 mutex_enter(&newvd->vdev_dtl_lock); 1867 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1868 open_txg - TXG_INITIAL + 1); 1869 mutex_exit(&newvd->vdev_dtl_lock); 1870 1871 if (newvd->vdev_isspare) 1872 spa_spare_activate(newvd); 1873 1874 /* 1875 * Mark newvd's DTL dirty in this txg. 1876 */ 1877 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1878 1879 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1880 1881 /* 1882 * Kick off a resilver to update newvd. We need to grab the namespace 1883 * lock because spa_scrub() needs to post a sysevent with the pool name. 1884 */ 1885 mutex_enter(&spa_namespace_lock); 1886 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1887 mutex_exit(&spa_namespace_lock); 1888 1889 return (0); 1890 } 1891 1892 /* 1893 * Detach a device from a mirror or replacing vdev. 1894 * If 'replace_done' is specified, only detach if the parent 1895 * is a replacing vdev. 1896 */ 1897 int 1898 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1899 { 1900 uint64_t txg; 1901 int c, t, error; 1902 vdev_t *rvd = spa->spa_root_vdev; 1903 vdev_t *vd, *pvd, *cvd, *tvd; 1904 boolean_t unspare = B_FALSE; 1905 uint64_t unspare_guid; 1906 1907 txg = spa_vdev_enter(spa); 1908 1909 vd = vdev_lookup_by_guid(rvd, guid); 1910 1911 if (vd == NULL) 1912 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1913 1914 if (!vd->vdev_ops->vdev_op_leaf) 1915 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1916 1917 pvd = vd->vdev_parent; 1918 1919 /* 1920 * If replace_done is specified, only remove this device if it's 1921 * the first child of a replacing vdev. For the 'spare' vdev, either 1922 * disk can be removed. 
1923 */ 1924 if (replace_done) { 1925 if (pvd->vdev_ops == &vdev_replacing_ops) { 1926 if (vd->vdev_id != 0) 1927 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1928 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1929 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1930 } 1931 } 1932 1933 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1934 spa_version(spa) >= SPA_VERSION_SPARES); 1935 1936 /* 1937 * Only mirror, replacing, and spare vdevs support detach. 1938 */ 1939 if (pvd->vdev_ops != &vdev_replacing_ops && 1940 pvd->vdev_ops != &vdev_mirror_ops && 1941 pvd->vdev_ops != &vdev_spare_ops) 1942 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1943 1944 /* 1945 * If there's only one replica, you can't detach it. 1946 */ 1947 if (pvd->vdev_children <= 1) 1948 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1949 1950 /* 1951 * If all siblings have non-empty DTLs, this device may have the only 1952 * valid copy of the data, which means we cannot safely detach it. 1953 * 1954 * XXX -- as in the vdev_offline() case, we really want a more 1955 * precise DTL check. 1956 */ 1957 for (c = 0; c < pvd->vdev_children; c++) { 1958 uint64_t dirty; 1959 1960 cvd = pvd->vdev_child[c]; 1961 if (cvd == vd) 1962 continue; 1963 if (vdev_is_dead(cvd)) 1964 continue; 1965 mutex_enter(&cvd->vdev_dtl_lock); 1966 dirty = cvd->vdev_dtl_map.sm_space | 1967 cvd->vdev_dtl_scrub.sm_space; 1968 mutex_exit(&cvd->vdev_dtl_lock); 1969 if (!dirty) 1970 break; 1971 } 1972 1973 /* 1974 * If we are a replacing or spare vdev, then we can always detach the 1975 * latter child, as that is how one cancels the operation. 1976 */ 1977 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1978 c == pvd->vdev_children) 1979 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1980 1981 /* 1982 * If we are detaching the original disk from a spare, then it implies 1983 * that the spare should become a real disk, and be removed from the 1984 * active spare list for the pool. 1985 */ 1986 if (pvd->vdev_ops == &vdev_spare_ops && 1987 vd->vdev_id == 0) 1988 unspare = B_TRUE; 1989 1990 /* 1991 * Erase the disk labels so the disk can be used for other things. 1992 * This must be done after all other error cases are handled, 1993 * but before we disembowel vd (so we can still do I/O to it). 1994 * But if we can't do it, don't treat the error as fatal -- 1995 * it may be that the unwritability of the disk is the reason 1996 * it's being detached! 1997 */ 1998 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1999 2000 /* 2001 * Remove vd from its parent and compact the parent's children. 2002 */ 2003 vdev_remove_child(pvd, vd); 2004 vdev_compact_children(pvd); 2005 2006 /* 2007 * Remember one of the remaining children so we can get tvd below. 2008 */ 2009 cvd = pvd->vdev_child[0]; 2010 2011 /* 2012 * If we need to remove the remaining child from the list of hot spares, 2013 * do it now, marking the vdev as no longer a spare in the process. We 2014 * must do this before vdev_remove_parent(), because that can change the 2015 * GUID if it creates a new toplevel GUID. 2016 */ 2017 if (unspare) { 2018 ASSERT(cvd->vdev_isspare); 2019 spa_spare_remove(cvd); 2020 unspare_guid = cvd->vdev_guid; 2021 } 2022 2023 /* 2024 * If the parent mirror/replacing vdev only has one child, 2025 * the parent is no longer needed. Remove it from the tree. 2026 */ 2027 if (pvd->vdev_children == 1) 2028 vdev_remove_parent(cvd); 2029 2030 /* 2031 * We don't set tvd until now because the parent we just removed 2032 * may have been the previous top-level vdev. 
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the device we just detached was smaller than the others, it may be
	 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
	 * can't fail because the existing metaslabs are already in core, so
	 * there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

	error = spa_vdev_exit(spa, vd, txg, 0);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa = NULL;
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa->spa_state != POOL_STATE_ACTIVE)
				continue;

			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * Remove a device from the pool. Currently, this supports removing only hot
 * spares.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, *nv, **newspares;
	uint_t i, j, nspares;
	int ret = 0;

	spa_config_enter(spa, RW_WRITER, FTAG);

	vd = spa_lookup_by_guid(spa, guid);

	nv = NULL;
	if (spa->spa_spares != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			uint64_t theguid;

			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &theguid) == 0);
			if (theguid == guid) {
				nv = spares[i];
				break;
			}
		}
	}

	/*
	 * We only support removing a hot spare, and only if it's not currently
	 * in use in this pool.
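	 * Concretely: a guid that is neither a spare nor in the vdev tree
	 * returns ENOENT, a guid that is in the tree but not a spare returns
	 * ENOTSUP, and an active spare returns EBUSY unless 'unspare' is set.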
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_unspare &&
		    newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			newvd->vdev_unspare = 0;
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		pvd = vd->vdev_parent;
		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(pvd->vdev_parent->vdev_children == 2);
			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev. Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		/*
		 * Determine if this is a reference to a hot spare. In that
		 * case, update the path as stored in the spare list.
		 */
		nvlist_t **spares;
		uint_t i, nspares;
		if (spa->spa_sparelist != NULL) {
			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
			for (i = 0; i < nspares; i++) {
				uint64_t theguid;
				VERIFY(nvlist_lookup_uint64(spares[i],
				    ZPOOL_CONFIG_GUID, &theguid) == 0);
				if (theguid == guid)
					break;
			}

			if (i == nspares)
				return (spa_vdev_exit(spa, NULL, txg, ENOENT));

			VERIFY(nvlist_add_string(spares[i],
			    ZPOOL_CONFIG_PATH, newpath) == 0);
			spa_load_spares(spa);
			spa->spa_sync_spares = B_TRUE;
			return (spa_vdev_exit(spa, NULL, txg, 0));
		} else {
			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	arc_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
		cv_broadcast(&spa->spa_scrub_io_cv);

	ASSERT(spa->spa_scrub_inflight >= 0);

	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data;

	mutex_enter(&spa->spa_scrub_lock);
	/*
	 * Do not give too much work to vdev(s).
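	 * Throttle to at most spa_scrub_maxinflight outstanding scrub I/Os;
	 * spa_scrub_io_done() signals spa_scrub_io_cv as they complete.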
	 */
	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	data = arc_data_buf_alloc(size);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool. Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ?
"resilver" : "scrub", 2457 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2458 2459 spa_config_enter(spa, RW_WRITER, FTAG); 2460 vdev_reopen(rvd); /* purge all vdev caches */ 2461 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2462 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2463 spa_config_exit(spa, FTAG); 2464 2465 mutex_enter(&spa->spa_scrub_lock); 2466 spa->spa_scrub_errors = 0; 2467 spa->spa_scrub_active = 1; 2468 ASSERT(spa->spa_scrub_inflight == 0); 2469 2470 while (!spa->spa_scrub_stop) { 2471 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2472 while (spa->spa_scrub_suspended) { 2473 spa->spa_scrub_active = 0; 2474 cv_broadcast(&spa->spa_scrub_cv); 2475 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2476 spa->spa_scrub_active = 1; 2477 } 2478 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2479 2480 if (spa->spa_scrub_restart_txg != 0) 2481 break; 2482 2483 mutex_exit(&spa->spa_scrub_lock); 2484 error = traverse_more(th); 2485 mutex_enter(&spa->spa_scrub_lock); 2486 if (error != EAGAIN) 2487 break; 2488 } 2489 2490 while (spa->spa_scrub_inflight) 2491 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2492 2493 spa->spa_scrub_active = 0; 2494 cv_broadcast(&spa->spa_scrub_cv); 2495 2496 mutex_exit(&spa->spa_scrub_lock); 2497 2498 spa_config_enter(spa, RW_WRITER, FTAG); 2499 2500 mutex_enter(&spa->spa_scrub_lock); 2501 2502 /* 2503 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2504 * AND the spa config lock to synchronize with any config changes 2505 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2506 */ 2507 if (spa->spa_scrub_restart_txg != 0) 2508 error = ERESTART; 2509 2510 if (spa->spa_scrub_stop) 2511 error = EINTR; 2512 2513 /* 2514 * Even if there were uncorrectable errors, we consider the scrub 2515 * completed. The downside is that if there is a transient error during 2516 * a resilver, we won't resilver the data properly to the target. But 2517 * if the damage is permanent (more likely) we will resilver forever, 2518 * which isn't really acceptable. Since there is enough information for 2519 * the user to know what has failed and why, this seems like a more 2520 * tractable approach. 2521 */ 2522 complete = (error == 0); 2523 2524 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2525 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2526 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2527 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2528 2529 mutex_exit(&spa->spa_scrub_lock); 2530 2531 /* 2532 * If the scrub/resilver completed, update all DTLs to reflect this. 2533 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2534 */ 2535 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2536 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2537 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2538 spa_errlog_rotate(spa); 2539 2540 if (scrub_type == POOL_SCRUB_RESILVER && complete) 2541 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 2542 2543 spa_config_exit(spa, FTAG); 2544 2545 mutex_enter(&spa->spa_scrub_lock); 2546 2547 /* 2548 * We may have finished replacing a device. 2549 * Let the async thread assess this and handle the detach. 2550 */ 2551 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2552 2553 /* 2554 * If we were told to restart, our final act is to start a new scrub. 2555 */ 2556 if (error == ERESTART) 2557 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs. The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(spa, RW_WRITER));

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
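		 * Hence mintxg is set to one less than the first txg recorded
		 * in the DTL below, so the open interval still covers that txg.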
2682 * 2683 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2684 * so we don't claim to resilver a txg that's still changing. 2685 */ 2686 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2687 mintxg = ss->ss_start - 1; 2688 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2689 maxtxg = MIN(ss->ss_end, maxtxg); 2690 2691 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 2692 } 2693 2694 mutex_exit(&rvd->vdev_dtl_lock); 2695 2696 spa->spa_scrub_stop = 0; 2697 spa->spa_scrub_type = type; 2698 spa->spa_scrub_restart_txg = 0; 2699 2700 if (type != POOL_SCRUB_NONE) { 2701 spa->spa_scrub_mintxg = mintxg; 2702 spa->spa_scrub_maxtxg = maxtxg; 2703 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2704 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2705 ZIO_FLAG_CANFAIL); 2706 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2707 spa->spa_scrub_thread = thread_create(NULL, 0, 2708 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2709 } 2710 2711 mutex_exit(&spa->spa_scrub_lock); 2712 2713 return (0); 2714 } 2715 2716 /* 2717 * ========================================================================== 2718 * SPA async task processing 2719 * ========================================================================== 2720 */ 2721 2722 static void 2723 spa_async_remove(spa_t *spa, vdev_t *vd) 2724 { 2725 vdev_t *tvd; 2726 int c; 2727 2728 for (c = 0; c < vd->vdev_children; c++) { 2729 tvd = vd->vdev_child[c]; 2730 if (tvd->vdev_remove_wanted) { 2731 tvd->vdev_remove_wanted = 0; 2732 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 2733 VDEV_AUX_NONE); 2734 vdev_clear(spa, tvd); 2735 vdev_config_dirty(tvd->vdev_top); 2736 } 2737 spa_async_remove(spa, tvd); 2738 } 2739 } 2740 2741 static void 2742 spa_async_thread(spa_t *spa) 2743 { 2744 int tasks; 2745 uint64_t txg; 2746 2747 ASSERT(spa->spa_sync_on); 2748 2749 mutex_enter(&spa->spa_async_lock); 2750 tasks = spa->spa_async_tasks; 2751 spa->spa_async_tasks = 0; 2752 mutex_exit(&spa->spa_async_lock); 2753 2754 /* 2755 * See if the config needs to be updated. 2756 */ 2757 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2758 mutex_enter(&spa_namespace_lock); 2759 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2760 mutex_exit(&spa_namespace_lock); 2761 } 2762 2763 /* 2764 * See if any devices need to be marked REMOVED. 2765 */ 2766 if (tasks & SPA_ASYNC_REMOVE) { 2767 txg = spa_vdev_enter(spa); 2768 spa_async_remove(spa, spa->spa_root_vdev); 2769 (void) spa_vdev_exit(spa, NULL, txg, 0); 2770 } 2771 2772 /* 2773 * If any devices are done replacing, detach them. 2774 */ 2775 if (tasks & SPA_ASYNC_RESILVER_DONE) 2776 spa_vdev_resilver_done(spa); 2777 2778 /* 2779 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 2780 * scrub which can become a resilver), we need to hold 2781 * spa_namespace_lock() because the sysevent we post via 2782 * spa_event_notify() needs to get the name of the pool. 2783 */ 2784 if (tasks & SPA_ASYNC_SCRUB) { 2785 mutex_enter(&spa_namespace_lock); 2786 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2787 mutex_exit(&spa_namespace_lock); 2788 } 2789 2790 /* 2791 * Kick off a resilver. 2792 */ 2793 if (tasks & SPA_ASYNC_RESILVER) { 2794 mutex_enter(&spa_namespace_lock); 2795 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2796 mutex_exit(&spa_namespace_lock); 2797 } 2798 2799 /* 2800 * Let the world know that we're done. 
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *nvroot;
	nvlist_t **spares;
	int i;

	if (!spa->spa_sync_spares)
		return;

	/*
	 * Update the MOS nvlist describing the list of available spares.
	 * spa_validate_spares() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
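	 * The nvlist is packed into a DMU_OT_PACKED_NVLIST object in the MOS
	 * via spa_sync_nvlist(); the object and its pool-directory entry are
	 * created on first use below.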
	 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
	nvlist_free(nvroot);

	spa->spa_sync_spares = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	nvlist_t *nvp = arg2;
	nvpair_t *nvpair;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zapobj;
	uint64_t intval;

	mutex_enter(&spa->spa_props_lock);
	if (spa->spa_pool_props_object == 0) {
		zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
		VERIFY(zapobj > 0);

		spa->spa_pool_props_object = zapobj;

		VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_PROPS, 8, 1,
		    &spa->spa_pool_props_object, tx) == 0);
	}
	mutex_exit(&spa->spa_props_lock);

	nvpair = NULL;
	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
		case ZPOOL_PROP_DELEGATION:
			VERIFY(nvlist_lookup_uint64(nvp,
			    nvpair_name(nvpair), &intval) == 0);
			VERIFY(zap_update(mos,
			    spa->spa_pool_props_object,
			    nvpair_name(nvpair), 8, 1,
			    &intval, tx) == 0);
			spa->spa_delegation = intval;
			break;
		case ZPOOL_PROP_BOOTFS:
			VERIFY(nvlist_lookup_uint64(nvp,
			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
			intval = spa->spa_bootfs;
			VERIFY(zap_update(mos,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1,
			    &intval, tx) == 0);
			break;

		case ZPOOL_PROP_AUTOREPLACE:
			VERIFY(nvlist_lookup_uint64(nvp,
			    nvpair_name(nvpair), &intval) == 0);
			VERIFY(zap_update(mos,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1,
			    &intval, tx) == 0);
			break;
		}
		spa_history_internal_log(LOG_POOL_PROPSET,
		    spa, tx, cr, "%s %lld %s",
		    nvpair_name(nvpair), intval,
		    spa->spa_name);
	}
}

/*
 * Sync the specified transaction group.
 * New blocks may be dirtied as part of the process, so we iterate
 * until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg. If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
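	 * (spa_config_set() installs the config generated earlier in this
	 * sync pass as the pool's current config.)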
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools. We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state. All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks. The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
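		 * We hold a reference on the spa (spa_open_ref() below) so it
		 * cannot go away while the namespace lock is dropped.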
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);

	spa->spa_uberblock.ub_version = SPA_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

int
spa_set_props(spa_t *spa, nvlist_t *nvp)
{
	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

int
spa_get_props(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	zfs_source_t src;
	zpool_prop_t prop;
	nvlist_t *propval;
	uint64_t value;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);
	/* If no props object, then just return empty nvlist */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {

		if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
			continue;

		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		switch (za.za_integer_length) {
		case 8:
			if (zpool_prop_default_numeric(prop) ==
			    za.za_first_integer)
				src = ZFS_SRC_DEFAULT;
			else
				src = ZFS_SRC_LOCAL;
			value = za.za_first_integer;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;
				char strval[MAXPATHLEN];

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if ((err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) != 0) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);

				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_string(propval,
				    ZFS_PROP_VALUE, strval) == 0);
			} else {
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_VALUE, value) == 0);
			}
			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
			    propval) == 0);
			break;
		}
		nvlist_free(propval);
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * Post a sysevent corresponding to the given event. The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
 * filled in from the spa and (optionally) the vdev. This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}