1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/dmu_objset.h> 52 #include <sys/unique.h> 53 #include <sys/dsl_pool.h> 54 #include <sys/dsl_dataset.h> 55 #include <sys/dsl_dir.h> 56 #include <sys/dsl_prop.h> 57 #include <sys/dsl_synctask.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/callb.h> 60 #include <sys/systeminfo.h> 61 #include <sys/sunddi.h> 62 63 int zio_taskq_threads = 8; 64 65 /* 66 * ========================================================================== 67 * SPA state manipulation (open/create/destroy/import/export) 68 * ========================================================================== 69 */ 70 71 static int 72 spa_error_entry_compare(const void *a, const void *b) 73 { 74 spa_error_entry_t *sa = (spa_error_entry_t *)a; 75 spa_error_entry_t *sb = (spa_error_entry_t *)b; 76 int ret; 77 78 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 79 sizeof (zbookmark_t)); 80 81 if (ret < 0) 82 return (-1); 83 else if (ret > 0) 84 return (1); 85 else 86 return (0); 87 } 88 89 /* 90 * Utility function which retrieves copies of the current logs and 91 * re-initializes them in the process. 92 */ 93 void 94 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 95 { 96 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 97 98 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 99 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 100 101 avl_create(&spa->spa_errlist_scrub, 102 spa_error_entry_compare, sizeof (spa_error_entry_t), 103 offsetof(spa_error_entry_t, se_avl)); 104 avl_create(&spa->spa_errlist_last, 105 spa_error_entry_compare, sizeof (spa_error_entry_t), 106 offsetof(spa_error_entry_t, se_avl)); 107 } 108 109 /* 110 * Activate an uninitialized pool. 
111 */ 112 static void 113 spa_activate(spa_t *spa) 114 { 115 int t; 116 117 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 118 119 spa->spa_state = POOL_STATE_ACTIVE; 120 121 spa->spa_normal_class = metaslab_class_create(); 122 spa->spa_log_class = metaslab_class_create(); 123 124 for (t = 0; t < ZIO_TYPES; t++) { 125 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 126 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 127 TASKQ_PREPOPULATE); 128 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 129 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 130 TASKQ_PREPOPULATE); 131 } 132 133 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 134 135 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 136 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 137 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 140 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 141 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 142 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 143 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 144 145 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 146 offsetof(vdev_t, vdev_dirty_node)); 147 148 txg_list_create(&spa->spa_vdev_txg_list, 149 offsetof(struct vdev, vdev_txg_node)); 150 151 avl_create(&spa->spa_errlist_scrub, 152 spa_error_entry_compare, sizeof (spa_error_entry_t), 153 offsetof(spa_error_entry_t, se_avl)); 154 avl_create(&spa->spa_errlist_last, 155 spa_error_entry_compare, sizeof (spa_error_entry_t), 156 offsetof(spa_error_entry_t, se_avl)); 157 } 158 159 /* 160 * Opposite of spa_activate(). 161 */ 162 static void 163 spa_deactivate(spa_t *spa) 164 { 165 int t; 166 167 ASSERT(spa->spa_sync_on == B_FALSE); 168 ASSERT(spa->spa_dsl_pool == NULL); 169 ASSERT(spa->spa_root_vdev == NULL); 170 171 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 172 173 txg_list_destroy(&spa->spa_vdev_txg_list); 174 175 list_destroy(&spa->spa_dirty_list); 176 177 rw_destroy(&spa->spa_traverse_lock); 178 179 for (t = 0; t < ZIO_TYPES; t++) { 180 taskq_destroy(spa->spa_zio_issue_taskq[t]); 181 taskq_destroy(spa->spa_zio_intr_taskq[t]); 182 spa->spa_zio_issue_taskq[t] = NULL; 183 spa->spa_zio_intr_taskq[t] = NULL; 184 } 185 186 metaslab_class_destroy(spa->spa_normal_class); 187 spa->spa_normal_class = NULL; 188 189 metaslab_class_destroy(spa->spa_log_class); 190 spa->spa_log_class = NULL; 191 192 /* 193 * If this was part of an import or the open otherwise failed, we may 194 * still have errors left in the queues. Empty them just in case. 195 */ 196 spa_errlog_drain(spa); 197 198 avl_destroy(&spa->spa_errlist_scrub); 199 avl_destroy(&spa->spa_errlist_last); 200 201 spa->spa_state = POOL_STATE_UNINITIALIZED; 202 } 203 204 /* 205 * Verify a pool configuration, and construct the vdev tree appropriately. This 206 * will create all the necessary vdevs in the appropriate layout, with each vdev 207 * in the CLOSED state. This will prep the pool before open/creation/import. 208 * All vdev validation is done by the vdev_alloc() routine. 
209 */ 210 static int 211 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 212 uint_t id, int atype) 213 { 214 nvlist_t **child; 215 uint_t c, children; 216 int error; 217 218 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 219 return (error); 220 221 if ((*vdp)->vdev_ops->vdev_op_leaf) 222 return (0); 223 224 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 225 &child, &children) != 0) { 226 vdev_free(*vdp); 227 *vdp = NULL; 228 return (EINVAL); 229 } 230 231 for (c = 0; c < children; c++) { 232 vdev_t *vd; 233 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 234 atype)) != 0) { 235 vdev_free(*vdp); 236 *vdp = NULL; 237 return (error); 238 } 239 } 240 241 ASSERT(*vdp != NULL); 242 243 return (0); 244 } 245 246 /* 247 * Opposite of spa_load(). 248 */ 249 static void 250 spa_unload(spa_t *spa) 251 { 252 int i; 253 254 /* 255 * Stop async tasks. 256 */ 257 spa_async_suspend(spa); 258 259 /* 260 * Stop syncing. 261 */ 262 if (spa->spa_sync_on) { 263 txg_sync_stop(spa->spa_dsl_pool); 264 spa->spa_sync_on = B_FALSE; 265 } 266 267 /* 268 * Wait for any outstanding prefetch I/O to complete. 269 */ 270 spa_config_enter(spa, RW_WRITER, FTAG); 271 spa_config_exit(spa, FTAG); 272 273 /* 274 * Close the dsl pool. 275 */ 276 if (spa->spa_dsl_pool) { 277 dsl_pool_close(spa->spa_dsl_pool); 278 spa->spa_dsl_pool = NULL; 279 } 280 281 /* 282 * Close all vdevs. 283 */ 284 if (spa->spa_root_vdev) 285 vdev_free(spa->spa_root_vdev); 286 ASSERT(spa->spa_root_vdev == NULL); 287 288 for (i = 0; i < spa->spa_nspares; i++) 289 vdev_free(spa->spa_spares[i]); 290 if (spa->spa_spares) { 291 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 292 spa->spa_spares = NULL; 293 } 294 if (spa->spa_sparelist) { 295 nvlist_free(spa->spa_sparelist); 296 spa->spa_sparelist = NULL; 297 } 298 299 spa->spa_async_suspended = 0; 300 } 301 302 /* 303 * Load (or re-load) the current list of vdevs describing the active spares for 304 * this pool. When this is called, we have some form of basic information in 305 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 306 * re-generate a more complete list including status information. 307 */ 308 static void 309 spa_load_spares(spa_t *spa) 310 { 311 nvlist_t **spares; 312 uint_t nspares; 313 int i; 314 vdev_t *vd, *tvd; 315 316 /* 317 * First, close and free any existing spare vdevs. 318 */ 319 for (i = 0; i < spa->spa_nspares; i++) { 320 vd = spa->spa_spares[i]; 321 322 /* Undo the call to spa_activate() below */ 323 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 324 tvd->vdev_isspare) 325 spa_spare_remove(tvd); 326 vdev_close(vd); 327 vdev_free(vd); 328 } 329 330 if (spa->spa_spares) 331 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 332 333 if (spa->spa_sparelist == NULL) 334 nspares = 0; 335 else 336 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 337 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 338 339 spa->spa_nspares = (int)nspares; 340 spa->spa_spares = NULL; 341 342 if (nspares == 0) 343 return; 344 345 /* 346 * Construct the array of vdevs, opening them to get status in the 347 * process. For each spare, there is potentially two different vdev_t 348 * structures associated with it: one in the list of spares (used only 349 * for basic validation purposes) and one in the active vdev 350 * configuration (if it's spared in). During this phase we open and 351 * validate each vdev on the spare list. 
If the vdev also exists in the 352 * active configuration, then we also mark this vdev as an active spare. 353 */ 354 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 355 for (i = 0; i < spa->spa_nspares; i++) { 356 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 357 VDEV_ALLOC_SPARE) == 0); 358 ASSERT(vd != NULL); 359 360 spa->spa_spares[i] = vd; 361 362 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 363 if (!tvd->vdev_isspare) 364 spa_spare_add(tvd); 365 366 /* 367 * We only mark the spare active if we were successfully 368 * able to load the vdev. Otherwise, importing a pool 369 * with a bad active spare would result in strange 370 * behavior, because multiple pool would think the spare 371 * is actively in use. 372 * 373 * There is a vulnerability here to an equally bizarre 374 * circumstance, where a dead active spare is later 375 * brought back to life (onlined or otherwise). Given 376 * the rarity of this scenario, and the extra complexity 377 * it adds, we ignore the possibility. 378 */ 379 if (!vdev_is_dead(tvd)) 380 spa_spare_activate(tvd); 381 } 382 383 if (vdev_open(vd) != 0) 384 continue; 385 386 vd->vdev_top = vd; 387 (void) vdev_validate_spare(vd); 388 } 389 390 /* 391 * Recompute the stashed list of spares, with status information 392 * this time. 393 */ 394 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 395 DATA_TYPE_NVLIST_ARRAY) == 0); 396 397 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 398 for (i = 0; i < spa->spa_nspares; i++) 399 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 400 B_TRUE, B_TRUE); 401 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 402 spares, spa->spa_nspares) == 0); 403 for (i = 0; i < spa->spa_nspares; i++) 404 nvlist_free(spares[i]); 405 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 406 } 407 408 static int 409 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 410 { 411 dmu_buf_t *db; 412 char *packed = NULL; 413 size_t nvsize = 0; 414 int error; 415 *value = NULL; 416 417 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 418 nvsize = *(uint64_t *)db->db_data; 419 dmu_buf_rele(db, FTAG); 420 421 packed = kmem_alloc(nvsize, KM_SLEEP); 422 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 423 if (error == 0) 424 error = nvlist_unpack(packed, nvsize, value, 0); 425 kmem_free(packed, nvsize); 426 427 return (error); 428 } 429 430 /* 431 * Checks to see if the given vdev could not be opened, in which case we post a 432 * sysevent to notify the autoreplace code that the device has been removed. 433 */ 434 static void 435 spa_check_removed(vdev_t *vd) 436 { 437 int c; 438 439 for (c = 0; c < vd->vdev_children; c++) 440 spa_check_removed(vd->vdev_child[c]); 441 442 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 443 zfs_post_autoreplace(vd->vdev_spa, vd); 444 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 445 } 446 } 447 448 /* 449 * Load an existing storage pool, using the pool's builtin spa_config as a 450 * source of configuration information. 
451 */ 452 static int 453 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 454 { 455 int error = 0; 456 nvlist_t *nvroot = NULL; 457 vdev_t *rvd; 458 uberblock_t *ub = &spa->spa_uberblock; 459 uint64_t config_cache_txg = spa->spa_config_txg; 460 uint64_t pool_guid; 461 uint64_t version; 462 zio_t *zio; 463 uint64_t autoreplace = 0; 464 465 spa->spa_load_state = state; 466 467 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 468 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 469 error = EINVAL; 470 goto out; 471 } 472 473 /* 474 * Versioning wasn't explicitly added to the label until later, so if 475 * it's not present treat it as the initial version. 476 */ 477 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 478 version = SPA_VERSION_INITIAL; 479 480 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 481 &spa->spa_config_txg); 482 483 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 484 spa_guid_exists(pool_guid, 0)) { 485 error = EEXIST; 486 goto out; 487 } 488 489 spa->spa_load_guid = pool_guid; 490 491 /* 492 * Parse the configuration into a vdev tree. We explicitly set the 493 * value that will be returned by spa_version() since parsing the 494 * configuration requires knowing the version number. 495 */ 496 spa_config_enter(spa, RW_WRITER, FTAG); 497 spa->spa_ubsync.ub_version = version; 498 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 499 spa_config_exit(spa, FTAG); 500 501 if (error != 0) 502 goto out; 503 504 ASSERT(spa->spa_root_vdev == rvd); 505 ASSERT(spa_guid(spa) == pool_guid); 506 507 /* 508 * Try to open all vdevs, loading each label in the process. 509 */ 510 error = vdev_open(rvd); 511 if (error != 0) 512 goto out; 513 514 /* 515 * Validate the labels for all leaf vdevs. We need to grab the config 516 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 517 * flag. 518 */ 519 spa_config_enter(spa, RW_READER, FTAG); 520 error = vdev_validate(rvd); 521 spa_config_exit(spa, FTAG); 522 523 if (error != 0) 524 goto out; 525 526 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 527 error = ENXIO; 528 goto out; 529 } 530 531 /* 532 * Find the best uberblock. 533 */ 534 bzero(ub, sizeof (uberblock_t)); 535 536 zio = zio_root(spa, NULL, NULL, 537 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 538 vdev_uberblock_load(zio, rvd, ub); 539 error = zio_wait(zio); 540 541 /* 542 * If we weren't able to find a single valid uberblock, return failure. 543 */ 544 if (ub->ub_txg == 0) { 545 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 546 VDEV_AUX_CORRUPT_DATA); 547 error = ENXIO; 548 goto out; 549 } 550 551 /* 552 * If the pool is newer than the code, we can't open it. 553 */ 554 if (ub->ub_version > SPA_VERSION) { 555 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 556 VDEV_AUX_VERSION_NEWER); 557 error = ENOTSUP; 558 goto out; 559 } 560 561 /* 562 * If the vdev guid sum doesn't match the uberblock, we have an 563 * incomplete configuration. 564 */ 565 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 566 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 567 VDEV_AUX_BAD_GUID_SUM); 568 error = ENXIO; 569 goto out; 570 } 571 572 /* 573 * Initialize internal SPA structures. 
574 */ 575 spa->spa_state = POOL_STATE_ACTIVE; 576 spa->spa_ubsync = spa->spa_uberblock; 577 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 578 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 579 if (error) { 580 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 581 VDEV_AUX_CORRUPT_DATA); 582 goto out; 583 } 584 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 585 586 if (zap_lookup(spa->spa_meta_objset, 587 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 588 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 589 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 590 VDEV_AUX_CORRUPT_DATA); 591 error = EIO; 592 goto out; 593 } 594 595 if (!mosconfig) { 596 nvlist_t *newconfig; 597 uint64_t hostid; 598 599 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 600 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 601 VDEV_AUX_CORRUPT_DATA); 602 error = EIO; 603 goto out; 604 } 605 606 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 607 &hostid) == 0) { 608 char *hostname; 609 unsigned long myhostid = 0; 610 611 VERIFY(nvlist_lookup_string(newconfig, 612 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 613 614 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 615 if (hostid != 0 && myhostid != 0 && 616 (unsigned long)hostid != myhostid) { 617 cmn_err(CE_WARN, "pool '%s' could not be " 618 "loaded as it was last accessed by " 619 "another system (host: %s hostid: 0x%lx). " 620 "See: http://www.sun.com/msg/ZFS-8000-EY", 621 spa->spa_name, hostname, 622 (unsigned long)hostid); 623 error = EBADF; 624 goto out; 625 } 626 } 627 628 spa_config_set(spa, newconfig); 629 spa_unload(spa); 630 spa_deactivate(spa); 631 spa_activate(spa); 632 633 return (spa_load(spa, newconfig, state, B_TRUE)); 634 } 635 636 if (zap_lookup(spa->spa_meta_objset, 637 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 638 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 639 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 640 VDEV_AUX_CORRUPT_DATA); 641 error = EIO; 642 goto out; 643 } 644 645 /* 646 * Load the bit that tells us to use the new accounting function 647 * (raid-z deflation). If we have an older pool, this will not 648 * be present. 649 */ 650 error = zap_lookup(spa->spa_meta_objset, 651 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 652 sizeof (uint64_t), 1, &spa->spa_deflate); 653 if (error != 0 && error != ENOENT) { 654 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 655 VDEV_AUX_CORRUPT_DATA); 656 error = EIO; 657 goto out; 658 } 659 660 /* 661 * Load the persistent error log. If we have an older pool, this will 662 * not be present. 663 */ 664 error = zap_lookup(spa->spa_meta_objset, 665 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 666 sizeof (uint64_t), 1, &spa->spa_errlog_last); 667 if (error != 0 && error != ENOENT) { 668 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 669 VDEV_AUX_CORRUPT_DATA); 670 error = EIO; 671 goto out; 672 } 673 674 error = zap_lookup(spa->spa_meta_objset, 675 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 676 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 677 if (error != 0 && error != ENOENT) { 678 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 679 VDEV_AUX_CORRUPT_DATA); 680 error = EIO; 681 goto out; 682 } 683 684 /* 685 * Load the history object. If we have an older pool, this 686 * will not be present. 
687 */ 688 error = zap_lookup(spa->spa_meta_objset, 689 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 690 sizeof (uint64_t), 1, &spa->spa_history); 691 if (error != 0 && error != ENOENT) { 692 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 693 VDEV_AUX_CORRUPT_DATA); 694 error = EIO; 695 goto out; 696 } 697 698 /* 699 * Load any hot spares for this pool. 700 */ 701 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 702 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 703 if (error != 0 && error != ENOENT) { 704 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 705 VDEV_AUX_CORRUPT_DATA); 706 error = EIO; 707 goto out; 708 } 709 if (error == 0) { 710 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 711 if (load_nvlist(spa, spa->spa_spares_object, 712 &spa->spa_sparelist) != 0) { 713 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 714 VDEV_AUX_CORRUPT_DATA); 715 error = EIO; 716 goto out; 717 } 718 719 spa_config_enter(spa, RW_WRITER, FTAG); 720 spa_load_spares(spa); 721 spa_config_exit(spa, FTAG); 722 } 723 724 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 725 726 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 727 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 728 729 if (error && error != ENOENT) { 730 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 731 VDEV_AUX_CORRUPT_DATA); 732 error = EIO; 733 goto out; 734 } 735 736 if (error == 0) { 737 (void) zap_lookup(spa->spa_meta_objset, 738 spa->spa_pool_props_object, 739 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 740 sizeof (uint64_t), 1, &spa->spa_bootfs); 741 (void) zap_lookup(spa->spa_meta_objset, 742 spa->spa_pool_props_object, 743 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 744 sizeof (uint64_t), 1, &autoreplace); 745 (void) zap_lookup(spa->spa_meta_objset, 746 spa->spa_pool_props_object, 747 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 748 sizeof (uint64_t), 1, &spa->spa_delegation); 749 } 750 751 /* 752 * If the 'autoreplace' property is set, then post a resource notifying 753 * the ZFS DE that it should not issue any faults for unopenable 754 * devices. We also iterate over the vdevs, and post a sysevent for any 755 * unopenable vdevs so that the normal autoreplace handler can take 756 * over. 757 */ 758 if (autoreplace) 759 spa_check_removed(spa->spa_root_vdev); 760 761 /* 762 * Load the vdev state for all toplevel vdevs. 763 */ 764 vdev_load(rvd); 765 766 /* 767 * Propagate the leaf DTLs we just loaded all the way up the tree. 768 */ 769 spa_config_enter(spa, RW_WRITER, FTAG); 770 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 771 spa_config_exit(spa, FTAG); 772 773 /* 774 * Check the state of the root vdev. If it can't be opened, it 775 * indicates one or more toplevel vdevs are faulted. 776 */ 777 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 778 error = ENXIO; 779 goto out; 780 } 781 782 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 783 dmu_tx_t *tx; 784 int need_update = B_FALSE; 785 int c; 786 787 /* 788 * Claim log blocks that haven't been committed yet. 789 * This must all happen in a single txg. 790 */ 791 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 792 spa_first_txg(spa)); 793 (void) dmu_objset_find(spa->spa_name, 794 zil_claim, tx, DS_FIND_CHILDREN); 795 dmu_tx_commit(tx); 796 797 spa->spa_sync_on = B_TRUE; 798 txg_sync_start(spa->spa_dsl_pool); 799 800 /* 801 * Wait for all claims to sync. 
802 */ 803 txg_wait_synced(spa->spa_dsl_pool, 0); 804 805 /* 806 * If the config cache is stale, or we have uninitialized 807 * metaslabs (see spa_vdev_add()), then update the config. 808 */ 809 if (config_cache_txg != spa->spa_config_txg || 810 state == SPA_LOAD_IMPORT) 811 need_update = B_TRUE; 812 813 for (c = 0; c < rvd->vdev_children; c++) 814 if (rvd->vdev_child[c]->vdev_ms_array == 0) 815 need_update = B_TRUE; 816 817 /* 818 * Update the config cache asychronously in case we're the 819 * root pool, in which case the config cache isn't writable yet. 820 */ 821 if (need_update) 822 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 823 } 824 825 error = 0; 826 out: 827 if (error && error != EBADF) 828 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 829 spa->spa_load_state = SPA_LOAD_NONE; 830 spa->spa_ena = 0; 831 832 return (error); 833 } 834 835 /* 836 * Pool Open/Import 837 * 838 * The import case is identical to an open except that the configuration is sent 839 * down from userland, instead of grabbed from the configuration cache. For the 840 * case of an open, the pool configuration will exist in the 841 * POOL_STATE_UNINITIALIZED state. 842 * 843 * The stats information (gen/count/ustats) is used to gather vdev statistics at 844 * the same time open the pool, without having to keep around the spa_t in some 845 * ambiguous state. 846 */ 847 static int 848 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 849 { 850 spa_t *spa; 851 int error; 852 int loaded = B_FALSE; 853 int locked = B_FALSE; 854 855 *spapp = NULL; 856 857 /* 858 * As disgusting as this is, we need to support recursive calls to this 859 * function because dsl_dir_open() is called during spa_load(), and ends 860 * up calling spa_open() again. The real fix is to figure out how to 861 * avoid dsl_dir_open() calling this in the first place. 862 */ 863 if (mutex_owner(&spa_namespace_lock) != curthread) { 864 mutex_enter(&spa_namespace_lock); 865 locked = B_TRUE; 866 } 867 868 if ((spa = spa_lookup(pool)) == NULL) { 869 if (locked) 870 mutex_exit(&spa_namespace_lock); 871 return (ENOENT); 872 } 873 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 874 875 spa_activate(spa); 876 877 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 878 879 if (error == EBADF) { 880 /* 881 * If vdev_validate() returns failure (indicated by 882 * EBADF), it indicates that one of the vdevs indicates 883 * that the pool has been exported or destroyed. If 884 * this is the case, the config cache is out of sync and 885 * we should remove the pool from the namespace. 886 */ 887 zfs_post_ok(spa, NULL); 888 spa_unload(spa); 889 spa_deactivate(spa); 890 spa_remove(spa); 891 spa_config_sync(); 892 if (locked) 893 mutex_exit(&spa_namespace_lock); 894 return (ENOENT); 895 } 896 897 if (error) { 898 /* 899 * We can't open the pool, but we still have useful 900 * information: the state of each vdev after the 901 * attempted vdev_open(). Return this to the user. 
902 */ 903 if (config != NULL && spa->spa_root_vdev != NULL) { 904 spa_config_enter(spa, RW_READER, FTAG); 905 *config = spa_config_generate(spa, NULL, -1ULL, 906 B_TRUE); 907 spa_config_exit(spa, FTAG); 908 } 909 spa_unload(spa); 910 spa_deactivate(spa); 911 spa->spa_last_open_failed = B_TRUE; 912 if (locked) 913 mutex_exit(&spa_namespace_lock); 914 *spapp = NULL; 915 return (error); 916 } else { 917 zfs_post_ok(spa, NULL); 918 spa->spa_last_open_failed = B_FALSE; 919 } 920 921 loaded = B_TRUE; 922 } 923 924 spa_open_ref(spa, tag); 925 926 /* 927 * If we just loaded the pool, resilver anything that's out of date. 928 */ 929 if (loaded && (spa_mode & FWRITE)) 930 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 931 932 if (locked) 933 mutex_exit(&spa_namespace_lock); 934 935 *spapp = spa; 936 937 if (config != NULL) { 938 spa_config_enter(spa, RW_READER, FTAG); 939 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 940 spa_config_exit(spa, FTAG); 941 } 942 943 return (0); 944 } 945 946 int 947 spa_open(const char *name, spa_t **spapp, void *tag) 948 { 949 return (spa_open_common(name, spapp, tag, NULL)); 950 } 951 952 /* 953 * Lookup the given spa_t, incrementing the inject count in the process, 954 * preventing it from being exported or destroyed. 955 */ 956 spa_t * 957 spa_inject_addref(char *name) 958 { 959 spa_t *spa; 960 961 mutex_enter(&spa_namespace_lock); 962 if ((spa = spa_lookup(name)) == NULL) { 963 mutex_exit(&spa_namespace_lock); 964 return (NULL); 965 } 966 spa->spa_inject_ref++; 967 mutex_exit(&spa_namespace_lock); 968 969 return (spa); 970 } 971 972 void 973 spa_inject_delref(spa_t *spa) 974 { 975 mutex_enter(&spa_namespace_lock); 976 spa->spa_inject_ref--; 977 mutex_exit(&spa_namespace_lock); 978 } 979 980 static void 981 spa_add_spares(spa_t *spa, nvlist_t *config) 982 { 983 nvlist_t **spares; 984 uint_t i, nspares; 985 nvlist_t *nvroot; 986 uint64_t guid; 987 vdev_stat_t *vs; 988 uint_t vsc; 989 uint64_t pool; 990 991 if (spa->spa_nspares == 0) 992 return; 993 994 VERIFY(nvlist_lookup_nvlist(config, 995 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 996 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 997 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 998 if (nspares != 0) { 999 VERIFY(nvlist_add_nvlist_array(nvroot, 1000 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1001 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1002 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1003 1004 /* 1005 * Go through and find any spares which have since been 1006 * repurposed as an active spare. If this is the case, update 1007 * their status appropriately. 1008 */ 1009 for (i = 0; i < nspares; i++) { 1010 VERIFY(nvlist_lookup_uint64(spares[i], 1011 ZPOOL_CONFIG_GUID, &guid) == 0); 1012 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 1013 VERIFY(nvlist_lookup_uint64_array( 1014 spares[i], ZPOOL_CONFIG_STATS, 1015 (uint64_t **)&vs, &vsc) == 0); 1016 vs->vs_state = VDEV_STATE_CANT_OPEN; 1017 vs->vs_aux = VDEV_AUX_SPARED; 1018 } 1019 } 1020 } 1021 } 1022 1023 int 1024 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1025 { 1026 int error; 1027 spa_t *spa; 1028 1029 *config = NULL; 1030 error = spa_open_common(name, &spa, FTAG, config); 1031 1032 if (spa && *config != NULL) { 1033 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1034 spa_get_errlog_size(spa)) == 0); 1035 1036 spa_add_spares(spa, *config); 1037 } 1038 1039 /* 1040 * We want to get the alternate root even for faulted pools, so we cheat 1041 * and call spa_lookup() directly. 
1042 */ 1043 if (altroot) { 1044 if (spa == NULL) { 1045 mutex_enter(&spa_namespace_lock); 1046 spa = spa_lookup(name); 1047 if (spa) 1048 spa_altroot(spa, altroot, buflen); 1049 else 1050 altroot[0] = '\0'; 1051 spa = NULL; 1052 mutex_exit(&spa_namespace_lock); 1053 } else { 1054 spa_altroot(spa, altroot, buflen); 1055 } 1056 } 1057 1058 if (spa != NULL) 1059 spa_close(spa, FTAG); 1060 1061 return (error); 1062 } 1063 1064 /* 1065 * Validate that the 'spares' array is well formed. We must have an array of 1066 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1067 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1068 * as they are well-formed. 1069 */ 1070 static int 1071 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1072 { 1073 nvlist_t **spares; 1074 uint_t i, nspares; 1075 vdev_t *vd; 1076 int error; 1077 1078 /* 1079 * It's acceptable to have no spares specified. 1080 */ 1081 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1082 &spares, &nspares) != 0) 1083 return (0); 1084 1085 if (nspares == 0) 1086 return (EINVAL); 1087 1088 /* 1089 * Make sure the pool is formatted with a version that supports hot 1090 * spares. 1091 */ 1092 if (spa_version(spa) < SPA_VERSION_SPARES) 1093 return (ENOTSUP); 1094 1095 /* 1096 * Set the pending spare list so we correctly handle device in-use 1097 * checking. 1098 */ 1099 spa->spa_pending_spares = spares; 1100 spa->spa_pending_nspares = nspares; 1101 1102 for (i = 0; i < nspares; i++) { 1103 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1104 mode)) != 0) 1105 goto out; 1106 1107 if (!vd->vdev_ops->vdev_op_leaf) { 1108 vdev_free(vd); 1109 error = EINVAL; 1110 goto out; 1111 } 1112 1113 vd->vdev_top = vd; 1114 1115 if ((error = vdev_open(vd)) == 0 && 1116 (error = vdev_label_init(vd, crtxg, 1117 VDEV_LABEL_SPARE)) == 0) { 1118 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1119 vd->vdev_guid) == 0); 1120 } 1121 1122 vdev_free(vd); 1123 1124 if (error && mode != VDEV_ALLOC_SPARE) 1125 goto out; 1126 else 1127 error = 0; 1128 } 1129 1130 out: 1131 spa->spa_pending_spares = NULL; 1132 spa->spa_pending_nspares = 0; 1133 return (error); 1134 } 1135 1136 /* 1137 * Pool Creation 1138 */ 1139 int 1140 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot, 1141 const char *history_str) 1142 { 1143 spa_t *spa; 1144 vdev_t *rvd; 1145 dsl_pool_t *dp; 1146 dmu_tx_t *tx; 1147 int c, error = 0; 1148 uint64_t txg = TXG_INITIAL; 1149 nvlist_t **spares; 1150 uint_t nspares; 1151 1152 /* 1153 * If this pool already exists, return failure. 1154 */ 1155 mutex_enter(&spa_namespace_lock); 1156 if (spa_lookup(pool) != NULL) { 1157 mutex_exit(&spa_namespace_lock); 1158 return (EEXIST); 1159 } 1160 1161 /* 1162 * Allocate a new spa_t structure. 1163 */ 1164 spa = spa_add(pool, altroot); 1165 spa_activate(spa); 1166 1167 spa->spa_uberblock.ub_txg = txg - 1; 1168 spa->spa_uberblock.ub_version = SPA_VERSION; 1169 spa->spa_ubsync = spa->spa_uberblock; 1170 1171 /* 1172 * Create the root vdev. 
1173 */ 1174 spa_config_enter(spa, RW_WRITER, FTAG); 1175 1176 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1177 1178 ASSERT(error != 0 || rvd != NULL); 1179 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1180 1181 if (error == 0 && rvd->vdev_children == 0) 1182 error = EINVAL; 1183 1184 if (error == 0 && 1185 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1186 (error = spa_validate_spares(spa, nvroot, txg, 1187 VDEV_ALLOC_ADD)) == 0) { 1188 for (c = 0; c < rvd->vdev_children; c++) 1189 vdev_init(rvd->vdev_child[c], txg); 1190 vdev_config_dirty(rvd); 1191 } 1192 1193 spa_config_exit(spa, FTAG); 1194 1195 if (error != 0) { 1196 spa_unload(spa); 1197 spa_deactivate(spa); 1198 spa_remove(spa); 1199 mutex_exit(&spa_namespace_lock); 1200 return (error); 1201 } 1202 1203 /* 1204 * Get the list of spares, if specified. 1205 */ 1206 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1207 &spares, &nspares) == 0) { 1208 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1209 KM_SLEEP) == 0); 1210 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1211 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1212 spa_config_enter(spa, RW_WRITER, FTAG); 1213 spa_load_spares(spa); 1214 spa_config_exit(spa, FTAG); 1215 spa->spa_sync_spares = B_TRUE; 1216 } 1217 1218 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1219 spa->spa_meta_objset = dp->dp_meta_objset; 1220 1221 tx = dmu_tx_create_assigned(dp, txg); 1222 1223 /* 1224 * Create the pool config object. 1225 */ 1226 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1227 DMU_OT_PACKED_NVLIST, 1 << 14, 1228 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1229 1230 if (zap_add(spa->spa_meta_objset, 1231 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1232 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1233 cmn_err(CE_PANIC, "failed to add pool config"); 1234 } 1235 1236 /* Newly created pools are always deflated. */ 1237 spa->spa_deflate = TRUE; 1238 if (zap_add(spa->spa_meta_objset, 1239 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1240 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1241 cmn_err(CE_PANIC, "failed to add deflate"); 1242 } 1243 1244 /* 1245 * Create the deferred-free bplist object. Turn off compression 1246 * because sync-to-convergence takes longer if the blocksize 1247 * keeps changing. 1248 */ 1249 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1250 1 << 14, tx); 1251 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1252 ZIO_COMPRESS_OFF, tx); 1253 1254 if (zap_add(spa->spa_meta_objset, 1255 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1256 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1257 cmn_err(CE_PANIC, "failed to add bplist"); 1258 } 1259 1260 /* 1261 * Create the pool's history object. 1262 */ 1263 spa_history_create_obj(spa, tx); 1264 1265 dmu_tx_commit(tx); 1266 1267 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 1268 spa->spa_delegation = zfs_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1269 spa->spa_sync_on = B_TRUE; 1270 txg_sync_start(spa->spa_dsl_pool); 1271 1272 /* 1273 * We explicitly wait for the first transaction to complete so that our 1274 * bean counters are appropriately updated. 
1275 */ 1276 txg_wait_synced(spa->spa_dsl_pool, txg); 1277 1278 spa_config_sync(); 1279 1280 if (history_str != NULL) 1281 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 1282 1283 mutex_exit(&spa_namespace_lock); 1284 1285 return (0); 1286 } 1287 1288 /* 1289 * Import the given pool into the system. We set up the necessary spa_t and 1290 * then call spa_load() to do the dirty work. 1291 */ 1292 int 1293 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1294 { 1295 spa_t *spa; 1296 int error; 1297 nvlist_t *nvroot; 1298 nvlist_t **spares; 1299 uint_t nspares; 1300 1301 /* 1302 * If a pool with this name exists, return failure. 1303 */ 1304 mutex_enter(&spa_namespace_lock); 1305 if (spa_lookup(pool) != NULL) { 1306 mutex_exit(&spa_namespace_lock); 1307 return (EEXIST); 1308 } 1309 1310 /* 1311 * Create and initialize the spa structure. 1312 */ 1313 spa = spa_add(pool, altroot); 1314 spa_activate(spa); 1315 1316 /* 1317 * Pass off the heavy lifting to spa_load(). 1318 * Pass TRUE for mosconfig because the user-supplied config 1319 * is actually the one to trust when doing an import. 1320 */ 1321 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1322 1323 spa_config_enter(spa, RW_WRITER, FTAG); 1324 /* 1325 * Toss any existing sparelist, as it doesn't have any validity anymore, 1326 * and conflicts with spa_has_spare(). 1327 */ 1328 if (spa->spa_sparelist) { 1329 nvlist_free(spa->spa_sparelist); 1330 spa->spa_sparelist = NULL; 1331 spa_load_spares(spa); 1332 } 1333 1334 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1335 &nvroot) == 0); 1336 if (error == 0) 1337 error = spa_validate_spares(spa, nvroot, -1ULL, 1338 VDEV_ALLOC_SPARE); 1339 spa_config_exit(spa, FTAG); 1340 1341 if (error != 0) { 1342 spa_unload(spa); 1343 spa_deactivate(spa); 1344 spa_remove(spa); 1345 mutex_exit(&spa_namespace_lock); 1346 return (error); 1347 } 1348 1349 /* 1350 * Override any spares as specified by the user, as these may have 1351 * correct device names/devids, etc. 1352 */ 1353 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1354 &spares, &nspares) == 0) { 1355 if (spa->spa_sparelist) 1356 VERIFY(nvlist_remove(spa->spa_sparelist, 1357 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1358 else 1359 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1360 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1361 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1362 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1363 spa_config_enter(spa, RW_WRITER, FTAG); 1364 spa_load_spares(spa); 1365 spa_config_exit(spa, FTAG); 1366 spa->spa_sync_spares = B_TRUE; 1367 } 1368 1369 /* 1370 * Update the config cache to include the newly-imported pool. 1371 */ 1372 if (spa_mode & FWRITE) 1373 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1374 1375 /* 1376 * Resilver anything that's out of date. 1377 */ 1378 if (spa_mode & FWRITE) 1379 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1380 1381 mutex_exit(&spa_namespace_lock); 1382 1383 return (0); 1384 } 1385 1386 /* 1387 * This (illegal) pool name is used when temporarily importing a spa_t in order 1388 * to get the vdev stats associated with the imported devices. 
1389 */ 1390 #define TRYIMPORT_NAME "$import" 1391 1392 nvlist_t * 1393 spa_tryimport(nvlist_t *tryconfig) 1394 { 1395 nvlist_t *config = NULL; 1396 char *poolname; 1397 spa_t *spa; 1398 uint64_t state; 1399 1400 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1401 return (NULL); 1402 1403 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1404 return (NULL); 1405 1406 /* 1407 * Create and initialize the spa structure. 1408 */ 1409 mutex_enter(&spa_namespace_lock); 1410 spa = spa_add(TRYIMPORT_NAME, NULL); 1411 spa_activate(spa); 1412 1413 /* 1414 * Pass off the heavy lifting to spa_load(). 1415 * Pass TRUE for mosconfig because the user-supplied config 1416 * is actually the one to trust when doing an import. 1417 */ 1418 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1419 1420 /* 1421 * If 'tryconfig' was at least parsable, return the current config. 1422 */ 1423 if (spa->spa_root_vdev != NULL) { 1424 spa_config_enter(spa, RW_READER, FTAG); 1425 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1426 spa_config_exit(spa, FTAG); 1427 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1428 poolname) == 0); 1429 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1430 state) == 0); 1431 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1432 spa->spa_uberblock.ub_timestamp) == 0); 1433 1434 /* 1435 * Add the list of hot spares. 1436 */ 1437 spa_add_spares(spa, config); 1438 } 1439 1440 spa_unload(spa); 1441 spa_deactivate(spa); 1442 spa_remove(spa); 1443 mutex_exit(&spa_namespace_lock); 1444 1445 return (config); 1446 } 1447 1448 /* 1449 * Pool export/destroy 1450 * 1451 * The act of destroying or exporting a pool is very simple. We make sure there 1452 * is no more pending I/O and any references to the pool are gone. Then, we 1453 * update the pool state and sync all the labels to disk, removing the 1454 * configuration from the cache afterwards. 1455 */ 1456 static int 1457 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1458 { 1459 spa_t *spa; 1460 1461 if (oldconfig) 1462 *oldconfig = NULL; 1463 1464 if (!(spa_mode & FWRITE)) 1465 return (EROFS); 1466 1467 mutex_enter(&spa_namespace_lock); 1468 if ((spa = spa_lookup(pool)) == NULL) { 1469 mutex_exit(&spa_namespace_lock); 1470 return (ENOENT); 1471 } 1472 1473 /* 1474 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1475 * reacquire the namespace lock, and see if we can export. 1476 */ 1477 spa_open_ref(spa, FTAG); 1478 mutex_exit(&spa_namespace_lock); 1479 spa_async_suspend(spa); 1480 mutex_enter(&spa_namespace_lock); 1481 spa_close(spa, FTAG); 1482 1483 /* 1484 * The pool will be in core if it's openable, 1485 * in which case we can modify its state. 1486 */ 1487 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1488 /* 1489 * Objsets may be open only because they're dirty, so we 1490 * have to force it to sync before checking spa_refcnt. 1491 */ 1492 spa_scrub_suspend(spa); 1493 txg_wait_synced(spa->spa_dsl_pool, 0); 1494 1495 /* 1496 * A pool cannot be exported or destroyed if there are active 1497 * references. If we are resetting a pool, allow references by 1498 * fault injection handlers. 
1499 */ 1500 if (!spa_refcount_zero(spa) || 1501 (spa->spa_inject_ref != 0 && 1502 new_state != POOL_STATE_UNINITIALIZED)) { 1503 spa_scrub_resume(spa); 1504 spa_async_resume(spa); 1505 mutex_exit(&spa_namespace_lock); 1506 return (EBUSY); 1507 } 1508 1509 spa_scrub_resume(spa); 1510 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1511 1512 /* 1513 * We want this to be reflected on every label, 1514 * so mark them all dirty. spa_unload() will do the 1515 * final sync that pushes these changes out. 1516 */ 1517 if (new_state != POOL_STATE_UNINITIALIZED) { 1518 spa_config_enter(spa, RW_WRITER, FTAG); 1519 spa->spa_state = new_state; 1520 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1521 vdev_config_dirty(spa->spa_root_vdev); 1522 spa_config_exit(spa, FTAG); 1523 } 1524 } 1525 1526 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 1527 1528 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1529 spa_unload(spa); 1530 spa_deactivate(spa); 1531 } 1532 1533 if (oldconfig && spa->spa_config) 1534 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1535 1536 if (new_state != POOL_STATE_UNINITIALIZED) { 1537 spa_remove(spa); 1538 spa_config_sync(); 1539 } 1540 mutex_exit(&spa_namespace_lock); 1541 1542 return (0); 1543 } 1544 1545 /* 1546 * Destroy a storage pool. 1547 */ 1548 int 1549 spa_destroy(char *pool) 1550 { 1551 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1552 } 1553 1554 /* 1555 * Export a storage pool. 1556 */ 1557 int 1558 spa_export(char *pool, nvlist_t **oldconfig) 1559 { 1560 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1561 } 1562 1563 /* 1564 * Similar to spa_export(), this unloads the spa_t without actually removing it 1565 * from the namespace in any way. 1566 */ 1567 int 1568 spa_reset(char *pool) 1569 { 1570 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1571 } 1572 1573 1574 /* 1575 * ========================================================================== 1576 * Device manipulation 1577 * ========================================================================== 1578 */ 1579 1580 /* 1581 * Add a device to a storage pool. 1582 */ 1583 int 1584 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1585 { 1586 uint64_t txg; 1587 int c, error; 1588 vdev_t *rvd = spa->spa_root_vdev; 1589 vdev_t *vd, *tvd; 1590 nvlist_t **spares; 1591 uint_t i, nspares; 1592 1593 txg = spa_vdev_enter(spa); 1594 1595 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1596 VDEV_ALLOC_ADD)) != 0) 1597 return (spa_vdev_exit(spa, NULL, txg, error)); 1598 1599 spa->spa_pending_vdev = vd; 1600 1601 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1602 &spares, &nspares) != 0) 1603 nspares = 0; 1604 1605 if (vd->vdev_children == 0 && nspares == 0) { 1606 spa->spa_pending_vdev = NULL; 1607 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1608 } 1609 1610 if (vd->vdev_children != 0) { 1611 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1612 spa->spa_pending_vdev = NULL; 1613 return (spa_vdev_exit(spa, vd, txg, error)); 1614 } 1615 } 1616 1617 /* 1618 * We must validate the spares after checking the children. Otherwise, 1619 * vdev_inuse() will blindly overwrite the spare. 1620 */ 1621 if ((error = spa_validate_spares(spa, nvroot, txg, 1622 VDEV_ALLOC_ADD)) != 0) { 1623 spa->spa_pending_vdev = NULL; 1624 return (spa_vdev_exit(spa, vd, txg, error)); 1625 } 1626 1627 spa->spa_pending_vdev = NULL; 1628 1629 /* 1630 * Transfer each new top-level vdev from vd to rvd. 
1631 */ 1632 for (c = 0; c < vd->vdev_children; c++) { 1633 tvd = vd->vdev_child[c]; 1634 vdev_remove_child(vd, tvd); 1635 tvd->vdev_id = rvd->vdev_children; 1636 vdev_add_child(rvd, tvd); 1637 vdev_config_dirty(tvd); 1638 } 1639 1640 if (nspares != 0) { 1641 if (spa->spa_sparelist != NULL) { 1642 nvlist_t **oldspares; 1643 uint_t oldnspares; 1644 nvlist_t **newspares; 1645 1646 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1647 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1648 1649 newspares = kmem_alloc(sizeof (void *) * 1650 (nspares + oldnspares), KM_SLEEP); 1651 for (i = 0; i < oldnspares; i++) 1652 VERIFY(nvlist_dup(oldspares[i], 1653 &newspares[i], KM_SLEEP) == 0); 1654 for (i = 0; i < nspares; i++) 1655 VERIFY(nvlist_dup(spares[i], 1656 &newspares[i + oldnspares], 1657 KM_SLEEP) == 0); 1658 1659 VERIFY(nvlist_remove(spa->spa_sparelist, 1660 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1661 1662 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1663 ZPOOL_CONFIG_SPARES, newspares, 1664 nspares + oldnspares) == 0); 1665 for (i = 0; i < oldnspares + nspares; i++) 1666 nvlist_free(newspares[i]); 1667 kmem_free(newspares, (oldnspares + nspares) * 1668 sizeof (void *)); 1669 } else { 1670 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1671 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1672 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1673 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1674 } 1675 1676 spa_load_spares(spa); 1677 spa->spa_sync_spares = B_TRUE; 1678 } 1679 1680 /* 1681 * We have to be careful when adding new vdevs to an existing pool. 1682 * If other threads start allocating from these vdevs before we 1683 * sync the config cache, and we lose power, then upon reboot we may 1684 * fail to open the pool because there are DVAs that the config cache 1685 * can't translate. Therefore, we first add the vdevs without 1686 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1687 * and then let spa_config_update() initialize the new metaslabs. 1688 * 1689 * spa_load() checks for added-but-not-initialized vdevs, so that 1690 * if we lose power at any point in this sequence, the remaining 1691 * steps will be completed the next time we load the pool. 1692 */ 1693 (void) spa_vdev_exit(spa, vd, txg, 0); 1694 1695 mutex_enter(&spa_namespace_lock); 1696 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1697 mutex_exit(&spa_namespace_lock); 1698 1699 return (0); 1700 } 1701 1702 /* 1703 * Attach a device to a mirror. The arguments are the path to any device 1704 * in the mirror, and the nvroot for the new device. If the path specifies 1705 * a device that is not mirrored, we automatically insert the mirror vdev. 1706 * 1707 * If 'replacing' is specified, the new device is intended to replace the 1708 * existing device; in this case the two devices are made into their own 1709 * mirror using the 'replacing' vdev, which is functionally identical to 1710 * the mirror vdev (it actually reuses all the same ops) but has a few 1711 * extra rules: you can't attach to it after it's been created, and upon 1712 * completion of resilvering, the first disk (the one being replaced) 1713 * is automatically detached. 
1714 */ 1715 int 1716 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1717 { 1718 uint64_t txg, open_txg; 1719 int error; 1720 vdev_t *rvd = spa->spa_root_vdev; 1721 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1722 vdev_ops_t *pvops; 1723 int is_log; 1724 1725 txg = spa_vdev_enter(spa); 1726 1727 oldvd = vdev_lookup_by_guid(rvd, guid); 1728 1729 if (oldvd == NULL) 1730 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1731 1732 if (!oldvd->vdev_ops->vdev_op_leaf) 1733 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1734 1735 pvd = oldvd->vdev_parent; 1736 1737 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1738 VDEV_ALLOC_ADD)) != 0) 1739 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 1740 1741 if (newrootvd->vdev_children != 1) 1742 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1743 1744 newvd = newrootvd->vdev_child[0]; 1745 1746 if (!newvd->vdev_ops->vdev_op_leaf) 1747 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1748 1749 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1750 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1751 1752 /* 1753 * Spares can't replace logs 1754 */ 1755 is_log = oldvd->vdev_islog; 1756 if (is_log && newvd->vdev_isspare) 1757 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1758 1759 if (!replacing) { 1760 /* 1761 * For attach, the only allowable parent is a mirror or the root 1762 * vdev. 1763 */ 1764 if (pvd->vdev_ops != &vdev_mirror_ops && 1765 pvd->vdev_ops != &vdev_root_ops) 1766 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1767 1768 pvops = &vdev_mirror_ops; 1769 } else { 1770 /* 1771 * Active hot spares can only be replaced by inactive hot 1772 * spares. 1773 */ 1774 if (pvd->vdev_ops == &vdev_spare_ops && 1775 pvd->vdev_child[1] == oldvd && 1776 !spa_has_spare(spa, newvd->vdev_guid)) 1777 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1778 1779 /* 1780 * If the source is a hot spare, and the parent isn't already a 1781 * spare, then we want to create a new hot spare. Otherwise, we 1782 * want to create a replacing vdev. The user is not allowed to 1783 * attach to a spared vdev child unless the 'isspare' state is 1784 * the same (spare replaces spare, non-spare replaces 1785 * non-spare). 1786 */ 1787 if (pvd->vdev_ops == &vdev_replacing_ops) 1788 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1789 else if (pvd->vdev_ops == &vdev_spare_ops && 1790 newvd->vdev_isspare != oldvd->vdev_isspare) 1791 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1792 else if (pvd->vdev_ops != &vdev_spare_ops && 1793 newvd->vdev_isspare) 1794 pvops = &vdev_spare_ops; 1795 else 1796 pvops = &vdev_replacing_ops; 1797 } 1798 1799 /* 1800 * Compare the new device size with the replaceable/attachable 1801 * device size. 1802 */ 1803 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1804 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1805 1806 /* 1807 * The new device cannot have a higher alignment requirement 1808 * than the top-level vdev. 1809 */ 1810 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1811 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1812 1813 /* 1814 * If this is an in-place replacement, update oldvd's path and devid 1815 * to make it distinguishable from newvd, and unopenable from now on. 
1816 */ 1817 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1818 spa_strfree(oldvd->vdev_path); 1819 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1820 KM_SLEEP); 1821 (void) sprintf(oldvd->vdev_path, "%s/%s", 1822 newvd->vdev_path, "old"); 1823 if (oldvd->vdev_devid != NULL) { 1824 spa_strfree(oldvd->vdev_devid); 1825 oldvd->vdev_devid = NULL; 1826 } 1827 } 1828 1829 /* 1830 * If the parent is not a mirror, or if we're replacing, insert the new 1831 * mirror/replacing/spare vdev above oldvd. 1832 */ 1833 if (pvd->vdev_ops != pvops) 1834 pvd = vdev_add_parent(oldvd, pvops); 1835 1836 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1837 ASSERT(pvd->vdev_ops == pvops); 1838 ASSERT(oldvd->vdev_parent == pvd); 1839 1840 /* 1841 * Extract the new device from its root and add it to pvd. 1842 */ 1843 vdev_remove_child(newrootvd, newvd); 1844 newvd->vdev_id = pvd->vdev_children; 1845 vdev_add_child(pvd, newvd); 1846 1847 /* 1848 * If newvd is smaller than oldvd, but larger than its rsize, 1849 * the addition of newvd may have decreased our parent's asize. 1850 */ 1851 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1852 1853 tvd = newvd->vdev_top; 1854 ASSERT(pvd->vdev_top == tvd); 1855 ASSERT(tvd->vdev_parent == rvd); 1856 1857 vdev_config_dirty(tvd); 1858 1859 /* 1860 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1861 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1862 */ 1863 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1864 1865 mutex_enter(&newvd->vdev_dtl_lock); 1866 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1867 open_txg - TXG_INITIAL + 1); 1868 mutex_exit(&newvd->vdev_dtl_lock); 1869 1870 if (newvd->vdev_isspare) 1871 spa_spare_activate(newvd); 1872 1873 /* 1874 * Mark newvd's DTL dirty in this txg. 1875 */ 1876 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1877 1878 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1879 1880 /* 1881 * Kick off a resilver to update newvd. We need to grab the namespace 1882 * lock because spa_scrub() needs to post a sysevent with the pool name. 1883 */ 1884 mutex_enter(&spa_namespace_lock); 1885 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1886 mutex_exit(&spa_namespace_lock); 1887 1888 return (0); 1889 } 1890 1891 /* 1892 * Detach a device from a mirror or replacing vdev. 1893 * If 'replace_done' is specified, only detach if the parent 1894 * is a replacing vdev. 1895 */ 1896 int 1897 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1898 { 1899 uint64_t txg; 1900 int c, t, error; 1901 vdev_t *rvd = spa->spa_root_vdev; 1902 vdev_t *vd, *pvd, *cvd, *tvd; 1903 boolean_t unspare = B_FALSE; 1904 uint64_t unspare_guid; 1905 1906 txg = spa_vdev_enter(spa); 1907 1908 vd = vdev_lookup_by_guid(rvd, guid); 1909 1910 if (vd == NULL) 1911 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1912 1913 if (!vd->vdev_ops->vdev_op_leaf) 1914 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1915 1916 pvd = vd->vdev_parent; 1917 1918 /* 1919 * If replace_done is specified, only remove this device if it's 1920 * the first child of a replacing vdev. For the 'spare' vdev, either 1921 * disk can be removed. 
1922 */ 1923 if (replace_done) { 1924 if (pvd->vdev_ops == &vdev_replacing_ops) { 1925 if (vd->vdev_id != 0) 1926 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1927 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1928 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1929 } 1930 } 1931 1932 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1933 spa_version(spa) >= SPA_VERSION_SPARES); 1934 1935 /* 1936 * Only mirror, replacing, and spare vdevs support detach. 1937 */ 1938 if (pvd->vdev_ops != &vdev_replacing_ops && 1939 pvd->vdev_ops != &vdev_mirror_ops && 1940 pvd->vdev_ops != &vdev_spare_ops) 1941 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1942 1943 /* 1944 * If there's only one replica, you can't detach it. 1945 */ 1946 if (pvd->vdev_children <= 1) 1947 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1948 1949 /* 1950 * If all siblings have non-empty DTLs, this device may have the only 1951 * valid copy of the data, which means we cannot safely detach it. 1952 * 1953 * XXX -- as in the vdev_offline() case, we really want a more 1954 * precise DTL check. 1955 */ 1956 for (c = 0; c < pvd->vdev_children; c++) { 1957 uint64_t dirty; 1958 1959 cvd = pvd->vdev_child[c]; 1960 if (cvd == vd) 1961 continue; 1962 if (vdev_is_dead(cvd)) 1963 continue; 1964 mutex_enter(&cvd->vdev_dtl_lock); 1965 dirty = cvd->vdev_dtl_map.sm_space | 1966 cvd->vdev_dtl_scrub.sm_space; 1967 mutex_exit(&cvd->vdev_dtl_lock); 1968 if (!dirty) 1969 break; 1970 } 1971 1972 /* 1973 * If we are a replacing or spare vdev, then we can always detach the 1974 * latter child, as that is how one cancels the operation. 1975 */ 1976 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1977 c == pvd->vdev_children) 1978 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1979 1980 /* 1981 * If we are detaching the original disk from a spare, then it implies 1982 * that the spare should become a real disk, and be removed from the 1983 * active spare list for the pool. 1984 */ 1985 if (pvd->vdev_ops == &vdev_spare_ops && 1986 vd->vdev_id == 0) 1987 unspare = B_TRUE; 1988 1989 /* 1990 * Erase the disk labels so the disk can be used for other things. 1991 * This must be done after all other error cases are handled, 1992 * but before we disembowel vd (so we can still do I/O to it). 1993 * But if we can't do it, don't treat the error as fatal -- 1994 * it may be that the unwritability of the disk is the reason 1995 * it's being detached! 1996 */ 1997 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1998 1999 /* 2000 * Remove vd from its parent and compact the parent's children. 2001 */ 2002 vdev_remove_child(pvd, vd); 2003 vdev_compact_children(pvd); 2004 2005 /* 2006 * Remember one of the remaining children so we can get tvd below. 2007 */ 2008 cvd = pvd->vdev_child[0]; 2009 2010 /* 2011 * If we need to remove the remaining child from the list of hot spares, 2012 * do it now, marking the vdev as no longer a spare in the process. We 2013 * must do this before vdev_remove_parent(), because that can change the 2014 * GUID if it creates a new toplevel GUID. 2015 */ 2016 if (unspare) { 2017 ASSERT(cvd->vdev_isspare); 2018 spa_spare_remove(cvd); 2019 unspare_guid = cvd->vdev_guid; 2020 } 2021 2022 /* 2023 * If the parent mirror/replacing vdev only has one child, 2024 * the parent is no longer needed. Remove it from the tree. 2025 */ 2026 if (pvd->vdev_children == 1) 2027 vdev_remove_parent(cvd); 2028 2029 /* 2030 * We don't set tvd until now because the parent we just removed 2031 * may have been the previous top-level vdev. 
2032 */ 2033 tvd = cvd->vdev_top; 2034 ASSERT(tvd->vdev_parent == rvd); 2035 2036 /* 2037 * Reevaluate the parent vdev state. 2038 */ 2039 vdev_propagate_state(cvd); 2040 2041 /* 2042 * If the device we just detached was smaller than the others, it may be 2043 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2044 * can't fail because the existing metaslabs are already in core, so 2045 * there's nothing to read from disk. 2046 */ 2047 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2048 2049 vdev_config_dirty(tvd); 2050 2051 /* 2052 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2053 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2054 * But first make sure we're not on any *other* txg's DTL list, to 2055 * prevent vd from being accessed after it's freed. 2056 */ 2057 for (t = 0; t < TXG_SIZE; t++) 2058 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2059 vd->vdev_detached = B_TRUE; 2060 vdev_dirty(tvd, VDD_DTL, vd, txg); 2061 2062 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 2063 2064 error = spa_vdev_exit(spa, vd, txg, 0); 2065 2066 /* 2067 * If this was the removal of the original device in a hot spare vdev, 2068 * then we want to go through and remove the device from the hot spare 2069 * list of every other pool. 2070 */ 2071 if (unspare) { 2072 spa = NULL; 2073 mutex_enter(&spa_namespace_lock); 2074 while ((spa = spa_next(spa)) != NULL) { 2075 if (spa->spa_state != POOL_STATE_ACTIVE) 2076 continue; 2077 2078 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2079 } 2080 mutex_exit(&spa_namespace_lock); 2081 } 2082 2083 return (error); 2084 } 2085 2086 /* 2087 * Remove a device from the pool. Currently, this supports removing only hot 2088 * spares. 2089 */ 2090 int 2091 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2092 { 2093 vdev_t *vd; 2094 nvlist_t **spares, *nv, **newspares; 2095 uint_t i, j, nspares; 2096 int ret = 0; 2097 2098 spa_config_enter(spa, RW_WRITER, FTAG); 2099 2100 vd = spa_lookup_by_guid(spa, guid); 2101 2102 nv = NULL; 2103 if (spa->spa_spares != NULL && 2104 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2105 &spares, &nspares) == 0) { 2106 for (i = 0; i < nspares; i++) { 2107 uint64_t theguid; 2108 2109 VERIFY(nvlist_lookup_uint64(spares[i], 2110 ZPOOL_CONFIG_GUID, &theguid) == 0); 2111 if (theguid == guid) { 2112 nv = spares[i]; 2113 break; 2114 } 2115 } 2116 } 2117 2118 /* 2119 * We only support removing a hot spare, and only if it's not currently 2120 * in use in this pool. 
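 * The checks below distinguish three failure cases: the guid matches
 * neither a vdev nor a spare (ENOENT), it matches a normal pool vdev
 * rather than a spare (ENOTSUP), or it matches a spare that is still
 * active in this pool (EBUSY, unless 'unspare' is set).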
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_unspare &&
		    newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			newvd->vdev_unspare = 0;
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
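		 * In that case the subtree looks like spare(replacing(old,
		 * new), hot spare): pvd is the replacing vdev, its parent is
		 * the spare vdev, and child 1 of that spare vdev is the hot
		 * spare whose guid we record in pguid so it is detached in a
		 * second pass below.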
2234 */ 2235 pvd = vd->vdev_parent; 2236 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2237 pvd->vdev_id == 0) { 2238 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2239 ASSERT(pvd->vdev_parent->vdev_children == 2); 2240 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2241 } 2242 spa_config_exit(spa, FTAG); 2243 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2244 return; 2245 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2246 return; 2247 spa_config_enter(spa, RW_READER, FTAG); 2248 } 2249 2250 spa_config_exit(spa, FTAG); 2251 } 2252 2253 /* 2254 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2255 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2256 */ 2257 int 2258 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2259 { 2260 vdev_t *rvd, *vd; 2261 uint64_t txg; 2262 2263 rvd = spa->spa_root_vdev; 2264 2265 txg = spa_vdev_enter(spa); 2266 2267 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2268 /* 2269 * Determine if this is a reference to a hot spare. In that 2270 * case, update the path as stored in the spare list. 2271 */ 2272 nvlist_t **spares; 2273 uint_t i, nspares; 2274 if (spa->spa_sparelist != NULL) { 2275 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2276 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2277 for (i = 0; i < nspares; i++) { 2278 uint64_t theguid; 2279 VERIFY(nvlist_lookup_uint64(spares[i], 2280 ZPOOL_CONFIG_GUID, &theguid) == 0); 2281 if (theguid == guid) 2282 break; 2283 } 2284 2285 if (i == nspares) 2286 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2287 2288 VERIFY(nvlist_add_string(spares[i], 2289 ZPOOL_CONFIG_PATH, newpath) == 0); 2290 spa_load_spares(spa); 2291 spa->spa_sync_spares = B_TRUE; 2292 return (spa_vdev_exit(spa, NULL, txg, 0)); 2293 } else { 2294 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2295 } 2296 } 2297 2298 if (!vd->vdev_ops->vdev_op_leaf) 2299 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2300 2301 spa_strfree(vd->vdev_path); 2302 vd->vdev_path = spa_strdup(newpath); 2303 2304 vdev_config_dirty(vd->vdev_top); 2305 2306 return (spa_vdev_exit(spa, NULL, txg, 0)); 2307 } 2308 2309 /* 2310 * ========================================================================== 2311 * SPA Scrubbing 2312 * ========================================================================== 2313 */ 2314 2315 static void 2316 spa_scrub_io_done(zio_t *zio) 2317 { 2318 spa_t *spa = zio->io_spa; 2319 2320 arc_data_buf_free(zio->io_data, zio->io_size); 2321 2322 mutex_enter(&spa->spa_scrub_lock); 2323 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2324 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2325 spa->spa_scrub_errors++; 2326 mutex_enter(&vd->vdev_stat_lock); 2327 vd->vdev_stat.vs_scrub_errors++; 2328 mutex_exit(&vd->vdev_stat_lock); 2329 } 2330 2331 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2332 cv_broadcast(&spa->spa_scrub_io_cv); 2333 2334 ASSERT(spa->spa_scrub_inflight >= 0); 2335 2336 mutex_exit(&spa->spa_scrub_lock); 2337 } 2338 2339 static void 2340 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2341 zbookmark_t *zb) 2342 { 2343 size_t size = BP_GET_LSIZE(bp); 2344 void *data; 2345 2346 mutex_enter(&spa->spa_scrub_lock); 2347 /* 2348 * Do not give too much work to vdev(s). 
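 * spa_scrub_inflight is bounded by spa_scrub_maxinflight; we block on
 * spa_scrub_io_cv until spa_scrub_io_done() has retired enough I/Os.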
2349 */ 2350 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2351 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2352 } 2353 spa->spa_scrub_inflight++; 2354 mutex_exit(&spa->spa_scrub_lock); 2355 2356 data = arc_data_buf_alloc(size); 2357 2358 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2359 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2360 2361 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2362 2363 zio_nowait(zio_read(NULL, spa, bp, data, size, 2364 spa_scrub_io_done, NULL, priority, flags, zb)); 2365 } 2366 2367 /* ARGSUSED */ 2368 static int 2369 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2370 { 2371 blkptr_t *bp = &bc->bc_blkptr; 2372 vdev_t *vd = spa->spa_root_vdev; 2373 dva_t *dva = bp->blk_dva; 2374 int needs_resilver = B_FALSE; 2375 int d; 2376 2377 if (bc->bc_errno) { 2378 /* 2379 * We can't scrub this block, but we can continue to scrub 2380 * the rest of the pool. Note the error and move along. 2381 */ 2382 mutex_enter(&spa->spa_scrub_lock); 2383 spa->spa_scrub_errors++; 2384 mutex_exit(&spa->spa_scrub_lock); 2385 2386 mutex_enter(&vd->vdev_stat_lock); 2387 vd->vdev_stat.vs_scrub_errors++; 2388 mutex_exit(&vd->vdev_stat_lock); 2389 2390 return (ERESTART); 2391 } 2392 2393 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2394 2395 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2396 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2397 2398 ASSERT(vd != NULL); 2399 2400 /* 2401 * Keep track of how much data we've examined so that 2402 * zpool(1M) status can make useful progress reports. 2403 */ 2404 mutex_enter(&vd->vdev_stat_lock); 2405 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2406 mutex_exit(&vd->vdev_stat_lock); 2407 2408 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2409 if (DVA_GET_GANG(&dva[d])) { 2410 /* 2411 * Gang members may be spread across multiple 2412 * vdevs, so the best we can do is look at the 2413 * pool-wide DTL. 2414 * XXX -- it would be better to change our 2415 * allocation policy to ensure that this can't 2416 * happen. 2417 */ 2418 vd = spa->spa_root_vdev; 2419 } 2420 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2421 bp->blk_birth, 1)) 2422 needs_resilver = B_TRUE; 2423 } 2424 } 2425 2426 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2427 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2428 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2429 else if (needs_resilver) 2430 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2431 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2432 2433 return (0); 2434 } 2435 2436 static void 2437 spa_scrub_thread(spa_t *spa) 2438 { 2439 callb_cpr_t cprinfo; 2440 traverse_handle_t *th = spa->spa_scrub_th; 2441 vdev_t *rvd = spa->spa_root_vdev; 2442 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2443 int error = 0; 2444 boolean_t complete; 2445 2446 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2447 2448 /* 2449 * If we're restarting due to a snapshot create/delete, 2450 * wait for that to complete. 2451 */ 2452 txg_wait_synced(spa_get_dsl(spa), 0); 2453 2454 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2455 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2456 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2457 2458 spa_config_enter(spa, RW_WRITER, FTAG); 2459 vdev_reopen(rvd); /* purge all vdev caches */ 2460 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2461 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2462 spa_config_exit(spa, FTAG); 2463 2464 mutex_enter(&spa->spa_scrub_lock); 2465 spa->spa_scrub_errors = 0; 2466 spa->spa_scrub_active = 1; 2467 ASSERT(spa->spa_scrub_inflight == 0); 2468 2469 while (!spa->spa_scrub_stop) { 2470 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2471 while (spa->spa_scrub_suspended) { 2472 spa->spa_scrub_active = 0; 2473 cv_broadcast(&spa->spa_scrub_cv); 2474 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2475 spa->spa_scrub_active = 1; 2476 } 2477 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2478 2479 if (spa->spa_scrub_restart_txg != 0) 2480 break; 2481 2482 mutex_exit(&spa->spa_scrub_lock); 2483 error = traverse_more(th); 2484 mutex_enter(&spa->spa_scrub_lock); 2485 if (error != EAGAIN) 2486 break; 2487 } 2488 2489 while (spa->spa_scrub_inflight) 2490 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2491 2492 spa->spa_scrub_active = 0; 2493 cv_broadcast(&spa->spa_scrub_cv); 2494 2495 mutex_exit(&spa->spa_scrub_lock); 2496 2497 spa_config_enter(spa, RW_WRITER, FTAG); 2498 2499 mutex_enter(&spa->spa_scrub_lock); 2500 2501 /* 2502 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2503 * AND the spa config lock to synchronize with any config changes 2504 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2505 */ 2506 if (spa->spa_scrub_restart_txg != 0) 2507 error = ERESTART; 2508 2509 if (spa->spa_scrub_stop) 2510 error = EINTR; 2511 2512 /* 2513 * Even if there were uncorrectable errors, we consider the scrub 2514 * completed. The downside is that if there is a transient error during 2515 * a resilver, we won't resilver the data properly to the target. But 2516 * if the damage is permanent (more likely) we will resilver forever, 2517 * which isn't really acceptable. Since there is enough information for 2518 * the user to know what has failed and why, this seems like a more 2519 * tractable approach. 2520 */ 2521 complete = (error == 0); 2522 2523 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2524 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2525 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2526 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2527 2528 mutex_exit(&spa->spa_scrub_lock); 2529 2530 /* 2531 * If the scrub/resilver completed, update all DTLs to reflect this. 2532 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2533 */ 2534 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2535 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2536 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2537 spa_errlog_rotate(spa); 2538 2539 if (scrub_type == POOL_SCRUB_RESILVER && complete) 2540 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 2541 2542 spa_config_exit(spa, FTAG); 2543 2544 mutex_enter(&spa->spa_scrub_lock); 2545 2546 /* 2547 * We may have finished replacing a device. 2548 * Let the async thread assess this and handle the detach. 2549 */ 2550 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2551 2552 /* 2553 * If we were told to restart, our final act is to start a new scrub. 2554 */ 2555 if (error == ERESTART) 2556 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2557 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2558 2559 spa->spa_scrub_type = POOL_SCRUB_NONE; 2560 spa->spa_scrub_active = 0; 2561 spa->spa_scrub_thread = NULL; 2562 cv_broadcast(&spa->spa_scrub_cv); 2563 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2564 thread_exit(); 2565 } 2566 2567 void 2568 spa_scrub_suspend(spa_t *spa) 2569 { 2570 mutex_enter(&spa->spa_scrub_lock); 2571 spa->spa_scrub_suspended++; 2572 while (spa->spa_scrub_active) { 2573 cv_broadcast(&spa->spa_scrub_cv); 2574 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2575 } 2576 while (spa->spa_scrub_inflight) 2577 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2578 mutex_exit(&spa->spa_scrub_lock); 2579 } 2580 2581 void 2582 spa_scrub_resume(spa_t *spa) 2583 { 2584 mutex_enter(&spa->spa_scrub_lock); 2585 ASSERT(spa->spa_scrub_suspended != 0); 2586 if (--spa->spa_scrub_suspended == 0) 2587 cv_broadcast(&spa->spa_scrub_cv); 2588 mutex_exit(&spa->spa_scrub_lock); 2589 } 2590 2591 void 2592 spa_scrub_restart(spa_t *spa, uint64_t txg) 2593 { 2594 /* 2595 * Something happened (e.g. snapshot create/delete) that means 2596 * we must restart any in-progress scrubs. The itinerary will 2597 * fix this properly. 2598 */ 2599 mutex_enter(&spa->spa_scrub_lock); 2600 spa->spa_scrub_restart_txg = txg; 2601 mutex_exit(&spa->spa_scrub_lock); 2602 } 2603 2604 int 2605 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2606 { 2607 space_seg_t *ss; 2608 uint64_t mintxg, maxtxg; 2609 vdev_t *rvd = spa->spa_root_vdev; 2610 2611 if ((uint_t)type >= POOL_SCRUB_TYPES) 2612 return (ENOTSUP); 2613 2614 mutex_enter(&spa->spa_scrub_lock); 2615 2616 /* 2617 * If there's a scrub or resilver already in progress, stop it. 2618 */ 2619 while (spa->spa_scrub_thread != NULL) { 2620 /* 2621 * Don't stop a resilver unless forced. 2622 */ 2623 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2624 mutex_exit(&spa->spa_scrub_lock); 2625 return (EBUSY); 2626 } 2627 spa->spa_scrub_stop = 1; 2628 cv_broadcast(&spa->spa_scrub_cv); 2629 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2630 } 2631 2632 /* 2633 * Terminate the previous traverse. 2634 */ 2635 if (spa->spa_scrub_th != NULL) { 2636 traverse_fini(spa->spa_scrub_th); 2637 spa->spa_scrub_th = NULL; 2638 } 2639 2640 if (rvd == NULL) { 2641 ASSERT(spa->spa_scrub_stop == 0); 2642 ASSERT(spa->spa_scrub_type == type); 2643 ASSERT(spa->spa_scrub_restart_txg == 0); 2644 mutex_exit(&spa->spa_scrub_lock); 2645 return (0); 2646 } 2647 2648 mintxg = TXG_INITIAL - 1; 2649 maxtxg = spa_last_synced_txg(spa) + 1; 2650 2651 mutex_enter(&rvd->vdev_dtl_lock); 2652 2653 if (rvd->vdev_dtl_map.sm_space == 0) { 2654 /* 2655 * The pool-wide DTL is empty. 2656 * If this is a resilver, there's nothing to do except 2657 * check whether any in-progress replacements have completed. 2658 */ 2659 if (type == POOL_SCRUB_RESILVER) { 2660 type = POOL_SCRUB_NONE; 2661 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2662 } 2663 } else { 2664 /* 2665 * The pool-wide DTL is non-empty. 2666 * If this is a normal scrub, upgrade to a resilver instead. 2667 */ 2668 if (type == POOL_SCRUB_EVERYTHING) 2669 type = POOL_SCRUB_RESILVER; 2670 } 2671 2672 if (type == POOL_SCRUB_RESILVER) { 2673 /* 2674 * Determine the resilvering boundaries. 2675 * 2676 * Note: (mintxg, maxtxg) is an open interval, 2677 * i.e. mintxg and maxtxg themselves are not included. 2678 * 2679 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2680 * so we don't claim to resilver a txg that's still changing. 
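 * For example, if the pool-wide DTL covers txgs 100-150 but the last
 * synced txg is 140, we end up with (mintxg, maxtxg) = (99, 141) and
 * resilver txgs 100 through 140 inclusive.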
2681 */ 2682 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2683 mintxg = ss->ss_start - 1; 2684 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2685 maxtxg = MIN(ss->ss_end, maxtxg); 2686 2687 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 2688 } 2689 2690 mutex_exit(&rvd->vdev_dtl_lock); 2691 2692 spa->spa_scrub_stop = 0; 2693 spa->spa_scrub_type = type; 2694 spa->spa_scrub_restart_txg = 0; 2695 2696 if (type != POOL_SCRUB_NONE) { 2697 spa->spa_scrub_mintxg = mintxg; 2698 spa->spa_scrub_maxtxg = maxtxg; 2699 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2700 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2701 ZIO_FLAG_CANFAIL); 2702 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2703 spa->spa_scrub_thread = thread_create(NULL, 0, 2704 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2705 } 2706 2707 mutex_exit(&spa->spa_scrub_lock); 2708 2709 return (0); 2710 } 2711 2712 /* 2713 * ========================================================================== 2714 * SPA async task processing 2715 * ========================================================================== 2716 */ 2717 2718 static void 2719 spa_async_remove(spa_t *spa, vdev_t *vd) 2720 { 2721 vdev_t *tvd; 2722 int c; 2723 2724 for (c = 0; c < vd->vdev_children; c++) { 2725 tvd = vd->vdev_child[c]; 2726 if (tvd->vdev_remove_wanted) { 2727 tvd->vdev_remove_wanted = 0; 2728 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 2729 VDEV_AUX_NONE); 2730 vdev_clear(spa, tvd); 2731 vdev_config_dirty(tvd->vdev_top); 2732 } 2733 spa_async_remove(spa, tvd); 2734 } 2735 } 2736 2737 static void 2738 spa_async_thread(spa_t *spa) 2739 { 2740 int tasks; 2741 uint64_t txg; 2742 2743 ASSERT(spa->spa_sync_on); 2744 2745 mutex_enter(&spa->spa_async_lock); 2746 tasks = spa->spa_async_tasks; 2747 spa->spa_async_tasks = 0; 2748 mutex_exit(&spa->spa_async_lock); 2749 2750 /* 2751 * See if the config needs to be updated. 2752 */ 2753 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2754 mutex_enter(&spa_namespace_lock); 2755 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2756 mutex_exit(&spa_namespace_lock); 2757 } 2758 2759 /* 2760 * See if any devices need to be marked REMOVED. 2761 */ 2762 if (tasks & SPA_ASYNC_REMOVE) { 2763 txg = spa_vdev_enter(spa); 2764 spa_async_remove(spa, spa->spa_root_vdev); 2765 (void) spa_vdev_exit(spa, NULL, txg, 0); 2766 } 2767 2768 /* 2769 * If any devices are done replacing, detach them. 2770 */ 2771 if (tasks & SPA_ASYNC_RESILVER_DONE) 2772 spa_vdev_resilver_done(spa); 2773 2774 /* 2775 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 2776 * scrub which can become a resilver), we need to hold 2777 * spa_namespace_lock() because the sysevent we post via 2778 * spa_event_notify() needs to get the name of the pool. 2779 */ 2780 if (tasks & SPA_ASYNC_SCRUB) { 2781 mutex_enter(&spa_namespace_lock); 2782 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2783 mutex_exit(&spa_namespace_lock); 2784 } 2785 2786 /* 2787 * Kick off a resilver. 2788 */ 2789 if (tasks & SPA_ASYNC_RESILVER) { 2790 mutex_enter(&spa_namespace_lock); 2791 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2792 mutex_exit(&spa_namespace_lock); 2793 } 2794 2795 /* 2796 * Let the world know that we're done. 
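 * Clearing spa_async_thread and broadcasting spa_async_cv wakes any
 * spa_async_suspend() callers waiting for this thread to exit.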
2797 */ 2798 mutex_enter(&spa->spa_async_lock); 2799 spa->spa_async_thread = NULL; 2800 cv_broadcast(&spa->spa_async_cv); 2801 mutex_exit(&spa->spa_async_lock); 2802 thread_exit(); 2803 } 2804 2805 void 2806 spa_async_suspend(spa_t *spa) 2807 { 2808 mutex_enter(&spa->spa_async_lock); 2809 spa->spa_async_suspended++; 2810 while (spa->spa_async_thread != NULL) 2811 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2812 mutex_exit(&spa->spa_async_lock); 2813 } 2814 2815 void 2816 spa_async_resume(spa_t *spa) 2817 { 2818 mutex_enter(&spa->spa_async_lock); 2819 ASSERT(spa->spa_async_suspended != 0); 2820 spa->spa_async_suspended--; 2821 mutex_exit(&spa->spa_async_lock); 2822 } 2823 2824 static void 2825 spa_async_dispatch(spa_t *spa) 2826 { 2827 mutex_enter(&spa->spa_async_lock); 2828 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2829 spa->spa_async_thread == NULL && 2830 rootdir != NULL && !vn_is_readonly(rootdir)) 2831 spa->spa_async_thread = thread_create(NULL, 0, 2832 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2833 mutex_exit(&spa->spa_async_lock); 2834 } 2835 2836 void 2837 spa_async_request(spa_t *spa, int task) 2838 { 2839 mutex_enter(&spa->spa_async_lock); 2840 spa->spa_async_tasks |= task; 2841 mutex_exit(&spa->spa_async_lock); 2842 } 2843 2844 /* 2845 * ========================================================================== 2846 * SPA syncing routines 2847 * ========================================================================== 2848 */ 2849 2850 static void 2851 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2852 { 2853 bplist_t *bpl = &spa->spa_sync_bplist; 2854 dmu_tx_t *tx; 2855 blkptr_t blk; 2856 uint64_t itor = 0; 2857 zio_t *zio; 2858 int error; 2859 uint8_t c = 1; 2860 2861 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2862 2863 while (bplist_iterate(bpl, &itor, &blk) == 0) 2864 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2865 2866 error = zio_wait(zio); 2867 ASSERT3U(error, ==, 0); 2868 2869 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2870 bplist_vacate(bpl, tx); 2871 2872 /* 2873 * Pre-dirty the first block so we sync to convergence faster. 2874 * (Usually only the first block is needed.) 2875 */ 2876 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2877 dmu_tx_commit(tx); 2878 } 2879 2880 static void 2881 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2882 { 2883 char *packed = NULL; 2884 size_t nvsize = 0; 2885 dmu_buf_t *db; 2886 2887 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2888 2889 packed = kmem_alloc(nvsize, KM_SLEEP); 2890 2891 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2892 KM_SLEEP) == 0); 2893 2894 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2895 2896 kmem_free(packed, nvsize); 2897 2898 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2899 dmu_buf_will_dirty(db, tx); 2900 *(uint64_t *)db->db_data = nvsize; 2901 dmu_buf_rele(db, FTAG); 2902 } 2903 2904 static void 2905 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2906 { 2907 nvlist_t *nvroot; 2908 nvlist_t **spares; 2909 int i; 2910 2911 if (!spa->spa_sync_spares) 2912 return; 2913 2914 /* 2915 * Update the MOS nvlist describing the list of available spares. 2916 * spa_validate_spares() will have already made sure this nvlist is 2917 * valid and the vdevs are labeled appropriately. 
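 * The list is stored as a packed nvlist in a DMU_OT_PACKED_NVLIST
 * object referenced by the DMU_POOL_SPARES entry of the pool directory;
 * the object is created on first use below.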
2918 */ 2919 if (spa->spa_spares_object == 0) { 2920 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2921 DMU_OT_PACKED_NVLIST, 1 << 14, 2922 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2923 VERIFY(zap_update(spa->spa_meta_objset, 2924 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2925 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2926 } 2927 2928 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2929 if (spa->spa_nspares == 0) { 2930 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2931 NULL, 0) == 0); 2932 } else { 2933 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2934 KM_SLEEP); 2935 for (i = 0; i < spa->spa_nspares; i++) 2936 spares[i] = vdev_config_generate(spa, 2937 spa->spa_spares[i], B_FALSE, B_TRUE); 2938 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2939 spares, spa->spa_nspares) == 0); 2940 for (i = 0; i < spa->spa_nspares; i++) 2941 nvlist_free(spares[i]); 2942 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2943 } 2944 2945 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2946 nvlist_free(nvroot); 2947 2948 spa->spa_sync_spares = B_FALSE; 2949 } 2950 2951 static void 2952 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2953 { 2954 nvlist_t *config; 2955 2956 if (list_is_empty(&spa->spa_dirty_list)) 2957 return; 2958 2959 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2960 2961 if (spa->spa_config_syncing) 2962 nvlist_free(spa->spa_config_syncing); 2963 spa->spa_config_syncing = config; 2964 2965 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2966 } 2967 2968 static void 2969 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 2970 { 2971 spa_t *spa = arg1; 2972 nvlist_t *nvp = arg2; 2973 nvpair_t *nvpair; 2974 objset_t *mos = spa->spa_meta_objset; 2975 uint64_t zapobj; 2976 uint64_t intval; 2977 2978 mutex_enter(&spa->spa_props_lock); 2979 if (spa->spa_pool_props_object == 0) { 2980 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2981 VERIFY(zapobj > 0); 2982 2983 spa->spa_pool_props_object = zapobj; 2984 2985 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2986 DMU_POOL_PROPS, 8, 1, 2987 &spa->spa_pool_props_object, tx) == 0); 2988 } 2989 mutex_exit(&spa->spa_props_lock); 2990 2991 nvpair = NULL; 2992 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2993 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2994 case ZPOOL_PROP_DELEGATION: 2995 VERIFY(nvlist_lookup_uint64(nvp, 2996 nvpair_name(nvpair), &intval) == 0); 2997 VERIFY(zap_update(mos, 2998 spa->spa_pool_props_object, 2999 nvpair_name(nvpair), 8, 1, 3000 &intval, tx) == 0); 3001 spa->spa_delegation = intval; 3002 break; 3003 case ZPOOL_PROP_BOOTFS: 3004 VERIFY(nvlist_lookup_uint64(nvp, 3005 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 3006 intval = spa->spa_bootfs; 3007 VERIFY(zap_update(mos, 3008 spa->spa_pool_props_object, 3009 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1, 3010 &intval, tx) == 0); 3011 break; 3012 3013 case ZPOOL_PROP_AUTOREPLACE: 3014 VERIFY(nvlist_lookup_uint64(nvp, 3015 nvpair_name(nvpair), &intval) == 0); 3016 VERIFY(zap_update(mos, 3017 spa->spa_pool_props_object, 3018 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1, 3019 &intval, tx) == 0); 3020 break; 3021 } 3022 spa_history_internal_log(LOG_POOL_PROPSET, 3023 spa, tx, cr, "%s %lld %s", 3024 nvpair_name(nvpair), intval, 3025 spa->spa_name); 3026 } 3027 } 3028 3029 /* 3030 * Sync the specified transaction group. 
New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
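	 * spa_config_syncing was generated by spa_sync_config_object()
	 * earlier in this pass; installing it only after the uberblock has
	 * been written keeps the cached config from getting ahead of what
	 * is actually on disk.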
3149 */ 3150 if (spa->spa_config_syncing != NULL) { 3151 spa_config_set(spa, spa->spa_config_syncing); 3152 spa->spa_config_txg = txg; 3153 spa->spa_config_syncing = NULL; 3154 } 3155 3156 /* 3157 * Make a stable copy of the fully synced uberblock. 3158 * We use this as the root for pool traversals. 3159 */ 3160 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3161 3162 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3163 3164 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3165 spa->spa_traverse_wanted = 0; 3166 spa->spa_ubsync = spa->spa_uberblock; 3167 rw_exit(&spa->spa_traverse_lock); 3168 3169 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3170 3171 /* 3172 * Clean up the ZIL records for the synced txg. 3173 */ 3174 dsl_pool_zil_clean(dp); 3175 3176 /* 3177 * Update usable space statistics. 3178 */ 3179 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3180 vdev_sync_done(vd, txg); 3181 3182 /* 3183 * It had better be the case that we didn't dirty anything 3184 * since vdev_config_sync(). 3185 */ 3186 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3187 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3188 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3189 ASSERT(bpl->bpl_queue == NULL); 3190 3191 spa_config_exit(spa, FTAG); 3192 3193 /* 3194 * If any async tasks have been requested, kick them off. 3195 */ 3196 spa_async_dispatch(spa); 3197 } 3198 3199 /* 3200 * Sync all pools. We don't want to hold the namespace lock across these 3201 * operations, so we take a reference on the spa_t and drop the lock during the 3202 * sync. 3203 */ 3204 void 3205 spa_sync_allpools(void) 3206 { 3207 spa_t *spa = NULL; 3208 mutex_enter(&spa_namespace_lock); 3209 while ((spa = spa_next(spa)) != NULL) { 3210 if (spa_state(spa) != POOL_STATE_ACTIVE) 3211 continue; 3212 spa_open_ref(spa, FTAG); 3213 mutex_exit(&spa_namespace_lock); 3214 txg_wait_synced(spa_get_dsl(spa), 0); 3215 mutex_enter(&spa_namespace_lock); 3216 spa_close(spa, FTAG); 3217 } 3218 mutex_exit(&spa_namespace_lock); 3219 } 3220 3221 /* 3222 * ========================================================================== 3223 * Miscellaneous routines 3224 * ========================================================================== 3225 */ 3226 3227 /* 3228 * Remove all pools in the system. 3229 */ 3230 void 3231 spa_evict_all(void) 3232 { 3233 spa_t *spa; 3234 3235 /* 3236 * Remove all cached state. All pools should be closed now, 3237 * so every spa in the AVL tree should be unreferenced. 3238 */ 3239 mutex_enter(&spa_namespace_lock); 3240 while ((spa = spa_next(NULL)) != NULL) { 3241 /* 3242 * Stop async tasks. The async thread may need to detach 3243 * a device that's been replaced, which requires grabbing 3244 * spa_namespace_lock, so we must drop it here. 
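 * The spa_open_ref()/spa_close() pair below keeps a reference on this
 * spa_t while spa_namespace_lock is dropped, so it can't be removed
 * out from under us.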
3245 */ 3246 spa_open_ref(spa, FTAG); 3247 mutex_exit(&spa_namespace_lock); 3248 spa_async_suspend(spa); 3249 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3250 mutex_enter(&spa_namespace_lock); 3251 spa_close(spa, FTAG); 3252 3253 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3254 spa_unload(spa); 3255 spa_deactivate(spa); 3256 } 3257 spa_remove(spa); 3258 } 3259 mutex_exit(&spa_namespace_lock); 3260 } 3261 3262 vdev_t * 3263 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3264 { 3265 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3266 } 3267 3268 void 3269 spa_upgrade(spa_t *spa) 3270 { 3271 spa_config_enter(spa, RW_WRITER, FTAG); 3272 3273 /* 3274 * This should only be called for a non-faulted pool, and since a 3275 * future version would result in an unopenable pool, this shouldn't be 3276 * possible. 3277 */ 3278 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 3279 3280 spa->spa_uberblock.ub_version = SPA_VERSION; 3281 vdev_config_dirty(spa->spa_root_vdev); 3282 3283 spa_config_exit(spa, FTAG); 3284 3285 txg_wait_synced(spa_get_dsl(spa), 0); 3286 } 3287 3288 boolean_t 3289 spa_has_spare(spa_t *spa, uint64_t guid) 3290 { 3291 int i; 3292 uint64_t spareguid; 3293 3294 for (i = 0; i < spa->spa_nspares; i++) 3295 if (spa->spa_spares[i]->vdev_guid == guid) 3296 return (B_TRUE); 3297 3298 for (i = 0; i < spa->spa_pending_nspares; i++) { 3299 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3300 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3301 spareguid == guid) 3302 return (B_TRUE); 3303 } 3304 3305 return (B_FALSE); 3306 } 3307 3308 int 3309 spa_set_props(spa_t *spa, nvlist_t *nvp) 3310 { 3311 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3312 spa, nvp, 3)); 3313 } 3314 3315 int 3316 spa_get_props(spa_t *spa, nvlist_t **nvp) 3317 { 3318 zap_cursor_t zc; 3319 zap_attribute_t za; 3320 objset_t *mos = spa->spa_meta_objset; 3321 zfs_source_t src; 3322 zpool_prop_t prop; 3323 nvlist_t *propval; 3324 uint64_t value; 3325 int err; 3326 3327 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3328 3329 mutex_enter(&spa->spa_props_lock); 3330 /* If no props object, then just return empty nvlist */ 3331 if (spa->spa_pool_props_object == 0) { 3332 mutex_exit(&spa->spa_props_lock); 3333 return (0); 3334 } 3335 3336 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3337 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3338 zap_cursor_advance(&zc)) { 3339 3340 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3341 continue; 3342 3343 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3344 switch (za.za_integer_length) { 3345 case 8: 3346 if (zpool_prop_default_numeric(prop) == 3347 za.za_first_integer) 3348 src = ZFS_SRC_DEFAULT; 3349 else 3350 src = ZFS_SRC_LOCAL; 3351 value = za.za_first_integer; 3352 3353 if (prop == ZPOOL_PROP_BOOTFS) { 3354 dsl_pool_t *dp; 3355 dsl_dataset_t *ds = NULL; 3356 char strval[MAXPATHLEN]; 3357 3358 dp = spa_get_dsl(spa); 3359 rw_enter(&dp->dp_config_rwlock, RW_READER); 3360 if ((err = dsl_dataset_open_obj(dp, 3361 za.za_first_integer, NULL, DS_MODE_NONE, 3362 FTAG, &ds)) != 0) { 3363 rw_exit(&dp->dp_config_rwlock); 3364 break; 3365 } 3366 dsl_dataset_name(ds, strval); 3367 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3368 rw_exit(&dp->dp_config_rwlock); 3369 3370 VERIFY(nvlist_add_uint64(propval, 3371 ZFS_PROP_SOURCE, src) == 0); 3372 VERIFY(nvlist_add_string(propval, 3373 ZFS_PROP_VALUE, strval) == 0); 3374 } else { 3375 VERIFY(nvlist_add_uint64(propval, 3376 ZFS_PROP_SOURCE, src) == 0); 3377 
VERIFY(nvlist_add_uint64(propval, 3378 ZFS_PROP_VALUE, value) == 0); 3379 } 3380 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3381 propval) == 0); 3382 break; 3383 } 3384 nvlist_free(propval); 3385 } 3386 zap_cursor_fini(&zc); 3387 mutex_exit(&spa->spa_props_lock); 3388 if (err && err != ENOENT) { 3389 nvlist_free(*nvp); 3390 return (err); 3391 } 3392 3393 return (0); 3394 } 3395 3396 /* 3397 * If the bootfs property value is dsobj, clear it. 3398 */ 3399 void 3400 spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3401 { 3402 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3403 VERIFY(zap_remove(spa->spa_meta_objset, 3404 spa->spa_pool_props_object, 3405 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 3406 spa->spa_bootfs = 0; 3407 } 3408 } 3409 3410 /* 3411 * Post a sysevent corresponding to the given event. The 'name' must be one of 3412 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 3413 * filled in from the spa and (optionally) the vdev. This doesn't do anything 3414 * in the userland libzpool, as we don't want consumers to misinterpret ztest 3415 * or zdb as real changes. 3416 */ 3417 void 3418 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 3419 { 3420 #ifdef _KERNEL 3421 sysevent_t *ev; 3422 sysevent_attr_list_t *attr = NULL; 3423 sysevent_value_t value; 3424 sysevent_id_t eid; 3425 3426 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 3427 SE_SLEEP); 3428 3429 value.value_type = SE_DATA_TYPE_STRING; 3430 value.value.sv_string = spa_name(spa); 3431 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 3432 goto done; 3433 3434 value.value_type = SE_DATA_TYPE_UINT64; 3435 value.value.sv_uint64 = spa_guid(spa); 3436 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 3437 goto done; 3438 3439 if (vd) { 3440 value.value_type = SE_DATA_TYPE_UINT64; 3441 value.value.sv_uint64 = vd->vdev_guid; 3442 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 3443 SE_SLEEP) != 0) 3444 goto done; 3445 3446 if (vd->vdev_path) { 3447 value.value_type = SE_DATA_TYPE_STRING; 3448 value.value.sv_string = vd->vdev_path; 3449 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 3450 &value, SE_SLEEP) != 0) 3451 goto done; 3452 } 3453 } 3454 3455 (void) log_sysevent(ev, SE_SLEEP, &eid); 3456 3457 done: 3458 if (attr) 3459 sysevent_free_attr(attr); 3460 sysevent_free(ev); 3461 #endif 3462 } 3463