/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

int zio_taskq_threads = 8;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
        spa_error_entry_t *sa = (spa_error_entry_t *)a;
        spa_error_entry_t *sb = (spa_error_entry_t *)b;
        int ret;

        ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
            sizeof (zbookmark_t));

        if (ret < 0)
                return (-1);
        else if (ret > 0)
                return (1);
        else
                return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
        ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

        bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
        bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
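 * This sets up in-core state only: the metaslab class, the per-type zio
 * taskqs, the pool locks and lists, and the in-core error lists.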
 */
static void
spa_activate(spa_t *spa)
{
        int t;

        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        spa->spa_state = POOL_STATE_ACTIVE;

        spa->spa_normal_class = metaslab_class_create();

        for (t = 0; t < ZIO_TYPES; t++) {
                spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
                    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
                    TASKQ_PREPOPULATE);
                spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
                    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
                    TASKQ_PREPOPULATE);
        }

        rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

        mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);

        list_create(&spa->spa_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_dirty_node));

        txg_list_create(&spa->spa_vdev_txg_list,
            offsetof(struct vdev, vdev_txg_node));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
        int t;

        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);

        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_dirty_list);

        rw_destroy(&spa->spa_traverse_lock);

        for (t = 0; t < ZIO_TYPES; t++) {
                taskq_destroy(spa->spa_zio_issue_taskq[t]);
                taskq_destroy(spa->spa_zio_intr_taskq[t]);
                spa->spa_zio_issue_taskq[t] = NULL;
                spa->spa_zio_intr_taskq[t] = NULL;
        }

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues.  Empty them just in case.
         */
        spa_errlog_drain(spa);

        avl_destroy(&spa->spa_errlist_scrub);
        avl_destroy(&spa->spa_errlist_last);

        spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
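 * The tree is built recursively: for non-leaf vdevs we descend into the
 * nvlist's ZPOOL_CONFIG_CHILDREN array.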
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
        nvlist_t **child;
        uint_t c, children;
        int error;

        if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
                return (error);

        if ((*vdp)->vdev_ops->vdev_op_leaf)
                return (0);

        if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children) != 0) {
                vdev_free(*vdp);
                *vdp = NULL;
                return (EINVAL);
        }

        for (c = 0; c < children; c++) {
                vdev_t *vd;
                if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
                    atype)) != 0) {
                        vdev_free(*vdp);
                        *vdp = NULL;
                        return (error);
                }
        }

        ASSERT(*vdp != NULL);

        return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
        int i;

        /*
         * Stop async tasks.
         */
        spa_async_suspend(spa);

        /*
         * Stop syncing.
         */
        if (spa->spa_sync_on) {
                txg_sync_stop(spa->spa_dsl_pool);
                spa->spa_sync_on = B_FALSE;
        }

        /*
         * Wait for any outstanding prefetch I/O to complete.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);
        spa_config_exit(spa, FTAG);

        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
        }

        /*
         * Close all vdevs.
         */
        if (spa->spa_root_vdev)
                vdev_free(spa->spa_root_vdev);
        ASSERT(spa->spa_root_vdev == NULL);

        for (i = 0; i < spa->spa_nspares; i++)
                vdev_free(spa->spa_spares[i]);
        if (spa->spa_spares) {
                kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
                spa->spa_spares = NULL;
        }
        if (spa->spa_sparelist) {
                nvlist_free(spa->spa_sparelist);
                spa->spa_sparelist = NULL;
        }

        spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
        nvlist_t **spares;
        uint_t nspares;
        int i;
        vdev_t *vd, *tvd;

        /*
         * First, close and free any existing spare vdevs.
         */
        for (i = 0; i < spa->spa_nspares; i++) {
                vd = spa->spa_spares[i];

                /* Undo the call to spa_activate() below */
                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
                    tvd->vdev_isspare)
                        spa_spare_remove(tvd);
                vdev_close(vd);
                vdev_free(vd);
        }

        if (spa->spa_spares)
                kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

        if (spa->spa_sparelist == NULL)
                nspares = 0;
        else
                VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
                    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

        spa->spa_nspares = (int)nspares;
        spa->spa_spares = NULL;

        if (nspares == 0)
                return;

        /*
         * Construct the array of vdevs, opening them to get status in the
         * process.  For each spare, there are potentially two different vdev_t
         * structures associated with it: one in the list of spares (used only
         * for basic validation purposes) and one in the active vdev
         * configuration (if it's spared in).  During this phase we open and
         * validate each vdev on the spare list.  If the vdev also exists in the
         * active configuration, then we also mark this vdev as an active spare.
         */
        spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
        for (i = 0; i < spa->spa_nspares; i++) {
                VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    VDEV_ALLOC_SPARE) == 0);
                ASSERT(vd != NULL);

                spa->spa_spares[i] = vd;

                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
                        if (!tvd->vdev_isspare)
                                spa_spare_add(tvd);

                        /*
                         * We only mark the spare active if we were successfully
                         * able to load the vdev.  Otherwise, importing a pool
                         * with a bad active spare would result in strange
                         * behavior, because multiple pools would think the
                         * spare is actively in use.
                         *
                         * There is a vulnerability here to an equally bizarre
                         * circumstance, where a dead active spare is later
                         * brought back to life (onlined or otherwise).  Given
                         * the rarity of this scenario, and the extra complexity
                         * it adds, we ignore the possibility.
                         */
                        if (!vdev_is_dead(tvd))
                                spa_spare_activate(tvd);
                }

                if (vdev_open(vd) != 0)
                        continue;

                vd->vdev_top = vd;
                (void) vdev_validate_spare(vd);
        }

        /*
         * Recompute the stashed list of spares, with status information
         * this time.
         */
        VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
            DATA_TYPE_NVLIST_ARRAY) == 0);

        spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
        for (i = 0; i < spa->spa_nspares; i++)
                spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
                    B_TRUE, B_TRUE);
        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
            spares, spa->spa_nspares) == 0);
        for (i = 0; i < spa->spa_nspares; i++)
                nvlist_free(spares[i]);
        kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
        dmu_buf_t *db;
        char *packed = NULL;
        size_t nvsize = 0;
        int error;
        *value = NULL;

        VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
        nvsize = *(uint64_t *)db->db_data;
        dmu_buf_rele(db, FTAG);

        packed = kmem_alloc(nvsize, KM_SLEEP);
        error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
        if (error == 0)
                error = nvlist_unpack(packed, nvsize, value, 0);
        kmem_free(packed, nvsize);

        return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
        int error = 0;
        nvlist_t *nvroot = NULL;
        vdev_t *rvd;
        uberblock_t *ub = &spa->spa_uberblock;
        uint64_t config_cache_txg = spa->spa_config_txg;
        uint64_t pool_guid;
        uint64_t version;
        zio_t *zio;

        spa->spa_load_state = state;

        if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
            nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
                error = EINVAL;
                goto out;
        }

        /*
         * Versioning wasn't explicitly added to the label until later, so if
         * it's not present treat it as the initial version.
         */
        if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
                version = ZFS_VERSION_INITIAL;

        (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
            &spa->spa_config_txg);

        if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
            spa_guid_exists(pool_guid, 0)) {
                error = EEXIST;
                goto out;
        }

        spa->spa_load_guid = pool_guid;

        /*
         * Parse the configuration into a vdev tree.  We explicitly set the
         * value that will be returned by spa_version() since parsing the
         * configuration requires knowing the version number.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);
        spa->spa_ubsync.ub_version = version;
        error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
        spa_config_exit(spa, FTAG);

        if (error != 0)
                goto out;

        ASSERT(spa->spa_root_vdev == rvd);
        ASSERT(spa_guid(spa) == pool_guid);

        /*
         * Try to open all vdevs, loading each label in the process.
         */
        if (vdev_open(rvd) != 0) {
                error = ENXIO;
                goto out;
        }

        /*
         * Validate the labels for all leaf vdevs.  We need to grab the config
         * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
         * flag.
         */
        spa_config_enter(spa, RW_READER, FTAG);
        error = vdev_validate(rvd);
        spa_config_exit(spa, FTAG);

        if (error != 0) {
                error = EBADF;
                goto out;
        }

        if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
                error = ENXIO;
                goto out;
        }

        /*
         * Find the best uberblock.
         */
        bzero(ub, sizeof (uberblock_t));

        zio = zio_root(spa, NULL, NULL,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
        vdev_uberblock_load(zio, rvd, ub);
        error = zio_wait(zio);

        /*
         * If we weren't able to find a single valid uberblock, return failure.
         */
        if (ub->ub_txg == 0) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = ENXIO;
                goto out;
        }

        /*
         * If the pool is newer than the code, we can't open it.
         */
        if (ub->ub_version > ZFS_VERSION) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_VERSION_NEWER);
                error = ENOTSUP;
                goto out;
        }

        /*
         * If the vdev guid sum doesn't match the uberblock, we have an
         * incomplete configuration.
         */
        if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_BAD_GUID_SUM);
                error = ENXIO;
                goto out;
        }

        /*
         * Initialize internal SPA structures.
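         * This records the sync'd uberblock, computes the first txg we will
         * sync, and opens the DSL pool, which gives us the meta-objset.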
         */
        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_ubsync = spa->spa_uberblock;
        spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
        error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
        if (error) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                goto out;
        }
        spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

        if (zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
            sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        if (!mosconfig) {
                nvlist_t *newconfig;

                if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
                        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        error = EIO;
                        goto out;
                }

                spa_config_set(spa, newconfig);
                spa_unload(spa);
                spa_deactivate(spa);
                spa_activate(spa);

                return (spa_load(spa, newconfig, state, B_TRUE));
        }

        if (zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
            sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        /*
         * Load the bit that tells us to use the new accounting function
         * (raid-z deflation).  If we have an older pool, this will not
         * be present.
         */
        error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
            sizeof (uint64_t), 1, &spa->spa_deflate);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        /*
         * Load the persistent error log.  If we have an older pool, this will
         * not be present.
         */
        error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
            sizeof (uint64_t), 1, &spa->spa_errlog_last);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
            sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        /*
         * Load the history object.  If we have an older pool, this
         * will not be present.
         */
        error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
            sizeof (uint64_t), 1, &spa->spa_history);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        /*
         * Load any hot spares for this pool.
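         * As with the entries above, ENOENT just means the pool predates
         * hot spare support.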
         */
        error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }
        if (error == 0) {
                ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
                if (load_nvlist(spa, spa->spa_spares_object,
                    &spa->spa_sparelist) != 0) {
                        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        error = EIO;
                        goto out;
                }

                spa_config_enter(spa, RW_WRITER, FTAG);
                spa_load_spares(spa);
                spa_config_exit(spa, FTAG);
        }

        /*
         * Load the vdev state for all toplevel vdevs.
         */
        vdev_load(rvd);

        /*
         * Propagate the leaf DTLs we just loaded all the way up the tree.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);
        vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
        spa_config_exit(spa, FTAG);

        /*
         * Check the state of the root vdev.  If it can't be opened, it
         * indicates one or more toplevel vdevs are faulted.
         */
        if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
                error = ENXIO;
                goto out;
        }

        if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
                dmu_tx_t *tx;
                int need_update = B_FALSE;
                int c;

                /*
                 * Claim log blocks that haven't been committed yet.
                 * This must all happen in a single txg.
                 */
                tx = dmu_tx_create_assigned(spa_get_dsl(spa),
                    spa_first_txg(spa));
                (void) dmu_objset_find(spa->spa_name,
                    zil_claim, tx, DS_FIND_CHILDREN);
                dmu_tx_commit(tx);

                spa->spa_sync_on = B_TRUE;
                txg_sync_start(spa->spa_dsl_pool);

                /*
                 * Wait for all claims to sync.
                 */
                txg_wait_synced(spa->spa_dsl_pool, 0);

                /*
                 * If the config cache is stale, or we have uninitialized
                 * metaslabs (see spa_vdev_add()), then update the config.
                 */
                if (config_cache_txg != spa->spa_config_txg ||
                    state == SPA_LOAD_IMPORT)
                        need_update = B_TRUE;

                for (c = 0; c < rvd->vdev_children; c++)
                        if (rvd->vdev_child[c]->vdev_ms_array == 0)
                                need_update = B_TRUE;

                /*
                 * Update the config cache asynchronously in case we're the
                 * root pool, in which case the config cache isn't writable yet.
                 */
                if (need_update)
                        spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
        }

        error = 0;
out:
        if (error && error != EBADF)
                zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
        spa->spa_load_state = SPA_LOAD_NONE;
        spa->spa_ena = 0;

        return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
        spa_t *spa;
        int error;
        int loaded = B_FALSE;
        int locked = B_FALSE;

        *spapp = NULL;

        /*
         * As disgusting as this is, we need to support recursive calls to this
         * function because dsl_dir_open() is called during spa_load(), and ends
         * up calling spa_open() again.  The real fix is to figure out how to
         * avoid dsl_dir_open() calling this in the first place.
         */
        if (mutex_owner(&spa_namespace_lock) != curthread) {
                mutex_enter(&spa_namespace_lock);
                locked = B_TRUE;
        }

        if ((spa = spa_lookup(pool)) == NULL) {
                if (locked)
                        mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

                spa_activate(spa);

                error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

                if (error == EBADF) {
                        /*
                         * If vdev_validate() returns failure (indicated by
                         * EBADF), it means that one of the vdevs indicates
                         * that the pool has been exported or destroyed.  If
                         * this is the case, the config cache is out of sync and
                         * we should remove the pool from the namespace.
                         */
                        zfs_post_ok(spa, NULL);
                        spa_unload(spa);
                        spa_deactivate(spa);
                        spa_remove(spa);
                        spa_config_sync();
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        return (ENOENT);
                }

                if (error) {
                        /*
                         * We can't open the pool, but we still have useful
                         * information: the state of each vdev after the
                         * attempted vdev_open().  Return this to the user.
                         */
                        if (config != NULL && spa->spa_root_vdev != NULL) {
                                spa_config_enter(spa, RW_READER, FTAG);
                                *config = spa_config_generate(spa, NULL, -1ULL,
                                    B_TRUE);
                                spa_config_exit(spa, FTAG);
                        }
                        spa_unload(spa);
                        spa_deactivate(spa);
                        spa->spa_last_open_failed = B_TRUE;
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        *spapp = NULL;
                        return (error);
                } else {
                        zfs_post_ok(spa, NULL);
                        spa->spa_last_open_failed = B_FALSE;
                }

                loaded = B_TRUE;
        }

        spa_open_ref(spa, tag);
        if (locked)
                mutex_exit(&spa_namespace_lock);

        *spapp = spa;

        if (config != NULL) {
                spa_config_enter(spa, RW_READER, FTAG);
                *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
                spa_config_exit(spa, FTAG);
        }

        /*
         * If we just loaded the pool, resilver anything that's out of date.
         */
        if (loaded && (spa_mode & FWRITE))
                VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
        return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
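 * The reference is released with spa_inject_delref().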
 */
spa_t *
spa_inject_addref(char *name)
{
        spa_t *spa;

        mutex_enter(&spa_namespace_lock);
        if ((spa = spa_lookup(name)) == NULL) {
                mutex_exit(&spa_namespace_lock);
                return (NULL);
        }
        spa->spa_inject_ref++;
        mutex_exit(&spa_namespace_lock);

        return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
        mutex_enter(&spa_namespace_lock);
        spa->spa_inject_ref--;
        mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
        nvlist_t **spares;
        uint_t i, nspares;
        nvlist_t *nvroot;
        uint64_t guid;
        vdev_stat_t *vs;
        uint_t vsc;
        uint64_t pool;

        if (spa->spa_nspares == 0)
                return;

        VERIFY(nvlist_lookup_nvlist(config,
            ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
        VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
            ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
        if (nspares != 0) {
                VERIFY(nvlist_add_nvlist_array(nvroot,
                    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                VERIFY(nvlist_lookup_nvlist_array(nvroot,
                    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

                /*
                 * Go through and find any spares which have since been
                 * repurposed as an active spare.  If this is the case, update
                 * their status appropriately.
                 */
                for (i = 0; i < nspares; i++) {
                        VERIFY(nvlist_lookup_uint64(spares[i],
                            ZPOOL_CONFIG_GUID, &guid) == 0);
                        if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
                                VERIFY(nvlist_lookup_uint64_array(
                                    spares[i], ZPOOL_CONFIG_STATS,
                                    (uint64_t **)&vs, &vsc) == 0);
                                vs->vs_state = VDEV_STATE_CANT_OPEN;
                                vs->vs_aux = VDEV_AUX_SPARED;
                        }
                }
        }
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
        int error;
        spa_t *spa;

        *config = NULL;
        error = spa_open_common(name, &spa, FTAG, config);

        if (spa && *config != NULL) {
                VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
                    spa_get_errlog_size(spa)) == 0);

                spa_add_spares(spa, *config);
        }

        /*
         * We want to get the alternate root even for faulted pools, so we cheat
         * and call spa_lookup() directly.
         */
        if (altroot) {
                if (spa == NULL) {
                        mutex_enter(&spa_namespace_lock);
                        spa = spa_lookup(name);
                        if (spa)
                                spa_altroot(spa, altroot, buflen);
                        else
                                altroot[0] = '\0';
                        spa = NULL;
                        mutex_exit(&spa_namespace_lock);
                } else {
                        spa_altroot(spa, altroot, buflen);
                }
        }

        if (spa != NULL)
                spa_close(spa, FTAG);

        return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.  If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
        nvlist_t **spares;
        uint_t i, nspares;
        vdev_t *vd;
        int error;

        /*
         * It's acceptable to have no spares specified.
         */
        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) != 0)
                return (0);

        if (nspares == 0)
                return (EINVAL);

        /*
         * Make sure the pool is formatted with a version that supports hot
         * spares.
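         * (ZFS_VERSION_SPARES or newer.)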
         */
        if (spa_version(spa) < ZFS_VERSION_SPARES)
                return (ENOTSUP);

        /*
         * Set the pending spare list so we correctly handle device in-use
         * checking.
         */
        spa->spa_pending_spares = spares;
        spa->spa_pending_nspares = nspares;

        for (i = 0; i < nspares; i++) {
                if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    mode)) != 0)
                        goto out;

                if (!vd->vdev_ops->vdev_op_leaf) {
                        vdev_free(vd);
                        error = EINVAL;
                        goto out;
                }

                vd->vdev_top = vd;

                if ((error = vdev_open(vd)) == 0 &&
                    (error = vdev_label_init(vd, crtxg,
                    VDEV_LABEL_SPARE)) == 0) {
                        VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
                            vd->vdev_guid) == 0);
                }

                vdev_free(vd);

                if (error && mode != VDEV_ALLOC_SPARE)
                        goto out;
                else
                        error = 0;
        }

out:
        spa->spa_pending_spares = NULL;
        spa->spa_pending_nspares = 0;
        return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
        spa_t *spa;
        vdev_t *rvd;
        dsl_pool_t *dp;
        dmu_tx_t *tx;
        int c, error = 0;
        uint64_t txg = TXG_INITIAL;
        nvlist_t **spares;
        uint_t nspares;

        /*
         * If this pool already exists, return failure.
         */
        mutex_enter(&spa_namespace_lock);
        if (spa_lookup(pool) != NULL) {
                mutex_exit(&spa_namespace_lock);
                return (EEXIST);
        }

        /*
         * Allocate a new spa_t structure.
         */
        spa = spa_add(pool, altroot);
        spa_activate(spa);

        spa->spa_uberblock.ub_txg = txg - 1;
        spa->spa_uberblock.ub_version = ZFS_VERSION;
        spa->spa_ubsync = spa->spa_uberblock;

        /*
         * Create the root vdev.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);

        error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

        ASSERT(error != 0 || rvd != NULL);
        ASSERT(error != 0 || spa->spa_root_vdev == rvd);

        if (error == 0 && rvd->vdev_children == 0)
                error = EINVAL;

        if (error == 0 &&
            (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
            (error = spa_validate_spares(spa, nvroot, txg,
            VDEV_ALLOC_ADD)) == 0) {
                for (c = 0; c < rvd->vdev_children; c++)
                        vdev_init(rvd->vdev_child[c], txg);
                vdev_config_dirty(rvd);
        }

        spa_config_exit(spa, FTAG);

        if (error != 0) {
                spa_unload(spa);
                spa_deactivate(spa);
                spa_remove(spa);
                mutex_exit(&spa_namespace_lock);
                return (error);
        }

        /*
         * Get the list of spares, if specified.
         */
        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) == 0) {
                VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
                    KM_SLEEP) == 0);
                VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                spa_config_enter(spa, RW_WRITER, FTAG);
                spa_load_spares(spa);
                spa_config_exit(spa, FTAG);
                spa->spa_sync_spares = B_TRUE;
        }

        spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
        spa->spa_meta_objset = dp->dp_meta_objset;

        tx = dmu_tx_create_assigned(dp, txg);

        /*
         * Create the pool config object.
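         * The config is stored as a packed nvlist whose object number is
         * recorded under DMU_POOL_CONFIG in the pool directory.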
         */
        spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
            DMU_OT_PACKED_NVLIST, 1 << 14,
            DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

        if (zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
            sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
                cmn_err(CE_PANIC, "failed to add pool config");
        }

        /* Newly created pools are always deflated. */
        spa->spa_deflate = TRUE;
        if (zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
            sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
                cmn_err(CE_PANIC, "failed to add deflate");
        }

        /*
         * Create the deferred-free bplist object.  Turn off compression
         * because sync-to-convergence takes longer if the blocksize
         * keeps changing.
         */
        spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
            1 << 14, tx);
        dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
            ZIO_COMPRESS_OFF, tx);

        if (zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
            sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
                cmn_err(CE_PANIC, "failed to add bplist");
        }

        /*
         * Create the pool's history object.
         */
        spa_history_create_obj(spa, tx);

        dmu_tx_commit(tx);

        spa->spa_sync_on = B_TRUE;
        txg_sync_start(spa->spa_dsl_pool);

        /*
         * We explicitly wait for the first transaction to complete so that our
         * bean counters are appropriately updated.
         */
        txg_wait_synced(spa->spa_dsl_pool, txg);

        spa_config_sync();

        mutex_exit(&spa_namespace_lock);

        return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
        spa_t *spa;
        int error;
        nvlist_t *nvroot;
        nvlist_t **spares;
        uint_t nspares;

        if (!(spa_mode & FWRITE))
                return (EROFS);

        /*
         * If a pool with this name exists, return failure.
         */
        mutex_enter(&spa_namespace_lock);
        if (spa_lookup(pool) != NULL) {
                mutex_exit(&spa_namespace_lock);
                return (EEXIST);
        }

        /*
         * Create and initialize the spa structure.
         */
        spa = spa_add(pool, altroot);
        spa_activate(spa);

        /*
         * Pass off the heavy lifting to spa_load().
         * Pass TRUE for mosconfig because the user-supplied config
         * is actually the one to trust when doing an import.
         */
        error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

        spa_config_enter(spa, RW_WRITER, FTAG);
        /*
         * Toss any existing sparelist, as it doesn't have any validity anymore,
         * and conflicts with spa_has_spare().
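         * With the sparelist gone, spa_load_spares() simply tears down any
         * in-core spare vdevs left over from spa_load().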
         */
        if (spa->spa_sparelist) {
                nvlist_free(spa->spa_sparelist);
                spa->spa_sparelist = NULL;
                spa_load_spares(spa);
        }

        VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
        if (error == 0)
                error = spa_validate_spares(spa, nvroot, -1ULL,
                    VDEV_ALLOC_SPARE);
        spa_config_exit(spa, FTAG);

        if (error != 0) {
                spa_unload(spa);
                spa_deactivate(spa);
                spa_remove(spa);
                mutex_exit(&spa_namespace_lock);
                return (error);
        }

        /*
         * Override any spares as specified by the user, as these may have
         * correct device names/devids, etc.
         */
        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) == 0) {
                if (spa->spa_sparelist)
                        VERIFY(nvlist_remove(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
                else
                        VERIFY(nvlist_alloc(&spa->spa_sparelist,
                            NV_UNIQUE_NAME, KM_SLEEP) == 0);
                VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                spa_config_enter(spa, RW_WRITER, FTAG);
                spa_load_spares(spa);
                spa_config_exit(spa, FTAG);
                spa->spa_sync_spares = B_TRUE;
        }

        /*
         * Update the config cache to include the newly-imported pool.
         */
        spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

        mutex_exit(&spa_namespace_lock);

        /*
         * Resilver anything that's out of date.
         */
        if (spa_mode & FWRITE)
                VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define TRYIMPORT_NAME "$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
        nvlist_t *config = NULL;
        char *poolname;
        spa_t *spa;
        uint64_t state;

        if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
                return (NULL);

        if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
                return (NULL);

        /*
         * Create and initialize the spa structure.
         */
        mutex_enter(&spa_namespace_lock);
        spa = spa_add(TRYIMPORT_NAME, NULL);
        spa_activate(spa);

        /*
         * Pass off the heavy lifting to spa_load().
         * Pass TRUE for mosconfig because the user-supplied config
         * is actually the one to trust when doing an import.
         */
        (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

        /*
         * If 'tryconfig' was at least parsable, return the current config.
         */
        if (spa->spa_root_vdev != NULL) {
                spa_config_enter(spa, RW_READER, FTAG);
                config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
                spa_config_exit(spa, FTAG);
                VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
                    poolname) == 0);
                VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
                    state) == 0);

                /*
                 * Add the list of hot spares.
                 */
                spa_add_spares(spa, config);
        }

        spa_unload(spa);
        spa_deactivate(spa);
        spa_remove(spa);
        mutex_exit(&spa_namespace_lock);

        return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
        spa_t *spa;

        if (oldconfig)
                *oldconfig = NULL;

        if (!(spa_mode & FWRITE))
                return (EROFS);

        mutex_enter(&spa_namespace_lock);
        if ((spa = spa_lookup(pool)) == NULL) {
                mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }

        /*
         * Put a hold on the pool, drop the namespace lock, stop async tasks,
         * reacquire the namespace lock, and see if we can export.
         */
        spa_open_ref(spa, FTAG);
        mutex_exit(&spa_namespace_lock);
        spa_async_suspend(spa);
        mutex_enter(&spa_namespace_lock);
        spa_close(spa, FTAG);

        /*
         * The pool will be in core if it's openable,
         * in which case we can modify its state.
         */
        if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
                /*
                 * Objsets may be open only because they're dirty, so we
                 * have to force it to sync before checking spa_refcnt.
                 */
                spa_scrub_suspend(spa);
                txg_wait_synced(spa->spa_dsl_pool, 0);

                /*
                 * A pool cannot be exported or destroyed if there are active
                 * references.  If we are resetting a pool, allow references by
                 * fault injection handlers.
                 */
                if (!spa_refcount_zero(spa) ||
                    (spa->spa_inject_ref != 0 &&
                    new_state != POOL_STATE_UNINITIALIZED)) {
                        spa_scrub_resume(spa);
                        spa_async_resume(spa);
                        mutex_exit(&spa_namespace_lock);
                        return (EBUSY);
                }

                spa_scrub_resume(spa);
                VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

                /*
                 * We want this to be reflected on every label,
                 * so mark them all dirty.  spa_unload() will do the
                 * final sync that pushes these changes out.
                 */
                if (new_state != POOL_STATE_UNINITIALIZED) {
                        spa_config_enter(spa, RW_WRITER, FTAG);
                        spa->spa_state = new_state;
                        spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
                        vdev_config_dirty(spa->spa_root_vdev);
                        spa_config_exit(spa, FTAG);
                }
        }

        if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
                spa_unload(spa);
                spa_deactivate(spa);
        }

        if (oldconfig && spa->spa_config)
                VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

        if (new_state != POOL_STATE_UNINITIALIZED) {
                spa_remove(spa);
                spa_config_sync();
        }
        mutex_exit(&spa_namespace_lock);

        return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
        return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
        return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
        return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
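 * The 'nvroot' config may describe new top-level vdevs, hot spares, or both.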
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
        uint64_t txg;
        int c, error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *tvd;
        nvlist_t **spares;
        uint_t i, nspares;

        txg = spa_vdev_enter(spa);

        if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
            VDEV_ALLOC_ADD)) != 0)
                return (spa_vdev_exit(spa, NULL, txg, error));

        spa->spa_pending_vdev = vd;

        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) != 0)
                nspares = 0;

        if (vd->vdev_children == 0 && nspares == 0) {
                spa->spa_pending_vdev = NULL;
                return (spa_vdev_exit(spa, vd, txg, EINVAL));
        }

        if (vd->vdev_children != 0) {
                if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
                        spa->spa_pending_vdev = NULL;
                        return (spa_vdev_exit(spa, vd, txg, error));
                }
        }

        /*
         * We must validate the spares after checking the children.  Otherwise,
         * vdev_inuse() will blindly overwrite the spare.
         */
        if ((error = spa_validate_spares(spa, nvroot, txg,
            VDEV_ALLOC_ADD)) != 0) {
                spa->spa_pending_vdev = NULL;
                return (spa_vdev_exit(spa, vd, txg, error));
        }

        spa->spa_pending_vdev = NULL;

        /*
         * Transfer each new top-level vdev from vd to rvd.
         */
        for (c = 0; c < vd->vdev_children; c++) {
                tvd = vd->vdev_child[c];
                vdev_remove_child(vd, tvd);
                tvd->vdev_id = rvd->vdev_children;
                vdev_add_child(rvd, tvd);
                vdev_config_dirty(tvd);
        }

        if (nspares != 0) {
                if (spa->spa_sparelist != NULL) {
                        nvlist_t **oldspares;
                        uint_t oldnspares;
                        nvlist_t **newspares;

                        VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

                        newspares = kmem_alloc(sizeof (void *) *
                            (nspares + oldnspares), KM_SLEEP);
                        for (i = 0; i < oldnspares; i++)
                                VERIFY(nvlist_dup(oldspares[i],
                                    &newspares[i], KM_SLEEP) == 0);
                        for (i = 0; i < nspares; i++)
                                VERIFY(nvlist_dup(spares[i],
                                    &newspares[i + oldnspares],
                                    KM_SLEEP) == 0);

                        VERIFY(nvlist_remove(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

                        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, newspares,
                            nspares + oldnspares) == 0);
                        for (i = 0; i < oldnspares + nspares; i++)
                                nvlist_free(newspares[i]);
                        kmem_free(newspares, (oldnspares + nspares) *
                            sizeof (void *));
                } else {
                        VERIFY(nvlist_alloc(&spa->spa_sparelist,
                            NV_UNIQUE_NAME, KM_SLEEP) == 0);
                        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                }

                spa_load_spares(spa);
                spa->spa_sync_spares = B_TRUE;
        }

        /*
         * We have to be careful when adding new vdevs to an existing pool.
         * If other threads start allocating from these vdevs before we
         * sync the config cache, and we lose power, then upon reboot we may
         * fail to open the pool because there are DVAs that the config cache
         * can't translate.  Therefore, we first add the vdevs without
         * initializing metaslabs; sync the config cache (via spa_vdev_exit());
         * and then let spa_config_update() initialize the new metaslabs.
         *
         * spa_load() checks for added-but-not-initialized vdevs, so that
         * if we lose power at any point in this sequence, the remaining
         * steps will be completed the next time we load the pool.
         */
        (void) spa_vdev_exit(spa, vd, txg, 0);

        mutex_enter(&spa_namespace_lock);
        spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
        mutex_exit(&spa_namespace_lock);

        return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
        uint64_t txg, open_txg;
        int error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops;

        txg = spa_vdev_enter(spa);

        oldvd = vdev_lookup_by_guid(rvd, guid);

        if (oldvd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));

        if (!oldvd->vdev_ops->vdev_op_leaf)
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

        pvd = oldvd->vdev_parent;

        if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
            VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
                return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

        newvd = newrootvd->vdev_child[0];

        if (!newvd->vdev_ops->vdev_op_leaf)
                return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

        if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
                return (spa_vdev_exit(spa, newrootvd, txg, error));

        if (!replacing) {
                /*
                 * For attach, the only allowable parent is a mirror or the root
                 * vdev.
                 */
                if (pvd->vdev_ops != &vdev_mirror_ops &&
                    pvd->vdev_ops != &vdev_root_ops)
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

                pvops = &vdev_mirror_ops;
        } else {
                /*
                 * Active hot spares can only be replaced by inactive hot
                 * spares.
                 */
                if (pvd->vdev_ops == &vdev_spare_ops &&
                    pvd->vdev_child[1] == oldvd &&
                    !spa_has_spare(spa, newvd->vdev_guid))
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

                /*
                 * If the source is a hot spare, and the parent isn't already a
                 * spare, then we want to create a new hot spare.  Otherwise, we
                 * want to create a replacing vdev.  The user is not allowed to
                 * attach to a spared vdev child unless the 'isspare' state is
                 * the same (spare replaces spare, non-spare replaces
                 * non-spare).
                 */
                if (pvd->vdev_ops == &vdev_replacing_ops)
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
                else if (pvd->vdev_ops == &vdev_spare_ops &&
                    newvd->vdev_isspare != oldvd->vdev_isspare)
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
                else if (pvd->vdev_ops != &vdev_spare_ops &&
                    newvd->vdev_isspare)
                        pvops = &vdev_spare_ops;
                else
                        pvops = &vdev_replacing_ops;
        }

        /*
         * Compare the new device size with the replaceable/attachable
         * device size.
         */
        if (newvd->vdev_psize < vdev_get_rsize(oldvd))
                return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

        /*
         * The new device cannot have a higher alignment requirement
         * than the top-level vdev.
         */
        if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
                return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

        /*
         * If this is an in-place replacement, update oldvd's path and devid
         * to make it distinguishable from newvd, and unopenable from now on.
         */
        if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
                spa_strfree(oldvd->vdev_path);
                oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
                    KM_SLEEP);
                (void) sprintf(oldvd->vdev_path, "%s/%s",
                    newvd->vdev_path, "old");
                if (oldvd->vdev_devid != NULL) {
                        spa_strfree(oldvd->vdev_devid);
                        oldvd->vdev_devid = NULL;
                }
        }

        /*
         * If the parent is not a mirror, or if we're replacing, insert the new
         * mirror/replacing/spare vdev above oldvd.
         */
        if (pvd->vdev_ops != pvops)
                pvd = vdev_add_parent(oldvd, pvops);

        ASSERT(pvd->vdev_top->vdev_parent == rvd);
        ASSERT(pvd->vdev_ops == pvops);
        ASSERT(oldvd->vdev_parent == pvd);

        /*
         * Extract the new device from its root and add it to pvd.
         */
        vdev_remove_child(newrootvd, newvd);
        newvd->vdev_id = pvd->vdev_children;
        vdev_add_child(pvd, newvd);

        /*
         * If newvd is smaller than oldvd, but larger than its rsize,
         * the addition of newvd may have decreased our parent's asize.
         */
        pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

        tvd = newvd->vdev_top;
        ASSERT(pvd->vdev_top == tvd);
        ASSERT(tvd->vdev_parent == rvd);

        vdev_config_dirty(tvd);

        /*
         * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
         * upward when spa_vdev_exit() calls vdev_dtl_reassess().
         */
        open_txg = txg + TXG_CONCURRENT_STATES - 1;

        mutex_enter(&newvd->vdev_dtl_lock);
        space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
            open_txg - TXG_INITIAL + 1);
        mutex_exit(&newvd->vdev_dtl_lock);

        if (newvd->vdev_isspare)
                spa_spare_activate(newvd);

        /*
         * Mark newvd's DTL dirty in this txg.
         */
        vdev_dirty(tvd, VDD_DTL, newvd, txg);

        (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

        /*
         * Kick off a resilver to update newvd.
         */
        VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
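 * When the original disk of a hot spare group is detached, the spare takes
 * its place as a real disk and is removed from the spare lists (see the
 * 'unspare' handling below).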
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
        uint64_t txg;
        int c, t, error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *pvd, *cvd, *tvd;
        boolean_t unspare = B_FALSE;
        uint64_t unspare_guid;

        txg = spa_vdev_enter(spa);

        vd = vdev_lookup_by_guid(rvd, guid);

        if (vd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));

        if (!vd->vdev_ops->vdev_op_leaf)
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

        pvd = vd->vdev_parent;

        /*
         * If replace_done is specified, only remove this device if it's
         * the first child of a replacing vdev.  For the 'spare' vdev, either
         * disk can be removed.
         */
        if (replace_done) {
                if (pvd->vdev_ops == &vdev_replacing_ops) {
                        if (vd->vdev_id != 0)
                                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
                } else if (pvd->vdev_ops != &vdev_spare_ops) {
                        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
                }
        }

        ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
            spa_version(spa) >= ZFS_VERSION_SPARES);

        /*
         * Only mirror, replacing, and spare vdevs support detach.
         */
        if (pvd->vdev_ops != &vdev_replacing_ops &&
            pvd->vdev_ops != &vdev_mirror_ops &&
            pvd->vdev_ops != &vdev_spare_ops)
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

        /*
         * If there's only one replica, you can't detach it.
         */
        if (pvd->vdev_children <= 1)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));

        /*
         * If all siblings have non-empty DTLs, this device may have the only
         * valid copy of the data, which means we cannot safely detach it.
         *
         * XXX -- as in the vdev_offline() case, we really want a more
         * precise DTL check.
         */
        for (c = 0; c < pvd->vdev_children; c++) {
                uint64_t dirty;

                cvd = pvd->vdev_child[c];
                if (cvd == vd)
                        continue;
                if (vdev_is_dead(cvd))
                        continue;
                mutex_enter(&cvd->vdev_dtl_lock);
                dirty = cvd->vdev_dtl_map.sm_space |
                    cvd->vdev_dtl_scrub.sm_space;
                mutex_exit(&cvd->vdev_dtl_lock);
                if (!dirty)
                        break;
        }

        /*
         * If we are a replacing or spare vdev, then we can always detach the
         * latter child, as that is how one cancels the operation.
         */
        if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
            c == pvd->vdev_children)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));

        /*
         * If we are detaching the original disk from a spare, then it implies
         * that the spare should become a real disk, and be removed from the
         * active spare list for the pool.
         */
        if (pvd->vdev_ops == &vdev_spare_ops &&
            vd->vdev_id == 0)
                unspare = B_TRUE;

        /*
         * Erase the disk labels so the disk can be used for other things.
         * This must be done after all other error cases are handled,
         * but before we disembowel vd (so we can still do I/O to it).
         * But if we can't do it, don't treat the error as fatal --
         * it may be that the unwritability of the disk is the reason
         * it's being detached!
         */
        error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

        /*
         * Remove vd from its parent and compact the parent's children.
         */
        vdev_remove_child(pvd, vd);
        vdev_compact_children(pvd);

        /*
         * Remember one of the remaining children so we can get tvd below.
         */
        cvd = pvd->vdev_child[0];

        /*
         * If we need to remove the remaining child from the list of hot spares,
         * do it now, marking the vdev as no longer a spare in the process.  We
         * must do this before vdev_remove_parent(), because that can change the
         * GUID if it creates a new toplevel GUID.
         */
        if (unspare) {
                ASSERT(cvd->vdev_isspare);
                spa_spare_remove(cvd);
                unspare_guid = cvd->vdev_guid;
        }

        /*
         * If the parent mirror/replacing vdev only has one child,
         * the parent is no longer needed.  Remove it from the tree.
         */
        if (pvd->vdev_children == 1)
                vdev_remove_parent(cvd);

        /*
         * We don't set tvd until now because the parent we just removed
         * may have been the previous top-level vdev.
         */
        tvd = cvd->vdev_top;
        ASSERT(tvd->vdev_parent == rvd);

        /*
         * Reevaluate the parent vdev state.
         */
        vdev_propagate_state(cvd->vdev_parent);

        /*
         * If the device we just detached was smaller than the others, it may be
         * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
         * can't fail because the existing metaslabs are already in core, so
         * there's nothing to read from disk.
         */
        VERIFY(vdev_metaslab_init(tvd, txg) == 0);

        vdev_config_dirty(tvd);

        /*
         * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
         * vd->vdev_detached is set and free vd's DTL object in syncing context.
         * But first make sure we're not on any *other* txg's DTL list, to
         * prevent vd from being accessed after it's freed.
         */
        for (t = 0; t < TXG_SIZE; t++)
                (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
        vd->vdev_detached = B_TRUE;
        vdev_dirty(tvd, VDD_DTL, vd, txg);

        error = spa_vdev_exit(spa, vd, txg, 0);

        /*
         * If this was the removal of the original device in a hot spare vdev,
         * then we want to go through and remove the device from the hot spare
         * list of every other pool.
         */
        if (unspare) {
                spa = NULL;
                mutex_enter(&spa_namespace_lock);
                while ((spa = spa_next(spa)) != NULL) {
                        if (spa->spa_state != POOL_STATE_ACTIVE)
                                continue;

                        (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
                }
                mutex_exit(&spa_namespace_lock);
        }

        return (error);
}

/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
        vdev_t *vd;
        nvlist_t **spares, *nv, **newspares;
        uint_t i, j, nspares;
        int ret = 0;

        spa_config_enter(spa, RW_WRITER, FTAG);

        vd = spa_lookup_by_guid(spa, guid);

        nv = NULL;
        if (spa->spa_spares != NULL &&
            nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) == 0) {
                for (i = 0; i < nspares; i++) {
                        uint64_t theguid;

                        VERIFY(nvlist_lookup_uint64(spares[i],
                            ZPOOL_CONFIG_GUID, &theguid) == 0);
                        if (theguid == guid) {
                                nv = spares[i];
                                break;
                        }
                }
        }

        /*
         * We only support removing a hot spare, and only if it's not currently
         * in use in this pool.
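         * Here 'nv' is the entry on the spare list (if any) and 'vd' is the
         * matching vdev in the active configuration (if any).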
2010 */ 2011 if (nv == NULL && vd == NULL) { 2012 ret = ENOENT; 2013 goto out; 2014 } 2015 2016 if (nv == NULL && vd != NULL) { 2017 ret = ENOTSUP; 2018 goto out; 2019 } 2020 2021 if (!unspare && nv != NULL && vd != NULL) { 2022 ret = EBUSY; 2023 goto out; 2024 } 2025 2026 if (nspares == 1) { 2027 newspares = NULL; 2028 } else { 2029 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2030 KM_SLEEP); 2031 for (i = 0, j = 0; i < nspares; i++) { 2032 if (spares[i] != nv) 2033 VERIFY(nvlist_dup(spares[i], 2034 &newspares[j++], KM_SLEEP) == 0); 2035 } 2036 } 2037 2038 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2039 DATA_TYPE_NVLIST_ARRAY) == 0); 2040 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2041 newspares, nspares - 1) == 0); 2042 for (i = 0; i < nspares - 1; i++) 2043 nvlist_free(newspares[i]); 2044 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2045 spa_load_spares(spa); 2046 spa->spa_sync_spares = B_TRUE; 2047 2048 out: 2049 spa_config_exit(spa, FTAG); 2050 2051 return (ret); 2052 } 2053 2054 /* 2055 * Find any device that's done replacing, so we can detach it. 2056 */ 2057 static vdev_t * 2058 spa_vdev_replace_done_hunt(vdev_t *vd) 2059 { 2060 vdev_t *newvd, *oldvd; 2061 int c; 2062 2063 for (c = 0; c < vd->vdev_children; c++) { 2064 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2065 if (oldvd != NULL) 2066 return (oldvd); 2067 } 2068 2069 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2070 oldvd = vd->vdev_child[0]; 2071 newvd = vd->vdev_child[1]; 2072 2073 mutex_enter(&newvd->vdev_dtl_lock); 2074 if (newvd->vdev_dtl_map.sm_space == 0 && 2075 newvd->vdev_dtl_scrub.sm_space == 0) { 2076 mutex_exit(&newvd->vdev_dtl_lock); 2077 return (oldvd); 2078 } 2079 mutex_exit(&newvd->vdev_dtl_lock); 2080 } 2081 2082 return (NULL); 2083 } 2084 2085 static void 2086 spa_vdev_replace_done(spa_t *spa) 2087 { 2088 vdev_t *vd; 2089 vdev_t *pvd; 2090 uint64_t guid; 2091 uint64_t pguid = 0; 2092 2093 spa_config_enter(spa, RW_READER, FTAG); 2094 2095 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2096 guid = vd->vdev_guid; 2097 /* 2098 * If we have just finished replacing a hot spared device, then 2099 * we need to detach the parent's first child (the original hot 2100 * spare) as well. 2101 */ 2102 pvd = vd->vdev_parent; 2103 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2104 pvd->vdev_id == 0) { 2105 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2106 ASSERT(pvd->vdev_parent->vdev_children == 2); 2107 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2108 } 2109 spa_config_exit(spa, FTAG); 2110 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2111 return; 2112 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2113 return; 2114 spa_config_enter(spa, RW_READER, FTAG); 2115 } 2116 2117 spa_config_exit(spa, FTAG); 2118 } 2119 2120 /* 2121 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2122 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2123 */ 2124 int 2125 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2126 { 2127 vdev_t *rvd, *vd; 2128 uint64_t txg; 2129 2130 rvd = spa->spa_root_vdev; 2131 2132 txg = spa_vdev_enter(spa); 2133 2134 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2135 /* 2136 * Determine if this is a reference to a hot spare. In that 2137 * case, update the path as stored in the spare list. 
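* A spare that is not attached to the pool has no vdev under the root vdev tree, so the new path can only be recorded in the spa_sparelist nvlist and picked up by spa_load_spares().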
2138 */ 2139 nvlist_t **spares; 2140 uint_t i, nspares; 2141 if (spa->spa_sparelist != NULL) { 2142 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2143 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2144 for (i = 0; i < nspares; i++) { 2145 uint64_t theguid; 2146 VERIFY(nvlist_lookup_uint64(spares[i], 2147 ZPOOL_CONFIG_GUID, &theguid) == 0); 2148 if (theguid == guid) 2149 break; 2150 } 2151 2152 if (i == nspares) 2153 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2154 2155 VERIFY(nvlist_add_string(spares[i], 2156 ZPOOL_CONFIG_PATH, newpath) == 0); 2157 spa_load_spares(spa); 2158 spa->spa_sync_spares = B_TRUE; 2159 return (spa_vdev_exit(spa, NULL, txg, 0)); 2160 } else { 2161 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2162 } 2163 } 2164 2165 if (!vd->vdev_ops->vdev_op_leaf) 2166 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2167 2168 spa_strfree(vd->vdev_path); 2169 vd->vdev_path = spa_strdup(newpath); 2170 2171 vdev_config_dirty(vd->vdev_top); 2172 2173 return (spa_vdev_exit(spa, NULL, txg, 0)); 2174 } 2175 2176 /* 2177 * ========================================================================== 2178 * SPA Scrubbing 2179 * ========================================================================== 2180 */ 2181 2182 static void 2183 spa_scrub_io_done(zio_t *zio) 2184 { 2185 spa_t *spa = zio->io_spa; 2186 2187 zio_data_buf_free(zio->io_data, zio->io_size); 2188 2189 mutex_enter(&spa->spa_scrub_lock); 2190 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2191 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2192 spa->spa_scrub_errors++; 2193 mutex_enter(&vd->vdev_stat_lock); 2194 vd->vdev_stat.vs_scrub_errors++; 2195 mutex_exit(&vd->vdev_stat_lock); 2196 } 2197 2198 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2199 cv_broadcast(&spa->spa_scrub_io_cv); 2200 2201 ASSERT(spa->spa_scrub_inflight >= 0); 2202 2203 mutex_exit(&spa->spa_scrub_lock); 2204 } 2205 2206 static void 2207 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2208 zbookmark_t *zb) 2209 { 2210 size_t size = BP_GET_LSIZE(bp); 2211 void *data; 2212 2213 mutex_enter(&spa->spa_scrub_lock); 2214 /* 2215 * Do not give too much work to vdev(s). 2216 */ 2217 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2218 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2219 } 2220 spa->spa_scrub_inflight++; 2221 mutex_exit(&spa->spa_scrub_lock); 2222 2223 data = zio_data_buf_alloc(size); 2224 2225 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2226 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2227 2228 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2229 2230 zio_nowait(zio_read(NULL, spa, bp, data, size, 2231 spa_scrub_io_done, NULL, priority, flags, zb)); 2232 } 2233 2234 /* ARGSUSED */ 2235 static int 2236 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2237 { 2238 blkptr_t *bp = &bc->bc_blkptr; 2239 vdev_t *vd = spa->spa_root_vdev; 2240 dva_t *dva = bp->blk_dva; 2241 int needs_resilver = B_FALSE; 2242 int d; 2243 2244 if (bc->bc_errno) { 2245 /* 2246 * We can't scrub this block, but we can continue to scrub 2247 * the rest of the pool. Note the error and move along. 
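* The error is counted both pool-wide and in the root vdev's stats so it can be reported to the user.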
2248 */ 2249 mutex_enter(&spa->spa_scrub_lock); 2250 spa->spa_scrub_errors++; 2251 mutex_exit(&spa->spa_scrub_lock); 2252 2253 mutex_enter(&vd->vdev_stat_lock); 2254 vd->vdev_stat.vs_scrub_errors++; 2255 mutex_exit(&vd->vdev_stat_lock); 2256 2257 return (ERESTART); 2258 } 2259 2260 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2261 2262 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2263 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2264 2265 ASSERT(vd != NULL); 2266 2267 /* 2268 * Keep track of how much data we've examined so that 2269 * zpool(1M) status can make useful progress reports. 2270 */ 2271 mutex_enter(&vd->vdev_stat_lock); 2272 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2273 mutex_exit(&vd->vdev_stat_lock); 2274 2275 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2276 if (DVA_GET_GANG(&dva[d])) { 2277 /* 2278 * Gang members may be spread across multiple 2279 * vdevs, so the best we can do is look at the 2280 * pool-wide DTL. 2281 * XXX -- it would be better to change our 2282 * allocation policy to ensure that this can't 2283 * happen. 2284 */ 2285 vd = spa->spa_root_vdev; 2286 } 2287 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2288 bp->blk_birth, 1)) 2289 needs_resilver = B_TRUE; 2290 } 2291 } 2292 2293 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2294 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2295 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2296 else if (needs_resilver) 2297 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2298 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2299 2300 return (0); 2301 } 2302 2303 static void 2304 spa_scrub_thread(spa_t *spa) 2305 { 2306 callb_cpr_t cprinfo; 2307 traverse_handle_t *th = spa->spa_scrub_th; 2308 vdev_t *rvd = spa->spa_root_vdev; 2309 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2310 int error = 0; 2311 boolean_t complete; 2312 2313 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2314 2315 /* 2316 * If we're restarting due to a snapshot create/delete, 2317 * wait for that to complete. 2318 */ 2319 txg_wait_synced(spa_get_dsl(spa), 0); 2320 2321 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2322 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2323 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2324 2325 spa_config_enter(spa, RW_WRITER, FTAG); 2326 vdev_reopen(rvd); /* purge all vdev caches */ 2327 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2328 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2329 spa_config_exit(spa, FTAG); 2330 2331 mutex_enter(&spa->spa_scrub_lock); 2332 spa->spa_scrub_errors = 0; 2333 spa->spa_scrub_active = 1; 2334 ASSERT(spa->spa_scrub_inflight == 0); 2335 2336 while (!spa->spa_scrub_stop) { 2337 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2338 while (spa->spa_scrub_suspended) { 2339 spa->spa_scrub_active = 0; 2340 cv_broadcast(&spa->spa_scrub_cv); 2341 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2342 spa->spa_scrub_active = 1; 2343 } 2344 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2345 2346 if (spa->spa_scrub_restart_txg != 0) 2347 break; 2348 2349 mutex_exit(&spa->spa_scrub_lock); 2350 error = traverse_more(th); 2351 mutex_enter(&spa->spa_scrub_lock); 2352 if (error != EAGAIN) 2353 break; 2354 } 2355 2356 while (spa->spa_scrub_inflight) 2357 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2358 2359 spa->spa_scrub_active = 0; 2360 cv_broadcast(&spa->spa_scrub_cv); 2361 2362 mutex_exit(&spa->spa_scrub_lock); 2363 2364 spa_config_enter(spa, RW_WRITER, FTAG); 2365 2366 mutex_enter(&spa->spa_scrub_lock); 2367 2368 /* 2369 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2370 * AND the spa config lock to synchronize with any config changes 2371 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2372 */ 2373 if (spa->spa_scrub_restart_txg != 0) 2374 error = ERESTART; 2375 2376 if (spa->spa_scrub_stop) 2377 error = EINTR; 2378 2379 /* 2380 * Even if there were uncorrectable errors, we consider the scrub 2381 * completed. The downside is that if there is a transient error during 2382 * a resilver, we won't resilver the data properly to the target. But 2383 * if the damage is permanent (more likely) we will resilver forever, 2384 * which isn't really acceptable. Since there is enough information for 2385 * the user to know what has failed and why, this seems like a more 2386 * tractable approach. 2387 */ 2388 complete = (error == 0); 2389 2390 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2391 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2392 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2393 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2394 2395 mutex_exit(&spa->spa_scrub_lock); 2396 2397 /* 2398 * If the scrub/resilver completed, update all DTLs to reflect this. 2399 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2400 */ 2401 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2402 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2403 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2404 spa_errlog_rotate(spa); 2405 2406 spa_config_exit(spa, FTAG); 2407 2408 mutex_enter(&spa->spa_scrub_lock); 2409 2410 /* 2411 * We may have finished replacing a device. 2412 * Let the async thread assess this and handle the detach. 2413 */ 2414 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2415 2416 /* 2417 * If we were told to restart, our final act is to start a new scrub. 2418 */ 2419 if (error == ERESTART) 2420 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2421 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2422 2423 spa->spa_scrub_type = POOL_SCRUB_NONE; 2424 spa->spa_scrub_active = 0; 2425 spa->spa_scrub_thread = NULL; 2426 cv_broadcast(&spa->spa_scrub_cv); 2427 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2428 thread_exit(); 2429 } 2430 2431 void 2432 spa_scrub_suspend(spa_t *spa) 2433 { 2434 mutex_enter(&spa->spa_scrub_lock); 2435 spa->spa_scrub_suspended++; 2436 while (spa->spa_scrub_active) { 2437 cv_broadcast(&spa->spa_scrub_cv); 2438 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2439 } 2440 while (spa->spa_scrub_inflight) 2441 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2442 mutex_exit(&spa->spa_scrub_lock); 2443 } 2444 2445 void 2446 spa_scrub_resume(spa_t *spa) 2447 { 2448 mutex_enter(&spa->spa_scrub_lock); 2449 ASSERT(spa->spa_scrub_suspended != 0); 2450 if (--spa->spa_scrub_suspended == 0) 2451 cv_broadcast(&spa->spa_scrub_cv); 2452 mutex_exit(&spa->spa_scrub_lock); 2453 } 2454 2455 void 2456 spa_scrub_restart(spa_t *spa, uint64_t txg) 2457 { 2458 /* 2459 * Something happened (e.g. snapshot create/delete) that means 2460 * we must restart any in-progress scrubs. The itinerary will 2461 * fix this properly. 2462 */ 2463 mutex_enter(&spa->spa_scrub_lock); 2464 spa->spa_scrub_restart_txg = txg; 2465 mutex_exit(&spa->spa_scrub_lock); 2466 } 2467 2468 int 2469 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2470 { 2471 space_seg_t *ss; 2472 uint64_t mintxg, maxtxg; 2473 vdev_t *rvd = spa->spa_root_vdev; 2474 2475 if ((uint_t)type >= POOL_SCRUB_TYPES) 2476 return (ENOTSUP); 2477 2478 mutex_enter(&spa->spa_scrub_lock); 2479 2480 /* 2481 * If there's a scrub or resilver already in progress, stop it. 2482 */ 2483 while (spa->spa_scrub_thread != NULL) { 2484 /* 2485 * Don't stop a resilver unless forced. 2486 */ 2487 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2488 mutex_exit(&spa->spa_scrub_lock); 2489 return (EBUSY); 2490 } 2491 spa->spa_scrub_stop = 1; 2492 cv_broadcast(&spa->spa_scrub_cv); 2493 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2494 } 2495 2496 /* 2497 * Terminate the previous traverse. 2498 */ 2499 if (spa->spa_scrub_th != NULL) { 2500 traverse_fini(spa->spa_scrub_th); 2501 spa->spa_scrub_th = NULL; 2502 } 2503 2504 if (rvd == NULL) { 2505 ASSERT(spa->spa_scrub_stop == 0); 2506 ASSERT(spa->spa_scrub_type == type); 2507 ASSERT(spa->spa_scrub_restart_txg == 0); 2508 mutex_exit(&spa->spa_scrub_lock); 2509 return (0); 2510 } 2511 2512 mintxg = TXG_INITIAL - 1; 2513 maxtxg = spa_last_synced_txg(spa) + 1; 2514 2515 mutex_enter(&rvd->vdev_dtl_lock); 2516 2517 if (rvd->vdev_dtl_map.sm_space == 0) { 2518 /* 2519 * The pool-wide DTL is empty. 2520 * If this is a resilver, there's nothing to do except 2521 * check whether any in-progress replacements have completed. 2522 */ 2523 if (type == POOL_SCRUB_RESILVER) { 2524 type = POOL_SCRUB_NONE; 2525 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2526 } 2527 } else { 2528 /* 2529 * The pool-wide DTL is non-empty. 2530 * If this is a normal scrub, upgrade to a resilver instead. 2531 */ 2532 if (type == POOL_SCRUB_EVERYTHING) 2533 type = POOL_SCRUB_RESILVER; 2534 } 2535 2536 if (type == POOL_SCRUB_RESILVER) { 2537 /* 2538 * Determine the resilvering boundaries. 2539 * 2540 * Note: (mintxg, maxtxg) is an open interval, 2541 * i.e. mintxg and maxtxg themselves are not included. 2542 * 2543 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2544 * so we don't claim to resilver a txg that's still changing. 
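* The bounds are taken from the first and last segments of the pool-wide DTL via avl_first() and avl_last() below.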
2545 */ 2546 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2547 mintxg = ss->ss_start - 1; 2548 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2549 maxtxg = MIN(ss->ss_end, maxtxg); 2550 } 2551 2552 mutex_exit(&rvd->vdev_dtl_lock); 2553 2554 spa->spa_scrub_stop = 0; 2555 spa->spa_scrub_type = type; 2556 spa->spa_scrub_restart_txg = 0; 2557 2558 if (type != POOL_SCRUB_NONE) { 2559 spa->spa_scrub_mintxg = mintxg; 2560 spa->spa_scrub_maxtxg = maxtxg; 2561 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2562 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2563 ZIO_FLAG_CANFAIL); 2564 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2565 spa->spa_scrub_thread = thread_create(NULL, 0, 2566 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2567 } 2568 2569 mutex_exit(&spa->spa_scrub_lock); 2570 2571 return (0); 2572 } 2573 2574 /* 2575 * ========================================================================== 2576 * SPA async task processing 2577 * ========================================================================== 2578 */ 2579 2580 static void 2581 spa_async_reopen(spa_t *spa) 2582 { 2583 vdev_t *rvd = spa->spa_root_vdev; 2584 vdev_t *tvd; 2585 int c; 2586 2587 spa_config_enter(spa, RW_WRITER, FTAG); 2588 2589 for (c = 0; c < rvd->vdev_children; c++) { 2590 tvd = rvd->vdev_child[c]; 2591 if (tvd->vdev_reopen_wanted) { 2592 tvd->vdev_reopen_wanted = 0; 2593 vdev_reopen(tvd); 2594 } 2595 } 2596 2597 spa_config_exit(spa, FTAG); 2598 } 2599 2600 static void 2601 spa_async_thread(spa_t *spa) 2602 { 2603 int tasks; 2604 2605 ASSERT(spa->spa_sync_on); 2606 2607 mutex_enter(&spa->spa_async_lock); 2608 tasks = spa->spa_async_tasks; 2609 spa->spa_async_tasks = 0; 2610 mutex_exit(&spa->spa_async_lock); 2611 2612 /* 2613 * See if the config needs to be updated. 2614 */ 2615 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2616 mutex_enter(&spa_namespace_lock); 2617 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2618 mutex_exit(&spa_namespace_lock); 2619 } 2620 2621 /* 2622 * See if any devices need to be reopened. 2623 */ 2624 if (tasks & SPA_ASYNC_REOPEN) 2625 spa_async_reopen(spa); 2626 2627 /* 2628 * If any devices are done replacing, detach them. 2629 */ 2630 if (tasks & SPA_ASYNC_REPLACE_DONE) 2631 spa_vdev_replace_done(spa); 2632 2633 /* 2634 * Kick off a scrub. 2635 */ 2636 if (tasks & SPA_ASYNC_SCRUB) 2637 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2638 2639 /* 2640 * Kick off a resilver. 2641 */ 2642 if (tasks & SPA_ASYNC_RESILVER) 2643 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2644 2645 /* 2646 * Let the world know that we're done. 
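* Clearing spa_async_thread under spa_async_lock lets spa_async_suspend() stop waiting and allows spa_async_dispatch() to start a new worker for later requests.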
2647 */ 2648 mutex_enter(&spa->spa_async_lock); 2649 spa->spa_async_thread = NULL; 2650 cv_broadcast(&spa->spa_async_cv); 2651 mutex_exit(&spa->spa_async_lock); 2652 thread_exit(); 2653 } 2654 2655 void 2656 spa_async_suspend(spa_t *spa) 2657 { 2658 mutex_enter(&spa->spa_async_lock); 2659 spa->spa_async_suspended++; 2660 while (spa->spa_async_thread != NULL) 2661 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2662 mutex_exit(&spa->spa_async_lock); 2663 } 2664 2665 void 2666 spa_async_resume(spa_t *spa) 2667 { 2668 mutex_enter(&spa->spa_async_lock); 2669 ASSERT(spa->spa_async_suspended != 0); 2670 spa->spa_async_suspended--; 2671 mutex_exit(&spa->spa_async_lock); 2672 } 2673 2674 static void 2675 spa_async_dispatch(spa_t *spa) 2676 { 2677 mutex_enter(&spa->spa_async_lock); 2678 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2679 spa->spa_async_thread == NULL && 2680 rootdir != NULL && !vn_is_readonly(rootdir)) 2681 spa->spa_async_thread = thread_create(NULL, 0, 2682 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2683 mutex_exit(&spa->spa_async_lock); 2684 } 2685 2686 void 2687 spa_async_request(spa_t *spa, int task) 2688 { 2689 mutex_enter(&spa->spa_async_lock); 2690 spa->spa_async_tasks |= task; 2691 mutex_exit(&spa->spa_async_lock); 2692 } 2693 2694 /* 2695 * ========================================================================== 2696 * SPA syncing routines 2697 * ========================================================================== 2698 */ 2699 2700 static void 2701 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2702 { 2703 bplist_t *bpl = &spa->spa_sync_bplist; 2704 dmu_tx_t *tx; 2705 blkptr_t blk; 2706 uint64_t itor = 0; 2707 zio_t *zio; 2708 int error; 2709 uint8_t c = 1; 2710 2711 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2712 2713 while (bplist_iterate(bpl, &itor, &blk) == 0) 2714 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2715 2716 error = zio_wait(zio); 2717 ASSERT3U(error, ==, 0); 2718 2719 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2720 bplist_vacate(bpl, tx); 2721 2722 /* 2723 * Pre-dirty the first block so we sync to convergence faster. 2724 * (Usually only the first block is needed.) 2725 */ 2726 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2727 dmu_tx_commit(tx); 2728 } 2729 2730 static void 2731 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2732 { 2733 char *packed = NULL; 2734 size_t nvsize = 0; 2735 dmu_buf_t *db; 2736 2737 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2738 2739 packed = kmem_alloc(nvsize, KM_SLEEP); 2740 2741 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2742 KM_SLEEP) == 0); 2743 2744 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2745 2746 kmem_free(packed, nvsize); 2747 2748 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2749 dmu_buf_will_dirty(db, tx); 2750 *(uint64_t *)db->db_data = nvsize; 2751 dmu_buf_rele(db, FTAG); 2752 } 2753 2754 static void 2755 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2756 { 2757 nvlist_t *nvroot; 2758 nvlist_t **spares; 2759 int i; 2760 2761 if (!spa->spa_sync_spares) 2762 return; 2763 2764 /* 2765 * Update the MOS nvlist describing the list of available spares. 2766 * spa_validate_spares() will have already made sure this nvlist is 2767 * valid and the vdevs are labelled appropriately. 
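* The list is stored as a packed nvlist in a single MOS object, referenced from the pool directory under DMU_POOL_SPARES.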
2768 */ 2769 if (spa->spa_spares_object == 0) { 2770 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2771 DMU_OT_PACKED_NVLIST, 1 << 14, 2772 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2773 VERIFY(zap_update(spa->spa_meta_objset, 2774 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2775 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2776 } 2777 2778 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2779 if (spa->spa_nspares == 0) { 2780 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2781 NULL, 0) == 0); 2782 } else { 2783 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2784 KM_SLEEP); 2785 for (i = 0; i < spa->spa_nspares; i++) 2786 spares[i] = vdev_config_generate(spa, 2787 spa->spa_spares[i], B_FALSE, B_TRUE); 2788 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2789 spares, spa->spa_nspares) == 0); 2790 for (i = 0; i < spa->spa_nspares; i++) 2791 nvlist_free(spares[i]); 2792 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2793 } 2794 2795 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2796 nvlist_free(nvroot); 2797 2798 spa->spa_sync_spares = B_FALSE; 2799 } 2800 2801 static void 2802 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2803 { 2804 nvlist_t *config; 2805 2806 if (list_is_empty(&spa->spa_dirty_list)) 2807 return; 2808 2809 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2810 2811 if (spa->spa_config_syncing) 2812 nvlist_free(spa->spa_config_syncing); 2813 spa->spa_config_syncing = config; 2814 2815 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2816 } 2817 2818 /* 2819 * Sync the specified transaction group. New blocks may be dirtied as 2820 * part of the process, so we iterate until it converges. 2821 */ 2822 void 2823 spa_sync(spa_t *spa, uint64_t txg) 2824 { 2825 dsl_pool_t *dp = spa->spa_dsl_pool; 2826 objset_t *mos = spa->spa_meta_objset; 2827 bplist_t *bpl = &spa->spa_sync_bplist; 2828 vdev_t *rvd = spa->spa_root_vdev; 2829 vdev_t *vd; 2830 dmu_tx_t *tx; 2831 int dirty_vdevs; 2832 2833 /* 2834 * Lock out configuration changes. 2835 */ 2836 spa_config_enter(spa, RW_READER, FTAG); 2837 2838 spa->spa_syncing_txg = txg; 2839 spa->spa_sync_pass = 0; 2840 2841 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2842 2843 tx = dmu_tx_create_assigned(dp, txg); 2844 2845 /* 2846 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2847 * set spa_deflate if we have no raid-z vdevs. 2848 */ 2849 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2850 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2851 int i; 2852 2853 for (i = 0; i < rvd->vdev_children; i++) { 2854 vd = rvd->vdev_child[i]; 2855 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2856 break; 2857 } 2858 if (i == rvd->vdev_children) { 2859 spa->spa_deflate = TRUE; 2860 VERIFY(0 == zap_add(spa->spa_meta_objset, 2861 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2862 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2863 } 2864 } 2865 2866 /* 2867 * If anything has changed in this txg, push the deferred frees 2868 * from the previous txg. If not, leave them alone so that we 2869 * don't generate work on an otherwise idle system. 2870 */ 2871 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2872 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2873 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2874 spa_sync_deferred_frees(spa, txg); 2875 2876 /* 2877 * Iterate to convergence. 
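* Each pass can dirty additional metadata, so we keep syncing until a pass completes with no dirty vdevs.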
2878 */ 2879 do { 2880 spa->spa_sync_pass++; 2881 2882 spa_sync_config_object(spa, tx); 2883 spa_sync_spares(spa, tx); 2884 spa_errlog_sync(spa, txg); 2885 dsl_pool_sync(dp, txg); 2886 2887 dirty_vdevs = 0; 2888 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2889 vdev_sync(vd, txg); 2890 dirty_vdevs++; 2891 } 2892 2893 bplist_sync(bpl, tx); 2894 } while (dirty_vdevs); 2895 2896 bplist_close(bpl); 2897 2898 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2899 2900 /* 2901 * Rewrite the vdev configuration (which includes the uberblock) 2902 * to commit the transaction group. 2903 * 2904 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2905 * Otherwise, pick a random top-level vdev that's known to be 2906 * visible in the config cache (see spa_vdev_add() for details). 2907 * If the write fails, try the next vdev until we've tried them all. 2908 */ 2909 if (!list_is_empty(&spa->spa_dirty_list)) { 2910 VERIFY(vdev_config_sync(rvd, txg) == 0); 2911 } else { 2912 int children = rvd->vdev_children; 2913 int c0 = spa_get_random(children); 2914 int c; 2915 2916 for (c = 0; c < children; c++) { 2917 vd = rvd->vdev_child[(c0 + c) % children]; 2918 if (vd->vdev_ms_array == 0) 2919 continue; 2920 if (vdev_config_sync(vd, txg) == 0) 2921 break; 2922 } 2923 if (c == children) 2924 VERIFY(vdev_config_sync(rvd, txg) == 0); 2925 } 2926 2927 dmu_tx_commit(tx); 2928 2929 /* 2930 * Clear the dirty config list. 2931 */ 2932 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2933 vdev_config_clean(vd); 2934 2935 /* 2936 * Now that the new config has synced transactionally, 2937 * let it become visible to the config cache. 2938 */ 2939 if (spa->spa_config_syncing != NULL) { 2940 spa_config_set(spa, spa->spa_config_syncing); 2941 spa->spa_config_txg = txg; 2942 spa->spa_config_syncing = NULL; 2943 } 2944 2945 /* 2946 * Make a stable copy of the fully synced uberblock. 2947 * We use this as the root for pool traversals. 2948 */ 2949 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2950 2951 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2952 2953 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2954 spa->spa_traverse_wanted = 0; 2955 spa->spa_ubsync = spa->spa_uberblock; 2956 rw_exit(&spa->spa_traverse_lock); 2957 2958 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2959 2960 /* 2961 * Clean up the ZIL records for the synced txg. 2962 */ 2963 dsl_pool_zil_clean(dp); 2964 2965 /* 2966 * Update usable space statistics. 2967 */ 2968 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2969 vdev_sync_done(vd, txg); 2970 2971 /* 2972 * It had better be the case that we didn't dirty anything 2973 * since vdev_config_sync(). 2974 */ 2975 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2976 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2977 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2978 ASSERT(bpl->bpl_queue == NULL); 2979 2980 spa_config_exit(spa, FTAG); 2981 2982 /* 2983 * If any async tasks have been requested, kick them off. 2984 */ 2985 spa_async_dispatch(spa); 2986 } 2987 2988 /* 2989 * Sync all pools. We don't want to hold the namespace lock across these 2990 * operations, so we take a reference on the spa_t and drop the lock during the 2991 * sync.
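* The extra reference keeps the spa_t from being removed while spa_namespace_lock is dropped; it is released once the lock is reacquired.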
2992 */ 2993 void 2994 spa_sync_allpools(void) 2995 { 2996 spa_t *spa = NULL; 2997 mutex_enter(&spa_namespace_lock); 2998 while ((spa = spa_next(spa)) != NULL) { 2999 if (spa_state(spa) != POOL_STATE_ACTIVE) 3000 continue; 3001 spa_open_ref(spa, FTAG); 3002 mutex_exit(&spa_namespace_lock); 3003 txg_wait_synced(spa_get_dsl(spa), 0); 3004 mutex_enter(&spa_namespace_lock); 3005 spa_close(spa, FTAG); 3006 } 3007 mutex_exit(&spa_namespace_lock); 3008 } 3009 3010 /* 3011 * ========================================================================== 3012 * Miscellaneous routines 3013 * ========================================================================== 3014 */ 3015 3016 /* 3017 * Remove all pools in the system. 3018 */ 3019 void 3020 spa_evict_all(void) 3021 { 3022 spa_t *spa; 3023 3024 /* 3025 * Remove all cached state. All pools should be closed now, 3026 * so every spa in the AVL tree should be unreferenced. 3027 */ 3028 mutex_enter(&spa_namespace_lock); 3029 while ((spa = spa_next(NULL)) != NULL) { 3030 /* 3031 * Stop async tasks. The async thread may need to detach 3032 * a device that's been replaced, which requires grabbing 3033 * spa_namespace_lock, so we must drop it here. 3034 */ 3035 spa_open_ref(spa, FTAG); 3036 mutex_exit(&spa_namespace_lock); 3037 spa_async_suspend(spa); 3038 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3039 mutex_enter(&spa_namespace_lock); 3040 spa_close(spa, FTAG); 3041 3042 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3043 spa_unload(spa); 3044 spa_deactivate(spa); 3045 } 3046 spa_remove(spa); 3047 } 3048 mutex_exit(&spa_namespace_lock); 3049 } 3050 3051 vdev_t * 3052 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3053 { 3054 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3055 } 3056 3057 void 3058 spa_upgrade(spa_t *spa) 3059 { 3060 spa_config_enter(spa, RW_WRITER, FTAG); 3061 3062 /* 3063 * This should only be called for a non-faulted pool, and since a 3064 * future version would result in an unopenable pool, this shouldn't be 3065 * possible. 3066 */ 3067 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3068 3069 spa->spa_uberblock.ub_version = ZFS_VERSION; 3070 vdev_config_dirty(spa->spa_root_vdev); 3071 3072 spa_config_exit(spa, FTAG); 3073 3074 txg_wait_synced(spa_get_dsl(spa), 0); 3075 } 3076 3077 boolean_t 3078 spa_has_spare(spa_t *spa, uint64_t guid) 3079 { 3080 int i; 3081 uint64_t spareguid; 3082 3083 for (i = 0; i < spa->spa_nspares; i++) 3084 if (spa->spa_spares[i]->vdev_guid == guid) 3085 return (B_TRUE); 3086 3087 for (i = 0; i < spa->spa_pending_nspares; i++) { 3088 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3089 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3090 spareguid == guid) 3091 return (B_TRUE); 3092 } 3093 3094 return (B_FALSE); 3095 } 3096