1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/unique.h> 52 #include <sys/dsl_pool.h> 53 #include <sys/dsl_dir.h> 54 #include <sys/dsl_prop.h> 55 #include <sys/fs/zfs.h> 56 #include <sys/callb.h> 57 58 int zio_taskq_threads = 8; 59 60 /* 61 * ========================================================================== 62 * SPA state manipulation (open/create/destroy/import/export) 63 * ========================================================================== 64 */ 65 66 static int 67 spa_error_entry_compare(const void *a, const void *b) 68 { 69 spa_error_entry_t *sa = (spa_error_entry_t *)a; 70 spa_error_entry_t *sb = (spa_error_entry_t *)b; 71 int ret; 72 73 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 74 sizeof (zbookmark_t)); 75 76 if (ret < 0) 77 return (-1); 78 else if (ret > 0) 79 return (1); 80 else 81 return (0); 82 } 83 84 /* 85 * Utility function which retrieves copies of the current logs and 86 * re-initializes them in the process. 87 */ 88 void 89 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 90 { 91 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 92 93 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 94 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 95 96 avl_create(&spa->spa_errlist_scrub, 97 spa_error_entry_compare, sizeof (spa_error_entry_t), 98 offsetof(spa_error_entry_t, se_avl)); 99 avl_create(&spa->spa_errlist_last, 100 spa_error_entry_compare, sizeof (spa_error_entry_t), 101 offsetof(spa_error_entry_t, se_avl)); 102 } 103 104 /* 105 * Activate an uninitialized pool. 
106 */ 107 static void 108 spa_activate(spa_t *spa) 109 { 110 int t; 111 112 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 113 114 spa->spa_state = POOL_STATE_ACTIVE; 115 116 spa->spa_normal_class = metaslab_class_create(); 117 118 for (t = 0; t < ZIO_TYPES; t++) { 119 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 120 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 121 TASKQ_PREPOPULATE); 122 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 123 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 124 TASKQ_PREPOPULATE); 125 } 126 127 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 128 129 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 130 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 131 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 132 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 133 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 134 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 135 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 136 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 137 138 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 139 offsetof(vdev_t, vdev_dirty_node)); 140 141 txg_list_create(&spa->spa_vdev_txg_list, 142 offsetof(struct vdev, vdev_txg_node)); 143 144 avl_create(&spa->spa_errlist_scrub, 145 spa_error_entry_compare, sizeof (spa_error_entry_t), 146 offsetof(spa_error_entry_t, se_avl)); 147 avl_create(&spa->spa_errlist_last, 148 spa_error_entry_compare, sizeof (spa_error_entry_t), 149 offsetof(spa_error_entry_t, se_avl)); 150 } 151 152 /* 153 * Opposite of spa_activate(). 154 */ 155 static void 156 spa_deactivate(spa_t *spa) 157 { 158 int t; 159 160 ASSERT(spa->spa_sync_on == B_FALSE); 161 ASSERT(spa->spa_dsl_pool == NULL); 162 ASSERT(spa->spa_root_vdev == NULL); 163 164 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 165 166 txg_list_destroy(&spa->spa_vdev_txg_list); 167 168 list_destroy(&spa->spa_dirty_list); 169 170 rw_destroy(&spa->spa_traverse_lock); 171 172 for (t = 0; t < ZIO_TYPES; t++) { 173 taskq_destroy(spa->spa_zio_issue_taskq[t]); 174 taskq_destroy(spa->spa_zio_intr_taskq[t]); 175 spa->spa_zio_issue_taskq[t] = NULL; 176 spa->spa_zio_intr_taskq[t] = NULL; 177 } 178 179 metaslab_class_destroy(spa->spa_normal_class); 180 spa->spa_normal_class = NULL; 181 182 /* 183 * If this was part of an import or the open otherwise failed, we may 184 * still have errors left in the queues. Empty them just in case. 185 */ 186 spa_errlog_drain(spa); 187 188 avl_destroy(&spa->spa_errlist_scrub); 189 avl_destroy(&spa->spa_errlist_last); 190 191 spa->spa_state = POOL_STATE_UNINITIALIZED; 192 } 193 194 /* 195 * Verify a pool configuration, and construct the vdev tree appropriately. This 196 * will create all the necessary vdevs in the appropriate layout, with each vdev 197 * in the CLOSED state. This will prep the pool before open/creation/import. 198 * All vdev validation is done by the vdev_alloc() routine. 
199 */ 200 static int 201 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 202 uint_t id, int atype) 203 { 204 nvlist_t **child; 205 uint_t c, children; 206 int error; 207 208 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 209 return (error); 210 211 if ((*vdp)->vdev_ops->vdev_op_leaf) 212 return (0); 213 214 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 215 &child, &children) != 0) { 216 vdev_free(*vdp); 217 *vdp = NULL; 218 return (EINVAL); 219 } 220 221 for (c = 0; c < children; c++) { 222 vdev_t *vd; 223 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 224 atype)) != 0) { 225 vdev_free(*vdp); 226 *vdp = NULL; 227 return (error); 228 } 229 } 230 231 ASSERT(*vdp != NULL); 232 233 return (0); 234 } 235 236 /* 237 * Opposite of spa_load(). 238 */ 239 static void 240 spa_unload(spa_t *spa) 241 { 242 int i; 243 244 /* 245 * Stop async tasks. 246 */ 247 spa_async_suspend(spa); 248 249 /* 250 * Stop syncing. 251 */ 252 if (spa->spa_sync_on) { 253 txg_sync_stop(spa->spa_dsl_pool); 254 spa->spa_sync_on = B_FALSE; 255 } 256 257 /* 258 * Wait for any outstanding prefetch I/O to complete. 259 */ 260 spa_config_enter(spa, RW_WRITER, FTAG); 261 spa_config_exit(spa, FTAG); 262 263 /* 264 * Close the dsl pool. 265 */ 266 if (spa->spa_dsl_pool) { 267 dsl_pool_close(spa->spa_dsl_pool); 268 spa->spa_dsl_pool = NULL; 269 } 270 271 /* 272 * Close all vdevs. 273 */ 274 if (spa->spa_root_vdev) 275 vdev_free(spa->spa_root_vdev); 276 ASSERT(spa->spa_root_vdev == NULL); 277 278 for (i = 0; i < spa->spa_nspares; i++) 279 vdev_free(spa->spa_spares[i]); 280 if (spa->spa_spares) { 281 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 282 spa->spa_spares = NULL; 283 } 284 if (spa->spa_sparelist) { 285 nvlist_free(spa->spa_sparelist); 286 spa->spa_sparelist = NULL; 287 } 288 289 spa->spa_async_suspended = 0; 290 } 291 292 /* 293 * Load (or re-load) the current list of vdevs describing the active spares for 294 * this pool. When this is called, we have some form of basic information in 295 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 296 * re-generate a more complete list including status information. 297 */ 298 static void 299 spa_load_spares(spa_t *spa) 300 { 301 nvlist_t **spares; 302 uint_t nspares; 303 int i; 304 vdev_t *vd, *tvd; 305 306 /* 307 * First, close and free any existing spare vdevs. 308 */ 309 for (i = 0; i < spa->spa_nspares; i++) { 310 vd = spa->spa_spares[i]; 311 312 /* Undo the call to spa_activate() below */ 313 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 314 tvd->vdev_isspare) 315 spa_spare_remove(tvd); 316 vdev_close(vd); 317 vdev_free(vd); 318 } 319 320 if (spa->spa_spares) 321 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 322 323 if (spa->spa_sparelist == NULL) 324 nspares = 0; 325 else 326 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 327 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 328 329 spa->spa_nspares = (int)nspares; 330 spa->spa_spares = NULL; 331 332 if (nspares == 0) 333 return; 334 335 /* 336 * Construct the array of vdevs, opening them to get status in the 337 * process. For each spare, there is potentially two different vdev_t 338 * structures associated with it: one in the list of spares (used only 339 * for basic validation purposes) and one in the active vdev 340 * configuration (if it's spared in). During this phase we open and 341 * validate each vdev on the spare list. 
If the vdev also exists in the 342 * active configuration, then we also mark this vdev as an active spare. 343 */ 344 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 345 for (i = 0; i < spa->spa_nspares; i++) { 346 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 347 VDEV_ALLOC_SPARE) == 0); 348 ASSERT(vd != NULL); 349 350 spa->spa_spares[i] = vd; 351 352 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 353 if (!tvd->vdev_isspare) 354 spa_spare_add(tvd); 355 356 /* 357 * We only mark the spare active if we were successfully 358 * able to load the vdev. Otherwise, importing a pool 359 * with a bad active spare would result in strange 360 * behavior, because multiple pool would think the spare 361 * is actively in use. 362 * 363 * There is a vulnerability here to an equally bizarre 364 * circumstance, where a dead active spare is later 365 * brought back to life (onlined or otherwise). Given 366 * the rarity of this scenario, and the extra complexity 367 * it adds, we ignore the possibility. 368 */ 369 if (!vdev_is_dead(tvd)) 370 spa_spare_activate(tvd); 371 } 372 373 if (vdev_open(vd) != 0) 374 continue; 375 376 vd->vdev_top = vd; 377 (void) vdev_validate_spare(vd); 378 } 379 380 /* 381 * Recompute the stashed list of spares, with status information 382 * this time. 383 */ 384 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 385 DATA_TYPE_NVLIST_ARRAY) == 0); 386 387 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 388 for (i = 0; i < spa->spa_nspares; i++) 389 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 390 B_TRUE, B_TRUE); 391 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 392 spares, spa->spa_nspares) == 0); 393 for (i = 0; i < spa->spa_nspares; i++) 394 nvlist_free(spares[i]); 395 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 396 } 397 398 static int 399 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 400 { 401 dmu_buf_t *db; 402 char *packed = NULL; 403 size_t nvsize = 0; 404 int error; 405 *value = NULL; 406 407 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 408 nvsize = *(uint64_t *)db->db_data; 409 dmu_buf_rele(db, FTAG); 410 411 packed = kmem_alloc(nvsize, KM_SLEEP); 412 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 413 if (error == 0) 414 error = nvlist_unpack(packed, nvsize, value, 0); 415 kmem_free(packed, nvsize); 416 417 return (error); 418 } 419 420 /* 421 * Load an existing storage pool, using the pool's builtin spa_config as a 422 * source of configuration information. 423 */ 424 static int 425 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 426 { 427 int error = 0; 428 nvlist_t *nvroot = NULL; 429 vdev_t *rvd; 430 uberblock_t *ub = &spa->spa_uberblock; 431 uint64_t config_cache_txg = spa->spa_config_txg; 432 uint64_t pool_guid; 433 uint64_t version; 434 zio_t *zio; 435 436 spa->spa_load_state = state; 437 438 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 439 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 440 error = EINVAL; 441 goto out; 442 } 443 444 /* 445 * Versioning wasn't explicitly added to the label until later, so if 446 * it's not present treat it as the initial version. 
447 */ 448 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 449 version = ZFS_VERSION_INITIAL; 450 451 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 452 &spa->spa_config_txg); 453 454 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 455 spa_guid_exists(pool_guid, 0)) { 456 error = EEXIST; 457 goto out; 458 } 459 460 spa->spa_load_guid = pool_guid; 461 462 /* 463 * Parse the configuration into a vdev tree. We explicitly set the 464 * value that will be returned by spa_version() since parsing the 465 * configuration requires knowing the version number. 466 */ 467 spa_config_enter(spa, RW_WRITER, FTAG); 468 spa->spa_ubsync.ub_version = version; 469 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 470 spa_config_exit(spa, FTAG); 471 472 if (error != 0) 473 goto out; 474 475 ASSERT(spa->spa_root_vdev == rvd); 476 ASSERT(spa_guid(spa) == pool_guid); 477 478 /* 479 * Try to open all vdevs, loading each label in the process. 480 */ 481 if (vdev_open(rvd) != 0) { 482 error = ENXIO; 483 goto out; 484 } 485 486 /* 487 * Validate the labels for all leaf vdevs. We need to grab the config 488 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 489 * flag. 490 */ 491 spa_config_enter(spa, RW_READER, FTAG); 492 error = vdev_validate(rvd); 493 spa_config_exit(spa, FTAG); 494 495 if (error != 0) { 496 error = EBADF; 497 goto out; 498 } 499 500 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 501 error = ENXIO; 502 goto out; 503 } 504 505 /* 506 * Find the best uberblock. 507 */ 508 bzero(ub, sizeof (uberblock_t)); 509 510 zio = zio_root(spa, NULL, NULL, 511 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 512 vdev_uberblock_load(zio, rvd, ub); 513 error = zio_wait(zio); 514 515 /* 516 * If we weren't able to find a single valid uberblock, return failure. 517 */ 518 if (ub->ub_txg == 0) { 519 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 520 VDEV_AUX_CORRUPT_DATA); 521 error = ENXIO; 522 goto out; 523 } 524 525 /* 526 * If the pool is newer than the code, we can't open it. 527 */ 528 if (ub->ub_version > ZFS_VERSION) { 529 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 530 VDEV_AUX_VERSION_NEWER); 531 error = ENOTSUP; 532 goto out; 533 } 534 535 /* 536 * If the vdev guid sum doesn't match the uberblock, we have an 537 * incomplete configuration. 538 */ 539 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 540 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 541 VDEV_AUX_BAD_GUID_SUM); 542 error = ENXIO; 543 goto out; 544 } 545 546 /* 547 * Initialize internal SPA structures. 
548 */ 549 spa->spa_state = POOL_STATE_ACTIVE; 550 spa->spa_ubsync = spa->spa_uberblock; 551 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 552 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 553 if (error) { 554 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 555 VDEV_AUX_CORRUPT_DATA); 556 goto out; 557 } 558 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 559 560 if (zap_lookup(spa->spa_meta_objset, 561 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 562 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 563 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 564 VDEV_AUX_CORRUPT_DATA); 565 error = EIO; 566 goto out; 567 } 568 569 if (!mosconfig) { 570 nvlist_t *newconfig; 571 572 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 573 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 574 VDEV_AUX_CORRUPT_DATA); 575 error = EIO; 576 goto out; 577 } 578 579 spa_config_set(spa, newconfig); 580 spa_unload(spa); 581 spa_deactivate(spa); 582 spa_activate(spa); 583 584 return (spa_load(spa, newconfig, state, B_TRUE)); 585 } 586 587 if (zap_lookup(spa->spa_meta_objset, 588 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 589 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 590 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 591 VDEV_AUX_CORRUPT_DATA); 592 error = EIO; 593 goto out; 594 } 595 596 /* 597 * Load the bit that tells us to use the new accounting function 598 * (raid-z deflation). If we have an older pool, this will not 599 * be present. 600 */ 601 error = zap_lookup(spa->spa_meta_objset, 602 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 603 sizeof (uint64_t), 1, &spa->spa_deflate); 604 if (error != 0 && error != ENOENT) { 605 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 606 VDEV_AUX_CORRUPT_DATA); 607 error = EIO; 608 goto out; 609 } 610 611 /* 612 * Load the persistent error log. If we have an older pool, this will 613 * not be present. 614 */ 615 error = zap_lookup(spa->spa_meta_objset, 616 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 617 sizeof (uint64_t), 1, &spa->spa_errlog_last); 618 if (error != 0 && error != ENOENT) { 619 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 620 VDEV_AUX_CORRUPT_DATA); 621 error = EIO; 622 goto out; 623 } 624 625 error = zap_lookup(spa->spa_meta_objset, 626 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 627 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 628 if (error != 0 && error != ENOENT) { 629 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 630 VDEV_AUX_CORRUPT_DATA); 631 error = EIO; 632 goto out; 633 } 634 635 /* 636 * Load the history object. If we have an older pool, this 637 * will not be present. 638 */ 639 error = zap_lookup(spa->spa_meta_objset, 640 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 641 sizeof (uint64_t), 1, &spa->spa_history); 642 if (error != 0 && error != ENOENT) { 643 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 644 VDEV_AUX_CORRUPT_DATA); 645 error = EIO; 646 goto out; 647 } 648 649 /* 650 * Load any hot spares for this pool. 
651 */ 652 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 653 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 654 if (error != 0 && error != ENOENT) { 655 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 656 VDEV_AUX_CORRUPT_DATA); 657 error = EIO; 658 goto out; 659 } 660 if (error == 0) { 661 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 662 if (load_nvlist(spa, spa->spa_spares_object, 663 &spa->spa_sparelist) != 0) { 664 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 665 VDEV_AUX_CORRUPT_DATA); 666 error = EIO; 667 goto out; 668 } 669 670 spa_config_enter(spa, RW_WRITER, FTAG); 671 spa_load_spares(spa); 672 spa_config_exit(spa, FTAG); 673 } 674 675 /* 676 * Load the vdev state for all toplevel vdevs. 677 */ 678 vdev_load(rvd); 679 680 /* 681 * Propagate the leaf DTLs we just loaded all the way up the tree. 682 */ 683 spa_config_enter(spa, RW_WRITER, FTAG); 684 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 685 spa_config_exit(spa, FTAG); 686 687 /* 688 * Check the state of the root vdev. If it can't be opened, it 689 * indicates one or more toplevel vdevs are faulted. 690 */ 691 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 692 error = ENXIO; 693 goto out; 694 } 695 696 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 697 dmu_tx_t *tx; 698 int need_update = B_FALSE; 699 int c; 700 701 /* 702 * Claim log blocks that haven't been committed yet. 703 * This must all happen in a single txg. 704 */ 705 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 706 spa_first_txg(spa)); 707 (void) dmu_objset_find(spa->spa_name, 708 zil_claim, tx, DS_FIND_CHILDREN); 709 dmu_tx_commit(tx); 710 711 spa->spa_sync_on = B_TRUE; 712 txg_sync_start(spa->spa_dsl_pool); 713 714 /* 715 * Wait for all claims to sync. 716 */ 717 txg_wait_synced(spa->spa_dsl_pool, 0); 718 719 /* 720 * If the config cache is stale, or we have uninitialized 721 * metaslabs (see spa_vdev_add()), then update the config. 722 */ 723 if (config_cache_txg != spa->spa_config_txg || 724 state == SPA_LOAD_IMPORT) 725 need_update = B_TRUE; 726 727 for (c = 0; c < rvd->vdev_children; c++) 728 if (rvd->vdev_child[c]->vdev_ms_array == 0) 729 need_update = B_TRUE; 730 731 /* 732 * Update the config cache asychronously in case we're the 733 * root pool, in which case the config cache isn't writable yet. 734 */ 735 if (need_update) 736 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 737 } 738 739 error = 0; 740 out: 741 if (error && error != EBADF) 742 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 743 spa->spa_load_state = SPA_LOAD_NONE; 744 spa->spa_ena = 0; 745 746 return (error); 747 } 748 749 /* 750 * Pool Open/Import 751 * 752 * The import case is identical to an open except that the configuration is sent 753 * down from userland, instead of grabbed from the configuration cache. For the 754 * case of an open, the pool configuration will exist in the 755 * POOL_STATE_UNITIALIZED state. 756 * 757 * The stats information (gen/count/ustats) is used to gather vdev statistics at 758 * the same time open the pool, without having to keep around the spa_t in some 759 * ambiguous state. 760 */ 761 static int 762 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 763 { 764 spa_t *spa; 765 int error; 766 int loaded = B_FALSE; 767 int locked = B_FALSE; 768 769 *spapp = NULL; 770 771 /* 772 * As disgusting as this is, we need to support recursive calls to this 773 * function because dsl_dir_open() is called during spa_load(), and ends 774 * up calling spa_open() again. 
The real fix is to figure out how to 775 * avoid dsl_dir_open() calling this in the first place. 776 */ 777 if (mutex_owner(&spa_namespace_lock) != curthread) { 778 mutex_enter(&spa_namespace_lock); 779 locked = B_TRUE; 780 } 781 782 if ((spa = spa_lookup(pool)) == NULL) { 783 if (locked) 784 mutex_exit(&spa_namespace_lock); 785 return (ENOENT); 786 } 787 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 788 789 spa_activate(spa); 790 791 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 792 793 if (error == EBADF) { 794 /* 795 * If vdev_validate() returns failure (indicated by 796 * EBADF), it indicates that one of the vdevs indicates 797 * that the pool has been exported or destroyed. If 798 * this is the case, the config cache is out of sync and 799 * we should remove the pool from the namespace. 800 */ 801 zfs_post_ok(spa, NULL); 802 spa_unload(spa); 803 spa_deactivate(spa); 804 spa_remove(spa); 805 spa_config_sync(); 806 if (locked) 807 mutex_exit(&spa_namespace_lock); 808 return (ENOENT); 809 } 810 811 if (error) { 812 /* 813 * We can't open the pool, but we still have useful 814 * information: the state of each vdev after the 815 * attempted vdev_open(). Return this to the user. 816 */ 817 if (config != NULL && spa->spa_root_vdev != NULL) { 818 spa_config_enter(spa, RW_READER, FTAG); 819 *config = spa_config_generate(spa, NULL, -1ULL, 820 B_TRUE); 821 spa_config_exit(spa, FTAG); 822 } 823 spa_unload(spa); 824 spa_deactivate(spa); 825 spa->spa_last_open_failed = B_TRUE; 826 if (locked) 827 mutex_exit(&spa_namespace_lock); 828 *spapp = NULL; 829 return (error); 830 } else { 831 zfs_post_ok(spa, NULL); 832 spa->spa_last_open_failed = B_FALSE; 833 } 834 835 loaded = B_TRUE; 836 } 837 838 spa_open_ref(spa, tag); 839 if (locked) 840 mutex_exit(&spa_namespace_lock); 841 842 *spapp = spa; 843 844 if (config != NULL) { 845 spa_config_enter(spa, RW_READER, FTAG); 846 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 847 spa_config_exit(spa, FTAG); 848 } 849 850 /* 851 * If we just loaded the pool, resilver anything that's out of date. 852 */ 853 if (loaded && (spa_mode & FWRITE)) 854 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 855 856 return (0); 857 } 858 859 int 860 spa_open(const char *name, spa_t **spapp, void *tag) 861 { 862 return (spa_open_common(name, spapp, tag, NULL)); 863 } 864 865 /* 866 * Lookup the given spa_t, incrementing the inject count in the process, 867 * preventing it from being exported or destroyed. 
868 */ 869 spa_t * 870 spa_inject_addref(char *name) 871 { 872 spa_t *spa; 873 874 mutex_enter(&spa_namespace_lock); 875 if ((spa = spa_lookup(name)) == NULL) { 876 mutex_exit(&spa_namespace_lock); 877 return (NULL); 878 } 879 spa->spa_inject_ref++; 880 mutex_exit(&spa_namespace_lock); 881 882 return (spa); 883 } 884 885 void 886 spa_inject_delref(spa_t *spa) 887 { 888 mutex_enter(&spa_namespace_lock); 889 spa->spa_inject_ref--; 890 mutex_exit(&spa_namespace_lock); 891 } 892 893 static void 894 spa_add_spares(spa_t *spa, nvlist_t *config) 895 { 896 nvlist_t **spares; 897 uint_t i, nspares; 898 nvlist_t *nvroot; 899 uint64_t guid; 900 vdev_stat_t *vs; 901 uint_t vsc; 902 uint64_t pool; 903 904 if (spa->spa_nspares == 0) 905 return; 906 907 VERIFY(nvlist_lookup_nvlist(config, 908 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 909 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 910 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 911 if (nspares != 0) { 912 VERIFY(nvlist_add_nvlist_array(nvroot, 913 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 914 VERIFY(nvlist_lookup_nvlist_array(nvroot, 915 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 916 917 /* 918 * Go through and find any spares which have since been 919 * repurposed as an active spare. If this is the case, update 920 * their status appropriately. 921 */ 922 for (i = 0; i < nspares; i++) { 923 VERIFY(nvlist_lookup_uint64(spares[i], 924 ZPOOL_CONFIG_GUID, &guid) == 0); 925 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 926 VERIFY(nvlist_lookup_uint64_array( 927 spares[i], ZPOOL_CONFIG_STATS, 928 (uint64_t **)&vs, &vsc) == 0); 929 vs->vs_state = VDEV_STATE_CANT_OPEN; 930 vs->vs_aux = VDEV_AUX_SPARED; 931 } 932 } 933 } 934 } 935 936 int 937 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 938 { 939 int error; 940 spa_t *spa; 941 942 *config = NULL; 943 error = spa_open_common(name, &spa, FTAG, config); 944 945 if (spa && *config != NULL) { 946 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 947 spa_get_errlog_size(spa)) == 0); 948 949 spa_add_spares(spa, *config); 950 } 951 952 /* 953 * We want to get the alternate root even for faulted pools, so we cheat 954 * and call spa_lookup() directly. 955 */ 956 if (altroot) { 957 if (spa == NULL) { 958 mutex_enter(&spa_namespace_lock); 959 spa = spa_lookup(name); 960 if (spa) 961 spa_altroot(spa, altroot, buflen); 962 else 963 altroot[0] = '\0'; 964 spa = NULL; 965 mutex_exit(&spa_namespace_lock); 966 } else { 967 spa_altroot(spa, altroot, buflen); 968 } 969 } 970 971 if (spa != NULL) 972 spa_close(spa, FTAG); 973 974 return (error); 975 } 976 977 /* 978 * Validate that the 'spares' array is well formed. We must have an array of 979 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 980 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 981 * as they are well-formed. 982 */ 983 static int 984 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 985 { 986 nvlist_t **spares; 987 uint_t i, nspares; 988 vdev_t *vd; 989 int error; 990 991 /* 992 * It's acceptable to have no spares specified. 993 */ 994 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 995 &spares, &nspares) != 0) 996 return (0); 997 998 if (nspares == 0) 999 return (EINVAL); 1000 1001 /* 1002 * Make sure the pool is formatted with a version that supports hot 1003 * spares. 
1004 */ 1005 if (spa_version(spa) < ZFS_VERSION_SPARES) 1006 return (ENOTSUP); 1007 1008 /* 1009 * Set the pending spare list so we correctly handle device in-use 1010 * checking. 1011 */ 1012 spa->spa_pending_spares = spares; 1013 spa->spa_pending_nspares = nspares; 1014 1015 for (i = 0; i < nspares; i++) { 1016 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1017 mode)) != 0) 1018 goto out; 1019 1020 if (!vd->vdev_ops->vdev_op_leaf) { 1021 vdev_free(vd); 1022 error = EINVAL; 1023 goto out; 1024 } 1025 1026 vd->vdev_top = vd; 1027 1028 if ((error = vdev_open(vd)) == 0 && 1029 (error = vdev_label_init(vd, crtxg, 1030 VDEV_LABEL_SPARE)) == 0) { 1031 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1032 vd->vdev_guid) == 0); 1033 } 1034 1035 vdev_free(vd); 1036 1037 if (error && mode != VDEV_ALLOC_SPARE) 1038 goto out; 1039 else 1040 error = 0; 1041 } 1042 1043 out: 1044 spa->spa_pending_spares = NULL; 1045 spa->spa_pending_nspares = 0; 1046 return (error); 1047 } 1048 1049 /* 1050 * Pool Creation 1051 */ 1052 int 1053 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1054 { 1055 spa_t *spa; 1056 vdev_t *rvd; 1057 dsl_pool_t *dp; 1058 dmu_tx_t *tx; 1059 int c, error = 0; 1060 uint64_t txg = TXG_INITIAL; 1061 nvlist_t **spares; 1062 uint_t nspares; 1063 1064 /* 1065 * If this pool already exists, return failure. 1066 */ 1067 mutex_enter(&spa_namespace_lock); 1068 if (spa_lookup(pool) != NULL) { 1069 mutex_exit(&spa_namespace_lock); 1070 return (EEXIST); 1071 } 1072 1073 /* 1074 * Allocate a new spa_t structure. 1075 */ 1076 spa = spa_add(pool, altroot); 1077 spa_activate(spa); 1078 1079 spa->spa_uberblock.ub_txg = txg - 1; 1080 spa->spa_uberblock.ub_version = ZFS_VERSION; 1081 spa->spa_ubsync = spa->spa_uberblock; 1082 1083 /* 1084 * Create the root vdev. 1085 */ 1086 spa_config_enter(spa, RW_WRITER, FTAG); 1087 1088 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1089 1090 ASSERT(error != 0 || rvd != NULL); 1091 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1092 1093 if (error == 0 && rvd->vdev_children == 0) 1094 error = EINVAL; 1095 1096 if (error == 0 && 1097 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1098 (error = spa_validate_spares(spa, nvroot, txg, 1099 VDEV_ALLOC_ADD)) == 0) { 1100 for (c = 0; c < rvd->vdev_children; c++) 1101 vdev_init(rvd->vdev_child[c], txg); 1102 vdev_config_dirty(rvd); 1103 } 1104 1105 spa_config_exit(spa, FTAG); 1106 1107 if (error != 0) { 1108 spa_unload(spa); 1109 spa_deactivate(spa); 1110 spa_remove(spa); 1111 mutex_exit(&spa_namespace_lock); 1112 return (error); 1113 } 1114 1115 /* 1116 * Get the list of spares, if specified. 1117 */ 1118 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1119 &spares, &nspares) == 0) { 1120 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1121 KM_SLEEP) == 0); 1122 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1123 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1124 spa_config_enter(spa, RW_WRITER, FTAG); 1125 spa_load_spares(spa); 1126 spa_config_exit(spa, FTAG); 1127 spa->spa_sync_spares = B_TRUE; 1128 } 1129 1130 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1131 spa->spa_meta_objset = dp->dp_meta_objset; 1132 1133 tx = dmu_tx_create_assigned(dp, txg); 1134 1135 /* 1136 * Create the pool config object. 
1137 */ 1138 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1139 DMU_OT_PACKED_NVLIST, 1 << 14, 1140 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1141 1142 if (zap_add(spa->spa_meta_objset, 1143 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1144 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1145 cmn_err(CE_PANIC, "failed to add pool config"); 1146 } 1147 1148 /* Newly created pools are always deflated. */ 1149 spa->spa_deflate = TRUE; 1150 if (zap_add(spa->spa_meta_objset, 1151 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1152 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1153 cmn_err(CE_PANIC, "failed to add deflate"); 1154 } 1155 1156 /* 1157 * Create the deferred-free bplist object. Turn off compression 1158 * because sync-to-convergence takes longer if the blocksize 1159 * keeps changing. 1160 */ 1161 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1162 1 << 14, tx); 1163 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1164 ZIO_COMPRESS_OFF, tx); 1165 1166 if (zap_add(spa->spa_meta_objset, 1167 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1168 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1169 cmn_err(CE_PANIC, "failed to add bplist"); 1170 } 1171 1172 /* 1173 * Create the pool's history object. 1174 */ 1175 spa_history_create_obj(spa, tx); 1176 1177 dmu_tx_commit(tx); 1178 1179 spa->spa_sync_on = B_TRUE; 1180 txg_sync_start(spa->spa_dsl_pool); 1181 1182 /* 1183 * We explicitly wait for the first transaction to complete so that our 1184 * bean counters are appropriately updated. 1185 */ 1186 txg_wait_synced(spa->spa_dsl_pool, txg); 1187 1188 spa_config_sync(); 1189 1190 mutex_exit(&spa_namespace_lock); 1191 1192 return (0); 1193 } 1194 1195 /* 1196 * Import the given pool into the system. We set up the necessary spa_t and 1197 * then call spa_load() to do the dirty work. 1198 */ 1199 int 1200 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1201 { 1202 spa_t *spa; 1203 int error; 1204 nvlist_t *nvroot; 1205 nvlist_t **spares; 1206 uint_t nspares; 1207 1208 if (!(spa_mode & FWRITE)) 1209 return (EROFS); 1210 1211 /* 1212 * If a pool with this name exists, return failure. 1213 */ 1214 mutex_enter(&spa_namespace_lock); 1215 if (spa_lookup(pool) != NULL) { 1216 mutex_exit(&spa_namespace_lock); 1217 return (EEXIST); 1218 } 1219 1220 /* 1221 * Create and initialize the spa structure. 1222 */ 1223 spa = spa_add(pool, altroot); 1224 spa_activate(spa); 1225 1226 /* 1227 * Pass off the heavy lifting to spa_load(). 1228 * Pass TRUE for mosconfig because the user-supplied config 1229 * is actually the one to trust when doing an import. 1230 */ 1231 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1232 1233 spa_config_enter(spa, RW_WRITER, FTAG); 1234 /* 1235 * Toss any existing sparelist, as it doesn't have any validity anymore, 1236 * and conflicts with spa_has_spare(). 
1237 */ 1238 if (spa->spa_sparelist) { 1239 nvlist_free(spa->spa_sparelist); 1240 spa->spa_sparelist = NULL; 1241 spa_load_spares(spa); 1242 } 1243 1244 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1245 &nvroot) == 0); 1246 if (error == 0) 1247 error = spa_validate_spares(spa, nvroot, -1ULL, 1248 VDEV_ALLOC_SPARE); 1249 spa_config_exit(spa, FTAG); 1250 1251 if (error != 0) { 1252 spa_unload(spa); 1253 spa_deactivate(spa); 1254 spa_remove(spa); 1255 mutex_exit(&spa_namespace_lock); 1256 return (error); 1257 } 1258 1259 /* 1260 * Override any spares as specified by the user, as these may have 1261 * correct device names/devids, etc. 1262 */ 1263 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1264 &spares, &nspares) == 0) { 1265 if (spa->spa_sparelist) 1266 VERIFY(nvlist_remove(spa->spa_sparelist, 1267 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1268 else 1269 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1270 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1271 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1272 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1273 spa_config_enter(spa, RW_WRITER, FTAG); 1274 spa_load_spares(spa); 1275 spa_config_exit(spa, FTAG); 1276 spa->spa_sync_spares = B_TRUE; 1277 } 1278 1279 /* 1280 * Update the config cache to include the newly-imported pool. 1281 */ 1282 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1283 1284 mutex_exit(&spa_namespace_lock); 1285 1286 /* 1287 * Resilver anything that's out of date. 1288 */ 1289 if (spa_mode & FWRITE) 1290 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1291 1292 return (0); 1293 } 1294 1295 /* 1296 * This (illegal) pool name is used when temporarily importing a spa_t in order 1297 * to get the vdev stats associated with the imported devices. 1298 */ 1299 #define TRYIMPORT_NAME "$import" 1300 1301 nvlist_t * 1302 spa_tryimport(nvlist_t *tryconfig) 1303 { 1304 nvlist_t *config = NULL; 1305 char *poolname; 1306 spa_t *spa; 1307 uint64_t state; 1308 1309 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1310 return (NULL); 1311 1312 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1313 return (NULL); 1314 1315 /* 1316 * Create and initialize the spa structure. 1317 */ 1318 mutex_enter(&spa_namespace_lock); 1319 spa = spa_add(TRYIMPORT_NAME, NULL); 1320 spa_activate(spa); 1321 1322 /* 1323 * Pass off the heavy lifting to spa_load(). 1324 * Pass TRUE for mosconfig because the user-supplied config 1325 * is actually the one to trust when doing an import. 1326 */ 1327 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1328 1329 /* 1330 * If 'tryconfig' was at least parsable, return the current config. 1331 */ 1332 if (spa->spa_root_vdev != NULL) { 1333 spa_config_enter(spa, RW_READER, FTAG); 1334 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1335 spa_config_exit(spa, FTAG); 1336 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1337 poolname) == 0); 1338 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1339 state) == 0); 1340 1341 /* 1342 * Add the list of hot spares. 1343 */ 1344 spa_add_spares(spa, config); 1345 } 1346 1347 spa_unload(spa); 1348 spa_deactivate(spa); 1349 spa_remove(spa); 1350 mutex_exit(&spa_namespace_lock); 1351 1352 return (config); 1353 } 1354 1355 /* 1356 * Pool export/destroy 1357 * 1358 * The act of destroying or exporting a pool is very simple. We make sure there 1359 * is no more pending I/O and any references to the pool are gone. 
Then, we 1360 * update the pool state and sync all the labels to disk, removing the 1361 * configuration from the cache afterwards. 1362 */ 1363 static int 1364 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1365 { 1366 spa_t *spa; 1367 1368 if (oldconfig) 1369 *oldconfig = NULL; 1370 1371 if (!(spa_mode & FWRITE)) 1372 return (EROFS); 1373 1374 mutex_enter(&spa_namespace_lock); 1375 if ((spa = spa_lookup(pool)) == NULL) { 1376 mutex_exit(&spa_namespace_lock); 1377 return (ENOENT); 1378 } 1379 1380 /* 1381 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1382 * reacquire the namespace lock, and see if we can export. 1383 */ 1384 spa_open_ref(spa, FTAG); 1385 mutex_exit(&spa_namespace_lock); 1386 spa_async_suspend(spa); 1387 mutex_enter(&spa_namespace_lock); 1388 spa_close(spa, FTAG); 1389 1390 /* 1391 * The pool will be in core if it's openable, 1392 * in which case we can modify its state. 1393 */ 1394 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1395 /* 1396 * Objsets may be open only because they're dirty, so we 1397 * have to force it to sync before checking spa_refcnt. 1398 */ 1399 spa_scrub_suspend(spa); 1400 txg_wait_synced(spa->spa_dsl_pool, 0); 1401 1402 /* 1403 * A pool cannot be exported or destroyed if there are active 1404 * references. If we are resetting a pool, allow references by 1405 * fault injection handlers. 1406 */ 1407 if (!spa_refcount_zero(spa) || 1408 (spa->spa_inject_ref != 0 && 1409 new_state != POOL_STATE_UNINITIALIZED)) { 1410 spa_scrub_resume(spa); 1411 spa_async_resume(spa); 1412 mutex_exit(&spa_namespace_lock); 1413 return (EBUSY); 1414 } 1415 1416 spa_scrub_resume(spa); 1417 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1418 1419 /* 1420 * We want this to be reflected on every label, 1421 * so mark them all dirty. spa_unload() will do the 1422 * final sync that pushes these changes out. 1423 */ 1424 if (new_state != POOL_STATE_UNINITIALIZED) { 1425 spa_config_enter(spa, RW_WRITER, FTAG); 1426 spa->spa_state = new_state; 1427 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1428 vdev_config_dirty(spa->spa_root_vdev); 1429 spa_config_exit(spa, FTAG); 1430 } 1431 } 1432 1433 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1434 spa_unload(spa); 1435 spa_deactivate(spa); 1436 } 1437 1438 if (oldconfig && spa->spa_config) 1439 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1440 1441 if (new_state != POOL_STATE_UNINITIALIZED) { 1442 spa_remove(spa); 1443 spa_config_sync(); 1444 } 1445 mutex_exit(&spa_namespace_lock); 1446 1447 return (0); 1448 } 1449 1450 /* 1451 * Destroy a storage pool. 1452 */ 1453 int 1454 spa_destroy(char *pool) 1455 { 1456 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1457 } 1458 1459 /* 1460 * Export a storage pool. 1461 */ 1462 int 1463 spa_export(char *pool, nvlist_t **oldconfig) 1464 { 1465 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1466 } 1467 1468 /* 1469 * Similar to spa_export(), this unloads the spa_t without actually removing it 1470 * from the namespace in any way. 1471 */ 1472 int 1473 spa_reset(char *pool) 1474 { 1475 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1476 } 1477 1478 1479 /* 1480 * ========================================================================== 1481 * Device manipulation 1482 * ========================================================================== 1483 */ 1484 1485 /* 1486 * Add capacity to a storage pool. 
1487 */ 1488 int 1489 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1490 { 1491 uint64_t txg; 1492 int c, error; 1493 vdev_t *rvd = spa->spa_root_vdev; 1494 vdev_t *vd, *tvd; 1495 nvlist_t **spares; 1496 uint_t i, nspares; 1497 1498 txg = spa_vdev_enter(spa); 1499 1500 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1501 VDEV_ALLOC_ADD)) != 0) 1502 return (spa_vdev_exit(spa, NULL, txg, error)); 1503 1504 spa->spa_pending_vdev = vd; 1505 1506 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1507 &spares, &nspares) != 0) 1508 nspares = 0; 1509 1510 if (vd->vdev_children == 0 && nspares == 0) { 1511 spa->spa_pending_vdev = NULL; 1512 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1513 } 1514 1515 if (vd->vdev_children != 0) { 1516 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1517 spa->spa_pending_vdev = NULL; 1518 return (spa_vdev_exit(spa, vd, txg, error)); 1519 } 1520 } 1521 1522 /* 1523 * We must validate the spares after checking the children. Otherwise, 1524 * vdev_inuse() will blindly overwrite the spare. 1525 */ 1526 if ((error = spa_validate_spares(spa, nvroot, txg, 1527 VDEV_ALLOC_ADD)) != 0) { 1528 spa->spa_pending_vdev = NULL; 1529 return (spa_vdev_exit(spa, vd, txg, error)); 1530 } 1531 1532 spa->spa_pending_vdev = NULL; 1533 1534 /* 1535 * Transfer each new top-level vdev from vd to rvd. 1536 */ 1537 for (c = 0; c < vd->vdev_children; c++) { 1538 tvd = vd->vdev_child[c]; 1539 vdev_remove_child(vd, tvd); 1540 tvd->vdev_id = rvd->vdev_children; 1541 vdev_add_child(rvd, tvd); 1542 vdev_config_dirty(tvd); 1543 } 1544 1545 if (nspares != 0) { 1546 if (spa->spa_sparelist != NULL) { 1547 nvlist_t **oldspares; 1548 uint_t oldnspares; 1549 nvlist_t **newspares; 1550 1551 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1552 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1553 1554 newspares = kmem_alloc(sizeof (void *) * 1555 (nspares + oldnspares), KM_SLEEP); 1556 for (i = 0; i < oldnspares; i++) 1557 VERIFY(nvlist_dup(oldspares[i], 1558 &newspares[i], KM_SLEEP) == 0); 1559 for (i = 0; i < nspares; i++) 1560 VERIFY(nvlist_dup(spares[i], 1561 &newspares[i + oldnspares], 1562 KM_SLEEP) == 0); 1563 1564 VERIFY(nvlist_remove(spa->spa_sparelist, 1565 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1566 1567 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1568 ZPOOL_CONFIG_SPARES, newspares, 1569 nspares + oldnspares) == 0); 1570 for (i = 0; i < oldnspares + nspares; i++) 1571 nvlist_free(newspares[i]); 1572 kmem_free(newspares, (oldnspares + nspares) * 1573 sizeof (void *)); 1574 } else { 1575 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1576 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1577 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1578 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1579 } 1580 1581 spa_load_spares(spa); 1582 spa->spa_sync_spares = B_TRUE; 1583 } 1584 1585 /* 1586 * We have to be careful when adding new vdevs to an existing pool. 1587 * If other threads start allocating from these vdevs before we 1588 * sync the config cache, and we lose power, then upon reboot we may 1589 * fail to open the pool because there are DVAs that the config cache 1590 * can't translate. Therefore, we first add the vdevs without 1591 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1592 * and then let spa_config_update() initialize the new metaslabs. 
1593 * 1594 * spa_load() checks for added-but-not-initialized vdevs, so that 1595 * if we lose power at any point in this sequence, the remaining 1596 * steps will be completed the next time we load the pool. 1597 */ 1598 (void) spa_vdev_exit(spa, vd, txg, 0); 1599 1600 mutex_enter(&spa_namespace_lock); 1601 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1602 mutex_exit(&spa_namespace_lock); 1603 1604 return (0); 1605 } 1606 1607 /* 1608 * Attach a device to a mirror. The arguments are the path to any device 1609 * in the mirror, and the nvroot for the new device. If the path specifies 1610 * a device that is not mirrored, we automatically insert the mirror vdev. 1611 * 1612 * If 'replacing' is specified, the new device is intended to replace the 1613 * existing device; in this case the two devices are made into their own 1614 * mirror using the 'replacing' vdev, which is functionally idendical to 1615 * the mirror vdev (it actually reuses all the same ops) but has a few 1616 * extra rules: you can't attach to it after it's been created, and upon 1617 * completion of resilvering, the first disk (the one being replaced) 1618 * is automatically detached. 1619 */ 1620 int 1621 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1622 { 1623 uint64_t txg, open_txg; 1624 int error; 1625 vdev_t *rvd = spa->spa_root_vdev; 1626 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1627 vdev_ops_t *pvops; 1628 1629 txg = spa_vdev_enter(spa); 1630 1631 oldvd = vdev_lookup_by_guid(rvd, guid); 1632 1633 if (oldvd == NULL) 1634 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1635 1636 if (!oldvd->vdev_ops->vdev_op_leaf) 1637 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1638 1639 pvd = oldvd->vdev_parent; 1640 1641 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1642 VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1643 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1644 1645 newvd = newrootvd->vdev_child[0]; 1646 1647 if (!newvd->vdev_ops->vdev_op_leaf) 1648 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1649 1650 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1651 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1652 1653 if (!replacing) { 1654 /* 1655 * For attach, the only allowable parent is a mirror or the root 1656 * vdev. 1657 */ 1658 if (pvd->vdev_ops != &vdev_mirror_ops && 1659 pvd->vdev_ops != &vdev_root_ops) 1660 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1661 1662 pvops = &vdev_mirror_ops; 1663 } else { 1664 /* 1665 * Active hot spares can only be replaced by inactive hot 1666 * spares. 1667 */ 1668 if (pvd->vdev_ops == &vdev_spare_ops && 1669 pvd->vdev_child[1] == oldvd && 1670 !spa_has_spare(spa, newvd->vdev_guid)) 1671 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1672 1673 /* 1674 * If the source is a hot spare, and the parent isn't already a 1675 * spare, then we want to create a new hot spare. Otherwise, we 1676 * want to create a replacing vdev. The user is not allowed to 1677 * attach to a spared vdev child unless the 'isspare' state is 1678 * the same (spare replaces spare, non-spare replaces 1679 * non-spare). 
1680 */ 1681 if (pvd->vdev_ops == &vdev_replacing_ops) 1682 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1683 else if (pvd->vdev_ops == &vdev_spare_ops && 1684 newvd->vdev_isspare != oldvd->vdev_isspare) 1685 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1686 else if (pvd->vdev_ops != &vdev_spare_ops && 1687 newvd->vdev_isspare) 1688 pvops = &vdev_spare_ops; 1689 else 1690 pvops = &vdev_replacing_ops; 1691 } 1692 1693 /* 1694 * Compare the new device size with the replaceable/attachable 1695 * device size. 1696 */ 1697 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1698 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1699 1700 /* 1701 * The new device cannot have a higher alignment requirement 1702 * than the top-level vdev. 1703 */ 1704 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1705 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1706 1707 /* 1708 * If this is an in-place replacement, update oldvd's path and devid 1709 * to make it distinguishable from newvd, and unopenable from now on. 1710 */ 1711 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1712 spa_strfree(oldvd->vdev_path); 1713 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1714 KM_SLEEP); 1715 (void) sprintf(oldvd->vdev_path, "%s/%s", 1716 newvd->vdev_path, "old"); 1717 if (oldvd->vdev_devid != NULL) { 1718 spa_strfree(oldvd->vdev_devid); 1719 oldvd->vdev_devid = NULL; 1720 } 1721 } 1722 1723 /* 1724 * If the parent is not a mirror, or if we're replacing, insert the new 1725 * mirror/replacing/spare vdev above oldvd. 1726 */ 1727 if (pvd->vdev_ops != pvops) 1728 pvd = vdev_add_parent(oldvd, pvops); 1729 1730 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1731 ASSERT(pvd->vdev_ops == pvops); 1732 ASSERT(oldvd->vdev_parent == pvd); 1733 1734 /* 1735 * Extract the new device from its root and add it to pvd. 1736 */ 1737 vdev_remove_child(newrootvd, newvd); 1738 newvd->vdev_id = pvd->vdev_children; 1739 vdev_add_child(pvd, newvd); 1740 1741 /* 1742 * If newvd is smaller than oldvd, but larger than its rsize, 1743 * the addition of newvd may have decreased our parent's asize. 1744 */ 1745 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1746 1747 tvd = newvd->vdev_top; 1748 ASSERT(pvd->vdev_top == tvd); 1749 ASSERT(tvd->vdev_parent == rvd); 1750 1751 vdev_config_dirty(tvd); 1752 1753 /* 1754 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1755 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1756 */ 1757 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1758 1759 mutex_enter(&newvd->vdev_dtl_lock); 1760 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1761 open_txg - TXG_INITIAL + 1); 1762 mutex_exit(&newvd->vdev_dtl_lock); 1763 1764 if (newvd->vdev_isspare) 1765 spa_spare_activate(newvd); 1766 1767 /* 1768 * Mark newvd's DTL dirty in this txg. 1769 */ 1770 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1771 1772 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1773 1774 /* 1775 * Kick off a resilver to update newvd. 1776 */ 1777 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1778 1779 return (0); 1780 } 1781 1782 /* 1783 * Detach a device from a mirror or replacing vdev. 1784 * If 'replace_done' is specified, only detach if the parent 1785 * is a replacing vdev. 
1786 */ 1787 int 1788 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1789 { 1790 uint64_t txg; 1791 int c, t, error; 1792 vdev_t *rvd = spa->spa_root_vdev; 1793 vdev_t *vd, *pvd, *cvd, *tvd; 1794 boolean_t unspare = B_FALSE; 1795 uint64_t unspare_guid; 1796 1797 txg = spa_vdev_enter(spa); 1798 1799 vd = vdev_lookup_by_guid(rvd, guid); 1800 1801 if (vd == NULL) 1802 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1803 1804 if (!vd->vdev_ops->vdev_op_leaf) 1805 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1806 1807 pvd = vd->vdev_parent; 1808 1809 /* 1810 * If replace_done is specified, only remove this device if it's 1811 * the first child of a replacing vdev. For the 'spare' vdev, either 1812 * disk can be removed. 1813 */ 1814 if (replace_done) { 1815 if (pvd->vdev_ops == &vdev_replacing_ops) { 1816 if (vd->vdev_id != 0) 1817 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1818 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1819 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1820 } 1821 } 1822 1823 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1824 spa_version(spa) >= ZFS_VERSION_SPARES); 1825 1826 /* 1827 * Only mirror, replacing, and spare vdevs support detach. 1828 */ 1829 if (pvd->vdev_ops != &vdev_replacing_ops && 1830 pvd->vdev_ops != &vdev_mirror_ops && 1831 pvd->vdev_ops != &vdev_spare_ops) 1832 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1833 1834 /* 1835 * If there's only one replica, you can't detach it. 1836 */ 1837 if (pvd->vdev_children <= 1) 1838 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1839 1840 /* 1841 * If all siblings have non-empty DTLs, this device may have the only 1842 * valid copy of the data, which means we cannot safely detach it. 1843 * 1844 * XXX -- as in the vdev_offline() case, we really want a more 1845 * precise DTL check. 1846 */ 1847 for (c = 0; c < pvd->vdev_children; c++) { 1848 uint64_t dirty; 1849 1850 cvd = pvd->vdev_child[c]; 1851 if (cvd == vd) 1852 continue; 1853 if (vdev_is_dead(cvd)) 1854 continue; 1855 mutex_enter(&cvd->vdev_dtl_lock); 1856 dirty = cvd->vdev_dtl_map.sm_space | 1857 cvd->vdev_dtl_scrub.sm_space; 1858 mutex_exit(&cvd->vdev_dtl_lock); 1859 if (!dirty) 1860 break; 1861 } 1862 1863 /* 1864 * If we are a replacing or spare vdev, then we can always detach the 1865 * latter child, as that is how one cancels the operation. 1866 */ 1867 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1868 c == pvd->vdev_children) 1869 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1870 1871 /* 1872 * If we are detaching the original disk from a spare, then it implies 1873 * that the spare should become a real disk, and be removed from the 1874 * active spare list for the pool. 1875 */ 1876 if (pvd->vdev_ops == &vdev_spare_ops && 1877 vd->vdev_id == 0) 1878 unspare = B_TRUE; 1879 1880 /* 1881 * Erase the disk labels so the disk can be used for other things. 1882 * This must be done after all other error cases are handled, 1883 * but before we disembowel vd (so we can still do I/O to it). 1884 * But if we can't do it, don't treat the error as fatal -- 1885 * it may be that the unwritability of the disk is the reason 1886 * it's being detached! 1887 */ 1888 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1889 1890 /* 1891 * Remove vd from its parent and compact the parent's children. 1892 */ 1893 vdev_remove_child(pvd, vd); 1894 vdev_compact_children(pvd); 1895 1896 /* 1897 * Remember one of the remaining children so we can get tvd below. 
1898 */ 1899 cvd = pvd->vdev_child[0]; 1900 1901 /* 1902 * If we need to remove the remaining child from the list of hot spares, 1903 * do it now, marking the vdev as no longer a spare in the process. We 1904 * must do this before vdev_remove_parent(), because that can change the 1905 * GUID if it creates a new toplevel GUID. 1906 */ 1907 if (unspare) { 1908 ASSERT(cvd->vdev_isspare); 1909 spa_spare_remove(cvd); 1910 unspare_guid = cvd->vdev_guid; 1911 } 1912 1913 /* 1914 * If the parent mirror/replacing vdev only has one child, 1915 * the parent is no longer needed. Remove it from the tree. 1916 */ 1917 if (pvd->vdev_children == 1) 1918 vdev_remove_parent(cvd); 1919 1920 /* 1921 * We don't set tvd until now because the parent we just removed 1922 * may have been the previous top-level vdev. 1923 */ 1924 tvd = cvd->vdev_top; 1925 ASSERT(tvd->vdev_parent == rvd); 1926 1927 /* 1928 * Reevaluate the parent vdev state. 1929 */ 1930 vdev_propagate_state(cvd->vdev_parent); 1931 1932 /* 1933 * If the device we just detached was smaller than the others, it may be 1934 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1935 * can't fail because the existing metaslabs are already in core, so 1936 * there's nothing to read from disk. 1937 */ 1938 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1939 1940 vdev_config_dirty(tvd); 1941 1942 /* 1943 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 1944 * vd->vdev_detached is set and free vd's DTL object in syncing context. 1945 * But first make sure we're not on any *other* txg's DTL list, to 1946 * prevent vd from being accessed after it's freed. 1947 */ 1948 for (t = 0; t < TXG_SIZE; t++) 1949 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1950 vd->vdev_detached = B_TRUE; 1951 vdev_dirty(tvd, VDD_DTL, vd, txg); 1952 1953 error = spa_vdev_exit(spa, vd, txg, 0); 1954 1955 /* 1956 * If this was the removal of the original device in a hot spare vdev, 1957 * then we want to go through and remove the device from the hot spare 1958 * list of every other pool. 1959 */ 1960 if (unspare) { 1961 spa = NULL; 1962 mutex_enter(&spa_namespace_lock); 1963 while ((spa = spa_next(spa)) != NULL) { 1964 if (spa->spa_state != POOL_STATE_ACTIVE) 1965 continue; 1966 1967 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1968 } 1969 mutex_exit(&spa_namespace_lock); 1970 } 1971 1972 return (error); 1973 } 1974 1975 /* 1976 * Remove a device from the pool. Currently, this supports removing only hot 1977 * spares. 1978 */ 1979 int 1980 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1981 { 1982 vdev_t *vd; 1983 nvlist_t **spares, *nv, **newspares; 1984 uint_t i, j, nspares; 1985 int ret = 0; 1986 1987 spa_config_enter(spa, RW_WRITER, FTAG); 1988 1989 vd = spa_lookup_by_guid(spa, guid); 1990 1991 nv = NULL; 1992 if (spa->spa_spares != NULL && 1993 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1994 &spares, &nspares) == 0) { 1995 for (i = 0; i < nspares; i++) { 1996 uint64_t theguid; 1997 1998 VERIFY(nvlist_lookup_uint64(spares[i], 1999 ZPOOL_CONFIG_GUID, &theguid) == 0); 2000 if (theguid == guid) { 2001 nv = spares[i]; 2002 break; 2003 } 2004 } 2005 } 2006 2007 /* 2008 * We only support removing a hot spare, and only if it's not currently 2009 * in use in this pool. 
2010 */ 2011 if (nv == NULL && vd == NULL) { 2012 ret = ENOENT; 2013 goto out; 2014 } 2015 2016 if (nv == NULL && vd != NULL) { 2017 ret = ENOTSUP; 2018 goto out; 2019 } 2020 2021 if (!unspare && nv != NULL && vd != NULL) { 2022 ret = EBUSY; 2023 goto out; 2024 } 2025 2026 if (nspares == 1) { 2027 newspares = NULL; 2028 } else { 2029 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2030 KM_SLEEP); 2031 for (i = 0, j = 0; i < nspares; i++) { 2032 if (spares[i] != nv) 2033 VERIFY(nvlist_dup(spares[i], 2034 &newspares[j++], KM_SLEEP) == 0); 2035 } 2036 } 2037 2038 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2039 DATA_TYPE_NVLIST_ARRAY) == 0); 2040 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2041 newspares, nspares - 1) == 0); 2042 for (i = 0; i < nspares - 1; i++) 2043 nvlist_free(newspares[i]); 2044 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2045 spa_load_spares(spa); 2046 spa->spa_sync_spares = B_TRUE; 2047 2048 out: 2049 spa_config_exit(spa, FTAG); 2050 2051 return (ret); 2052 } 2053 2054 /* 2055 * Find any device that's done replacing, so we can detach it. 2056 */ 2057 static vdev_t * 2058 spa_vdev_replace_done_hunt(vdev_t *vd) 2059 { 2060 vdev_t *newvd, *oldvd; 2061 int c; 2062 2063 for (c = 0; c < vd->vdev_children; c++) { 2064 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2065 if (oldvd != NULL) 2066 return (oldvd); 2067 } 2068 2069 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2070 oldvd = vd->vdev_child[0]; 2071 newvd = vd->vdev_child[1]; 2072 2073 mutex_enter(&newvd->vdev_dtl_lock); 2074 if (newvd->vdev_dtl_map.sm_space == 0 && 2075 newvd->vdev_dtl_scrub.sm_space == 0) { 2076 mutex_exit(&newvd->vdev_dtl_lock); 2077 return (oldvd); 2078 } 2079 mutex_exit(&newvd->vdev_dtl_lock); 2080 } 2081 2082 return (NULL); 2083 } 2084 2085 static void 2086 spa_vdev_replace_done(spa_t *spa) 2087 { 2088 vdev_t *vd; 2089 vdev_t *pvd; 2090 uint64_t guid; 2091 uint64_t pguid = 0; 2092 2093 spa_config_enter(spa, RW_READER, FTAG); 2094 2095 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2096 guid = vd->vdev_guid; 2097 /* 2098 * If we have just finished replacing a hot spared device, then 2099 * we need to detach the parent's first child (the original hot 2100 * spare) as well. 2101 */ 2102 pvd = vd->vdev_parent; 2103 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2104 pvd->vdev_id == 0) { 2105 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2106 ASSERT(pvd->vdev_parent->vdev_children == 2); 2107 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2108 } 2109 spa_config_exit(spa, FTAG); 2110 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2111 return; 2112 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2113 return; 2114 spa_config_enter(spa, RW_READER, FTAG); 2115 } 2116 2117 spa_config_exit(spa, FTAG); 2118 } 2119 2120 /* 2121 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2122 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2123 */ 2124 int 2125 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2126 { 2127 vdev_t *rvd, *vd; 2128 uint64_t txg; 2129 2130 rvd = spa->spa_root_vdev; 2131 2132 txg = spa_vdev_enter(spa); 2133 2134 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2135 /* 2136 * Determine if this is a reference to a hot spare. In that 2137 * case, update the path as stored in the spare list. 
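 *
 * The spare list is an nvlist array of vdev configs, roughly of the form
 * (sketch; values hypothetical):
 *
 *	spares[0] = { guid: 0x1234..., path: "/dev/dsk/c1t2d0s0", ... }
 *	spares[1] = { guid: 0x5678..., path: "/dev/dsk/c1t3d0s0", ... }
 *
 * so updating a spare's path means finding the entry whose
 * ZPOOL_CONFIG_GUID matches and rewriting its ZPOOL_CONFIG_PATH.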
2138 */ 2139 nvlist_t **spares; 2140 uint_t i, nspares; 2141 if (spa->spa_sparelist != NULL) { 2142 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2143 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2144 for (i = 0; i < nspares; i++) { 2145 uint64_t theguid; 2146 VERIFY(nvlist_lookup_uint64(spares[i], 2147 ZPOOL_CONFIG_GUID, &theguid) == 0); 2148 if (theguid == guid) 2149 break; 2150 } 2151 2152 if (i == nspares) 2153 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2154 2155 VERIFY(nvlist_add_string(spares[i], 2156 ZPOOL_CONFIG_PATH, newpath) == 0); 2157 spa_load_spares(spa); 2158 spa->spa_sync_spares = B_TRUE; 2159 return (spa_vdev_exit(spa, NULL, txg, 0)); 2160 } else { 2161 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2162 } 2163 } 2164 2165 if (!vd->vdev_ops->vdev_op_leaf) 2166 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2167 2168 spa_strfree(vd->vdev_path); 2169 vd->vdev_path = spa_strdup(newpath); 2170 2171 vdev_config_dirty(vd->vdev_top); 2172 2173 return (spa_vdev_exit(spa, NULL, txg, 0)); 2174 } 2175 2176 /* 2177 * ========================================================================== 2178 * SPA Scrubbing 2179 * ========================================================================== 2180 */ 2181 2182 void 2183 spa_scrub_throttle(spa_t *spa, int direction) 2184 { 2185 mutex_enter(&spa->spa_scrub_lock); 2186 spa->spa_scrub_throttled += direction; 2187 ASSERT(spa->spa_scrub_throttled >= 0); 2188 if (spa->spa_scrub_throttled == 0) 2189 cv_broadcast(&spa->spa_scrub_io_cv); 2190 mutex_exit(&spa->spa_scrub_lock); 2191 } 2192 2193 static void 2194 spa_scrub_io_done(zio_t *zio) 2195 { 2196 spa_t *spa = zio->io_spa; 2197 2198 zio_data_buf_free(zio->io_data, zio->io_size); 2199 2200 mutex_enter(&spa->spa_scrub_lock); 2201 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2202 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2203 spa->spa_scrub_errors++; 2204 mutex_enter(&vd->vdev_stat_lock); 2205 vd->vdev_stat.vs_scrub_errors++; 2206 mutex_exit(&vd->vdev_stat_lock); 2207 } 2208 if (--spa->spa_scrub_inflight == 0) { 2209 cv_broadcast(&spa->spa_scrub_io_cv); 2210 ASSERT(spa->spa_scrub_throttled == 0); 2211 } 2212 mutex_exit(&spa->spa_scrub_lock); 2213 } 2214 2215 static void 2216 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2217 zbookmark_t *zb) 2218 { 2219 size_t size = BP_GET_LSIZE(bp); 2220 void *data = zio_data_buf_alloc(size); 2221 2222 mutex_enter(&spa->spa_scrub_lock); 2223 spa->spa_scrub_inflight++; 2224 mutex_exit(&spa->spa_scrub_lock); 2225 2226 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2227 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2228 2229 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2230 2231 zio_nowait(zio_read(NULL, spa, bp, data, size, 2232 spa_scrub_io_done, NULL, priority, flags, zb)); 2233 } 2234 2235 /* ARGSUSED */ 2236 static int 2237 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2238 { 2239 blkptr_t *bp = &bc->bc_blkptr; 2240 vdev_t *vd = spa->spa_root_vdev; 2241 dva_t *dva = bp->blk_dva; 2242 int needs_resilver = B_FALSE; 2243 int d; 2244 2245 if (bc->bc_errno) { 2246 /* 2247 * We can't scrub this block, but we can continue to scrub 2248 * the rest of the pool. Note the error and move along. 
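 *
 * Two counters get bumped below: the pool-wide spa_scrub_errors tally,
 * which the scrub thread reports when it finishes, and the per-vdev
 * vs_scrub_errors count that shows up in the vdev stats. Since we don't
 * know which leaf device failed, the latter is charged to the root vdev:
 *
 *	spa->spa_scrub_errors++;		pool-wide tally
 *	vd->vdev_stat.vs_scrub_errors++;	vd == spa->spa_root_vdev here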
2249 */ 2250 mutex_enter(&spa->spa_scrub_lock); 2251 spa->spa_scrub_errors++; 2252 mutex_exit(&spa->spa_scrub_lock); 2253 2254 mutex_enter(&vd->vdev_stat_lock); 2255 vd->vdev_stat.vs_scrub_errors++; 2256 mutex_exit(&vd->vdev_stat_lock); 2257 2258 return (ERESTART); 2259 } 2260 2261 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2262 2263 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2264 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2265 2266 ASSERT(vd != NULL); 2267 2268 /* 2269 * Keep track of how much data we've examined so that 2270 * zpool(1M) status can make useful progress reports. 2271 */ 2272 mutex_enter(&vd->vdev_stat_lock); 2273 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2274 mutex_exit(&vd->vdev_stat_lock); 2275 2276 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2277 if (DVA_GET_GANG(&dva[d])) { 2278 /* 2279 * Gang members may be spread across multiple 2280 * vdevs, so the best we can do is look at the 2281 * pool-wide DTL. 2282 * XXX -- it would be better to change our 2283 * allocation policy to ensure that this can't 2284 * happen. 2285 */ 2286 vd = spa->spa_root_vdev; 2287 } 2288 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2289 bp->blk_birth, 1)) 2290 needs_resilver = B_TRUE; 2291 } 2292 } 2293 2294 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2295 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2296 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2297 else if (needs_resilver) 2298 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2299 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2300 2301 return (0); 2302 } 2303 2304 static void 2305 spa_scrub_thread(spa_t *spa) 2306 { 2307 callb_cpr_t cprinfo; 2308 traverse_handle_t *th = spa->spa_scrub_th; 2309 vdev_t *rvd = spa->spa_root_vdev; 2310 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2311 int error = 0; 2312 boolean_t complete; 2313 2314 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2315 2316 /* 2317 * If we're restarting due to a snapshot create/delete, 2318 * wait for that to complete. 2319 */ 2320 txg_wait_synced(spa_get_dsl(spa), 0); 2321 2322 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2323 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2324 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2325 2326 spa_config_enter(spa, RW_WRITER, FTAG); 2327 vdev_reopen(rvd); /* purge all vdev caches */ 2328 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2329 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2330 spa_config_exit(spa, FTAG); 2331 2332 mutex_enter(&spa->spa_scrub_lock); 2333 spa->spa_scrub_errors = 0; 2334 spa->spa_scrub_active = 1; 2335 ASSERT(spa->spa_scrub_inflight == 0); 2336 ASSERT(spa->spa_scrub_throttled == 0); 2337 2338 while (!spa->spa_scrub_stop) { 2339 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2340 while (spa->spa_scrub_suspended) { 2341 spa->spa_scrub_active = 0; 2342 cv_broadcast(&spa->spa_scrub_cv); 2343 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2344 spa->spa_scrub_active = 1; 2345 } 2346 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2347 2348 if (spa->spa_scrub_restart_txg != 0) 2349 break; 2350 2351 mutex_exit(&spa->spa_scrub_lock); 2352 error = traverse_more(th); 2353 mutex_enter(&spa->spa_scrub_lock); 2354 if (error != EAGAIN) 2355 break; 2356 2357 while (spa->spa_scrub_throttled > 0) 2358 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2359 } 2360 2361 while (spa->spa_scrub_inflight) 2362 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2363 2364 spa->spa_scrub_active = 0; 2365 cv_broadcast(&spa->spa_scrub_cv); 2366 2367 mutex_exit(&spa->spa_scrub_lock); 2368 2369 spa_config_enter(spa, RW_WRITER, FTAG); 2370 2371 mutex_enter(&spa->spa_scrub_lock); 2372 2373 /* 2374 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2375 * AND the spa config lock to synchronize with any config changes 2376 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2377 */ 2378 if (spa->spa_scrub_restart_txg != 0) 2379 error = ERESTART; 2380 2381 if (spa->spa_scrub_stop) 2382 error = EINTR; 2383 2384 /* 2385 * Even if there were uncorrectable errors, we consider the scrub 2386 * completed. The downside is that if there is a transient error during 2387 * a resilver, we won't resilver the data properly to the target. But 2388 * if the damage is permanent (more likely) we will resilver forever, 2389 * which isn't really acceptable. Since there is enough information for 2390 * the user to know what has failed and why, this seems like a more 2391 * tractable approach. 2392 */ 2393 complete = (error == 0); 2394 2395 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2396 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2397 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2398 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2399 2400 mutex_exit(&spa->spa_scrub_lock); 2401 2402 /* 2403 * If the scrub/resilver completed, update all DTLs to reflect this. 2404 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2405 */ 2406 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2407 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2408 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2409 spa_errlog_rotate(spa); 2410 2411 spa_config_exit(spa, FTAG); 2412 2413 mutex_enter(&spa->spa_scrub_lock); 2414 2415 /* 2416 * We may have finished replacing a device. 2417 * Let the async thread assess this and handle the detach. 2418 */ 2419 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2420 2421 /* 2422 * If we were told to restart, our final act is to start a new scrub. 2423 */ 2424 if (error == ERESTART) 2425 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2426 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2427 2428 spa->spa_scrub_type = POOL_SCRUB_NONE; 2429 spa->spa_scrub_active = 0; 2430 spa->spa_scrub_thread = NULL; 2431 cv_broadcast(&spa->spa_scrub_cv); 2432 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2433 thread_exit(); 2434 } 2435 2436 void 2437 spa_scrub_suspend(spa_t *spa) 2438 { 2439 mutex_enter(&spa->spa_scrub_lock); 2440 spa->spa_scrub_suspended++; 2441 while (spa->spa_scrub_active) { 2442 cv_broadcast(&spa->spa_scrub_cv); 2443 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2444 } 2445 while (spa->spa_scrub_inflight) 2446 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2447 mutex_exit(&spa->spa_scrub_lock); 2448 } 2449 2450 void 2451 spa_scrub_resume(spa_t *spa) 2452 { 2453 mutex_enter(&spa->spa_scrub_lock); 2454 ASSERT(spa->spa_scrub_suspended != 0); 2455 if (--spa->spa_scrub_suspended == 0) 2456 cv_broadcast(&spa->spa_scrub_cv); 2457 mutex_exit(&spa->spa_scrub_lock); 2458 } 2459 2460 void 2461 spa_scrub_restart(spa_t *spa, uint64_t txg) 2462 { 2463 /* 2464 * Something happened (e.g. snapshot create/delete) that means 2465 * we must restart any in-progress scrubs. The itinerary will 2466 * fix this properly. 2467 */ 2468 mutex_enter(&spa->spa_scrub_lock); 2469 spa->spa_scrub_restart_txg = txg; 2470 mutex_exit(&spa->spa_scrub_lock); 2471 } 2472 2473 int 2474 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2475 { 2476 space_seg_t *ss; 2477 uint64_t mintxg, maxtxg; 2478 vdev_t *rvd = spa->spa_root_vdev; 2479 2480 if ((uint_t)type >= POOL_SCRUB_TYPES) 2481 return (ENOTSUP); 2482 2483 mutex_enter(&spa->spa_scrub_lock); 2484 2485 /* 2486 * If there's a scrub or resilver already in progress, stop it. 2487 */ 2488 while (spa->spa_scrub_thread != NULL) { 2489 /* 2490 * Don't stop a resilver unless forced. 2491 */ 2492 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2493 mutex_exit(&spa->spa_scrub_lock); 2494 return (EBUSY); 2495 } 2496 spa->spa_scrub_stop = 1; 2497 cv_broadcast(&spa->spa_scrub_cv); 2498 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2499 } 2500 2501 /* 2502 * Terminate the previous traverse. 2503 */ 2504 if (spa->spa_scrub_th != NULL) { 2505 traverse_fini(spa->spa_scrub_th); 2506 spa->spa_scrub_th = NULL; 2507 } 2508 2509 if (rvd == NULL) { 2510 ASSERT(spa->spa_scrub_stop == 0); 2511 ASSERT(spa->spa_scrub_type == type); 2512 ASSERT(spa->spa_scrub_restart_txg == 0); 2513 mutex_exit(&spa->spa_scrub_lock); 2514 return (0); 2515 } 2516 2517 mintxg = TXG_INITIAL - 1; 2518 maxtxg = spa_last_synced_txg(spa) + 1; 2519 2520 mutex_enter(&rvd->vdev_dtl_lock); 2521 2522 if (rvd->vdev_dtl_map.sm_space == 0) { 2523 /* 2524 * The pool-wide DTL is empty. 2525 * If this is a resilver, there's nothing to do except 2526 * check whether any in-progress replacements have completed. 2527 */ 2528 if (type == POOL_SCRUB_RESILVER) { 2529 type = POOL_SCRUB_NONE; 2530 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2531 } 2532 } else { 2533 /* 2534 * The pool-wide DTL is non-empty. 2535 * If this is a normal scrub, upgrade to a resilver instead. 2536 */ 2537 if (type == POOL_SCRUB_EVERYTHING) 2538 type = POOL_SCRUB_RESILVER; 2539 } 2540 2541 if (type == POOL_SCRUB_RESILVER) { 2542 /* 2543 * Determine the resilvering boundaries. 2544 * 2545 * Note: (mintxg, maxtxg) is an open interval, 2546 * i.e. mintxg and maxtxg themselves are not included. 2547 * 2548 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2549 * so we don't claim to resilver a txg that's still changing. 
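 *
 * A worked example (numbers hypothetical): if the pool-wide DTL covers
 * txgs [100, 150) and the last synced txg is 200, then
 *
 *	mintxg = 100 - 1 = 99
 *	maxtxg = MIN(150, 200 + 1) = 150
 *
 * and the traverse covers the open interval (99, 150), i.e. exactly
 * txgs 100 through 149.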
2550 */ 2551 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2552 mintxg = ss->ss_start - 1; 2553 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2554 maxtxg = MIN(ss->ss_end, maxtxg); 2555 } 2556 2557 mutex_exit(&rvd->vdev_dtl_lock); 2558 2559 spa->spa_scrub_stop = 0; 2560 spa->spa_scrub_type = type; 2561 spa->spa_scrub_restart_txg = 0; 2562 2563 if (type != POOL_SCRUB_NONE) { 2564 spa->spa_scrub_mintxg = mintxg; 2565 spa->spa_scrub_maxtxg = maxtxg; 2566 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2567 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2568 ZIO_FLAG_CANFAIL); 2569 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2570 spa->spa_scrub_thread = thread_create(NULL, 0, 2571 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2572 } 2573 2574 mutex_exit(&spa->spa_scrub_lock); 2575 2576 return (0); 2577 } 2578 2579 /* 2580 * ========================================================================== 2581 * SPA async task processing 2582 * ========================================================================== 2583 */ 2584 2585 static void 2586 spa_async_reopen(spa_t *spa) 2587 { 2588 vdev_t *rvd = spa->spa_root_vdev; 2589 vdev_t *tvd; 2590 int c; 2591 2592 spa_config_enter(spa, RW_WRITER, FTAG); 2593 2594 for (c = 0; c < rvd->vdev_children; c++) { 2595 tvd = rvd->vdev_child[c]; 2596 if (tvd->vdev_reopen_wanted) { 2597 tvd->vdev_reopen_wanted = 0; 2598 vdev_reopen(tvd); 2599 } 2600 } 2601 2602 spa_config_exit(spa, FTAG); 2603 } 2604 2605 static void 2606 spa_async_thread(spa_t *spa) 2607 { 2608 int tasks; 2609 2610 ASSERT(spa->spa_sync_on); 2611 2612 mutex_enter(&spa->spa_async_lock); 2613 tasks = spa->spa_async_tasks; 2614 spa->spa_async_tasks = 0; 2615 mutex_exit(&spa->spa_async_lock); 2616 2617 /* 2618 * See if the config needs to be updated. 2619 */ 2620 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2621 mutex_enter(&spa_namespace_lock); 2622 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2623 mutex_exit(&spa_namespace_lock); 2624 } 2625 2626 /* 2627 * See if any devices need to be reopened. 2628 */ 2629 if (tasks & SPA_ASYNC_REOPEN) 2630 spa_async_reopen(spa); 2631 2632 /* 2633 * If any devices are done replacing, detach them. 2634 */ 2635 if (tasks & SPA_ASYNC_REPLACE_DONE) 2636 spa_vdev_replace_done(spa); 2637 2638 /* 2639 * Kick off a scrub. 2640 */ 2641 if (tasks & SPA_ASYNC_SCRUB) 2642 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2643 2644 /* 2645 * Kick off a resilver. 2646 */ 2647 if (tasks & SPA_ASYNC_RESILVER) 2648 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2649 2650 /* 2651 * Let the world know that we're done. 
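 *
 * "Done" means clearing spa_async_thread and broadcasting spa_async_cv;
 * that is the handshake spa_async_suspend() (further below) relies on
 * when it waits the thread out (sketch of the waiting side;
 * spa_async_suspend() also bumps spa_async_suspended first):
 *
 *	mutex_enter(&spa->spa_async_lock);
 *	while (spa->spa_async_thread != NULL)
 *		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 *	mutex_exit(&spa->spa_async_lock);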
2652 */ 2653 mutex_enter(&spa->spa_async_lock); 2654 spa->spa_async_thread = NULL; 2655 cv_broadcast(&spa->spa_async_cv); 2656 mutex_exit(&spa->spa_async_lock); 2657 thread_exit(); 2658 } 2659 2660 void 2661 spa_async_suspend(spa_t *spa) 2662 { 2663 mutex_enter(&spa->spa_async_lock); 2664 spa->spa_async_suspended++; 2665 while (spa->spa_async_thread != NULL) 2666 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2667 mutex_exit(&spa->spa_async_lock); 2668 } 2669 2670 void 2671 spa_async_resume(spa_t *spa) 2672 { 2673 mutex_enter(&spa->spa_async_lock); 2674 ASSERT(spa->spa_async_suspended != 0); 2675 spa->spa_async_suspended--; 2676 mutex_exit(&spa->spa_async_lock); 2677 } 2678 2679 static void 2680 spa_async_dispatch(spa_t *spa) 2681 { 2682 mutex_enter(&spa->spa_async_lock); 2683 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2684 spa->spa_async_thread == NULL && 2685 rootdir != NULL && !vn_is_readonly(rootdir)) 2686 spa->spa_async_thread = thread_create(NULL, 0, 2687 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2688 mutex_exit(&spa->spa_async_lock); 2689 } 2690 2691 void 2692 spa_async_request(spa_t *spa, int task) 2693 { 2694 mutex_enter(&spa->spa_async_lock); 2695 spa->spa_async_tasks |= task; 2696 mutex_exit(&spa->spa_async_lock); 2697 } 2698 2699 /* 2700 * ========================================================================== 2701 * SPA syncing routines 2702 * ========================================================================== 2703 */ 2704 2705 static void 2706 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2707 { 2708 bplist_t *bpl = &spa->spa_sync_bplist; 2709 dmu_tx_t *tx; 2710 blkptr_t blk; 2711 uint64_t itor = 0; 2712 zio_t *zio; 2713 int error; 2714 uint8_t c = 1; 2715 2716 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2717 2718 while (bplist_iterate(bpl, &itor, &blk) == 0) 2719 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2720 2721 error = zio_wait(zio); 2722 ASSERT3U(error, ==, 0); 2723 2724 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2725 bplist_vacate(bpl, tx); 2726 2727 /* 2728 * Pre-dirty the first block so we sync to convergence faster. 2729 * (Usually only the first block is needed.) 2730 */ 2731 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2732 dmu_tx_commit(tx); 2733 } 2734 2735 static void 2736 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2737 { 2738 char *packed = NULL; 2739 size_t nvsize = 0; 2740 dmu_buf_t *db; 2741 2742 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2743 2744 packed = kmem_alloc(nvsize, KM_SLEEP); 2745 2746 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2747 KM_SLEEP) == 0); 2748 2749 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2750 2751 kmem_free(packed, nvsize); 2752 2753 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2754 dmu_buf_will_dirty(db, tx); 2755 *(uint64_t *)db->db_data = nvsize; 2756 dmu_buf_rele(db, FTAG); 2757 } 2758 2759 static void 2760 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2761 { 2762 nvlist_t *nvroot; 2763 nvlist_t **spares; 2764 int i; 2765 2766 if (!spa->spa_sync_spares) 2767 return; 2768 2769 /* 2770 * Update the MOS nvlist describing the list of available spares. 2771 * spa_validate_spares() will have already made sure this nvlist is 2772 * valid and the vdevs are labelled appropriately. 
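 *
 * The resulting on-disk layout is, roughly (sketch):
 *
 *	MOS pool directory (DMU_POOL_DIRECTORY_OBJECT)
 *	    DMU_POOL_SPARES -> object # of a DMU_OT_PACKED_NVLIST object
 *		packed nvlist: { spares: [ vdev config, vdev config, ... ] }
 *
 * so the spare list can be re-read from the MOS the next time the pool
 * is loaded.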
2773 */ 2774 if (spa->spa_spares_object == 0) { 2775 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2776 DMU_OT_PACKED_NVLIST, 1 << 14, 2777 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2778 VERIFY(zap_update(spa->spa_meta_objset, 2779 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2780 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2781 } 2782 2783 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2784 if (spa->spa_nspares == 0) { 2785 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2786 NULL, 0) == 0); 2787 } else { 2788 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2789 KM_SLEEP); 2790 for (i = 0; i < spa->spa_nspares; i++) 2791 spares[i] = vdev_config_generate(spa, 2792 spa->spa_spares[i], B_FALSE, B_TRUE); 2793 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2794 spares, spa->spa_nspares) == 0); 2795 for (i = 0; i < spa->spa_nspares; i++) 2796 nvlist_free(spares[i]); 2797 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2798 } 2799 2800 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2801 nvlist_free(nvroot); 2802 2803 spa->spa_sync_spares = B_FALSE; 2804 } 2805 2806 static void 2807 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2808 { 2809 nvlist_t *config; 2810 2811 if (list_is_empty(&spa->spa_dirty_list)) 2812 return; 2813 2814 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2815 2816 if (spa->spa_config_syncing) 2817 nvlist_free(spa->spa_config_syncing); 2818 spa->spa_config_syncing = config; 2819 2820 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2821 } 2822 2823 /* 2824 * Sync the specified transaction group. New blocks may be dirtied as 2825 * part of the process, so we iterate until it converges. 2826 */ 2827 void 2828 spa_sync(spa_t *spa, uint64_t txg) 2829 { 2830 dsl_pool_t *dp = spa->spa_dsl_pool; 2831 objset_t *mos = spa->spa_meta_objset; 2832 bplist_t *bpl = &spa->spa_sync_bplist; 2833 vdev_t *rvd = spa->spa_root_vdev; 2834 vdev_t *vd; 2835 dmu_tx_t *tx; 2836 int dirty_vdevs; 2837 2838 /* 2839 * Lock out configuration changes. 2840 */ 2841 spa_config_enter(spa, RW_READER, FTAG); 2842 2843 spa->spa_syncing_txg = txg; 2844 spa->spa_sync_pass = 0; 2845 2846 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2847 2848 tx = dmu_tx_create_assigned(dp, txg); 2849 2850 /* 2851 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2852 * set spa_deflate if we have no raid-z vdevs. 2853 */ 2854 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2855 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2856 int i; 2857 2858 for (i = 0; i < rvd->vdev_children; i++) { 2859 vd = rvd->vdev_child[i]; 2860 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2861 break; 2862 } 2863 if (i == rvd->vdev_children) { 2864 spa->spa_deflate = TRUE; 2865 VERIFY(0 == zap_add(spa->spa_meta_objset, 2866 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2867 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2868 } 2869 } 2870 2871 /* 2872 * If anything has changed in this txg, push the deferred frees 2873 * from the previous txg. If not, leave them alone so that we 2874 * don't generate work on an otherwise idle system. 2875 */ 2876 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2877 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2878 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2879 spa_sync_deferred_frees(spa, txg); 2880 2881 /* 2882 * Iterate to convergence. 
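 *
 * Each pass can dirty new metadata (syncing datasets allocates blocks,
 * which dirties space maps, which dirties vdevs again), so we loop until
 * a pass finishes with no dirty vdevs left in this txg. A typical txg
 * settles after a few passes, along the lines of (sketch):
 *
 *	pass 1: datasets, spares, error log, config	-> vdevs dirtied
 *	pass 2: metaslab / space map updates		-> vdevs dirtied
 *	pass 3: nothing newly dirtied			-> dirty_vdevs == 0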
 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools. We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
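 *
 * The pattern below is the usual one for iterating over pools while doing
 * work that can block: hold spa_namespace_lock only while stepping the
 * iterator, and pin the current pool with a reference so it can't be torn
 * down while the lock is dropped (sketch):
 *
 *	spa_open_ref(spa, FTAG);		pin the pool
 *	mutex_exit(&spa_namespace_lock);
 *	txg_wait_synced(spa_get_dsl(spa), 0);	may block for a while
 *	mutex_enter(&spa_namespace_lock);
 *	spa_close(spa, FTAG);			drop the reference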
2997 */ 2998 void 2999 spa_sync_allpools(void) 3000 { 3001 spa_t *spa = NULL; 3002 mutex_enter(&spa_namespace_lock); 3003 while ((spa = spa_next(spa)) != NULL) { 3004 if (spa_state(spa) != POOL_STATE_ACTIVE) 3005 continue; 3006 spa_open_ref(spa, FTAG); 3007 mutex_exit(&spa_namespace_lock); 3008 txg_wait_synced(spa_get_dsl(spa), 0); 3009 mutex_enter(&spa_namespace_lock); 3010 spa_close(spa, FTAG); 3011 } 3012 mutex_exit(&spa_namespace_lock); 3013 } 3014 3015 /* 3016 * ========================================================================== 3017 * Miscellaneous routines 3018 * ========================================================================== 3019 */ 3020 3021 /* 3022 * Remove all pools in the system. 3023 */ 3024 void 3025 spa_evict_all(void) 3026 { 3027 spa_t *spa; 3028 3029 /* 3030 * Remove all cached state. All pools should be closed now, 3031 * so every spa in the AVL tree should be unreferenced. 3032 */ 3033 mutex_enter(&spa_namespace_lock); 3034 while ((spa = spa_next(NULL)) != NULL) { 3035 /* 3036 * Stop async tasks. The async thread may need to detach 3037 * a device that's been replaced, which requires grabbing 3038 * spa_namespace_lock, so we must drop it here. 3039 */ 3040 spa_open_ref(spa, FTAG); 3041 mutex_exit(&spa_namespace_lock); 3042 spa_async_suspend(spa); 3043 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3044 mutex_enter(&spa_namespace_lock); 3045 spa_close(spa, FTAG); 3046 3047 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3048 spa_unload(spa); 3049 spa_deactivate(spa); 3050 } 3051 spa_remove(spa); 3052 } 3053 mutex_exit(&spa_namespace_lock); 3054 } 3055 3056 vdev_t * 3057 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3058 { 3059 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3060 } 3061 3062 void 3063 spa_upgrade(spa_t *spa) 3064 { 3065 spa_config_enter(spa, RW_WRITER, FTAG); 3066 3067 /* 3068 * This should only be called for a non-faulted pool, and since a 3069 * future version would result in an unopenable pool, this shouldn't be 3070 * possible. 3071 */ 3072 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3073 3074 spa->spa_uberblock.ub_version = ZFS_VERSION; 3075 vdev_config_dirty(spa->spa_root_vdev); 3076 3077 spa_config_exit(spa, FTAG); 3078 3079 txg_wait_synced(spa_get_dsl(spa), 0); 3080 } 3081 3082 boolean_t 3083 spa_has_spare(spa_t *spa, uint64_t guid) 3084 { 3085 int i; 3086 uint64_t spareguid; 3087 3088 for (i = 0; i < spa->spa_nspares; i++) 3089 if (spa->spa_spares[i]->vdev_guid == guid) 3090 return (B_TRUE); 3091 3092 for (i = 0; i < spa->spa_pending_nspares; i++) { 3093 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3094 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3095 spareguid == guid) 3096 return (B_TRUE); 3097 } 3098 3099 return (B_FALSE); 3100 } 3101
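/*
 * A minimal usage sketch (hypothetical caller, loosely modeled on the
 * 'zpool upgrade' ioctl path) for the helpers above; error handling and
 * the pool name lookup are elided:
 *
 *	spa_t *spa;
 *
 *	if (spa_open(name, &spa, FTAG) == 0) {
 *		if (spa_version(spa) < ZFS_VERSION)
 *			spa_upgrade(spa);	labels rewritten at ZFS_VERSION
 *		spa_close(spa, FTAG);
 *	}
 */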