1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/unique.h> 52 #include <sys/dsl_pool.h> 53 #include <sys/dsl_dir.h> 54 #include <sys/dsl_prop.h> 55 #include <sys/fs/zfs.h> 56 #include <sys/callb.h> 57 58 /* 59 * ========================================================================== 60 * SPA state manipulation (open/create/destroy/import/export) 61 * ========================================================================== 62 */ 63 64 static int 65 spa_error_entry_compare(const void *a, const void *b) 66 { 67 spa_error_entry_t *sa = (spa_error_entry_t *)a; 68 spa_error_entry_t *sb = (spa_error_entry_t *)b; 69 int ret; 70 71 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 72 sizeof (zbookmark_t)); 73 74 if (ret < 0) 75 return (-1); 76 else if (ret > 0) 77 return (1); 78 else 79 return (0); 80 } 81 82 /* 83 * Utility function which retrieves copies of the current logs and 84 * re-initializes them in the process. 85 */ 86 void 87 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 88 { 89 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 90 91 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 92 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 93 94 avl_create(&spa->spa_errlist_scrub, 95 spa_error_entry_compare, sizeof (spa_error_entry_t), 96 offsetof(spa_error_entry_t, se_avl)); 97 avl_create(&spa->spa_errlist_last, 98 spa_error_entry_compare, sizeof (spa_error_entry_t), 99 offsetof(spa_error_entry_t, se_avl)); 100 } 101 102 /* 103 * Activate an uninitialized pool. 
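 * This sets up the per-pool I/O taskqs, the normal metaslab class, and the
 * locks, lists, and error trees that spa_deactivate() later tears back down.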
104 */ 105 static void 106 spa_activate(spa_t *spa) 107 { 108 int t; 109 110 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 111 112 spa->spa_state = POOL_STATE_ACTIVE; 113 114 spa->spa_normal_class = metaslab_class_create(); 115 116 for (t = 0; t < ZIO_TYPES; t++) { 117 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 118 8, maxclsyspri, 50, INT_MAX, 119 TASKQ_PREPOPULATE); 120 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 121 8, maxclsyspri, 50, INT_MAX, 122 TASKQ_PREPOPULATE); 123 } 124 125 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 126 127 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 128 offsetof(vdev_t, vdev_dirty_node)); 129 130 txg_list_create(&spa->spa_vdev_txg_list, 131 offsetof(struct vdev, vdev_txg_node)); 132 133 avl_create(&spa->spa_errlist_scrub, 134 spa_error_entry_compare, sizeof (spa_error_entry_t), 135 offsetof(spa_error_entry_t, se_avl)); 136 avl_create(&spa->spa_errlist_last, 137 spa_error_entry_compare, sizeof (spa_error_entry_t), 138 offsetof(spa_error_entry_t, se_avl)); 139 } 140 141 /* 142 * Opposite of spa_activate(). 143 */ 144 static void 145 spa_deactivate(spa_t *spa) 146 { 147 int t; 148 149 ASSERT(spa->spa_sync_on == B_FALSE); 150 ASSERT(spa->spa_dsl_pool == NULL); 151 ASSERT(spa->spa_root_vdev == NULL); 152 153 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 154 155 txg_list_destroy(&spa->spa_vdev_txg_list); 156 157 list_destroy(&spa->spa_dirty_list); 158 159 rw_destroy(&spa->spa_traverse_lock); 160 161 for (t = 0; t < ZIO_TYPES; t++) { 162 taskq_destroy(spa->spa_zio_issue_taskq[t]); 163 taskq_destroy(spa->spa_zio_intr_taskq[t]); 164 spa->spa_zio_issue_taskq[t] = NULL; 165 spa->spa_zio_intr_taskq[t] = NULL; 166 } 167 168 metaslab_class_destroy(spa->spa_normal_class); 169 spa->spa_normal_class = NULL; 170 171 /* 172 * If this was part of an import or the open otherwise failed, we may 173 * still have errors left in the queues. Empty them just in case. 174 */ 175 spa_errlog_drain(spa); 176 177 avl_destroy(&spa->spa_errlist_scrub); 178 avl_destroy(&spa->spa_errlist_last); 179 180 spa->spa_state = POOL_STATE_UNINITIALIZED; 181 } 182 183 /* 184 * Verify a pool configuration, and construct the vdev tree appropriately. This 185 * will create all the necessary vdevs in the appropriate layout, with each vdev 186 * in the CLOSED state. This will prep the pool before open/creation/import. 187 * All vdev validation is done by the vdev_alloc() routine. 188 */ 189 static int 190 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 191 uint_t id, int atype) 192 { 193 nvlist_t **child; 194 uint_t c, children; 195 int error; 196 197 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 198 return (error); 199 200 if ((*vdp)->vdev_ops->vdev_op_leaf) 201 return (0); 202 203 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 204 &child, &children) != 0) { 205 vdev_free(*vdp); 206 *vdp = NULL; 207 return (EINVAL); 208 } 209 210 for (c = 0; c < children; c++) { 211 vdev_t *vd; 212 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 213 atype)) != 0) { 214 vdev_free(*vdp); 215 *vdp = NULL; 216 return (error); 217 } 218 } 219 220 ASSERT(*vdp != NULL); 221 222 return (0); 223 } 224 225 /* 226 * Opposite of spa_load(). 227 */ 228 static void 229 spa_unload(spa_t *spa) 230 { 231 int i; 232 233 /* 234 * Stop async tasks. 235 */ 236 spa_async_suspend(spa); 237 238 /* 239 * Stop syncing. 
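	 * If the sync thread was started, stop it before tearing down the
	 * rest of the pool state.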
240 */ 241 if (spa->spa_sync_on) { 242 txg_sync_stop(spa->spa_dsl_pool); 243 spa->spa_sync_on = B_FALSE; 244 } 245 246 /* 247 * Wait for any outstanding prefetch I/O to complete. 248 */ 249 spa_config_enter(spa, RW_WRITER, FTAG); 250 spa_config_exit(spa, FTAG); 251 252 /* 253 * Close the dsl pool. 254 */ 255 if (spa->spa_dsl_pool) { 256 dsl_pool_close(spa->spa_dsl_pool); 257 spa->spa_dsl_pool = NULL; 258 } 259 260 /* 261 * Close all vdevs. 262 */ 263 if (spa->spa_root_vdev) 264 vdev_free(spa->spa_root_vdev); 265 ASSERT(spa->spa_root_vdev == NULL); 266 267 for (i = 0; i < spa->spa_nspares; i++) 268 vdev_free(spa->spa_spares[i]); 269 if (spa->spa_spares) { 270 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 271 spa->spa_spares = NULL; 272 } 273 if (spa->spa_sparelist) { 274 nvlist_free(spa->spa_sparelist); 275 spa->spa_sparelist = NULL; 276 } 277 278 spa->spa_async_suspended = 0; 279 } 280 281 /* 282 * Load (or re-load) the current list of vdevs describing the active spares for 283 * this pool. When this is called, we have some form of basic information in 284 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 285 * re-generate a more complete list including status information. 286 */ 287 static void 288 spa_load_spares(spa_t *spa) 289 { 290 nvlist_t **spares; 291 uint_t nspares; 292 int i; 293 294 /* 295 * First, close and free any existing spare vdevs. 296 */ 297 for (i = 0; i < spa->spa_nspares; i++) { 298 vdev_close(spa->spa_spares[i]); 299 vdev_free(spa->spa_spares[i]); 300 } 301 if (spa->spa_spares) 302 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 303 304 if (spa->spa_sparelist == NULL) 305 nspares = 0; 306 else 307 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 308 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 309 310 spa->spa_nspares = (int)nspares; 311 spa->spa_spares = NULL; 312 313 if (nspares == 0) 314 return; 315 316 /* 317 * Construct the array of vdevs, opening them to get status in the 318 * process. 319 */ 320 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 321 for (i = 0; i < spa->spa_nspares; i++) { 322 vdev_t *vd; 323 324 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 325 VDEV_ALLOC_SPARE) == 0); 326 ASSERT(vd != NULL); 327 328 spa->spa_spares[i] = vd; 329 330 if (vdev_open(vd) != 0) 331 continue; 332 333 vd->vdev_top = vd; 334 (void) vdev_validate_spare(vd); 335 } 336 337 /* 338 * Recompute the stashed list of spares, with status information 339 * this time. 
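	 * The per-vdev status was gathered by the vdev_open() and
	 * vdev_validate_spare() calls above.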
340 */ 341 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 342 DATA_TYPE_NVLIST_ARRAY) == 0); 343 344 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 345 for (i = 0; i < spa->spa_nspares; i++) 346 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 347 B_TRUE, B_TRUE); 348 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 349 spares, spa->spa_nspares) == 0); 350 for (i = 0; i < spa->spa_nspares; i++) 351 nvlist_free(spares[i]); 352 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 353 } 354 355 static int 356 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 357 { 358 dmu_buf_t *db; 359 char *packed = NULL; 360 size_t nvsize = 0; 361 int error; 362 *value = NULL; 363 364 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 365 nvsize = *(uint64_t *)db->db_data; 366 dmu_buf_rele(db, FTAG); 367 368 packed = kmem_alloc(nvsize, KM_SLEEP); 369 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 370 if (error == 0) 371 error = nvlist_unpack(packed, nvsize, value, 0); 372 kmem_free(packed, nvsize); 373 374 return (error); 375 } 376 377 /* 378 * Load an existing storage pool, using the pool's builtin spa_config as a 379 * source of configuration information. 380 */ 381 static int 382 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 383 { 384 int error = 0; 385 nvlist_t *nvroot = NULL; 386 vdev_t *rvd; 387 uberblock_t *ub = &spa->spa_uberblock; 388 uint64_t config_cache_txg = spa->spa_config_txg; 389 uint64_t pool_guid; 390 uint64_t version; 391 zio_t *zio; 392 393 spa->spa_load_state = state; 394 395 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 396 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 397 error = EINVAL; 398 goto out; 399 } 400 401 /* 402 * Versioning wasn't explicitly added to the label until later, so if 403 * it's not present treat it as the initial version. 404 */ 405 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 406 version = ZFS_VERSION_INITIAL; 407 408 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 409 &spa->spa_config_txg); 410 411 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 412 spa_guid_exists(pool_guid, 0)) { 413 error = EEXIST; 414 goto out; 415 } 416 417 spa->spa_load_guid = pool_guid; 418 419 /* 420 * Parse the configuration into a vdev tree. We explicitly set the 421 * value that will be returned by spa_version() since parsing the 422 * configuration requires knowing the version number. 423 */ 424 spa_config_enter(spa, RW_WRITER, FTAG); 425 spa->spa_ubsync.ub_version = version; 426 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 427 spa_config_exit(spa, FTAG); 428 429 if (error != 0) 430 goto out; 431 432 ASSERT(spa->spa_root_vdev == rvd); 433 ASSERT(spa_guid(spa) == pool_guid); 434 435 /* 436 * Try to open all vdevs, loading each label in the process. 437 */ 438 if (vdev_open(rvd) != 0) { 439 error = ENXIO; 440 goto out; 441 } 442 443 /* 444 * Validate the labels for all leaf vdevs. We need to grab the config 445 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 446 * flag. 447 */ 448 spa_config_enter(spa, RW_READER, FTAG); 449 error = vdev_validate(rvd); 450 spa_config_exit(spa, FTAG); 451 452 if (error != 0) { 453 error = EBADF; 454 goto out; 455 } 456 457 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 458 error = ENXIO; 459 goto out; 460 } 461 462 /* 463 * Find the best uberblock. 
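	 * vdev_uberblock_load() examines every label and keeps the newest
	 * valid uberblock it finds.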
464 */ 465 bzero(ub, sizeof (uberblock_t)); 466 467 zio = zio_root(spa, NULL, NULL, 468 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 469 vdev_uberblock_load(zio, rvd, ub); 470 error = zio_wait(zio); 471 472 /* 473 * If we weren't able to find a single valid uberblock, return failure. 474 */ 475 if (ub->ub_txg == 0) { 476 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 477 VDEV_AUX_CORRUPT_DATA); 478 error = ENXIO; 479 goto out; 480 } 481 482 /* 483 * If the pool is newer than the code, we can't open it. 484 */ 485 if (ub->ub_version > ZFS_VERSION) { 486 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 487 VDEV_AUX_VERSION_NEWER); 488 error = ENOTSUP; 489 goto out; 490 } 491 492 /* 493 * If the vdev guid sum doesn't match the uberblock, we have an 494 * incomplete configuration. 495 */ 496 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 497 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 498 VDEV_AUX_BAD_GUID_SUM); 499 error = ENXIO; 500 goto out; 501 } 502 503 /* 504 * Initialize internal SPA structures. 505 */ 506 spa->spa_state = POOL_STATE_ACTIVE; 507 spa->spa_ubsync = spa->spa_uberblock; 508 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 509 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 510 if (error) { 511 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 512 VDEV_AUX_CORRUPT_DATA); 513 goto out; 514 } 515 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 516 517 if (zap_lookup(spa->spa_meta_objset, 518 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 519 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 520 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 521 VDEV_AUX_CORRUPT_DATA); 522 error = EIO; 523 goto out; 524 } 525 526 if (!mosconfig) { 527 nvlist_t *newconfig; 528 529 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 530 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 531 VDEV_AUX_CORRUPT_DATA); 532 error = EIO; 533 goto out; 534 } 535 536 spa_config_set(spa, newconfig); 537 spa_unload(spa); 538 spa_deactivate(spa); 539 spa_activate(spa); 540 541 return (spa_load(spa, newconfig, state, B_TRUE)); 542 } 543 544 if (zap_lookup(spa->spa_meta_objset, 545 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 546 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 547 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 548 VDEV_AUX_CORRUPT_DATA); 549 error = EIO; 550 goto out; 551 } 552 553 /* 554 * Load the bit that tells us to use the new accounting function 555 * (raid-z deflation). If we have an older pool, this will not 556 * be present. 557 */ 558 error = zap_lookup(spa->spa_meta_objset, 559 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 560 sizeof (uint64_t), 1, &spa->spa_deflate); 561 if (error != 0 && error != ENOENT) { 562 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 563 VDEV_AUX_CORRUPT_DATA); 564 error = EIO; 565 goto out; 566 } 567 568 /* 569 * Load the persistent error log. If we have an older pool, this will 570 * not be present. 
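	 * (That is why ENOENT is not treated as fatal below.)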
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
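 * The hold is dropped again with spa_inject_delref().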
810 */ 811 spa_t * 812 spa_inject_addref(char *name) 813 { 814 spa_t *spa; 815 816 mutex_enter(&spa_namespace_lock); 817 if ((spa = spa_lookup(name)) == NULL) { 818 mutex_exit(&spa_namespace_lock); 819 return (NULL); 820 } 821 spa->spa_inject_ref++; 822 mutex_exit(&spa_namespace_lock); 823 824 return (spa); 825 } 826 827 void 828 spa_inject_delref(spa_t *spa) 829 { 830 mutex_enter(&spa_namespace_lock); 831 spa->spa_inject_ref--; 832 mutex_exit(&spa_namespace_lock); 833 } 834 835 static void 836 spa_add_spares(spa_t *spa, nvlist_t *config) 837 { 838 nvlist_t **spares; 839 uint_t i, nspares; 840 nvlist_t *nvroot; 841 uint64_t guid; 842 vdev_stat_t *vs; 843 uint_t vsc; 844 845 if (spa->spa_nspares == 0) 846 return; 847 848 VERIFY(nvlist_lookup_nvlist(config, 849 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 850 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 851 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 852 if (nspares != 0) { 853 VERIFY(nvlist_add_nvlist_array(nvroot, 854 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 855 VERIFY(nvlist_lookup_nvlist_array(nvroot, 856 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 857 858 /* 859 * Go through and find any spares which have since been 860 * repurposed as an active spare. If this is the case, update 861 * their status appropriately. 862 */ 863 for (i = 0; i < nspares; i++) { 864 VERIFY(nvlist_lookup_uint64(spares[i], 865 ZPOOL_CONFIG_GUID, &guid) == 0); 866 if (spa_spare_inuse(guid)) { 867 VERIFY(nvlist_lookup_uint64_array( 868 spares[i], ZPOOL_CONFIG_STATS, 869 (uint64_t **)&vs, &vsc) == 0); 870 vs->vs_state = VDEV_STATE_CANT_OPEN; 871 vs->vs_aux = VDEV_AUX_SPARED; 872 } 873 } 874 } 875 } 876 877 int 878 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 879 { 880 int error; 881 spa_t *spa; 882 883 *config = NULL; 884 error = spa_open_common(name, &spa, FTAG, config); 885 886 if (spa && *config != NULL) { 887 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 888 spa_get_errlog_size(spa)) == 0); 889 890 spa_add_spares(spa, *config); 891 } 892 893 /* 894 * We want to get the alternate root even for faulted pools, so we cheat 895 * and call spa_lookup() directly. 896 */ 897 if (altroot) { 898 if (spa == NULL) { 899 mutex_enter(&spa_namespace_lock); 900 spa = spa_lookup(name); 901 if (spa) 902 spa_altroot(spa, altroot, buflen); 903 else 904 altroot[0] = '\0'; 905 spa = NULL; 906 mutex_exit(&spa_namespace_lock); 907 } else { 908 spa_altroot(spa, altroot, buflen); 909 } 910 } 911 912 if (spa != NULL) 913 spa_close(spa, FTAG); 914 915 return (error); 916 } 917 918 /* 919 * Validate that the 'spares' array is well formed. We must have an array of 920 * nvlists, each which describes a valid leaf vdev. 921 */ 922 static int 923 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 924 { 925 nvlist_t **spares; 926 uint_t i, nspares; 927 vdev_t *vd; 928 int error; 929 930 /* 931 * It's acceptable to have no spares specified. 932 */ 933 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 934 &spares, &nspares) != 0) 935 return (0); 936 937 if (nspares == 0) 938 return (EINVAL); 939 940 /* 941 * Make sure the pool is formatted with a version that supports hot 942 * spares. 
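	 * (ZFS_VERSION_SPARES or later.)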
943 */ 944 if (spa_version(spa) < ZFS_VERSION_SPARES) 945 return (ENOTSUP); 946 947 for (i = 0; i < nspares; i++) { 948 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 949 mode)) != 0) 950 return (error); 951 952 if (!vd->vdev_ops->vdev_op_leaf) { 953 vdev_free(vd); 954 return (EINVAL); 955 } 956 957 if ((error = vdev_open(vd)) != 0) { 958 vdev_free(vd); 959 return (error); 960 } 961 962 vd->vdev_top = vd; 963 if ((error = vdev_label_spare(vd, crtxg)) != 0) { 964 vdev_free(vd); 965 return (error); 966 } 967 968 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 969 vd->vdev_guid) == 0); 970 971 vdev_free(vd); 972 } 973 974 return (0); 975 } 976 977 /* 978 * Pool Creation 979 */ 980 int 981 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 982 { 983 spa_t *spa; 984 vdev_t *rvd; 985 dsl_pool_t *dp; 986 dmu_tx_t *tx; 987 int c, error = 0; 988 uint64_t txg = TXG_INITIAL; 989 nvlist_t **spares; 990 uint_t nspares; 991 992 /* 993 * If this pool already exists, return failure. 994 */ 995 mutex_enter(&spa_namespace_lock); 996 if (spa_lookup(pool) != NULL) { 997 mutex_exit(&spa_namespace_lock); 998 return (EEXIST); 999 } 1000 1001 /* 1002 * Allocate a new spa_t structure. 1003 */ 1004 spa = spa_add(pool, altroot); 1005 spa_activate(spa); 1006 1007 spa->spa_uberblock.ub_txg = txg - 1; 1008 spa->spa_uberblock.ub_version = ZFS_VERSION; 1009 spa->spa_ubsync = spa->spa_uberblock; 1010 1011 /* 1012 * Create the root vdev. 1013 */ 1014 spa_config_enter(spa, RW_WRITER, FTAG); 1015 1016 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1017 1018 ASSERT(error != 0 || rvd != NULL); 1019 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1020 1021 if (error == 0 && rvd->vdev_children == 0) 1022 error = EINVAL; 1023 1024 if (error == 0 && 1025 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1026 (error = spa_validate_spares(spa, nvroot, txg, 1027 VDEV_ALLOC_ADD)) == 0) { 1028 for (c = 0; c < rvd->vdev_children; c++) 1029 vdev_init(rvd->vdev_child[c], txg); 1030 vdev_config_dirty(rvd); 1031 } 1032 1033 spa_config_exit(spa, FTAG); 1034 1035 if (error != 0) { 1036 spa_unload(spa); 1037 spa_deactivate(spa); 1038 spa_remove(spa); 1039 mutex_exit(&spa_namespace_lock); 1040 return (error); 1041 } 1042 1043 /* 1044 * Get the list of spares, if specified. 1045 */ 1046 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1047 &spares, &nspares) == 0) { 1048 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1049 KM_SLEEP) == 0); 1050 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1051 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1052 spa_config_enter(spa, RW_WRITER, FTAG); 1053 spa_load_spares(spa); 1054 spa_config_exit(spa, FTAG); 1055 spa->spa_sync_spares = B_TRUE; 1056 } 1057 1058 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1059 spa->spa_meta_objset = dp->dp_meta_objset; 1060 1061 tx = dmu_tx_create_assigned(dp, txg); 1062 1063 /* 1064 * Create the pool config object. 1065 */ 1066 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1067 DMU_OT_PACKED_NVLIST, 1 << 14, 1068 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1069 1070 if (zap_add(spa->spa_meta_objset, 1071 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1072 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1073 cmn_err(CE_PANIC, "failed to add pool config"); 1074 } 1075 1076 /* Newly created pools are always deflated. 
*/ 1077 spa->spa_deflate = TRUE; 1078 if (zap_add(spa->spa_meta_objset, 1079 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1080 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1081 cmn_err(CE_PANIC, "failed to add deflate"); 1082 } 1083 1084 /* 1085 * Create the deferred-free bplist object. Turn off compression 1086 * because sync-to-convergence takes longer if the blocksize 1087 * keeps changing. 1088 */ 1089 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1090 1 << 14, tx); 1091 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1092 ZIO_COMPRESS_OFF, tx); 1093 1094 if (zap_add(spa->spa_meta_objset, 1095 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1096 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1097 cmn_err(CE_PANIC, "failed to add bplist"); 1098 } 1099 1100 dmu_tx_commit(tx); 1101 1102 spa->spa_sync_on = B_TRUE; 1103 txg_sync_start(spa->spa_dsl_pool); 1104 1105 /* 1106 * We explicitly wait for the first transaction to complete so that our 1107 * bean counters are appropriately updated. 1108 */ 1109 txg_wait_synced(spa->spa_dsl_pool, txg); 1110 1111 spa_config_sync(); 1112 1113 mutex_exit(&spa_namespace_lock); 1114 1115 return (0); 1116 } 1117 1118 /* 1119 * Import the given pool into the system. We set up the necessary spa_t and 1120 * then call spa_load() to do the dirty work. 1121 */ 1122 int 1123 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1124 { 1125 spa_t *spa; 1126 int error; 1127 nvlist_t *nvroot; 1128 nvlist_t **spares; 1129 uint_t nspares; 1130 1131 if (!(spa_mode & FWRITE)) 1132 return (EROFS); 1133 1134 /* 1135 * If a pool with this name exists, return failure. 1136 */ 1137 mutex_enter(&spa_namespace_lock); 1138 if (spa_lookup(pool) != NULL) { 1139 mutex_exit(&spa_namespace_lock); 1140 return (EEXIST); 1141 } 1142 1143 /* 1144 * Create and initialize the spa structure. 1145 */ 1146 spa = spa_add(pool, altroot); 1147 spa_activate(spa); 1148 1149 /* 1150 * Pass off the heavy lifting to spa_load(). 1151 * Pass TRUE for mosconfig because the user-supplied config 1152 * is actually the one to trust when doing an import. 1153 */ 1154 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1155 1156 spa_config_enter(spa, RW_WRITER, FTAG); 1157 /* 1158 * Toss any existing sparelist, as it doesn't have any validity anymore, 1159 * and conflicts with spa_has_spare(). 1160 */ 1161 if (spa->spa_sparelist) { 1162 nvlist_free(spa->spa_sparelist); 1163 spa->spa_sparelist = NULL; 1164 spa_load_spares(spa); 1165 } 1166 1167 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1168 &nvroot) == 0); 1169 if (error == 0) 1170 error = spa_validate_spares(spa, nvroot, -1ULL, 1171 VDEV_ALLOC_SPARE); 1172 spa_config_exit(spa, FTAG); 1173 1174 if (error != 0) { 1175 spa_unload(spa); 1176 spa_deactivate(spa); 1177 spa_remove(spa); 1178 mutex_exit(&spa_namespace_lock); 1179 return (error); 1180 } 1181 1182 /* 1183 * Override any spares as specified by the user, as these may have 1184 * correct device names/devids, etc. 
1185 */ 1186 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1187 &spares, &nspares) == 0) { 1188 if (spa->spa_sparelist) 1189 VERIFY(nvlist_remove(spa->spa_sparelist, 1190 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1191 else 1192 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1193 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1194 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1195 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1196 spa_config_enter(spa, RW_WRITER, FTAG); 1197 spa_load_spares(spa); 1198 spa_config_exit(spa, FTAG); 1199 spa->spa_sync_spares = B_TRUE; 1200 } 1201 1202 /* 1203 * Update the config cache to include the newly-imported pool. 1204 */ 1205 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1206 1207 mutex_exit(&spa_namespace_lock); 1208 1209 /* 1210 * Resilver anything that's out of date. 1211 */ 1212 if (spa_mode & FWRITE) 1213 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1214 1215 return (0); 1216 } 1217 1218 /* 1219 * This (illegal) pool name is used when temporarily importing a spa_t in order 1220 * to get the vdev stats associated with the imported devices. 1221 */ 1222 #define TRYIMPORT_NAME "$import" 1223 1224 nvlist_t * 1225 spa_tryimport(nvlist_t *tryconfig) 1226 { 1227 nvlist_t *config = NULL; 1228 char *poolname; 1229 spa_t *spa; 1230 uint64_t state; 1231 1232 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1233 return (NULL); 1234 1235 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1236 return (NULL); 1237 1238 /* 1239 * Create and initialize the spa structure. 1240 */ 1241 mutex_enter(&spa_namespace_lock); 1242 spa = spa_add(TRYIMPORT_NAME, NULL); 1243 spa_activate(spa); 1244 1245 /* 1246 * Pass off the heavy lifting to spa_load(). 1247 * Pass TRUE for mosconfig because the user-supplied config 1248 * is actually the one to trust when doing an import. 1249 */ 1250 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1251 1252 /* 1253 * If 'tryconfig' was at least parsable, return the current config. 1254 */ 1255 if (spa->spa_root_vdev != NULL) { 1256 spa_config_enter(spa, RW_READER, FTAG); 1257 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1258 spa_config_exit(spa, FTAG); 1259 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1260 poolname) == 0); 1261 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1262 state) == 0); 1263 1264 /* 1265 * Add the list of hot spares. 1266 */ 1267 spa_add_spares(spa, config); 1268 } 1269 1270 spa_unload(spa); 1271 spa_deactivate(spa); 1272 spa_remove(spa); 1273 mutex_exit(&spa_namespace_lock); 1274 1275 return (config); 1276 } 1277 1278 /* 1279 * Pool export/destroy 1280 * 1281 * The act of destroying or exporting a pool is very simple. We make sure there 1282 * is no more pending I/O and any references to the pool are gone. Then, we 1283 * update the pool state and sync all the labels to disk, removing the 1284 * configuration from the cache afterwards. 1285 */ 1286 static int 1287 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1288 { 1289 spa_t *spa; 1290 1291 if (oldconfig) 1292 *oldconfig = NULL; 1293 1294 if (!(spa_mode & FWRITE)) 1295 return (EROFS); 1296 1297 mutex_enter(&spa_namespace_lock); 1298 if ((spa = spa_lookup(pool)) == NULL) { 1299 mutex_exit(&spa_namespace_lock); 1300 return (ENOENT); 1301 } 1302 1303 /* 1304 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1305 * reacquire the namespace lock, and see if we can export. 
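	 * The extra reference keeps the pool from being exported or destroyed
	 * while the namespace lock is dropped.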
1306 */ 1307 spa_open_ref(spa, FTAG); 1308 mutex_exit(&spa_namespace_lock); 1309 spa_async_suspend(spa); 1310 mutex_enter(&spa_namespace_lock); 1311 spa_close(spa, FTAG); 1312 1313 /* 1314 * The pool will be in core if it's openable, 1315 * in which case we can modify its state. 1316 */ 1317 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1318 /* 1319 * Objsets may be open only because they're dirty, so we 1320 * have to force it to sync before checking spa_refcnt. 1321 */ 1322 spa_scrub_suspend(spa); 1323 txg_wait_synced(spa->spa_dsl_pool, 0); 1324 1325 /* 1326 * A pool cannot be exported or destroyed if there are active 1327 * references. If we are resetting a pool, allow references by 1328 * fault injection handlers. 1329 */ 1330 if (!spa_refcount_zero(spa) || 1331 (spa->spa_inject_ref != 0 && 1332 new_state != POOL_STATE_UNINITIALIZED)) { 1333 spa_scrub_resume(spa); 1334 spa_async_resume(spa); 1335 mutex_exit(&spa_namespace_lock); 1336 return (EBUSY); 1337 } 1338 1339 spa_scrub_resume(spa); 1340 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1341 1342 /* 1343 * We want this to be reflected on every label, 1344 * so mark them all dirty. spa_unload() will do the 1345 * final sync that pushes these changes out. 1346 */ 1347 if (new_state != POOL_STATE_UNINITIALIZED) { 1348 spa_config_enter(spa, RW_WRITER, FTAG); 1349 spa->spa_state = new_state; 1350 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1351 vdev_config_dirty(spa->spa_root_vdev); 1352 spa_config_exit(spa, FTAG); 1353 } 1354 } 1355 1356 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1357 spa_unload(spa); 1358 spa_deactivate(spa); 1359 } 1360 1361 if (oldconfig && spa->spa_config) 1362 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1363 1364 if (new_state != POOL_STATE_UNINITIALIZED) { 1365 spa_remove(spa); 1366 spa_config_sync(); 1367 } 1368 mutex_exit(&spa_namespace_lock); 1369 1370 return (0); 1371 } 1372 1373 /* 1374 * Destroy a storage pool. 1375 */ 1376 int 1377 spa_destroy(char *pool) 1378 { 1379 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1380 } 1381 1382 /* 1383 * Export a storage pool. 1384 */ 1385 int 1386 spa_export(char *pool, nvlist_t **oldconfig) 1387 { 1388 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1389 } 1390 1391 /* 1392 * Similar to spa_export(), this unloads the spa_t without actually removing it 1393 * from the namespace in any way. 1394 */ 1395 int 1396 spa_reset(char *pool) 1397 { 1398 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1399 } 1400 1401 1402 /* 1403 * ========================================================================== 1404 * Device manipulation 1405 * ========================================================================== 1406 */ 1407 1408 /* 1409 * Add capacity to a storage pool. 
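 * The supplied nvroot may describe new top-level vdevs, hot spares, or both.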
1410 */ 1411 int 1412 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1413 { 1414 uint64_t txg; 1415 int c, error; 1416 vdev_t *rvd = spa->spa_root_vdev; 1417 vdev_t *vd, *tvd; 1418 nvlist_t **spares; 1419 uint_t i, nspares; 1420 1421 txg = spa_vdev_enter(spa); 1422 1423 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1424 VDEV_ALLOC_ADD)) != 0) 1425 return (spa_vdev_exit(spa, NULL, txg, error)); 1426 1427 if ((error = spa_validate_spares(spa, nvroot, txg, 1428 VDEV_ALLOC_ADD)) != 0) 1429 return (spa_vdev_exit(spa, vd, txg, error)); 1430 1431 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1432 &spares, &nspares) != 0) 1433 nspares = 0; 1434 1435 if (vd->vdev_children == 0 && nspares == 0) 1436 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1437 1438 if (vd->vdev_children != 0) { 1439 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) 1440 return (spa_vdev_exit(spa, vd, txg, error)); 1441 1442 /* 1443 * Transfer each new top-level vdev from vd to rvd. 1444 */ 1445 for (c = 0; c < vd->vdev_children; c++) { 1446 tvd = vd->vdev_child[c]; 1447 vdev_remove_child(vd, tvd); 1448 tvd->vdev_id = rvd->vdev_children; 1449 vdev_add_child(rvd, tvd); 1450 vdev_config_dirty(tvd); 1451 } 1452 } 1453 1454 if (nspares != 0) { 1455 if (spa->spa_sparelist != NULL) { 1456 nvlist_t **oldspares; 1457 uint_t oldnspares; 1458 nvlist_t **newspares; 1459 1460 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1461 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1462 1463 newspares = kmem_alloc(sizeof (void *) * 1464 (nspares + oldnspares), KM_SLEEP); 1465 for (i = 0; i < oldnspares; i++) 1466 VERIFY(nvlist_dup(oldspares[i], 1467 &newspares[i], KM_SLEEP) == 0); 1468 for (i = 0; i < nspares; i++) 1469 VERIFY(nvlist_dup(spares[i], 1470 &newspares[i + oldnspares], 1471 KM_SLEEP) == 0); 1472 1473 VERIFY(nvlist_remove(spa->spa_sparelist, 1474 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1475 1476 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1477 ZPOOL_CONFIG_SPARES, newspares, 1478 nspares + oldnspares) == 0); 1479 for (i = 0; i < oldnspares + nspares; i++) 1480 nvlist_free(newspares[i]); 1481 kmem_free(newspares, (oldnspares + nspares) * 1482 sizeof (void *)); 1483 } else { 1484 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1485 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1486 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1487 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1488 } 1489 1490 spa_load_spares(spa); 1491 spa->spa_sync_spares = B_TRUE; 1492 } 1493 1494 /* 1495 * We have to be careful when adding new vdevs to an existing pool. 1496 * If other threads start allocating from these vdevs before we 1497 * sync the config cache, and we lose power, then upon reboot we may 1498 * fail to open the pool because there are DVAs that the config cache 1499 * can't translate. Therefore, we first add the vdevs without 1500 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1501 * and then let spa_config_update() initialize the new metaslabs. 1502 * 1503 * spa_load() checks for added-but-not-initialized vdevs, so that 1504 * if we lose power at any point in this sequence, the remaining 1505 * steps will be completed the next time we load the pool. 1506 */ 1507 (void) spa_vdev_exit(spa, vd, txg, 0); 1508 1509 mutex_enter(&spa_namespace_lock); 1510 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1511 mutex_exit(&spa_namespace_lock); 1512 1513 return (0); 1514 } 1515 1516 /* 1517 * Attach a device to a mirror. 
 * The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
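	 * (oldvd's path becomes newvd's path with "/old" appended, and its
	 * devid is cleared.)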
1613 */ 1614 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1615 spa_strfree(oldvd->vdev_path); 1616 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1617 KM_SLEEP); 1618 (void) sprintf(oldvd->vdev_path, "%s/%s", 1619 newvd->vdev_path, "old"); 1620 if (oldvd->vdev_devid != NULL) { 1621 spa_strfree(oldvd->vdev_devid); 1622 oldvd->vdev_devid = NULL; 1623 } 1624 } 1625 1626 /* 1627 * If the parent is not a mirror, or if we're replacing, insert the new 1628 * mirror/replacing/spare vdev above oldvd. 1629 */ 1630 if (pvd->vdev_ops != pvops) 1631 pvd = vdev_add_parent(oldvd, pvops); 1632 1633 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1634 ASSERT(pvd->vdev_ops == pvops); 1635 ASSERT(oldvd->vdev_parent == pvd); 1636 1637 /* 1638 * Extract the new device from its root and add it to pvd. 1639 */ 1640 vdev_remove_child(newrootvd, newvd); 1641 newvd->vdev_id = pvd->vdev_children; 1642 vdev_add_child(pvd, newvd); 1643 1644 /* 1645 * If newvd is smaller than oldvd, but larger than its rsize, 1646 * the addition of newvd may have decreased our parent's asize. 1647 */ 1648 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1649 1650 tvd = newvd->vdev_top; 1651 ASSERT(pvd->vdev_top == tvd); 1652 ASSERT(tvd->vdev_parent == rvd); 1653 1654 vdev_config_dirty(tvd); 1655 1656 /* 1657 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1658 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1659 */ 1660 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1661 1662 mutex_enter(&newvd->vdev_dtl_lock); 1663 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1664 open_txg - TXG_INITIAL + 1); 1665 mutex_exit(&newvd->vdev_dtl_lock); 1666 1667 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1668 1669 /* 1670 * Mark newvd's DTL dirty in this txg. 1671 */ 1672 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1673 1674 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1675 1676 /* 1677 * Kick off a resilver to update newvd. 1678 */ 1679 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1680 1681 return (0); 1682 } 1683 1684 /* 1685 * Detach a device from a mirror or replacing vdev. 1686 * If 'replace_done' is specified, only detach if the parent 1687 * is a replacing vdev. 1688 */ 1689 int 1690 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1691 { 1692 uint64_t txg; 1693 int c, t, error; 1694 vdev_t *rvd = spa->spa_root_vdev; 1695 vdev_t *vd, *pvd, *cvd, *tvd; 1696 boolean_t unspare = B_FALSE; 1697 uint64_t unspare_guid; 1698 1699 txg = spa_vdev_enter(spa); 1700 1701 vd = vdev_lookup_by_guid(rvd, guid); 1702 1703 if (vd == NULL) 1704 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1705 1706 if (!vd->vdev_ops->vdev_op_leaf) 1707 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1708 1709 pvd = vd->vdev_parent; 1710 1711 /* 1712 * If replace_done is specified, only remove this device if it's 1713 * the first child of a replacing vdev. For the 'spare' vdev, either 1714 * disk can be removed. 1715 */ 1716 if (replace_done) { 1717 if (pvd->vdev_ops == &vdev_replacing_ops) { 1718 if (vd->vdev_id != 0) 1719 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1720 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1721 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1722 } 1723 } 1724 1725 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1726 spa_version(spa) >= ZFS_VERSION_SPARES); 1727 1728 /* 1729 * Only mirror, replacing, and spare vdevs support detach. 
1730 */ 1731 if (pvd->vdev_ops != &vdev_replacing_ops && 1732 pvd->vdev_ops != &vdev_mirror_ops && 1733 pvd->vdev_ops != &vdev_spare_ops) 1734 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1735 1736 /* 1737 * If there's only one replica, you can't detach it. 1738 */ 1739 if (pvd->vdev_children <= 1) 1740 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1741 1742 /* 1743 * If all siblings have non-empty DTLs, this device may have the only 1744 * valid copy of the data, which means we cannot safely detach it. 1745 * 1746 * XXX -- as in the vdev_offline() case, we really want a more 1747 * precise DTL check. 1748 */ 1749 for (c = 0; c < pvd->vdev_children; c++) { 1750 uint64_t dirty; 1751 1752 cvd = pvd->vdev_child[c]; 1753 if (cvd == vd) 1754 continue; 1755 if (vdev_is_dead(cvd)) 1756 continue; 1757 mutex_enter(&cvd->vdev_dtl_lock); 1758 dirty = cvd->vdev_dtl_map.sm_space | 1759 cvd->vdev_dtl_scrub.sm_space; 1760 mutex_exit(&cvd->vdev_dtl_lock); 1761 if (!dirty) 1762 break; 1763 } 1764 1765 /* 1766 * If we are a replacing or spare vdev, then we can always detach the 1767 * latter child, as that is how one cancels the operation. 1768 */ 1769 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1770 c == pvd->vdev_children) 1771 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1772 1773 /* 1774 * If we are detaching the original disk from a spare, then it implies 1775 * that the spare should become a real disk, and be removed from the 1776 * active spare list for the pool. 1777 */ 1778 if (pvd->vdev_ops == &vdev_spare_ops && 1779 vd->vdev_id == 0) 1780 unspare = B_TRUE; 1781 1782 /* 1783 * Erase the disk labels so the disk can be used for other things. 1784 * This must be done after all other error cases are handled, 1785 * but before we disembowel vd (so we can still do I/O to it). 1786 * But if we can't do it, don't treat the error as fatal -- 1787 * it may be that the unwritability of the disk is the reason 1788 * it's being detached! 1789 */ 1790 error = vdev_label_init(vd, 0, B_FALSE); 1791 if (error) 1792 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1793 1794 /* 1795 * Remove vd from its parent and compact the parent's children. 1796 */ 1797 vdev_remove_child(pvd, vd); 1798 vdev_compact_children(pvd); 1799 1800 /* 1801 * Remember one of the remaining children so we can get tvd below. 1802 */ 1803 cvd = pvd->vdev_child[0]; 1804 1805 /* 1806 * If we need to remove the remaining child from the list of hot spares, 1807 * do it now, marking the vdev as no longer a spare in the process. We 1808 * must do this before vdev_remove_parent(), because that can change the 1809 * GUID if it creates a new toplevel GUID. 1810 */ 1811 if (unspare) { 1812 ASSERT(cvd->vdev_isspare); 1813 spa_spare_remove(cvd->vdev_guid); 1814 cvd->vdev_isspare = B_FALSE; 1815 unspare_guid = cvd->vdev_guid; 1816 } 1817 1818 /* 1819 * If the parent mirror/replacing vdev only has one child, 1820 * the parent is no longer needed. Remove it from the tree. 1821 */ 1822 if (pvd->vdev_children == 1) 1823 vdev_remove_parent(cvd); 1824 1825 /* 1826 * We don't set tvd until now because the parent we just removed 1827 * may have been the previous top-level vdev. 1828 */ 1829 tvd = cvd->vdev_top; 1830 ASSERT(tvd->vdev_parent == rvd); 1831 1832 /* 1833 * Reopen this top-level vdev to reassess health after detach. 1834 */ 1835 vdev_reopen(tvd); 1836 1837 /* 1838 * If the device we just detached was smaller than the others, 1839 * it may be possible to add metaslabs (i.e. grow the pool). 
1840 * vdev_metaslab_init() can't fail because the existing metaslabs 1841 * are already in core, so there's nothing to read from disk. 1842 */ 1843 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1844 1845 vdev_config_dirty(tvd); 1846 1847 /* 1848 * Mark vd's DTL as dirty in this txg. 1849 * vdev_dtl_sync() will see that vd->vdev_detached is set 1850 * and free vd's DTL object in syncing context. 1851 * But first make sure we're not on any *other* txg's DTL list, 1852 * to prevent vd from being accessed after it's freed. 1853 */ 1854 for (t = 0; t < TXG_SIZE; t++) 1855 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1856 vd->vdev_detached = B_TRUE; 1857 vdev_dirty(tvd, VDD_DTL, vd, txg); 1858 1859 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1860 1861 error = spa_vdev_exit(spa, vd, txg, 0); 1862 1863 /* 1864 * If we are supposed to remove the given vdev from the list of spares, 1865 * iterate over all pools in the system and replace it if it's present. 1866 */ 1867 if (unspare) { 1868 spa = NULL; 1869 mutex_enter(&spa_namespace_lock); 1870 while ((spa = spa_next(spa)) != NULL) { 1871 if (spa->spa_state != POOL_STATE_ACTIVE) 1872 continue; 1873 1874 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1875 } 1876 mutex_exit(&spa_namespace_lock); 1877 } 1878 1879 return (error); 1880 } 1881 1882 /* 1883 * Remove a device from the pool. Currently, this supports removing only hot 1884 * spares. 1885 */ 1886 int 1887 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1888 { 1889 vdev_t *vd; 1890 nvlist_t **spares, *nv, **newspares; 1891 uint_t i, j, nspares; 1892 int ret = 0; 1893 1894 spa_config_enter(spa, RW_WRITER, FTAG); 1895 1896 vd = spa_lookup_by_guid(spa, guid); 1897 1898 nv = NULL; 1899 if (spa->spa_spares != NULL && 1900 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1901 &spares, &nspares) == 0) { 1902 for (i = 0; i < nspares; i++) { 1903 uint64_t theguid; 1904 1905 VERIFY(nvlist_lookup_uint64(spares[i], 1906 ZPOOL_CONFIG_GUID, &theguid) == 0); 1907 if (theguid == guid) { 1908 nv = spares[i]; 1909 break; 1910 } 1911 } 1912 } 1913 1914 /* 1915 * We only support removing a hot spare, and only if it's not currently 1916 * in use in this pool. 1917 */ 1918 if (nv == NULL && vd == NULL) { 1919 ret = ENOENT; 1920 goto out; 1921 } 1922 1923 if (nv == NULL && vd != NULL) { 1924 ret = ENOTSUP; 1925 goto out; 1926 } 1927 1928 if (!unspare && nv != NULL && vd != NULL) { 1929 ret = EBUSY; 1930 goto out; 1931 } 1932 1933 if (nspares == 1) { 1934 newspares = NULL; 1935 } else { 1936 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 1937 KM_SLEEP); 1938 for (i = 0, j = 0; i < nspares; i++) { 1939 if (spares[i] != nv) 1940 VERIFY(nvlist_dup(spares[i], 1941 &newspares[j++], KM_SLEEP) == 0); 1942 } 1943 } 1944 1945 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1946 DATA_TYPE_NVLIST_ARRAY) == 0); 1947 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1948 newspares, nspares - 1) == 0); 1949 for (i = 0; i < nspares - 1; i++) 1950 nvlist_free(newspares[i]); 1951 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 1952 spa_load_spares(spa); 1953 spa->spa_sync_spares = B_TRUE; 1954 1955 out: 1956 spa_config_exit(spa, FTAG); 1957 1958 return (ret); 1959 } 1960 1961 /* 1962 * Find any device that's done replacing, so we can detach it. 
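 * A replacing vdev is done once its second (new) child has an empty DTL.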
1963 */ 1964 static vdev_t * 1965 spa_vdev_replace_done_hunt(vdev_t *vd) 1966 { 1967 vdev_t *newvd, *oldvd; 1968 int c; 1969 1970 for (c = 0; c < vd->vdev_children; c++) { 1971 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1972 if (oldvd != NULL) 1973 return (oldvd); 1974 } 1975 1976 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1977 oldvd = vd->vdev_child[0]; 1978 newvd = vd->vdev_child[1]; 1979 1980 mutex_enter(&newvd->vdev_dtl_lock); 1981 if (newvd->vdev_dtl_map.sm_space == 0 && 1982 newvd->vdev_dtl_scrub.sm_space == 0) { 1983 mutex_exit(&newvd->vdev_dtl_lock); 1984 return (oldvd); 1985 } 1986 mutex_exit(&newvd->vdev_dtl_lock); 1987 } 1988 1989 return (NULL); 1990 } 1991 1992 static void 1993 spa_vdev_replace_done(spa_t *spa) 1994 { 1995 vdev_t *vd; 1996 vdev_t *pvd; 1997 uint64_t guid; 1998 uint64_t pguid = 0; 1999 2000 spa_config_enter(spa, RW_READER, FTAG); 2001 2002 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2003 guid = vd->vdev_guid; 2004 /* 2005 * If we have just finished replacing a hot spared device, then 2006 * we need to detach the parent's first child (the original hot 2007 * spare) as well. 2008 */ 2009 pvd = vd->vdev_parent; 2010 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2011 pvd->vdev_id == 0) { 2012 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2013 ASSERT(pvd->vdev_parent->vdev_children == 2); 2014 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2015 } 2016 spa_config_exit(spa, FTAG); 2017 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2018 return; 2019 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2020 return; 2021 spa_config_enter(spa, RW_READER, FTAG); 2022 } 2023 2024 spa_config_exit(spa, FTAG); 2025 } 2026 2027 /* 2028 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2029 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2030 */ 2031 int 2032 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2033 { 2034 vdev_t *rvd, *vd; 2035 uint64_t txg; 2036 2037 rvd = spa->spa_root_vdev; 2038 2039 txg = spa_vdev_enter(spa); 2040 2041 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2042 /* 2043 * Determine if this is a reference to a hot spare. In that 2044 * case, update the path as stored in the spare list. 
2045 */ 2046 nvlist_t **spares; 2047 uint_t i, nspares; 2048 if (spa->spa_sparelist != NULL) { 2049 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2050 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2051 for (i = 0; i < nspares; i++) { 2052 uint64_t theguid; 2053 VERIFY(nvlist_lookup_uint64(spares[i], 2054 ZPOOL_CONFIG_GUID, &theguid) == 0); 2055 if (theguid == guid) 2056 break; 2057 } 2058 2059 if (i == nspares) 2060 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2061 2062 VERIFY(nvlist_add_string(spares[i], 2063 ZPOOL_CONFIG_PATH, newpath) == 0); 2064 spa_load_spares(spa); 2065 spa->spa_sync_spares = B_TRUE; 2066 return (spa_vdev_exit(spa, NULL, txg, 0)); 2067 } else { 2068 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2069 } 2070 } 2071 2072 if (!vd->vdev_ops->vdev_op_leaf) 2073 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2074 2075 spa_strfree(vd->vdev_path); 2076 vd->vdev_path = spa_strdup(newpath); 2077 2078 vdev_config_dirty(vd->vdev_top); 2079 2080 return (spa_vdev_exit(spa, NULL, txg, 0)); 2081 } 2082 2083 /* 2084 * ========================================================================== 2085 * SPA Scrubbing 2086 * ========================================================================== 2087 */ 2088 2089 void 2090 spa_scrub_throttle(spa_t *spa, int direction) 2091 { 2092 mutex_enter(&spa->spa_scrub_lock); 2093 spa->spa_scrub_throttled += direction; 2094 ASSERT(spa->spa_scrub_throttled >= 0); 2095 if (spa->spa_scrub_throttled == 0) 2096 cv_broadcast(&spa->spa_scrub_io_cv); 2097 mutex_exit(&spa->spa_scrub_lock); 2098 } 2099 2100 static void 2101 spa_scrub_io_done(zio_t *zio) 2102 { 2103 spa_t *spa = zio->io_spa; 2104 2105 zio_buf_free(zio->io_data, zio->io_size); 2106 2107 mutex_enter(&spa->spa_scrub_lock); 2108 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2109 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2110 spa->spa_scrub_errors++; 2111 mutex_enter(&vd->vdev_stat_lock); 2112 vd->vdev_stat.vs_scrub_errors++; 2113 mutex_exit(&vd->vdev_stat_lock); 2114 } 2115 if (--spa->spa_scrub_inflight == 0) { 2116 cv_broadcast(&spa->spa_scrub_io_cv); 2117 ASSERT(spa->spa_scrub_throttled == 0); 2118 } 2119 mutex_exit(&spa->spa_scrub_lock); 2120 } 2121 2122 static void 2123 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2124 zbookmark_t *zb) 2125 { 2126 size_t size = BP_GET_LSIZE(bp); 2127 void *data = zio_buf_alloc(size); 2128 2129 mutex_enter(&spa->spa_scrub_lock); 2130 spa->spa_scrub_inflight++; 2131 mutex_exit(&spa->spa_scrub_lock); 2132 2133 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2134 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2135 2136 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2137 2138 zio_nowait(zio_read(NULL, spa, bp, data, size, 2139 spa_scrub_io_done, NULL, priority, flags, zb)); 2140 } 2141 2142 /* ARGSUSED */ 2143 static int 2144 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2145 { 2146 blkptr_t *bp = &bc->bc_blkptr; 2147 vdev_t *vd = spa->spa_root_vdev; 2148 dva_t *dva = bp->blk_dva; 2149 int needs_resilver = B_FALSE; 2150 int d; 2151 2152 if (bc->bc_errno) { 2153 /* 2154 * We can't scrub this block, but we can continue to scrub 2155 * the rest of the pool. Note the error and move along. 
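 *
 * The error is tallied twice: once in the pool-wide spa_scrub_errors
 * count and once in the per-vdev scrub stats that zpool(1M) status
 * reports.  Since the failing leaf isn't identified at this point,
 * the per-vdev count is charged to the root vdev.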
2156 */ 2157 mutex_enter(&spa->spa_scrub_lock); 2158 spa->spa_scrub_errors++; 2159 mutex_exit(&spa->spa_scrub_lock); 2160 2161 mutex_enter(&vd->vdev_stat_lock); 2162 vd->vdev_stat.vs_scrub_errors++; 2163 mutex_exit(&vd->vdev_stat_lock); 2164 2165 return (ERESTART); 2166 } 2167 2168 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2169 2170 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2171 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2172 2173 ASSERT(vd != NULL); 2174 2175 /* 2176 * Keep track of how much data we've examined so that 2177 * zpool(1M) status can make useful progress reports. 2178 */ 2179 mutex_enter(&vd->vdev_stat_lock); 2180 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2181 mutex_exit(&vd->vdev_stat_lock); 2182 2183 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2184 if (DVA_GET_GANG(&dva[d])) { 2185 /* 2186 * Gang members may be spread across multiple 2187 * vdevs, so the best we can do is look at the 2188 * pool-wide DTL. 2189 * XXX -- it would be better to change our 2190 * allocation policy to ensure that this can't 2191 * happen. 2192 */ 2193 vd = spa->spa_root_vdev; 2194 } 2195 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2196 bp->blk_birth, 1)) 2197 needs_resilver = B_TRUE; 2198 } 2199 } 2200 2201 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2202 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2203 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2204 else if (needs_resilver) 2205 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2206 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2207 2208 return (0); 2209 } 2210 2211 static void 2212 spa_scrub_thread(spa_t *spa) 2213 { 2214 callb_cpr_t cprinfo; 2215 traverse_handle_t *th = spa->spa_scrub_th; 2216 vdev_t *rvd = spa->spa_root_vdev; 2217 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2218 int error = 0; 2219 boolean_t complete; 2220 2221 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2222 2223 /* 2224 * If we're restarting due to a snapshot create/delete, 2225 * wait for that to complete. 2226 */ 2227 txg_wait_synced(spa_get_dsl(spa), 0); 2228 2229 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2230 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2231 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2232 2233 spa_config_enter(spa, RW_WRITER, FTAG); 2234 vdev_reopen(rvd); /* purge all vdev caches */ 2235 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2236 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2237 spa_config_exit(spa, FTAG); 2238 2239 mutex_enter(&spa->spa_scrub_lock); 2240 spa->spa_scrub_errors = 0; 2241 spa->spa_scrub_active = 1; 2242 ASSERT(spa->spa_scrub_inflight == 0); 2243 ASSERT(spa->spa_scrub_throttled == 0); 2244 2245 while (!spa->spa_scrub_stop) { 2246 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2247 while (spa->spa_scrub_suspended) { 2248 spa->spa_scrub_active = 0; 2249 cv_broadcast(&spa->spa_scrub_cv); 2250 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2251 spa->spa_scrub_active = 1; 2252 } 2253 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2254 2255 if (spa->spa_scrub_restart_txg != 0) 2256 break; 2257 2258 mutex_exit(&spa->spa_scrub_lock); 2259 error = traverse_more(th); 2260 mutex_enter(&spa->spa_scrub_lock); 2261 if (error != EAGAIN) 2262 break; 2263 2264 while (spa->spa_scrub_throttled > 0) 2265 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2266 } 2267 2268 while (spa->spa_scrub_inflight) 2269 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2270 2271 spa->spa_scrub_active = 0; 2272 cv_broadcast(&spa->spa_scrub_cv); 2273 2274 mutex_exit(&spa->spa_scrub_lock); 2275 2276 spa_config_enter(spa, RW_WRITER, FTAG); 2277 2278 mutex_enter(&spa->spa_scrub_lock); 2279 2280 /* 2281 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2282 * AND the spa config lock to synchronize with any config changes 2283 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2284 */ 2285 if (spa->spa_scrub_restart_txg != 0) 2286 error = ERESTART; 2287 2288 if (spa->spa_scrub_stop) 2289 error = EINTR; 2290 2291 /* 2292 * Even if there were uncorrectable errors, we consider the scrub 2293 * completed. The downside is that if there is a transient error during 2294 * a resilver, we won't resilver the data properly to the target. But 2295 * if the damage is permanent (more likely) we will resilver forever, 2296 * which isn't really acceptable. Since there is enough information for 2297 * the user to know what has failed and why, this seems like a more 2298 * tractable approach. 2299 */ 2300 complete = (error == 0); 2301 2302 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2303 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2304 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2305 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2306 2307 mutex_exit(&spa->spa_scrub_lock); 2308 2309 /* 2310 * If the scrub/resilver completed, update all DTLs to reflect this. 2311 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2312 */ 2313 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2314 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2315 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2316 spa_errlog_rotate(spa); 2317 2318 spa_config_exit(spa, FTAG); 2319 2320 mutex_enter(&spa->spa_scrub_lock); 2321 2322 /* 2323 * We may have finished replacing a device. 2324 * Let the async thread assess this and handle the detach. 2325 */ 2326 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2327 2328 /* 2329 * If we were told to restart, our final act is to start a new scrub. 2330 */ 2331 if (error == ERESTART) 2332 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2333 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2334 2335 spa->spa_scrub_type = POOL_SCRUB_NONE; 2336 spa->spa_scrub_active = 0; 2337 spa->spa_scrub_thread = NULL; 2338 cv_broadcast(&spa->spa_scrub_cv); 2339 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2340 thread_exit(); 2341 } 2342 2343 void 2344 spa_scrub_suspend(spa_t *spa) 2345 { 2346 mutex_enter(&spa->spa_scrub_lock); 2347 spa->spa_scrub_suspended++; 2348 while (spa->spa_scrub_active) { 2349 cv_broadcast(&spa->spa_scrub_cv); 2350 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2351 } 2352 while (spa->spa_scrub_inflight) 2353 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2354 mutex_exit(&spa->spa_scrub_lock); 2355 } 2356 2357 void 2358 spa_scrub_resume(spa_t *spa) 2359 { 2360 mutex_enter(&spa->spa_scrub_lock); 2361 ASSERT(spa->spa_scrub_suspended != 0); 2362 if (--spa->spa_scrub_suspended == 0) 2363 cv_broadcast(&spa->spa_scrub_cv); 2364 mutex_exit(&spa->spa_scrub_lock); 2365 } 2366 2367 void 2368 spa_scrub_restart(spa_t *spa, uint64_t txg) 2369 { 2370 /* 2371 * Something happened (e.g. snapshot create/delete) that means 2372 * we must restart any in-progress scrubs. The itinerary will 2373 * fix this properly. 2374 */ 2375 mutex_enter(&spa->spa_scrub_lock); 2376 spa->spa_scrub_restart_txg = txg; 2377 mutex_exit(&spa->spa_scrub_lock); 2378 } 2379 2380 int 2381 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2382 { 2383 space_seg_t *ss; 2384 uint64_t mintxg, maxtxg; 2385 vdev_t *rvd = spa->spa_root_vdev; 2386 2387 if ((uint_t)type >= POOL_SCRUB_TYPES) 2388 return (ENOTSUP); 2389 2390 mutex_enter(&spa->spa_scrub_lock); 2391 2392 /* 2393 * If there's a scrub or resilver already in progress, stop it. 2394 */ 2395 while (spa->spa_scrub_thread != NULL) { 2396 /* 2397 * Don't stop a resilver unless forced. 2398 */ 2399 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2400 mutex_exit(&spa->spa_scrub_lock); 2401 return (EBUSY); 2402 } 2403 spa->spa_scrub_stop = 1; 2404 cv_broadcast(&spa->spa_scrub_cv); 2405 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2406 } 2407 2408 /* 2409 * Terminate the previous traverse. 2410 */ 2411 if (spa->spa_scrub_th != NULL) { 2412 traverse_fini(spa->spa_scrub_th); 2413 spa->spa_scrub_th = NULL; 2414 } 2415 2416 if (rvd == NULL) { 2417 ASSERT(spa->spa_scrub_stop == 0); 2418 ASSERT(spa->spa_scrub_type == type); 2419 ASSERT(spa->spa_scrub_restart_txg == 0); 2420 mutex_exit(&spa->spa_scrub_lock); 2421 return (0); 2422 } 2423 2424 mintxg = TXG_INITIAL - 1; 2425 maxtxg = spa_last_synced_txg(spa) + 1; 2426 2427 mutex_enter(&rvd->vdev_dtl_lock); 2428 2429 if (rvd->vdev_dtl_map.sm_space == 0) { 2430 /* 2431 * The pool-wide DTL is empty. 2432 * If this is a resilver, there's nothing to do except 2433 * check whether any in-progress replacements have completed. 2434 */ 2435 if (type == POOL_SCRUB_RESILVER) { 2436 type = POOL_SCRUB_NONE; 2437 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2438 } 2439 } else { 2440 /* 2441 * The pool-wide DTL is non-empty. 2442 * If this is a normal scrub, upgrade to a resilver instead. 2443 */ 2444 if (type == POOL_SCRUB_EVERYTHING) 2445 type = POOL_SCRUB_RESILVER; 2446 } 2447 2448 if (type == POOL_SCRUB_RESILVER) { 2449 /* 2450 * Determine the resilvering boundaries. 2451 * 2452 * Note: (mintxg, maxtxg) is an open interval, 2453 * i.e. mintxg and maxtxg themselves are not included. 2454 * 2455 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2456 * so we don't claim to resilver a txg that's still changing. 
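 *
 * For example (illustrative numbers only): if the oldest txg in the
 * pool-wide DTL is 100 and the DTL extends beyond the last synced txg,
 * say 150, then mintxg = 99 and maxtxg = 151, and the open interval
 * (99, 151) causes txgs 100 through 150 to be resilvered.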
2457 */ 2458 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2459 mintxg = ss->ss_start - 1; 2460 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2461 maxtxg = MIN(ss->ss_end, maxtxg); 2462 } 2463 2464 mutex_exit(&rvd->vdev_dtl_lock); 2465 2466 spa->spa_scrub_stop = 0; 2467 spa->spa_scrub_type = type; 2468 spa->spa_scrub_restart_txg = 0; 2469 2470 if (type != POOL_SCRUB_NONE) { 2471 spa->spa_scrub_mintxg = mintxg; 2472 spa->spa_scrub_maxtxg = maxtxg; 2473 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2474 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2475 ZIO_FLAG_CANFAIL); 2476 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2477 spa->spa_scrub_thread = thread_create(NULL, 0, 2478 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2479 } 2480 2481 mutex_exit(&spa->spa_scrub_lock); 2482 2483 return (0); 2484 } 2485 2486 /* 2487 * ========================================================================== 2488 * SPA async task processing 2489 * ========================================================================== 2490 */ 2491 2492 static void 2493 spa_async_reopen(spa_t *spa) 2494 { 2495 vdev_t *rvd = spa->spa_root_vdev; 2496 vdev_t *tvd; 2497 int c; 2498 2499 spa_config_enter(spa, RW_WRITER, FTAG); 2500 2501 for (c = 0; c < rvd->vdev_children; c++) { 2502 tvd = rvd->vdev_child[c]; 2503 if (tvd->vdev_reopen_wanted) { 2504 tvd->vdev_reopen_wanted = 0; 2505 vdev_reopen(tvd); 2506 } 2507 } 2508 2509 spa_config_exit(spa, FTAG); 2510 } 2511 2512 static void 2513 spa_async_thread(spa_t *spa) 2514 { 2515 int tasks; 2516 2517 ASSERT(spa->spa_sync_on); 2518 2519 mutex_enter(&spa->spa_async_lock); 2520 tasks = spa->spa_async_tasks; 2521 spa->spa_async_tasks = 0; 2522 mutex_exit(&spa->spa_async_lock); 2523 2524 /* 2525 * See if the config needs to be updated. 2526 */ 2527 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2528 mutex_enter(&spa_namespace_lock); 2529 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2530 mutex_exit(&spa_namespace_lock); 2531 } 2532 2533 /* 2534 * See if any devices need to be reopened. 2535 */ 2536 if (tasks & SPA_ASYNC_REOPEN) 2537 spa_async_reopen(spa); 2538 2539 /* 2540 * If any devices are done replacing, detach them. 2541 */ 2542 if (tasks & SPA_ASYNC_REPLACE_DONE) 2543 spa_vdev_replace_done(spa); 2544 2545 /* 2546 * Kick off a scrub. 2547 */ 2548 if (tasks & SPA_ASYNC_SCRUB) 2549 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2550 2551 /* 2552 * Kick off a resilver. 2553 */ 2554 if (tasks & SPA_ASYNC_RESILVER) 2555 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2556 2557 /* 2558 * Let the world know that we're done. 
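 *
 * Clearing spa_async_thread and broadcasting spa_async_cv below is what
 * lets spa_async_suspend() make progress: it waits on that cv until the
 * thread pointer goes to NULL.  New work arrives via spa_async_request()
 * setting a task bit, and spa_async_dispatch() (called at the end of
 * spa_sync()) spawns the next thread when it is safe to do so.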
2559 */ 2560 mutex_enter(&spa->spa_async_lock); 2561 spa->spa_async_thread = NULL; 2562 cv_broadcast(&spa->spa_async_cv); 2563 mutex_exit(&spa->spa_async_lock); 2564 thread_exit(); 2565 } 2566 2567 void 2568 spa_async_suspend(spa_t *spa) 2569 { 2570 mutex_enter(&spa->spa_async_lock); 2571 spa->spa_async_suspended++; 2572 while (spa->spa_async_thread != NULL) 2573 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2574 mutex_exit(&spa->spa_async_lock); 2575 } 2576 2577 void 2578 spa_async_resume(spa_t *spa) 2579 { 2580 mutex_enter(&spa->spa_async_lock); 2581 ASSERT(spa->spa_async_suspended != 0); 2582 spa->spa_async_suspended--; 2583 mutex_exit(&spa->spa_async_lock); 2584 } 2585 2586 static void 2587 spa_async_dispatch(spa_t *spa) 2588 { 2589 mutex_enter(&spa->spa_async_lock); 2590 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2591 spa->spa_async_thread == NULL && 2592 rootdir != NULL && !vn_is_readonly(rootdir)) 2593 spa->spa_async_thread = thread_create(NULL, 0, 2594 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2595 mutex_exit(&spa->spa_async_lock); 2596 } 2597 2598 void 2599 spa_async_request(spa_t *spa, int task) 2600 { 2601 mutex_enter(&spa->spa_async_lock); 2602 spa->spa_async_tasks |= task; 2603 mutex_exit(&spa->spa_async_lock); 2604 } 2605 2606 /* 2607 * ========================================================================== 2608 * SPA syncing routines 2609 * ========================================================================== 2610 */ 2611 2612 static void 2613 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2614 { 2615 bplist_t *bpl = &spa->spa_sync_bplist; 2616 dmu_tx_t *tx; 2617 blkptr_t blk; 2618 uint64_t itor = 0; 2619 zio_t *zio; 2620 int error; 2621 uint8_t c = 1; 2622 2623 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2624 2625 while (bplist_iterate(bpl, &itor, &blk) == 0) 2626 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2627 2628 error = zio_wait(zio); 2629 ASSERT3U(error, ==, 0); 2630 2631 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2632 bplist_vacate(bpl, tx); 2633 2634 /* 2635 * Pre-dirty the first block so we sync to convergence faster. 2636 * (Usually only the first block is needed.) 2637 */ 2638 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2639 dmu_tx_commit(tx); 2640 } 2641 2642 static void 2643 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2644 { 2645 char *packed = NULL; 2646 size_t nvsize = 0; 2647 dmu_buf_t *db; 2648 2649 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2650 2651 packed = kmem_alloc(nvsize, KM_SLEEP); 2652 2653 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2654 KM_SLEEP) == 0); 2655 2656 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2657 2658 kmem_free(packed, nvsize); 2659 2660 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2661 dmu_buf_will_dirty(db, tx); 2662 *(uint64_t *)db->db_data = nvsize; 2663 dmu_buf_rele(db, FTAG); 2664 } 2665 2666 static void 2667 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2668 { 2669 nvlist_t *nvroot; 2670 nvlist_t **spares; 2671 int i; 2672 2673 if (!spa->spa_sync_spares) 2674 return; 2675 2676 /* 2677 * Update the MOS nvlist describing the list of available spares. 2678 * spa_validate_spares() will have already made sure this nvlist is 2679 * valid and the vdevs are labelled appropriately. 
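 *
 * On disk the chain is, roughly: the MOS pool directory object maps
 * DMU_POOL_SPARES to spa_spares_object, a DMU_OT_PACKED_NVLIST object
 * holding an XDR-packed nvlist whose ZPOOL_CONFIG_SPARES array contains
 * one vdev config per spare (written below via spa_sync_nvlist()).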
2680 */ 2681 if (spa->spa_spares_object == 0) { 2682 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2683 DMU_OT_PACKED_NVLIST, 1 << 14, 2684 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2685 VERIFY(zap_update(spa->spa_meta_objset, 2686 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2687 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2688 } 2689 2690 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2691 if (spa->spa_nspares == 0) { 2692 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2693 NULL, 0) == 0); 2694 } else { 2695 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2696 KM_SLEEP); 2697 for (i = 0; i < spa->spa_nspares; i++) 2698 spares[i] = vdev_config_generate(spa, 2699 spa->spa_spares[i], B_FALSE, B_TRUE); 2700 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2701 spares, spa->spa_nspares) == 0); 2702 for (i = 0; i < spa->spa_nspares; i++) 2703 nvlist_free(spares[i]); 2704 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2705 } 2706 2707 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2708 2709 spa->spa_sync_spares = B_FALSE; 2710 } 2711 2712 static void 2713 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2714 { 2715 nvlist_t *config; 2716 2717 if (list_is_empty(&spa->spa_dirty_list)) 2718 return; 2719 2720 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2721 2722 if (spa->spa_config_syncing) 2723 nvlist_free(spa->spa_config_syncing); 2724 spa->spa_config_syncing = config; 2725 2726 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2727 } 2728 2729 /* 2730 * Sync the specified transaction group. New blocks may be dirtied as 2731 * part of the process, so we iterate until it converges. 2732 */ 2733 void 2734 spa_sync(spa_t *spa, uint64_t txg) 2735 { 2736 dsl_pool_t *dp = spa->spa_dsl_pool; 2737 objset_t *mos = spa->spa_meta_objset; 2738 bplist_t *bpl = &spa->spa_sync_bplist; 2739 vdev_t *rvd = spa->spa_root_vdev; 2740 vdev_t *vd; 2741 dmu_tx_t *tx; 2742 int dirty_vdevs; 2743 2744 /* 2745 * Lock out configuration changes. 2746 */ 2747 spa_config_enter(spa, RW_READER, FTAG); 2748 2749 spa->spa_syncing_txg = txg; 2750 spa->spa_sync_pass = 0; 2751 2752 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2753 2754 tx = dmu_tx_create_assigned(dp, txg); 2755 2756 /* 2757 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2758 * set spa_deflate if we have no raid-z vdevs. 2759 */ 2760 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2761 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2762 int i; 2763 2764 for (i = 0; i < rvd->vdev_children; i++) { 2765 vd = rvd->vdev_child[i]; 2766 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2767 break; 2768 } 2769 if (i == rvd->vdev_children) { 2770 spa->spa_deflate = TRUE; 2771 VERIFY(0 == zap_add(spa->spa_meta_objset, 2772 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2773 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2774 } 2775 } 2776 2777 /* 2778 * If anything has changed in this txg, push the deferred frees 2779 * from the previous txg. If not, leave them alone so that we 2780 * don't generate work on an otherwise idle system. 2781 */ 2782 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2783 !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2784 spa_sync_deferred_frees(spa, txg); 2785 2786 /* 2787 * Iterate to convergence. 
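 *
 * Each pass below can itself dirty new metadata (the config object, the
 * spares list, the error logs and the DSL pool all write through the
 * MOS), so the loop repeats until a pass finishes without pulling any
 * vdevs off this txg's dirty list; spa_sync_pass counts how many passes
 * that took.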
2788 	 */
2789 	do {
2790 		spa->spa_sync_pass++;
2791 
2792 		spa_sync_config_object(spa, tx);
2793 		spa_sync_spares(spa, tx);
2794 		spa_errlog_sync(spa, txg);
2795 		dsl_pool_sync(dp, txg);
2796 
2797 		dirty_vdevs = 0;
2798 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2799 			vdev_sync(vd, txg);
2800 			dirty_vdevs++;
2801 		}
2802 
2803 		bplist_sync(bpl, tx);
2804 	} while (dirty_vdevs);
2805 
2806 	bplist_close(bpl);
2807 
2808 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
2809 
2810 	/*
2811 	 * Rewrite the vdev configuration (which includes the uberblock)
2812 	 * to commit the transaction group.
2813 	 *
2814 	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
2815 	 * Otherwise, pick a random top-level vdev that's known to be
2816 	 * visible in the config cache (see spa_vdev_add() for details).
2817 	 * If the write fails, try the next vdev until we've tried them all.
2818 	 */
2819 	if (!list_is_empty(&spa->spa_dirty_list)) {
2820 		VERIFY(vdev_config_sync(rvd, txg) == 0);
2821 	} else {
2822 		int children = rvd->vdev_children;
2823 		int c0 = spa_get_random(children);
2824 		int c;
2825 
2826 		for (c = 0; c < children; c++) {
2827 			vd = rvd->vdev_child[(c0 + c) % children];
2828 			if (vd->vdev_ms_array == 0)
2829 				continue;
2830 			if (vdev_config_sync(vd, txg) == 0)
2831 				break;
2832 		}
2833 		if (c == children)
2834 			VERIFY(vdev_config_sync(rvd, txg) == 0);
2835 	}
2836 
2837 	dmu_tx_commit(tx);
2838 
2839 	/*
2840 	 * Clear the dirty config list.
2841 	 */
2842 	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
2843 		vdev_config_clean(vd);
2844 
2845 	/*
2846 	 * Now that the new config has synced transactionally,
2847 	 * let it become visible to the config cache.
2848 	 */
2849 	if (spa->spa_config_syncing != NULL) {
2850 		spa_config_set(spa, spa->spa_config_syncing);
2851 		spa->spa_config_txg = txg;
2852 		spa->spa_config_syncing = NULL;
2853 	}
2854 
2855 	/*
2856 	 * Make a stable copy of the fully synced uberblock.
2857 	 * We use this as the root for pool traversals.
2858 	 */
2859 	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
2860 
2861 	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
2862 
2863 	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
2864 	spa->spa_traverse_wanted = 0;
2865 	spa->spa_ubsync = spa->spa_uberblock;
2866 	rw_exit(&spa->spa_traverse_lock);
2867 
2868 	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
2869 
2870 	/*
2871 	 * Clean up the ZIL records for the synced txg.
2872 	 */
2873 	dsl_pool_zil_clean(dp);
2874 
2875 	/*
2876 	 * Update usable space statistics.
2877 	 */
2878 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
2879 		vdev_sync_done(vd, txg);
2880 
2881 	/*
2882 	 * It had better be the case that we didn't dirty anything
2883 	 * since vdev_config_sync().
2884 	 */
2885 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
2886 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
2887 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
2888 	ASSERT(bpl->bpl_queue == NULL);
2889 
2890 	spa_config_exit(spa, FTAG);
2891 
2892 	/*
2893 	 * If any async tasks have been requested, kick them off.
2894 	 */
2895 	spa_async_dispatch(spa);
2896 }
2897 
2898 /*
2899  * Sync all pools.  We don't want to hold the namespace lock across these
2900  * operations, so we take a reference on the spa_t and drop the lock during the
2901  * sync.
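 *
 * The spa_open_ref()/spa_close() pair below is what keeps the spa_t from
 * going away while the namespace lock is dropped around txg_wait_synced().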
2902 */ 2903 void 2904 spa_sync_allpools(void) 2905 { 2906 spa_t *spa = NULL; 2907 mutex_enter(&spa_namespace_lock); 2908 while ((spa = spa_next(spa)) != NULL) { 2909 if (spa_state(spa) != POOL_STATE_ACTIVE) 2910 continue; 2911 spa_open_ref(spa, FTAG); 2912 mutex_exit(&spa_namespace_lock); 2913 txg_wait_synced(spa_get_dsl(spa), 0); 2914 mutex_enter(&spa_namespace_lock); 2915 spa_close(spa, FTAG); 2916 } 2917 mutex_exit(&spa_namespace_lock); 2918 } 2919 2920 /* 2921 * ========================================================================== 2922 * Miscellaneous routines 2923 * ========================================================================== 2924 */ 2925 2926 /* 2927 * Remove all pools in the system. 2928 */ 2929 void 2930 spa_evict_all(void) 2931 { 2932 spa_t *spa; 2933 2934 /* 2935 * Remove all cached state. All pools should be closed now, 2936 * so every spa in the AVL tree should be unreferenced. 2937 */ 2938 mutex_enter(&spa_namespace_lock); 2939 while ((spa = spa_next(NULL)) != NULL) { 2940 /* 2941 * Stop async tasks. The async thread may need to detach 2942 * a device that's been replaced, which requires grabbing 2943 * spa_namespace_lock, so we must drop it here. 2944 */ 2945 spa_open_ref(spa, FTAG); 2946 mutex_exit(&spa_namespace_lock); 2947 spa_async_suspend(spa); 2948 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2949 mutex_enter(&spa_namespace_lock); 2950 spa_close(spa, FTAG); 2951 2952 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2953 spa_unload(spa); 2954 spa_deactivate(spa); 2955 } 2956 spa_remove(spa); 2957 } 2958 mutex_exit(&spa_namespace_lock); 2959 } 2960 2961 vdev_t * 2962 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2963 { 2964 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2965 } 2966 2967 void 2968 spa_upgrade(spa_t *spa) 2969 { 2970 spa_config_enter(spa, RW_WRITER, FTAG); 2971 2972 /* 2973 * This should only be called for a non-faulted pool, and since a 2974 * future version would result in an unopenable pool, this shouldn't be 2975 * possible. 2976 */ 2977 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 2978 2979 spa->spa_uberblock.ub_version = ZFS_VERSION; 2980 vdev_config_dirty(spa->spa_root_vdev); 2981 2982 spa_config_exit(spa, FTAG); 2983 2984 txg_wait_synced(spa_get_dsl(spa), 0); 2985 } 2986 2987 boolean_t 2988 spa_has_spare(spa_t *spa, uint64_t guid) 2989 { 2990 int i; 2991 2992 for (i = 0; i < spa->spa_nspares; i++) 2993 if (spa->spa_spares[i]->vdev_guid == guid) 2994 return (B_TRUE); 2995 2996 return (B_FALSE); 2997 } 2998
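/*
 * Illustrative example only (not part of the original file): a caller that
 * wanted to bring every active pool up to the current on-disk version could
 * combine the iteration pattern used by spa_sync_allpools() above with
 * spa_upgrade(), roughly:
 *
 *	spa_t *spa = NULL;
 *
 *	mutex_enter(&spa_namespace_lock);
 *	while ((spa = spa_next(spa)) != NULL) {
 *		if (spa_state(spa) != POOL_STATE_ACTIVE)
 *			continue;
 *		spa_open_ref(spa, FTAG);
 *		mutex_exit(&spa_namespace_lock);
 *		if (spa->spa_uberblock.ub_version < ZFS_VERSION)
 *			spa_upgrade(spa);
 *		mutex_enter(&spa_namespace_lock);
 *		spa_close(spa, FTAG);
 *	}
 *	mutex_exit(&spa_namespace_lock);
 */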