/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
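 *
 * This is an in-core operation only: it creates the I/O taskqs, the
 * metaslab class for normal allocations, and the per-pool locks, lists,
 * and error trees used below, but it does not read or write any on-disk
 * state.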
104 */ 105 static void 106 spa_activate(spa_t *spa) 107 { 108 int t; 109 110 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 111 112 spa->spa_state = POOL_STATE_ACTIVE; 113 114 spa->spa_normal_class = metaslab_class_create(); 115 116 for (t = 0; t < ZIO_TYPES; t++) { 117 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 118 8, maxclsyspri, 50, INT_MAX, 119 TASKQ_PREPOPULATE); 120 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 121 8, maxclsyspri, 50, INT_MAX, 122 TASKQ_PREPOPULATE); 123 } 124 125 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 126 127 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 128 offsetof(vdev_t, vdev_dirty_node)); 129 130 txg_list_create(&spa->spa_vdev_txg_list, 131 offsetof(struct vdev, vdev_txg_node)); 132 133 avl_create(&spa->spa_errlist_scrub, 134 spa_error_entry_compare, sizeof (spa_error_entry_t), 135 offsetof(spa_error_entry_t, se_avl)); 136 avl_create(&spa->spa_errlist_last, 137 spa_error_entry_compare, sizeof (spa_error_entry_t), 138 offsetof(spa_error_entry_t, se_avl)); 139 } 140 141 /* 142 * Opposite of spa_activate(). 143 */ 144 static void 145 spa_deactivate(spa_t *spa) 146 { 147 int t; 148 149 ASSERT(spa->spa_sync_on == B_FALSE); 150 ASSERT(spa->spa_dsl_pool == NULL); 151 ASSERT(spa->spa_root_vdev == NULL); 152 153 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 154 155 txg_list_destroy(&spa->spa_vdev_txg_list); 156 157 list_destroy(&spa->spa_dirty_list); 158 159 rw_destroy(&spa->spa_traverse_lock); 160 161 for (t = 0; t < ZIO_TYPES; t++) { 162 taskq_destroy(spa->spa_zio_issue_taskq[t]); 163 taskq_destroy(spa->spa_zio_intr_taskq[t]); 164 spa->spa_zio_issue_taskq[t] = NULL; 165 spa->spa_zio_intr_taskq[t] = NULL; 166 } 167 168 metaslab_class_destroy(spa->spa_normal_class); 169 spa->spa_normal_class = NULL; 170 171 /* 172 * If this was part of an import or the open otherwise failed, we may 173 * still have errors left in the queues. Empty them just in case. 174 */ 175 spa_errlog_drain(spa); 176 177 avl_destroy(&spa->spa_errlist_scrub); 178 avl_destroy(&spa->spa_errlist_last); 179 180 spa->spa_state = POOL_STATE_UNINITIALIZED; 181 } 182 183 /* 184 * Verify a pool configuration, and construct the vdev tree appropriately. This 185 * will create all the necessary vdevs in the appropriate layout, with each vdev 186 * in the CLOSED state. This will prep the pool before open/creation/import. 187 * All vdev validation is done by the vdev_alloc() routine. 188 */ 189 static int 190 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 191 uint_t id, int atype) 192 { 193 nvlist_t **child; 194 uint_t c, children; 195 int error; 196 197 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 198 return (error); 199 200 if ((*vdp)->vdev_ops->vdev_op_leaf) 201 return (0); 202 203 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 204 &child, &children) != 0) { 205 vdev_free(*vdp); 206 *vdp = NULL; 207 return (EINVAL); 208 } 209 210 for (c = 0; c < children; c++) { 211 vdev_t *vd; 212 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 213 atype)) != 0) { 214 vdev_free(*vdp); 215 *vdp = NULL; 216 return (error); 217 } 218 } 219 220 ASSERT(*vdp != NULL); 221 222 return (0); 223 } 224 225 /* 226 * Opposite of spa_load(). 227 */ 228 static void 229 spa_unload(spa_t *spa) 230 { 231 int i; 232 233 /* 234 * Stop async tasks. 235 */ 236 spa_async_suspend(spa); 237 238 /* 239 * Stop syncing. 
240 */ 241 if (spa->spa_sync_on) { 242 txg_sync_stop(spa->spa_dsl_pool); 243 spa->spa_sync_on = B_FALSE; 244 } 245 246 /* 247 * Wait for any outstanding prefetch I/O to complete. 248 */ 249 spa_config_enter(spa, RW_WRITER, FTAG); 250 spa_config_exit(spa, FTAG); 251 252 /* 253 * Close the dsl pool. 254 */ 255 if (spa->spa_dsl_pool) { 256 dsl_pool_close(spa->spa_dsl_pool); 257 spa->spa_dsl_pool = NULL; 258 } 259 260 /* 261 * Close all vdevs. 262 */ 263 if (spa->spa_root_vdev) 264 vdev_free(spa->spa_root_vdev); 265 ASSERT(spa->spa_root_vdev == NULL); 266 267 for (i = 0; i < spa->spa_nspares; i++) 268 vdev_free(spa->spa_spares[i]); 269 if (spa->spa_spares) { 270 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 271 spa->spa_spares = NULL; 272 } 273 if (spa->spa_sparelist) { 274 nvlist_free(spa->spa_sparelist); 275 spa->spa_sparelist = NULL; 276 } 277 278 spa->spa_async_suspended = 0; 279 } 280 281 /* 282 * Load (or re-load) the current list of vdevs describing the active spares for 283 * this pool. When this is called, we have some form of basic information in 284 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 285 * re-generate a more complete list including status information. 286 */ 287 static void 288 spa_load_spares(spa_t *spa) 289 { 290 nvlist_t **spares; 291 uint_t nspares; 292 int i; 293 294 /* 295 * First, close and free any existing spare vdevs. 296 */ 297 for (i = 0; i < spa->spa_nspares; i++) { 298 vdev_close(spa->spa_spares[i]); 299 vdev_free(spa->spa_spares[i]); 300 } 301 if (spa->spa_spares) 302 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 303 304 if (spa->spa_sparelist == NULL) 305 nspares = 0; 306 else 307 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 308 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 309 310 spa->spa_nspares = (int)nspares; 311 spa->spa_spares = NULL; 312 313 if (nspares == 0) 314 return; 315 316 /* 317 * Construct the array of vdevs, opening them to get status in the 318 * process. 319 */ 320 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 321 for (i = 0; i < spa->spa_nspares; i++) { 322 vdev_t *vd; 323 324 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 325 VDEV_ALLOC_SPARE) == 0); 326 ASSERT(vd != NULL); 327 328 spa->spa_spares[i] = vd; 329 330 if (vdev_open(vd) != 0) 331 continue; 332 333 vd->vdev_top = vd; 334 (void) vdev_validate_spare(vd); 335 } 336 337 /* 338 * Recompute the stashed list of spares, with status information 339 * this time. 
340 */ 341 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 342 DATA_TYPE_NVLIST_ARRAY) == 0); 343 344 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 345 for (i = 0; i < spa->spa_nspares; i++) 346 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 347 B_TRUE, B_TRUE); 348 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 349 spares, spa->spa_nspares) == 0); 350 for (i = 0; i < spa->spa_nspares; i++) 351 nvlist_free(spares[i]); 352 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 353 } 354 355 static int 356 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 357 { 358 dmu_buf_t *db; 359 char *packed = NULL; 360 size_t nvsize = 0; 361 int error; 362 *value = NULL; 363 364 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 365 nvsize = *(uint64_t *)db->db_data; 366 dmu_buf_rele(db, FTAG); 367 368 packed = kmem_alloc(nvsize, KM_SLEEP); 369 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 370 if (error == 0) 371 error = nvlist_unpack(packed, nvsize, value, 0); 372 kmem_free(packed, nvsize); 373 374 return (error); 375 } 376 377 /* 378 * Load an existing storage pool, using the pool's builtin spa_config as a 379 * source of configuration information. 380 */ 381 static int 382 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 383 { 384 int error = 0; 385 nvlist_t *nvroot = NULL; 386 vdev_t *rvd; 387 uberblock_t *ub = &spa->spa_uberblock; 388 uint64_t config_cache_txg = spa->spa_config_txg; 389 uint64_t pool_guid; 390 uint64_t version; 391 zio_t *zio; 392 393 spa->spa_load_state = state; 394 395 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 396 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 397 error = EINVAL; 398 goto out; 399 } 400 401 /* 402 * Versioning wasn't explicitly added to the label until later, so if 403 * it's not present treat it as the initial version. 404 */ 405 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 406 version = ZFS_VERSION_INITIAL; 407 408 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 409 &spa->spa_config_txg); 410 411 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 412 spa_guid_exists(pool_guid, 0)) { 413 error = EEXIST; 414 goto out; 415 } 416 417 spa->spa_load_guid = pool_guid; 418 419 /* 420 * Parse the configuration into a vdev tree. We explicitly set the 421 * value that will be returned by spa_version() since parsing the 422 * configuration requires knowing the version number. 423 */ 424 spa_config_enter(spa, RW_WRITER, FTAG); 425 spa->spa_ubsync.ub_version = version; 426 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 427 spa_config_exit(spa, FTAG); 428 429 if (error != 0) 430 goto out; 431 432 ASSERT(spa->spa_root_vdev == rvd); 433 ASSERT(spa_guid(spa) == pool_guid); 434 435 /* 436 * Try to open all vdevs, loading each label in the process. 437 */ 438 if (vdev_open(rvd) != 0) { 439 error = ENXIO; 440 goto out; 441 } 442 443 /* 444 * Validate the labels for all leaf vdevs. We need to grab the config 445 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 446 * flag. 447 */ 448 spa_config_enter(spa, RW_READER, FTAG); 449 error = vdev_validate(rvd); 450 spa_config_exit(spa, FTAG); 451 452 if (error != 0) { 453 error = EBADF; 454 goto out; 455 } 456 457 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 458 error = ENXIO; 459 goto out; 460 } 461 462 /* 463 * Find the best uberblock. 
464 */ 465 bzero(ub, sizeof (uberblock_t)); 466 467 zio = zio_root(spa, NULL, NULL, 468 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 469 vdev_uberblock_load(zio, rvd, ub); 470 error = zio_wait(zio); 471 472 /* 473 * If we weren't able to find a single valid uberblock, return failure. 474 */ 475 if (ub->ub_txg == 0) { 476 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 477 VDEV_AUX_CORRUPT_DATA); 478 error = ENXIO; 479 goto out; 480 } 481 482 /* 483 * If the pool is newer than the code, we can't open it. 484 */ 485 if (ub->ub_version > ZFS_VERSION) { 486 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 487 VDEV_AUX_VERSION_NEWER); 488 error = ENOTSUP; 489 goto out; 490 } 491 492 /* 493 * If the vdev guid sum doesn't match the uberblock, we have an 494 * incomplete configuration. 495 */ 496 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 497 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 498 VDEV_AUX_BAD_GUID_SUM); 499 error = ENXIO; 500 goto out; 501 } 502 503 /* 504 * Initialize internal SPA structures. 505 */ 506 spa->spa_state = POOL_STATE_ACTIVE; 507 spa->spa_ubsync = spa->spa_uberblock; 508 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 509 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 510 if (error) { 511 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 512 VDEV_AUX_CORRUPT_DATA); 513 goto out; 514 } 515 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 516 517 if (zap_lookup(spa->spa_meta_objset, 518 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 519 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 520 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 521 VDEV_AUX_CORRUPT_DATA); 522 error = EIO; 523 goto out; 524 } 525 526 if (!mosconfig) { 527 nvlist_t *newconfig; 528 529 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 530 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 531 VDEV_AUX_CORRUPT_DATA); 532 error = EIO; 533 goto out; 534 } 535 536 spa_config_set(spa, newconfig); 537 spa_unload(spa); 538 spa_deactivate(spa); 539 spa_activate(spa); 540 541 return (spa_load(spa, newconfig, state, B_TRUE)); 542 } 543 544 if (zap_lookup(spa->spa_meta_objset, 545 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 546 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 547 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 548 VDEV_AUX_CORRUPT_DATA); 549 error = EIO; 550 goto out; 551 } 552 553 /* 554 * Load the bit that tells us to use the new accounting function 555 * (raid-z deflation). If we have an older pool, this will not 556 * be present. 557 */ 558 error = zap_lookup(spa->spa_meta_objset, 559 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 560 sizeof (uint64_t), 1, &spa->spa_deflate); 561 if (error != 0 && error != ENOENT) { 562 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 563 VDEV_AUX_CORRUPT_DATA); 564 error = EIO; 565 goto out; 566 } 567 568 /* 569 * Load the persistent error log. If we have an older pool, this will 570 * not be present. 
571 */ 572 error = zap_lookup(spa->spa_meta_objset, 573 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 574 sizeof (uint64_t), 1, &spa->spa_errlog_last); 575 if (error != 0 && error != ENOENT) { 576 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 577 VDEV_AUX_CORRUPT_DATA); 578 error = EIO; 579 goto out; 580 } 581 582 error = zap_lookup(spa->spa_meta_objset, 583 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 584 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 585 if (error != 0 && error != ENOENT) { 586 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 587 VDEV_AUX_CORRUPT_DATA); 588 error = EIO; 589 goto out; 590 } 591 592 /* 593 * Load any hot spares for this pool. 594 */ 595 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 596 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 597 if (error != 0 && error != ENOENT) { 598 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 599 VDEV_AUX_CORRUPT_DATA); 600 error = EIO; 601 goto out; 602 } 603 if (error == 0) { 604 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 605 if (load_nvlist(spa, spa->spa_spares_object, 606 &spa->spa_sparelist) != 0) { 607 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 608 VDEV_AUX_CORRUPT_DATA); 609 error = EIO; 610 goto out; 611 } 612 613 spa_config_enter(spa, RW_WRITER, FTAG); 614 spa_load_spares(spa); 615 spa_config_exit(spa, FTAG); 616 } 617 618 /* 619 * Load the vdev state for all toplevel vdevs. 620 */ 621 vdev_load(rvd); 622 623 /* 624 * Propagate the leaf DTLs we just loaded all the way up the tree. 625 */ 626 spa_config_enter(spa, RW_WRITER, FTAG); 627 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 628 spa_config_exit(spa, FTAG); 629 630 /* 631 * Check the state of the root vdev. If it can't be opened, it 632 * indicates one or more toplevel vdevs are faulted. 633 */ 634 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 635 error = ENXIO; 636 goto out; 637 } 638 639 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 640 dmu_tx_t *tx; 641 int need_update = B_FALSE; 642 int c; 643 644 /* 645 * Claim log blocks that haven't been committed yet. 646 * This must all happen in a single txg. 647 */ 648 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 649 spa_first_txg(spa)); 650 (void) dmu_objset_find(spa->spa_name, 651 zil_claim, tx, DS_FIND_CHILDREN); 652 dmu_tx_commit(tx); 653 654 spa->spa_sync_on = B_TRUE; 655 txg_sync_start(spa->spa_dsl_pool); 656 657 /* 658 * Wait for all claims to sync. 659 */ 660 txg_wait_synced(spa->spa_dsl_pool, 0); 661 662 /* 663 * If the config cache is stale, or we have uninitialized 664 * metaslabs (see spa_vdev_add()), then update the config. 665 */ 666 if (config_cache_txg != spa->spa_config_txg || 667 state == SPA_LOAD_IMPORT) 668 need_update = B_TRUE; 669 670 for (c = 0; c < rvd->vdev_children; c++) 671 if (rvd->vdev_child[c]->vdev_ms_array == 0) 672 need_update = B_TRUE; 673 674 /* 675 * Update the config cache asychronously in case we're the 676 * root pool, in which case the config cache isn't writable yet. 677 */ 678 if (need_update) 679 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 680 } 681 682 error = 0; 683 out: 684 if (error && error != EBADF) 685 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 686 spa->spa_load_state = SPA_LOAD_NONE; 687 spa->spa_ena = 0; 688 689 return (error); 690 } 691 692 /* 693 * Pool Open/Import 694 * 695 * The import case is identical to an open except that the configuration is sent 696 * down from userland, instead of grabbed from the configuration cache. 
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
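 *
 * spa_export_common() refuses to export or destroy a pool (returning
 * EBUSY) while spa_inject_ref is nonzero, except when merely resetting it.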
811 */ 812 spa_t * 813 spa_inject_addref(char *name) 814 { 815 spa_t *spa; 816 817 mutex_enter(&spa_namespace_lock); 818 if ((spa = spa_lookup(name)) == NULL) { 819 mutex_exit(&spa_namespace_lock); 820 return (NULL); 821 } 822 spa->spa_inject_ref++; 823 mutex_exit(&spa_namespace_lock); 824 825 return (spa); 826 } 827 828 void 829 spa_inject_delref(spa_t *spa) 830 { 831 mutex_enter(&spa_namespace_lock); 832 spa->spa_inject_ref--; 833 mutex_exit(&spa_namespace_lock); 834 } 835 836 static void 837 spa_add_spares(spa_t *spa, nvlist_t *config) 838 { 839 nvlist_t **spares; 840 uint_t i, nspares; 841 nvlist_t *nvroot; 842 uint64_t guid; 843 vdev_stat_t *vs; 844 uint_t vsc; 845 846 if (spa->spa_nspares == 0) 847 return; 848 849 VERIFY(nvlist_lookup_nvlist(config, 850 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 851 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 852 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 853 if (nspares != 0) { 854 VERIFY(nvlist_add_nvlist_array(nvroot, 855 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 856 VERIFY(nvlist_lookup_nvlist_array(nvroot, 857 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 858 859 /* 860 * Go through and find any spares which have since been 861 * repurposed as an active spare. If this is the case, update 862 * their status appropriately. 863 */ 864 for (i = 0; i < nspares; i++) { 865 VERIFY(nvlist_lookup_uint64(spares[i], 866 ZPOOL_CONFIG_GUID, &guid) == 0); 867 if (spa_spare_inuse(guid)) { 868 VERIFY(nvlist_lookup_uint64_array( 869 spares[i], ZPOOL_CONFIG_STATS, 870 (uint64_t **)&vs, &vsc) == 0); 871 vs->vs_state = VDEV_STATE_CANT_OPEN; 872 vs->vs_aux = VDEV_AUX_SPARED; 873 } 874 } 875 } 876 } 877 878 int 879 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 880 { 881 int error; 882 spa_t *spa; 883 884 *config = NULL; 885 error = spa_open_common(name, &spa, FTAG, config); 886 887 if (spa && *config != NULL) { 888 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 889 spa_get_errlog_size(spa)) == 0); 890 891 spa_add_spares(spa, *config); 892 } 893 894 /* 895 * We want to get the alternate root even for faulted pools, so we cheat 896 * and call spa_lookup() directly. 897 */ 898 if (altroot) { 899 if (spa == NULL) { 900 mutex_enter(&spa_namespace_lock); 901 spa = spa_lookup(name); 902 if (spa) 903 spa_altroot(spa, altroot, buflen); 904 else 905 altroot[0] = '\0'; 906 spa = NULL; 907 mutex_exit(&spa_namespace_lock); 908 } else { 909 spa_altroot(spa, altroot, buflen); 910 } 911 } 912 913 if (spa != NULL) 914 spa_close(spa, FTAG); 915 916 return (error); 917 } 918 919 /* 920 * Validate that the 'spares' array is well formed. We must have an array of 921 * nvlists, each which describes a valid leaf vdev. 922 */ 923 static int 924 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 925 { 926 nvlist_t **spares; 927 uint_t i, nspares; 928 vdev_t *vd; 929 int error; 930 931 /* 932 * It's acceptable to have no spares specified. 933 */ 934 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 935 &spares, &nspares) != 0) 936 return (0); 937 938 if (nspares == 0) 939 return (EINVAL); 940 941 /* 942 * Make sure the pool is formatted with a version that supports hot 943 * spares. 
944 */ 945 if (spa_version(spa) < ZFS_VERSION_SPARES) 946 return (ENOTSUP); 947 948 for (i = 0; i < nspares; i++) { 949 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 950 mode)) != 0) 951 return (error); 952 953 if (!vd->vdev_ops->vdev_op_leaf) { 954 vdev_free(vd); 955 return (EINVAL); 956 } 957 958 if ((error = vdev_open(vd)) != 0) { 959 vdev_free(vd); 960 return (error); 961 } 962 963 vd->vdev_top = vd; 964 if ((error = vdev_label_spare(vd, crtxg)) != 0) { 965 vdev_free(vd); 966 return (error); 967 } 968 969 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 970 vd->vdev_guid) == 0); 971 972 vdev_free(vd); 973 } 974 975 return (0); 976 } 977 978 /* 979 * Pool Creation 980 */ 981 int 982 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 983 { 984 spa_t *spa; 985 vdev_t *rvd; 986 dsl_pool_t *dp; 987 dmu_tx_t *tx; 988 int c, error = 0; 989 uint64_t txg = TXG_INITIAL; 990 nvlist_t **spares; 991 uint_t nspares; 992 993 /* 994 * If this pool already exists, return failure. 995 */ 996 mutex_enter(&spa_namespace_lock); 997 if (spa_lookup(pool) != NULL) { 998 mutex_exit(&spa_namespace_lock); 999 return (EEXIST); 1000 } 1001 1002 /* 1003 * Allocate a new spa_t structure. 1004 */ 1005 spa = spa_add(pool, altroot); 1006 spa_activate(spa); 1007 1008 spa->spa_uberblock.ub_txg = txg - 1; 1009 spa->spa_uberblock.ub_version = ZFS_VERSION; 1010 spa->spa_ubsync = spa->spa_uberblock; 1011 1012 /* 1013 * Create the root vdev. 1014 */ 1015 spa_config_enter(spa, RW_WRITER, FTAG); 1016 1017 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1018 1019 ASSERT(error != 0 || rvd != NULL); 1020 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1021 1022 if (error == 0 && rvd->vdev_children == 0) 1023 error = EINVAL; 1024 1025 if (error == 0 && 1026 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1027 (error = spa_validate_spares(spa, nvroot, txg, 1028 VDEV_ALLOC_ADD)) == 0) { 1029 for (c = 0; c < rvd->vdev_children; c++) 1030 vdev_init(rvd->vdev_child[c], txg); 1031 vdev_config_dirty(rvd); 1032 } 1033 1034 spa_config_exit(spa, FTAG); 1035 1036 if (error != 0) { 1037 spa_unload(spa); 1038 spa_deactivate(spa); 1039 spa_remove(spa); 1040 mutex_exit(&spa_namespace_lock); 1041 return (error); 1042 } 1043 1044 /* 1045 * Get the list of spares, if specified. 1046 */ 1047 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1048 &spares, &nspares) == 0) { 1049 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1050 KM_SLEEP) == 0); 1051 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1052 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1053 spa_config_enter(spa, RW_WRITER, FTAG); 1054 spa_load_spares(spa); 1055 spa_config_exit(spa, FTAG); 1056 spa->spa_sync_spares = B_TRUE; 1057 } 1058 1059 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1060 spa->spa_meta_objset = dp->dp_meta_objset; 1061 1062 tx = dmu_tx_create_assigned(dp, txg); 1063 1064 /* 1065 * Create the pool config object. 1066 */ 1067 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1068 DMU_OT_PACKED_NVLIST, 1 << 14, 1069 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1070 1071 if (zap_add(spa->spa_meta_objset, 1072 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1073 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1074 cmn_err(CE_PANIC, "failed to add pool config"); 1075 } 1076 1077 /* Newly created pools are always deflated. 
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity anymore,
	 * and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
1186 */ 1187 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1188 &spares, &nspares) == 0) { 1189 if (spa->spa_sparelist) 1190 VERIFY(nvlist_remove(spa->spa_sparelist, 1191 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1192 else 1193 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1194 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1195 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1196 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1197 spa_config_enter(spa, RW_WRITER, FTAG); 1198 spa_load_spares(spa); 1199 spa_config_exit(spa, FTAG); 1200 spa->spa_sync_spares = B_TRUE; 1201 } 1202 1203 /* 1204 * Update the config cache to include the newly-imported pool. 1205 */ 1206 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1207 1208 mutex_exit(&spa_namespace_lock); 1209 1210 /* 1211 * Resilver anything that's out of date. 1212 */ 1213 if (spa_mode & FWRITE) 1214 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1215 1216 return (0); 1217 } 1218 1219 /* 1220 * This (illegal) pool name is used when temporarily importing a spa_t in order 1221 * to get the vdev stats associated with the imported devices. 1222 */ 1223 #define TRYIMPORT_NAME "$import" 1224 1225 nvlist_t * 1226 spa_tryimport(nvlist_t *tryconfig) 1227 { 1228 nvlist_t *config = NULL; 1229 char *poolname; 1230 spa_t *spa; 1231 uint64_t state; 1232 1233 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1234 return (NULL); 1235 1236 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1237 return (NULL); 1238 1239 /* 1240 * Create and initialize the spa structure. 1241 */ 1242 mutex_enter(&spa_namespace_lock); 1243 spa = spa_add(TRYIMPORT_NAME, NULL); 1244 spa_activate(spa); 1245 1246 /* 1247 * Pass off the heavy lifting to spa_load(). 1248 * Pass TRUE for mosconfig because the user-supplied config 1249 * is actually the one to trust when doing an import. 1250 */ 1251 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1252 1253 /* 1254 * If 'tryconfig' was at least parsable, return the current config. 1255 */ 1256 if (spa->spa_root_vdev != NULL) { 1257 spa_config_enter(spa, RW_READER, FTAG); 1258 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1259 spa_config_exit(spa, FTAG); 1260 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1261 poolname) == 0); 1262 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1263 state) == 0); 1264 1265 /* 1266 * Add the list of hot spares. 1267 */ 1268 spa_add_spares(spa, config); 1269 } 1270 1271 spa_unload(spa); 1272 spa_deactivate(spa); 1273 spa_remove(spa); 1274 mutex_exit(&spa_namespace_lock); 1275 1276 return (config); 1277 } 1278 1279 /* 1280 * Pool export/destroy 1281 * 1282 * The act of destroying or exporting a pool is very simple. We make sure there 1283 * is no more pending I/O and any references to the pool are gone. Then, we 1284 * update the pool state and sync all the labels to disk, removing the 1285 * configuration from the cache afterwards. 1286 */ 1287 static int 1288 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1289 { 1290 spa_t *spa; 1291 1292 if (oldconfig) 1293 *oldconfig = NULL; 1294 1295 if (!(spa_mode & FWRITE)) 1296 return (EROFS); 1297 1298 mutex_enter(&spa_namespace_lock); 1299 if ((spa = spa_lookup(pool)) == NULL) { 1300 mutex_exit(&spa_namespace_lock); 1301 return (ENOENT); 1302 } 1303 1304 /* 1305 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1306 * reacquire the namespace lock, and see if we can export. 
1307 */ 1308 spa_open_ref(spa, FTAG); 1309 mutex_exit(&spa_namespace_lock); 1310 spa_async_suspend(spa); 1311 mutex_enter(&spa_namespace_lock); 1312 spa_close(spa, FTAG); 1313 1314 /* 1315 * The pool will be in core if it's openable, 1316 * in which case we can modify its state. 1317 */ 1318 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1319 /* 1320 * Objsets may be open only because they're dirty, so we 1321 * have to force it to sync before checking spa_refcnt. 1322 */ 1323 spa_scrub_suspend(spa); 1324 txg_wait_synced(spa->spa_dsl_pool, 0); 1325 1326 /* 1327 * A pool cannot be exported or destroyed if there are active 1328 * references. If we are resetting a pool, allow references by 1329 * fault injection handlers. 1330 */ 1331 if (!spa_refcount_zero(spa) || 1332 (spa->spa_inject_ref != 0 && 1333 new_state != POOL_STATE_UNINITIALIZED)) { 1334 spa_scrub_resume(spa); 1335 spa_async_resume(spa); 1336 mutex_exit(&spa_namespace_lock); 1337 return (EBUSY); 1338 } 1339 1340 spa_scrub_resume(spa); 1341 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1342 1343 /* 1344 * We want this to be reflected on every label, 1345 * so mark them all dirty. spa_unload() will do the 1346 * final sync that pushes these changes out. 1347 */ 1348 if (new_state != POOL_STATE_UNINITIALIZED) { 1349 spa_config_enter(spa, RW_WRITER, FTAG); 1350 spa->spa_state = new_state; 1351 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1352 vdev_config_dirty(spa->spa_root_vdev); 1353 spa_config_exit(spa, FTAG); 1354 } 1355 } 1356 1357 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1358 spa_unload(spa); 1359 spa_deactivate(spa); 1360 } 1361 1362 if (oldconfig && spa->spa_config) 1363 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1364 1365 if (new_state != POOL_STATE_UNINITIALIZED) { 1366 spa_remove(spa); 1367 spa_config_sync(); 1368 } 1369 mutex_exit(&spa_namespace_lock); 1370 1371 return (0); 1372 } 1373 1374 /* 1375 * Destroy a storage pool. 1376 */ 1377 int 1378 spa_destroy(char *pool) 1379 { 1380 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1381 } 1382 1383 /* 1384 * Export a storage pool. 1385 */ 1386 int 1387 spa_export(char *pool, nvlist_t **oldconfig) 1388 { 1389 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1390 } 1391 1392 /* 1393 * Similar to spa_export(), this unloads the spa_t without actually removing it 1394 * from the namespace in any way. 1395 */ 1396 int 1397 spa_reset(char *pool) 1398 { 1399 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1400 } 1401 1402 1403 /* 1404 * ========================================================================== 1405 * Device manipulation 1406 * ========================================================================== 1407 */ 1408 1409 /* 1410 * Add capacity to a storage pool. 
1411 */ 1412 int 1413 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1414 { 1415 uint64_t txg; 1416 int c, error; 1417 vdev_t *rvd = spa->spa_root_vdev; 1418 vdev_t *vd, *tvd; 1419 nvlist_t **spares; 1420 uint_t i, nspares; 1421 1422 txg = spa_vdev_enter(spa); 1423 1424 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1425 VDEV_ALLOC_ADD)) != 0) 1426 return (spa_vdev_exit(spa, NULL, txg, error)); 1427 1428 if ((error = spa_validate_spares(spa, nvroot, txg, 1429 VDEV_ALLOC_ADD)) != 0) 1430 return (spa_vdev_exit(spa, vd, txg, error)); 1431 1432 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1433 &spares, &nspares) != 0) 1434 nspares = 0; 1435 1436 if (vd->vdev_children == 0 && nspares == 0) 1437 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1438 1439 if (vd->vdev_children != 0) { 1440 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) 1441 return (spa_vdev_exit(spa, vd, txg, error)); 1442 1443 /* 1444 * Transfer each new top-level vdev from vd to rvd. 1445 */ 1446 for (c = 0; c < vd->vdev_children; c++) { 1447 tvd = vd->vdev_child[c]; 1448 vdev_remove_child(vd, tvd); 1449 tvd->vdev_id = rvd->vdev_children; 1450 vdev_add_child(rvd, tvd); 1451 vdev_config_dirty(tvd); 1452 } 1453 } 1454 1455 if (nspares != 0) { 1456 if (spa->spa_sparelist != NULL) { 1457 nvlist_t **oldspares; 1458 uint_t oldnspares; 1459 nvlist_t **newspares; 1460 1461 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1462 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1463 1464 newspares = kmem_alloc(sizeof (void *) * 1465 (nspares + oldnspares), KM_SLEEP); 1466 for (i = 0; i < oldnspares; i++) 1467 VERIFY(nvlist_dup(oldspares[i], 1468 &newspares[i], KM_SLEEP) == 0); 1469 for (i = 0; i < nspares; i++) 1470 VERIFY(nvlist_dup(spares[i], 1471 &newspares[i + oldnspares], 1472 KM_SLEEP) == 0); 1473 1474 VERIFY(nvlist_remove(spa->spa_sparelist, 1475 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1476 1477 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1478 ZPOOL_CONFIG_SPARES, newspares, 1479 nspares + oldnspares) == 0); 1480 for (i = 0; i < oldnspares + nspares; i++) 1481 nvlist_free(newspares[i]); 1482 kmem_free(newspares, (oldnspares + nspares) * 1483 sizeof (void *)); 1484 } else { 1485 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1486 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1487 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1488 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1489 } 1490 1491 spa_load_spares(spa); 1492 spa->spa_sync_spares = B_TRUE; 1493 } 1494 1495 /* 1496 * We have to be careful when adding new vdevs to an existing pool. 1497 * If other threads start allocating from these vdevs before we 1498 * sync the config cache, and we lose power, then upon reboot we may 1499 * fail to open the pool because there are DVAs that the config cache 1500 * can't translate. Therefore, we first add the vdevs without 1501 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1502 * and then let spa_config_update() initialize the new metaslabs. 1503 * 1504 * spa_load() checks for added-but-not-initialized vdevs, so that 1505 * if we lose power at any point in this sequence, the remaining 1506 * steps will be completed the next time we load the pool. 1507 */ 1508 (void) spa_vdev_exit(spa, vd, txg, 0); 1509 1510 mutex_enter(&spa_namespace_lock); 1511 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1512 mutex_exit(&spa_namespace_lock); 1513 1514 return (0); 1515 } 1516 1517 /* 1518 * Attach a device to a mirror. 
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
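	 * For example, replacing a disk with itself in place renames the
	 * outgoing vdev's path from "<path>" to "<path>/old".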
1614 */ 1615 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1616 spa_strfree(oldvd->vdev_path); 1617 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1618 KM_SLEEP); 1619 (void) sprintf(oldvd->vdev_path, "%s/%s", 1620 newvd->vdev_path, "old"); 1621 if (oldvd->vdev_devid != NULL) { 1622 spa_strfree(oldvd->vdev_devid); 1623 oldvd->vdev_devid = NULL; 1624 } 1625 } 1626 1627 /* 1628 * If the parent is not a mirror, or if we're replacing, insert the new 1629 * mirror/replacing/spare vdev above oldvd. 1630 */ 1631 if (pvd->vdev_ops != pvops) 1632 pvd = vdev_add_parent(oldvd, pvops); 1633 1634 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1635 ASSERT(pvd->vdev_ops == pvops); 1636 ASSERT(oldvd->vdev_parent == pvd); 1637 1638 /* 1639 * Extract the new device from its root and add it to pvd. 1640 */ 1641 vdev_remove_child(newrootvd, newvd); 1642 newvd->vdev_id = pvd->vdev_children; 1643 vdev_add_child(pvd, newvd); 1644 1645 /* 1646 * If newvd is smaller than oldvd, but larger than its rsize, 1647 * the addition of newvd may have decreased our parent's asize. 1648 */ 1649 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1650 1651 tvd = newvd->vdev_top; 1652 ASSERT(pvd->vdev_top == tvd); 1653 ASSERT(tvd->vdev_parent == rvd); 1654 1655 vdev_config_dirty(tvd); 1656 1657 /* 1658 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1659 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1660 */ 1661 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1662 1663 mutex_enter(&newvd->vdev_dtl_lock); 1664 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1665 open_txg - TXG_INITIAL + 1); 1666 mutex_exit(&newvd->vdev_dtl_lock); 1667 1668 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1669 1670 /* 1671 * Mark newvd's DTL dirty in this txg. 1672 */ 1673 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1674 1675 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1676 1677 /* 1678 * Kick off a resilver to update newvd. 1679 */ 1680 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1681 1682 return (0); 1683 } 1684 1685 /* 1686 * Detach a device from a mirror or replacing vdev. 1687 * If 'replace_done' is specified, only detach if the parent 1688 * is a replacing vdev. 1689 */ 1690 int 1691 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1692 { 1693 uint64_t txg; 1694 int c, t, error; 1695 vdev_t *rvd = spa->spa_root_vdev; 1696 vdev_t *vd, *pvd, *cvd, *tvd; 1697 boolean_t unspare = B_FALSE; 1698 uint64_t unspare_guid; 1699 1700 txg = spa_vdev_enter(spa); 1701 1702 vd = vdev_lookup_by_guid(rvd, guid); 1703 1704 if (vd == NULL) 1705 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1706 1707 if (!vd->vdev_ops->vdev_op_leaf) 1708 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1709 1710 pvd = vd->vdev_parent; 1711 1712 /* 1713 * If replace_done is specified, only remove this device if it's 1714 * the first child of a replacing vdev. For the 'spare' vdev, either 1715 * disk can be removed. 1716 */ 1717 if (replace_done) { 1718 if (pvd->vdev_ops == &vdev_replacing_ops) { 1719 if (vd->vdev_id != 0) 1720 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1721 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1722 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1723 } 1724 } 1725 1726 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1727 spa_version(spa) >= ZFS_VERSION_SPARES); 1728 1729 /* 1730 * Only mirror, replacing, and spare vdevs support detach. 
1731 */ 1732 if (pvd->vdev_ops != &vdev_replacing_ops && 1733 pvd->vdev_ops != &vdev_mirror_ops && 1734 pvd->vdev_ops != &vdev_spare_ops) 1735 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1736 1737 /* 1738 * If there's only one replica, you can't detach it. 1739 */ 1740 if (pvd->vdev_children <= 1) 1741 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1742 1743 /* 1744 * If all siblings have non-empty DTLs, this device may have the only 1745 * valid copy of the data, which means we cannot safely detach it. 1746 * 1747 * XXX -- as in the vdev_offline() case, we really want a more 1748 * precise DTL check. 1749 */ 1750 for (c = 0; c < pvd->vdev_children; c++) { 1751 uint64_t dirty; 1752 1753 cvd = pvd->vdev_child[c]; 1754 if (cvd == vd) 1755 continue; 1756 if (vdev_is_dead(cvd)) 1757 continue; 1758 mutex_enter(&cvd->vdev_dtl_lock); 1759 dirty = cvd->vdev_dtl_map.sm_space | 1760 cvd->vdev_dtl_scrub.sm_space; 1761 mutex_exit(&cvd->vdev_dtl_lock); 1762 if (!dirty) 1763 break; 1764 } 1765 1766 /* 1767 * If we are a replacing or spare vdev, then we can always detach the 1768 * latter child, as that is how one cancels the operation. 1769 */ 1770 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1771 c == pvd->vdev_children) 1772 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1773 1774 /* 1775 * If we are detaching the original disk from a spare, then it implies 1776 * that the spare should become a real disk, and be removed from the 1777 * active spare list for the pool. 1778 */ 1779 if (pvd->vdev_ops == &vdev_spare_ops && 1780 vd->vdev_id == 0) 1781 unspare = B_TRUE; 1782 1783 /* 1784 * Erase the disk labels so the disk can be used for other things. 1785 * This must be done after all other error cases are handled, 1786 * but before we disembowel vd (so we can still do I/O to it). 1787 * But if we can't do it, don't treat the error as fatal -- 1788 * it may be that the unwritability of the disk is the reason 1789 * it's being detached! 1790 */ 1791 error = vdev_label_init(vd, 0, B_FALSE); 1792 if (error) 1793 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1794 1795 /* 1796 * Remove vd from its parent and compact the parent's children. 1797 */ 1798 vdev_remove_child(pvd, vd); 1799 vdev_compact_children(pvd); 1800 1801 /* 1802 * Remember one of the remaining children so we can get tvd below. 1803 */ 1804 cvd = pvd->vdev_child[0]; 1805 1806 /* 1807 * If we need to remove the remaining child from the list of hot spares, 1808 * do it now, marking the vdev as no longer a spare in the process. We 1809 * must do this before vdev_remove_parent(), because that can change the 1810 * GUID if it creates a new toplevel GUID. 1811 */ 1812 if (unspare) { 1813 ASSERT(cvd->vdev_isspare); 1814 spa_spare_remove(cvd->vdev_guid); 1815 cvd->vdev_isspare = B_FALSE; 1816 unspare_guid = cvd->vdev_guid; 1817 } 1818 1819 /* 1820 * If the parent mirror/replacing vdev only has one child, 1821 * the parent is no longer needed. Remove it from the tree. 1822 */ 1823 if (pvd->vdev_children == 1) 1824 vdev_remove_parent(cvd); 1825 1826 /* 1827 * We don't set tvd until now because the parent we just removed 1828 * may have been the previous top-level vdev. 1829 */ 1830 tvd = cvd->vdev_top; 1831 ASSERT(tvd->vdev_parent == rvd); 1832 1833 /* 1834 * Reopen this top-level vdev to reassess health after detach. 1835 */ 1836 vdev_reopen(tvd); 1837 1838 /* 1839 * If the device we just detached was smaller than the others, 1840 * it may be possible to add metaslabs (i.e. grow the pool). 
	 * vdev_metaslab_init() can't fail because the existing metaslabs
	 * are already in core, so there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	error = spa_vdev_exit(spa, vd, txg, 0);

	/*
	 * If we are supposed to remove the given vdev from the list of spares,
	 * iterate over all pools in the system and replace it if it's present.
	 */
	if (unspare) {
		spa = NULL;
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa->spa_state != POOL_STATE_ACTIVE)
				continue;

			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, *nv, **newspares;
	uint_t i, j, nspares;
	int ret = 0;

	spa_config_enter(spa, RW_WRITER, FTAG);

	vd = spa_lookup_by_guid(spa, guid);

	nv = NULL;
	if (spa->spa_spares != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			uint64_t theguid;

			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &theguid) == 0);
			if (theguid == guid) {
				nv = spares[i];
				break;
			}
		}
	}

	/*
	 * We only support removing a hot spare, and only if it's not currently
	 * in use in this pool.
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, so we can detach it.
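 * "Done" means the new child of a replacing vdev has empty DTLs, i.e.
 * both its persistent and scrub dirty-time logs show nothing left to copy.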
1964 */ 1965 static vdev_t * 1966 spa_vdev_replace_done_hunt(vdev_t *vd) 1967 { 1968 vdev_t *newvd, *oldvd; 1969 int c; 1970 1971 for (c = 0; c < vd->vdev_children; c++) { 1972 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1973 if (oldvd != NULL) 1974 return (oldvd); 1975 } 1976 1977 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1978 oldvd = vd->vdev_child[0]; 1979 newvd = vd->vdev_child[1]; 1980 1981 mutex_enter(&newvd->vdev_dtl_lock); 1982 if (newvd->vdev_dtl_map.sm_space == 0 && 1983 newvd->vdev_dtl_scrub.sm_space == 0) { 1984 mutex_exit(&newvd->vdev_dtl_lock); 1985 return (oldvd); 1986 } 1987 mutex_exit(&newvd->vdev_dtl_lock); 1988 } 1989 1990 return (NULL); 1991 } 1992 1993 static void 1994 spa_vdev_replace_done(spa_t *spa) 1995 { 1996 vdev_t *vd; 1997 vdev_t *pvd; 1998 uint64_t guid; 1999 uint64_t pguid = 0; 2000 2001 spa_config_enter(spa, RW_READER, FTAG); 2002 2003 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2004 guid = vd->vdev_guid; 2005 /* 2006 * If we have just finished replacing a hot spared device, then 2007 * we need to detach the parent's first child (the original hot 2008 * spare) as well. 2009 */ 2010 pvd = vd->vdev_parent; 2011 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2012 pvd->vdev_id == 0) { 2013 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2014 ASSERT(pvd->vdev_parent->vdev_children == 2); 2015 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2016 } 2017 spa_config_exit(spa, FTAG); 2018 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2019 return; 2020 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2021 return; 2022 spa_config_enter(spa, RW_READER, FTAG); 2023 } 2024 2025 spa_config_exit(spa, FTAG); 2026 } 2027 2028 /* 2029 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2030 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2031 */ 2032 int 2033 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2034 { 2035 vdev_t *rvd, *vd; 2036 uint64_t txg; 2037 2038 rvd = spa->spa_root_vdev; 2039 2040 txg = spa_vdev_enter(spa); 2041 2042 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2043 /* 2044 * Determine if this is a reference to a hot spare. In that 2045 * case, update the path as stored in the spare list. 
		 */
		nvlist_t **spares;
		uint_t i, nspares;
		if (spa->spa_sparelist != NULL) {
			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
			for (i = 0; i < nspares; i++) {
				uint64_t theguid;
				VERIFY(nvlist_lookup_uint64(spares[i],
				    ZPOOL_CONFIG_GUID, &theguid) == 0);
				if (theguid == guid)
					break;
			}

			if (i == nspares)
				return (spa_vdev_exit(spa, NULL, txg, ENOENT));

			VERIFY(nvlist_add_string(spares[i],
			    ZPOOL_CONFIG_PATH, newpath) == 0);
			spa_load_spares(spa);
			spa->spa_sync_spares = B_TRUE;
			return (spa_vdev_exit(spa, NULL, txg, 0));
		} else {
			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

void
spa_scrub_throttle(spa_t *spa, int direction)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_throttled += direction;
	ASSERT(spa->spa_scrub_throttled >= 0);
	if (spa->spa_scrub_throttled == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
	if (--spa->spa_scrub_inflight == 0) {
		cv_broadcast(&spa->spa_scrub_io_cv);
		ASSERT(spa->spa_scrub_throttled == 0);
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
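		 * The error is counted both in the pool-wide total and in
		 * the per-vdev scrub statistics.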
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ?
"resilver" : "scrub", 2232 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2233 2234 spa_config_enter(spa, RW_WRITER, FTAG); 2235 vdev_reopen(rvd); /* purge all vdev caches */ 2236 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2237 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2238 spa_config_exit(spa, FTAG); 2239 2240 mutex_enter(&spa->spa_scrub_lock); 2241 spa->spa_scrub_errors = 0; 2242 spa->spa_scrub_active = 1; 2243 ASSERT(spa->spa_scrub_inflight == 0); 2244 ASSERT(spa->spa_scrub_throttled == 0); 2245 2246 while (!spa->spa_scrub_stop) { 2247 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2248 while (spa->spa_scrub_suspended) { 2249 spa->spa_scrub_active = 0; 2250 cv_broadcast(&spa->spa_scrub_cv); 2251 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2252 spa->spa_scrub_active = 1; 2253 } 2254 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2255 2256 if (spa->spa_scrub_restart_txg != 0) 2257 break; 2258 2259 mutex_exit(&spa->spa_scrub_lock); 2260 error = traverse_more(th); 2261 mutex_enter(&spa->spa_scrub_lock); 2262 if (error != EAGAIN) 2263 break; 2264 2265 while (spa->spa_scrub_throttled > 0) 2266 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2267 } 2268 2269 while (spa->spa_scrub_inflight) 2270 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2271 2272 spa->spa_scrub_active = 0; 2273 cv_broadcast(&spa->spa_scrub_cv); 2274 2275 mutex_exit(&spa->spa_scrub_lock); 2276 2277 spa_config_enter(spa, RW_WRITER, FTAG); 2278 2279 mutex_enter(&spa->spa_scrub_lock); 2280 2281 /* 2282 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2283 * AND the spa config lock to synchronize with any config changes 2284 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2285 */ 2286 if (spa->spa_scrub_restart_txg != 0) 2287 error = ERESTART; 2288 2289 if (spa->spa_scrub_stop) 2290 error = EINTR; 2291 2292 /* 2293 * Even if there were uncorrectable errors, we consider the scrub 2294 * completed. The downside is that if there is a transient error during 2295 * a resilver, we won't resilver the data properly to the target. But 2296 * if the damage is permanent (more likely) we will resilver forever, 2297 * which isn't really acceptable. Since there is enough information for 2298 * the user to know what has failed and why, this seems like a more 2299 * tractable approach. 2300 */ 2301 complete = (error == 0); 2302 2303 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2304 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2305 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2306 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2307 2308 mutex_exit(&spa->spa_scrub_lock); 2309 2310 /* 2311 * If the scrub/resilver completed, update all DTLs to reflect this. 2312 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2313 */ 2314 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2315 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2316 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2317 spa_errlog_rotate(spa); 2318 2319 spa_config_exit(spa, FTAG); 2320 2321 mutex_enter(&spa->spa_scrub_lock); 2322 2323 /* 2324 * We may have finished replacing a device. 2325 * Let the async thread assess this and handle the detach. 2326 */ 2327 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2328 2329 /* 2330 * If we were told to restart, our final act is to start a new scrub. 2331 */ 2332 if (error == ERESTART) 2333 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
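		 *
		 * For example (illustrative numbers only): with mintxg = 99
		 * and maxtxg = 151, birth txgs 100 through 150 are eligible
		 * for resilvering.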
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
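	 * Clearing spa_async_thread under spa_async_lock lets
	 * spa_async_suspend() stop waiting and allows spa_async_dispatch()
	 * to create a new thread for any later requests.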
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *nvroot;
	nvlist_t **spares;
	int i;

	if (!spa->spa_sync_spares)
		return;

	/*
	 * Update the MOS nvlist describing the list of available spares.
	 * spa_validate_spares() will have already made sure this nvlist is
	 * valid and the vdevs are labelled appropriately.
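	 *
	 * The list itself is stored as a packed nvlist object in the MOS,
	 * whose object number is recorded in the pool directory ZAP under
	 * DMU_POOL_SPARES (created below on first use).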
	 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);

	spa->spa_sync_spares = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
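	 * Each pass can dirty additional vdevs (for example, when the pass
	 * itself allocates new blocks for metadata), so we repeat until a
	 * pass completes without dirtying any vdev.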
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a pool
	 * from a future version would have been unopenable in the first place,
	 * a version greater than ZFS_VERSION shouldn't be possible here.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	return (B_FALSE);
}