1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/unique.h> 52 #include <sys/dsl_pool.h> 53 #include <sys/dsl_dir.h> 54 #include <sys/dsl_prop.h> 55 #include <sys/fs/zfs.h> 56 #include <sys/callb.h> 57 58 /* 59 * ========================================================================== 60 * SPA state manipulation (open/create/destroy/import/export) 61 * ========================================================================== 62 */ 63 64 static int 65 spa_error_entry_compare(const void *a, const void *b) 66 { 67 spa_error_entry_t *sa = (spa_error_entry_t *)a; 68 spa_error_entry_t *sb = (spa_error_entry_t *)b; 69 int ret; 70 71 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 72 sizeof (zbookmark_t)); 73 74 if (ret < 0) 75 return (-1); 76 else if (ret > 0) 77 return (1); 78 else 79 return (0); 80 } 81 82 /* 83 * Utility function which retrieves copies of the current logs and 84 * re-initializes them in the process. 85 */ 86 void 87 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 88 { 89 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 90 91 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 92 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 93 94 avl_create(&spa->spa_errlist_scrub, 95 spa_error_entry_compare, sizeof (spa_error_entry_t), 96 offsetof(spa_error_entry_t, se_avl)); 97 avl_create(&spa->spa_errlist_last, 98 spa_error_entry_compare, sizeof (spa_error_entry_t), 99 offsetof(spa_error_entry_t, se_avl)); 100 } 101 102 /* 103 * Activate an uninitialized pool. 104 */ 105 static void 106 spa_activate(spa_t *spa) 107 { 108 int t; 109 110 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 111 112 spa->spa_state = POOL_STATE_ACTIVE; 113 114 spa->spa_normal_class = metaslab_class_create(); 115 116 for (t = 0; t < ZIO_TYPES; t++) { 117 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 118 8, maxclsyspri, 50, INT_MAX, 119 TASKQ_PREPOPULATE); 120 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 121 8, maxclsyspri, 50, INT_MAX, 122 TASKQ_PREPOPULATE); 123 } 124 125 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 126 127 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 128 offsetof(vdev_t, vdev_dirty_node)); 129 130 txg_list_create(&spa->spa_vdev_txg_list, 131 offsetof(struct vdev, vdev_txg_node)); 132 133 avl_create(&spa->spa_errlist_scrub, 134 spa_error_entry_compare, sizeof (spa_error_entry_t), 135 offsetof(spa_error_entry_t, se_avl)); 136 avl_create(&spa->spa_errlist_last, 137 spa_error_entry_compare, sizeof (spa_error_entry_t), 138 offsetof(spa_error_entry_t, se_avl)); 139 } 140 141 /* 142 * Opposite of spa_activate(). 143 */ 144 static void 145 spa_deactivate(spa_t *spa) 146 { 147 int t; 148 149 ASSERT(spa->spa_sync_on == B_FALSE); 150 ASSERT(spa->spa_dsl_pool == NULL); 151 ASSERT(spa->spa_root_vdev == NULL); 152 153 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 154 155 txg_list_destroy(&spa->spa_vdev_txg_list); 156 157 list_destroy(&spa->spa_dirty_list); 158 159 rw_destroy(&spa->spa_traverse_lock); 160 161 for (t = 0; t < ZIO_TYPES; t++) { 162 taskq_destroy(spa->spa_zio_issue_taskq[t]); 163 taskq_destroy(spa->spa_zio_intr_taskq[t]); 164 spa->spa_zio_issue_taskq[t] = NULL; 165 spa->spa_zio_intr_taskq[t] = NULL; 166 } 167 168 metaslab_class_destroy(spa->spa_normal_class); 169 spa->spa_normal_class = NULL; 170 171 /* 172 * If this was part of an import or the open otherwise failed, we may 173 * still have errors left in the queues. Empty them just in case. 174 */ 175 spa_errlog_drain(spa); 176 177 avl_destroy(&spa->spa_errlist_scrub); 178 avl_destroy(&spa->spa_errlist_last); 179 180 spa->spa_state = POOL_STATE_UNINITIALIZED; 181 } 182 183 /* 184 * Verify a pool configuration, and construct the vdev tree appropriately. This 185 * will create all the necessary vdevs in the appropriate layout, with each vdev 186 * in the CLOSED state. This will prep the pool before open/creation/import. 187 * All vdev validation is done by the vdev_alloc() routine. 188 */ 189 static int 190 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 191 uint_t id, int atype) 192 { 193 nvlist_t **child; 194 uint_t c, children; 195 int error; 196 197 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 198 return (error); 199 200 if ((*vdp)->vdev_ops->vdev_op_leaf) 201 return (0); 202 203 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 204 &child, &children) != 0) { 205 vdev_free(*vdp); 206 *vdp = NULL; 207 return (EINVAL); 208 } 209 210 for (c = 0; c < children; c++) { 211 vdev_t *vd; 212 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 213 atype)) != 0) { 214 vdev_free(*vdp); 215 *vdp = NULL; 216 return (error); 217 } 218 } 219 220 ASSERT(*vdp != NULL); 221 222 return (0); 223 } 224 225 /* 226 * Opposite of spa_load(). 227 */ 228 static void 229 spa_unload(spa_t *spa) 230 { 231 int i; 232 233 /* 234 * Stop async tasks. 235 */ 236 spa_async_suspend(spa); 237 238 /* 239 * Stop syncing. 240 */ 241 if (spa->spa_sync_on) { 242 txg_sync_stop(spa->spa_dsl_pool); 243 spa->spa_sync_on = B_FALSE; 244 } 245 246 /* 247 * Wait for any outstanding prefetch I/O to complete. 248 */ 249 spa_config_enter(spa, RW_WRITER, FTAG); 250 spa_config_exit(spa, FTAG); 251 252 /* 253 * Close the dsl pool. 254 */ 255 if (spa->spa_dsl_pool) { 256 dsl_pool_close(spa->spa_dsl_pool); 257 spa->spa_dsl_pool = NULL; 258 } 259 260 /* 261 * Close all vdevs. 262 */ 263 if (spa->spa_root_vdev) 264 vdev_free(spa->spa_root_vdev); 265 ASSERT(spa->spa_root_vdev == NULL); 266 267 for (i = 0; i < spa->spa_nspares; i++) 268 vdev_free(spa->spa_spares[i]); 269 if (spa->spa_spares) { 270 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 271 spa->spa_spares = NULL; 272 } 273 if (spa->spa_sparelist) { 274 nvlist_free(spa->spa_sparelist); 275 spa->spa_sparelist = NULL; 276 } 277 278 spa->spa_async_suspended = 0; 279 } 280 281 /* 282 * Load (or re-load) the current list of vdevs describing the active spares for 283 * this pool. When this is called, we have some form of basic information in 284 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 285 * re-generate a more complete list including status information. 286 */ 287 static void 288 spa_load_spares(spa_t *spa) 289 { 290 nvlist_t **spares; 291 uint_t nspares; 292 int i; 293 294 /* 295 * First, close and free any existing spare vdevs. 296 */ 297 for (i = 0; i < spa->spa_nspares; i++) { 298 vdev_close(spa->spa_spares[i]); 299 vdev_free(spa->spa_spares[i]); 300 } 301 if (spa->spa_spares) 302 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 303 304 if (spa->spa_sparelist == NULL) 305 nspares = 0; 306 else 307 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 308 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 309 310 spa->spa_nspares = (int)nspares; 311 spa->spa_spares = NULL; 312 313 if (nspares == 0) 314 return; 315 316 /* 317 * Construct the array of vdevs, opening them to get status in the 318 * process. 319 */ 320 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 321 for (i = 0; i < spa->spa_nspares; i++) { 322 vdev_t *vd; 323 324 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 325 VDEV_ALLOC_SPARE) == 0); 326 ASSERT(vd != NULL); 327 328 spa->spa_spares[i] = vd; 329 330 if (vdev_open(vd) != 0) 331 continue; 332 333 vd->vdev_top = vd; 334 (void) vdev_validate_spare(vd); 335 } 336 337 /* 338 * Recompute the stashed list of spares, with status information 339 * this time. 340 */ 341 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 342 DATA_TYPE_NVLIST_ARRAY) == 0); 343 344 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 345 for (i = 0; i < spa->spa_nspares; i++) 346 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 347 B_TRUE, B_TRUE); 348 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 349 spares, spa->spa_nspares) == 0); 350 for (i = 0; i < spa->spa_nspares; i++) 351 nvlist_free(spares[i]); 352 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 353 } 354 355 static int 356 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 357 { 358 dmu_buf_t *db; 359 char *packed = NULL; 360 size_t nvsize = 0; 361 int error; 362 *value = NULL; 363 364 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 365 nvsize = *(uint64_t *)db->db_data; 366 dmu_buf_rele(db, FTAG); 367 368 packed = kmem_alloc(nvsize, KM_SLEEP); 369 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 370 if (error == 0) 371 error = nvlist_unpack(packed, nvsize, value, 0); 372 kmem_free(packed, nvsize); 373 374 return (error); 375 } 376 377 /* 378 * Load an existing storage pool, using the pool's builtin spa_config as a 379 * source of configuration information. 380 */ 381 static int 382 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 383 { 384 int error = 0; 385 nvlist_t *nvroot = NULL; 386 vdev_t *rvd; 387 uberblock_t *ub = &spa->spa_uberblock; 388 uint64_t config_cache_txg = spa->spa_config_txg; 389 uint64_t pool_guid; 390 uint64_t version; 391 zio_t *zio; 392 393 spa->spa_load_state = state; 394 395 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 396 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 397 error = EINVAL; 398 goto out; 399 } 400 401 /* 402 * Versioning wasn't explicitly added to the label until later, so if 403 * it's not present treat it as the initial version. 404 */ 405 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 406 version = ZFS_VERSION_INITIAL; 407 408 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 409 &spa->spa_config_txg); 410 411 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 412 spa_guid_exists(pool_guid, 0)) { 413 error = EEXIST; 414 goto out; 415 } 416 417 /* 418 * Parse the configuration into a vdev tree. We explicitly set the 419 * value that will be returned by spa_version() since parsing the 420 * configuration requires knowing the version number. 421 */ 422 spa_config_enter(spa, RW_WRITER, FTAG); 423 spa->spa_ubsync.ub_version = version; 424 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 425 spa_config_exit(spa, FTAG); 426 427 if (error != 0) 428 goto out; 429 430 ASSERT(spa->spa_root_vdev == rvd); 431 ASSERT(spa_guid(spa) == pool_guid); 432 433 /* 434 * Try to open all vdevs, loading each label in the process. 435 */ 436 if (vdev_open(rvd) != 0) { 437 error = ENXIO; 438 goto out; 439 } 440 441 /* 442 * Validate the labels for all leaf vdevs. We need to grab the config 443 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 444 * flag. 445 */ 446 spa_config_enter(spa, RW_READER, FTAG); 447 error = vdev_validate(rvd); 448 spa_config_exit(spa, FTAG); 449 450 if (error != 0) { 451 error = EBADF; 452 goto out; 453 } 454 455 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 456 error = ENXIO; 457 goto out; 458 } 459 460 /* 461 * Find the best uberblock. 462 */ 463 bzero(ub, sizeof (uberblock_t)); 464 465 zio = zio_root(spa, NULL, NULL, 466 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 467 vdev_uberblock_load(zio, rvd, ub); 468 error = zio_wait(zio); 469 470 /* 471 * If we weren't able to find a single valid uberblock, return failure. 472 */ 473 if (ub->ub_txg == 0) { 474 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 475 VDEV_AUX_CORRUPT_DATA); 476 error = ENXIO; 477 goto out; 478 } 479 480 /* 481 * If the pool is newer than the code, we can't open it. 482 */ 483 if (ub->ub_version > ZFS_VERSION) { 484 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 485 VDEV_AUX_VERSION_NEWER); 486 error = ENOTSUP; 487 goto out; 488 } 489 490 /* 491 * If the vdev guid sum doesn't match the uberblock, we have an 492 * incomplete configuration. 493 */ 494 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 495 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 496 VDEV_AUX_BAD_GUID_SUM); 497 error = ENXIO; 498 goto out; 499 } 500 501 /* 502 * Initialize internal SPA structures. 503 */ 504 spa->spa_state = POOL_STATE_ACTIVE; 505 spa->spa_ubsync = spa->spa_uberblock; 506 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 507 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 508 if (error) { 509 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 510 VDEV_AUX_CORRUPT_DATA); 511 goto out; 512 } 513 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 514 515 if (zap_lookup(spa->spa_meta_objset, 516 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 517 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 518 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 519 VDEV_AUX_CORRUPT_DATA); 520 error = EIO; 521 goto out; 522 } 523 524 if (!mosconfig) { 525 nvlist_t *newconfig; 526 527 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 528 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 529 VDEV_AUX_CORRUPT_DATA); 530 error = EIO; 531 goto out; 532 } 533 534 spa_config_set(spa, newconfig); 535 spa_unload(spa); 536 spa_deactivate(spa); 537 spa_activate(spa); 538 539 return (spa_load(spa, newconfig, state, B_TRUE)); 540 } 541 542 if (zap_lookup(spa->spa_meta_objset, 543 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 544 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 545 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 546 VDEV_AUX_CORRUPT_DATA); 547 error = EIO; 548 goto out; 549 } 550 551 /* 552 * Load the bit that tells us to use the new accounting function 553 * (raid-z deflation). If we have an older pool, this will not 554 * be present. 555 */ 556 error = zap_lookup(spa->spa_meta_objset, 557 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 558 sizeof (uint64_t), 1, &spa->spa_deflate); 559 if (error != 0 && error != ENOENT) { 560 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 561 VDEV_AUX_CORRUPT_DATA); 562 error = EIO; 563 goto out; 564 } 565 566 /* 567 * Load the persistent error log. If we have an older pool, this will 568 * not be present. 569 */ 570 error = zap_lookup(spa->spa_meta_objset, 571 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 572 sizeof (uint64_t), 1, &spa->spa_errlog_last); 573 if (error != 0 && error != ENOENT) { 574 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 575 VDEV_AUX_CORRUPT_DATA); 576 error = EIO; 577 goto out; 578 } 579 580 error = zap_lookup(spa->spa_meta_objset, 581 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 582 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 583 if (error != 0 && error != ENOENT) { 584 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 585 VDEV_AUX_CORRUPT_DATA); 586 error = EIO; 587 goto out; 588 } 589 590 /* 591 * Load any hot spares for this pool. 592 */ 593 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 594 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 595 if (error != 0 && error != ENOENT) { 596 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 597 VDEV_AUX_CORRUPT_DATA); 598 error = EIO; 599 goto out; 600 } 601 if (error == 0) { 602 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 603 if (load_nvlist(spa, spa->spa_spares_object, 604 &spa->spa_sparelist) != 0) { 605 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 606 VDEV_AUX_CORRUPT_DATA); 607 error = EIO; 608 goto out; 609 } 610 611 spa_config_enter(spa, RW_WRITER, FTAG); 612 spa_load_spares(spa); 613 spa_config_exit(spa, FTAG); 614 } 615 616 /* 617 * Load the vdev state for all toplevel vdevs. 618 */ 619 vdev_load(rvd); 620 621 /* 622 * Propagate the leaf DTLs we just loaded all the way up the tree. 623 */ 624 spa_config_enter(spa, RW_WRITER, FTAG); 625 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 626 spa_config_exit(spa, FTAG); 627 628 /* 629 * Check the state of the root vdev. If it can't be opened, it 630 * indicates one or more toplevel vdevs are faulted. 631 */ 632 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 633 error = ENXIO; 634 goto out; 635 } 636 637 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 638 dmu_tx_t *tx; 639 int need_update = B_FALSE; 640 int c; 641 642 /* 643 * Claim log blocks that haven't been committed yet. 644 * This must all happen in a single txg. 645 */ 646 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 647 spa_first_txg(spa)); 648 dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 649 dmu_tx_commit(tx); 650 651 spa->spa_sync_on = B_TRUE; 652 txg_sync_start(spa->spa_dsl_pool); 653 654 /* 655 * Wait for all claims to sync. 656 */ 657 txg_wait_synced(spa->spa_dsl_pool, 0); 658 659 /* 660 * If the config cache is stale, or we have uninitialized 661 * metaslabs (see spa_vdev_add()), then update the config. 662 */ 663 if (config_cache_txg != spa->spa_config_txg || 664 state == SPA_LOAD_IMPORT) 665 need_update = B_TRUE; 666 667 for (c = 0; c < rvd->vdev_children; c++) 668 if (rvd->vdev_child[c]->vdev_ms_array == 0) 669 need_update = B_TRUE; 670 671 /* 672 * Update the config cache asychronously in case we're the 673 * root pool, in which case the config cache isn't writable yet. 674 */ 675 if (need_update) 676 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 677 } 678 679 error = 0; 680 out: 681 if (error && error != EBADF) 682 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 683 spa->spa_load_state = SPA_LOAD_NONE; 684 spa->spa_ena = 0; 685 686 return (error); 687 } 688 689 /* 690 * Pool Open/Import 691 * 692 * The import case is identical to an open except that the configuration is sent 693 * down from userland, instead of grabbed from the configuration cache. For the 694 * case of an open, the pool configuration will exist in the 695 * POOL_STATE_UNITIALIZED state. 696 * 697 * The stats information (gen/count/ustats) is used to gather vdev statistics at 698 * the same time open the pool, without having to keep around the spa_t in some 699 * ambiguous state. 700 */ 701 static int 702 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 703 { 704 spa_t *spa; 705 int error; 706 int loaded = B_FALSE; 707 int locked = B_FALSE; 708 709 *spapp = NULL; 710 711 /* 712 * As disgusting as this is, we need to support recursive calls to this 713 * function because dsl_dir_open() is called during spa_load(), and ends 714 * up calling spa_open() again. The real fix is to figure out how to 715 * avoid dsl_dir_open() calling this in the first place. 716 */ 717 if (mutex_owner(&spa_namespace_lock) != curthread) { 718 mutex_enter(&spa_namespace_lock); 719 locked = B_TRUE; 720 } 721 722 if ((spa = spa_lookup(pool)) == NULL) { 723 if (locked) 724 mutex_exit(&spa_namespace_lock); 725 return (ENOENT); 726 } 727 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 728 729 spa_activate(spa); 730 731 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 732 733 if (error == EBADF) { 734 /* 735 * If vdev_validate() returns failure (indicated by 736 * EBADF), it indicates that one of the vdevs indicates 737 * that the pool has been exported or destroyed. If 738 * this is the case, the config cache is out of sync and 739 * we should remove the pool from the namespace. 740 */ 741 zfs_post_ok(spa, NULL); 742 spa_unload(spa); 743 spa_deactivate(spa); 744 spa_remove(spa); 745 spa_config_sync(); 746 if (locked) 747 mutex_exit(&spa_namespace_lock); 748 return (ENOENT); 749 } 750 751 if (error) { 752 /* 753 * We can't open the pool, but we still have useful 754 * information: the state of each vdev after the 755 * attempted vdev_open(). Return this to the user. 756 */ 757 if (config != NULL && spa->spa_root_vdev != NULL) { 758 spa_config_enter(spa, RW_READER, FTAG); 759 *config = spa_config_generate(spa, NULL, -1ULL, 760 B_TRUE); 761 spa_config_exit(spa, FTAG); 762 } 763 spa_unload(spa); 764 spa_deactivate(spa); 765 spa->spa_last_open_failed = B_TRUE; 766 if (locked) 767 mutex_exit(&spa_namespace_lock); 768 *spapp = NULL; 769 return (error); 770 } else { 771 zfs_post_ok(spa, NULL); 772 spa->spa_last_open_failed = B_FALSE; 773 } 774 775 loaded = B_TRUE; 776 } 777 778 spa_open_ref(spa, tag); 779 if (locked) 780 mutex_exit(&spa_namespace_lock); 781 782 *spapp = spa; 783 784 if (config != NULL) { 785 spa_config_enter(spa, RW_READER, FTAG); 786 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 787 spa_config_exit(spa, FTAG); 788 } 789 790 /* 791 * If we just loaded the pool, resilver anything that's out of date. 792 */ 793 if (loaded && (spa_mode & FWRITE)) 794 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 795 796 return (0); 797 } 798 799 int 800 spa_open(const char *name, spa_t **spapp, void *tag) 801 { 802 return (spa_open_common(name, spapp, tag, NULL)); 803 } 804 805 /* 806 * Lookup the given spa_t, incrementing the inject count in the process, 807 * preventing it from being exported or destroyed. 808 */ 809 spa_t * 810 spa_inject_addref(char *name) 811 { 812 spa_t *spa; 813 814 mutex_enter(&spa_namespace_lock); 815 if ((spa = spa_lookup(name)) == NULL) { 816 mutex_exit(&spa_namespace_lock); 817 return (NULL); 818 } 819 spa->spa_inject_ref++; 820 mutex_exit(&spa_namespace_lock); 821 822 return (spa); 823 } 824 825 void 826 spa_inject_delref(spa_t *spa) 827 { 828 mutex_enter(&spa_namespace_lock); 829 spa->spa_inject_ref--; 830 mutex_exit(&spa_namespace_lock); 831 } 832 833 static void 834 spa_add_spares(spa_t *spa, nvlist_t *config) 835 { 836 nvlist_t **spares; 837 uint_t i, nspares; 838 nvlist_t *nvroot; 839 uint64_t guid; 840 vdev_stat_t *vs; 841 uint_t vsc; 842 843 if (spa->spa_nspares == 0) 844 return; 845 846 VERIFY(nvlist_lookup_nvlist(config, 847 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 848 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 849 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 850 if (nspares != 0) { 851 VERIFY(nvlist_add_nvlist_array(nvroot, 852 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 853 VERIFY(nvlist_lookup_nvlist_array(nvroot, 854 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 855 856 /* 857 * Go through and find any spares which have since been 858 * repurposed as an active spare. If this is the case, update 859 * their status appropriately. 860 */ 861 for (i = 0; i < nspares; i++) { 862 VERIFY(nvlist_lookup_uint64(spares[i], 863 ZPOOL_CONFIG_GUID, &guid) == 0); 864 if (spa_spare_inuse(guid)) { 865 VERIFY(nvlist_lookup_uint64_array( 866 spares[i], ZPOOL_CONFIG_STATS, 867 (uint64_t **)&vs, &vsc) == 0); 868 vs->vs_state = VDEV_STATE_CANT_OPEN; 869 vs->vs_aux = VDEV_AUX_SPARED; 870 } 871 } 872 } 873 } 874 875 int 876 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 877 { 878 int error; 879 spa_t *spa; 880 881 *config = NULL; 882 error = spa_open_common(name, &spa, FTAG, config); 883 884 if (spa && *config != NULL) { 885 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 886 spa_get_errlog_size(spa)) == 0); 887 888 spa_add_spares(spa, *config); 889 } 890 891 /* 892 * We want to get the alternate root even for faulted pools, so we cheat 893 * and call spa_lookup() directly. 894 */ 895 if (altroot) { 896 if (spa == NULL) { 897 mutex_enter(&spa_namespace_lock); 898 spa = spa_lookup(name); 899 if (spa) 900 spa_altroot(spa, altroot, buflen); 901 else 902 altroot[0] = '\0'; 903 spa = NULL; 904 mutex_exit(&spa_namespace_lock); 905 } else { 906 spa_altroot(spa, altroot, buflen); 907 } 908 } 909 910 if (spa != NULL) 911 spa_close(spa, FTAG); 912 913 return (error); 914 } 915 916 /* 917 * Validate that the 'spares' array is well formed. We must have an array of 918 * nvlists, each which describes a valid leaf vdev. 919 */ 920 static int 921 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 922 { 923 nvlist_t **spares; 924 uint_t i, nspares; 925 vdev_t *vd; 926 int error; 927 928 /* 929 * It's acceptable to have no spares specified. 930 */ 931 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 932 &spares, &nspares) != 0) 933 return (0); 934 935 if (nspares == 0) 936 return (EINVAL); 937 938 /* 939 * Make sure the pool is formatted with a version that supports hot 940 * spares. 941 */ 942 if (spa_version(spa) < ZFS_VERSION_SPARES) 943 return (ENOTSUP); 944 945 for (i = 0; i < nspares; i++) { 946 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 947 mode)) != 0) 948 return (error); 949 950 if (!vd->vdev_ops->vdev_op_leaf) { 951 vdev_free(vd); 952 return (EINVAL); 953 } 954 955 if ((error = vdev_open(vd)) != 0) { 956 vdev_free(vd); 957 return (error); 958 } 959 960 vd->vdev_top = vd; 961 if ((error = vdev_label_spare(vd, crtxg)) != 0) { 962 vdev_free(vd); 963 return (error); 964 } 965 966 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 967 vd->vdev_guid) == 0); 968 969 vdev_free(vd); 970 } 971 972 return (0); 973 } 974 975 /* 976 * Pool Creation 977 */ 978 int 979 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 980 { 981 spa_t *spa; 982 vdev_t *rvd; 983 dsl_pool_t *dp; 984 dmu_tx_t *tx; 985 int c, error = 0; 986 uint64_t txg = TXG_INITIAL; 987 nvlist_t **spares; 988 uint_t nspares; 989 990 /* 991 * If this pool already exists, return failure. 992 */ 993 mutex_enter(&spa_namespace_lock); 994 if (spa_lookup(pool) != NULL) { 995 mutex_exit(&spa_namespace_lock); 996 return (EEXIST); 997 } 998 999 /* 1000 * Allocate a new spa_t structure. 1001 */ 1002 spa = spa_add(pool, altroot); 1003 spa_activate(spa); 1004 1005 spa->spa_uberblock.ub_txg = txg - 1; 1006 spa->spa_uberblock.ub_version = ZFS_VERSION; 1007 spa->spa_ubsync = spa->spa_uberblock; 1008 1009 /* 1010 * Create the root vdev. 1011 */ 1012 spa_config_enter(spa, RW_WRITER, FTAG); 1013 1014 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1015 1016 ASSERT(error != 0 || rvd != NULL); 1017 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1018 1019 if (error == 0 && rvd->vdev_children == 0) 1020 error = EINVAL; 1021 1022 if (error == 0 && 1023 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1024 (error = spa_validate_spares(spa, nvroot, txg, 1025 VDEV_ALLOC_ADD)) == 0) { 1026 for (c = 0; c < rvd->vdev_children; c++) 1027 vdev_init(rvd->vdev_child[c], txg); 1028 vdev_config_dirty(rvd); 1029 } 1030 1031 spa_config_exit(spa, FTAG); 1032 1033 if (error != 0) { 1034 spa_unload(spa); 1035 spa_deactivate(spa); 1036 spa_remove(spa); 1037 mutex_exit(&spa_namespace_lock); 1038 return (error); 1039 } 1040 1041 /* 1042 * Get the list of spares, if specified. 1043 */ 1044 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1045 &spares, &nspares) == 0) { 1046 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1047 KM_SLEEP) == 0); 1048 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1049 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1050 spa_config_enter(spa, RW_WRITER, FTAG); 1051 spa_load_spares(spa); 1052 spa_config_exit(spa, FTAG); 1053 spa->spa_sync_spares = B_TRUE; 1054 } 1055 1056 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1057 spa->spa_meta_objset = dp->dp_meta_objset; 1058 1059 tx = dmu_tx_create_assigned(dp, txg); 1060 1061 /* 1062 * Create the pool config object. 1063 */ 1064 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1065 DMU_OT_PACKED_NVLIST, 1 << 14, 1066 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1067 1068 if (zap_add(spa->spa_meta_objset, 1069 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1070 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1071 cmn_err(CE_PANIC, "failed to add pool config"); 1072 } 1073 1074 /* Newly created pools are always deflated. */ 1075 spa->spa_deflate = TRUE; 1076 if (zap_add(spa->spa_meta_objset, 1077 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1078 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1079 cmn_err(CE_PANIC, "failed to add deflate"); 1080 } 1081 1082 /* 1083 * Create the deferred-free bplist object. Turn off compression 1084 * because sync-to-convergence takes longer if the blocksize 1085 * keeps changing. 1086 */ 1087 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1088 1 << 14, tx); 1089 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1090 ZIO_COMPRESS_OFF, tx); 1091 1092 if (zap_add(spa->spa_meta_objset, 1093 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1094 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1095 cmn_err(CE_PANIC, "failed to add bplist"); 1096 } 1097 1098 dmu_tx_commit(tx); 1099 1100 spa->spa_sync_on = B_TRUE; 1101 txg_sync_start(spa->spa_dsl_pool); 1102 1103 /* 1104 * We explicitly wait for the first transaction to complete so that our 1105 * bean counters are appropriately updated. 1106 */ 1107 txg_wait_synced(spa->spa_dsl_pool, txg); 1108 1109 spa_config_sync(); 1110 1111 mutex_exit(&spa_namespace_lock); 1112 1113 return (0); 1114 } 1115 1116 /* 1117 * Import the given pool into the system. We set up the necessary spa_t and 1118 * then call spa_load() to do the dirty work. 1119 */ 1120 int 1121 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1122 { 1123 spa_t *spa; 1124 int error; 1125 nvlist_t *nvroot; 1126 nvlist_t **spares; 1127 uint_t nspares; 1128 1129 if (!(spa_mode & FWRITE)) 1130 return (EROFS); 1131 1132 /* 1133 * If a pool with this name exists, return failure. 1134 */ 1135 mutex_enter(&spa_namespace_lock); 1136 if (spa_lookup(pool) != NULL) { 1137 mutex_exit(&spa_namespace_lock); 1138 return (EEXIST); 1139 } 1140 1141 /* 1142 * Create and initialize the spa structure. 1143 */ 1144 spa = spa_add(pool, altroot); 1145 spa_activate(spa); 1146 1147 /* 1148 * Pass off the heavy lifting to spa_load(). 1149 * Pass TRUE for mosconfig because the user-supplied config 1150 * is actually the one to trust when doing an import. 1151 */ 1152 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1153 1154 spa_config_enter(spa, RW_WRITER, FTAG); 1155 /* 1156 * Toss any existing sparelist, as it doesn't have any validity anymore, 1157 * and conflicts with spa_has_spare(). 1158 */ 1159 if (spa->spa_sparelist) { 1160 nvlist_free(spa->spa_sparelist); 1161 spa->spa_sparelist = NULL; 1162 spa_load_spares(spa); 1163 } 1164 1165 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1166 &nvroot) == 0); 1167 if (error == 0) 1168 error = spa_validate_spares(spa, nvroot, -1ULL, 1169 VDEV_ALLOC_SPARE); 1170 spa_config_exit(spa, FTAG); 1171 1172 if (error != 0) { 1173 spa_unload(spa); 1174 spa_deactivate(spa); 1175 spa_remove(spa); 1176 mutex_exit(&spa_namespace_lock); 1177 return (error); 1178 } 1179 1180 /* 1181 * Override any spares as specified by the user, as these may have 1182 * correct device names/devids, etc. 1183 */ 1184 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1185 &spares, &nspares) == 0) { 1186 if (spa->spa_sparelist) 1187 VERIFY(nvlist_remove(spa->spa_sparelist, 1188 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1189 else 1190 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1191 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1192 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1193 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1194 spa_config_enter(spa, RW_WRITER, FTAG); 1195 spa_load_spares(spa); 1196 spa_config_exit(spa, FTAG); 1197 spa->spa_sync_spares = B_TRUE; 1198 } 1199 1200 /* 1201 * Update the config cache to include the newly-imported pool. 1202 */ 1203 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1204 1205 mutex_exit(&spa_namespace_lock); 1206 1207 /* 1208 * Resilver anything that's out of date. 1209 */ 1210 if (spa_mode & FWRITE) 1211 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1212 1213 return (0); 1214 } 1215 1216 /* 1217 * This (illegal) pool name is used when temporarily importing a spa_t in order 1218 * to get the vdev stats associated with the imported devices. 1219 */ 1220 #define TRYIMPORT_NAME "$import" 1221 1222 nvlist_t * 1223 spa_tryimport(nvlist_t *tryconfig) 1224 { 1225 nvlist_t *config = NULL; 1226 char *poolname; 1227 spa_t *spa; 1228 uint64_t state; 1229 1230 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1231 return (NULL); 1232 1233 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1234 return (NULL); 1235 1236 /* 1237 * Create and initialize the spa structure. 1238 */ 1239 mutex_enter(&spa_namespace_lock); 1240 spa = spa_add(TRYIMPORT_NAME, NULL); 1241 spa_activate(spa); 1242 1243 /* 1244 * Pass off the heavy lifting to spa_load(). 1245 * Pass TRUE for mosconfig because the user-supplied config 1246 * is actually the one to trust when doing an import. 1247 */ 1248 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1249 1250 /* 1251 * If 'tryconfig' was at least parsable, return the current config. 1252 */ 1253 if (spa->spa_root_vdev != NULL) { 1254 spa_config_enter(spa, RW_READER, FTAG); 1255 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1256 spa_config_exit(spa, FTAG); 1257 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1258 poolname) == 0); 1259 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1260 state) == 0); 1261 1262 /* 1263 * Add the list of hot spares. 1264 */ 1265 spa_add_spares(spa, config); 1266 } 1267 1268 spa_unload(spa); 1269 spa_deactivate(spa); 1270 spa_remove(spa); 1271 mutex_exit(&spa_namespace_lock); 1272 1273 return (config); 1274 } 1275 1276 /* 1277 * Pool export/destroy 1278 * 1279 * The act of destroying or exporting a pool is very simple. We make sure there 1280 * is no more pending I/O and any references to the pool are gone. Then, we 1281 * update the pool state and sync all the labels to disk, removing the 1282 * configuration from the cache afterwards. 1283 */ 1284 static int 1285 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1286 { 1287 spa_t *spa; 1288 1289 if (oldconfig) 1290 *oldconfig = NULL; 1291 1292 if (!(spa_mode & FWRITE)) 1293 return (EROFS); 1294 1295 mutex_enter(&spa_namespace_lock); 1296 if ((spa = spa_lookup(pool)) == NULL) { 1297 mutex_exit(&spa_namespace_lock); 1298 return (ENOENT); 1299 } 1300 1301 /* 1302 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1303 * reacquire the namespace lock, and see if we can export. 1304 */ 1305 spa_open_ref(spa, FTAG); 1306 mutex_exit(&spa_namespace_lock); 1307 spa_async_suspend(spa); 1308 mutex_enter(&spa_namespace_lock); 1309 spa_close(spa, FTAG); 1310 1311 /* 1312 * The pool will be in core if it's openable, 1313 * in which case we can modify its state. 1314 */ 1315 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1316 /* 1317 * Objsets may be open only because they're dirty, so we 1318 * have to force it to sync before checking spa_refcnt. 1319 */ 1320 spa_scrub_suspend(spa); 1321 txg_wait_synced(spa->spa_dsl_pool, 0); 1322 1323 /* 1324 * A pool cannot be exported or destroyed if there are active 1325 * references. If we are resetting a pool, allow references by 1326 * fault injection handlers. 1327 */ 1328 if (!spa_refcount_zero(spa) || 1329 (spa->spa_inject_ref != 0 && 1330 new_state != POOL_STATE_UNINITIALIZED)) { 1331 spa_scrub_resume(spa); 1332 spa_async_resume(spa); 1333 mutex_exit(&spa_namespace_lock); 1334 return (EBUSY); 1335 } 1336 1337 spa_scrub_resume(spa); 1338 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1339 1340 /* 1341 * We want this to be reflected on every label, 1342 * so mark them all dirty. spa_unload() will do the 1343 * final sync that pushes these changes out. 1344 */ 1345 if (new_state != POOL_STATE_UNINITIALIZED) { 1346 spa_config_enter(spa, RW_WRITER, FTAG); 1347 spa->spa_state = new_state; 1348 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1349 vdev_config_dirty(spa->spa_root_vdev); 1350 spa_config_exit(spa, FTAG); 1351 } 1352 } 1353 1354 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1355 spa_unload(spa); 1356 spa_deactivate(spa); 1357 } 1358 1359 if (oldconfig && spa->spa_config) 1360 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1361 1362 if (new_state != POOL_STATE_UNINITIALIZED) { 1363 spa_remove(spa); 1364 spa_config_sync(); 1365 } 1366 mutex_exit(&spa_namespace_lock); 1367 1368 return (0); 1369 } 1370 1371 /* 1372 * Destroy a storage pool. 1373 */ 1374 int 1375 spa_destroy(char *pool) 1376 { 1377 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1378 } 1379 1380 /* 1381 * Export a storage pool. 1382 */ 1383 int 1384 spa_export(char *pool, nvlist_t **oldconfig) 1385 { 1386 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1387 } 1388 1389 /* 1390 * Similar to spa_export(), this unloads the spa_t without actually removing it 1391 * from the namespace in any way. 1392 */ 1393 int 1394 spa_reset(char *pool) 1395 { 1396 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1397 } 1398 1399 1400 /* 1401 * ========================================================================== 1402 * Device manipulation 1403 * ========================================================================== 1404 */ 1405 1406 /* 1407 * Add capacity to a storage pool. 1408 */ 1409 int 1410 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1411 { 1412 uint64_t txg; 1413 int c, error; 1414 vdev_t *rvd = spa->spa_root_vdev; 1415 vdev_t *vd, *tvd; 1416 nvlist_t **spares; 1417 uint_t i, nspares; 1418 1419 txg = spa_vdev_enter(spa); 1420 1421 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1422 VDEV_ALLOC_ADD)) != 0) 1423 return (spa_vdev_exit(spa, NULL, txg, error)); 1424 1425 if ((error = spa_validate_spares(spa, nvroot, txg, 1426 VDEV_ALLOC_ADD)) != 0) 1427 return (spa_vdev_exit(spa, vd, txg, error)); 1428 1429 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1430 &spares, &nspares) != 0) 1431 nspares = 0; 1432 1433 if (vd->vdev_children == 0 && nspares == 0) 1434 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1435 1436 if (vd->vdev_children != 0) { 1437 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) 1438 return (spa_vdev_exit(spa, vd, txg, error)); 1439 1440 /* 1441 * Transfer each new top-level vdev from vd to rvd. 1442 */ 1443 for (c = 0; c < vd->vdev_children; c++) { 1444 tvd = vd->vdev_child[c]; 1445 vdev_remove_child(vd, tvd); 1446 tvd->vdev_id = rvd->vdev_children; 1447 vdev_add_child(rvd, tvd); 1448 vdev_config_dirty(tvd); 1449 } 1450 } 1451 1452 if (nspares != 0) { 1453 if (spa->spa_sparelist != NULL) { 1454 nvlist_t **oldspares; 1455 uint_t oldnspares; 1456 nvlist_t **newspares; 1457 1458 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1459 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1460 1461 newspares = kmem_alloc(sizeof (void *) * 1462 (nspares + oldnspares), KM_SLEEP); 1463 for (i = 0; i < oldnspares; i++) 1464 VERIFY(nvlist_dup(oldspares[i], 1465 &newspares[i], KM_SLEEP) == 0); 1466 for (i = 0; i < nspares; i++) 1467 VERIFY(nvlist_dup(spares[i], 1468 &newspares[i + oldnspares], 1469 KM_SLEEP) == 0); 1470 1471 VERIFY(nvlist_remove(spa->spa_sparelist, 1472 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1473 1474 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1475 ZPOOL_CONFIG_SPARES, newspares, 1476 nspares + oldnspares) == 0); 1477 for (i = 0; i < oldnspares + nspares; i++) 1478 nvlist_free(newspares[i]); 1479 kmem_free(newspares, (oldnspares + nspares) * 1480 sizeof (void *)); 1481 } else { 1482 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1483 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1484 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1485 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1486 } 1487 1488 spa_load_spares(spa); 1489 spa->spa_sync_spares = B_TRUE; 1490 } 1491 1492 /* 1493 * We have to be careful when adding new vdevs to an existing pool. 1494 * If other threads start allocating from these vdevs before we 1495 * sync the config cache, and we lose power, then upon reboot we may 1496 * fail to open the pool because there are DVAs that the config cache 1497 * can't translate. Therefore, we first add the vdevs without 1498 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1499 * and then let spa_config_update() initialize the new metaslabs. 1500 * 1501 * spa_load() checks for added-but-not-initialized vdevs, so that 1502 * if we lose power at any point in this sequence, the remaining 1503 * steps will be completed the next time we load the pool. 1504 */ 1505 (void) spa_vdev_exit(spa, vd, txg, 0); 1506 1507 mutex_enter(&spa_namespace_lock); 1508 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1509 mutex_exit(&spa_namespace_lock); 1510 1511 return (0); 1512 } 1513 1514 /* 1515 * Attach a device to a mirror. The arguments are the path to any device 1516 * in the mirror, and the nvroot for the new device. If the path specifies 1517 * a device that is not mirrored, we automatically insert the mirror vdev. 1518 * 1519 * If 'replacing' is specified, the new device is intended to replace the 1520 * existing device; in this case the two devices are made into their own 1521 * mirror using the 'replacing' vdev, which is functionally idendical to 1522 * the mirror vdev (it actually reuses all the same ops) but has a few 1523 * extra rules: you can't attach to it after it's been created, and upon 1524 * completion of resilvering, the first disk (the one being replaced) 1525 * is automatically detached. 1526 */ 1527 int 1528 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1529 { 1530 uint64_t txg, open_txg; 1531 int error; 1532 vdev_t *rvd = spa->spa_root_vdev; 1533 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1534 vdev_ops_t *pvops; 1535 1536 txg = spa_vdev_enter(spa); 1537 1538 oldvd = vdev_lookup_by_guid(rvd, guid); 1539 1540 if (oldvd == NULL) 1541 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1542 1543 if (!oldvd->vdev_ops->vdev_op_leaf) 1544 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1545 1546 pvd = oldvd->vdev_parent; 1547 1548 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1549 VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1550 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1551 1552 newvd = newrootvd->vdev_child[0]; 1553 1554 if (!newvd->vdev_ops->vdev_op_leaf) 1555 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1556 1557 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1558 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1559 1560 if (!replacing) { 1561 /* 1562 * For attach, the only allowable parent is a mirror or the root 1563 * vdev. 1564 */ 1565 if (pvd->vdev_ops != &vdev_mirror_ops && 1566 pvd->vdev_ops != &vdev_root_ops) 1567 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1568 1569 pvops = &vdev_mirror_ops; 1570 } else { 1571 /* 1572 * Active hot spares can only be replaced by inactive hot 1573 * spares. 1574 */ 1575 if (pvd->vdev_ops == &vdev_spare_ops && 1576 pvd->vdev_child[1] == oldvd && 1577 !spa_has_spare(spa, newvd->vdev_guid)) 1578 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1579 1580 /* 1581 * If the source is a hot spare, and the parent isn't already a 1582 * spare, then we want to create a new hot spare. Otherwise, we 1583 * want to create a replacing vdev. 1584 */ 1585 if (pvd->vdev_ops == &vdev_replacing_ops) 1586 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1587 else if (pvd->vdev_ops != &vdev_spare_ops && 1588 newvd->vdev_isspare) 1589 pvops = &vdev_spare_ops; 1590 else 1591 pvops = &vdev_replacing_ops; 1592 } 1593 1594 /* 1595 * Compare the new device size with the replaceable/attachable 1596 * device size. 1597 */ 1598 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1599 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1600 1601 /* 1602 * The new device cannot have a higher alignment requirement 1603 * than the top-level vdev. 1604 */ 1605 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1606 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1607 1608 /* 1609 * If this is an in-place replacement, update oldvd's path and devid 1610 * to make it distinguishable from newvd, and unopenable from now on. 1611 */ 1612 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1613 spa_strfree(oldvd->vdev_path); 1614 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1615 KM_SLEEP); 1616 (void) sprintf(oldvd->vdev_path, "%s/%s", 1617 newvd->vdev_path, "old"); 1618 if (oldvd->vdev_devid != NULL) { 1619 spa_strfree(oldvd->vdev_devid); 1620 oldvd->vdev_devid = NULL; 1621 } 1622 } 1623 1624 /* 1625 * If the parent is not a mirror, or if we're replacing, insert the new 1626 * mirror/replacing/spare vdev above oldvd. 1627 */ 1628 if (pvd->vdev_ops != pvops) 1629 pvd = vdev_add_parent(oldvd, pvops); 1630 1631 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1632 ASSERT(pvd->vdev_ops == pvops); 1633 ASSERT(oldvd->vdev_parent == pvd); 1634 1635 /* 1636 * Extract the new device from its root and add it to pvd. 1637 */ 1638 vdev_remove_child(newrootvd, newvd); 1639 newvd->vdev_id = pvd->vdev_children; 1640 vdev_add_child(pvd, newvd); 1641 1642 /* 1643 * If newvd is smaller than oldvd, but larger than its rsize, 1644 * the addition of newvd may have decreased our parent's asize. 1645 */ 1646 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1647 1648 tvd = newvd->vdev_top; 1649 ASSERT(pvd->vdev_top == tvd); 1650 ASSERT(tvd->vdev_parent == rvd); 1651 1652 vdev_config_dirty(tvd); 1653 1654 /* 1655 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1656 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1657 */ 1658 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1659 1660 mutex_enter(&newvd->vdev_dtl_lock); 1661 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1662 open_txg - TXG_INITIAL + 1); 1663 mutex_exit(&newvd->vdev_dtl_lock); 1664 1665 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1666 1667 /* 1668 * Mark newvd's DTL dirty in this txg. 1669 */ 1670 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1671 1672 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1673 1674 /* 1675 * Kick off a resilver to update newvd. 1676 */ 1677 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1678 1679 return (0); 1680 } 1681 1682 /* 1683 * Detach a device from a mirror or replacing vdev. 1684 * If 'replace_done' is specified, only detach if the parent 1685 * is a replacing vdev. 1686 */ 1687 int 1688 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1689 { 1690 uint64_t txg; 1691 int c, t, error; 1692 vdev_t *rvd = spa->spa_root_vdev; 1693 vdev_t *vd, *pvd, *cvd, *tvd; 1694 boolean_t unspare = B_FALSE; 1695 uint64_t unspare_guid; 1696 1697 txg = spa_vdev_enter(spa); 1698 1699 vd = vdev_lookup_by_guid(rvd, guid); 1700 1701 if (vd == NULL) 1702 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1703 1704 if (!vd->vdev_ops->vdev_op_leaf) 1705 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1706 1707 pvd = vd->vdev_parent; 1708 1709 /* 1710 * If replace_done is specified, only remove this device if it's 1711 * the first child of a replacing vdev. For the 'spare' vdev, either 1712 * disk can be removed. 1713 */ 1714 if (replace_done) { 1715 if (pvd->vdev_ops == &vdev_replacing_ops) { 1716 if (vd->vdev_id != 0) 1717 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1718 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1719 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1720 } 1721 } 1722 1723 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1724 spa_version(spa) >= ZFS_VERSION_SPARES); 1725 1726 /* 1727 * Only mirror, replacing, and spare vdevs support detach. 1728 */ 1729 if (pvd->vdev_ops != &vdev_replacing_ops && 1730 pvd->vdev_ops != &vdev_mirror_ops && 1731 pvd->vdev_ops != &vdev_spare_ops) 1732 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1733 1734 /* 1735 * If there's only one replica, you can't detach it. 1736 */ 1737 if (pvd->vdev_children <= 1) 1738 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1739 1740 /* 1741 * If all siblings have non-empty DTLs, this device may have the only 1742 * valid copy of the data, which means we cannot safely detach it. 1743 * 1744 * XXX -- as in the vdev_offline() case, we really want a more 1745 * precise DTL check. 1746 */ 1747 for (c = 0; c < pvd->vdev_children; c++) { 1748 uint64_t dirty; 1749 1750 cvd = pvd->vdev_child[c]; 1751 if (cvd == vd) 1752 continue; 1753 if (vdev_is_dead(cvd)) 1754 continue; 1755 mutex_enter(&cvd->vdev_dtl_lock); 1756 dirty = cvd->vdev_dtl_map.sm_space | 1757 cvd->vdev_dtl_scrub.sm_space; 1758 mutex_exit(&cvd->vdev_dtl_lock); 1759 if (!dirty) 1760 break; 1761 } 1762 1763 /* 1764 * If we are a replacing or spare vdev, then we can always detach the 1765 * latter child, as that is how one cancels the operation. 1766 */ 1767 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1768 c == pvd->vdev_children) 1769 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1770 1771 /* 1772 * If we are detaching the original disk from a spare, then it implies 1773 * that the spare should become a real disk, and be removed from the 1774 * active spare list for the pool. 1775 */ 1776 if (pvd->vdev_ops == &vdev_spare_ops && 1777 vd->vdev_id == 0) 1778 unspare = B_TRUE; 1779 1780 /* 1781 * Erase the disk labels so the disk can be used for other things. 1782 * This must be done after all other error cases are handled, 1783 * but before we disembowel vd (so we can still do I/O to it). 1784 * But if we can't do it, don't treat the error as fatal -- 1785 * it may be that the unwritability of the disk is the reason 1786 * it's being detached! 1787 */ 1788 error = vdev_label_init(vd, 0, B_FALSE); 1789 if (error) 1790 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1791 1792 /* 1793 * Remove vd from its parent and compact the parent's children. 1794 */ 1795 vdev_remove_child(pvd, vd); 1796 vdev_compact_children(pvd); 1797 1798 /* 1799 * Remember one of the remaining children so we can get tvd below. 1800 */ 1801 cvd = pvd->vdev_child[0]; 1802 1803 /* 1804 * If we need to remove the remaining child from the list of hot spares, 1805 * do it now, marking the vdev as no longer a spare in the process. We 1806 * must do this before vdev_remove_parent(), because that can change the 1807 * GUID if it creates a new toplevel GUID. 1808 */ 1809 if (unspare) { 1810 ASSERT(cvd->vdev_isspare); 1811 spa_spare_remove(cvd->vdev_guid); 1812 cvd->vdev_isspare = B_FALSE; 1813 unspare_guid = cvd->vdev_guid; 1814 } 1815 1816 /* 1817 * If the parent mirror/replacing vdev only has one child, 1818 * the parent is no longer needed. Remove it from the tree. 1819 */ 1820 if (pvd->vdev_children == 1) 1821 vdev_remove_parent(cvd); 1822 1823 /* 1824 * We don't set tvd until now because the parent we just removed 1825 * may have been the previous top-level vdev. 1826 */ 1827 tvd = cvd->vdev_top; 1828 ASSERT(tvd->vdev_parent == rvd); 1829 1830 /* 1831 * Reopen this top-level vdev to reassess health after detach. 1832 */ 1833 vdev_reopen(tvd); 1834 1835 /* 1836 * If the device we just detached was smaller than the others, 1837 * it may be possible to add metaslabs (i.e. grow the pool). 1838 * vdev_metaslab_init() can't fail because the existing metaslabs 1839 * are already in core, so there's nothing to read from disk. 1840 */ 1841 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1842 1843 vdev_config_dirty(tvd); 1844 1845 /* 1846 * Mark vd's DTL as dirty in this txg. 1847 * vdev_dtl_sync() will see that vd->vdev_detached is set 1848 * and free vd's DTL object in syncing context. 1849 * But first make sure we're not on any *other* txg's DTL list, 1850 * to prevent vd from being accessed after it's freed. 1851 */ 1852 for (t = 0; t < TXG_SIZE; t++) 1853 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1854 vd->vdev_detached = B_TRUE; 1855 vdev_dirty(tvd, VDD_DTL, vd, txg); 1856 1857 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1858 1859 error = spa_vdev_exit(spa, vd, txg, 0); 1860 1861 /* 1862 * If we are supposed to remove the given vdev from the list of spares, 1863 * iterate over all pools in the system and replace it if it's present. 1864 */ 1865 if (unspare) { 1866 spa = NULL; 1867 mutex_enter(&spa_namespace_lock); 1868 while ((spa = spa_next(spa)) != NULL) { 1869 if (spa->spa_state != POOL_STATE_ACTIVE) 1870 continue; 1871 1872 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1873 } 1874 mutex_exit(&spa_namespace_lock); 1875 } 1876 1877 return (error); 1878 } 1879 1880 /* 1881 * Remove a device from the pool. Currently, this supports removing only hot 1882 * spares. 1883 */ 1884 int 1885 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1886 { 1887 vdev_t *vd; 1888 nvlist_t **spares, *nv, **newspares; 1889 uint_t i, j, nspares; 1890 int ret = 0; 1891 1892 spa_config_enter(spa, RW_WRITER, FTAG); 1893 1894 vd = spa_lookup_by_guid(spa, guid); 1895 1896 nv = NULL; 1897 if (spa->spa_spares != NULL && 1898 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1899 &spares, &nspares) == 0) { 1900 for (i = 0; i < nspares; i++) { 1901 uint64_t theguid; 1902 1903 VERIFY(nvlist_lookup_uint64(spares[i], 1904 ZPOOL_CONFIG_GUID, &theguid) == 0); 1905 if (theguid == guid) { 1906 nv = spares[i]; 1907 break; 1908 } 1909 } 1910 } 1911 1912 /* 1913 * We only support removing a hot spare, and only if it's not currently 1914 * in use in this pool. 1915 */ 1916 if (nv == NULL && vd == NULL) { 1917 ret = ENOENT; 1918 goto out; 1919 } 1920 1921 if (nv == NULL && vd != NULL) { 1922 ret = ENOTSUP; 1923 goto out; 1924 } 1925 1926 if (!unspare && nv != NULL && vd != NULL) { 1927 ret = EBUSY; 1928 goto out; 1929 } 1930 1931 if (nspares == 1) { 1932 newspares = NULL; 1933 } else { 1934 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 1935 KM_SLEEP); 1936 for (i = 0, j = 0; i < nspares; i++) { 1937 if (spares[i] != nv) 1938 VERIFY(nvlist_dup(spares[i], 1939 &newspares[j++], KM_SLEEP) == 0); 1940 } 1941 } 1942 1943 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1944 DATA_TYPE_NVLIST_ARRAY) == 0); 1945 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1946 newspares, nspares - 1) == 0); 1947 for (i = 0; i < nspares - 1; i++) 1948 nvlist_free(newspares[i]); 1949 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 1950 spa_load_spares(spa); 1951 spa->spa_sync_spares = B_TRUE; 1952 1953 out: 1954 spa_config_exit(spa, FTAG); 1955 1956 return (ret); 1957 } 1958 1959 /* 1960 * Find any device that's done replacing, so we can detach it. 1961 */ 1962 static vdev_t * 1963 spa_vdev_replace_done_hunt(vdev_t *vd) 1964 { 1965 vdev_t *newvd, *oldvd; 1966 int c; 1967 1968 for (c = 0; c < vd->vdev_children; c++) { 1969 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1970 if (oldvd != NULL) 1971 return (oldvd); 1972 } 1973 1974 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1975 oldvd = vd->vdev_child[0]; 1976 newvd = vd->vdev_child[1]; 1977 1978 mutex_enter(&newvd->vdev_dtl_lock); 1979 if (newvd->vdev_dtl_map.sm_space == 0 && 1980 newvd->vdev_dtl_scrub.sm_space == 0) { 1981 mutex_exit(&newvd->vdev_dtl_lock); 1982 return (oldvd); 1983 } 1984 mutex_exit(&newvd->vdev_dtl_lock); 1985 } 1986 1987 return (NULL); 1988 } 1989 1990 static void 1991 spa_vdev_replace_done(spa_t *spa) 1992 { 1993 vdev_t *vd; 1994 vdev_t *pvd; 1995 uint64_t guid; 1996 uint64_t pguid = 0; 1997 1998 spa_config_enter(spa, RW_READER, FTAG); 1999 2000 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2001 guid = vd->vdev_guid; 2002 /* 2003 * If we have just finished replacing a hot spared device, then 2004 * we need to detach the parent's first child (the original hot 2005 * spare) as well. 2006 */ 2007 pvd = vd->vdev_parent; 2008 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2009 pvd->vdev_id == 0) { 2010 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2011 ASSERT(pvd->vdev_parent->vdev_children == 2); 2012 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2013 } 2014 spa_config_exit(spa, FTAG); 2015 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2016 return; 2017 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2018 return; 2019 spa_config_enter(spa, RW_READER, FTAG); 2020 } 2021 2022 spa_config_exit(spa, FTAG); 2023 } 2024 2025 /* 2026 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2027 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2028 */ 2029 int 2030 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2031 { 2032 vdev_t *rvd, *vd; 2033 uint64_t txg; 2034 2035 rvd = spa->spa_root_vdev; 2036 2037 txg = spa_vdev_enter(spa); 2038 2039 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2040 /* 2041 * Determine if this is a reference to a hot spare. In that 2042 * case, update the path as stored in the spare list. 2043 */ 2044 nvlist_t **spares; 2045 uint_t i, nspares; 2046 if (spa->spa_sparelist != NULL) { 2047 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2048 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2049 for (i = 0; i < nspares; i++) { 2050 uint64_t theguid; 2051 VERIFY(nvlist_lookup_uint64(spares[i], 2052 ZPOOL_CONFIG_GUID, &theguid) == 0); 2053 if (theguid == guid) 2054 break; 2055 } 2056 2057 if (i == nspares) 2058 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2059 2060 VERIFY(nvlist_add_string(spares[i], 2061 ZPOOL_CONFIG_PATH, newpath) == 0); 2062 spa_load_spares(spa); 2063 spa->spa_sync_spares = B_TRUE; 2064 return (spa_vdev_exit(spa, NULL, txg, 0)); 2065 } else { 2066 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2067 } 2068 } 2069 2070 if (!vd->vdev_ops->vdev_op_leaf) 2071 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2072 2073 spa_strfree(vd->vdev_path); 2074 vd->vdev_path = spa_strdup(newpath); 2075 2076 vdev_config_dirty(vd->vdev_top); 2077 2078 return (spa_vdev_exit(spa, NULL, txg, 0)); 2079 } 2080 2081 /* 2082 * ========================================================================== 2083 * SPA Scrubbing 2084 * ========================================================================== 2085 */ 2086 2087 void 2088 spa_scrub_throttle(spa_t *spa, int direction) 2089 { 2090 mutex_enter(&spa->spa_scrub_lock); 2091 spa->spa_scrub_throttled += direction; 2092 ASSERT(spa->spa_scrub_throttled >= 0); 2093 if (spa->spa_scrub_throttled == 0) 2094 cv_broadcast(&spa->spa_scrub_io_cv); 2095 mutex_exit(&spa->spa_scrub_lock); 2096 } 2097 2098 static void 2099 spa_scrub_io_done(zio_t *zio) 2100 { 2101 spa_t *spa = zio->io_spa; 2102 2103 zio_buf_free(zio->io_data, zio->io_size); 2104 2105 mutex_enter(&spa->spa_scrub_lock); 2106 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2107 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2108 spa->spa_scrub_errors++; 2109 mutex_enter(&vd->vdev_stat_lock); 2110 vd->vdev_stat.vs_scrub_errors++; 2111 mutex_exit(&vd->vdev_stat_lock); 2112 } 2113 if (--spa->spa_scrub_inflight == 0) { 2114 cv_broadcast(&spa->spa_scrub_io_cv); 2115 ASSERT(spa->spa_scrub_throttled == 0); 2116 } 2117 mutex_exit(&spa->spa_scrub_lock); 2118 } 2119 2120 static void 2121 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2122 zbookmark_t *zb) 2123 { 2124 size_t size = BP_GET_LSIZE(bp); 2125 void *data = zio_buf_alloc(size); 2126 2127 mutex_enter(&spa->spa_scrub_lock); 2128 spa->spa_scrub_inflight++; 2129 mutex_exit(&spa->spa_scrub_lock); 2130 2131 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2132 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2133 2134 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2135 2136 zio_nowait(zio_read(NULL, spa, bp, data, size, 2137 spa_scrub_io_done, NULL, priority, flags, zb)); 2138 } 2139 2140 /* ARGSUSED */ 2141 static int 2142 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2143 { 2144 blkptr_t *bp = &bc->bc_blkptr; 2145 vdev_t *vd = spa->spa_root_vdev; 2146 dva_t *dva = bp->blk_dva; 2147 int needs_resilver = B_FALSE; 2148 int d; 2149 2150 if (bc->bc_errno) { 2151 /* 2152 * We can't scrub this block, but we can continue to scrub 2153 * the rest of the pool. Note the error and move along. 2154 */ 2155 mutex_enter(&spa->spa_scrub_lock); 2156 spa->spa_scrub_errors++; 2157 mutex_exit(&spa->spa_scrub_lock); 2158 2159 mutex_enter(&vd->vdev_stat_lock); 2160 vd->vdev_stat.vs_scrub_errors++; 2161 mutex_exit(&vd->vdev_stat_lock); 2162 2163 return (ERESTART); 2164 } 2165 2166 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2167 2168 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2169 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2170 2171 ASSERT(vd != NULL); 2172 2173 /* 2174 * Keep track of how much data we've examined so that 2175 * zpool(1M) status can make useful progress reports. 2176 */ 2177 mutex_enter(&vd->vdev_stat_lock); 2178 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2179 mutex_exit(&vd->vdev_stat_lock); 2180 2181 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2182 if (DVA_GET_GANG(&dva[d])) { 2183 /* 2184 * Gang members may be spread across multiple 2185 * vdevs, so the best we can do is look at the 2186 * pool-wide DTL. 2187 * XXX -- it would be better to change our 2188 * allocation policy to ensure that this can't 2189 * happen. 2190 */ 2191 vd = spa->spa_root_vdev; 2192 } 2193 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2194 bp->blk_birth, 1)) 2195 needs_resilver = B_TRUE; 2196 } 2197 } 2198 2199 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2200 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2201 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2202 else if (needs_resilver) 2203 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2204 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2205 2206 return (0); 2207 } 2208 2209 static void 2210 spa_scrub_thread(spa_t *spa) 2211 { 2212 callb_cpr_t cprinfo; 2213 traverse_handle_t *th = spa->spa_scrub_th; 2214 vdev_t *rvd = spa->spa_root_vdev; 2215 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2216 int error = 0; 2217 boolean_t complete; 2218 2219 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2220 2221 /* 2222 * If we're restarting due to a snapshot create/delete, 2223 * wait for that to complete. 2224 */ 2225 txg_wait_synced(spa_get_dsl(spa), 0); 2226 2227 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2228 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2229 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2230 2231 spa_config_enter(spa, RW_WRITER, FTAG); 2232 vdev_reopen(rvd); /* purge all vdev caches */ 2233 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2234 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2235 spa_config_exit(spa, FTAG); 2236 2237 mutex_enter(&spa->spa_scrub_lock); 2238 spa->spa_scrub_errors = 0; 2239 spa->spa_scrub_active = 1; 2240 ASSERT(spa->spa_scrub_inflight == 0); 2241 ASSERT(spa->spa_scrub_throttled == 0); 2242 2243 while (!spa->spa_scrub_stop) { 2244 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2245 while (spa->spa_scrub_suspended) { 2246 spa->spa_scrub_active = 0; 2247 cv_broadcast(&spa->spa_scrub_cv); 2248 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2249 spa->spa_scrub_active = 1; 2250 } 2251 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2252 2253 if (spa->spa_scrub_restart_txg != 0) 2254 break; 2255 2256 mutex_exit(&spa->spa_scrub_lock); 2257 error = traverse_more(th); 2258 mutex_enter(&spa->spa_scrub_lock); 2259 if (error != EAGAIN) 2260 break; 2261 2262 while (spa->spa_scrub_throttled > 0) 2263 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2264 } 2265 2266 while (spa->spa_scrub_inflight) 2267 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2268 2269 spa->spa_scrub_active = 0; 2270 cv_broadcast(&spa->spa_scrub_cv); 2271 2272 mutex_exit(&spa->spa_scrub_lock); 2273 2274 spa_config_enter(spa, RW_WRITER, FTAG); 2275 2276 mutex_enter(&spa->spa_scrub_lock); 2277 2278 /* 2279 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2280 * AND the spa config lock to synchronize with any config changes 2281 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2282 */ 2283 if (spa->spa_scrub_restart_txg != 0) 2284 error = ERESTART; 2285 2286 if (spa->spa_scrub_stop) 2287 error = EINTR; 2288 2289 /* 2290 * Even if there were uncorrectable errors, we consider the scrub 2291 * completed. The downside is that if there is a transient error during 2292 * a resilver, we won't resilver the data properly to the target. But 2293 * if the damage is permanent (more likely) we will resilver forever, 2294 * which isn't really acceptable. Since there is enough information for 2295 * the user to know what has failed and why, this seems like a more 2296 * tractable approach. 2297 */ 2298 complete = (error == 0); 2299 2300 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2301 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2302 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2303 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2304 2305 mutex_exit(&spa->spa_scrub_lock); 2306 2307 /* 2308 * If the scrub/resilver completed, update all DTLs to reflect this. 2309 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2310 */ 2311 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2312 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2313 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2314 spa_errlog_rotate(spa); 2315 2316 spa_config_exit(spa, FTAG); 2317 2318 mutex_enter(&spa->spa_scrub_lock); 2319 2320 /* 2321 * We may have finished replacing a device. 2322 * Let the async thread assess this and handle the detach. 2323 */ 2324 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2325 2326 /* 2327 * If we were told to restart, our final act is to start a new scrub. 2328 */ 2329 if (error == ERESTART) 2330 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 2331 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2332 2333 spa->spa_scrub_type = POOL_SCRUB_NONE; 2334 spa->spa_scrub_active = 0; 2335 spa->spa_scrub_thread = NULL; 2336 cv_broadcast(&spa->spa_scrub_cv); 2337 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2338 thread_exit(); 2339 } 2340 2341 void 2342 spa_scrub_suspend(spa_t *spa) 2343 { 2344 mutex_enter(&spa->spa_scrub_lock); 2345 spa->spa_scrub_suspended++; 2346 while (spa->spa_scrub_active) { 2347 cv_broadcast(&spa->spa_scrub_cv); 2348 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2349 } 2350 while (spa->spa_scrub_inflight) 2351 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2352 mutex_exit(&spa->spa_scrub_lock); 2353 } 2354 2355 void 2356 spa_scrub_resume(spa_t *spa) 2357 { 2358 mutex_enter(&spa->spa_scrub_lock); 2359 ASSERT(spa->spa_scrub_suspended != 0); 2360 if (--spa->spa_scrub_suspended == 0) 2361 cv_broadcast(&spa->spa_scrub_cv); 2362 mutex_exit(&spa->spa_scrub_lock); 2363 } 2364 2365 void 2366 spa_scrub_restart(spa_t *spa, uint64_t txg) 2367 { 2368 /* 2369 * Something happened (e.g. snapshot create/delete) that means 2370 * we must restart any in-progress scrubs. The itinerary will 2371 * fix this properly. 2372 */ 2373 mutex_enter(&spa->spa_scrub_lock); 2374 spa->spa_scrub_restart_txg = txg; 2375 mutex_exit(&spa->spa_scrub_lock); 2376 } 2377 2378 int 2379 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2380 { 2381 space_seg_t *ss; 2382 uint64_t mintxg, maxtxg; 2383 vdev_t *rvd = spa->spa_root_vdev; 2384 2385 if ((uint_t)type >= POOL_SCRUB_TYPES) 2386 return (ENOTSUP); 2387 2388 mutex_enter(&spa->spa_scrub_lock); 2389 2390 /* 2391 * If there's a scrub or resilver already in progress, stop it. 2392 */ 2393 while (spa->spa_scrub_thread != NULL) { 2394 /* 2395 * Don't stop a resilver unless forced. 2396 */ 2397 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2398 mutex_exit(&spa->spa_scrub_lock); 2399 return (EBUSY); 2400 } 2401 spa->spa_scrub_stop = 1; 2402 cv_broadcast(&spa->spa_scrub_cv); 2403 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2404 } 2405 2406 /* 2407 * Terminate the previous traverse. 2408 */ 2409 if (spa->spa_scrub_th != NULL) { 2410 traverse_fini(spa->spa_scrub_th); 2411 spa->spa_scrub_th = NULL; 2412 } 2413 2414 if (rvd == NULL) { 2415 ASSERT(spa->spa_scrub_stop == 0); 2416 ASSERT(spa->spa_scrub_type == type); 2417 ASSERT(spa->spa_scrub_restart_txg == 0); 2418 mutex_exit(&spa->spa_scrub_lock); 2419 return (0); 2420 } 2421 2422 mintxg = TXG_INITIAL - 1; 2423 maxtxg = spa_last_synced_txg(spa) + 1; 2424 2425 mutex_enter(&rvd->vdev_dtl_lock); 2426 2427 if (rvd->vdev_dtl_map.sm_space == 0) { 2428 /* 2429 * The pool-wide DTL is empty. 2430 * If this is a resilver, there's nothing to do except 2431 * check whether any in-progress replacements have completed. 2432 */ 2433 if (type == POOL_SCRUB_RESILVER) { 2434 type = POOL_SCRUB_NONE; 2435 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2436 } 2437 } else { 2438 /* 2439 * The pool-wide DTL is non-empty. 2440 * If this is a normal scrub, upgrade to a resilver instead. 2441 */ 2442 if (type == POOL_SCRUB_EVERYTHING) 2443 type = POOL_SCRUB_RESILVER; 2444 } 2445 2446 if (type == POOL_SCRUB_RESILVER) { 2447 /* 2448 * Determine the resilvering boundaries. 2449 * 2450 * Note: (mintxg, maxtxg) is an open interval, 2451 * i.e. mintxg and maxtxg themselves are not included. 2452 * 2453 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2454 * so we don't claim to resilver a txg that's still changing. 2455 */ 2456 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2457 mintxg = ss->ss_start - 1; 2458 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2459 maxtxg = MIN(ss->ss_end, maxtxg); 2460 } 2461 2462 mutex_exit(&rvd->vdev_dtl_lock); 2463 2464 spa->spa_scrub_stop = 0; 2465 spa->spa_scrub_type = type; 2466 spa->spa_scrub_restart_txg = 0; 2467 2468 if (type != POOL_SCRUB_NONE) { 2469 spa->spa_scrub_mintxg = mintxg; 2470 spa->spa_scrub_maxtxg = maxtxg; 2471 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2472 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2473 ZIO_FLAG_CANFAIL); 2474 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2475 spa->spa_scrub_thread = thread_create(NULL, 0, 2476 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2477 } 2478 2479 mutex_exit(&spa->spa_scrub_lock); 2480 2481 return (0); 2482 } 2483 2484 /* 2485 * ========================================================================== 2486 * SPA async task processing 2487 * ========================================================================== 2488 */ 2489 2490 static void 2491 spa_async_reopen(spa_t *spa) 2492 { 2493 vdev_t *rvd = spa->spa_root_vdev; 2494 vdev_t *tvd; 2495 int c; 2496 2497 spa_config_enter(spa, RW_WRITER, FTAG); 2498 2499 for (c = 0; c < rvd->vdev_children; c++) { 2500 tvd = rvd->vdev_child[c]; 2501 if (tvd->vdev_reopen_wanted) { 2502 tvd->vdev_reopen_wanted = 0; 2503 vdev_reopen(tvd); 2504 } 2505 } 2506 2507 spa_config_exit(spa, FTAG); 2508 } 2509 2510 static void 2511 spa_async_thread(spa_t *spa) 2512 { 2513 int tasks; 2514 2515 ASSERT(spa->spa_sync_on); 2516 2517 mutex_enter(&spa->spa_async_lock); 2518 tasks = spa->spa_async_tasks; 2519 spa->spa_async_tasks = 0; 2520 mutex_exit(&spa->spa_async_lock); 2521 2522 /* 2523 * See if the config needs to be updated. 2524 */ 2525 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2526 mutex_enter(&spa_namespace_lock); 2527 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2528 mutex_exit(&spa_namespace_lock); 2529 } 2530 2531 /* 2532 * See if any devices need to be reopened. 2533 */ 2534 if (tasks & SPA_ASYNC_REOPEN) 2535 spa_async_reopen(spa); 2536 2537 /* 2538 * If any devices are done replacing, detach them. 2539 */ 2540 if (tasks & SPA_ASYNC_REPLACE_DONE) 2541 spa_vdev_replace_done(spa); 2542 2543 /* 2544 * Kick off a scrub. 2545 */ 2546 if (tasks & SPA_ASYNC_SCRUB) 2547 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2548 2549 /* 2550 * Kick off a resilver. 2551 */ 2552 if (tasks & SPA_ASYNC_RESILVER) 2553 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2554 2555 /* 2556 * Let the world know that we're done. 2557 */ 2558 mutex_enter(&spa->spa_async_lock); 2559 spa->spa_async_thread = NULL; 2560 cv_broadcast(&spa->spa_async_cv); 2561 mutex_exit(&spa->spa_async_lock); 2562 thread_exit(); 2563 } 2564 2565 void 2566 spa_async_suspend(spa_t *spa) 2567 { 2568 mutex_enter(&spa->spa_async_lock); 2569 spa->spa_async_suspended++; 2570 while (spa->spa_async_thread != NULL) 2571 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2572 mutex_exit(&spa->spa_async_lock); 2573 } 2574 2575 void 2576 spa_async_resume(spa_t *spa) 2577 { 2578 mutex_enter(&spa->spa_async_lock); 2579 ASSERT(spa->spa_async_suspended != 0); 2580 spa->spa_async_suspended--; 2581 mutex_exit(&spa->spa_async_lock); 2582 } 2583 2584 static void 2585 spa_async_dispatch(spa_t *spa) 2586 { 2587 mutex_enter(&spa->spa_async_lock); 2588 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2589 spa->spa_async_thread == NULL && 2590 rootdir != NULL && !vn_is_readonly(rootdir)) 2591 spa->spa_async_thread = thread_create(NULL, 0, 2592 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2593 mutex_exit(&spa->spa_async_lock); 2594 } 2595 2596 void 2597 spa_async_request(spa_t *spa, int task) 2598 { 2599 mutex_enter(&spa->spa_async_lock); 2600 spa->spa_async_tasks |= task; 2601 mutex_exit(&spa->spa_async_lock); 2602 } 2603 2604 /* 2605 * ========================================================================== 2606 * SPA syncing routines 2607 * ========================================================================== 2608 */ 2609 2610 static void 2611 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2612 { 2613 bplist_t *bpl = &spa->spa_sync_bplist; 2614 dmu_tx_t *tx; 2615 blkptr_t blk; 2616 uint64_t itor = 0; 2617 zio_t *zio; 2618 int error; 2619 uint8_t c = 1; 2620 2621 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2622 2623 while (bplist_iterate(bpl, &itor, &blk) == 0) 2624 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2625 2626 error = zio_wait(zio); 2627 ASSERT3U(error, ==, 0); 2628 2629 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2630 bplist_vacate(bpl, tx); 2631 2632 /* 2633 * Pre-dirty the first block so we sync to convergence faster. 2634 * (Usually only the first block is needed.) 2635 */ 2636 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2637 dmu_tx_commit(tx); 2638 } 2639 2640 static void 2641 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2642 { 2643 char *packed = NULL; 2644 size_t nvsize = 0; 2645 dmu_buf_t *db; 2646 2647 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2648 2649 packed = kmem_alloc(nvsize, KM_SLEEP); 2650 2651 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2652 KM_SLEEP) == 0); 2653 2654 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2655 2656 kmem_free(packed, nvsize); 2657 2658 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2659 dmu_buf_will_dirty(db, tx); 2660 *(uint64_t *)db->db_data = nvsize; 2661 dmu_buf_rele(db, FTAG); 2662 } 2663 2664 static void 2665 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2666 { 2667 nvlist_t *nvroot; 2668 nvlist_t **spares; 2669 int i; 2670 2671 if (!spa->spa_sync_spares) 2672 return; 2673 2674 /* 2675 * Update the MOS nvlist describing the list of available spares. 2676 * spa_validate_spares() will have already made sure this nvlist is 2677 * valid and the vdevs are labelled appropriately. 2678 */ 2679 if (spa->spa_spares_object == 0) { 2680 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2681 DMU_OT_PACKED_NVLIST, 1 << 14, 2682 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2683 VERIFY(zap_update(spa->spa_meta_objset, 2684 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2685 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2686 } 2687 2688 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2689 if (spa->spa_nspares == 0) { 2690 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2691 NULL, 0) == 0); 2692 } else { 2693 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2694 KM_SLEEP); 2695 for (i = 0; i < spa->spa_nspares; i++) 2696 spares[i] = vdev_config_generate(spa, 2697 spa->spa_spares[i], B_FALSE, B_TRUE); 2698 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2699 spares, spa->spa_nspares) == 0); 2700 for (i = 0; i < spa->spa_nspares; i++) 2701 nvlist_free(spares[i]); 2702 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2703 } 2704 2705 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2706 2707 spa->spa_sync_spares = B_FALSE; 2708 } 2709 2710 static void 2711 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2712 { 2713 nvlist_t *config; 2714 2715 if (list_is_empty(&spa->spa_dirty_list)) 2716 return; 2717 2718 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2719 2720 if (spa->spa_config_syncing) 2721 nvlist_free(spa->spa_config_syncing); 2722 spa->spa_config_syncing = config; 2723 2724 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2725 } 2726 2727 /* 2728 * Sync the specified transaction group. New blocks may be dirtied as 2729 * part of the process, so we iterate until it converges. 2730 */ 2731 void 2732 spa_sync(spa_t *spa, uint64_t txg) 2733 { 2734 dsl_pool_t *dp = spa->spa_dsl_pool; 2735 objset_t *mos = spa->spa_meta_objset; 2736 bplist_t *bpl = &spa->spa_sync_bplist; 2737 vdev_t *rvd = spa->spa_root_vdev; 2738 vdev_t *vd; 2739 dmu_tx_t *tx; 2740 int dirty_vdevs; 2741 2742 /* 2743 * Lock out configuration changes. 2744 */ 2745 spa_config_enter(spa, RW_READER, FTAG); 2746 2747 spa->spa_syncing_txg = txg; 2748 spa->spa_sync_pass = 0; 2749 2750 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2751 2752 tx = dmu_tx_create_assigned(dp, txg); 2753 2754 /* 2755 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2756 * set spa_deflate if we have no raid-z vdevs. 2757 */ 2758 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2759 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2760 int i; 2761 2762 for (i = 0; i < rvd->vdev_children; i++) { 2763 vd = rvd->vdev_child[i]; 2764 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2765 break; 2766 } 2767 if (i == rvd->vdev_children) { 2768 spa->spa_deflate = TRUE; 2769 VERIFY(0 == zap_add(spa->spa_meta_objset, 2770 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2771 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2772 } 2773 } 2774 2775 /* 2776 * If anything has changed in this txg, push the deferred frees 2777 * from the previous txg. If not, leave them alone so that we 2778 * don't generate work on an otherwise idle system. 2779 */ 2780 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2781 !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2782 spa_sync_deferred_frees(spa, txg); 2783 2784 /* 2785 * Iterate to convergence. 2786 */ 2787 do { 2788 spa->spa_sync_pass++; 2789 2790 spa_sync_config_object(spa, tx); 2791 spa_sync_spares(spa, tx); 2792 spa_errlog_sync(spa, txg); 2793 dsl_pool_sync(dp, txg); 2794 2795 dirty_vdevs = 0; 2796 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2797 vdev_sync(vd, txg); 2798 dirty_vdevs++; 2799 } 2800 2801 bplist_sync(bpl, tx); 2802 } while (dirty_vdevs); 2803 2804 bplist_close(bpl); 2805 2806 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2807 2808 /* 2809 * Rewrite the vdev configuration (which includes the uberblock) 2810 * to commit the transaction group. 2811 * 2812 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2813 * Otherwise, pick a random top-level vdev that's known to be 2814 * visible in the config cache (see spa_vdev_add() for details). 2815 * If the write fails, try the next vdev until we're tried them all. 2816 */ 2817 if (!list_is_empty(&spa->spa_dirty_list)) { 2818 VERIFY(vdev_config_sync(rvd, txg) == 0); 2819 } else { 2820 int children = rvd->vdev_children; 2821 int c0 = spa_get_random(children); 2822 int c; 2823 2824 for (c = 0; c < children; c++) { 2825 vd = rvd->vdev_child[(c0 + c) % children]; 2826 if (vd->vdev_ms_array == 0) 2827 continue; 2828 if (vdev_config_sync(vd, txg) == 0) 2829 break; 2830 } 2831 if (c == children) 2832 VERIFY(vdev_config_sync(rvd, txg) == 0); 2833 } 2834 2835 dmu_tx_commit(tx); 2836 2837 /* 2838 * Clear the dirty config list. 2839 */ 2840 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2841 vdev_config_clean(vd); 2842 2843 /* 2844 * Now that the new config has synced transactionally, 2845 * let it become visible to the config cache. 2846 */ 2847 if (spa->spa_config_syncing != NULL) { 2848 spa_config_set(spa, spa->spa_config_syncing); 2849 spa->spa_config_txg = txg; 2850 spa->spa_config_syncing = NULL; 2851 } 2852 2853 /* 2854 * Make a stable copy of the fully synced uberblock. 2855 * We use this as the root for pool traversals. 2856 */ 2857 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2858 2859 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2860 2861 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2862 spa->spa_traverse_wanted = 0; 2863 spa->spa_ubsync = spa->spa_uberblock; 2864 rw_exit(&spa->spa_traverse_lock); 2865 2866 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2867 2868 /* 2869 * Clean up the ZIL records for the synced txg. 2870 */ 2871 dsl_pool_zil_clean(dp); 2872 2873 /* 2874 * Update usable space statistics. 2875 */ 2876 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2877 vdev_sync_done(vd, txg); 2878 2879 /* 2880 * It had better be the case that we didn't dirty anything 2881 * since vdev_config_sync(). 2882 */ 2883 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2884 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2885 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2886 ASSERT(bpl->bpl_queue == NULL); 2887 2888 spa_config_exit(spa, FTAG); 2889 2890 /* 2891 * If any async tasks have been requested, kick them off. 2892 */ 2893 spa_async_dispatch(spa); 2894 } 2895 2896 /* 2897 * Sync all pools. We don't want to hold the namespace lock across these 2898 * operations, so we take a reference on the spa_t and drop the lock during the 2899 * sync. 2900 */ 2901 void 2902 spa_sync_allpools(void) 2903 { 2904 spa_t *spa = NULL; 2905 mutex_enter(&spa_namespace_lock); 2906 while ((spa = spa_next(spa)) != NULL) { 2907 if (spa_state(spa) != POOL_STATE_ACTIVE) 2908 continue; 2909 spa_open_ref(spa, FTAG); 2910 mutex_exit(&spa_namespace_lock); 2911 txg_wait_synced(spa_get_dsl(spa), 0); 2912 mutex_enter(&spa_namespace_lock); 2913 spa_close(spa, FTAG); 2914 } 2915 mutex_exit(&spa_namespace_lock); 2916 } 2917 2918 /* 2919 * ========================================================================== 2920 * Miscellaneous routines 2921 * ========================================================================== 2922 */ 2923 2924 /* 2925 * Remove all pools in the system. 2926 */ 2927 void 2928 spa_evict_all(void) 2929 { 2930 spa_t *spa; 2931 2932 /* 2933 * Remove all cached state. All pools should be closed now, 2934 * so every spa in the AVL tree should be unreferenced. 2935 */ 2936 mutex_enter(&spa_namespace_lock); 2937 while ((spa = spa_next(NULL)) != NULL) { 2938 /* 2939 * Stop async tasks. The async thread may need to detach 2940 * a device that's been replaced, which requires grabbing 2941 * spa_namespace_lock, so we must drop it here. 2942 */ 2943 spa_open_ref(spa, FTAG); 2944 mutex_exit(&spa_namespace_lock); 2945 spa_async_suspend(spa); 2946 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2947 mutex_enter(&spa_namespace_lock); 2948 spa_close(spa, FTAG); 2949 2950 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2951 spa_unload(spa); 2952 spa_deactivate(spa); 2953 } 2954 spa_remove(spa); 2955 } 2956 mutex_exit(&spa_namespace_lock); 2957 } 2958 2959 vdev_t * 2960 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2961 { 2962 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2963 } 2964 2965 void 2966 spa_upgrade(spa_t *spa) 2967 { 2968 spa_config_enter(spa, RW_WRITER, FTAG); 2969 2970 /* 2971 * This should only be called for a non-faulted pool, and since a 2972 * future version would result in an unopenable pool, this shouldn't be 2973 * possible. 2974 */ 2975 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 2976 2977 spa->spa_uberblock.ub_version = ZFS_VERSION; 2978 vdev_config_dirty(spa->spa_root_vdev); 2979 2980 spa_config_exit(spa, FTAG); 2981 2982 txg_wait_synced(spa_get_dsl(spa), 0); 2983 } 2984 2985 boolean_t 2986 spa_has_spare(spa_t *spa, uint64_t guid) 2987 { 2988 int i; 2989 2990 for (i = 0; i < spa->spa_nspares; i++) 2991 if (spa->spa_spares[i]->vdev_guid == guid) 2992 return (B_TRUE); 2993 2994 return (B_FALSE); 2995 } 2996