1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/unique.h> 52 #include <sys/dsl_pool.h> 53 #include <sys/dsl_dir.h> 54 #include <sys/dsl_prop.h> 55 #include <sys/fs/zfs.h> 56 #include <sys/callb.h> 57 58 /* 59 * ========================================================================== 60 * SPA state manipulation (open/create/destroy/import/export) 61 * ========================================================================== 62 */ 63 64 static int 65 spa_error_entry_compare(const void *a, const void *b) 66 { 67 spa_error_entry_t *sa = (spa_error_entry_t *)a; 68 spa_error_entry_t *sb = (spa_error_entry_t *)b; 69 int ret; 70 71 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 72 sizeof (zbookmark_t)); 73 74 if (ret < 0) 75 return (-1); 76 else if (ret > 0) 77 return (1); 78 else 79 return (0); 80 } 81 82 /* 83 * Utility function which retrieves copies of the current logs and 84 * re-initializes them in the process. 85 */ 86 void 87 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 88 { 89 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 90 91 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 92 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 93 94 avl_create(&spa->spa_errlist_scrub, 95 spa_error_entry_compare, sizeof (spa_error_entry_t), 96 offsetof(spa_error_entry_t, se_avl)); 97 avl_create(&spa->spa_errlist_last, 98 spa_error_entry_compare, sizeof (spa_error_entry_t), 99 offsetof(spa_error_entry_t, se_avl)); 100 } 101 102 /* 103 * Activate an uninitialized pool. 
104 */ 105 static void 106 spa_activate(spa_t *spa) 107 { 108 int t; 109 110 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 111 112 spa->spa_state = POOL_STATE_ACTIVE; 113 114 spa->spa_normal_class = metaslab_class_create(); 115 116 for (t = 0; t < ZIO_TYPES; t++) { 117 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 118 8, maxclsyspri, 50, INT_MAX, 119 TASKQ_PREPOPULATE); 120 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 121 8, maxclsyspri, 50, INT_MAX, 122 TASKQ_PREPOPULATE); 123 } 124 125 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 126 127 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 128 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 129 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 130 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 131 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 132 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 133 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 134 135 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 136 offsetof(vdev_t, vdev_dirty_node)); 137 138 txg_list_create(&spa->spa_vdev_txg_list, 139 offsetof(struct vdev, vdev_txg_node)); 140 141 avl_create(&spa->spa_errlist_scrub, 142 spa_error_entry_compare, sizeof (spa_error_entry_t), 143 offsetof(spa_error_entry_t, se_avl)); 144 avl_create(&spa->spa_errlist_last, 145 spa_error_entry_compare, sizeof (spa_error_entry_t), 146 offsetof(spa_error_entry_t, se_avl)); 147 } 148 149 /* 150 * Opposite of spa_activate(). 151 */ 152 static void 153 spa_deactivate(spa_t *spa) 154 { 155 int t; 156 157 ASSERT(spa->spa_sync_on == B_FALSE); 158 ASSERT(spa->spa_dsl_pool == NULL); 159 ASSERT(spa->spa_root_vdev == NULL); 160 161 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 162 163 txg_list_destroy(&spa->spa_vdev_txg_list); 164 165 list_destroy(&spa->spa_dirty_list); 166 167 rw_destroy(&spa->spa_traverse_lock); 168 169 for (t = 0; t < ZIO_TYPES; t++) { 170 taskq_destroy(spa->spa_zio_issue_taskq[t]); 171 taskq_destroy(spa->spa_zio_intr_taskq[t]); 172 spa->spa_zio_issue_taskq[t] = NULL; 173 spa->spa_zio_intr_taskq[t] = NULL; 174 } 175 176 metaslab_class_destroy(spa->spa_normal_class); 177 spa->spa_normal_class = NULL; 178 179 /* 180 * If this was part of an import or the open otherwise failed, we may 181 * still have errors left in the queues. Empty them just in case. 182 */ 183 spa_errlog_drain(spa); 184 185 avl_destroy(&spa->spa_errlist_scrub); 186 avl_destroy(&spa->spa_errlist_last); 187 188 spa->spa_state = POOL_STATE_UNINITIALIZED; 189 } 190 191 /* 192 * Verify a pool configuration, and construct the vdev tree appropriately. This 193 * will create all the necessary vdevs in the appropriate layout, with each vdev 194 * in the CLOSED state. This will prep the pool before open/creation/import. 195 * All vdev validation is done by the vdev_alloc() routine. 
196 */ 197 static int 198 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 199 uint_t id, int atype) 200 { 201 nvlist_t **child; 202 uint_t c, children; 203 int error; 204 205 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 206 return (error); 207 208 if ((*vdp)->vdev_ops->vdev_op_leaf) 209 return (0); 210 211 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 212 &child, &children) != 0) { 213 vdev_free(*vdp); 214 *vdp = NULL; 215 return (EINVAL); 216 } 217 218 for (c = 0; c < children; c++) { 219 vdev_t *vd; 220 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 221 atype)) != 0) { 222 vdev_free(*vdp); 223 *vdp = NULL; 224 return (error); 225 } 226 } 227 228 ASSERT(*vdp != NULL); 229 230 return (0); 231 } 232 233 /* 234 * Opposite of spa_load(). 235 */ 236 static void 237 spa_unload(spa_t *spa) 238 { 239 int i; 240 241 /* 242 * Stop async tasks. 243 */ 244 spa_async_suspend(spa); 245 246 /* 247 * Stop syncing. 248 */ 249 if (spa->spa_sync_on) { 250 txg_sync_stop(spa->spa_dsl_pool); 251 spa->spa_sync_on = B_FALSE; 252 } 253 254 /* 255 * Wait for any outstanding prefetch I/O to complete. 256 */ 257 spa_config_enter(spa, RW_WRITER, FTAG); 258 spa_config_exit(spa, FTAG); 259 260 /* 261 * Close the dsl pool. 262 */ 263 if (spa->spa_dsl_pool) { 264 dsl_pool_close(spa->spa_dsl_pool); 265 spa->spa_dsl_pool = NULL; 266 } 267 268 /* 269 * Close all vdevs. 270 */ 271 if (spa->spa_root_vdev) 272 vdev_free(spa->spa_root_vdev); 273 ASSERT(spa->spa_root_vdev == NULL); 274 275 for (i = 0; i < spa->spa_nspares; i++) 276 vdev_free(spa->spa_spares[i]); 277 if (spa->spa_spares) { 278 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 279 spa->spa_spares = NULL; 280 } 281 if (spa->spa_sparelist) { 282 nvlist_free(spa->spa_sparelist); 283 spa->spa_sparelist = NULL; 284 } 285 286 spa->spa_async_suspended = 0; 287 } 288 289 /* 290 * Load (or re-load) the current list of vdevs describing the active spares for 291 * this pool. When this is called, we have some form of basic information in 292 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 293 * re-generate a more complete list including status information. 294 */ 295 static void 296 spa_load_spares(spa_t *spa) 297 { 298 nvlist_t **spares; 299 uint_t nspares; 300 int i; 301 302 /* 303 * First, close and free any existing spare vdevs. 304 */ 305 for (i = 0; i < spa->spa_nspares; i++) { 306 vdev_close(spa->spa_spares[i]); 307 vdev_free(spa->spa_spares[i]); 308 } 309 if (spa->spa_spares) 310 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 311 312 if (spa->spa_sparelist == NULL) 313 nspares = 0; 314 else 315 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 316 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 317 318 spa->spa_nspares = (int)nspares; 319 spa->spa_spares = NULL; 320 321 if (nspares == 0) 322 return; 323 324 /* 325 * Construct the array of vdevs, opening them to get status in the 326 * process. 327 */ 328 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 329 for (i = 0; i < spa->spa_nspares; i++) { 330 vdev_t *vd; 331 332 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 333 VDEV_ALLOC_SPARE) == 0); 334 ASSERT(vd != NULL); 335 336 spa->spa_spares[i] = vd; 337 338 if (vdev_open(vd) != 0) 339 continue; 340 341 vd->vdev_top = vd; 342 (void) vdev_validate_spare(vd); 343 } 344 345 /* 346 * Recompute the stashed list of spares, with status information 347 * this time. 
348 */ 349 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 350 DATA_TYPE_NVLIST_ARRAY) == 0); 351 352 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 353 for (i = 0; i < spa->spa_nspares; i++) 354 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 355 B_TRUE, B_TRUE); 356 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 357 spares, spa->spa_nspares) == 0); 358 for (i = 0; i < spa->spa_nspares; i++) 359 nvlist_free(spares[i]); 360 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 361 } 362 363 static int 364 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 365 { 366 dmu_buf_t *db; 367 char *packed = NULL; 368 size_t nvsize = 0; 369 int error; 370 *value = NULL; 371 372 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 373 nvsize = *(uint64_t *)db->db_data; 374 dmu_buf_rele(db, FTAG); 375 376 packed = kmem_alloc(nvsize, KM_SLEEP); 377 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 378 if (error == 0) 379 error = nvlist_unpack(packed, nvsize, value, 0); 380 kmem_free(packed, nvsize); 381 382 return (error); 383 } 384 385 /* 386 * Load an existing storage pool, using the pool's builtin spa_config as a 387 * source of configuration information. 388 */ 389 static int 390 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 391 { 392 int error = 0; 393 nvlist_t *nvroot = NULL; 394 vdev_t *rvd; 395 uberblock_t *ub = &spa->spa_uberblock; 396 uint64_t config_cache_txg = spa->spa_config_txg; 397 uint64_t pool_guid; 398 uint64_t version; 399 zio_t *zio; 400 401 spa->spa_load_state = state; 402 403 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 404 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 405 error = EINVAL; 406 goto out; 407 } 408 409 /* 410 * Versioning wasn't explicitly added to the label until later, so if 411 * it's not present treat it as the initial version. 412 */ 413 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 414 version = ZFS_VERSION_INITIAL; 415 416 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 417 &spa->spa_config_txg); 418 419 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 420 spa_guid_exists(pool_guid, 0)) { 421 error = EEXIST; 422 goto out; 423 } 424 425 spa->spa_load_guid = pool_guid; 426 427 /* 428 * Parse the configuration into a vdev tree. We explicitly set the 429 * value that will be returned by spa_version() since parsing the 430 * configuration requires knowing the version number. 431 */ 432 spa_config_enter(spa, RW_WRITER, FTAG); 433 spa->spa_ubsync.ub_version = version; 434 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 435 spa_config_exit(spa, FTAG); 436 437 if (error != 0) 438 goto out; 439 440 ASSERT(spa->spa_root_vdev == rvd); 441 ASSERT(spa_guid(spa) == pool_guid); 442 443 /* 444 * Try to open all vdevs, loading each label in the process. 445 */ 446 if (vdev_open(rvd) != 0) { 447 error = ENXIO; 448 goto out; 449 } 450 451 /* 452 * Validate the labels for all leaf vdevs. We need to grab the config 453 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 454 * flag. 455 */ 456 spa_config_enter(spa, RW_READER, FTAG); 457 error = vdev_validate(rvd); 458 spa_config_exit(spa, FTAG); 459 460 if (error != 0) { 461 error = EBADF; 462 goto out; 463 } 464 465 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 466 error = ENXIO; 467 goto out; 468 } 469 470 /* 471 * Find the best uberblock. 
472 */ 473 bzero(ub, sizeof (uberblock_t)); 474 475 zio = zio_root(spa, NULL, NULL, 476 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 477 vdev_uberblock_load(zio, rvd, ub); 478 error = zio_wait(zio); 479 480 /* 481 * If we weren't able to find a single valid uberblock, return failure. 482 */ 483 if (ub->ub_txg == 0) { 484 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 485 VDEV_AUX_CORRUPT_DATA); 486 error = ENXIO; 487 goto out; 488 } 489 490 /* 491 * If the pool is newer than the code, we can't open it. 492 */ 493 if (ub->ub_version > ZFS_VERSION) { 494 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 495 VDEV_AUX_VERSION_NEWER); 496 error = ENOTSUP; 497 goto out; 498 } 499 500 /* 501 * If the vdev guid sum doesn't match the uberblock, we have an 502 * incomplete configuration. 503 */ 504 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 505 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 506 VDEV_AUX_BAD_GUID_SUM); 507 error = ENXIO; 508 goto out; 509 } 510 511 /* 512 * Initialize internal SPA structures. 513 */ 514 spa->spa_state = POOL_STATE_ACTIVE; 515 spa->spa_ubsync = spa->spa_uberblock; 516 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 517 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 518 if (error) { 519 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 520 VDEV_AUX_CORRUPT_DATA); 521 goto out; 522 } 523 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 524 525 if (zap_lookup(spa->spa_meta_objset, 526 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 527 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 528 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 529 VDEV_AUX_CORRUPT_DATA); 530 error = EIO; 531 goto out; 532 } 533 534 if (!mosconfig) { 535 nvlist_t *newconfig; 536 537 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 538 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 539 VDEV_AUX_CORRUPT_DATA); 540 error = EIO; 541 goto out; 542 } 543 544 spa_config_set(spa, newconfig); 545 spa_unload(spa); 546 spa_deactivate(spa); 547 spa_activate(spa); 548 549 return (spa_load(spa, newconfig, state, B_TRUE)); 550 } 551 552 if (zap_lookup(spa->spa_meta_objset, 553 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 554 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 555 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 556 VDEV_AUX_CORRUPT_DATA); 557 error = EIO; 558 goto out; 559 } 560 561 /* 562 * Load the bit that tells us to use the new accounting function 563 * (raid-z deflation). If we have an older pool, this will not 564 * be present. 565 */ 566 error = zap_lookup(spa->spa_meta_objset, 567 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 568 sizeof (uint64_t), 1, &spa->spa_deflate); 569 if (error != 0 && error != ENOENT) { 570 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 571 VDEV_AUX_CORRUPT_DATA); 572 error = EIO; 573 goto out; 574 } 575 576 /* 577 * Load the persistent error log. If we have an older pool, this will 578 * not be present. 
579 */ 580 error = zap_lookup(spa->spa_meta_objset, 581 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 582 sizeof (uint64_t), 1, &spa->spa_errlog_last); 583 if (error != 0 && error != ENOENT) { 584 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 585 VDEV_AUX_CORRUPT_DATA); 586 error = EIO; 587 goto out; 588 } 589 590 error = zap_lookup(spa->spa_meta_objset, 591 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 592 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 593 if (error != 0 && error != ENOENT) { 594 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 595 VDEV_AUX_CORRUPT_DATA); 596 error = EIO; 597 goto out; 598 } 599 600 /* 601 * Load any hot spares for this pool. 602 */ 603 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 604 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 605 if (error != 0 && error != ENOENT) { 606 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 607 VDEV_AUX_CORRUPT_DATA); 608 error = EIO; 609 goto out; 610 } 611 if (error == 0) { 612 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 613 if (load_nvlist(spa, spa->spa_spares_object, 614 &spa->spa_sparelist) != 0) { 615 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 616 VDEV_AUX_CORRUPT_DATA); 617 error = EIO; 618 goto out; 619 } 620 621 spa_config_enter(spa, RW_WRITER, FTAG); 622 spa_load_spares(spa); 623 spa_config_exit(spa, FTAG); 624 } 625 626 /* 627 * Load the vdev state for all toplevel vdevs. 628 */ 629 vdev_load(rvd); 630 631 /* 632 * Propagate the leaf DTLs we just loaded all the way up the tree. 633 */ 634 spa_config_enter(spa, RW_WRITER, FTAG); 635 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 636 spa_config_exit(spa, FTAG); 637 638 /* 639 * Check the state of the root vdev. If it can't be opened, it 640 * indicates one or more toplevel vdevs are faulted. 641 */ 642 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 643 error = ENXIO; 644 goto out; 645 } 646 647 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 648 dmu_tx_t *tx; 649 int need_update = B_FALSE; 650 int c; 651 652 /* 653 * Claim log blocks that haven't been committed yet. 654 * This must all happen in a single txg. 655 */ 656 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 657 spa_first_txg(spa)); 658 (void) dmu_objset_find(spa->spa_name, 659 zil_claim, tx, DS_FIND_CHILDREN); 660 dmu_tx_commit(tx); 661 662 spa->spa_sync_on = B_TRUE; 663 txg_sync_start(spa->spa_dsl_pool); 664 665 /* 666 * Wait for all claims to sync. 667 */ 668 txg_wait_synced(spa->spa_dsl_pool, 0); 669 670 /* 671 * If the config cache is stale, or we have uninitialized 672 * metaslabs (see spa_vdev_add()), then update the config. 673 */ 674 if (config_cache_txg != spa->spa_config_txg || 675 state == SPA_LOAD_IMPORT) 676 need_update = B_TRUE; 677 678 for (c = 0; c < rvd->vdev_children; c++) 679 if (rvd->vdev_child[c]->vdev_ms_array == 0) 680 need_update = B_TRUE; 681 682 /* 683 * Update the config cache asychronously in case we're the 684 * root pool, in which case the config cache isn't writable yet. 685 */ 686 if (need_update) 687 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 688 } 689 690 error = 0; 691 out: 692 if (error && error != EBADF) 693 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 694 spa->spa_load_state = SPA_LOAD_NONE; 695 spa->spa_ena = 0; 696 697 return (error); 698 } 699 700 /* 701 * Pool Open/Import 702 * 703 * The import case is identical to an open except that the configuration is sent 704 * down from userland, instead of grabbed from the configuration cache. 
For the 705 * case of an open, the pool configuration will exist in the 706 * POOL_STATE_UNITIALIZED state. 707 * 708 * The stats information (gen/count/ustats) is used to gather vdev statistics at 709 * the same time open the pool, without having to keep around the spa_t in some 710 * ambiguous state. 711 */ 712 static int 713 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 714 { 715 spa_t *spa; 716 int error; 717 int loaded = B_FALSE; 718 int locked = B_FALSE; 719 720 *spapp = NULL; 721 722 /* 723 * As disgusting as this is, we need to support recursive calls to this 724 * function because dsl_dir_open() is called during spa_load(), and ends 725 * up calling spa_open() again. The real fix is to figure out how to 726 * avoid dsl_dir_open() calling this in the first place. 727 */ 728 if (mutex_owner(&spa_namespace_lock) != curthread) { 729 mutex_enter(&spa_namespace_lock); 730 locked = B_TRUE; 731 } 732 733 if ((spa = spa_lookup(pool)) == NULL) { 734 if (locked) 735 mutex_exit(&spa_namespace_lock); 736 return (ENOENT); 737 } 738 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 739 740 spa_activate(spa); 741 742 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 743 744 if (error == EBADF) { 745 /* 746 * If vdev_validate() returns failure (indicated by 747 * EBADF), it indicates that one of the vdevs indicates 748 * that the pool has been exported or destroyed. If 749 * this is the case, the config cache is out of sync and 750 * we should remove the pool from the namespace. 751 */ 752 zfs_post_ok(spa, NULL); 753 spa_unload(spa); 754 spa_deactivate(spa); 755 spa_remove(spa); 756 spa_config_sync(); 757 if (locked) 758 mutex_exit(&spa_namespace_lock); 759 return (ENOENT); 760 } 761 762 if (error) { 763 /* 764 * We can't open the pool, but we still have useful 765 * information: the state of each vdev after the 766 * attempted vdev_open(). Return this to the user. 767 */ 768 if (config != NULL && spa->spa_root_vdev != NULL) { 769 spa_config_enter(spa, RW_READER, FTAG); 770 *config = spa_config_generate(spa, NULL, -1ULL, 771 B_TRUE); 772 spa_config_exit(spa, FTAG); 773 } 774 spa_unload(spa); 775 spa_deactivate(spa); 776 spa->spa_last_open_failed = B_TRUE; 777 if (locked) 778 mutex_exit(&spa_namespace_lock); 779 *spapp = NULL; 780 return (error); 781 } else { 782 zfs_post_ok(spa, NULL); 783 spa->spa_last_open_failed = B_FALSE; 784 } 785 786 loaded = B_TRUE; 787 } 788 789 spa_open_ref(spa, tag); 790 if (locked) 791 mutex_exit(&spa_namespace_lock); 792 793 *spapp = spa; 794 795 if (config != NULL) { 796 spa_config_enter(spa, RW_READER, FTAG); 797 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 798 spa_config_exit(spa, FTAG); 799 } 800 801 /* 802 * If we just loaded the pool, resilver anything that's out of date. 803 */ 804 if (loaded && (spa_mode & FWRITE)) 805 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 806 807 return (0); 808 } 809 810 int 811 spa_open(const char *name, spa_t **spapp, void *tag) 812 { 813 return (spa_open_common(name, spapp, tag, NULL)); 814 } 815 816 /* 817 * Lookup the given spa_t, incrementing the inject count in the process, 818 * preventing it from being exported or destroyed. 
819 */ 820 spa_t * 821 spa_inject_addref(char *name) 822 { 823 spa_t *spa; 824 825 mutex_enter(&spa_namespace_lock); 826 if ((spa = spa_lookup(name)) == NULL) { 827 mutex_exit(&spa_namespace_lock); 828 return (NULL); 829 } 830 spa->spa_inject_ref++; 831 mutex_exit(&spa_namespace_lock); 832 833 return (spa); 834 } 835 836 void 837 spa_inject_delref(spa_t *spa) 838 { 839 mutex_enter(&spa_namespace_lock); 840 spa->spa_inject_ref--; 841 mutex_exit(&spa_namespace_lock); 842 } 843 844 static void 845 spa_add_spares(spa_t *spa, nvlist_t *config) 846 { 847 nvlist_t **spares; 848 uint_t i, nspares; 849 nvlist_t *nvroot; 850 uint64_t guid; 851 vdev_stat_t *vs; 852 uint_t vsc; 853 854 if (spa->spa_nspares == 0) 855 return; 856 857 VERIFY(nvlist_lookup_nvlist(config, 858 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 859 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 860 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 861 if (nspares != 0) { 862 VERIFY(nvlist_add_nvlist_array(nvroot, 863 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 864 VERIFY(nvlist_lookup_nvlist_array(nvroot, 865 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 866 867 /* 868 * Go through and find any spares which have since been 869 * repurposed as an active spare. If this is the case, update 870 * their status appropriately. 871 */ 872 for (i = 0; i < nspares; i++) { 873 VERIFY(nvlist_lookup_uint64(spares[i], 874 ZPOOL_CONFIG_GUID, &guid) == 0); 875 if (spa_spare_inuse(guid)) { 876 VERIFY(nvlist_lookup_uint64_array( 877 spares[i], ZPOOL_CONFIG_STATS, 878 (uint64_t **)&vs, &vsc) == 0); 879 vs->vs_state = VDEV_STATE_CANT_OPEN; 880 vs->vs_aux = VDEV_AUX_SPARED; 881 } 882 } 883 } 884 } 885 886 int 887 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 888 { 889 int error; 890 spa_t *spa; 891 892 *config = NULL; 893 error = spa_open_common(name, &spa, FTAG, config); 894 895 if (spa && *config != NULL) { 896 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 897 spa_get_errlog_size(spa)) == 0); 898 899 spa_add_spares(spa, *config); 900 } 901 902 /* 903 * We want to get the alternate root even for faulted pools, so we cheat 904 * and call spa_lookup() directly. 905 */ 906 if (altroot) { 907 if (spa == NULL) { 908 mutex_enter(&spa_namespace_lock); 909 spa = spa_lookup(name); 910 if (spa) 911 spa_altroot(spa, altroot, buflen); 912 else 913 altroot[0] = '\0'; 914 spa = NULL; 915 mutex_exit(&spa_namespace_lock); 916 } else { 917 spa_altroot(spa, altroot, buflen); 918 } 919 } 920 921 if (spa != NULL) 922 spa_close(spa, FTAG); 923 924 return (error); 925 } 926 927 /* 928 * Validate that the 'spares' array is well formed. We must have an array of 929 * nvlists, each which describes a valid leaf vdev. 930 */ 931 static int 932 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 933 { 934 nvlist_t **spares; 935 uint_t i, nspares; 936 vdev_t *vd; 937 int error; 938 939 /* 940 * It's acceptable to have no spares specified. 941 */ 942 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 943 &spares, &nspares) != 0) 944 return (0); 945 946 if (nspares == 0) 947 return (EINVAL); 948 949 /* 950 * Make sure the pool is formatted with a version that supports hot 951 * spares. 
952 */ 953 if (spa_version(spa) < ZFS_VERSION_SPARES) 954 return (ENOTSUP); 955 956 for (i = 0; i < nspares; i++) { 957 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 958 mode)) != 0) 959 return (error); 960 961 if (!vd->vdev_ops->vdev_op_leaf) { 962 vdev_free(vd); 963 return (EINVAL); 964 } 965 966 if ((error = vdev_open(vd)) != 0) { 967 vdev_free(vd); 968 return (error); 969 } 970 971 vd->vdev_top = vd; 972 if ((error = vdev_label_spare(vd, crtxg)) != 0) { 973 vdev_free(vd); 974 return (error); 975 } 976 977 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 978 vd->vdev_guid) == 0); 979 980 vdev_free(vd); 981 } 982 983 return (0); 984 } 985 986 /* 987 * Pool Creation 988 */ 989 int 990 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 991 { 992 spa_t *spa; 993 vdev_t *rvd; 994 dsl_pool_t *dp; 995 dmu_tx_t *tx; 996 int c, error = 0; 997 uint64_t txg = TXG_INITIAL; 998 nvlist_t **spares; 999 uint_t nspares; 1000 1001 /* 1002 * If this pool already exists, return failure. 1003 */ 1004 mutex_enter(&spa_namespace_lock); 1005 if (spa_lookup(pool) != NULL) { 1006 mutex_exit(&spa_namespace_lock); 1007 return (EEXIST); 1008 } 1009 1010 /* 1011 * Allocate a new spa_t structure. 1012 */ 1013 spa = spa_add(pool, altroot); 1014 spa_activate(spa); 1015 1016 spa->spa_uberblock.ub_txg = txg - 1; 1017 spa->spa_uberblock.ub_version = ZFS_VERSION; 1018 spa->spa_ubsync = spa->spa_uberblock; 1019 1020 /* 1021 * Create the root vdev. 1022 */ 1023 spa_config_enter(spa, RW_WRITER, FTAG); 1024 1025 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1026 1027 ASSERT(error != 0 || rvd != NULL); 1028 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1029 1030 if (error == 0 && rvd->vdev_children == 0) 1031 error = EINVAL; 1032 1033 if (error == 0 && 1034 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1035 (error = spa_validate_spares(spa, nvroot, txg, 1036 VDEV_ALLOC_ADD)) == 0) { 1037 for (c = 0; c < rvd->vdev_children; c++) 1038 vdev_init(rvd->vdev_child[c], txg); 1039 vdev_config_dirty(rvd); 1040 } 1041 1042 spa_config_exit(spa, FTAG); 1043 1044 if (error != 0) { 1045 spa_unload(spa); 1046 spa_deactivate(spa); 1047 spa_remove(spa); 1048 mutex_exit(&spa_namespace_lock); 1049 return (error); 1050 } 1051 1052 /* 1053 * Get the list of spares, if specified. 1054 */ 1055 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1056 &spares, &nspares) == 0) { 1057 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1058 KM_SLEEP) == 0); 1059 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1060 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1061 spa_config_enter(spa, RW_WRITER, FTAG); 1062 spa_load_spares(spa); 1063 spa_config_exit(spa, FTAG); 1064 spa->spa_sync_spares = B_TRUE; 1065 } 1066 1067 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1068 spa->spa_meta_objset = dp->dp_meta_objset; 1069 1070 tx = dmu_tx_create_assigned(dp, txg); 1071 1072 /* 1073 * Create the pool config object. 1074 */ 1075 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1076 DMU_OT_PACKED_NVLIST, 1 << 14, 1077 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1078 1079 if (zap_add(spa->spa_meta_objset, 1080 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1081 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1082 cmn_err(CE_PANIC, "failed to add pool config"); 1083 } 1084 1085 /* Newly created pools are always deflated. 
*/ 1086 spa->spa_deflate = TRUE; 1087 if (zap_add(spa->spa_meta_objset, 1088 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1089 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1090 cmn_err(CE_PANIC, "failed to add deflate"); 1091 } 1092 1093 /* 1094 * Create the deferred-free bplist object. Turn off compression 1095 * because sync-to-convergence takes longer if the blocksize 1096 * keeps changing. 1097 */ 1098 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1099 1 << 14, tx); 1100 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1101 ZIO_COMPRESS_OFF, tx); 1102 1103 if (zap_add(spa->spa_meta_objset, 1104 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1105 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1106 cmn_err(CE_PANIC, "failed to add bplist"); 1107 } 1108 1109 dmu_tx_commit(tx); 1110 1111 spa->spa_sync_on = B_TRUE; 1112 txg_sync_start(spa->spa_dsl_pool); 1113 1114 /* 1115 * We explicitly wait for the first transaction to complete so that our 1116 * bean counters are appropriately updated. 1117 */ 1118 txg_wait_synced(spa->spa_dsl_pool, txg); 1119 1120 spa_config_sync(); 1121 1122 mutex_exit(&spa_namespace_lock); 1123 1124 return (0); 1125 } 1126 1127 /* 1128 * Import the given pool into the system. We set up the necessary spa_t and 1129 * then call spa_load() to do the dirty work. 1130 */ 1131 int 1132 spa_import(const char *pool, nvlist_t *config, const char *altroot) 1133 { 1134 spa_t *spa; 1135 int error; 1136 nvlist_t *nvroot; 1137 nvlist_t **spares; 1138 uint_t nspares; 1139 1140 if (!(spa_mode & FWRITE)) 1141 return (EROFS); 1142 1143 /* 1144 * If a pool with this name exists, return failure. 1145 */ 1146 mutex_enter(&spa_namespace_lock); 1147 if (spa_lookup(pool) != NULL) { 1148 mutex_exit(&spa_namespace_lock); 1149 return (EEXIST); 1150 } 1151 1152 /* 1153 * Create and initialize the spa structure. 1154 */ 1155 spa = spa_add(pool, altroot); 1156 spa_activate(spa); 1157 1158 /* 1159 * Pass off the heavy lifting to spa_load(). 1160 * Pass TRUE for mosconfig because the user-supplied config 1161 * is actually the one to trust when doing an import. 1162 */ 1163 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1164 1165 spa_config_enter(spa, RW_WRITER, FTAG); 1166 /* 1167 * Toss any existing sparelist, as it doesn't have any validity anymore, 1168 * and conflicts with spa_has_spare(). 1169 */ 1170 if (spa->spa_sparelist) { 1171 nvlist_free(spa->spa_sparelist); 1172 spa->spa_sparelist = NULL; 1173 spa_load_spares(spa); 1174 } 1175 1176 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1177 &nvroot) == 0); 1178 if (error == 0) 1179 error = spa_validate_spares(spa, nvroot, -1ULL, 1180 VDEV_ALLOC_SPARE); 1181 spa_config_exit(spa, FTAG); 1182 1183 if (error != 0) { 1184 spa_unload(spa); 1185 spa_deactivate(spa); 1186 spa_remove(spa); 1187 mutex_exit(&spa_namespace_lock); 1188 return (error); 1189 } 1190 1191 /* 1192 * Override any spares as specified by the user, as these may have 1193 * correct device names/devids, etc. 
1194 */ 1195 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1196 &spares, &nspares) == 0) { 1197 if (spa->spa_sparelist) 1198 VERIFY(nvlist_remove(spa->spa_sparelist, 1199 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1200 else 1201 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1202 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1203 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1204 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1205 spa_config_enter(spa, RW_WRITER, FTAG); 1206 spa_load_spares(spa); 1207 spa_config_exit(spa, FTAG); 1208 spa->spa_sync_spares = B_TRUE; 1209 } 1210 1211 /* 1212 * Update the config cache to include the newly-imported pool. 1213 */ 1214 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1215 1216 mutex_exit(&spa_namespace_lock); 1217 1218 /* 1219 * Resilver anything that's out of date. 1220 */ 1221 if (spa_mode & FWRITE) 1222 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1223 1224 return (0); 1225 } 1226 1227 /* 1228 * This (illegal) pool name is used when temporarily importing a spa_t in order 1229 * to get the vdev stats associated with the imported devices. 1230 */ 1231 #define TRYIMPORT_NAME "$import" 1232 1233 nvlist_t * 1234 spa_tryimport(nvlist_t *tryconfig) 1235 { 1236 nvlist_t *config = NULL; 1237 char *poolname; 1238 spa_t *spa; 1239 uint64_t state; 1240 1241 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1242 return (NULL); 1243 1244 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1245 return (NULL); 1246 1247 /* 1248 * Create and initialize the spa structure. 1249 */ 1250 mutex_enter(&spa_namespace_lock); 1251 spa = spa_add(TRYIMPORT_NAME, NULL); 1252 spa_activate(spa); 1253 1254 /* 1255 * Pass off the heavy lifting to spa_load(). 1256 * Pass TRUE for mosconfig because the user-supplied config 1257 * is actually the one to trust when doing an import. 1258 */ 1259 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1260 1261 /* 1262 * If 'tryconfig' was at least parsable, return the current config. 1263 */ 1264 if (spa->spa_root_vdev != NULL) { 1265 spa_config_enter(spa, RW_READER, FTAG); 1266 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1267 spa_config_exit(spa, FTAG); 1268 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1269 poolname) == 0); 1270 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1271 state) == 0); 1272 1273 /* 1274 * Add the list of hot spares. 1275 */ 1276 spa_add_spares(spa, config); 1277 } 1278 1279 spa_unload(spa); 1280 spa_deactivate(spa); 1281 spa_remove(spa); 1282 mutex_exit(&spa_namespace_lock); 1283 1284 return (config); 1285 } 1286 1287 /* 1288 * Pool export/destroy 1289 * 1290 * The act of destroying or exporting a pool is very simple. We make sure there 1291 * is no more pending I/O and any references to the pool are gone. Then, we 1292 * update the pool state and sync all the labels to disk, removing the 1293 * configuration from the cache afterwards. 1294 */ 1295 static int 1296 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1297 { 1298 spa_t *spa; 1299 1300 if (oldconfig) 1301 *oldconfig = NULL; 1302 1303 if (!(spa_mode & FWRITE)) 1304 return (EROFS); 1305 1306 mutex_enter(&spa_namespace_lock); 1307 if ((spa = spa_lookup(pool)) == NULL) { 1308 mutex_exit(&spa_namespace_lock); 1309 return (ENOENT); 1310 } 1311 1312 /* 1313 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1314 * reacquire the namespace lock, and see if we can export. 
1315 */ 1316 spa_open_ref(spa, FTAG); 1317 mutex_exit(&spa_namespace_lock); 1318 spa_async_suspend(spa); 1319 mutex_enter(&spa_namespace_lock); 1320 spa_close(spa, FTAG); 1321 1322 /* 1323 * The pool will be in core if it's openable, 1324 * in which case we can modify its state. 1325 */ 1326 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1327 /* 1328 * Objsets may be open only because they're dirty, so we 1329 * have to force it to sync before checking spa_refcnt. 1330 */ 1331 spa_scrub_suspend(spa); 1332 txg_wait_synced(spa->spa_dsl_pool, 0); 1333 1334 /* 1335 * A pool cannot be exported or destroyed if there are active 1336 * references. If we are resetting a pool, allow references by 1337 * fault injection handlers. 1338 */ 1339 if (!spa_refcount_zero(spa) || 1340 (spa->spa_inject_ref != 0 && 1341 new_state != POOL_STATE_UNINITIALIZED)) { 1342 spa_scrub_resume(spa); 1343 spa_async_resume(spa); 1344 mutex_exit(&spa_namespace_lock); 1345 return (EBUSY); 1346 } 1347 1348 spa_scrub_resume(spa); 1349 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1350 1351 /* 1352 * We want this to be reflected on every label, 1353 * so mark them all dirty. spa_unload() will do the 1354 * final sync that pushes these changes out. 1355 */ 1356 if (new_state != POOL_STATE_UNINITIALIZED) { 1357 spa_config_enter(spa, RW_WRITER, FTAG); 1358 spa->spa_state = new_state; 1359 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1360 vdev_config_dirty(spa->spa_root_vdev); 1361 spa_config_exit(spa, FTAG); 1362 } 1363 } 1364 1365 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1366 spa_unload(spa); 1367 spa_deactivate(spa); 1368 } 1369 1370 if (oldconfig && spa->spa_config) 1371 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1372 1373 if (new_state != POOL_STATE_UNINITIALIZED) { 1374 spa_remove(spa); 1375 spa_config_sync(); 1376 } 1377 mutex_exit(&spa_namespace_lock); 1378 1379 return (0); 1380 } 1381 1382 /* 1383 * Destroy a storage pool. 1384 */ 1385 int 1386 spa_destroy(char *pool) 1387 { 1388 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1389 } 1390 1391 /* 1392 * Export a storage pool. 1393 */ 1394 int 1395 spa_export(char *pool, nvlist_t **oldconfig) 1396 { 1397 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1398 } 1399 1400 /* 1401 * Similar to spa_export(), this unloads the spa_t without actually removing it 1402 * from the namespace in any way. 1403 */ 1404 int 1405 spa_reset(char *pool) 1406 { 1407 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1408 } 1409 1410 1411 /* 1412 * ========================================================================== 1413 * Device manipulation 1414 * ========================================================================== 1415 */ 1416 1417 /* 1418 * Add capacity to a storage pool. 
1419 */ 1420 int 1421 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1422 { 1423 uint64_t txg; 1424 int c, error; 1425 vdev_t *rvd = spa->spa_root_vdev; 1426 vdev_t *vd, *tvd; 1427 nvlist_t **spares; 1428 uint_t i, nspares; 1429 1430 txg = spa_vdev_enter(spa); 1431 1432 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1433 VDEV_ALLOC_ADD)) != 0) 1434 return (spa_vdev_exit(spa, NULL, txg, error)); 1435 1436 if ((error = spa_validate_spares(spa, nvroot, txg, 1437 VDEV_ALLOC_ADD)) != 0) 1438 return (spa_vdev_exit(spa, vd, txg, error)); 1439 1440 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1441 &spares, &nspares) != 0) 1442 nspares = 0; 1443 1444 if (vd->vdev_children == 0 && nspares == 0) 1445 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1446 1447 if (vd->vdev_children != 0) { 1448 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) 1449 return (spa_vdev_exit(spa, vd, txg, error)); 1450 1451 /* 1452 * Transfer each new top-level vdev from vd to rvd. 1453 */ 1454 for (c = 0; c < vd->vdev_children; c++) { 1455 tvd = vd->vdev_child[c]; 1456 vdev_remove_child(vd, tvd); 1457 tvd->vdev_id = rvd->vdev_children; 1458 vdev_add_child(rvd, tvd); 1459 vdev_config_dirty(tvd); 1460 } 1461 } 1462 1463 if (nspares != 0) { 1464 if (spa->spa_sparelist != NULL) { 1465 nvlist_t **oldspares; 1466 uint_t oldnspares; 1467 nvlist_t **newspares; 1468 1469 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1470 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1471 1472 newspares = kmem_alloc(sizeof (void *) * 1473 (nspares + oldnspares), KM_SLEEP); 1474 for (i = 0; i < oldnspares; i++) 1475 VERIFY(nvlist_dup(oldspares[i], 1476 &newspares[i], KM_SLEEP) == 0); 1477 for (i = 0; i < nspares; i++) 1478 VERIFY(nvlist_dup(spares[i], 1479 &newspares[i + oldnspares], 1480 KM_SLEEP) == 0); 1481 1482 VERIFY(nvlist_remove(spa->spa_sparelist, 1483 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1484 1485 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1486 ZPOOL_CONFIG_SPARES, newspares, 1487 nspares + oldnspares) == 0); 1488 for (i = 0; i < oldnspares + nspares; i++) 1489 nvlist_free(newspares[i]); 1490 kmem_free(newspares, (oldnspares + nspares) * 1491 sizeof (void *)); 1492 } else { 1493 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1494 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1495 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1496 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1497 } 1498 1499 spa_load_spares(spa); 1500 spa->spa_sync_spares = B_TRUE; 1501 } 1502 1503 /* 1504 * We have to be careful when adding new vdevs to an existing pool. 1505 * If other threads start allocating from these vdevs before we 1506 * sync the config cache, and we lose power, then upon reboot we may 1507 * fail to open the pool because there are DVAs that the config cache 1508 * can't translate. Therefore, we first add the vdevs without 1509 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1510 * and then let spa_config_update() initialize the new metaslabs. 1511 * 1512 * spa_load() checks for added-but-not-initialized vdevs, so that 1513 * if we lose power at any point in this sequence, the remaining 1514 * steps will be completed the next time we load the pool. 1515 */ 1516 (void) spa_vdev_exit(spa, vd, txg, 0); 1517 1518 mutex_enter(&spa_namespace_lock); 1519 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1520 mutex_exit(&spa_namespace_lock); 1521 1522 return (0); 1523 } 1524 1525 /* 1526 * Attach a device to a mirror. 
The arguments are the path to any device 1527 * in the mirror, and the nvroot for the new device. If the path specifies 1528 * a device that is not mirrored, we automatically insert the mirror vdev. 1529 * 1530 * If 'replacing' is specified, the new device is intended to replace the 1531 * existing device; in this case the two devices are made into their own 1532 * mirror using the 'replacing' vdev, which is functionally idendical to 1533 * the mirror vdev (it actually reuses all the same ops) but has a few 1534 * extra rules: you can't attach to it after it's been created, and upon 1535 * completion of resilvering, the first disk (the one being replaced) 1536 * is automatically detached. 1537 */ 1538 int 1539 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1540 { 1541 uint64_t txg, open_txg; 1542 int error; 1543 vdev_t *rvd = spa->spa_root_vdev; 1544 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1545 vdev_ops_t *pvops; 1546 1547 txg = spa_vdev_enter(spa); 1548 1549 oldvd = vdev_lookup_by_guid(rvd, guid); 1550 1551 if (oldvd == NULL) 1552 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1553 1554 if (!oldvd->vdev_ops->vdev_op_leaf) 1555 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1556 1557 pvd = oldvd->vdev_parent; 1558 1559 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1560 VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1561 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1562 1563 newvd = newrootvd->vdev_child[0]; 1564 1565 if (!newvd->vdev_ops->vdev_op_leaf) 1566 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1567 1568 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1569 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1570 1571 if (!replacing) { 1572 /* 1573 * For attach, the only allowable parent is a mirror or the root 1574 * vdev. 1575 */ 1576 if (pvd->vdev_ops != &vdev_mirror_ops && 1577 pvd->vdev_ops != &vdev_root_ops) 1578 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1579 1580 pvops = &vdev_mirror_ops; 1581 } else { 1582 /* 1583 * Active hot spares can only be replaced by inactive hot 1584 * spares. 1585 */ 1586 if (pvd->vdev_ops == &vdev_spare_ops && 1587 pvd->vdev_child[1] == oldvd && 1588 !spa_has_spare(spa, newvd->vdev_guid)) 1589 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1590 1591 /* 1592 * If the source is a hot spare, and the parent isn't already a 1593 * spare, then we want to create a new hot spare. Otherwise, we 1594 * want to create a replacing vdev. 1595 */ 1596 if (pvd->vdev_ops == &vdev_replacing_ops) 1597 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1598 else if (pvd->vdev_ops != &vdev_spare_ops && 1599 newvd->vdev_isspare) 1600 pvops = &vdev_spare_ops; 1601 else 1602 pvops = &vdev_replacing_ops; 1603 } 1604 1605 /* 1606 * Compare the new device size with the replaceable/attachable 1607 * device size. 1608 */ 1609 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1610 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1611 1612 /* 1613 * The new device cannot have a higher alignment requirement 1614 * than the top-level vdev. 1615 */ 1616 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1617 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1618 1619 /* 1620 * If this is an in-place replacement, update oldvd's path and devid 1621 * to make it distinguishable from newvd, and unopenable from now on. 
1622 */ 1623 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1624 spa_strfree(oldvd->vdev_path); 1625 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1626 KM_SLEEP); 1627 (void) sprintf(oldvd->vdev_path, "%s/%s", 1628 newvd->vdev_path, "old"); 1629 if (oldvd->vdev_devid != NULL) { 1630 spa_strfree(oldvd->vdev_devid); 1631 oldvd->vdev_devid = NULL; 1632 } 1633 } 1634 1635 /* 1636 * If the parent is not a mirror, or if we're replacing, insert the new 1637 * mirror/replacing/spare vdev above oldvd. 1638 */ 1639 if (pvd->vdev_ops != pvops) 1640 pvd = vdev_add_parent(oldvd, pvops); 1641 1642 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1643 ASSERT(pvd->vdev_ops == pvops); 1644 ASSERT(oldvd->vdev_parent == pvd); 1645 1646 /* 1647 * Extract the new device from its root and add it to pvd. 1648 */ 1649 vdev_remove_child(newrootvd, newvd); 1650 newvd->vdev_id = pvd->vdev_children; 1651 vdev_add_child(pvd, newvd); 1652 1653 /* 1654 * If newvd is smaller than oldvd, but larger than its rsize, 1655 * the addition of newvd may have decreased our parent's asize. 1656 */ 1657 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1658 1659 tvd = newvd->vdev_top; 1660 ASSERT(pvd->vdev_top == tvd); 1661 ASSERT(tvd->vdev_parent == rvd); 1662 1663 vdev_config_dirty(tvd); 1664 1665 /* 1666 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1667 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1668 */ 1669 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1670 1671 mutex_enter(&newvd->vdev_dtl_lock); 1672 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1673 open_txg - TXG_INITIAL + 1); 1674 mutex_exit(&newvd->vdev_dtl_lock); 1675 1676 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1677 1678 /* 1679 * Mark newvd's DTL dirty in this txg. 1680 */ 1681 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1682 1683 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1684 1685 /* 1686 * Kick off a resilver to update newvd. 1687 */ 1688 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1689 1690 return (0); 1691 } 1692 1693 /* 1694 * Detach a device from a mirror or replacing vdev. 1695 * If 'replace_done' is specified, only detach if the parent 1696 * is a replacing vdev. 1697 */ 1698 int 1699 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1700 { 1701 uint64_t txg; 1702 int c, t, error; 1703 vdev_t *rvd = spa->spa_root_vdev; 1704 vdev_t *vd, *pvd, *cvd, *tvd; 1705 boolean_t unspare = B_FALSE; 1706 uint64_t unspare_guid; 1707 1708 txg = spa_vdev_enter(spa); 1709 1710 vd = vdev_lookup_by_guid(rvd, guid); 1711 1712 if (vd == NULL) 1713 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1714 1715 if (!vd->vdev_ops->vdev_op_leaf) 1716 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1717 1718 pvd = vd->vdev_parent; 1719 1720 /* 1721 * If replace_done is specified, only remove this device if it's 1722 * the first child of a replacing vdev. For the 'spare' vdev, either 1723 * disk can be removed. 1724 */ 1725 if (replace_done) { 1726 if (pvd->vdev_ops == &vdev_replacing_ops) { 1727 if (vd->vdev_id != 0) 1728 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1729 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1730 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1731 } 1732 } 1733 1734 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1735 spa_version(spa) >= ZFS_VERSION_SPARES); 1736 1737 /* 1738 * Only mirror, replacing, and spare vdevs support detach. 
1739 */ 1740 if (pvd->vdev_ops != &vdev_replacing_ops && 1741 pvd->vdev_ops != &vdev_mirror_ops && 1742 pvd->vdev_ops != &vdev_spare_ops) 1743 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1744 1745 /* 1746 * If there's only one replica, you can't detach it. 1747 */ 1748 if (pvd->vdev_children <= 1) 1749 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1750 1751 /* 1752 * If all siblings have non-empty DTLs, this device may have the only 1753 * valid copy of the data, which means we cannot safely detach it. 1754 * 1755 * XXX -- as in the vdev_offline() case, we really want a more 1756 * precise DTL check. 1757 */ 1758 for (c = 0; c < pvd->vdev_children; c++) { 1759 uint64_t dirty; 1760 1761 cvd = pvd->vdev_child[c]; 1762 if (cvd == vd) 1763 continue; 1764 if (vdev_is_dead(cvd)) 1765 continue; 1766 mutex_enter(&cvd->vdev_dtl_lock); 1767 dirty = cvd->vdev_dtl_map.sm_space | 1768 cvd->vdev_dtl_scrub.sm_space; 1769 mutex_exit(&cvd->vdev_dtl_lock); 1770 if (!dirty) 1771 break; 1772 } 1773 1774 /* 1775 * If we are a replacing or spare vdev, then we can always detach the 1776 * latter child, as that is how one cancels the operation. 1777 */ 1778 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1779 c == pvd->vdev_children) 1780 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1781 1782 /* 1783 * If we are detaching the original disk from a spare, then it implies 1784 * that the spare should become a real disk, and be removed from the 1785 * active spare list for the pool. 1786 */ 1787 if (pvd->vdev_ops == &vdev_spare_ops && 1788 vd->vdev_id == 0) 1789 unspare = B_TRUE; 1790 1791 /* 1792 * Erase the disk labels so the disk can be used for other things. 1793 * This must be done after all other error cases are handled, 1794 * but before we disembowel vd (so we can still do I/O to it). 1795 * But if we can't do it, don't treat the error as fatal -- 1796 * it may be that the unwritability of the disk is the reason 1797 * it's being detached! 1798 */ 1799 error = vdev_label_init(vd, 0, B_FALSE); 1800 if (error) 1801 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1802 1803 /* 1804 * Remove vd from its parent and compact the parent's children. 1805 */ 1806 vdev_remove_child(pvd, vd); 1807 vdev_compact_children(pvd); 1808 1809 /* 1810 * Remember one of the remaining children so we can get tvd below. 1811 */ 1812 cvd = pvd->vdev_child[0]; 1813 1814 /* 1815 * If we need to remove the remaining child from the list of hot spares, 1816 * do it now, marking the vdev as no longer a spare in the process. We 1817 * must do this before vdev_remove_parent(), because that can change the 1818 * GUID if it creates a new toplevel GUID. 1819 */ 1820 if (unspare) { 1821 ASSERT(cvd->vdev_isspare); 1822 spa_spare_remove(cvd->vdev_guid); 1823 cvd->vdev_isspare = B_FALSE; 1824 unspare_guid = cvd->vdev_guid; 1825 } 1826 1827 /* 1828 * If the parent mirror/replacing vdev only has one child, 1829 * the parent is no longer needed. Remove it from the tree. 1830 */ 1831 if (pvd->vdev_children == 1) 1832 vdev_remove_parent(cvd); 1833 1834 /* 1835 * We don't set tvd until now because the parent we just removed 1836 * may have been the previous top-level vdev. 1837 */ 1838 tvd = cvd->vdev_top; 1839 ASSERT(tvd->vdev_parent == rvd); 1840 1841 /* 1842 * Reopen this top-level vdev to reassess health after detach. 1843 */ 1844 vdev_reopen(tvd); 1845 1846 /* 1847 * If the device we just detached was smaller than the others, 1848 * it may be possible to add metaslabs (i.e. grow the pool). 
1849 * vdev_metaslab_init() can't fail because the existing metaslabs 1850 * are already in core, so there's nothing to read from disk. 1851 */ 1852 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1853 1854 vdev_config_dirty(tvd); 1855 1856 /* 1857 * Mark vd's DTL as dirty in this txg. 1858 * vdev_dtl_sync() will see that vd->vdev_detached is set 1859 * and free vd's DTL object in syncing context. 1860 * But first make sure we're not on any *other* txg's DTL list, 1861 * to prevent vd from being accessed after it's freed. 1862 */ 1863 for (t = 0; t < TXG_SIZE; t++) 1864 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1865 vd->vdev_detached = B_TRUE; 1866 vdev_dirty(tvd, VDD_DTL, vd, txg); 1867 1868 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1869 1870 error = spa_vdev_exit(spa, vd, txg, 0); 1871 1872 /* 1873 * If we are supposed to remove the given vdev from the list of spares, 1874 * iterate over all pools in the system and replace it if it's present. 1875 */ 1876 if (unspare) { 1877 spa = NULL; 1878 mutex_enter(&spa_namespace_lock); 1879 while ((spa = spa_next(spa)) != NULL) { 1880 if (spa->spa_state != POOL_STATE_ACTIVE) 1881 continue; 1882 1883 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1884 } 1885 mutex_exit(&spa_namespace_lock); 1886 } 1887 1888 return (error); 1889 } 1890 1891 /* 1892 * Remove a device from the pool. Currently, this supports removing only hot 1893 * spares. 1894 */ 1895 int 1896 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1897 { 1898 vdev_t *vd; 1899 nvlist_t **spares, *nv, **newspares; 1900 uint_t i, j, nspares; 1901 int ret = 0; 1902 1903 spa_config_enter(spa, RW_WRITER, FTAG); 1904 1905 vd = spa_lookup_by_guid(spa, guid); 1906 1907 nv = NULL; 1908 if (spa->spa_spares != NULL && 1909 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1910 &spares, &nspares) == 0) { 1911 for (i = 0; i < nspares; i++) { 1912 uint64_t theguid; 1913 1914 VERIFY(nvlist_lookup_uint64(spares[i], 1915 ZPOOL_CONFIG_GUID, &theguid) == 0); 1916 if (theguid == guid) { 1917 nv = spares[i]; 1918 break; 1919 } 1920 } 1921 } 1922 1923 /* 1924 * We only support removing a hot spare, and only if it's not currently 1925 * in use in this pool. 1926 */ 1927 if (nv == NULL && vd == NULL) { 1928 ret = ENOENT; 1929 goto out; 1930 } 1931 1932 if (nv == NULL && vd != NULL) { 1933 ret = ENOTSUP; 1934 goto out; 1935 } 1936 1937 if (!unspare && nv != NULL && vd != NULL) { 1938 ret = EBUSY; 1939 goto out; 1940 } 1941 1942 if (nspares == 1) { 1943 newspares = NULL; 1944 } else { 1945 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 1946 KM_SLEEP); 1947 for (i = 0, j = 0; i < nspares; i++) { 1948 if (spares[i] != nv) 1949 VERIFY(nvlist_dup(spares[i], 1950 &newspares[j++], KM_SLEEP) == 0); 1951 } 1952 } 1953 1954 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1955 DATA_TYPE_NVLIST_ARRAY) == 0); 1956 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1957 newspares, nspares - 1) == 0); 1958 for (i = 0; i < nspares - 1; i++) 1959 nvlist_free(newspares[i]); 1960 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 1961 spa_load_spares(spa); 1962 spa->spa_sync_spares = B_TRUE; 1963 1964 out: 1965 spa_config_exit(spa, FTAG); 1966 1967 return (ret); 1968 } 1969 1970 /* 1971 * Find any device that's done replacing, so we can detach it. 
1972 */ 1973 static vdev_t * 1974 spa_vdev_replace_done_hunt(vdev_t *vd) 1975 { 1976 vdev_t *newvd, *oldvd; 1977 int c; 1978 1979 for (c = 0; c < vd->vdev_children; c++) { 1980 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1981 if (oldvd != NULL) 1982 return (oldvd); 1983 } 1984 1985 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1986 oldvd = vd->vdev_child[0]; 1987 newvd = vd->vdev_child[1]; 1988 1989 mutex_enter(&newvd->vdev_dtl_lock); 1990 if (newvd->vdev_dtl_map.sm_space == 0 && 1991 newvd->vdev_dtl_scrub.sm_space == 0) { 1992 mutex_exit(&newvd->vdev_dtl_lock); 1993 return (oldvd); 1994 } 1995 mutex_exit(&newvd->vdev_dtl_lock); 1996 } 1997 1998 return (NULL); 1999 } 2000 2001 static void 2002 spa_vdev_replace_done(spa_t *spa) 2003 { 2004 vdev_t *vd; 2005 vdev_t *pvd; 2006 uint64_t guid; 2007 uint64_t pguid = 0; 2008 2009 spa_config_enter(spa, RW_READER, FTAG); 2010 2011 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2012 guid = vd->vdev_guid; 2013 /* 2014 * If we have just finished replacing a hot spared device, then 2015 * we need to detach the parent's first child (the original hot 2016 * spare) as well. 2017 */ 2018 pvd = vd->vdev_parent; 2019 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2020 pvd->vdev_id == 0) { 2021 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2022 ASSERT(pvd->vdev_parent->vdev_children == 2); 2023 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2024 } 2025 spa_config_exit(spa, FTAG); 2026 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2027 return; 2028 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2029 return; 2030 spa_config_enter(spa, RW_READER, FTAG); 2031 } 2032 2033 spa_config_exit(spa, FTAG); 2034 } 2035 2036 /* 2037 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2038 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2039 */ 2040 int 2041 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2042 { 2043 vdev_t *rvd, *vd; 2044 uint64_t txg; 2045 2046 rvd = spa->spa_root_vdev; 2047 2048 txg = spa_vdev_enter(spa); 2049 2050 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2051 /* 2052 * Determine if this is a reference to a hot spare. In that 2053 * case, update the path as stored in the spare list. 
2054 */ 2055 nvlist_t **spares; 2056 uint_t i, nspares; 2057 if (spa->spa_sparelist != NULL) { 2058 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2059 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2060 for (i = 0; i < nspares; i++) { 2061 uint64_t theguid; 2062 VERIFY(nvlist_lookup_uint64(spares[i], 2063 ZPOOL_CONFIG_GUID, &theguid) == 0); 2064 if (theguid == guid) 2065 break; 2066 } 2067 2068 if (i == nspares) 2069 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2070 2071 VERIFY(nvlist_add_string(spares[i], 2072 ZPOOL_CONFIG_PATH, newpath) == 0); 2073 spa_load_spares(spa); 2074 spa->spa_sync_spares = B_TRUE; 2075 return (spa_vdev_exit(spa, NULL, txg, 0)); 2076 } else { 2077 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2078 } 2079 } 2080 2081 if (!vd->vdev_ops->vdev_op_leaf) 2082 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2083 2084 spa_strfree(vd->vdev_path); 2085 vd->vdev_path = spa_strdup(newpath); 2086 2087 vdev_config_dirty(vd->vdev_top); 2088 2089 return (spa_vdev_exit(spa, NULL, txg, 0)); 2090 } 2091 2092 /* 2093 * ========================================================================== 2094 * SPA Scrubbing 2095 * ========================================================================== 2096 */ 2097 2098 void 2099 spa_scrub_throttle(spa_t *spa, int direction) 2100 { 2101 mutex_enter(&spa->spa_scrub_lock); 2102 spa->spa_scrub_throttled += direction; 2103 ASSERT(spa->spa_scrub_throttled >= 0); 2104 if (spa->spa_scrub_throttled == 0) 2105 cv_broadcast(&spa->spa_scrub_io_cv); 2106 mutex_exit(&spa->spa_scrub_lock); 2107 } 2108 2109 static void 2110 spa_scrub_io_done(zio_t *zio) 2111 { 2112 spa_t *spa = zio->io_spa; 2113 2114 zio_buf_free(zio->io_data, zio->io_size); 2115 2116 mutex_enter(&spa->spa_scrub_lock); 2117 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2118 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2119 spa->spa_scrub_errors++; 2120 mutex_enter(&vd->vdev_stat_lock); 2121 vd->vdev_stat.vs_scrub_errors++; 2122 mutex_exit(&vd->vdev_stat_lock); 2123 } 2124 if (--spa->spa_scrub_inflight == 0) { 2125 cv_broadcast(&spa->spa_scrub_io_cv); 2126 ASSERT(spa->spa_scrub_throttled == 0); 2127 } 2128 mutex_exit(&spa->spa_scrub_lock); 2129 } 2130 2131 static void 2132 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2133 zbookmark_t *zb) 2134 { 2135 size_t size = BP_GET_LSIZE(bp); 2136 void *data = zio_buf_alloc(size); 2137 2138 mutex_enter(&spa->spa_scrub_lock); 2139 spa->spa_scrub_inflight++; 2140 mutex_exit(&spa->spa_scrub_lock); 2141 2142 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2143 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2144 2145 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2146 2147 zio_nowait(zio_read(NULL, spa, bp, data, size, 2148 spa_scrub_io_done, NULL, priority, flags, zb)); 2149 } 2150 2151 /* ARGSUSED */ 2152 static int 2153 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2154 { 2155 blkptr_t *bp = &bc->bc_blkptr; 2156 vdev_t *vd = spa->spa_root_vdev; 2157 dva_t *dva = bp->blk_dva; 2158 int needs_resilver = B_FALSE; 2159 int d; 2160 2161 if (bc->bc_errno) { 2162 /* 2163 * We can't scrub this block, but we can continue to scrub 2164 * the rest of the pool. Note the error and move along. 
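 * The failure is counted both pool-wide (spa_scrub_errors, which
 * spa_scrub_thread() reports when the traverse finishes) and in the
 * root vdev's vs_scrub_errors so that scrub status reflects it.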
2165 */ 2166 mutex_enter(&spa->spa_scrub_lock); 2167 spa->spa_scrub_errors++; 2168 mutex_exit(&spa->spa_scrub_lock); 2169 2170 mutex_enter(&vd->vdev_stat_lock); 2171 vd->vdev_stat.vs_scrub_errors++; 2172 mutex_exit(&vd->vdev_stat_lock); 2173 2174 return (ERESTART); 2175 } 2176 2177 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2178 2179 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2180 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2181 2182 ASSERT(vd != NULL); 2183 2184 /* 2185 * Keep track of how much data we've examined so that 2186 * zpool(1M) status can make useful progress reports. 2187 */ 2188 mutex_enter(&vd->vdev_stat_lock); 2189 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2190 mutex_exit(&vd->vdev_stat_lock); 2191 2192 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2193 if (DVA_GET_GANG(&dva[d])) { 2194 /* 2195 * Gang members may be spread across multiple 2196 * vdevs, so the best we can do is look at the 2197 * pool-wide DTL. 2198 * XXX -- it would be better to change our 2199 * allocation policy to ensure that this can't 2200 * happen. 2201 */ 2202 vd = spa->spa_root_vdev; 2203 } 2204 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2205 bp->blk_birth, 1)) 2206 needs_resilver = B_TRUE; 2207 } 2208 } 2209 2210 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2211 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2212 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2213 else if (needs_resilver) 2214 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2215 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2216 2217 return (0); 2218 } 2219 2220 static void 2221 spa_scrub_thread(spa_t *spa) 2222 { 2223 callb_cpr_t cprinfo; 2224 traverse_handle_t *th = spa->spa_scrub_th; 2225 vdev_t *rvd = spa->spa_root_vdev; 2226 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2227 int error = 0; 2228 boolean_t complete; 2229 2230 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2231 2232 /* 2233 * If we're restarting due to a snapshot create/delete, 2234 * wait for that to complete. 2235 */ 2236 txg_wait_synced(spa_get_dsl(spa), 0); 2237 2238 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2239 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2240 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2241 2242 spa_config_enter(spa, RW_WRITER, FTAG); 2243 vdev_reopen(rvd); /* purge all vdev caches */ 2244 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2245 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2246 spa_config_exit(spa, FTAG); 2247 2248 mutex_enter(&spa->spa_scrub_lock); 2249 spa->spa_scrub_errors = 0; 2250 spa->spa_scrub_active = 1; 2251 ASSERT(spa->spa_scrub_inflight == 0); 2252 ASSERT(spa->spa_scrub_throttled == 0); 2253 2254 while (!spa->spa_scrub_stop) { 2255 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2256 while (spa->spa_scrub_suspended) { 2257 spa->spa_scrub_active = 0; 2258 cv_broadcast(&spa->spa_scrub_cv); 2259 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2260 spa->spa_scrub_active = 1; 2261 } 2262 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2263 2264 if (spa->spa_scrub_restart_txg != 0) 2265 break; 2266 2267 mutex_exit(&spa->spa_scrub_lock); 2268 error = traverse_more(th); 2269 mutex_enter(&spa->spa_scrub_lock); 2270 if (error != EAGAIN) 2271 break; 2272 2273 while (spa->spa_scrub_throttled > 0) 2274 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2275 } 2276 2277 while (spa->spa_scrub_inflight) 2278 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2279 2280 spa->spa_scrub_active = 0; 2281 cv_broadcast(&spa->spa_scrub_cv); 2282 2283 mutex_exit(&spa->spa_scrub_lock); 2284 2285 spa_config_enter(spa, RW_WRITER, FTAG); 2286 2287 mutex_enter(&spa->spa_scrub_lock); 2288 2289 /* 2290 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2291 * AND the spa config lock to synchronize with any config changes 2292 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2293 */ 2294 if (spa->spa_scrub_restart_txg != 0) 2295 error = ERESTART; 2296 2297 if (spa->spa_scrub_stop) 2298 error = EINTR; 2299 2300 /* 2301 * Even if there were uncorrectable errors, we consider the scrub 2302 * completed. The downside is that if there is a transient error during 2303 * a resilver, we won't resilver the data properly to the target. But 2304 * if the damage is permanent (more likely) we will resilver forever, 2305 * which isn't really acceptable. Since there is enough information for 2306 * the user to know what has failed and why, this seems like a more 2307 * tractable approach. 2308 */ 2309 complete = (error == 0); 2310 2311 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2312 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2313 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2314 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2315 2316 mutex_exit(&spa->spa_scrub_lock); 2317 2318 /* 2319 * If the scrub/resilver completed, update all DTLs to reflect this. 2320 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2321 */ 2322 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2323 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2324 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2325 spa_errlog_rotate(spa); 2326 2327 spa_config_exit(spa, FTAG); 2328 2329 mutex_enter(&spa->spa_scrub_lock); 2330 2331 /* 2332 * We may have finished replacing a device. 2333 * Let the async thread assess this and handle the detach. 2334 */ 2335 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2336 2337 /* 2338 * If we were told to restart, our final act is to start a new scrub. 2339 */ 2340 if (error == ERESTART) 2341 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2342 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2343 2344 spa->spa_scrub_type = POOL_SCRUB_NONE; 2345 spa->spa_scrub_active = 0; 2346 spa->spa_scrub_thread = NULL; 2347 cv_broadcast(&spa->spa_scrub_cv); 2348 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2349 thread_exit(); 2350 } 2351 2352 void 2353 spa_scrub_suspend(spa_t *spa) 2354 { 2355 mutex_enter(&spa->spa_scrub_lock); 2356 spa->spa_scrub_suspended++; 2357 while (spa->spa_scrub_active) { 2358 cv_broadcast(&spa->spa_scrub_cv); 2359 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2360 } 2361 while (spa->spa_scrub_inflight) 2362 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2363 mutex_exit(&spa->spa_scrub_lock); 2364 } 2365 2366 void 2367 spa_scrub_resume(spa_t *spa) 2368 { 2369 mutex_enter(&spa->spa_scrub_lock); 2370 ASSERT(spa->spa_scrub_suspended != 0); 2371 if (--spa->spa_scrub_suspended == 0) 2372 cv_broadcast(&spa->spa_scrub_cv); 2373 mutex_exit(&spa->spa_scrub_lock); 2374 } 2375 2376 void 2377 spa_scrub_restart(spa_t *spa, uint64_t txg) 2378 { 2379 /* 2380 * Something happened (e.g. snapshot create/delete) that means 2381 * we must restart any in-progress scrubs. The itinerary will 2382 * fix this properly. 2383 */ 2384 mutex_enter(&spa->spa_scrub_lock); 2385 spa->spa_scrub_restart_txg = txg; 2386 mutex_exit(&spa->spa_scrub_lock); 2387 } 2388 2389 int 2390 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2391 { 2392 space_seg_t *ss; 2393 uint64_t mintxg, maxtxg; 2394 vdev_t *rvd = spa->spa_root_vdev; 2395 2396 if ((uint_t)type >= POOL_SCRUB_TYPES) 2397 return (ENOTSUP); 2398 2399 mutex_enter(&spa->spa_scrub_lock); 2400 2401 /* 2402 * If there's a scrub or resilver already in progress, stop it. 2403 */ 2404 while (spa->spa_scrub_thread != NULL) { 2405 /* 2406 * Don't stop a resilver unless forced. 2407 */ 2408 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2409 mutex_exit(&spa->spa_scrub_lock); 2410 return (EBUSY); 2411 } 2412 spa->spa_scrub_stop = 1; 2413 cv_broadcast(&spa->spa_scrub_cv); 2414 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2415 } 2416 2417 /* 2418 * Terminate the previous traverse. 2419 */ 2420 if (spa->spa_scrub_th != NULL) { 2421 traverse_fini(spa->spa_scrub_th); 2422 spa->spa_scrub_th = NULL; 2423 } 2424 2425 if (rvd == NULL) { 2426 ASSERT(spa->spa_scrub_stop == 0); 2427 ASSERT(spa->spa_scrub_type == type); 2428 ASSERT(spa->spa_scrub_restart_txg == 0); 2429 mutex_exit(&spa->spa_scrub_lock); 2430 return (0); 2431 } 2432 2433 mintxg = TXG_INITIAL - 1; 2434 maxtxg = spa_last_synced_txg(spa) + 1; 2435 2436 mutex_enter(&rvd->vdev_dtl_lock); 2437 2438 if (rvd->vdev_dtl_map.sm_space == 0) { 2439 /* 2440 * The pool-wide DTL is empty. 2441 * If this is a resilver, there's nothing to do except 2442 * check whether any in-progress replacements have completed. 2443 */ 2444 if (type == POOL_SCRUB_RESILVER) { 2445 type = POOL_SCRUB_NONE; 2446 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2447 } 2448 } else { 2449 /* 2450 * The pool-wide DTL is non-empty. 2451 * If this is a normal scrub, upgrade to a resilver instead. 2452 */ 2453 if (type == POOL_SCRUB_EVERYTHING) 2454 type = POOL_SCRUB_RESILVER; 2455 } 2456 2457 if (type == POOL_SCRUB_RESILVER) { 2458 /* 2459 * Determine the resilvering boundaries. 2460 * 2461 * Note: (mintxg, maxtxg) is an open interval, 2462 * i.e. mintxg and maxtxg themselves are not included. 2463 * 2464 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2465 * so we don't claim to resilver a txg that's still changing. 
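 * For example, if the first DTL segment begins at txg 100 and the DTL
 * extends beyond the last synced txg of 140, the bounds become
 * (99, 141): txgs 100 through 140 are resilvered.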
2466 */ 2467 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2468 mintxg = ss->ss_start - 1; 2469 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2470 maxtxg = MIN(ss->ss_end, maxtxg); 2471 } 2472 2473 mutex_exit(&rvd->vdev_dtl_lock); 2474 2475 spa->spa_scrub_stop = 0; 2476 spa->spa_scrub_type = type; 2477 spa->spa_scrub_restart_txg = 0; 2478 2479 if (type != POOL_SCRUB_NONE) { 2480 spa->spa_scrub_mintxg = mintxg; 2481 spa->spa_scrub_maxtxg = maxtxg; 2482 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2483 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2484 ZIO_FLAG_CANFAIL); 2485 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2486 spa->spa_scrub_thread = thread_create(NULL, 0, 2487 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2488 } 2489 2490 mutex_exit(&spa->spa_scrub_lock); 2491 2492 return (0); 2493 } 2494 2495 /* 2496 * ========================================================================== 2497 * SPA async task processing 2498 * ========================================================================== 2499 */ 2500 2501 static void 2502 spa_async_reopen(spa_t *spa) 2503 { 2504 vdev_t *rvd = spa->spa_root_vdev; 2505 vdev_t *tvd; 2506 int c; 2507 2508 spa_config_enter(spa, RW_WRITER, FTAG); 2509 2510 for (c = 0; c < rvd->vdev_children; c++) { 2511 tvd = rvd->vdev_child[c]; 2512 if (tvd->vdev_reopen_wanted) { 2513 tvd->vdev_reopen_wanted = 0; 2514 vdev_reopen(tvd); 2515 } 2516 } 2517 2518 spa_config_exit(spa, FTAG); 2519 } 2520 2521 static void 2522 spa_async_thread(spa_t *spa) 2523 { 2524 int tasks; 2525 2526 ASSERT(spa->spa_sync_on); 2527 2528 mutex_enter(&spa->spa_async_lock); 2529 tasks = spa->spa_async_tasks; 2530 spa->spa_async_tasks = 0; 2531 mutex_exit(&spa->spa_async_lock); 2532 2533 /* 2534 * See if the config needs to be updated. 2535 */ 2536 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2537 mutex_enter(&spa_namespace_lock); 2538 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2539 mutex_exit(&spa_namespace_lock); 2540 } 2541 2542 /* 2543 * See if any devices need to be reopened. 2544 */ 2545 if (tasks & SPA_ASYNC_REOPEN) 2546 spa_async_reopen(spa); 2547 2548 /* 2549 * If any devices are done replacing, detach them. 2550 */ 2551 if (tasks & SPA_ASYNC_REPLACE_DONE) 2552 spa_vdev_replace_done(spa); 2553 2554 /* 2555 * Kick off a scrub. 2556 */ 2557 if (tasks & SPA_ASYNC_SCRUB) 2558 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2559 2560 /* 2561 * Kick off a resilver. 2562 */ 2563 if (tasks & SPA_ASYNC_RESILVER) 2564 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2565 2566 /* 2567 * Let the world know that we're done. 
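 * Clearing spa_async_thread and broadcasting spa_async_cv under
 * spa_async_lock is what lets spa_async_suspend() return once no
 * async thread remains to wait for.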
2568 */ 2569 mutex_enter(&spa->spa_async_lock); 2570 spa->spa_async_thread = NULL; 2571 cv_broadcast(&spa->spa_async_cv); 2572 mutex_exit(&spa->spa_async_lock); 2573 thread_exit(); 2574 } 2575 2576 void 2577 spa_async_suspend(spa_t *spa) 2578 { 2579 mutex_enter(&spa->spa_async_lock); 2580 spa->spa_async_suspended++; 2581 while (spa->spa_async_thread != NULL) 2582 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2583 mutex_exit(&spa->spa_async_lock); 2584 } 2585 2586 void 2587 spa_async_resume(spa_t *spa) 2588 { 2589 mutex_enter(&spa->spa_async_lock); 2590 ASSERT(spa->spa_async_suspended != 0); 2591 spa->spa_async_suspended--; 2592 mutex_exit(&spa->spa_async_lock); 2593 } 2594 2595 static void 2596 spa_async_dispatch(spa_t *spa) 2597 { 2598 mutex_enter(&spa->spa_async_lock); 2599 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2600 spa->spa_async_thread == NULL && 2601 rootdir != NULL && !vn_is_readonly(rootdir)) 2602 spa->spa_async_thread = thread_create(NULL, 0, 2603 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2604 mutex_exit(&spa->spa_async_lock); 2605 } 2606 2607 void 2608 spa_async_request(spa_t *spa, int task) 2609 { 2610 mutex_enter(&spa->spa_async_lock); 2611 spa->spa_async_tasks |= task; 2612 mutex_exit(&spa->spa_async_lock); 2613 } 2614 2615 /* 2616 * ========================================================================== 2617 * SPA syncing routines 2618 * ========================================================================== 2619 */ 2620 2621 static void 2622 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2623 { 2624 bplist_t *bpl = &spa->spa_sync_bplist; 2625 dmu_tx_t *tx; 2626 blkptr_t blk; 2627 uint64_t itor = 0; 2628 zio_t *zio; 2629 int error; 2630 uint8_t c = 1; 2631 2632 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2633 2634 while (bplist_iterate(bpl, &itor, &blk) == 0) 2635 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2636 2637 error = zio_wait(zio); 2638 ASSERT3U(error, ==, 0); 2639 2640 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2641 bplist_vacate(bpl, tx); 2642 2643 /* 2644 * Pre-dirty the first block so we sync to convergence faster. 2645 * (Usually only the first block is needed.) 2646 */ 2647 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2648 dmu_tx_commit(tx); 2649 } 2650 2651 static void 2652 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2653 { 2654 char *packed = NULL; 2655 size_t nvsize = 0; 2656 dmu_buf_t *db; 2657 2658 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2659 2660 packed = kmem_alloc(nvsize, KM_SLEEP); 2661 2662 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2663 KM_SLEEP) == 0); 2664 2665 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2666 2667 kmem_free(packed, nvsize); 2668 2669 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2670 dmu_buf_will_dirty(db, tx); 2671 *(uint64_t *)db->db_data = nvsize; 2672 dmu_buf_rele(db, FTAG); 2673 } 2674 2675 static void 2676 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2677 { 2678 nvlist_t *nvroot; 2679 nvlist_t **spares; 2680 int i; 2681 2682 if (!spa->spa_sync_spares) 2683 return; 2684 2685 /* 2686 * Update the MOS nvlist describing the list of available spares. 2687 * spa_validate_spares() will have already made sure this nvlist is 2688 * valid and the vdevs are labelled appropriately. 
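 * The list is stored as a packed nvlist in a DMU_OT_PACKED_NVLIST
 * object; if that object does not exist yet, it is created below and
 * recorded in the pool directory under DMU_POOL_SPARES.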
2689 */ 2690 if (spa->spa_spares_object == 0) { 2691 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2692 DMU_OT_PACKED_NVLIST, 1 << 14, 2693 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2694 VERIFY(zap_update(spa->spa_meta_objset, 2695 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2696 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2697 } 2698 2699 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2700 if (spa->spa_nspares == 0) { 2701 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2702 NULL, 0) == 0); 2703 } else { 2704 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2705 KM_SLEEP); 2706 for (i = 0; i < spa->spa_nspares; i++) 2707 spares[i] = vdev_config_generate(spa, 2708 spa->spa_spares[i], B_FALSE, B_TRUE); 2709 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2710 spares, spa->spa_nspares) == 0); 2711 for (i = 0; i < spa->spa_nspares; i++) 2712 nvlist_free(spares[i]); 2713 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2714 } 2715 2716 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2717 2718 spa->spa_sync_spares = B_FALSE; 2719 } 2720 2721 static void 2722 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2723 { 2724 nvlist_t *config; 2725 2726 if (list_is_empty(&spa->spa_dirty_list)) 2727 return; 2728 2729 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2730 2731 if (spa->spa_config_syncing) 2732 nvlist_free(spa->spa_config_syncing); 2733 spa->spa_config_syncing = config; 2734 2735 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2736 } 2737 2738 /* 2739 * Sync the specified transaction group. New blocks may be dirtied as 2740 * part of the process, so we iterate until it converges. 2741 */ 2742 void 2743 spa_sync(spa_t *spa, uint64_t txg) 2744 { 2745 dsl_pool_t *dp = spa->spa_dsl_pool; 2746 objset_t *mos = spa->spa_meta_objset; 2747 bplist_t *bpl = &spa->spa_sync_bplist; 2748 vdev_t *rvd = spa->spa_root_vdev; 2749 vdev_t *vd; 2750 dmu_tx_t *tx; 2751 int dirty_vdevs; 2752 2753 /* 2754 * Lock out configuration changes. 2755 */ 2756 spa_config_enter(spa, RW_READER, FTAG); 2757 2758 spa->spa_syncing_txg = txg; 2759 spa->spa_sync_pass = 0; 2760 2761 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2762 2763 tx = dmu_tx_create_assigned(dp, txg); 2764 2765 /* 2766 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2767 * set spa_deflate if we have no raid-z vdevs. 2768 */ 2769 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2770 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2771 int i; 2772 2773 for (i = 0; i < rvd->vdev_children; i++) { 2774 vd = rvd->vdev_child[i]; 2775 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2776 break; 2777 } 2778 if (i == rvd->vdev_children) { 2779 spa->spa_deflate = TRUE; 2780 VERIFY(0 == zap_add(spa->spa_meta_objset, 2781 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2782 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2783 } 2784 } 2785 2786 /* 2787 * If anything has changed in this txg, push the deferred frees 2788 * from the previous txg. If not, leave them alone so that we 2789 * don't generate work on an otherwise idle system. 2790 */ 2791 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2792 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2793 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2794 spa_sync_deferred_frees(spa, txg); 2795 2796 /* 2797 * Iterate to convergence. 
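 * Each pass can dirty new blocks, so we repeat the whole sequence
 * (config object, spares, error log, DSL pool, and any vdevs on this
 * txg's dirty list) until a pass completes without finding any dirty
 * vdevs.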
2798 */ 2799 do { 2800 spa->spa_sync_pass++; 2801 2802 spa_sync_config_object(spa, tx); 2803 spa_sync_spares(spa, tx); 2804 spa_errlog_sync(spa, txg); 2805 dsl_pool_sync(dp, txg); 2806 2807 dirty_vdevs = 0; 2808 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2809 vdev_sync(vd, txg); 2810 dirty_vdevs++; 2811 } 2812 2813 bplist_sync(bpl, tx); 2814 } while (dirty_vdevs); 2815 2816 bplist_close(bpl); 2817 2818 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2819 2820 /* 2821 * Rewrite the vdev configuration (which includes the uberblock) 2822 * to commit the transaction group. 2823 * 2824 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2825 * Otherwise, pick a random top-level vdev that's known to be 2826 * visible in the config cache (see spa_vdev_add() for details). 2827 * If the write fails, try the next vdev until we've tried them all. 2828 */ 2829 if (!list_is_empty(&spa->spa_dirty_list)) { 2830 VERIFY(vdev_config_sync(rvd, txg) == 0); 2831 } else { 2832 int children = rvd->vdev_children; 2833 int c0 = spa_get_random(children); 2834 int c; 2835 2836 for (c = 0; c < children; c++) { 2837 vd = rvd->vdev_child[(c0 + c) % children]; 2838 if (vd->vdev_ms_array == 0) 2839 continue; 2840 if (vdev_config_sync(vd, txg) == 0) 2841 break; 2842 } 2843 if (c == children) 2844 VERIFY(vdev_config_sync(rvd, txg) == 0); 2845 } 2846 2847 dmu_tx_commit(tx); 2848 2849 /* 2850 * Clear the dirty config list. 2851 */ 2852 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2853 vdev_config_clean(vd); 2854 2855 /* 2856 * Now that the new config has synced transactionally, 2857 * let it become visible to the config cache. 2858 */ 2859 if (spa->spa_config_syncing != NULL) { 2860 spa_config_set(spa, spa->spa_config_syncing); 2861 spa->spa_config_txg = txg; 2862 spa->spa_config_syncing = NULL; 2863 } 2864 2865 /* 2866 * Make a stable copy of the fully synced uberblock. 2867 * We use this as the root for pool traversals. 2868 */ 2869 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2870 2871 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2872 2873 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2874 spa->spa_traverse_wanted = 0; 2875 spa->spa_ubsync = spa->spa_uberblock; 2876 rw_exit(&spa->spa_traverse_lock); 2877 2878 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2879 2880 /* 2881 * Clean up the ZIL records for the synced txg. 2882 */ 2883 dsl_pool_zil_clean(dp); 2884 2885 /* 2886 * Update usable space statistics. 2887 */ 2888 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2889 vdev_sync_done(vd, txg); 2890 2891 /* 2892 * It had better be the case that we didn't dirty anything 2893 * since vdev_config_sync(). 2894 */ 2895 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2896 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2897 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2898 ASSERT(bpl->bpl_queue == NULL); 2899 2900 spa_config_exit(spa, FTAG); 2901 2902 /* 2903 * If any async tasks have been requested, kick them off. 2904 */ 2905 spa_async_dispatch(spa); 2906 } 2907 2908 /* 2909 * Sync all pools. We don't want to hold the namespace lock across these 2910 * operations, so we take a reference on the spa_t and drop the lock during the 2911 * sync.
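 * The spa_open_ref()/spa_close() pair keeps the spa_t from going away
 * while spa_namespace_lock is dropped around txg_wait_synced().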
2912 */ 2913 void 2914 spa_sync_allpools(void) 2915 { 2916 spa_t *spa = NULL; 2917 mutex_enter(&spa_namespace_lock); 2918 while ((spa = spa_next(spa)) != NULL) { 2919 if (spa_state(spa) != POOL_STATE_ACTIVE) 2920 continue; 2921 spa_open_ref(spa, FTAG); 2922 mutex_exit(&spa_namespace_lock); 2923 txg_wait_synced(spa_get_dsl(spa), 0); 2924 mutex_enter(&spa_namespace_lock); 2925 spa_close(spa, FTAG); 2926 } 2927 mutex_exit(&spa_namespace_lock); 2928 } 2929 2930 /* 2931 * ========================================================================== 2932 * Miscellaneous routines 2933 * ========================================================================== 2934 */ 2935 2936 /* 2937 * Remove all pools in the system. 2938 */ 2939 void 2940 spa_evict_all(void) 2941 { 2942 spa_t *spa; 2943 2944 /* 2945 * Remove all cached state. All pools should be closed now, 2946 * so every spa in the AVL tree should be unreferenced. 2947 */ 2948 mutex_enter(&spa_namespace_lock); 2949 while ((spa = spa_next(NULL)) != NULL) { 2950 /* 2951 * Stop async tasks. The async thread may need to detach 2952 * a device that's been replaced, which requires grabbing 2953 * spa_namespace_lock, so we must drop it here. 2954 */ 2955 spa_open_ref(spa, FTAG); 2956 mutex_exit(&spa_namespace_lock); 2957 spa_async_suspend(spa); 2958 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2959 mutex_enter(&spa_namespace_lock); 2960 spa_close(spa, FTAG); 2961 2962 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2963 spa_unload(spa); 2964 spa_deactivate(spa); 2965 } 2966 spa_remove(spa); 2967 } 2968 mutex_exit(&spa_namespace_lock); 2969 } 2970 2971 vdev_t * 2972 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2973 { 2974 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2975 } 2976 2977 void 2978 spa_upgrade(spa_t *spa) 2979 { 2980 spa_config_enter(spa, RW_WRITER, FTAG); 2981 2982 /* 2983 * This should only be called for a non-faulted pool, and since a 2984 * future version would result in an unopenable pool, this shouldn't be 2985 * possible. 2986 */ 2987 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 2988 2989 spa->spa_uberblock.ub_version = ZFS_VERSION; 2990 vdev_config_dirty(spa->spa_root_vdev); 2991 2992 spa_config_exit(spa, FTAG); 2993 2994 txg_wait_synced(spa_get_dsl(spa), 0); 2995 } 2996 2997 boolean_t 2998 spa_has_spare(spa_t *spa, uint64_t guid) 2999 { 3000 int i; 3001 3002 for (i = 0; i < spa->spa_nspares; i++) 3003 if (spa->spa_spares[i]->vdev_guid == guid) 3004 return (B_TRUE); 3005 3006 return (B_FALSE); 3007 } 3008