/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
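 * (The 'atype' argument indicates how the vdev is being allocated --
 * VDEV_ALLOC_LOAD, VDEV_ALLOC_ADD or VDEV_ALLOC_SPARE -- so that vdev_alloc()
 * can apply the checks appropriate to that caller.)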
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vdev_close(spa->spa_spares[i]);
		vdev_free(spa->spa_spares[i]);
	}
	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		vdev_t *vd;

		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
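	 * (vdev_config_generate() is asked to include status here, so the
	 * regenerated entries reflect the state just observed by vdev_open()
	 * and vdev_validate_spare() above.)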
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		error = EBADF;
		goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
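	 * (vdev_uberblock_load() examines the labels on every vdev and keeps
	 * the newest valid uberblock, i.e. the one with the highest txg.)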
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_inuse(guid)) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
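	 * (The ZPOOL_CONFIG_SPARES entry is simply absent in that case, so the
	 * lookup below fails and we return success.)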
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < ZFS_VERSION_SPARES)
		return (ENOTSUP);

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			return (error);

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			return (EINVAL);
		}

		if ((error = vdev_open(vd)) != 0) {
			vdev_free(vd);
			return (error);
		}

		vd->vdev_top = vd;
		if ((error = vdev_label_spare(vd, crtxg)) != 0) {
			vdev_free(vd);
			return (error);
		}

		VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
		    vd->vdev_guid) == 0);

		vdev_free(vd);
	}

	return (0);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
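	 * (This is the packed-nvlist copy of the pool configuration that lives
	 * in the MOS, referenced from the pool directory as DMU_POOL_CONFIG.)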
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools are always deflated. */
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	spa_history_create_obj(spa, tx);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity anymore,
	 * and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
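 * The nvroot passed in may describe new top-level vdevs, new hot spares, or
 * both; new top-level vdevs are grafted onto the existing root vdev and new
 * spares are appended to the pool's spare list.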
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares;
	uint_t i, nspares;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	if ((error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		nspares = 0;

	if (vd->vdev_children == 0 && nspares == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0) {
		if ((error = vdev_create(vd, txg, B_FALSE)) != 0)
			return (spa_vdev_exit(spa, vd, txg, error));

		/*
		 * Transfer each new top-level vdev from vd to rvd.
		 */
		for (c = 0; c < vd->vdev_children; c++) {
			tvd = vd->vdev_child[c];
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = rvd->vdev_children;
			vdev_add_child(rvd, tvd);
			vdev_config_dirty(tvd);
		}
	}

	if (nspares != 0) {
		if (spa->spa_sparelist != NULL) {
			nvlist_t **oldspares;
			uint_t oldnspares;
			nvlist_t **newspares;

			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

			newspares = kmem_alloc(sizeof (void *) *
			    (nspares + oldnspares), KM_SLEEP);
			for (i = 0; i < oldnspares; i++)
				VERIFY(nvlist_dup(oldspares[i],
				    &newspares[i], KM_SLEEP) == 0);
			for (i = 0; i < nspares; i++)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[i + oldnspares],
				    KM_SLEEP) == 0);

			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, newspares,
			    nspares + oldnspares) == 0);
			for (i = 0; i < oldnspares + nspares; i++)
				nvlist_free(newspares[i]);
			kmem_free(newspares, (oldnspares + nspares) *
			    sizeof (void *));
		} else {
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		}

		spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.
 * The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
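	 * (For example, a path of /dev/dsk/c0t0d0s0 would become
	 * /dev/dsk/c0t0d0s0/old, which no longer names an openable device.)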
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.  For the 'spare' vdev, either
	 * disk can be removed.
	 */
	if (replace_done) {
		if (pvd->vdev_ops == &vdev_replacing_ops) {
			if (vd->vdev_id != 0)
				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		} else if (pvd->vdev_ops != &vdev_spare_ops) {
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		}
	}

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= ZFS_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}

	/*
	 * If we are a replacing or spare vdev, then we can always detach the
	 * latter child, as that is how one cancels the operation.
	 */
	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
	    c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, B_FALSE);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.  We
	 * must do this before vdev_remove_parent(), because that can change the
	 * GUID if it creates a new toplevel GUID.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd->vdev_guid);
		cvd->vdev_isspare = B_FALSE;
		unspare_guid = cvd->vdev_guid;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 * vdev_metaslab_init() can't fail because the existing metaslabs
	 * are already in core, so there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	error = spa_vdev_exit(spa, vd, txg, 0);

	/*
	 * If we are supposed to remove the given vdev from the list of spares,
	 * iterate over all pools in the system and replace it if it's present.
	 */
	if (unspare) {
		spa = NULL;
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa->spa_state != POOL_STATE_ACTIVE)
				continue;

			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, *nv, **newspares;
	uint_t i, j, nspares;
	int ret = 0;

	spa_config_enter(spa, RW_WRITER, FTAG);

	vd = spa_lookup_by_guid(spa, guid);

	nv = NULL;
	if (spa->spa_spares != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			uint64_t theguid;

			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &theguid) == 0);
			if (theguid == guid) {
				nv = spares[i];
				break;
			}
		}
	}

	/*
	 * We only support removing a hot spare, and only if it's not currently
	 * in use in this pool.
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, so we can detach it.
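 * (A replacement is considered done once the new child's DTLs are empty,
 * i.e. it has been fully resilvered; see the DTL check below.)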
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		pvd = vd->vdev_parent;
		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(pvd->vdev_parent->vdev_children == 2);
			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		/*
		 * Determine if this is a reference to a hot spare.  In that
		 * case, update the path as stored in the spare list.
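		 * (Spares that are not currently attached to a top-level vdev
		 * have no entry in the root vdev tree, which is why the lookup
		 * above can fail for a perfectly valid spare guid.)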
2074 */ 2075 nvlist_t **spares; 2076 uint_t i, nspares; 2077 if (spa->spa_sparelist != NULL) { 2078 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2079 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2080 for (i = 0; i < nspares; i++) { 2081 uint64_t theguid; 2082 VERIFY(nvlist_lookup_uint64(spares[i], 2083 ZPOOL_CONFIG_GUID, &theguid) == 0); 2084 if (theguid == guid) 2085 break; 2086 } 2087 2088 if (i == nspares) 2089 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2090 2091 VERIFY(nvlist_add_string(spares[i], 2092 ZPOOL_CONFIG_PATH, newpath) == 0); 2093 spa_load_spares(spa); 2094 spa->spa_sync_spares = B_TRUE; 2095 return (spa_vdev_exit(spa, NULL, txg, 0)); 2096 } else { 2097 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2098 } 2099 } 2100 2101 if (!vd->vdev_ops->vdev_op_leaf) 2102 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2103 2104 spa_strfree(vd->vdev_path); 2105 vd->vdev_path = spa_strdup(newpath); 2106 2107 vdev_config_dirty(vd->vdev_top); 2108 2109 return (spa_vdev_exit(spa, NULL, txg, 0)); 2110 } 2111 2112 /* 2113 * ========================================================================== 2114 * SPA Scrubbing 2115 * ========================================================================== 2116 */ 2117 2118 void 2119 spa_scrub_throttle(spa_t *spa, int direction) 2120 { 2121 mutex_enter(&spa->spa_scrub_lock); 2122 spa->spa_scrub_throttled += direction; 2123 ASSERT(spa->spa_scrub_throttled >= 0); 2124 if (spa->spa_scrub_throttled == 0) 2125 cv_broadcast(&spa->spa_scrub_io_cv); 2126 mutex_exit(&spa->spa_scrub_lock); 2127 } 2128 2129 static void 2130 spa_scrub_io_done(zio_t *zio) 2131 { 2132 spa_t *spa = zio->io_spa; 2133 2134 zio_buf_free(zio->io_data, zio->io_size); 2135 2136 mutex_enter(&spa->spa_scrub_lock); 2137 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2138 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2139 spa->spa_scrub_errors++; 2140 mutex_enter(&vd->vdev_stat_lock); 2141 vd->vdev_stat.vs_scrub_errors++; 2142 mutex_exit(&vd->vdev_stat_lock); 2143 } 2144 if (--spa->spa_scrub_inflight == 0) { 2145 cv_broadcast(&spa->spa_scrub_io_cv); 2146 ASSERT(spa->spa_scrub_throttled == 0); 2147 } 2148 mutex_exit(&spa->spa_scrub_lock); 2149 } 2150 2151 static void 2152 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2153 zbookmark_t *zb) 2154 { 2155 size_t size = BP_GET_LSIZE(bp); 2156 void *data = zio_buf_alloc(size); 2157 2158 mutex_enter(&spa->spa_scrub_lock); 2159 spa->spa_scrub_inflight++; 2160 mutex_exit(&spa->spa_scrub_lock); 2161 2162 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2163 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2164 2165 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2166 2167 zio_nowait(zio_read(NULL, spa, bp, data, size, 2168 spa_scrub_io_done, NULL, priority, flags, zb)); 2169 } 2170 2171 /* ARGSUSED */ 2172 static int 2173 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2174 { 2175 blkptr_t *bp = &bc->bc_blkptr; 2176 vdev_t *vd = spa->spa_root_vdev; 2177 dva_t *dva = bp->blk_dva; 2178 int needs_resilver = B_FALSE; 2179 int d; 2180 2181 if (bc->bc_errno) { 2182 /* 2183 * We can't scrub this block, but we can continue to scrub 2184 * the rest of the pool. Note the error and move along. 
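 *
 * [Editorial note, not part of the original source:] "noting the error"
 * here means bumping both the pool-wide spa_scrub_errors count and the
 * root vdev's vs_scrub_errors, each under its own lock, before handing
 * control back to the traverse code.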
2185 */ 2186 mutex_enter(&spa->spa_scrub_lock); 2187 spa->spa_scrub_errors++; 2188 mutex_exit(&spa->spa_scrub_lock); 2189 2190 mutex_enter(&vd->vdev_stat_lock); 2191 vd->vdev_stat.vs_scrub_errors++; 2192 mutex_exit(&vd->vdev_stat_lock); 2193 2194 return (ERESTART); 2195 } 2196 2197 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2198 2199 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2200 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2201 2202 ASSERT(vd != NULL); 2203 2204 /* 2205 * Keep track of how much data we've examined so that 2206 * zpool(1M) status can make useful progress reports. 2207 */ 2208 mutex_enter(&vd->vdev_stat_lock); 2209 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2210 mutex_exit(&vd->vdev_stat_lock); 2211 2212 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2213 if (DVA_GET_GANG(&dva[d])) { 2214 /* 2215 * Gang members may be spread across multiple 2216 * vdevs, so the best we can do is look at the 2217 * pool-wide DTL. 2218 * XXX -- it would be better to change our 2219 * allocation policy to ensure that this can't 2220 * happen. 2221 */ 2222 vd = spa->spa_root_vdev; 2223 } 2224 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2225 bp->blk_birth, 1)) 2226 needs_resilver = B_TRUE; 2227 } 2228 } 2229 2230 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2231 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2232 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2233 else if (needs_resilver) 2234 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2235 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2236 2237 return (0); 2238 } 2239 2240 static void 2241 spa_scrub_thread(spa_t *spa) 2242 { 2243 callb_cpr_t cprinfo; 2244 traverse_handle_t *th = spa->spa_scrub_th; 2245 vdev_t *rvd = spa->spa_root_vdev; 2246 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2247 int error = 0; 2248 boolean_t complete; 2249 2250 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2251 2252 /* 2253 * If we're restarting due to a snapshot create/delete, 2254 * wait for that to complete. 2255 */ 2256 txg_wait_synced(spa_get_dsl(spa), 0); 2257 2258 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2259 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2260 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2261 2262 spa_config_enter(spa, RW_WRITER, FTAG); 2263 vdev_reopen(rvd); /* purge all vdev caches */ 2264 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2265 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2266 spa_config_exit(spa, FTAG); 2267 2268 mutex_enter(&spa->spa_scrub_lock); 2269 spa->spa_scrub_errors = 0; 2270 spa->spa_scrub_active = 1; 2271 ASSERT(spa->spa_scrub_inflight == 0); 2272 ASSERT(spa->spa_scrub_throttled == 0); 2273 2274 while (!spa->spa_scrub_stop) { 2275 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2276 while (spa->spa_scrub_suspended) { 2277 spa->spa_scrub_active = 0; 2278 cv_broadcast(&spa->spa_scrub_cv); 2279 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2280 spa->spa_scrub_active = 1; 2281 } 2282 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2283 2284 if (spa->spa_scrub_restart_txg != 0) 2285 break; 2286 2287 mutex_exit(&spa->spa_scrub_lock); 2288 error = traverse_more(th); 2289 mutex_enter(&spa->spa_scrub_lock); 2290 if (error != EAGAIN) 2291 break; 2292 2293 while (spa->spa_scrub_throttled > 0) 2294 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2295 } 2296 2297 while (spa->spa_scrub_inflight) 2298 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2299 2300 spa->spa_scrub_active = 0; 2301 cv_broadcast(&spa->spa_scrub_cv); 2302 2303 mutex_exit(&spa->spa_scrub_lock); 2304 2305 spa_config_enter(spa, RW_WRITER, FTAG); 2306 2307 mutex_enter(&spa->spa_scrub_lock); 2308 2309 /* 2310 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2311 * AND the spa config lock to synchronize with any config changes 2312 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2313 */ 2314 if (spa->spa_scrub_restart_txg != 0) 2315 error = ERESTART; 2316 2317 if (spa->spa_scrub_stop) 2318 error = EINTR; 2319 2320 /* 2321 * Even if there were uncorrectable errors, we consider the scrub 2322 * completed. The downside is that if there is a transient error during 2323 * a resilver, we won't resilver the data properly to the target. But 2324 * if the damage is permanent (more likely) we will resilver forever, 2325 * which isn't really acceptable. Since there is enough information for 2326 * the user to know what has failed and why, this seems like a more 2327 * tractable approach. 2328 */ 2329 complete = (error == 0); 2330 2331 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2332 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2333 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2334 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2335 2336 mutex_exit(&spa->spa_scrub_lock); 2337 2338 /* 2339 * If the scrub/resilver completed, update all DTLs to reflect this. 2340 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2341 */ 2342 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2343 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2344 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2345 spa_errlog_rotate(spa); 2346 2347 spa_config_exit(spa, FTAG); 2348 2349 mutex_enter(&spa->spa_scrub_lock); 2350 2351 /* 2352 * We may have finished replacing a device. 2353 * Let the async thread assess this and handle the detach. 2354 */ 2355 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2356 2357 /* 2358 * If we were told to restart, our final act is to start a new scrub. 2359 */ 2360 if (error == ERESTART) 2361 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2362 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2363 2364 spa->spa_scrub_type = POOL_SCRUB_NONE; 2365 spa->spa_scrub_active = 0; 2366 spa->spa_scrub_thread = NULL; 2367 cv_broadcast(&spa->spa_scrub_cv); 2368 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2369 thread_exit(); 2370 } 2371 2372 void 2373 spa_scrub_suspend(spa_t *spa) 2374 { 2375 mutex_enter(&spa->spa_scrub_lock); 2376 spa->spa_scrub_suspended++; 2377 while (spa->spa_scrub_active) { 2378 cv_broadcast(&spa->spa_scrub_cv); 2379 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2380 } 2381 while (spa->spa_scrub_inflight) 2382 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2383 mutex_exit(&spa->spa_scrub_lock); 2384 } 2385 2386 void 2387 spa_scrub_resume(spa_t *spa) 2388 { 2389 mutex_enter(&spa->spa_scrub_lock); 2390 ASSERT(spa->spa_scrub_suspended != 0); 2391 if (--spa->spa_scrub_suspended == 0) 2392 cv_broadcast(&spa->spa_scrub_cv); 2393 mutex_exit(&spa->spa_scrub_lock); 2394 } 2395 2396 void 2397 spa_scrub_restart(spa_t *spa, uint64_t txg) 2398 { 2399 /* 2400 * Something happened (e.g. snapshot create/delete) that means 2401 * we must restart any in-progress scrubs. The itinerary will 2402 * fix this properly. 2403 */ 2404 mutex_enter(&spa->spa_scrub_lock); 2405 spa->spa_scrub_restart_txg = txg; 2406 mutex_exit(&spa->spa_scrub_lock); 2407 } 2408 2409 int 2410 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2411 { 2412 space_seg_t *ss; 2413 uint64_t mintxg, maxtxg; 2414 vdev_t *rvd = spa->spa_root_vdev; 2415 2416 if ((uint_t)type >= POOL_SCRUB_TYPES) 2417 return (ENOTSUP); 2418 2419 mutex_enter(&spa->spa_scrub_lock); 2420 2421 /* 2422 * If there's a scrub or resilver already in progress, stop it. 2423 */ 2424 while (spa->spa_scrub_thread != NULL) { 2425 /* 2426 * Don't stop a resilver unless forced. 2427 */ 2428 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2429 mutex_exit(&spa->spa_scrub_lock); 2430 return (EBUSY); 2431 } 2432 spa->spa_scrub_stop = 1; 2433 cv_broadcast(&spa->spa_scrub_cv); 2434 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2435 } 2436 2437 /* 2438 * Terminate the previous traverse. 2439 */ 2440 if (spa->spa_scrub_th != NULL) { 2441 traverse_fini(spa->spa_scrub_th); 2442 spa->spa_scrub_th = NULL; 2443 } 2444 2445 if (rvd == NULL) { 2446 ASSERT(spa->spa_scrub_stop == 0); 2447 ASSERT(spa->spa_scrub_type == type); 2448 ASSERT(spa->spa_scrub_restart_txg == 0); 2449 mutex_exit(&spa->spa_scrub_lock); 2450 return (0); 2451 } 2452 2453 mintxg = TXG_INITIAL - 1; 2454 maxtxg = spa_last_synced_txg(spa) + 1; 2455 2456 mutex_enter(&rvd->vdev_dtl_lock); 2457 2458 if (rvd->vdev_dtl_map.sm_space == 0) { 2459 /* 2460 * The pool-wide DTL is empty. 2461 * If this is a resilver, there's nothing to do except 2462 * check whether any in-progress replacements have completed. 2463 */ 2464 if (type == POOL_SCRUB_RESILVER) { 2465 type = POOL_SCRUB_NONE; 2466 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2467 } 2468 } else { 2469 /* 2470 * The pool-wide DTL is non-empty. 2471 * If this is a normal scrub, upgrade to a resilver instead. 2472 */ 2473 if (type == POOL_SCRUB_EVERYTHING) 2474 type = POOL_SCRUB_RESILVER; 2475 } 2476 2477 if (type == POOL_SCRUB_RESILVER) { 2478 /* 2479 * Determine the resilvering boundaries. 2480 * 2481 * Note: (mintxg, maxtxg) is an open interval, 2482 * i.e. mintxg and maxtxg themselves are not included. 2483 * 2484 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2485 * so we don't claim to resilver a txg that's still changing. 
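 *
 * [Editorial worked example, not part of the original source:] if the
 * first DTL segment starts at txg T, mintxg becomes T - 1, so T itself
 * lies inside the open interval and gets resilvered; and because maxtxg
 * is clamped to spa_last_synced_txg() + 1, no txg newer than the last
 * synced one can fall inside the interval.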
2486 */ 2487 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2488 mintxg = ss->ss_start - 1; 2489 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2490 maxtxg = MIN(ss->ss_end, maxtxg); 2491 } 2492 2493 mutex_exit(&rvd->vdev_dtl_lock); 2494 2495 spa->spa_scrub_stop = 0; 2496 spa->spa_scrub_type = type; 2497 spa->spa_scrub_restart_txg = 0; 2498 2499 if (type != POOL_SCRUB_NONE) { 2500 spa->spa_scrub_mintxg = mintxg; 2501 spa->spa_scrub_maxtxg = maxtxg; 2502 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2503 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2504 ZIO_FLAG_CANFAIL); 2505 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2506 spa->spa_scrub_thread = thread_create(NULL, 0, 2507 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2508 } 2509 2510 mutex_exit(&spa->spa_scrub_lock); 2511 2512 return (0); 2513 } 2514 2515 /* 2516 * ========================================================================== 2517 * SPA async task processing 2518 * ========================================================================== 2519 */ 2520 2521 static void 2522 spa_async_reopen(spa_t *spa) 2523 { 2524 vdev_t *rvd = spa->spa_root_vdev; 2525 vdev_t *tvd; 2526 int c; 2527 2528 spa_config_enter(spa, RW_WRITER, FTAG); 2529 2530 for (c = 0; c < rvd->vdev_children; c++) { 2531 tvd = rvd->vdev_child[c]; 2532 if (tvd->vdev_reopen_wanted) { 2533 tvd->vdev_reopen_wanted = 0; 2534 vdev_reopen(tvd); 2535 } 2536 } 2537 2538 spa_config_exit(spa, FTAG); 2539 } 2540 2541 static void 2542 spa_async_thread(spa_t *spa) 2543 { 2544 int tasks; 2545 2546 ASSERT(spa->spa_sync_on); 2547 2548 mutex_enter(&spa->spa_async_lock); 2549 tasks = spa->spa_async_tasks; 2550 spa->spa_async_tasks = 0; 2551 mutex_exit(&spa->spa_async_lock); 2552 2553 /* 2554 * See if the config needs to be updated. 2555 */ 2556 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2557 mutex_enter(&spa_namespace_lock); 2558 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2559 mutex_exit(&spa_namespace_lock); 2560 } 2561 2562 /* 2563 * See if any devices need to be reopened. 2564 */ 2565 if (tasks & SPA_ASYNC_REOPEN) 2566 spa_async_reopen(spa); 2567 2568 /* 2569 * If any devices are done replacing, detach them. 2570 */ 2571 if (tasks & SPA_ASYNC_REPLACE_DONE) 2572 spa_vdev_replace_done(spa); 2573 2574 /* 2575 * Kick off a scrub. 2576 */ 2577 if (tasks & SPA_ASYNC_SCRUB) 2578 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2579 2580 /* 2581 * Kick off a resilver. 2582 */ 2583 if (tasks & SPA_ASYNC_RESILVER) 2584 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2585 2586 /* 2587 * Let the world know that we're done. 
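 *
 * [Editorial note, not part of the original source:] clearing
 * spa_async_thread and broadcasting spa_async_cv below is what lets
 * spa_async_suspend() stop waiting, since it sleeps for as long as
 * spa_async_thread is non-NULL.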
2588 */ 2589 mutex_enter(&spa->spa_async_lock); 2590 spa->spa_async_thread = NULL; 2591 cv_broadcast(&spa->spa_async_cv); 2592 mutex_exit(&spa->spa_async_lock); 2593 thread_exit(); 2594 } 2595 2596 void 2597 spa_async_suspend(spa_t *spa) 2598 { 2599 mutex_enter(&spa->spa_async_lock); 2600 spa->spa_async_suspended++; 2601 while (spa->spa_async_thread != NULL) 2602 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2603 mutex_exit(&spa->spa_async_lock); 2604 } 2605 2606 void 2607 spa_async_resume(spa_t *spa) 2608 { 2609 mutex_enter(&spa->spa_async_lock); 2610 ASSERT(spa->spa_async_suspended != 0); 2611 spa->spa_async_suspended--; 2612 mutex_exit(&spa->spa_async_lock); 2613 } 2614 2615 static void 2616 spa_async_dispatch(spa_t *spa) 2617 { 2618 mutex_enter(&spa->spa_async_lock); 2619 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2620 spa->spa_async_thread == NULL && 2621 rootdir != NULL && !vn_is_readonly(rootdir)) 2622 spa->spa_async_thread = thread_create(NULL, 0, 2623 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2624 mutex_exit(&spa->spa_async_lock); 2625 } 2626 2627 void 2628 spa_async_request(spa_t *spa, int task) 2629 { 2630 mutex_enter(&spa->spa_async_lock); 2631 spa->spa_async_tasks |= task; 2632 mutex_exit(&spa->spa_async_lock); 2633 } 2634 2635 /* 2636 * ========================================================================== 2637 * SPA syncing routines 2638 * ========================================================================== 2639 */ 2640 2641 static void 2642 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2643 { 2644 bplist_t *bpl = &spa->spa_sync_bplist; 2645 dmu_tx_t *tx; 2646 blkptr_t blk; 2647 uint64_t itor = 0; 2648 zio_t *zio; 2649 int error; 2650 uint8_t c = 1; 2651 2652 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2653 2654 while (bplist_iterate(bpl, &itor, &blk) == 0) 2655 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2656 2657 error = zio_wait(zio); 2658 ASSERT3U(error, ==, 0); 2659 2660 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2661 bplist_vacate(bpl, tx); 2662 2663 /* 2664 * Pre-dirty the first block so we sync to convergence faster. 2665 * (Usually only the first block is needed.) 2666 */ 2667 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2668 dmu_tx_commit(tx); 2669 } 2670 2671 static void 2672 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2673 { 2674 char *packed = NULL; 2675 size_t nvsize = 0; 2676 dmu_buf_t *db; 2677 2678 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2679 2680 packed = kmem_alloc(nvsize, KM_SLEEP); 2681 2682 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2683 KM_SLEEP) == 0); 2684 2685 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2686 2687 kmem_free(packed, nvsize); 2688 2689 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2690 dmu_buf_will_dirty(db, tx); 2691 *(uint64_t *)db->db_data = nvsize; 2692 dmu_buf_rele(db, FTAG); 2693 } 2694 2695 static void 2696 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2697 { 2698 nvlist_t *nvroot; 2699 nvlist_t **spares; 2700 int i; 2701 2702 if (!spa->spa_sync_spares) 2703 return; 2704 2705 /* 2706 * Update the MOS nvlist describing the list of available spares. 2707 * spa_validate_spares() will have already made sure this nvlist is 2708 * valid and the vdevs are labelled appropriately. 
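 *
 * [Editorial note, not part of the original source:] on the first sync
 * the packed-nvlist object is allocated and linked into the MOS pool
 * directory under DMU_POOL_SPARES; after that, the nvlist is simply
 * regenerated from the in-core spa_spares array and rewritten through
 * spa_sync_nvlist().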
2709 */ 2710 if (spa->spa_spares_object == 0) { 2711 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2712 DMU_OT_PACKED_NVLIST, 1 << 14, 2713 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2714 VERIFY(zap_update(spa->spa_meta_objset, 2715 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2716 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2717 } 2718 2719 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2720 if (spa->spa_nspares == 0) { 2721 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2722 NULL, 0) == 0); 2723 } else { 2724 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2725 KM_SLEEP); 2726 for (i = 0; i < spa->spa_nspares; i++) 2727 spares[i] = vdev_config_generate(spa, 2728 spa->spa_spares[i], B_FALSE, B_TRUE); 2729 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2730 spares, spa->spa_nspares) == 0); 2731 for (i = 0; i < spa->spa_nspares; i++) 2732 nvlist_free(spares[i]); 2733 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2734 } 2735 2736 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2737 nvlist_free(nvroot); 2738 2739 spa->spa_sync_spares = B_FALSE; 2740 } 2741 2742 static void 2743 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2744 { 2745 nvlist_t *config; 2746 2747 if (list_is_empty(&spa->spa_dirty_list)) 2748 return; 2749 2750 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2751 2752 if (spa->spa_config_syncing) 2753 nvlist_free(spa->spa_config_syncing); 2754 spa->spa_config_syncing = config; 2755 2756 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2757 } 2758 2759 /* 2760 * Sync the specified transaction group. New blocks may be dirtied as 2761 * part of the process, so we iterate until it converges. 2762 */ 2763 void 2764 spa_sync(spa_t *spa, uint64_t txg) 2765 { 2766 dsl_pool_t *dp = spa->spa_dsl_pool; 2767 objset_t *mos = spa->spa_meta_objset; 2768 bplist_t *bpl = &spa->spa_sync_bplist; 2769 vdev_t *rvd = spa->spa_root_vdev; 2770 vdev_t *vd; 2771 dmu_tx_t *tx; 2772 int dirty_vdevs; 2773 2774 /* 2775 * Lock out configuration changes. 2776 */ 2777 spa_config_enter(spa, RW_READER, FTAG); 2778 2779 spa->spa_syncing_txg = txg; 2780 spa->spa_sync_pass = 0; 2781 2782 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2783 2784 tx = dmu_tx_create_assigned(dp, txg); 2785 2786 /* 2787 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2788 * set spa_deflate if we have no raid-z vdevs. 2789 */ 2790 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2791 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2792 int i; 2793 2794 for (i = 0; i < rvd->vdev_children; i++) { 2795 vd = rvd->vdev_child[i]; 2796 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2797 break; 2798 } 2799 if (i == rvd->vdev_children) { 2800 spa->spa_deflate = TRUE; 2801 VERIFY(0 == zap_add(spa->spa_meta_objset, 2802 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2803 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2804 } 2805 } 2806 2807 /* 2808 * If anything has changed in this txg, push the deferred frees 2809 * from the previous txg. If not, leave them alone so that we 2810 * don't generate work on an otherwise idle system. 2811 */ 2812 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2813 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2814 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2815 spa_sync_deferred_frees(spa, txg); 2816 2817 /* 2818 * Iterate to convergence. 
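 *
 * [Editorial note, not part of the original source:] each pass below
 * syncs the config object, the spares list, the error logs, the DSL
 * pool, and any vdevs dirtied in this txg; the loop repeats as long as
 * a pass dirtied at least one vdev, because syncing can itself dirty
 * new blocks.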
2819 */ 2820 do { 2821 spa->spa_sync_pass++; 2822 2823 spa_sync_config_object(spa, tx); 2824 spa_sync_spares(spa, tx); 2825 spa_errlog_sync(spa, txg); 2826 dsl_pool_sync(dp, txg); 2827 2828 dirty_vdevs = 0; 2829 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2830 vdev_sync(vd, txg); 2831 dirty_vdevs++; 2832 } 2833 2834 bplist_sync(bpl, tx); 2835 } while (dirty_vdevs); 2836 2837 bplist_close(bpl); 2838 2839 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2840 2841 /* 2842 * Rewrite the vdev configuration (which includes the uberblock) 2843 * to commit the transaction group. 2844 * 2845 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2846 * Otherwise, pick a random top-level vdev that's known to be 2847 * visible in the config cache (see spa_vdev_add() for details). 2848 * If the write fails, try the next vdev until we've tried them all. 2849 */ 2850 if (!list_is_empty(&spa->spa_dirty_list)) { 2851 VERIFY(vdev_config_sync(rvd, txg) == 0); 2852 } else { 2853 int children = rvd->vdev_children; 2854 int c0 = spa_get_random(children); 2855 int c; 2856 2857 for (c = 0; c < children; c++) { 2858 vd = rvd->vdev_child[(c0 + c) % children]; 2859 if (vd->vdev_ms_array == 0) 2860 continue; 2861 if (vdev_config_sync(vd, txg) == 0) 2862 break; 2863 } 2864 if (c == children) 2865 VERIFY(vdev_config_sync(rvd, txg) == 0); 2866 } 2867 2868 dmu_tx_commit(tx); 2869 2870 /* 2871 * Clear the dirty config list. 2872 */ 2873 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2874 vdev_config_clean(vd); 2875 2876 /* 2877 * Now that the new config has synced transactionally, 2878 * let it become visible to the config cache. 2879 */ 2880 if (spa->spa_config_syncing != NULL) { 2881 spa_config_set(spa, spa->spa_config_syncing); 2882 spa->spa_config_txg = txg; 2883 spa->spa_config_syncing = NULL; 2884 } 2885 2886 /* 2887 * Make a stable copy of the fully synced uberblock. 2888 * We use this as the root for pool traversals. 2889 */ 2890 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2891 2892 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2893 2894 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2895 spa->spa_traverse_wanted = 0; 2896 spa->spa_ubsync = spa->spa_uberblock; 2897 rw_exit(&spa->spa_traverse_lock); 2898 2899 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2900 2901 /* 2902 * Clean up the ZIL records for the synced txg. 2903 */ 2904 dsl_pool_zil_clean(dp); 2905 2906 /* 2907 * Update usable space statistics. 2908 */ 2909 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2910 vdev_sync_done(vd, txg); 2911 2912 /* 2913 * It had better be the case that we didn't dirty anything 2914 * since vdev_config_sync(). 2915 */ 2916 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2917 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2918 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2919 ASSERT(bpl->bpl_queue == NULL); 2920 2921 spa_config_exit(spa, FTAG); 2922 2923 /* 2924 * If any async tasks have been requested, kick them off. 2925 */ 2926 spa_async_dispatch(spa); 2927 } 2928 2929 /* 2930 * Sync all pools. We don't want to hold the namespace lock across these 2931 * operations, so we take a reference on the spa_t and drop the lock during the 2932 * sync.
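 *
 * [Editorial note, not part of the original source:] the reference taken
 * with spa_open_ref() keeps the spa_t from being removed while
 * spa_namespace_lock is dropped around txg_wait_synced(); the lock is
 * reacquired before spa_close() releases the hold.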
2933 */ 2934 void 2935 spa_sync_allpools(void) 2936 { 2937 spa_t *spa = NULL; 2938 mutex_enter(&spa_namespace_lock); 2939 while ((spa = spa_next(spa)) != NULL) { 2940 if (spa_state(spa) != POOL_STATE_ACTIVE) 2941 continue; 2942 spa_open_ref(spa, FTAG); 2943 mutex_exit(&spa_namespace_lock); 2944 txg_wait_synced(spa_get_dsl(spa), 0); 2945 mutex_enter(&spa_namespace_lock); 2946 spa_close(spa, FTAG); 2947 } 2948 mutex_exit(&spa_namespace_lock); 2949 } 2950 2951 /* 2952 * ========================================================================== 2953 * Miscellaneous routines 2954 * ========================================================================== 2955 */ 2956 2957 /* 2958 * Remove all pools in the system. 2959 */ 2960 void 2961 spa_evict_all(void) 2962 { 2963 spa_t *spa; 2964 2965 /* 2966 * Remove all cached state. All pools should be closed now, 2967 * so every spa in the AVL tree should be unreferenced. 2968 */ 2969 mutex_enter(&spa_namespace_lock); 2970 while ((spa = spa_next(NULL)) != NULL) { 2971 /* 2972 * Stop async tasks. The async thread may need to detach 2973 * a device that's been replaced, which requires grabbing 2974 * spa_namespace_lock, so we must drop it here. 2975 */ 2976 spa_open_ref(spa, FTAG); 2977 mutex_exit(&spa_namespace_lock); 2978 spa_async_suspend(spa); 2979 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2980 mutex_enter(&spa_namespace_lock); 2981 spa_close(spa, FTAG); 2982 2983 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2984 spa_unload(spa); 2985 spa_deactivate(spa); 2986 } 2987 spa_remove(spa); 2988 } 2989 mutex_exit(&spa_namespace_lock); 2990 } 2991 2992 vdev_t * 2993 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2994 { 2995 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2996 } 2997 2998 void 2999 spa_upgrade(spa_t *spa) 3000 { 3001 spa_config_enter(spa, RW_WRITER, FTAG); 3002 3003 /* 3004 * This should only be called for a non-faulted pool, and since a 3005 * future version would result in an unopenable pool, this shouldn't be 3006 * possible. 3007 */ 3008 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3009 3010 spa->spa_uberblock.ub_version = ZFS_VERSION; 3011 vdev_config_dirty(spa->spa_root_vdev); 3012 3013 spa_config_exit(spa, FTAG); 3014 3015 txg_wait_synced(spa_get_dsl(spa), 0); 3016 } 3017 3018 boolean_t 3019 spa_has_spare(spa_t *spa, uint64_t guid) 3020 { 3021 int i; 3022 3023 for (i = 0; i < spa->spa_nspares; i++) 3024 if (spa->spa_spares[i]->vdev_guid == guid) 3025 return (B_TRUE); 3026 3027 return (B_FALSE); 3028 } 3029
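/*
 * [Editorial note, not part of the original source:] spa_upgrade() above
 * only bumps the in-core uberblock version and dirties the root vdev
 * config under the config lock; the trailing txg_wait_synced() is what
 * ensures the new version has actually been pushed to disk before the
 * call returns. spa_has_spare() is a simple linear scan of the in-core
 * spares array by vdev guid.
 */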