1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * This file contains all the routines used when modifying on-disk SPA state. 30 * This includes opening, importing, destroying, exporting a pool, and syncing a 31 * pool. 
32 */ 33 34 #include <sys/zfs_context.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zio.h> 38 #include <sys/zio_checksum.h> 39 #include <sys/zio_compress.h> 40 #include <sys/dmu.h> 41 #include <sys/dmu_tx.h> 42 #include <sys/zap.h> 43 #include <sys/zil.h> 44 #include <sys/vdev_impl.h> 45 #include <sys/metaslab.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/txg.h> 48 #include <sys/avl.h> 49 #include <sys/dmu_traverse.h> 50 #include <sys/unique.h> 51 #include <sys/dsl_pool.h> 52 #include <sys/dsl_dir.h> 53 #include <sys/dsl_prop.h> 54 #include <sys/fs/zfs.h> 55 #include <sys/callb.h> 56 57 static uint32_t spa_active_count; 58 59 /* 60 * ========================================================================== 61 * SPA state manipulation (open/create/destroy/import/export) 62 * ========================================================================== 63 */ 64 65 static int 66 spa_error_entry_compare(const void *a, const void *b) 67 { 68 spa_error_entry_t *sa = (spa_error_entry_t *)a; 69 spa_error_entry_t *sb = (spa_error_entry_t *)b; 70 int ret; 71 72 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 73 sizeof (zbookmark_t)); 74 75 if (ret < 0) 76 return (-1); 77 else if (ret > 0) 78 return (1); 79 else 80 return (0); 81 } 82 83 /* 84 * Utility function which retrieves copies of the current logs and 85 * re-initializes them in the process. 86 */ 87 void 88 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 89 { 90 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 91 92 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 93 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 94 95 avl_create(&spa->spa_errlist_scrub, 96 spa_error_entry_compare, sizeof (spa_error_entry_t), 97 offsetof(spa_error_entry_t, se_avl)); 98 avl_create(&spa->spa_errlist_last, 99 spa_error_entry_compare, sizeof (spa_error_entry_t), 100 offsetof(spa_error_entry_t, se_avl)); 101 } 102 103 /* 104 * Activate an uninitialized pool. 
*/
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	/* One issue taskq and one interrupt taskq per zio type. */
	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	/* Both error lists are AVL trees ordered by spa_error_entry_compare. */
	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 *
 * Syncing must already be stopped and the dsl pool and root vdev must
 * already be released (see the assertions); spa_unload() does that.
 * Leaves the spa in POOL_STATE_UNINITIALIZED.
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 *
 * Returns the vdev built from 'nv', or NULL if vdev_alloc() fails, if a
 * non-leaf vdev has no ZPOOL_CONFIG_CHILDREN array, or if any child fails to
 * parse.  On those later failures the partially built vdev is released with
 * vdev_free() before returning.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	/* Leaf vdevs have no children to recurse into. */
	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	/* Each child is parsed with 'vd' as its parent and 'c' as its id. */
	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev) {
		vdev_free(spa->spa_root_vdev);
		spa->spa_root_vdev = NULL;
	}

	/* Clear the suspend count raised by spa_async_suspend() above. */
	spa->spa_async_suspended = 0;
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 *
 * 'config' supplies the vdev tree and pool guid.  'state' is recorded in
 * spa_load_state for the duration of the load.  When 'mosconfig' is false,
 * the packed config stored in the pool's config object is read, installed
 * with spa_config_set(), and the load is restarted from scratch with
 * mosconfig set to B_TRUE.
 *
 * Returns 0 on success or an errno value.  On failure an FM ereport is
 * posted; the caller is responsible for spa_unload()/spa_deactivate().
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	spa->spa_load_state = state;
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/* The config txg is optional; a failed lookup is ignored. */
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	/* Refuse to import a pool whose guid is already known. */
	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (rvd == NULL) {
		error = EINVAL;
		goto out;
	}

	spa->spa_root_vdev = rvd;
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	/* The wait result is superseded by the ub_txg check below. */
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > UBERBLOCK_VERSION) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.  This is only enforced when mosconfig
	 * is set.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		/* The bonus buffer holds the size of the packed nvlist. */
		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db));
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);

		/*
		 * Tear everything down and start over using the config
		 * that was just read from the pool.
		 */
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the vdev state for all top level vdevs.  We need to grab the
	 * config lock because all label I/O is done with the
	 * ZIO_FLAG_CONFIG_HELD flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	if ((error = vdev_load(rvd)) != 0) {
		spa_config_exit(spa, FTAG);
		goto out;
	}
	spa_config_exit(spa, FTAG);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Claim log blocks that haven't been committed yet, and update all
	 * top-level vdevs to sync any config changes found in vdev_load().
	 * This must all happen in a single txg.
	 */
	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		vdev_config_dirty(rvd);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}

	/* Discard a leftover ENOENT from the optional error-log lookups. */
	error = 0;
out:
	if (error)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time as opening the pool, without having to keep around the spa_t in
 * some ambiguous state.
 *
 * On success returns 0, stores the held spa_t in *spapp (the hold is tagged
 * with 'tag'), and, if 'config' is non-NULL, stores a freshly generated
 * config in *config.  On failure returns an errno value and leaves *spapp
 * NULL; if the load failed after the vdev tree was built, *config may still
 * be set so the caller can see the per-vdev state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config,
		    SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	/* Take the caller's hold before dropping the namespace lock. */
	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Open the named pool without requesting its configuration.  See
 * spa_open_common() for the return contract.
 */
int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 *
 * Returns NULL if no pool with that name exists.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

/*
 * Decrement the inject count on 'spa' under the namespace lock; the
 * counterpart of spa_inject_addref().
 */
void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Open the named pool long enough to return its configuration (with the
 * error count added as ZPOOL_CONFIG_ERRCOUNT) in *config and, if 'altroot'
 * is non-NULL, its alternate root in the 'buflen'-byte buffer 'altroot'.
 *
 * Returns the result of spa_open_common().  *config is set to NULL first,
 * so it is NULL on return unless spa_open_common() supplied a config.
 */
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL)
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			/* No hold was taken here, so skip spa_close(). */
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 *
 * Creates a pool named 'pool' from the vdev tree described by 'nvroot',
 * optionally recording 'altroot' as its alternate root.  Returns EEXIST if
 * the name is already taken, the error from spa_vdev_add() if the vdevs
 * cannot be added, and 0 on success.
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}
	spa = spa_add(pool);

	/*
	 * Allocate a new spa_t structure.
	 */
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	error = spa_vdev_add(spa, nvroot);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (altroot != NULL) {
		spa->spa_root = spa_strdup(altroot);
		atomic_add_32(&spa_active_count, 1);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 *
 * Returns EROFS if the SPA was not opened for writing, EEXIST if a pool
 * named 'pool' already exists, the error from spa_load() if the load fails,
 * and 0 on success.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so that we don't try to open the pool if the config is damaged.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot != NULL) {
		atomic_add_32(&spa_active_count, 1);
		spa->spa_root = spa_strdup(altroot);
	}

	/*
	 * Initialize the config based on the in-core state.
	 */
	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

	spa_config_set(spa, config);

	/*
	 * Sync the configuration cache.
	 */
	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * Temporarily load the pool described by 'tryconfig' under TRYIMPORT_NAME
 * and return the config generated from the resulting in-core state, with
 * the pool name and state from 'tryconfig' added back.  The temporary spa_t
 * is always unloaded and removed before returning.
 *
 * Returns NULL if 'tryconfig' lacks a pool name or pool state, or if the
 * load did not get far enough to build a root vdev.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME);

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	/*
	 * Initialize the spa_t structure.
	 */
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so we don't try to open the pool if the config is damaged.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 *
 * 'new_state' is POOL_STATE_DESTROYED, POOL_STATE_EXPORTED, or
 * POOL_STATE_UNINITIALIZED; the last one (used by spa_reset()) unloads the
 * pool but leaves it in the namespace.  Returns EROFS if the SPA was not
 * opened for writing, ENOENT if the pool does not exist, EBUSY if it is
 * still referenced, and 0 on success.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		if (spa->spa_root != NULL)
			atomic_add_32(&spa_active_count, -1);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa->spa_state = new_state;
			vdev_config_dirty(spa->spa_root_vdev);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	/* A reset keeps the pool in the namespace and the config cache. */
	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
1033 */ 1034 int 1035 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1036 { 1037 uint64_t txg; 1038 int c, error; 1039 vdev_t *rvd = spa->spa_root_vdev; 1040 vdev_t *vd; 1041 1042 txg = spa_vdev_enter(spa); 1043 1044 vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1045 1046 if (vd == NULL) 1047 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1048 1049 if (rvd == NULL) /* spa_create() */ 1050 spa->spa_root_vdev = rvd = vd; 1051 1052 if ((error = vdev_create(vd, txg)) != 0) 1053 return (spa_vdev_exit(spa, vd, txg, error)); 1054 1055 /* 1056 * Transfer each top-level vdev from the temporary root 1057 * to the spa's root and initialize its metaslabs. 1058 */ 1059 for (c = 0; c < vd->vdev_children; c++) { 1060 vdev_t *tvd = vd->vdev_child[c]; 1061 if (vd != rvd) { 1062 vdev_remove_child(vd, tvd); 1063 tvd->vdev_id = rvd->vdev_children; 1064 vdev_add_child(rvd, tvd); 1065 } 1066 if ((error = vdev_init(tvd, txg)) != 0) 1067 return (spa_vdev_exit(spa, vd, txg, error)); 1068 vdev_config_dirty(tvd); 1069 } 1070 1071 /* 1072 * Update the config based on the new in-core state. 1073 */ 1074 spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1075 1076 return (spa_vdev_exit(spa, vd, txg, 0)); 1077 } 1078 1079 /* 1080 * Attach a device to a mirror. The arguments are the path to any device 1081 * in the mirror, and the nvroot for the new device. If the path specifies 1082 * a device that is not mirrored, we automatically insert the mirror vdev. 1083 * 1084 * If 'replacing' is specified, the new device is intended to replace the 1085 * existing device; in this case the two devices are made into their own 1086 * mirror using the 'replacing' vdev, which is functionally idendical to 1087 * the mirror vdev (it actually reuses all the same ops) but has a few 1088 * extra rules: you can't attach to it after it's been created, and upon 1089 * completion of resilvering, the first disk (the one being replaced) 1090 * is automatically detached. 
*/
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	/* The parent type the pair must end up under. */
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	/* The description must contain exactly one new device ... */
	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	/* ... and that device must be a leaf. */
	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/* A mismatched ashift is tolerated only if oldvd's is still zero. */
	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		/* +5 covers the '/' separator, "old", and the trailing NUL. */
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

	/*
	 * Note that open_txg, not txg, is handed to spa_vdev_exit() here,
	 * and that its return value is deliberately ignored.
	 * NOTE(review): presumably this makes the exit wait for open_txg
	 * before releasing newrootvd -- confirm against spa_vdev_exit().
	 */
	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
1270 */ 1271 for (c = 0; c < pvd->vdev_children; c++) { 1272 uint64_t dirty; 1273 1274 cvd = pvd->vdev_child[c]; 1275 if (cvd == vd) 1276 continue; 1277 if (vdev_is_dead(cvd)) 1278 continue; 1279 mutex_enter(&cvd->vdev_dtl_lock); 1280 dirty = cvd->vdev_dtl_map.sm_space | 1281 cvd->vdev_dtl_scrub.sm_space; 1282 mutex_exit(&cvd->vdev_dtl_lock); 1283 if (!dirty) 1284 break; 1285 } 1286 if (c == pvd->vdev_children) 1287 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1288 1289 /* 1290 * Erase the disk labels so the disk can be used for other things. 1291 * This must be done after all other error cases are handled, 1292 * but before we disembowel vd (so we can still do I/O to it). 1293 * But if we can't do it, don't treat the error as fatal -- 1294 * it may be that the unwritability of the disk is the reason 1295 * it's being detached! 1296 */ 1297 error = vdev_label_init(vd, 0); 1298 if (error) 1299 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1300 1301 /* 1302 * Remove vd from its parent and compact the parent's children. 1303 */ 1304 vdev_remove_child(pvd, vd); 1305 vdev_compact_children(pvd); 1306 1307 /* 1308 * Remember one of the remaining children so we can get tvd below. 1309 */ 1310 cvd = pvd->vdev_child[0]; 1311 1312 /* 1313 * If the parent mirror/replacing vdev only has one child, 1314 * the parent is no longer needed. Remove it from the tree. 1315 */ 1316 if (pvd->vdev_children == 1) 1317 vdev_remove_parent(cvd); 1318 1319 /* 1320 * We don't set tvd until now because the parent we just removed 1321 * may have been the previous top-level vdev. 1322 */ 1323 tvd = cvd->vdev_top; 1324 ASSERT(tvd->vdev_parent == rvd); 1325 1326 /* 1327 * Reopen this top-level vdev to reassess health after detach. 1328 */ 1329 vdev_reopen(tvd); 1330 1331 /* 1332 * If the device we just detached was smaller than the others, 1333 * it may be possible to add metaslabs (i.e. grow the pool). 
We ignore 1334 * the error here because the detach still succeeded - we just weren't 1335 * able to reinitialize the metaslabs. This pool is in for a world of 1336 * hurt, in any case. 1337 */ 1338 (void) vdev_metaslab_init(tvd, txg); 1339 1340 /* 1341 * Update the config based on the new in-core state. 1342 */ 1343 spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1344 1345 vdev_config_dirty(tvd); 1346 1347 /* 1348 * Mark vd's DTL as dirty in this txg. 1349 * vdev_dtl_sync() will see that vd->vdev_detached is set 1350 * and free vd's DTL object in syncing context. 1351 * But first make sure we're not on any *other* txg's DTL list, 1352 * to prevent vd from being accessed after it's freed. 1353 */ 1354 vdev_dirty(tvd, VDD_DTL, txg); 1355 vd->vdev_detached = B_TRUE; 1356 for (t = 0; t < TXG_SIZE; t++) 1357 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1358 (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1359 1360 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1361 1362 return (spa_vdev_exit(spa, vd, txg, 0)); 1363 } 1364 1365 /* 1366 * Find any device that's done replacing, so we can detach it. 
1367 */ 1368 static vdev_t * 1369 spa_vdev_replace_done_hunt(vdev_t *vd) 1370 { 1371 vdev_t *newvd, *oldvd; 1372 int c; 1373 1374 for (c = 0; c < vd->vdev_children; c++) { 1375 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1376 if (oldvd != NULL) 1377 return (oldvd); 1378 } 1379 1380 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1381 oldvd = vd->vdev_child[0]; 1382 newvd = vd->vdev_child[1]; 1383 1384 mutex_enter(&newvd->vdev_dtl_lock); 1385 if (newvd->vdev_dtl_map.sm_space == 0 && 1386 newvd->vdev_dtl_scrub.sm_space == 0) { 1387 mutex_exit(&newvd->vdev_dtl_lock); 1388 return (oldvd); 1389 } 1390 mutex_exit(&newvd->vdev_dtl_lock); 1391 } 1392 1393 return (NULL); 1394 } 1395 1396 static void 1397 spa_vdev_replace_done(spa_t *spa) 1398 { 1399 vdev_t *vd; 1400 uint64_t guid; 1401 1402 spa_config_enter(spa, RW_READER, FTAG); 1403 1404 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 1405 guid = vd->vdev_guid; 1406 spa_config_exit(spa, FTAG); 1407 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 1408 return; 1409 spa_config_enter(spa, RW_READER, FTAG); 1410 } 1411 1412 spa_config_exit(spa, FTAG); 1413 } 1414 1415 /* 1416 * Update the stored path for this vdev. Dirty the vdev configuration, relying 1417 * on spa_vdev_enter/exit() to synchronize the labels and cache. 
1418 */ 1419 int 1420 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 1421 { 1422 vdev_t *rvd, *vd; 1423 uint64_t txg; 1424 1425 rvd = spa->spa_root_vdev; 1426 1427 txg = spa_vdev_enter(spa); 1428 1429 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1430 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 1431 1432 spa_strfree(vd->vdev_path); 1433 vd->vdev_path = spa_strdup(newpath); 1434 1435 spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1436 1437 vdev_config_dirty(vd->vdev_top); 1438 1439 return (spa_vdev_exit(spa, NULL, txg, 0)); 1440 } 1441 1442 /* 1443 * ========================================================================== 1444 * SPA Scrubbing 1445 * ========================================================================== 1446 */ 1447 1448 void 1449 spa_scrub_throttle(spa_t *spa, int direction) 1450 { 1451 mutex_enter(&spa->spa_scrub_lock); 1452 spa->spa_scrub_throttled += direction; 1453 ASSERT(spa->spa_scrub_throttled >= 0); 1454 if (spa->spa_scrub_throttled == 0) 1455 cv_broadcast(&spa->spa_scrub_io_cv); 1456 mutex_exit(&spa->spa_scrub_lock); 1457 } 1458 1459 static void 1460 spa_scrub_io_done(zio_t *zio) 1461 { 1462 spa_t *spa = zio->io_spa; 1463 1464 zio_buf_free(zio->io_data, zio->io_size); 1465 1466 mutex_enter(&spa->spa_scrub_lock); 1467 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1468 vdev_t *vd = zio->io_vd; 1469 spa->spa_scrub_errors++; 1470 mutex_enter(&vd->vdev_stat_lock); 1471 vd->vdev_stat.vs_scrub_errors++; 1472 mutex_exit(&vd->vdev_stat_lock); 1473 } 1474 if (--spa->spa_scrub_inflight == 0) { 1475 cv_broadcast(&spa->spa_scrub_io_cv); 1476 ASSERT(spa->spa_scrub_throttled == 0); 1477 } 1478 mutex_exit(&spa->spa_scrub_lock); 1479 } 1480 1481 static void 1482 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 1483 zbookmark_t *zb) 1484 { 1485 size_t size = BP_GET_LSIZE(bp); 1486 void *data = zio_buf_alloc(size); 1487 1488 mutex_enter(&spa->spa_scrub_lock); 1489 
spa->spa_scrub_inflight++; 1490 mutex_exit(&spa->spa_scrub_lock); 1491 1492 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 1493 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 1494 1495 flags |= ZIO_FLAG_CANFAIL; 1496 1497 zio_nowait(zio_read(NULL, spa, bp, data, size, 1498 spa_scrub_io_done, NULL, priority, flags, zb)); 1499 } 1500 1501 /* ARGSUSED */ 1502 static int 1503 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1504 { 1505 blkptr_t *bp = &bc->bc_blkptr; 1506 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1507 1508 if (bc->bc_errno || vd == NULL) { 1509 /* 1510 * We can't scrub this block, but we can continue to scrub 1511 * the rest of the pool. Note the error and move along. 1512 */ 1513 mutex_enter(&spa->spa_scrub_lock); 1514 spa->spa_scrub_errors++; 1515 mutex_exit(&spa->spa_scrub_lock); 1516 1517 if (vd != NULL) { 1518 mutex_enter(&vd->vdev_stat_lock); 1519 vd->vdev_stat.vs_scrub_errors++; 1520 mutex_exit(&vd->vdev_stat_lock); 1521 } 1522 1523 return (ERESTART); 1524 } 1525 1526 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1527 1528 /* 1529 * Keep track of how much data we've examined so that 1530 * zpool(1M) status can make useful progress reports. 1531 */ 1532 mutex_enter(&vd->vdev_stat_lock); 1533 vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1534 mutex_exit(&vd->vdev_stat_lock); 1535 1536 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1537 if (DVA_GET_GANG(&bp->blk_dva[0])) { 1538 /* 1539 * Gang members may be spread across multiple vdevs, 1540 * so the best we can do is look at the pool-wide DTL. 1541 * XXX -- it would be better to change our allocation 1542 * policy to ensure that this can't happen. 
1543 */ 1544 vd = spa->spa_root_vdev; 1545 } 1546 if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1547 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1548 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 1549 } 1550 } else { 1551 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1552 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 1553 } 1554 1555 return (0); 1556 } 1557 1558 static void 1559 spa_scrub_thread(spa_t *spa) 1560 { 1561 callb_cpr_t cprinfo; 1562 traverse_handle_t *th = spa->spa_scrub_th; 1563 vdev_t *rvd = spa->spa_root_vdev; 1564 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1565 int error = 0; 1566 boolean_t complete; 1567 1568 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1569 1570 /* 1571 * If we're restarting due to a snapshot create/delete, 1572 * wait for that to complete. 1573 */ 1574 txg_wait_synced(spa_get_dsl(spa), 0); 1575 1576 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 1577 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1578 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 1579 1580 spa_config_enter(spa, RW_WRITER, FTAG); 1581 vdev_reopen(rvd); /* purge all vdev caches */ 1582 vdev_config_dirty(rvd); /* rewrite all disk labels */ 1583 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1584 spa_config_exit(spa, FTAG); 1585 1586 mutex_enter(&spa->spa_scrub_lock); 1587 spa->spa_scrub_errors = 0; 1588 spa->spa_scrub_active = 1; 1589 ASSERT(spa->spa_scrub_inflight == 0); 1590 ASSERT(spa->spa_scrub_throttled == 0); 1591 1592 while (!spa->spa_scrub_stop) { 1593 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1594 while (spa->spa_scrub_suspended) { 1595 spa->spa_scrub_active = 0; 1596 cv_broadcast(&spa->spa_scrub_cv); 1597 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1598 spa->spa_scrub_active = 1; 1599 } 1600 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1601 1602 if (spa->spa_scrub_restart_txg != 0) 1603 break; 1604 1605 mutex_exit(&spa->spa_scrub_lock); 1606 error = traverse_more(th); 1607 
mutex_enter(&spa->spa_scrub_lock); 1608 if (error != EAGAIN) 1609 break; 1610 1611 while (spa->spa_scrub_throttled > 0) 1612 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1613 } 1614 1615 while (spa->spa_scrub_inflight) 1616 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1617 1618 if (spa->spa_scrub_restart_txg != 0) 1619 error = ERESTART; 1620 1621 if (spa->spa_scrub_stop) 1622 error = EINTR; 1623 1624 spa->spa_scrub_active = 0; 1625 cv_broadcast(&spa->spa_scrub_cv); 1626 1627 /* 1628 * Even if there were uncorrectable errors, we consider the scrub 1629 * completed. The downside is that if there is a transient error during 1630 * a resilver, we won't resilver the data properly to the target. But 1631 * if the damage is permanent (more likely) we will resilver forever, 1632 * which isn't really acceptable. Since there is enough information for 1633 * the user to know what has failed and why, this seems like a more 1634 * tractable approach. 1635 */ 1636 complete = (error == 0); 1637 1638 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1639 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1640 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1641 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1642 1643 mutex_exit(&spa->spa_scrub_lock); 1644 1645 /* 1646 * If the scrub/resilver completed, update all DTLs to reflect this. 1647 * Whether it succeeded or not, vacate all temporary scrub DTLs. 1648 */ 1649 spa_config_enter(spa, RW_WRITER, FTAG); 1650 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1651 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1652 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1653 spa_errlog_rotate(spa); 1654 spa_config_exit(spa, FTAG); 1655 1656 mutex_enter(&spa->spa_scrub_lock); 1657 1658 /* 1659 * We may have finished replacing a device. 1660 * Let the async thread assess this and handle the detach. 
1661 */ 1662 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1663 1664 /* 1665 * If we were told to restart, our final act is to start a new scrub. 1666 */ 1667 if (error == ERESTART) 1668 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 1669 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 1670 1671 spa->spa_scrub_type = POOL_SCRUB_NONE; 1672 spa->spa_scrub_active = 0; 1673 spa->spa_scrub_thread = NULL; 1674 cv_broadcast(&spa->spa_scrub_cv); 1675 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1676 thread_exit(); 1677 } 1678 1679 void 1680 spa_scrub_suspend(spa_t *spa) 1681 { 1682 mutex_enter(&spa->spa_scrub_lock); 1683 spa->spa_scrub_suspended++; 1684 while (spa->spa_scrub_active) { 1685 cv_broadcast(&spa->spa_scrub_cv); 1686 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1687 } 1688 while (spa->spa_scrub_inflight) 1689 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1690 mutex_exit(&spa->spa_scrub_lock); 1691 } 1692 1693 void 1694 spa_scrub_resume(spa_t *spa) 1695 { 1696 mutex_enter(&spa->spa_scrub_lock); 1697 ASSERT(spa->spa_scrub_suspended != 0); 1698 if (--spa->spa_scrub_suspended == 0) 1699 cv_broadcast(&spa->spa_scrub_cv); 1700 mutex_exit(&spa->spa_scrub_lock); 1701 } 1702 1703 void 1704 spa_scrub_restart(spa_t *spa, uint64_t txg) 1705 { 1706 /* 1707 * Something happened (e.g. snapshot create/delete) that means 1708 * we must restart any in-progress scrubs. The itinerary will 1709 * fix this properly. 
1710 */ 1711 mutex_enter(&spa->spa_scrub_lock); 1712 spa->spa_scrub_restart_txg = txg; 1713 mutex_exit(&spa->spa_scrub_lock); 1714 } 1715 1716 int 1717 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1718 { 1719 space_seg_t *ss; 1720 uint64_t mintxg, maxtxg; 1721 vdev_t *rvd = spa->spa_root_vdev; 1722 int advance = ADVANCE_PRE | ADVANCE_ZIL; 1723 1724 if ((uint_t)type >= POOL_SCRUB_TYPES) 1725 return (ENOTSUP); 1726 1727 mutex_enter(&spa->spa_scrub_lock); 1728 1729 /* 1730 * If there's a scrub or resilver already in progress, stop it. 1731 */ 1732 while (spa->spa_scrub_thread != NULL) { 1733 /* 1734 * Don't stop a resilver unless forced. 1735 */ 1736 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 1737 mutex_exit(&spa->spa_scrub_lock); 1738 return (EBUSY); 1739 } 1740 spa->spa_scrub_stop = 1; 1741 cv_broadcast(&spa->spa_scrub_cv); 1742 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1743 } 1744 1745 /* 1746 * Terminate the previous traverse. 1747 */ 1748 if (spa->spa_scrub_th != NULL) { 1749 traverse_fini(spa->spa_scrub_th); 1750 spa->spa_scrub_th = NULL; 1751 } 1752 1753 if (rvd == NULL) { 1754 ASSERT(spa->spa_scrub_stop == 0); 1755 ASSERT(spa->spa_scrub_type == type); 1756 ASSERT(spa->spa_scrub_restart_txg == 0); 1757 mutex_exit(&spa->spa_scrub_lock); 1758 return (0); 1759 } 1760 1761 mintxg = TXG_INITIAL - 1; 1762 maxtxg = spa_last_synced_txg(spa) + 1; 1763 1764 mutex_enter(&rvd->vdev_dtl_lock); 1765 1766 if (rvd->vdev_dtl_map.sm_space == 0) { 1767 /* 1768 * The pool-wide DTL is empty. 1769 * If this is a resilver, there's nothing to do. 1770 */ 1771 if (type == POOL_SCRUB_RESILVER) 1772 type = POOL_SCRUB_NONE; 1773 } else { 1774 /* 1775 * The pool-wide DTL is non-empty. 1776 * If this is a normal scrub, upgrade to a resilver instead. 1777 */ 1778 if (type == POOL_SCRUB_EVERYTHING) 1779 type = POOL_SCRUB_RESILVER; 1780 } 1781 1782 if (type == POOL_SCRUB_RESILVER) { 1783 /* 1784 * Determine the resilvering boundaries. 
1785 * 1786 * Note: (mintxg, maxtxg) is an open interval, 1787 * i.e. mintxg and maxtxg themselves are not included. 1788 * 1789 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1790 * so we don't claim to resilver a txg that's still changing. 1791 */ 1792 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1793 mintxg = ss->ss_start - 1; 1794 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1795 maxtxg = MIN(ss->ss_end, maxtxg); 1796 1797 advance |= ADVANCE_PRUNE; 1798 } 1799 1800 mutex_exit(&rvd->vdev_dtl_lock); 1801 1802 spa->spa_scrub_stop = 0; 1803 spa->spa_scrub_type = type; 1804 spa->spa_scrub_restart_txg = 0; 1805 1806 if (type != POOL_SCRUB_NONE) { 1807 spa->spa_scrub_mintxg = mintxg; 1808 spa->spa_scrub_maxtxg = maxtxg; 1809 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1810 advance, ZIO_FLAG_CANFAIL); 1811 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1812 spa->spa_scrub_thread = thread_create(NULL, 0, 1813 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1814 } 1815 1816 mutex_exit(&spa->spa_scrub_lock); 1817 1818 return (0); 1819 } 1820 1821 /* 1822 * ========================================================================== 1823 * SPA async task processing 1824 * ========================================================================== 1825 */ 1826 1827 static void 1828 spa_async_reopen(spa_t *spa) 1829 { 1830 vdev_t *rvd = spa->spa_root_vdev; 1831 vdev_t *tvd; 1832 int c; 1833 1834 spa_config_enter(spa, RW_WRITER, FTAG); 1835 1836 for (c = 0; c < rvd->vdev_children; c++) { 1837 tvd = rvd->vdev_child[c]; 1838 if (tvd->vdev_reopen_wanted) { 1839 tvd->vdev_reopen_wanted = 0; 1840 vdev_reopen(tvd); 1841 } 1842 } 1843 1844 spa_config_exit(spa, FTAG); 1845 } 1846 1847 static void 1848 spa_async_thread(spa_t *spa) 1849 { 1850 int tasks; 1851 1852 ASSERT(spa->spa_sync_on); 1853 1854 mutex_enter(&spa->spa_async_lock); 1855 tasks = spa->spa_async_tasks; 1856 spa->spa_async_tasks = 0; 1857 mutex_exit(&spa->spa_async_lock); 1858 1859 /* 
1860 * See if any devices need to be reopened. 1861 */ 1862 if (tasks & SPA_ASYNC_REOPEN) 1863 spa_async_reopen(spa); 1864 1865 /* 1866 * If any devices are done replacing, detach them. 1867 */ 1868 if (tasks & SPA_ASYNC_REPLACE_DONE) 1869 spa_vdev_replace_done(spa); 1870 1871 /* 1872 * Kick off a scrub. 1873 */ 1874 if (tasks & SPA_ASYNC_SCRUB) 1875 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 1876 1877 /* 1878 * Kick off a resilver. 1879 */ 1880 if (tasks & SPA_ASYNC_RESILVER) 1881 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1882 1883 /* 1884 * Let the world know that we're done. 1885 */ 1886 mutex_enter(&spa->spa_async_lock); 1887 spa->spa_async_thread = NULL; 1888 cv_broadcast(&spa->spa_async_cv); 1889 mutex_exit(&spa->spa_async_lock); 1890 thread_exit(); 1891 } 1892 1893 void 1894 spa_async_suspend(spa_t *spa) 1895 { 1896 mutex_enter(&spa->spa_async_lock); 1897 spa->spa_async_suspended++; 1898 while (spa->spa_async_thread != NULL) 1899 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 1900 mutex_exit(&spa->spa_async_lock); 1901 } 1902 1903 void 1904 spa_async_resume(spa_t *spa) 1905 { 1906 mutex_enter(&spa->spa_async_lock); 1907 ASSERT(spa->spa_async_suspended != 0); 1908 spa->spa_async_suspended--; 1909 mutex_exit(&spa->spa_async_lock); 1910 } 1911 1912 static void 1913 spa_async_dispatch(spa_t *spa) 1914 { 1915 mutex_enter(&spa->spa_async_lock); 1916 if (spa->spa_async_tasks && !spa->spa_async_suspended && 1917 spa->spa_async_thread == NULL) 1918 spa->spa_async_thread = thread_create(NULL, 0, 1919 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 1920 mutex_exit(&spa->spa_async_lock); 1921 } 1922 1923 void 1924 spa_async_request(spa_t *spa, int task) 1925 { 1926 mutex_enter(&spa->spa_async_lock); 1927 spa->spa_async_tasks |= task; 1928 mutex_exit(&spa->spa_async_lock); 1929 } 1930 1931 /* 1932 * ========================================================================== 1933 * SPA syncing routines 1934 * 
========================================================================== 1935 */ 1936 1937 static void 1938 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 1939 { 1940 bplist_t *bpl = &spa->spa_sync_bplist; 1941 dmu_tx_t *tx; 1942 blkptr_t blk; 1943 uint64_t itor = 0; 1944 zio_t *zio; 1945 int error; 1946 uint8_t c = 1; 1947 1948 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 1949 1950 while (bplist_iterate(bpl, &itor, &blk) == 0) 1951 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 1952 1953 error = zio_wait(zio); 1954 ASSERT3U(error, ==, 0); 1955 1956 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1957 bplist_vacate(bpl, tx); 1958 1959 /* 1960 * Pre-dirty the first block so we sync to convergence faster. 1961 * (Usually only the first block is needed.) 1962 */ 1963 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 1964 dmu_tx_commit(tx); 1965 } 1966 1967 static void 1968 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 1969 { 1970 nvlist_t *config; 1971 char *packed = NULL; 1972 size_t nvsize = 0; 1973 dmu_buf_t *db; 1974 1975 if (list_is_empty(&spa->spa_dirty_list)) 1976 return; 1977 1978 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 1979 1980 spa_config_set(spa, config); 1981 1982 VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 1983 1984 packed = kmem_alloc(nvsize, KM_SLEEP); 1985 1986 VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 1987 KM_SLEEP) == 0); 1988 1989 dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 1990 packed, tx); 1991 1992 kmem_free(packed, nvsize); 1993 1994 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 1995 spa->spa_config_object, FTAG, &db)); 1996 dmu_buf_will_dirty(db, tx); 1997 *(uint64_t *)db->db_data = nvsize; 1998 dmu_buf_rele(db, FTAG); 1999 } 2000 2001 /* 2002 * Sync the specified transaction group. New blocks may be dirtied as 2003 * part of the process, so we iterate until it converges. 
2004 */ 2005 void 2006 spa_sync(spa_t *spa, uint64_t txg) 2007 { 2008 dsl_pool_t *dp = spa->spa_dsl_pool; 2009 objset_t *mos = spa->spa_meta_objset; 2010 bplist_t *bpl = &spa->spa_sync_bplist; 2011 vdev_t *vd; 2012 dmu_tx_t *tx; 2013 int dirty_vdevs; 2014 2015 /* 2016 * Lock out configuration changes. 2017 */ 2018 spa_config_enter(spa, RW_READER, FTAG); 2019 2020 spa->spa_syncing_txg = txg; 2021 spa->spa_sync_pass = 0; 2022 2023 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2024 2025 /* 2026 * If anything has changed in this txg, push the deferred frees 2027 * from the previous txg. If not, leave them alone so that we 2028 * don't generate work on an otherwise idle system. 2029 */ 2030 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2031 !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2032 spa_sync_deferred_frees(spa, txg); 2033 2034 /* 2035 * Iterate to convergence. 2036 */ 2037 do { 2038 spa->spa_sync_pass++; 2039 2040 tx = dmu_tx_create_assigned(dp, txg); 2041 spa_sync_config_object(spa, tx); 2042 dmu_tx_commit(tx); 2043 2044 spa_errlog_sync(spa, txg); 2045 2046 dsl_pool_sync(dp, txg); 2047 2048 dirty_vdevs = 0; 2049 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2050 vdev_sync(vd, txg); 2051 dirty_vdevs++; 2052 } 2053 2054 tx = dmu_tx_create_assigned(dp, txg); 2055 bplist_sync(bpl, tx); 2056 dmu_tx_commit(tx); 2057 2058 } while (dirty_vdevs); 2059 2060 bplist_close(bpl); 2061 2062 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2063 2064 /* 2065 * Rewrite the vdev configuration (which includes the uberblock) 2066 * to commit the transaction group. 2067 */ 2068 VERIFY(0 == spa_sync_labels(spa, txg)); 2069 2070 /* 2071 * Make a stable copy of the fully synced uberblock. 2072 * We use this as the root for pool traversals. 
2073 */ 2074 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2075 2076 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2077 2078 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2079 spa->spa_traverse_wanted = 0; 2080 spa->spa_ubsync = spa->spa_uberblock; 2081 rw_exit(&spa->spa_traverse_lock); 2082 2083 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2084 2085 /* 2086 * Clean up the ZIL records for the synced txg. 2087 */ 2088 dsl_pool_zil_clean(dp); 2089 2090 /* 2091 * Update usable space statistics. 2092 */ 2093 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2094 vdev_sync_done(vd, txg); 2095 2096 /* 2097 * It had better be the case that we didn't dirty anything 2098 * since spa_sync_labels(). 2099 */ 2100 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2101 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2102 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2103 ASSERT(bpl->bpl_queue == NULL); 2104 2105 spa_config_exit(spa, FTAG); 2106 2107 /* 2108 * If any async tasks have been requested, kick them off. 2109 */ 2110 spa_async_dispatch(spa); 2111 } 2112 2113 /* 2114 * Sync all pools. We don't want to hold the namespace lock across these 2115 * operations, so we take a reference on the spa_t and drop the lock during the 2116 * sync. 
2117 */ 2118 void 2119 spa_sync_allpools(void) 2120 { 2121 spa_t *spa = NULL; 2122 mutex_enter(&spa_namespace_lock); 2123 while ((spa = spa_next(spa)) != NULL) { 2124 if (spa_state(spa) != POOL_STATE_ACTIVE) 2125 continue; 2126 spa_open_ref(spa, FTAG); 2127 mutex_exit(&spa_namespace_lock); 2128 txg_wait_synced(spa_get_dsl(spa), 0); 2129 mutex_enter(&spa_namespace_lock); 2130 spa_close(spa, FTAG); 2131 } 2132 mutex_exit(&spa_namespace_lock); 2133 } 2134 2135 /* 2136 * ========================================================================== 2137 * Miscellaneous routines 2138 * ========================================================================== 2139 */ 2140 2141 int 2142 spa_busy(void) 2143 { 2144 return (spa_active_count != 0); 2145 } 2146 2147 /* 2148 * Remove all pools in the system. 2149 */ 2150 void 2151 spa_evict_all(void) 2152 { 2153 spa_t *spa; 2154 2155 /* 2156 * Remove all cached state. All pools should be closed now, 2157 * so every spa in the AVL tree should be unreferenced. 2158 */ 2159 mutex_enter(&spa_namespace_lock); 2160 while ((spa = spa_next(NULL)) != NULL) { 2161 /* 2162 * Stop async tasks. The async thread may need to detach 2163 * a device that's been replaced, which requires grabbing 2164 * spa_namespace_lock, so we must drop it here. 2165 */ 2166 spa_open_ref(spa, FTAG); 2167 mutex_exit(&spa_namespace_lock); 2168 spa_async_suspend(spa); 2169 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2170 mutex_enter(&spa_namespace_lock); 2171 spa_close(spa, FTAG); 2172 2173 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2174 spa_unload(spa); 2175 spa_deactivate(spa); 2176 } 2177 spa_remove(spa); 2178 } 2179 mutex_exit(&spa_namespace_lock); 2180 } 2181 2182 vdev_t * 2183 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2184 { 2185 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2186 } 2187