1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * This file contains all the routines used when modifying on-disk SPA state. 30 * This includes opening, importing, destroying, exporting a pool, and syncing a 31 * pool. 32 */ 33 34 #include <sys/zfs_context.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zio.h> 38 #include <sys/zio_checksum.h> 39 #include <sys/zio_compress.h> 40 #include <sys/dmu.h> 41 #include <sys/dmu_tx.h> 42 #include <sys/zap.h> 43 #include <sys/zil.h> 44 #include <sys/vdev_impl.h> 45 #include <sys/metaslab.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/txg.h> 48 #include <sys/avl.h> 49 #include <sys/dmu_traverse.h> 50 #include <sys/unique.h> 51 #include <sys/dsl_pool.h> 52 #include <sys/dsl_dir.h> 53 #include <sys/dsl_prop.h> 54 #include <sys/fs/zfs.h> 55 #include <sys/callb.h> 56 57 /* 58 * ========================================================================== 59 * SPA state manipulation (open/create/destroy/import/export) 60 * ========================================================================== 61 */ 62 63 static int 64 spa_error_entry_compare(const void *a, const void *b) 65 { 66 spa_error_entry_t *sa = (spa_error_entry_t *)a; 67 spa_error_entry_t *sb = (spa_error_entry_t *)b; 68 int ret; 69 70 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 71 sizeof (zbookmark_t)); 72 73 if (ret < 0) 74 return (-1); 75 else if (ret > 0) 76 return (1); 77 else 78 return (0); 79 } 80 81 /* 82 * Utility function which retrieves copies of the current logs and 83 * re-initializes them in the process. 84 */ 85 void 86 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 87 { 88 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 89 90 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 91 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 92 93 avl_create(&spa->spa_errlist_scrub, 94 spa_error_entry_compare, sizeof (spa_error_entry_t), 95 offsetof(spa_error_entry_t, se_avl)); 96 avl_create(&spa->spa_errlist_last, 97 spa_error_entry_compare, sizeof (spa_error_entry_t), 98 offsetof(spa_error_entry_t, se_avl)); 99 } 100 101 /* 102 * Activate an uninitialized pool. 
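* This sets up everything that spa_deactivate() later tears down: the metaslab class, the per-ZIO-type issue and intr taskqs, the traverse lock, the dirty-vdev list, the per-txg vdev list, and the two error-entry AVL trees.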
103 */ 104 static void 105 spa_activate(spa_t *spa) 106 { 107 int t; 108 109 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 110 111 spa->spa_state = POOL_STATE_ACTIVE; 112 113 spa->spa_normal_class = metaslab_class_create(); 114 115 for (t = 0; t < ZIO_TYPES; t++) { 116 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 117 8, maxclsyspri, 50, INT_MAX, 118 TASKQ_PREPOPULATE); 119 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 120 8, maxclsyspri, 50, INT_MAX, 121 TASKQ_PREPOPULATE); 122 } 123 124 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 125 126 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 127 offsetof(vdev_t, vdev_dirty_node)); 128 129 txg_list_create(&spa->spa_vdev_txg_list, 130 offsetof(struct vdev, vdev_txg_node)); 131 132 avl_create(&spa->spa_errlist_scrub, 133 spa_error_entry_compare, sizeof (spa_error_entry_t), 134 offsetof(spa_error_entry_t, se_avl)); 135 avl_create(&spa->spa_errlist_last, 136 spa_error_entry_compare, sizeof (spa_error_entry_t), 137 offsetof(spa_error_entry_t, se_avl)); 138 } 139 140 /* 141 * Opposite of spa_activate(). 142 */ 143 static void 144 spa_deactivate(spa_t *spa) 145 { 146 int t; 147 148 ASSERT(spa->spa_sync_on == B_FALSE); 149 ASSERT(spa->spa_dsl_pool == NULL); 150 ASSERT(spa->spa_root_vdev == NULL); 151 152 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 153 154 txg_list_destroy(&spa->spa_vdev_txg_list); 155 156 list_destroy(&spa->spa_dirty_list); 157 158 rw_destroy(&spa->spa_traverse_lock); 159 160 for (t = 0; t < ZIO_TYPES; t++) { 161 taskq_destroy(spa->spa_zio_issue_taskq[t]); 162 taskq_destroy(spa->spa_zio_intr_taskq[t]); 163 spa->spa_zio_issue_taskq[t] = NULL; 164 spa->spa_zio_intr_taskq[t] = NULL; 165 } 166 167 metaslab_class_destroy(spa->spa_normal_class); 168 spa->spa_normal_class = NULL; 169 170 /* 171 * If this was part of an import or the open otherwise failed, we may 172 * still have errors left in the queues. Empty them just in case. 173 */ 174 spa_errlog_drain(spa); 175 176 avl_destroy(&spa->spa_errlist_scrub); 177 avl_destroy(&spa->spa_errlist_last); 178 179 spa->spa_state = POOL_STATE_UNINITIALIZED; 180 } 181 182 /* 183 * Verify a pool configuration, and construct the vdev tree appropriately. This 184 * will create all the necessary vdevs in the appropriate layout, with each vdev 185 * in the CLOSED state. This will prep the pool before open/creation/import. 186 * All vdev validation is done by the vdev_alloc() routine. 187 */ 188 static vdev_t * 189 spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 190 { 191 nvlist_t **child; 192 uint_t c, children; 193 vdev_t *vd; 194 195 if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 196 return (NULL); 197 198 if (vd->vdev_ops->vdev_op_leaf) 199 return (vd); 200 201 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 202 &child, &children) != 0) { 203 vdev_free(vd); 204 return (NULL); 205 } 206 207 for (c = 0; c < children; c++) { 208 if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 209 vdev_free(vd); 210 return (NULL); 211 } 212 } 213 214 return (vd); 215 } 216 217 /* 218 * Opposite of spa_load(). 219 */ 220 static void 221 spa_unload(spa_t *spa) 222 { 223 /* 224 * Stop async tasks. 225 */ 226 spa_async_suspend(spa); 227 228 /* 229 * Stop syncing. 230 */ 231 if (spa->spa_sync_on) { 232 txg_sync_stop(spa->spa_dsl_pool); 233 spa->spa_sync_on = B_FALSE; 234 } 235 236 /* 237 * Wait for any outstanding prefetch I/O to complete. 
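* Taking the config lock as writer and immediately dropping it is sufficient to wait them out, since the lock cannot be acquired as writer until every reader (including prefetch I/O issued with the lock held) has released it.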
238 */ 239 spa_config_enter(spa, RW_WRITER, FTAG); 240 spa_config_exit(spa, FTAG); 241 242 /* 243 * Close the dsl pool. 244 */ 245 if (spa->spa_dsl_pool) { 246 dsl_pool_close(spa->spa_dsl_pool); 247 spa->spa_dsl_pool = NULL; 248 } 249 250 /* 251 * Close all vdevs. 252 */ 253 if (spa->spa_root_vdev) 254 vdev_free(spa->spa_root_vdev); 255 ASSERT(spa->spa_root_vdev == NULL); 256 257 spa->spa_async_suspended = 0; 258 } 259 260 /* 261 * Load an existing storage pool, using the pool's builtin spa_config as a 262 * source of configuration information. 263 */ 264 static int 265 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 266 { 267 int error = 0; 268 nvlist_t *nvroot = NULL; 269 vdev_t *rvd; 270 uberblock_t *ub = &spa->spa_uberblock; 271 uint64_t config_cache_txg = spa->spa_config_txg; 272 uint64_t pool_guid; 273 zio_t *zio; 274 275 spa->spa_load_state = state; 276 277 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 278 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) || 279 (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 280 &spa->spa_config_txg) && mosconfig)) { 281 error = EINVAL; 282 goto out; 283 } 284 285 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 286 spa_guid_exists(pool_guid, 0)) { 287 error = EEXIST; 288 goto out; 289 } 290 291 /* 292 * Parse the configuration into a vdev tree. 293 */ 294 spa_config_enter(spa, RW_WRITER, FTAG); 295 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 296 spa_config_exit(spa, FTAG); 297 298 if (rvd == NULL) { 299 error = EINVAL; 300 goto out; 301 } 302 303 ASSERT(spa->spa_root_vdev == rvd); 304 ASSERT(spa_guid(spa) == pool_guid); 305 306 /* 307 * Try to open all vdevs, loading each label in the process. 308 */ 309 if (vdev_open(rvd) != 0) { 310 error = ENXIO; 311 goto out; 312 } 313 314 /* 315 * Find the best uberblock. 316 */ 317 bzero(ub, sizeof (uberblock_t)); 318 319 zio = zio_root(spa, NULL, NULL, 320 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 321 vdev_uberblock_load(zio, rvd, ub); 322 error = zio_wait(zio); 323 324 /* 325 * If we weren't able to find a single valid uberblock, return failure. 326 */ 327 if (ub->ub_txg == 0) { 328 error = ENXIO; 329 goto out; 330 } 331 332 /* 333 * If the pool is newer than the code, we can't open it. 334 */ 335 if (ub->ub_version > UBERBLOCK_VERSION) { 336 error = ENOTSUP; 337 goto out; 338 } 339 340 /* 341 * If the vdev guid sum doesn't match the uberblock, we have an 342 * incomplete configuration. 343 */ 344 if (rvd->vdev_guid_sum != ub->ub_guid_sum && (mosconfig || 345 state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT)) { 346 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 347 VDEV_AUX_BAD_GUID_SUM); 348 error = ENXIO; 349 goto out; 350 } 351 352 /* 353 * Initialize internal SPA structures. 
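* This opens the DSL pool at spa_first_txg, caches the meta-objset, and looks up the config object (and, below, the deferred-free bplist and persistent error logs) in the MOS pool directory.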
354 */ 355 spa->spa_state = POOL_STATE_ACTIVE; 356 spa->spa_ubsync = spa->spa_uberblock; 357 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 358 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 359 if (error) { 360 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 361 VDEV_AUX_CORRUPT_DATA); 362 goto out; 363 } 364 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 365 366 if (zap_lookup(spa->spa_meta_objset, 367 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 368 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 369 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 370 VDEV_AUX_CORRUPT_DATA); 371 error = EIO; 372 goto out; 373 } 374 375 if (!mosconfig) { 376 dmu_buf_t *db; 377 char *packed = NULL; 378 size_t nvsize = 0; 379 nvlist_t *newconfig = NULL; 380 381 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 382 spa->spa_config_object, FTAG, &db)); 383 nvsize = *(uint64_t *)db->db_data; 384 dmu_buf_rele(db, FTAG); 385 386 packed = kmem_alloc(nvsize, KM_SLEEP); 387 error = dmu_read(spa->spa_meta_objset, 388 spa->spa_config_object, 0, nvsize, packed); 389 if (error == 0) 390 error = nvlist_unpack(packed, nvsize, &newconfig, 0); 391 kmem_free(packed, nvsize); 392 393 if (error) { 394 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 395 VDEV_AUX_CORRUPT_DATA); 396 error = EIO; 397 goto out; 398 } 399 400 spa_config_set(spa, newconfig); 401 402 spa_unload(spa); 403 spa_deactivate(spa); 404 spa_activate(spa); 405 406 return (spa_load(spa, newconfig, state, B_TRUE)); 407 } 408 409 if (zap_lookup(spa->spa_meta_objset, 410 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 411 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 412 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 413 VDEV_AUX_CORRUPT_DATA); 414 error = EIO; 415 goto out; 416 } 417 418 /* 419 * Load the persistent error log. If we have an older pool, this will 420 * not be present. 421 */ 422 error = zap_lookup(spa->spa_meta_objset, 423 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 424 sizeof (uint64_t), 1, &spa->spa_errlog_last); 425 if (error != 0 &&error != ENOENT) { 426 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 427 VDEV_AUX_CORRUPT_DATA); 428 error = EIO; 429 goto out; 430 } 431 432 error = zap_lookup(spa->spa_meta_objset, 433 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 434 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 435 if (error != 0 && error != ENOENT) { 436 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 437 VDEV_AUX_CORRUPT_DATA); 438 error = EIO; 439 goto out; 440 } 441 442 /* 443 * Load the vdev state for all top level vdevs. We need to grab the 444 * config lock because all label I/O is done with the 445 * ZIO_FLAG_CONFIG_HELD flag. 446 */ 447 spa_config_enter(spa, RW_READER, FTAG); 448 error = vdev_load(rvd); 449 spa_config_exit(spa, FTAG); 450 451 if (error) 452 goto out; 453 454 /* 455 * Propagate the leaf DTLs we just loaded all the way up the tree. 456 */ 457 spa_config_enter(spa, RW_WRITER, FTAG); 458 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 459 spa_config_exit(spa, FTAG); 460 461 /* 462 * Check the state of the root vdev. If it can't be opened, it 463 * indicates one or more toplevel vdevs are faulted. 464 */ 465 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 466 error = ENXIO; 467 goto out; 468 } 469 470 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 471 dmu_tx_t *tx; 472 int need_update = B_FALSE; 473 int c; 474 475 /* 476 * Claim log blocks that haven't been committed yet. 477 * This must all happen in a single txg. 
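* The claim is done by walking every objset in the pool (dmu_objset_find() with zil_claim) under one assigned transaction; we then start syncing and wait for that txg to make it to disk before going any further.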
478 */ 479 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 480 spa_first_txg(spa)); 481 dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 482 dmu_tx_commit(tx); 483 484 spa->spa_sync_on = B_TRUE; 485 txg_sync_start(spa->spa_dsl_pool); 486 487 /* 488 * Wait for all claims to sync. 489 */ 490 txg_wait_synced(spa->spa_dsl_pool, 0); 491 492 /* 493 * If the config cache is stale, or we have uninitialized 494 * metaslabs (see spa_vdev_add()), then update the config. 495 */ 496 if (config_cache_txg != spa->spa_config_txg || 497 state == SPA_LOAD_IMPORT) 498 need_update = B_TRUE; 499 500 for (c = 0; c < rvd->vdev_children; c++) 501 if (rvd->vdev_child[c]->vdev_ms_array == 0) 502 need_update = B_TRUE; 503 504 /* 505 * Update the config cache asynchronously in case we're the 506 * root pool, in which case the config cache isn't writable yet. 507 */ 508 if (need_update) 509 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 510 } 511 512 error = 0; 513 out: 514 if (error) 515 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 516 spa->spa_load_state = SPA_LOAD_NONE; 517 spa->spa_ena = 0; 518 519 return (error); 520 } 521 522 /* 523 * Pool Open/Import 524 * 525 * The import case is identical to an open except that the configuration is sent 526 * down from userland, instead of grabbed from the configuration cache. For the 527 * case of an open, the pool configuration will exist in the 528 * POOL_STATE_UNINITIALIZED state. 529 * 530 * The stats information (gen/count/ustats) is used to gather vdev statistics at 531 * the same time as opening the pool, without having to keep around the spa_t in some 532 * ambiguous state. 533 */ 534 static int 535 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 536 { 537 spa_t *spa; 538 int error; 539 int loaded = B_FALSE; 540 int locked = B_FALSE; 541 542 *spapp = NULL; 543 544 /* 545 * As disgusting as this is, we need to support recursive calls to this 546 * function because dsl_dir_open() is called during spa_load(), and ends 547 * up calling spa_open() again. The real fix is to figure out how to 548 * avoid dsl_dir_open() calling this in the first place. 549 */ 550 if (mutex_owner(&spa_namespace_lock) != curthread) { 551 mutex_enter(&spa_namespace_lock); 552 locked = B_TRUE; 553 } 554 555 if ((spa = spa_lookup(pool)) == NULL) { 556 if (locked) 557 mutex_exit(&spa_namespace_lock); 558 return (ENOENT); 559 } 560 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 561 562 spa_activate(spa); 563 564 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 565 566 if (error == EBADF) { 567 /* 568 * If vdev_load() returns EBADF, it indicates that one 569 * of the vdevs indicates that the pool has been 570 * exported or destroyed. If this is the case, the 571 * config cache is out of sync and we should remove the 572 * pool from the namespace. 573 */ 574 spa_unload(spa); 575 spa_deactivate(spa); 576 spa_remove(spa); 577 spa_config_sync(); 578 if (locked) 579 mutex_exit(&spa_namespace_lock); 580 return (ENOENT); 581 } 582 583 if (error) { 584 /* 585 * We can't open the pool, but we still have useful 586 * information: the state of each vdev after the 587 * attempted vdev_open(). Return this to the user.
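* The config generated below reflects only what the labels told us; the pool itself is unloaded and deactivated again before the error is returned.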
588 */ 589 if (config != NULL && spa->spa_root_vdev != NULL) { 590 spa_config_enter(spa, RW_READER, FTAG); 591 *config = spa_config_generate(spa, NULL, -1ULL, 592 B_TRUE); 593 spa_config_exit(spa, FTAG); 594 } 595 spa_unload(spa); 596 spa_deactivate(spa); 597 spa->spa_last_open_failed = B_TRUE; 598 if (locked) 599 mutex_exit(&spa_namespace_lock); 600 *spapp = NULL; 601 return (error); 602 } else { 603 zfs_post_ok(spa, NULL); 604 spa->spa_last_open_failed = B_FALSE; 605 } 606 607 loaded = B_TRUE; 608 } 609 610 spa_open_ref(spa, tag); 611 if (locked) 612 mutex_exit(&spa_namespace_lock); 613 614 *spapp = spa; 615 616 if (config != NULL) { 617 spa_config_enter(spa, RW_READER, FTAG); 618 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 619 spa_config_exit(spa, FTAG); 620 } 621 622 /* 623 * If we just loaded the pool, resilver anything that's out of date. 624 */ 625 if (loaded && (spa_mode & FWRITE)) 626 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 627 628 return (0); 629 } 630 631 int 632 spa_open(const char *name, spa_t **spapp, void *tag) 633 { 634 return (spa_open_common(name, spapp, tag, NULL)); 635 } 636 637 /* 638 * Lookup the given spa_t, incrementing the inject count in the process, 639 * preventing it from being exported or destroyed. 640 */ 641 spa_t * 642 spa_inject_addref(char *name) 643 { 644 spa_t *spa; 645 646 mutex_enter(&spa_namespace_lock); 647 if ((spa = spa_lookup(name)) == NULL) { 648 mutex_exit(&spa_namespace_lock); 649 return (NULL); 650 } 651 spa->spa_inject_ref++; 652 mutex_exit(&spa_namespace_lock); 653 654 return (spa); 655 } 656 657 void 658 spa_inject_delref(spa_t *spa) 659 { 660 mutex_enter(&spa_namespace_lock); 661 spa->spa_inject_ref--; 662 mutex_exit(&spa_namespace_lock); 663 } 664 665 int 666 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 667 { 668 int error; 669 spa_t *spa; 670 671 *config = NULL; 672 error = spa_open_common(name, &spa, FTAG, config); 673 674 if (spa && *config != NULL) 675 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 676 spa_get_errlog_size(spa)) == 0); 677 678 /* 679 * We want to get the alternate root even for faulted pools, so we cheat 680 * and call spa_lookup() directly. 681 */ 682 if (altroot) { 683 if (spa == NULL) { 684 mutex_enter(&spa_namespace_lock); 685 spa = spa_lookup(name); 686 if (spa) 687 spa_altroot(spa, altroot, buflen); 688 else 689 altroot[0] = '\0'; 690 spa = NULL; 691 mutex_exit(&spa_namespace_lock); 692 } else { 693 spa_altroot(spa, altroot, buflen); 694 } 695 } 696 697 if (spa != NULL) 698 spa_close(spa, FTAG); 699 700 return (error); 701 } 702 703 /* 704 * Pool Creation 705 */ 706 int 707 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 708 { 709 spa_t *spa; 710 vdev_t *rvd; 711 dsl_pool_t *dp; 712 dmu_tx_t *tx; 713 int c, error; 714 uint64_t txg = TXG_INITIAL; 715 716 /* 717 * If this pool already exists, return failure. 718 */ 719 mutex_enter(&spa_namespace_lock); 720 if (spa_lookup(pool) != NULL) { 721 mutex_exit(&spa_namespace_lock); 722 return (EEXIST); 723 } 724 725 /* 726 * Allocate a new spa_t structure. 727 */ 728 spa = spa_add(pool, altroot); 729 spa_activate(spa); 730 731 spa->spa_uberblock.ub_txg = txg - 1; 732 spa->spa_ubsync = spa->spa_uberblock; 733 734 /* 735 * Create the root vdev. 
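* spa_config_parse() builds the tree from the caller's nvroot with VDEV_ALLOC_ADD; on success each top-level child is initialized for the initial txg and the tree is marked dirty so the labels get written.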
736 */ 737 spa_config_enter(spa, RW_WRITER, FTAG); 738 739 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 740 741 ASSERT(spa->spa_root_vdev == rvd); 742 743 if (rvd == NULL) { 744 error = EINVAL; 745 } else { 746 if ((error = vdev_create(rvd, txg)) == 0) { 747 for (c = 0; c < rvd->vdev_children; c++) 748 vdev_init(rvd->vdev_child[c], txg); 749 vdev_config_dirty(rvd); 750 } 751 } 752 753 spa_config_exit(spa, FTAG); 754 755 if (error) { 756 spa_unload(spa); 757 spa_deactivate(spa); 758 spa_remove(spa); 759 mutex_exit(&spa_namespace_lock); 760 return (error); 761 } 762 763 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 764 spa->spa_meta_objset = dp->dp_meta_objset; 765 766 tx = dmu_tx_create_assigned(dp, txg); 767 768 /* 769 * Create the pool config object. 770 */ 771 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 772 DMU_OT_PACKED_NVLIST, 1 << 14, 773 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 774 775 if (zap_add(spa->spa_meta_objset, 776 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 777 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 778 cmn_err(CE_PANIC, "failed to add pool config"); 779 } 780 781 /* 782 * Create the deferred-free bplist object. Turn off compression 783 * because sync-to-convergence takes longer if the blocksize 784 * keeps changing. 785 */ 786 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 787 1 << 14, tx); 788 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 789 ZIO_COMPRESS_OFF, tx); 790 791 if (zap_add(spa->spa_meta_objset, 792 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 793 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 794 cmn_err(CE_PANIC, "failed to add bplist"); 795 } 796 797 dmu_tx_commit(tx); 798 799 spa->spa_sync_on = B_TRUE; 800 txg_sync_start(spa->spa_dsl_pool); 801 802 /* 803 * We explicitly wait for the first transaction to complete so that our 804 * bean counters are appropriately updated. 805 */ 806 txg_wait_synced(spa->spa_dsl_pool, txg); 807 808 spa_config_sync(); 809 810 mutex_exit(&spa_namespace_lock); 811 812 return (0); 813 } 814 815 /* 816 * Import the given pool into the system. We set up the necessary spa_t and 817 * then call spa_load() to do the dirty work. 818 */ 819 int 820 spa_import(const char *pool, nvlist_t *config, const char *altroot) 821 { 822 spa_t *spa; 823 int error; 824 825 if (!(spa_mode & FWRITE)) 826 return (EROFS); 827 828 /* 829 * If a pool with this name exists, return failure. 830 */ 831 mutex_enter(&spa_namespace_lock); 832 if (spa_lookup(pool) != NULL) { 833 mutex_exit(&spa_namespace_lock); 834 return (EEXIST); 835 } 836 837 /* 838 * Create and initialize the spa structure. 839 */ 840 spa = spa_add(pool, altroot); 841 spa_activate(spa); 842 843 /* 844 * Pass off the heavy lifting to spa_load(). 845 */ 846 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_FALSE); 847 848 if (error) { 849 spa_unload(spa); 850 spa_deactivate(spa); 851 spa_remove(spa); 852 mutex_exit(&spa_namespace_lock); 853 return (error); 854 } 855 856 /* 857 * Update the config cache to include the newly-imported pool. 858 */ 859 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 860 861 mutex_exit(&spa_namespace_lock); 862 863 /* 864 * Resilver anything that's out of date. 865 */ 866 if (spa_mode & FWRITE) 867 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 868 869 return (0); 870 } 871 872 /* 873 * This (illegal) pool name is used when temporarily importing a spa_t in order 874 * to get the vdev stats associated with the imported devices. 
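* Because the name is not a legal pool name, it can never collide with a real pool in the namespace.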
875 */ 876 #define TRYIMPORT_NAME "$import" 877 878 nvlist_t * 879 spa_tryimport(nvlist_t *tryconfig) 880 { 881 nvlist_t *config = NULL; 882 char *poolname; 883 spa_t *spa; 884 uint64_t state; 885 886 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 887 return (NULL); 888 889 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 890 return (NULL); 891 892 /* 893 * Create and initialize the spa structure. 894 */ 895 mutex_enter(&spa_namespace_lock); 896 spa = spa_add(TRYIMPORT_NAME, NULL); 897 spa_activate(spa); 898 899 /* 900 * Pass off the heavy lifting to spa_load(). 901 */ 902 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_FALSE); 903 904 /* 905 * If 'tryconfig' was at least parsable, return the current config. 906 */ 907 if (spa->spa_root_vdev != NULL) { 908 spa_config_enter(spa, RW_READER, FTAG); 909 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 910 spa_config_exit(spa, FTAG); 911 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 912 poolname) == 0); 913 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 914 state) == 0); 915 } 916 917 spa_unload(spa); 918 spa_deactivate(spa); 919 spa_remove(spa); 920 mutex_exit(&spa_namespace_lock); 921 922 return (config); 923 } 924 925 /* 926 * Pool export/destroy 927 * 928 * The act of destroying or exporting a pool is very simple. We make sure there 929 * is no more pending I/O and any references to the pool are gone. Then, we 930 * update the pool state and sync all the labels to disk, removing the 931 * configuration from the cache afterwards. 932 */ 933 static int 934 spa_export_common(char *pool, int new_state) 935 { 936 spa_t *spa; 937 938 if (!(spa_mode & FWRITE)) 939 return (EROFS); 940 941 mutex_enter(&spa_namespace_lock); 942 if ((spa = spa_lookup(pool)) == NULL) { 943 mutex_exit(&spa_namespace_lock); 944 return (ENOENT); 945 } 946 947 /* 948 * Put a hold on the pool, drop the namespace lock, stop async tasks, 949 * reacquire the namespace lock, and see if we can export. 950 */ 951 spa_open_ref(spa, FTAG); 952 mutex_exit(&spa_namespace_lock); 953 spa_async_suspend(spa); 954 mutex_enter(&spa_namespace_lock); 955 spa_close(spa, FTAG); 956 957 /* 958 * The pool will be in core if it's openable, 959 * in which case we can modify its state. 960 */ 961 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 962 /* 963 * Objsets may be open only because they're dirty, so we 964 * have to force it to sync before checking spa_refcnt. 965 */ 966 spa_scrub_suspend(spa); 967 txg_wait_synced(spa->spa_dsl_pool, 0); 968 969 /* 970 * A pool cannot be exported or destroyed if there are active 971 * references. If we are resetting a pool, allow references by 972 * fault injection handlers. 973 */ 974 if (!spa_refcount_zero(spa) || 975 (spa->spa_inject_ref != 0 && 976 new_state != POOL_STATE_UNINITIALIZED)) { 977 spa_scrub_resume(spa); 978 spa_async_resume(spa); 979 mutex_exit(&spa_namespace_lock); 980 return (EBUSY); 981 } 982 983 spa_scrub_resume(spa); 984 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 985 986 /* 987 * We want this to be reflected on every label, 988 * so mark them all dirty. spa_unload() will do the 989 * final sync that pushes these changes out. 
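* This is skipped for spa_reset(), which passes POOL_STATE_UNINITIALIZED and therefore leaves the on-disk pool state untouched.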
990 */ 991 if (new_state != POOL_STATE_UNINITIALIZED) { 992 spa_config_enter(spa, RW_WRITER, FTAG); 993 spa->spa_state = new_state; 994 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 995 vdev_config_dirty(spa->spa_root_vdev); 996 spa_config_exit(spa, FTAG); 997 } 998 } 999 1000 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1001 spa_unload(spa); 1002 spa_deactivate(spa); 1003 } 1004 1005 if (new_state != POOL_STATE_UNINITIALIZED) { 1006 spa_remove(spa); 1007 spa_config_sync(); 1008 } 1009 mutex_exit(&spa_namespace_lock); 1010 1011 return (0); 1012 } 1013 1014 /* 1015 * Destroy a storage pool. 1016 */ 1017 int 1018 spa_destroy(char *pool) 1019 { 1020 return (spa_export_common(pool, POOL_STATE_DESTROYED)); 1021 } 1022 1023 /* 1024 * Export a storage pool. 1025 */ 1026 int 1027 spa_export(char *pool) 1028 { 1029 return (spa_export_common(pool, POOL_STATE_EXPORTED)); 1030 } 1031 1032 /* 1033 * Similar to spa_export(), this unloads the spa_t without actually removing it 1034 * from the namespace in any way. 1035 */ 1036 int 1037 spa_reset(char *pool) 1038 { 1039 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); 1040 } 1041 1042 1043 /* 1044 * ========================================================================== 1045 * Device manipulation 1046 * ========================================================================== 1047 */ 1048 1049 /* 1050 * Add capacity to a storage pool. 1051 */ 1052 int 1053 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1054 { 1055 uint64_t txg; 1056 int c, error; 1057 vdev_t *rvd = spa->spa_root_vdev; 1058 vdev_t *vd, *tvd; 1059 1060 txg = spa_vdev_enter(spa); 1061 1062 vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1063 1064 if (vd == NULL) 1065 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1066 1067 if ((error = vdev_create(vd, txg)) != 0) 1068 return (spa_vdev_exit(spa, vd, txg, error)); 1069 1070 /* 1071 * Transfer each new top-level vdev from vd to rvd. 1072 */ 1073 for (c = 0; c < vd->vdev_children; c++) { 1074 tvd = vd->vdev_child[c]; 1075 vdev_remove_child(vd, tvd); 1076 tvd->vdev_id = rvd->vdev_children; 1077 vdev_add_child(rvd, tvd); 1078 vdev_config_dirty(tvd); 1079 } 1080 1081 /* 1082 * We have to be careful when adding new vdevs to an existing pool. 1083 * If other threads start allocating from these vdevs before we 1084 * sync the config cache, and we lose power, then upon reboot we may 1085 * fail to open the pool because there are DVAs that the config cache 1086 * can't translate. Therefore, we first add the vdevs without 1087 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1088 * and then let spa_config_update() initialize the new metaslabs. 1089 * 1090 * spa_load() checks for added-but-not-initialized vdevs, so that 1091 * if we lose power at any point in this sequence, the remaining 1092 * steps will be completed the next time we load the pool. 1093 */ 1094 (void) spa_vdev_exit(spa, vd, txg, 0); 1095 1096 mutex_enter(&spa_namespace_lock); 1097 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1098 mutex_exit(&spa_namespace_lock); 1099 1100 return (0); 1101 } 1102 1103 /* 1104 * Attach a device to a mirror. The arguments are the path to any device 1105 * in the mirror, and the nvroot for the new device. If the path specifies 1106 * a device that is not mirrored, we automatically insert the mirror vdev. 
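* For example, attaching a second disk to a pool built from a single plain disk turns that top-level vdev into a two-way mirror.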
1107 * 1108 * If 'replacing' is specified, the new device is intended to replace the 1109 * existing device; in this case the two devices are made into their own 1110 * mirror using the 'replacing' vdev, which is functionally identical to 1111 * the mirror vdev (it actually reuses all the same ops) but has a few 1112 * extra rules: you can't attach to it after it's been created, and upon 1113 * completion of resilvering, the first disk (the one being replaced) 1114 * is automatically detached. 1115 */ 1116 int 1117 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1118 { 1119 uint64_t txg, open_txg; 1120 int error; 1121 vdev_t *rvd = spa->spa_root_vdev; 1122 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1123 vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 1124 1125 txg = spa_vdev_enter(spa); 1126 1127 oldvd = vdev_lookup_by_guid(rvd, guid); 1128 1129 if (oldvd == NULL) 1130 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1131 1132 if (!oldvd->vdev_ops->vdev_op_leaf) 1133 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1134 1135 pvd = oldvd->vdev_parent; 1136 1137 /* 1138 * The parent must be a mirror or the root, unless we're replacing; 1139 * in that case, the parent can be anything but another replacing vdev. 1140 */ 1141 if (pvd->vdev_ops != &vdev_mirror_ops && 1142 pvd->vdev_ops != &vdev_root_ops && 1143 (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 1144 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1145 1146 newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1147 1148 if (newrootvd == NULL || newrootvd->vdev_children != 1) 1149 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1150 1151 newvd = newrootvd->vdev_child[0]; 1152 1153 if (!newvd->vdev_ops->vdev_op_leaf) 1154 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1155 1156 if ((error = vdev_create(newrootvd, txg)) != 0) 1157 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1158 1159 /* 1160 * Compare the new device size with the replaceable/attachable 1161 * device size. 1162 */ 1163 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1164 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1165 1166 if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) 1167 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1168 1169 /* 1170 * If this is an in-place replacement, update oldvd's path and devid 1171 * to make it distinguishable from newvd, and unopenable from now on. 1172 */ 1173 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1174 spa_strfree(oldvd->vdev_path); 1175 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1176 KM_SLEEP); 1177 (void) sprintf(oldvd->vdev_path, "%s/%s", 1178 newvd->vdev_path, "old"); 1179 if (oldvd->vdev_devid != NULL) { 1180 spa_strfree(oldvd->vdev_devid); 1181 oldvd->vdev_devid = NULL; 1182 } 1183 } 1184 1185 /* 1186 * If the parent is not a mirror, or if we're replacing, 1187 * insert the new mirror/replacing vdev above oldvd. 1188 */ 1189 if (pvd->vdev_ops != pvops) 1190 pvd = vdev_add_parent(oldvd, pvops); 1191 1192 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1193 ASSERT(pvd->vdev_ops == pvops); 1194 ASSERT(oldvd->vdev_parent == pvd); 1195 1196 /* 1197 * Extract the new device from its root and add it to pvd. 1198 */ 1199 vdev_remove_child(newrootvd, newvd); 1200 newvd->vdev_id = pvd->vdev_children; 1201 vdev_add_child(pvd, newvd); 1202 1203 /* 1204 * If newvd is smaller than oldvd, but larger than its rsize, 1205 * the addition of newvd may have decreased our parent's asize.
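* Hence pvd->vdev_asize is clamped below to the smaller of its current value and newvd's asize.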
1206 */ 1207 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1208 1209 tvd = newvd->vdev_top; 1210 ASSERT(pvd->vdev_top == tvd); 1211 ASSERT(tvd->vdev_parent == rvd); 1212 1213 vdev_config_dirty(tvd); 1214 1215 /* 1216 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1217 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1218 */ 1219 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1220 1221 mutex_enter(&newvd->vdev_dtl_lock); 1222 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1223 open_txg - TXG_INITIAL + 1); 1224 mutex_exit(&newvd->vdev_dtl_lock); 1225 1226 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1227 1228 /* 1229 * Mark newvd's DTL dirty in this txg. 1230 */ 1231 vdev_dirty(tvd, VDD_DTL, txg); 1232 (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); 1233 1234 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1235 1236 /* 1237 * Kick off a resilver to update newvd. 1238 */ 1239 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1240 1241 return (0); 1242 } 1243 1244 /* 1245 * Detach a device from a mirror or replacing vdev. 1246 * If 'replace_done' is specified, only detach if the parent 1247 * is a replacing vdev. 1248 */ 1249 int 1250 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1251 { 1252 uint64_t txg; 1253 int c, t, error; 1254 vdev_t *rvd = spa->spa_root_vdev; 1255 vdev_t *vd, *pvd, *cvd, *tvd; 1256 1257 txg = spa_vdev_enter(spa); 1258 1259 vd = vdev_lookup_by_guid(rvd, guid); 1260 1261 if (vd == NULL) 1262 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1263 1264 if (!vd->vdev_ops->vdev_op_leaf) 1265 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1266 1267 pvd = vd->vdev_parent; 1268 1269 /* 1270 * If replace_done is specified, only remove this device if it's 1271 * the first child of a replacing vdev. 1272 */ 1273 if (replace_done && 1274 (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1275 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1276 1277 /* 1278 * Only mirror and replacing vdevs support detach. 1279 */ 1280 if (pvd->vdev_ops != &vdev_replacing_ops && 1281 pvd->vdev_ops != &vdev_mirror_ops) 1282 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1283 1284 /* 1285 * If there's only one replica, you can't detach it. 1286 */ 1287 if (pvd->vdev_children <= 1) 1288 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1289 1290 /* 1291 * If all siblings have non-empty DTLs, this device may have the only 1292 * valid copy of the data, which means we cannot safely detach it. 1293 * 1294 * XXX -- as in the vdev_offline() case, we really want a more 1295 * precise DTL check. 1296 */ 1297 for (c = 0; c < pvd->vdev_children; c++) { 1298 uint64_t dirty; 1299 1300 cvd = pvd->vdev_child[c]; 1301 if (cvd == vd) 1302 continue; 1303 if (vdev_is_dead(cvd)) 1304 continue; 1305 mutex_enter(&cvd->vdev_dtl_lock); 1306 dirty = cvd->vdev_dtl_map.sm_space | 1307 cvd->vdev_dtl_scrub.sm_space; 1308 mutex_exit(&cvd->vdev_dtl_lock); 1309 if (!dirty) 1310 break; 1311 } 1312 if (c == pvd->vdev_children) 1313 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1314 1315 /* 1316 * Erase the disk labels so the disk can be used for other things. 1317 * This must be done after all other error cases are handled, 1318 * but before we disembowel vd (so we can still do I/O to it). 1319 * But if we can't do it, don't treat the error as fatal -- 1320 * it may be that the unwritability of the disk is the reason 1321 * it's being detached! 
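* Accordingly, a failure of vdev_label_init() below is only noted via dprintf() and the detach proceeds.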
1322 */ 1323 error = vdev_label_init(vd, 0); 1324 if (error) 1325 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1326 1327 /* 1328 * Remove vd from its parent and compact the parent's children. 1329 */ 1330 vdev_remove_child(pvd, vd); 1331 vdev_compact_children(pvd); 1332 1333 /* 1334 * Remember one of the remaining children so we can get tvd below. 1335 */ 1336 cvd = pvd->vdev_child[0]; 1337 1338 /* 1339 * If the parent mirror/replacing vdev only has one child, 1340 * the parent is no longer needed. Remove it from the tree. 1341 */ 1342 if (pvd->vdev_children == 1) 1343 vdev_remove_parent(cvd); 1344 1345 /* 1346 * We don't set tvd until now because the parent we just removed 1347 * may have been the previous top-level vdev. 1348 */ 1349 tvd = cvd->vdev_top; 1350 ASSERT(tvd->vdev_parent == rvd); 1351 1352 /* 1353 * Reopen this top-level vdev to reassess health after detach. 1354 */ 1355 vdev_reopen(tvd); 1356 1357 /* 1358 * If the device we just detached was smaller than the others, 1359 * it may be possible to add metaslabs (i.e. grow the pool). We ignore 1360 * the error here because the detach still succeeded - we just weren't 1361 * able to reinitialize the metaslabs. This pool is in for a world of 1362 * hurt, in any case. 1363 */ 1364 (void) vdev_metaslab_init(tvd, txg); 1365 1366 vdev_config_dirty(tvd); 1367 1368 /* 1369 * Mark vd's DTL as dirty in this txg. 1370 * vdev_dtl_sync() will see that vd->vdev_detached is set 1371 * and free vd's DTL object in syncing context. 1372 * But first make sure we're not on any *other* txg's DTL list, 1373 * to prevent vd from being accessed after it's freed. 1374 */ 1375 vdev_dirty(tvd, VDD_DTL, txg); 1376 vd->vdev_detached = B_TRUE; 1377 for (t = 0; t < TXG_SIZE; t++) 1378 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1379 (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1380 1381 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1382 1383 return (spa_vdev_exit(spa, vd, txg, 0)); 1384 } 1385 1386 /* 1387 * Find any device that's done replacing, so we can detach it. 1388 */ 1389 static vdev_t * 1390 spa_vdev_replace_done_hunt(vdev_t *vd) 1391 { 1392 vdev_t *newvd, *oldvd; 1393 int c; 1394 1395 for (c = 0; c < vd->vdev_children; c++) { 1396 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1397 if (oldvd != NULL) 1398 return (oldvd); 1399 } 1400 1401 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1402 oldvd = vd->vdev_child[0]; 1403 newvd = vd->vdev_child[1]; 1404 1405 mutex_enter(&newvd->vdev_dtl_lock); 1406 if (newvd->vdev_dtl_map.sm_space == 0 && 1407 newvd->vdev_dtl_scrub.sm_space == 0) { 1408 mutex_exit(&newvd->vdev_dtl_lock); 1409 return (oldvd); 1410 } 1411 mutex_exit(&newvd->vdev_dtl_lock); 1412 } 1413 1414 return (NULL); 1415 } 1416 1417 static void 1418 spa_vdev_replace_done(spa_t *spa) 1419 { 1420 vdev_t *vd; 1421 uint64_t guid; 1422 1423 spa_config_enter(spa, RW_READER, FTAG); 1424 1425 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 1426 guid = vd->vdev_guid; 1427 spa_config_exit(spa, FTAG); 1428 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 1429 return; 1430 spa_config_enter(spa, RW_READER, FTAG); 1431 } 1432 1433 spa_config_exit(spa, FTAG); 1434 } 1435 1436 /* 1437 * Update the stored path for this vdev. Dirty the vdev configuration, relying 1438 * on spa_vdev_enter/exit() to synchronize the labels and cache. 
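* Only leaf vdevs may be renamed; the old path string is freed, the new one is duplicated into vdev_path, and the top-level vdev's config is dirtied.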
1439 */ 1440 int 1441 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 1442 { 1443 vdev_t *rvd, *vd; 1444 uint64_t txg; 1445 1446 rvd = spa->spa_root_vdev; 1447 1448 txg = spa_vdev_enter(spa); 1449 1450 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1451 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 1452 1453 if (!vd->vdev_ops->vdev_op_leaf) 1454 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1455 1456 spa_strfree(vd->vdev_path); 1457 vd->vdev_path = spa_strdup(newpath); 1458 1459 vdev_config_dirty(vd->vdev_top); 1460 1461 return (spa_vdev_exit(spa, NULL, txg, 0)); 1462 } 1463 1464 /* 1465 * ========================================================================== 1466 * SPA Scrubbing 1467 * ========================================================================== 1468 */ 1469 1470 void 1471 spa_scrub_throttle(spa_t *spa, int direction) 1472 { 1473 mutex_enter(&spa->spa_scrub_lock); 1474 spa->spa_scrub_throttled += direction; 1475 ASSERT(spa->spa_scrub_throttled >= 0); 1476 if (spa->spa_scrub_throttled == 0) 1477 cv_broadcast(&spa->spa_scrub_io_cv); 1478 mutex_exit(&spa->spa_scrub_lock); 1479 } 1480 1481 static void 1482 spa_scrub_io_done(zio_t *zio) 1483 { 1484 spa_t *spa = zio->io_spa; 1485 1486 zio_buf_free(zio->io_data, zio->io_size); 1487 1488 mutex_enter(&spa->spa_scrub_lock); 1489 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1490 vdev_t *vd = zio->io_vd; 1491 spa->spa_scrub_errors++; 1492 mutex_enter(&vd->vdev_stat_lock); 1493 vd->vdev_stat.vs_scrub_errors++; 1494 mutex_exit(&vd->vdev_stat_lock); 1495 } 1496 if (--spa->spa_scrub_inflight == 0) { 1497 cv_broadcast(&spa->spa_scrub_io_cv); 1498 ASSERT(spa->spa_scrub_throttled == 0); 1499 } 1500 mutex_exit(&spa->spa_scrub_lock); 1501 } 1502 1503 static void 1504 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 1505 zbookmark_t *zb) 1506 { 1507 size_t size = BP_GET_LSIZE(bp); 1508 void *data = zio_buf_alloc(size); 1509 1510 mutex_enter(&spa->spa_scrub_lock); 1511 spa->spa_scrub_inflight++; 1512 mutex_exit(&spa->spa_scrub_lock); 1513 1514 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 1515 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 1516 1517 flags |= ZIO_FLAG_CANFAIL; 1518 1519 zio_nowait(zio_read(NULL, spa, bp, data, size, 1520 spa_scrub_io_done, NULL, priority, flags, zb)); 1521 } 1522 1523 /* ARGSUSED */ 1524 static int 1525 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1526 { 1527 blkptr_t *bp = &bc->bc_blkptr; 1528 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1529 1530 if (bc->bc_errno || vd == NULL) { 1531 /* 1532 * We can't scrub this block, but we can continue to scrub 1533 * the rest of the pool. Note the error and move along. 1534 */ 1535 mutex_enter(&spa->spa_scrub_lock); 1536 spa->spa_scrub_errors++; 1537 mutex_exit(&spa->spa_scrub_lock); 1538 1539 if (vd != NULL) { 1540 mutex_enter(&vd->vdev_stat_lock); 1541 vd->vdev_stat.vs_scrub_errors++; 1542 mutex_exit(&vd->vdev_stat_lock); 1543 } 1544 1545 return (ERESTART); 1546 } 1547 1548 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1549 1550 /* 1551 * Keep track of how much data we've examined so that 1552 * zpool(1M) status can make useful progress reports. 
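* Each visited block adds its allocated size (BP_GET_ASIZE) to vs_scrub_examined under the vdev's stat lock.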
1553 */ 1554 mutex_enter(&vd->vdev_stat_lock); 1555 vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1556 mutex_exit(&vd->vdev_stat_lock); 1557 1558 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1559 if (DVA_GET_GANG(&bp->blk_dva[0])) { 1560 /* 1561 * Gang members may be spread across multiple vdevs, 1562 * so the best we can do is look at the pool-wide DTL. 1563 * XXX -- it would be better to change our allocation 1564 * policy to ensure that this can't happen. 1565 */ 1566 vd = spa->spa_root_vdev; 1567 } 1568 if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1569 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1570 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 1571 } 1572 } else { 1573 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1574 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 1575 } 1576 1577 return (0); 1578 } 1579 1580 static void 1581 spa_scrub_thread(spa_t *spa) 1582 { 1583 callb_cpr_t cprinfo; 1584 traverse_handle_t *th = spa->spa_scrub_th; 1585 vdev_t *rvd = spa->spa_root_vdev; 1586 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1587 int error = 0; 1588 boolean_t complete; 1589 1590 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1591 1592 /* 1593 * If we're restarting due to a snapshot create/delete, 1594 * wait for that to complete. 1595 */ 1596 txg_wait_synced(spa_get_dsl(spa), 0); 1597 1598 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 1599 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1600 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 1601 1602 spa_config_enter(spa, RW_WRITER, FTAG); 1603 vdev_reopen(rvd); /* purge all vdev caches */ 1604 vdev_config_dirty(rvd); /* rewrite all disk labels */ 1605 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1606 spa_config_exit(spa, FTAG); 1607 1608 mutex_enter(&spa->spa_scrub_lock); 1609 spa->spa_scrub_errors = 0; 1610 spa->spa_scrub_active = 1; 1611 ASSERT(spa->spa_scrub_inflight == 0); 1612 ASSERT(spa->spa_scrub_throttled == 0); 1613 1614 while (!spa->spa_scrub_stop) { 1615 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1616 while (spa->spa_scrub_suspended) { 1617 spa->spa_scrub_active = 0; 1618 cv_broadcast(&spa->spa_scrub_cv); 1619 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1620 spa->spa_scrub_active = 1; 1621 } 1622 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1623 1624 if (spa->spa_scrub_restart_txg != 0) 1625 break; 1626 1627 mutex_exit(&spa->spa_scrub_lock); 1628 error = traverse_more(th); 1629 mutex_enter(&spa->spa_scrub_lock); 1630 if (error != EAGAIN) 1631 break; 1632 1633 while (spa->spa_scrub_throttled > 0) 1634 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1635 } 1636 1637 while (spa->spa_scrub_inflight) 1638 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1639 1640 spa->spa_scrub_active = 0; 1641 cv_broadcast(&spa->spa_scrub_cv); 1642 1643 mutex_exit(&spa->spa_scrub_lock); 1644 1645 spa_config_enter(spa, RW_WRITER, FTAG); 1646 1647 mutex_enter(&spa->spa_scrub_lock); 1648 1649 /* 1650 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 1651 * AND the spa config lock to synchronize with any config changes 1652 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 1653 */ 1654 if (spa->spa_scrub_restart_txg != 0) 1655 error = ERESTART; 1656 1657 if (spa->spa_scrub_stop) 1658 error = EINTR; 1659 1660 /* 1661 * Even if there were uncorrectable errors, we consider the scrub 1662 * completed. The downside is that if there is a transient error during 1663 * a resilver, we won't resilver the data properly to the target. 
But 1664 * if the damage is permanent (more likely) we will resilver forever, 1665 * which isn't really acceptable. Since there is enough information for 1666 * the user to know what has failed and why, this seems like a more 1667 * tractable approach. 1668 */ 1669 complete = (error == 0); 1670 1671 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1672 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1673 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1674 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1675 1676 mutex_exit(&spa->spa_scrub_lock); 1677 1678 /* 1679 * If the scrub/resilver completed, update all DTLs to reflect this. 1680 * Whether it succeeded or not, vacate all temporary scrub DTLs. 1681 */ 1682 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1683 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1684 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1685 spa_errlog_rotate(spa); 1686 1687 spa_config_exit(spa, FTAG); 1688 1689 mutex_enter(&spa->spa_scrub_lock); 1690 1691 /* 1692 * We may have finished replacing a device. 1693 * Let the async thread assess this and handle the detach. 1694 */ 1695 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1696 1697 /* 1698 * If we were told to restart, our final act is to start a new scrub. 1699 */ 1700 if (error == ERESTART) 1701 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 1702 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 1703 1704 spa->spa_scrub_type = POOL_SCRUB_NONE; 1705 spa->spa_scrub_active = 0; 1706 spa->spa_scrub_thread = NULL; 1707 cv_broadcast(&spa->spa_scrub_cv); 1708 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1709 thread_exit(); 1710 } 1711 1712 void 1713 spa_scrub_suspend(spa_t *spa) 1714 { 1715 mutex_enter(&spa->spa_scrub_lock); 1716 spa->spa_scrub_suspended++; 1717 while (spa->spa_scrub_active) { 1718 cv_broadcast(&spa->spa_scrub_cv); 1719 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1720 } 1721 while (spa->spa_scrub_inflight) 1722 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1723 mutex_exit(&spa->spa_scrub_lock); 1724 } 1725 1726 void 1727 spa_scrub_resume(spa_t *spa) 1728 { 1729 mutex_enter(&spa->spa_scrub_lock); 1730 ASSERT(spa->spa_scrub_suspended != 0); 1731 if (--spa->spa_scrub_suspended == 0) 1732 cv_broadcast(&spa->spa_scrub_cv); 1733 mutex_exit(&spa->spa_scrub_lock); 1734 } 1735 1736 void 1737 spa_scrub_restart(spa_t *spa, uint64_t txg) 1738 { 1739 /* 1740 * Something happened (e.g. snapshot create/delete) that means 1741 * we must restart any in-progress scrubs. The itinerary will 1742 * fix this properly. 1743 */ 1744 mutex_enter(&spa->spa_scrub_lock); 1745 spa->spa_scrub_restart_txg = txg; 1746 mutex_exit(&spa->spa_scrub_lock); 1747 } 1748 1749 int 1750 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1751 { 1752 space_seg_t *ss; 1753 uint64_t mintxg, maxtxg; 1754 vdev_t *rvd = spa->spa_root_vdev; 1755 1756 if ((uint_t)type >= POOL_SCRUB_TYPES) 1757 return (ENOTSUP); 1758 1759 mutex_enter(&spa->spa_scrub_lock); 1760 1761 /* 1762 * If there's a scrub or resilver already in progress, stop it. 1763 */ 1764 while (spa->spa_scrub_thread != NULL) { 1765 /* 1766 * Don't stop a resilver unless forced. 
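* An unforced request while a resilver is running simply returns EBUSY.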
1767 */ 1768 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 1769 mutex_exit(&spa->spa_scrub_lock); 1770 return (EBUSY); 1771 } 1772 spa->spa_scrub_stop = 1; 1773 cv_broadcast(&spa->spa_scrub_cv); 1774 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1775 } 1776 1777 /* 1778 * Terminate the previous traverse. 1779 */ 1780 if (spa->spa_scrub_th != NULL) { 1781 traverse_fini(spa->spa_scrub_th); 1782 spa->spa_scrub_th = NULL; 1783 } 1784 1785 if (rvd == NULL) { 1786 ASSERT(spa->spa_scrub_stop == 0); 1787 ASSERT(spa->spa_scrub_type == type); 1788 ASSERT(spa->spa_scrub_restart_txg == 0); 1789 mutex_exit(&spa->spa_scrub_lock); 1790 return (0); 1791 } 1792 1793 mintxg = TXG_INITIAL - 1; 1794 maxtxg = spa_last_synced_txg(spa) + 1; 1795 1796 mutex_enter(&rvd->vdev_dtl_lock); 1797 1798 if (rvd->vdev_dtl_map.sm_space == 0) { 1799 /* 1800 * The pool-wide DTL is empty. 1801 * If this is a resilver, there's nothing to do. 1802 */ 1803 if (type == POOL_SCRUB_RESILVER) 1804 type = POOL_SCRUB_NONE; 1805 } else { 1806 /* 1807 * The pool-wide DTL is non-empty. 1808 * If this is a normal scrub, upgrade to a resilver instead. 1809 */ 1810 if (type == POOL_SCRUB_EVERYTHING) 1811 type = POOL_SCRUB_RESILVER; 1812 } 1813 1814 if (type == POOL_SCRUB_RESILVER) { 1815 /* 1816 * Determine the resilvering boundaries. 1817 * 1818 * Note: (mintxg, maxtxg) is an open interval, 1819 * i.e. mintxg and maxtxg themselves are not included. 1820 * 1821 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1822 * so we don't claim to resilver a txg that's still changing. 1823 */ 1824 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1825 mintxg = ss->ss_start - 1; 1826 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1827 maxtxg = MIN(ss->ss_end, maxtxg); 1828 } 1829 1830 mutex_exit(&rvd->vdev_dtl_lock); 1831 1832 spa->spa_scrub_stop = 0; 1833 spa->spa_scrub_type = type; 1834 spa->spa_scrub_restart_txg = 0; 1835 1836 if (type != POOL_SCRUB_NONE) { 1837 spa->spa_scrub_mintxg = mintxg; 1838 spa->spa_scrub_maxtxg = maxtxg; 1839 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1840 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 1841 ZIO_FLAG_CANFAIL); 1842 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1843 spa->spa_scrub_thread = thread_create(NULL, 0, 1844 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1845 } 1846 1847 mutex_exit(&spa->spa_scrub_lock); 1848 1849 return (0); 1850 } 1851 1852 /* 1853 * ========================================================================== 1854 * SPA async task processing 1855 * ========================================================================== 1856 */ 1857 1858 static void 1859 spa_async_reopen(spa_t *spa) 1860 { 1861 vdev_t *rvd = spa->spa_root_vdev; 1862 vdev_t *tvd; 1863 int c; 1864 1865 spa_config_enter(spa, RW_WRITER, FTAG); 1866 1867 for (c = 0; c < rvd->vdev_children; c++) { 1868 tvd = rvd->vdev_child[c]; 1869 if (tvd->vdev_reopen_wanted) { 1870 tvd->vdev_reopen_wanted = 0; 1871 vdev_reopen(tvd); 1872 } 1873 } 1874 1875 spa_config_exit(spa, FTAG); 1876 } 1877 1878 static void 1879 spa_async_thread(spa_t *spa) 1880 { 1881 int tasks; 1882 1883 ASSERT(spa->spa_sync_on); 1884 1885 mutex_enter(&spa->spa_async_lock); 1886 tasks = spa->spa_async_tasks; 1887 spa->spa_async_tasks = 0; 1888 mutex_exit(&spa->spa_async_lock); 1889 1890 /* 1891 * See if the config needs to be updated. 
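* As with spa_import(), spa_config_update() is called with spa_namespace_lock held.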
1892 */ 1893 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 1894 mutex_enter(&spa_namespace_lock); 1895 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1896 mutex_exit(&spa_namespace_lock); 1897 } 1898 1899 /* 1900 * See if any devices need to be reopened. 1901 */ 1902 if (tasks & SPA_ASYNC_REOPEN) 1903 spa_async_reopen(spa); 1904 1905 /* 1906 * If any devices are done replacing, detach them. 1907 */ 1908 if (tasks & SPA_ASYNC_REPLACE_DONE) 1909 spa_vdev_replace_done(spa); 1910 1911 /* 1912 * Kick off a scrub. 1913 */ 1914 if (tasks & SPA_ASYNC_SCRUB) 1915 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 1916 1917 /* 1918 * Kick off a resilver. 1919 */ 1920 if (tasks & SPA_ASYNC_RESILVER) 1921 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1922 1923 /* 1924 * Let the world know that we're done. 1925 */ 1926 mutex_enter(&spa->spa_async_lock); 1927 spa->spa_async_thread = NULL; 1928 cv_broadcast(&spa->spa_async_cv); 1929 mutex_exit(&spa->spa_async_lock); 1930 thread_exit(); 1931 } 1932 1933 void 1934 spa_async_suspend(spa_t *spa) 1935 { 1936 mutex_enter(&spa->spa_async_lock); 1937 spa->spa_async_suspended++; 1938 while (spa->spa_async_thread != NULL) 1939 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 1940 mutex_exit(&spa->spa_async_lock); 1941 } 1942 1943 void 1944 spa_async_resume(spa_t *spa) 1945 { 1946 mutex_enter(&spa->spa_async_lock); 1947 ASSERT(spa->spa_async_suspended != 0); 1948 spa->spa_async_suspended--; 1949 mutex_exit(&spa->spa_async_lock); 1950 } 1951 1952 static void 1953 spa_async_dispatch(spa_t *spa) 1954 { 1955 mutex_enter(&spa->spa_async_lock); 1956 if (spa->spa_async_tasks && !spa->spa_async_suspended && 1957 spa->spa_async_thread == NULL && 1958 rootdir != NULL && !vn_is_readonly(rootdir)) 1959 spa->spa_async_thread = thread_create(NULL, 0, 1960 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 1961 mutex_exit(&spa->spa_async_lock); 1962 } 1963 1964 void 1965 spa_async_request(spa_t *spa, int task) 1966 { 1967 mutex_enter(&spa->spa_async_lock); 1968 spa->spa_async_tasks |= task; 1969 mutex_exit(&spa->spa_async_lock); 1970 } 1971 1972 /* 1973 * ========================================================================== 1974 * SPA syncing routines 1975 * ========================================================================== 1976 */ 1977 1978 static void 1979 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 1980 { 1981 bplist_t *bpl = &spa->spa_sync_bplist; 1982 dmu_tx_t *tx; 1983 blkptr_t blk; 1984 uint64_t itor = 0; 1985 zio_t *zio; 1986 int error; 1987 uint8_t c = 1; 1988 1989 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 1990 1991 while (bplist_iterate(bpl, &itor, &blk) == 0) 1992 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 1993 1994 error = zio_wait(zio); 1995 ASSERT3U(error, ==, 0); 1996 1997 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1998 bplist_vacate(bpl, tx); 1999 2000 /* 2001 * Pre-dirty the first block so we sync to convergence faster. 2002 * (Usually only the first block is needed.) 
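* Writing a single byte at offset 0 is enough to dirty that first block in this txg.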
2003 */ 2004 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2005 dmu_tx_commit(tx); 2006 } 2007 2008 static void 2009 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2010 { 2011 nvlist_t *config; 2012 char *packed = NULL; 2013 size_t nvsize = 0; 2014 dmu_buf_t *db; 2015 2016 if (list_is_empty(&spa->spa_dirty_list)) 2017 return; 2018 2019 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2020 2021 if (spa->spa_config_syncing) 2022 nvlist_free(spa->spa_config_syncing); 2023 spa->spa_config_syncing = config; 2024 2025 VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 2026 2027 packed = kmem_alloc(nvsize, KM_SLEEP); 2028 2029 VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 2030 KM_SLEEP) == 0); 2031 2032 dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 2033 packed, tx); 2034 2035 kmem_free(packed, nvsize); 2036 2037 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 2038 spa->spa_config_object, FTAG, &db)); 2039 dmu_buf_will_dirty(db, tx); 2040 *(uint64_t *)db->db_data = nvsize; 2041 dmu_buf_rele(db, FTAG); 2042 } 2043 2044 /* 2045 * Sync the specified transaction group. New blocks may be dirtied as 2046 * part of the process, so we iterate until it converges. 2047 */ 2048 void 2049 spa_sync(spa_t *spa, uint64_t txg) 2050 { 2051 dsl_pool_t *dp = spa->spa_dsl_pool; 2052 objset_t *mos = spa->spa_meta_objset; 2053 bplist_t *bpl = &spa->spa_sync_bplist; 2054 vdev_t *rvd = spa->spa_root_vdev; 2055 vdev_t *vd; 2056 dmu_tx_t *tx; 2057 int dirty_vdevs; 2058 2059 /* 2060 * Lock out configuration changes. 2061 */ 2062 spa_config_enter(spa, RW_READER, FTAG); 2063 2064 spa->spa_syncing_txg = txg; 2065 spa->spa_sync_pass = 0; 2066 2067 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2068 2069 /* 2070 * If anything has changed in this txg, push the deferred frees 2071 * from the previous txg. If not, leave them alone so that we 2072 * don't generate work on an otherwise idle system. 2073 */ 2074 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2075 !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2076 spa_sync_deferred_frees(spa, txg); 2077 2078 /* 2079 * Iterate to convergence. 2080 */ 2081 do { 2082 spa->spa_sync_pass++; 2083 2084 tx = dmu_tx_create_assigned(dp, txg); 2085 spa_sync_config_object(spa, tx); 2086 dmu_tx_commit(tx); 2087 2088 spa_errlog_sync(spa, txg); 2089 2090 dsl_pool_sync(dp, txg); 2091 2092 dirty_vdevs = 0; 2093 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2094 vdev_sync(vd, txg); 2095 dirty_vdevs++; 2096 } 2097 2098 tx = dmu_tx_create_assigned(dp, txg); 2099 bplist_sync(bpl, tx); 2100 dmu_tx_commit(tx); 2101 2102 } while (dirty_vdevs); 2103 2104 bplist_close(bpl); 2105 2106 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2107 2108 /* 2109 * Rewrite the vdev configuration (which includes the uberblock) 2110 * to commit the transaction group. 2111 * 2112 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2113 * Otherwise, pick a random top-level vdev that's known to be 2114 * visible in the config cache (see spa_vdev_add() for details). 2115 * If the write fails, try the next vdev until we've tried them all.
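* Should every eligible top-level vdev fail, we fall back to syncing the uberblock to all vdevs through the root, and that final attempt is VERIFY'd to succeed.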
2116 */ 2117 if (!list_is_empty(&spa->spa_dirty_list)) { 2118 VERIFY(vdev_config_sync(rvd, txg) == 0); 2119 } else { 2120 int children = rvd->vdev_children; 2121 int c0 = spa_get_random(children); 2122 int c; 2123 2124 for (c = 0; c < children; c++) { 2125 vd = rvd->vdev_child[(c0 + c) % children]; 2126 if (vd->vdev_ms_array == 0) 2127 continue; 2128 if (vdev_config_sync(vd, txg) == 0) 2129 break; 2130 } 2131 if (c == children) 2132 VERIFY(vdev_config_sync(rvd, txg) == 0); 2133 } 2134 2135 /* 2136 * Clear the dirty config list. 2137 */ 2138 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2139 vdev_config_clean(vd); 2140 2141 /* 2142 * Now that the new config has synced transactionally, 2143 * let it become visible to the config cache. 2144 */ 2145 if (spa->spa_config_syncing != NULL) { 2146 spa_config_set(spa, spa->spa_config_syncing); 2147 spa->spa_config_txg = txg; 2148 spa->spa_config_syncing = NULL; 2149 } 2150 2151 /* 2152 * Make a stable copy of the fully synced uberblock. 2153 * We use this as the root for pool traversals. 2154 */ 2155 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2156 2157 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2158 2159 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2160 spa->spa_traverse_wanted = 0; 2161 spa->spa_ubsync = spa->spa_uberblock; 2162 rw_exit(&spa->spa_traverse_lock); 2163 2164 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2165 2166 /* 2167 * Clean up the ZIL records for the synced txg. 2168 */ 2169 dsl_pool_zil_clean(dp); 2170 2171 /* 2172 * Update usable space statistics. 2173 */ 2174 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2175 vdev_sync_done(vd, txg); 2176 2177 /* 2178 * It had better be the case that we didn't dirty anything 2179 * since spa_sync_labels(). 2180 */ 2181 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2182 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2183 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2184 ASSERT(bpl->bpl_queue == NULL); 2185 2186 spa_config_exit(spa, FTAG); 2187 2188 /* 2189 * If any async tasks have been requested, kick them off. 2190 */ 2191 spa_async_dispatch(spa); 2192 } 2193 2194 /* 2195 * Sync all pools. We don't want to hold the namespace lock across these 2196 * operations, so we take a reference on the spa_t and drop the lock during the 2197 * sync. 2198 */ 2199 void 2200 spa_sync_allpools(void) 2201 { 2202 spa_t *spa = NULL; 2203 mutex_enter(&spa_namespace_lock); 2204 while ((spa = spa_next(spa)) != NULL) { 2205 if (spa_state(spa) != POOL_STATE_ACTIVE) 2206 continue; 2207 spa_open_ref(spa, FTAG); 2208 mutex_exit(&spa_namespace_lock); 2209 txg_wait_synced(spa_get_dsl(spa), 0); 2210 mutex_enter(&spa_namespace_lock); 2211 spa_close(spa, FTAG); 2212 } 2213 mutex_exit(&spa_namespace_lock); 2214 } 2215 2216 /* 2217 * ========================================================================== 2218 * Miscellaneous routines 2219 * ========================================================================== 2220 */ 2221 2222 /* 2223 * Remove all pools in the system. 2224 */ 2225 void 2226 spa_evict_all(void) 2227 { 2228 spa_t *spa; 2229 2230 /* 2231 * Remove all cached state. All pools should be closed now, 2232 * so every spa in the AVL tree should be unreferenced. 2233 */ 2234 mutex_enter(&spa_namespace_lock); 2235 while ((spa = spa_next(NULL)) != NULL) { 2236 /* 2237 * Stop async tasks. 
The async thread may need to detach 2238 * a device that's been replaced, which requires grabbing 2239 * spa_namespace_lock, so we must drop it here. 2240 */ 2241 spa_open_ref(spa, FTAG); 2242 mutex_exit(&spa_namespace_lock); 2243 spa_async_suspend(spa); 2244 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2245 mutex_enter(&spa_namespace_lock); 2246 spa_close(spa, FTAG); 2247 2248 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2249 spa_unload(spa); 2250 spa_deactivate(spa); 2251 } 2252 spa_remove(spa); 2253 } 2254 mutex_exit(&spa_namespace_lock); 2255 } 2256 2257 vdev_t * 2258 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2259 { 2260 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2261 } 2262