1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * This file contains all the routines used when modifying on-disk SPA state. 30 * This includes opening, importing, destroying, exporting a pool, and syncing a 31 * pool. 32 */ 33 34 #include <sys/zfs_context.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zio.h> 38 #include <sys/zio_checksum.h> 39 #include <sys/zio_compress.h> 40 #include <sys/dmu.h> 41 #include <sys/dmu_tx.h> 42 #include <sys/zap.h> 43 #include <sys/zil.h> 44 #include <sys/vdev_impl.h> 45 #include <sys/metaslab.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/txg.h> 48 #include <sys/avl.h> 49 #include <sys/dmu_traverse.h> 50 #include <sys/unique.h> 51 #include <sys/dsl_pool.h> 52 #include <sys/dsl_dir.h> 53 #include <sys/dsl_prop.h> 54 #include <sys/fs/zfs.h> 55 #include <sys/callb.h> 56 57 /* 58 * ========================================================================== 59 * SPA state manipulation (open/create/destroy/import/export) 60 * ========================================================================== 61 */ 62 63 static int 64 spa_error_entry_compare(const void *a, const void *b) 65 { 66 spa_error_entry_t *sa = (spa_error_entry_t *)a; 67 spa_error_entry_t *sb = (spa_error_entry_t *)b; 68 int ret; 69 70 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 71 sizeof (zbookmark_t)); 72 73 if (ret < 0) 74 return (-1); 75 else if (ret > 0) 76 return (1); 77 else 78 return (0); 79 } 80 81 /* 82 * Utility function which retrieves copies of the current logs and 83 * re-initializes them in the process. 84 */ 85 void 86 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 87 { 88 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 89 90 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 91 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 92 93 avl_create(&spa->spa_errlist_scrub, 94 spa_error_entry_compare, sizeof (spa_error_entry_t), 95 offsetof(spa_error_entry_t, se_avl)); 96 avl_create(&spa->spa_errlist_last, 97 spa_error_entry_compare, sizeof (spa_error_entry_t), 98 offsetof(spa_error_entry_t, se_avl)); 99 } 100 101 /* 102 * Activate an uninitialized pool. 103 */ 104 static void 105 spa_activate(spa_t *spa) 106 { 107 int t; 108 109 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 110 111 spa->spa_state = POOL_STATE_ACTIVE; 112 113 spa->spa_normal_class = metaslab_class_create(); 114 115 for (t = 0; t < ZIO_TYPES; t++) { 116 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 117 8, maxclsyspri, 50, INT_MAX, 118 TASKQ_PREPOPULATE); 119 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 120 8, maxclsyspri, 50, INT_MAX, 121 TASKQ_PREPOPULATE); 122 } 123 124 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 125 126 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 127 offsetof(vdev_t, vdev_dirty_node)); 128 129 txg_list_create(&spa->spa_vdev_txg_list, 130 offsetof(struct vdev, vdev_txg_node)); 131 132 avl_create(&spa->spa_errlist_scrub, 133 spa_error_entry_compare, sizeof (spa_error_entry_t), 134 offsetof(spa_error_entry_t, se_avl)); 135 avl_create(&spa->spa_errlist_last, 136 spa_error_entry_compare, sizeof (spa_error_entry_t), 137 offsetof(spa_error_entry_t, se_avl)); 138 } 139 140 /* 141 * Opposite of spa_activate(). 142 */ 143 static void 144 spa_deactivate(spa_t *spa) 145 { 146 int t; 147 148 ASSERT(spa->spa_sync_on == B_FALSE); 149 ASSERT(spa->spa_dsl_pool == NULL); 150 ASSERT(spa->spa_root_vdev == NULL); 151 152 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 153 154 txg_list_destroy(&spa->spa_vdev_txg_list); 155 156 list_destroy(&spa->spa_dirty_list); 157 158 rw_destroy(&spa->spa_traverse_lock); 159 160 for (t = 0; t < ZIO_TYPES; t++) { 161 taskq_destroy(spa->spa_zio_issue_taskq[t]); 162 taskq_destroy(spa->spa_zio_intr_taskq[t]); 163 spa->spa_zio_issue_taskq[t] = NULL; 164 spa->spa_zio_intr_taskq[t] = NULL; 165 } 166 167 metaslab_class_destroy(spa->spa_normal_class); 168 spa->spa_normal_class = NULL; 169 170 /* 171 * If this was part of an import or the open otherwise failed, we may 172 * still have errors left in the queues. Empty them just in case. 173 */ 174 spa_errlog_drain(spa); 175 176 avl_destroy(&spa->spa_errlist_scrub); 177 avl_destroy(&spa->spa_errlist_last); 178 179 spa->spa_state = POOL_STATE_UNINITIALIZED; 180 } 181 182 /* 183 * Verify a pool configuration, and construct the vdev tree appropriately. This 184 * will create all the necessary vdevs in the appropriate layout, with each vdev 185 * in the CLOSED state. This will prep the pool before open/creation/import. 186 * All vdev validation is done by the vdev_alloc() routine. 187 */ 188 static vdev_t * 189 spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 190 { 191 nvlist_t **child; 192 uint_t c, children; 193 vdev_t *vd; 194 195 if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 196 return (NULL); 197 198 if (vd->vdev_ops->vdev_op_leaf) 199 return (vd); 200 201 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 202 &child, &children) != 0) { 203 vdev_free(vd); 204 return (NULL); 205 } 206 207 for (c = 0; c < children; c++) { 208 if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 209 vdev_free(vd); 210 return (NULL); 211 } 212 } 213 214 return (vd); 215 } 216 217 /* 218 * Opposite of spa_load(). 219 */ 220 static void 221 spa_unload(spa_t *spa) 222 { 223 /* 224 * Stop async tasks. 225 */ 226 spa_async_suspend(spa); 227 228 /* 229 * Stop syncing. 230 */ 231 if (spa->spa_sync_on) { 232 txg_sync_stop(spa->spa_dsl_pool); 233 spa->spa_sync_on = B_FALSE; 234 } 235 236 /* 237 * Wait for any outstanding prefetch I/O to complete. 238 */ 239 spa_config_enter(spa, RW_WRITER, FTAG); 240 spa_config_exit(spa, FTAG); 241 242 /* 243 * Close the dsl pool. 244 */ 245 if (spa->spa_dsl_pool) { 246 dsl_pool_close(spa->spa_dsl_pool); 247 spa->spa_dsl_pool = NULL; 248 } 249 250 /* 251 * Close all vdevs. 252 */ 253 if (spa->spa_root_vdev) 254 vdev_free(spa->spa_root_vdev); 255 ASSERT(spa->spa_root_vdev == NULL); 256 257 spa->spa_async_suspended = 0; 258 } 259 260 /* 261 * Load an existing storage pool, using the pool's builtin spa_config as a 262 * source of configuration information. 263 */ 264 static int 265 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 266 { 267 int error = 0; 268 nvlist_t *nvroot = NULL; 269 vdev_t *rvd; 270 uberblock_t *ub = &spa->spa_uberblock; 271 uint64_t config_cache_txg = spa->spa_config_txg; 272 uint64_t pool_guid; 273 zio_t *zio; 274 275 spa->spa_load_state = state; 276 277 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 278 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 279 error = EINVAL; 280 goto out; 281 } 282 283 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 284 &spa->spa_config_txg); 285 286 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 287 spa_guid_exists(pool_guid, 0)) { 288 error = EEXIST; 289 goto out; 290 } 291 292 /* 293 * Parse the configuration into a vdev tree. 294 */ 295 spa_config_enter(spa, RW_WRITER, FTAG); 296 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 297 spa_config_exit(spa, FTAG); 298 299 if (rvd == NULL) { 300 error = EINVAL; 301 goto out; 302 } 303 304 ASSERT(spa->spa_root_vdev == rvd); 305 ASSERT(spa_guid(spa) == pool_guid); 306 307 /* 308 * Try to open all vdevs, loading each label in the process. 309 */ 310 if (vdev_open(rvd) != 0) { 311 error = ENXIO; 312 goto out; 313 } 314 315 /* 316 * Find the best uberblock. 317 */ 318 bzero(ub, sizeof (uberblock_t)); 319 320 zio = zio_root(spa, NULL, NULL, 321 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 322 vdev_uberblock_load(zio, rvd, ub); 323 error = zio_wait(zio); 324 325 /* 326 * If we weren't able to find a single valid uberblock, return failure. 327 */ 328 if (ub->ub_txg == 0) { 329 error = ENXIO; 330 goto out; 331 } 332 333 /* 334 * If the pool is newer than the code, we can't open it. 335 */ 336 if (ub->ub_version > UBERBLOCK_VERSION) { 337 error = ENOTSUP; 338 goto out; 339 } 340 341 /* 342 * If the vdev guid sum doesn't match the uberblock, we have an 343 * incomplete configuration. 344 */ 345 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 346 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 347 VDEV_AUX_BAD_GUID_SUM); 348 error = ENXIO; 349 goto out; 350 } 351 352 /* 353 * Initialize internal SPA structures. 354 */ 355 spa->spa_state = POOL_STATE_ACTIVE; 356 spa->spa_ubsync = spa->spa_uberblock; 357 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 358 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 359 if (error) { 360 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 361 VDEV_AUX_CORRUPT_DATA); 362 goto out; 363 } 364 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 365 366 if (zap_lookup(spa->spa_meta_objset, 367 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 368 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 369 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 370 VDEV_AUX_CORRUPT_DATA); 371 error = EIO; 372 goto out; 373 } 374 375 if (!mosconfig) { 376 dmu_buf_t *db; 377 char *packed = NULL; 378 size_t nvsize = 0; 379 nvlist_t *newconfig = NULL; 380 381 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 382 spa->spa_config_object, FTAG, &db)); 383 nvsize = *(uint64_t *)db->db_data; 384 dmu_buf_rele(db, FTAG); 385 386 packed = kmem_alloc(nvsize, KM_SLEEP); 387 error = dmu_read(spa->spa_meta_objset, 388 spa->spa_config_object, 0, nvsize, packed); 389 if (error == 0) 390 error = nvlist_unpack(packed, nvsize, &newconfig, 0); 391 kmem_free(packed, nvsize); 392 393 if (error) { 394 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 395 VDEV_AUX_CORRUPT_DATA); 396 error = EIO; 397 goto out; 398 } 399 400 spa_config_set(spa, newconfig); 401 402 spa_unload(spa); 403 spa_deactivate(spa); 404 spa_activate(spa); 405 406 return (spa_load(spa, newconfig, state, B_TRUE)); 407 } 408 409 if (zap_lookup(spa->spa_meta_objset, 410 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 411 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 412 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 413 VDEV_AUX_CORRUPT_DATA); 414 error = EIO; 415 goto out; 416 } 417 418 /* 419 * Load the persistent error log. If we have an older pool, this will 420 * not be present. 421 */ 422 error = zap_lookup(spa->spa_meta_objset, 423 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 424 sizeof (uint64_t), 1, &spa->spa_errlog_last); 425 if (error != 0 &&error != ENOENT) { 426 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 427 VDEV_AUX_CORRUPT_DATA); 428 error = EIO; 429 goto out; 430 } 431 432 error = zap_lookup(spa->spa_meta_objset, 433 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 434 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 435 if (error != 0 && error != ENOENT) { 436 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 437 VDEV_AUX_CORRUPT_DATA); 438 error = EIO; 439 goto out; 440 } 441 442 /* 443 * Load the vdev state for all top level vdevs. We need to grab the 444 * config lock because all label I/O is done with the 445 * ZIO_FLAG_CONFIG_HELD flag. 446 */ 447 spa_config_enter(spa, RW_READER, FTAG); 448 error = vdev_load(rvd); 449 spa_config_exit(spa, FTAG); 450 451 if (error) 452 goto out; 453 454 /* 455 * Propagate the leaf DTLs we just loaded all the way up the tree. 456 */ 457 spa_config_enter(spa, RW_WRITER, FTAG); 458 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 459 spa_config_exit(spa, FTAG); 460 461 /* 462 * Check the state of the root vdev. If it can't be opened, it 463 * indicates one or more toplevel vdevs are faulted. 464 */ 465 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 466 error = ENXIO; 467 goto out; 468 } 469 470 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 471 dmu_tx_t *tx; 472 int need_update = B_FALSE; 473 int c; 474 475 /* 476 * Claim log blocks that haven't been committed yet. 477 * This must all happen in a single txg. 478 */ 479 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 480 spa_first_txg(spa)); 481 dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 482 dmu_tx_commit(tx); 483 484 spa->spa_sync_on = B_TRUE; 485 txg_sync_start(spa->spa_dsl_pool); 486 487 /* 488 * Wait for all claims to sync. 489 */ 490 txg_wait_synced(spa->spa_dsl_pool, 0); 491 492 /* 493 * If the config cache is stale, or we have uninitialized 494 * metaslabs (see spa_vdev_add()), then update the config. 495 */ 496 if (config_cache_txg != spa->spa_config_txg || 497 state == SPA_LOAD_IMPORT) 498 need_update = B_TRUE; 499 500 for (c = 0; c < rvd->vdev_children; c++) 501 if (rvd->vdev_child[c]->vdev_ms_array == 0) 502 need_update = B_TRUE; 503 504 /* 505 * Update the config cache asychronously in case we're the 506 * root pool, in which case the config cache isn't writable yet. 507 */ 508 if (need_update) 509 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 510 } 511 512 error = 0; 513 out: 514 if (error) 515 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 516 spa->spa_load_state = SPA_LOAD_NONE; 517 spa->spa_ena = 0; 518 519 return (error); 520 } 521 522 /* 523 * Pool Open/Import 524 * 525 * The import case is identical to an open except that the configuration is sent 526 * down from userland, instead of grabbed from the configuration cache. For the 527 * case of an open, the pool configuration will exist in the 528 * POOL_STATE_UNITIALIZED state. 529 * 530 * The stats information (gen/count/ustats) is used to gather vdev statistics at 531 * the same time open the pool, without having to keep around the spa_t in some 532 * ambiguous state. 533 */ 534 static int 535 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 536 { 537 spa_t *spa; 538 int error; 539 int loaded = B_FALSE; 540 int locked = B_FALSE; 541 542 *spapp = NULL; 543 544 /* 545 * As disgusting as this is, we need to support recursive calls to this 546 * function because dsl_dir_open() is called during spa_load(), and ends 547 * up calling spa_open() again. The real fix is to figure out how to 548 * avoid dsl_dir_open() calling this in the first place. 549 */ 550 if (mutex_owner(&spa_namespace_lock) != curthread) { 551 mutex_enter(&spa_namespace_lock); 552 locked = B_TRUE; 553 } 554 555 if ((spa = spa_lookup(pool)) == NULL) { 556 if (locked) 557 mutex_exit(&spa_namespace_lock); 558 return (ENOENT); 559 } 560 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 561 562 spa_activate(spa); 563 564 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 565 566 if (error == EBADF) { 567 /* 568 * If vdev_load() returns EBADF, it indicates that one 569 * of the vdevs indicates that the pool has been 570 * exported or destroyed. If this is the case, the 571 * config cache is out of sync and we should remove the 572 * pool from the namespace. 573 */ 574 spa_unload(spa); 575 spa_deactivate(spa); 576 spa_remove(spa); 577 spa_config_sync(); 578 if (locked) 579 mutex_exit(&spa_namespace_lock); 580 return (ENOENT); 581 } 582 583 if (error) { 584 /* 585 * We can't open the pool, but we still have useful 586 * information: the state of each vdev after the 587 * attempted vdev_open(). Return this to the user. 588 */ 589 if (config != NULL && spa->spa_root_vdev != NULL) { 590 spa_config_enter(spa, RW_READER, FTAG); 591 *config = spa_config_generate(spa, NULL, -1ULL, 592 B_TRUE); 593 spa_config_exit(spa, FTAG); 594 } 595 spa_unload(spa); 596 spa_deactivate(spa); 597 spa->spa_last_open_failed = B_TRUE; 598 if (locked) 599 mutex_exit(&spa_namespace_lock); 600 *spapp = NULL; 601 return (error); 602 } else { 603 zfs_post_ok(spa, NULL); 604 spa->spa_last_open_failed = B_FALSE; 605 } 606 607 loaded = B_TRUE; 608 } 609 610 spa_open_ref(spa, tag); 611 if (locked) 612 mutex_exit(&spa_namespace_lock); 613 614 *spapp = spa; 615 616 if (config != NULL) { 617 spa_config_enter(spa, RW_READER, FTAG); 618 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 619 spa_config_exit(spa, FTAG); 620 } 621 622 /* 623 * If we just loaded the pool, resilver anything that's out of date. 624 */ 625 if (loaded && (spa_mode & FWRITE)) 626 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 627 628 return (0); 629 } 630 631 int 632 spa_open(const char *name, spa_t **spapp, void *tag) 633 { 634 return (spa_open_common(name, spapp, tag, NULL)); 635 } 636 637 /* 638 * Lookup the given spa_t, incrementing the inject count in the process, 639 * preventing it from being exported or destroyed. 640 */ 641 spa_t * 642 spa_inject_addref(char *name) 643 { 644 spa_t *spa; 645 646 mutex_enter(&spa_namespace_lock); 647 if ((spa = spa_lookup(name)) == NULL) { 648 mutex_exit(&spa_namespace_lock); 649 return (NULL); 650 } 651 spa->spa_inject_ref++; 652 mutex_exit(&spa_namespace_lock); 653 654 return (spa); 655 } 656 657 void 658 spa_inject_delref(spa_t *spa) 659 { 660 mutex_enter(&spa_namespace_lock); 661 spa->spa_inject_ref--; 662 mutex_exit(&spa_namespace_lock); 663 } 664 665 int 666 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 667 { 668 int error; 669 spa_t *spa; 670 671 *config = NULL; 672 error = spa_open_common(name, &spa, FTAG, config); 673 674 if (spa && *config != NULL) 675 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 676 spa_get_errlog_size(spa)) == 0); 677 678 /* 679 * We want to get the alternate root even for faulted pools, so we cheat 680 * and call spa_lookup() directly. 681 */ 682 if (altroot) { 683 if (spa == NULL) { 684 mutex_enter(&spa_namespace_lock); 685 spa = spa_lookup(name); 686 if (spa) 687 spa_altroot(spa, altroot, buflen); 688 else 689 altroot[0] = '\0'; 690 spa = NULL; 691 mutex_exit(&spa_namespace_lock); 692 } else { 693 spa_altroot(spa, altroot, buflen); 694 } 695 } 696 697 if (spa != NULL) 698 spa_close(spa, FTAG); 699 700 return (error); 701 } 702 703 /* 704 * Pool Creation 705 */ 706 int 707 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 708 { 709 spa_t *spa; 710 vdev_t *rvd; 711 dsl_pool_t *dp; 712 dmu_tx_t *tx; 713 int c, error; 714 uint64_t txg = TXG_INITIAL; 715 716 /* 717 * If this pool already exists, return failure. 718 */ 719 mutex_enter(&spa_namespace_lock); 720 if (spa_lookup(pool) != NULL) { 721 mutex_exit(&spa_namespace_lock); 722 return (EEXIST); 723 } 724 725 /* 726 * Allocate a new spa_t structure. 727 */ 728 spa = spa_add(pool, altroot); 729 spa_activate(spa); 730 731 spa->spa_uberblock.ub_txg = txg - 1; 732 spa->spa_ubsync = spa->spa_uberblock; 733 734 /* 735 * Create the root vdev. 736 */ 737 spa_config_enter(spa, RW_WRITER, FTAG); 738 739 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 740 741 ASSERT(spa->spa_root_vdev == rvd); 742 743 if (rvd == NULL) { 744 error = EINVAL; 745 } else { 746 if ((error = vdev_create(rvd, txg)) == 0) { 747 for (c = 0; c < rvd->vdev_children; c++) 748 vdev_init(rvd->vdev_child[c], txg); 749 vdev_config_dirty(rvd); 750 } 751 } 752 753 spa_config_exit(spa, FTAG); 754 755 if (error) { 756 spa_unload(spa); 757 spa_deactivate(spa); 758 spa_remove(spa); 759 mutex_exit(&spa_namespace_lock); 760 return (error); 761 } 762 763 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 764 spa->spa_meta_objset = dp->dp_meta_objset; 765 766 tx = dmu_tx_create_assigned(dp, txg); 767 768 /* 769 * Create the pool config object. 770 */ 771 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 772 DMU_OT_PACKED_NVLIST, 1 << 14, 773 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 774 775 if (zap_add(spa->spa_meta_objset, 776 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 777 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 778 cmn_err(CE_PANIC, "failed to add pool config"); 779 } 780 781 /* 782 * Create the deferred-free bplist object. Turn off compression 783 * because sync-to-convergence takes longer if the blocksize 784 * keeps changing. 785 */ 786 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 787 1 << 14, tx); 788 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 789 ZIO_COMPRESS_OFF, tx); 790 791 if (zap_add(spa->spa_meta_objset, 792 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 793 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 794 cmn_err(CE_PANIC, "failed to add bplist"); 795 } 796 797 dmu_tx_commit(tx); 798 799 spa->spa_sync_on = B_TRUE; 800 txg_sync_start(spa->spa_dsl_pool); 801 802 /* 803 * We explicitly wait for the first transaction to complete so that our 804 * bean counters are appropriately updated. 805 */ 806 txg_wait_synced(spa->spa_dsl_pool, txg); 807 808 spa_config_sync(); 809 810 mutex_exit(&spa_namespace_lock); 811 812 return (0); 813 } 814 815 /* 816 * Import the given pool into the system. We set up the necessary spa_t and 817 * then call spa_load() to do the dirty work. 818 */ 819 int 820 spa_import(const char *pool, nvlist_t *config, const char *altroot) 821 { 822 spa_t *spa; 823 int error; 824 825 if (!(spa_mode & FWRITE)) 826 return (EROFS); 827 828 /* 829 * If a pool with this name exists, return failure. 830 */ 831 mutex_enter(&spa_namespace_lock); 832 if (spa_lookup(pool) != NULL) { 833 mutex_exit(&spa_namespace_lock); 834 return (EEXIST); 835 } 836 837 /* 838 * Create and initialize the spa structure. 839 */ 840 spa = spa_add(pool, altroot); 841 spa_activate(spa); 842 843 /* 844 * Pass off the heavy lifting to spa_load(). 845 * Pass TRUE for mosconfig because the user-supplied config 846 * is actually the one to trust when doing an import. 847 */ 848 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 849 850 if (error) { 851 spa_unload(spa); 852 spa_deactivate(spa); 853 spa_remove(spa); 854 mutex_exit(&spa_namespace_lock); 855 return (error); 856 } 857 858 /* 859 * Update the config cache to include the newly-imported pool. 860 */ 861 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 862 863 mutex_exit(&spa_namespace_lock); 864 865 /* 866 * Resilver anything that's out of date. 867 */ 868 if (spa_mode & FWRITE) 869 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 870 871 return (0); 872 } 873 874 /* 875 * This (illegal) pool name is used when temporarily importing a spa_t in order 876 * to get the vdev stats associated with the imported devices. 877 */ 878 #define TRYIMPORT_NAME "$import" 879 880 nvlist_t * 881 spa_tryimport(nvlist_t *tryconfig) 882 { 883 nvlist_t *config = NULL; 884 char *poolname; 885 spa_t *spa; 886 uint64_t state; 887 888 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 889 return (NULL); 890 891 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 892 return (NULL); 893 894 /* 895 * Create and initialize the spa structure. 896 */ 897 mutex_enter(&spa_namespace_lock); 898 spa = spa_add(TRYIMPORT_NAME, NULL); 899 spa_activate(spa); 900 901 /* 902 * Pass off the heavy lifting to spa_load(). 903 * Pass TRUE for mosconfig because the user-supplied config 904 * is actually the one to trust when doing an import. 905 */ 906 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 907 908 /* 909 * If 'tryconfig' was at least parsable, return the current config. 910 */ 911 if (spa->spa_root_vdev != NULL) { 912 spa_config_enter(spa, RW_READER, FTAG); 913 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 914 spa_config_exit(spa, FTAG); 915 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 916 poolname) == 0); 917 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 918 state) == 0); 919 } 920 921 spa_unload(spa); 922 spa_deactivate(spa); 923 spa_remove(spa); 924 mutex_exit(&spa_namespace_lock); 925 926 return (config); 927 } 928 929 /* 930 * Pool export/destroy 931 * 932 * The act of destroying or exporting a pool is very simple. We make sure there 933 * is no more pending I/O and any references to the pool are gone. Then, we 934 * update the pool state and sync all the labels to disk, removing the 935 * configuration from the cache afterwards. 936 */ 937 static int 938 spa_export_common(char *pool, int new_state) 939 { 940 spa_t *spa; 941 942 if (!(spa_mode & FWRITE)) 943 return (EROFS); 944 945 mutex_enter(&spa_namespace_lock); 946 if ((spa = spa_lookup(pool)) == NULL) { 947 mutex_exit(&spa_namespace_lock); 948 return (ENOENT); 949 } 950 951 /* 952 * Put a hold on the pool, drop the namespace lock, stop async tasks, 953 * reacquire the namespace lock, and see if we can export. 954 */ 955 spa_open_ref(spa, FTAG); 956 mutex_exit(&spa_namespace_lock); 957 spa_async_suspend(spa); 958 mutex_enter(&spa_namespace_lock); 959 spa_close(spa, FTAG); 960 961 /* 962 * The pool will be in core if it's openable, 963 * in which case we can modify its state. 964 */ 965 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 966 /* 967 * Objsets may be open only because they're dirty, so we 968 * have to force it to sync before checking spa_refcnt. 969 */ 970 spa_scrub_suspend(spa); 971 txg_wait_synced(spa->spa_dsl_pool, 0); 972 973 /* 974 * A pool cannot be exported or destroyed if there are active 975 * references. If we are resetting a pool, allow references by 976 * fault injection handlers. 977 */ 978 if (!spa_refcount_zero(spa) || 979 (spa->spa_inject_ref != 0 && 980 new_state != POOL_STATE_UNINITIALIZED)) { 981 spa_scrub_resume(spa); 982 spa_async_resume(spa); 983 mutex_exit(&spa_namespace_lock); 984 return (EBUSY); 985 } 986 987 spa_scrub_resume(spa); 988 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 989 990 /* 991 * We want this to be reflected on every label, 992 * so mark them all dirty. spa_unload() will do the 993 * final sync that pushes these changes out. 994 */ 995 if (new_state != POOL_STATE_UNINITIALIZED) { 996 spa_config_enter(spa, RW_WRITER, FTAG); 997 spa->spa_state = new_state; 998 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 999 vdev_config_dirty(spa->spa_root_vdev); 1000 spa_config_exit(spa, FTAG); 1001 } 1002 } 1003 1004 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1005 spa_unload(spa); 1006 spa_deactivate(spa); 1007 } 1008 1009 if (new_state != POOL_STATE_UNINITIALIZED) { 1010 spa_remove(spa); 1011 spa_config_sync(); 1012 } 1013 mutex_exit(&spa_namespace_lock); 1014 1015 return (0); 1016 } 1017 1018 /* 1019 * Destroy a storage pool. 1020 */ 1021 int 1022 spa_destroy(char *pool) 1023 { 1024 return (spa_export_common(pool, POOL_STATE_DESTROYED)); 1025 } 1026 1027 /* 1028 * Export a storage pool. 1029 */ 1030 int 1031 spa_export(char *pool) 1032 { 1033 return (spa_export_common(pool, POOL_STATE_EXPORTED)); 1034 } 1035 1036 /* 1037 * Similar to spa_export(), this unloads the spa_t without actually removing it 1038 * from the namespace in any way. 1039 */ 1040 int 1041 spa_reset(char *pool) 1042 { 1043 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); 1044 } 1045 1046 1047 /* 1048 * ========================================================================== 1049 * Device manipulation 1050 * ========================================================================== 1051 */ 1052 1053 /* 1054 * Add capacity to a storage pool. 1055 */ 1056 int 1057 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1058 { 1059 uint64_t txg; 1060 int c, error; 1061 vdev_t *rvd = spa->spa_root_vdev; 1062 vdev_t *vd, *tvd; 1063 1064 txg = spa_vdev_enter(spa); 1065 1066 vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1067 1068 if (vd == NULL) 1069 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1070 1071 if ((error = vdev_create(vd, txg)) != 0) 1072 return (spa_vdev_exit(spa, vd, txg, error)); 1073 1074 /* 1075 * Transfer each new top-level vdev from vd to rvd. 1076 */ 1077 for (c = 0; c < vd->vdev_children; c++) { 1078 tvd = vd->vdev_child[c]; 1079 vdev_remove_child(vd, tvd); 1080 tvd->vdev_id = rvd->vdev_children; 1081 vdev_add_child(rvd, tvd); 1082 vdev_config_dirty(tvd); 1083 } 1084 1085 /* 1086 * We have to be careful when adding new vdevs to an existing pool. 1087 * If other threads start allocating from these vdevs before we 1088 * sync the config cache, and we lose power, then upon reboot we may 1089 * fail to open the pool because there are DVAs that the config cache 1090 * can't translate. Therefore, we first add the vdevs without 1091 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1092 * and then let spa_config_update() initialize the new metaslabs. 1093 * 1094 * spa_load() checks for added-but-not-initialized vdevs, so that 1095 * if we lose power at any point in this sequence, the remaining 1096 * steps will be completed the next time we load the pool. 1097 */ 1098 (void) spa_vdev_exit(spa, vd, txg, 0); 1099 1100 mutex_enter(&spa_namespace_lock); 1101 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1102 mutex_exit(&spa_namespace_lock); 1103 1104 return (0); 1105 } 1106 1107 /* 1108 * Attach a device to a mirror. The arguments are the path to any device 1109 * in the mirror, and the nvroot for the new device. If the path specifies 1110 * a device that is not mirrored, we automatically insert the mirror vdev. 1111 * 1112 * If 'replacing' is specified, the new device is intended to replace the 1113 * existing device; in this case the two devices are made into their own 1114 * mirror using the 'replacing' vdev, which is functionally idendical to 1115 * the mirror vdev (it actually reuses all the same ops) but has a few 1116 * extra rules: you can't attach to it after it's been created, and upon 1117 * completion of resilvering, the first disk (the one being replaced) 1118 * is automatically detached. 1119 */ 1120 int 1121 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1122 { 1123 uint64_t txg, open_txg; 1124 int error; 1125 vdev_t *rvd = spa->spa_root_vdev; 1126 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1127 vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 1128 1129 txg = spa_vdev_enter(spa); 1130 1131 oldvd = vdev_lookup_by_guid(rvd, guid); 1132 1133 if (oldvd == NULL) 1134 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1135 1136 if (!oldvd->vdev_ops->vdev_op_leaf) 1137 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1138 1139 pvd = oldvd->vdev_parent; 1140 1141 /* 1142 * The parent must be a mirror or the root, unless we're replacing; 1143 * in that case, the parent can be anything but another replacing vdev. 1144 */ 1145 if (pvd->vdev_ops != &vdev_mirror_ops && 1146 pvd->vdev_ops != &vdev_root_ops && 1147 (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 1148 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1149 1150 newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1151 1152 if (newrootvd == NULL || newrootvd->vdev_children != 1) 1153 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1154 1155 newvd = newrootvd->vdev_child[0]; 1156 1157 if (!newvd->vdev_ops->vdev_op_leaf) 1158 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1159 1160 if ((error = vdev_create(newrootvd, txg)) != 0) 1161 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1162 1163 /* 1164 * Compare the new device size with the replaceable/attachable 1165 * device size. 1166 */ 1167 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1168 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1169 1170 /* 1171 * The new device cannot have a higher alignment requirement 1172 * than the top-level vdev. 1173 */ 1174 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1175 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1176 1177 /* 1178 * If this is an in-place replacement, update oldvd's path and devid 1179 * to make it distinguishable from newvd, and unopenable from now on. 1180 */ 1181 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1182 spa_strfree(oldvd->vdev_path); 1183 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1184 KM_SLEEP); 1185 (void) sprintf(oldvd->vdev_path, "%s/%s", 1186 newvd->vdev_path, "old"); 1187 if (oldvd->vdev_devid != NULL) { 1188 spa_strfree(oldvd->vdev_devid); 1189 oldvd->vdev_devid = NULL; 1190 } 1191 } 1192 1193 /* 1194 * If the parent is not a mirror, or if we're replacing, 1195 * insert the new mirror/replacing vdev above oldvd. 1196 */ 1197 if (pvd->vdev_ops != pvops) 1198 pvd = vdev_add_parent(oldvd, pvops); 1199 1200 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1201 ASSERT(pvd->vdev_ops == pvops); 1202 ASSERT(oldvd->vdev_parent == pvd); 1203 1204 /* 1205 * Extract the new device from its root and add it to pvd. 1206 */ 1207 vdev_remove_child(newrootvd, newvd); 1208 newvd->vdev_id = pvd->vdev_children; 1209 vdev_add_child(pvd, newvd); 1210 1211 /* 1212 * If newvd is smaller than oldvd, but larger than its rsize, 1213 * the addition of newvd may have decreased our parent's asize. 1214 */ 1215 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1216 1217 tvd = newvd->vdev_top; 1218 ASSERT(pvd->vdev_top == tvd); 1219 ASSERT(tvd->vdev_parent == rvd); 1220 1221 vdev_config_dirty(tvd); 1222 1223 /* 1224 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1225 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1226 */ 1227 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1228 1229 mutex_enter(&newvd->vdev_dtl_lock); 1230 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1231 open_txg - TXG_INITIAL + 1); 1232 mutex_exit(&newvd->vdev_dtl_lock); 1233 1234 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1235 1236 /* 1237 * Mark newvd's DTL dirty in this txg. 1238 */ 1239 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1240 1241 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1242 1243 /* 1244 * Kick off a resilver to update newvd. 1245 */ 1246 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1247 1248 return (0); 1249 } 1250 1251 /* 1252 * Detach a device from a mirror or replacing vdev. 1253 * If 'replace_done' is specified, only detach if the parent 1254 * is a replacing vdev. 1255 */ 1256 int 1257 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1258 { 1259 uint64_t txg; 1260 int c, t, error; 1261 vdev_t *rvd = spa->spa_root_vdev; 1262 vdev_t *vd, *pvd, *cvd, *tvd; 1263 1264 txg = spa_vdev_enter(spa); 1265 1266 vd = vdev_lookup_by_guid(rvd, guid); 1267 1268 if (vd == NULL) 1269 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1270 1271 if (!vd->vdev_ops->vdev_op_leaf) 1272 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1273 1274 pvd = vd->vdev_parent; 1275 1276 /* 1277 * If replace_done is specified, only remove this device if it's 1278 * the first child of a replacing vdev. 1279 */ 1280 if (replace_done && 1281 (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1282 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1283 1284 /* 1285 * Only mirror and replacing vdevs support detach. 1286 */ 1287 if (pvd->vdev_ops != &vdev_replacing_ops && 1288 pvd->vdev_ops != &vdev_mirror_ops) 1289 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1290 1291 /* 1292 * If there's only one replica, you can't detach it. 1293 */ 1294 if (pvd->vdev_children <= 1) 1295 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1296 1297 /* 1298 * If all siblings have non-empty DTLs, this device may have the only 1299 * valid copy of the data, which means we cannot safely detach it. 1300 * 1301 * XXX -- as in the vdev_offline() case, we really want a more 1302 * precise DTL check. 1303 */ 1304 for (c = 0; c < pvd->vdev_children; c++) { 1305 uint64_t dirty; 1306 1307 cvd = pvd->vdev_child[c]; 1308 if (cvd == vd) 1309 continue; 1310 if (vdev_is_dead(cvd)) 1311 continue; 1312 mutex_enter(&cvd->vdev_dtl_lock); 1313 dirty = cvd->vdev_dtl_map.sm_space | 1314 cvd->vdev_dtl_scrub.sm_space; 1315 mutex_exit(&cvd->vdev_dtl_lock); 1316 if (!dirty) 1317 break; 1318 } 1319 if (c == pvd->vdev_children) 1320 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1321 1322 /* 1323 * Erase the disk labels so the disk can be used for other things. 1324 * This must be done after all other error cases are handled, 1325 * but before we disembowel vd (so we can still do I/O to it). 1326 * But if we can't do it, don't treat the error as fatal -- 1327 * it may be that the unwritability of the disk is the reason 1328 * it's being detached! 1329 */ 1330 error = vdev_label_init(vd, 0); 1331 if (error) 1332 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1333 1334 /* 1335 * Remove vd from its parent and compact the parent's children. 1336 */ 1337 vdev_remove_child(pvd, vd); 1338 vdev_compact_children(pvd); 1339 1340 /* 1341 * Remember one of the remaining children so we can get tvd below. 1342 */ 1343 cvd = pvd->vdev_child[0]; 1344 1345 /* 1346 * If the parent mirror/replacing vdev only has one child, 1347 * the parent is no longer needed. Remove it from the tree. 1348 */ 1349 if (pvd->vdev_children == 1) 1350 vdev_remove_parent(cvd); 1351 1352 /* 1353 * We don't set tvd until now because the parent we just removed 1354 * may have been the previous top-level vdev. 1355 */ 1356 tvd = cvd->vdev_top; 1357 ASSERT(tvd->vdev_parent == rvd); 1358 1359 /* 1360 * Reopen this top-level vdev to reassess health after detach. 1361 */ 1362 vdev_reopen(tvd); 1363 1364 /* 1365 * If the device we just detached was smaller than the others, 1366 * it may be possible to add metaslabs (i.e. grow the pool). 1367 * vdev_metaslab_init() can't fail because the existing metaslabs 1368 * are already in core, so there's nothing to read from disk. 1369 */ 1370 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1371 1372 vdev_config_dirty(tvd); 1373 1374 /* 1375 * Mark vd's DTL as dirty in this txg. 1376 * vdev_dtl_sync() will see that vd->vdev_detached is set 1377 * and free vd's DTL object in syncing context. 1378 * But first make sure we're not on any *other* txg's DTL list, 1379 * to prevent vd from being accessed after it's freed. 1380 */ 1381 for (t = 0; t < TXG_SIZE; t++) 1382 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1383 vd->vdev_detached = B_TRUE; 1384 vdev_dirty(tvd, VDD_DTL, vd, txg); 1385 1386 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1387 1388 return (spa_vdev_exit(spa, vd, txg, 0)); 1389 } 1390 1391 /* 1392 * Find any device that's done replacing, so we can detach it. 1393 */ 1394 static vdev_t * 1395 spa_vdev_replace_done_hunt(vdev_t *vd) 1396 { 1397 vdev_t *newvd, *oldvd; 1398 int c; 1399 1400 for (c = 0; c < vd->vdev_children; c++) { 1401 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1402 if (oldvd != NULL) 1403 return (oldvd); 1404 } 1405 1406 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1407 oldvd = vd->vdev_child[0]; 1408 newvd = vd->vdev_child[1]; 1409 1410 mutex_enter(&newvd->vdev_dtl_lock); 1411 if (newvd->vdev_dtl_map.sm_space == 0 && 1412 newvd->vdev_dtl_scrub.sm_space == 0) { 1413 mutex_exit(&newvd->vdev_dtl_lock); 1414 return (oldvd); 1415 } 1416 mutex_exit(&newvd->vdev_dtl_lock); 1417 } 1418 1419 return (NULL); 1420 } 1421 1422 static void 1423 spa_vdev_replace_done(spa_t *spa) 1424 { 1425 vdev_t *vd; 1426 uint64_t guid; 1427 1428 spa_config_enter(spa, RW_READER, FTAG); 1429 1430 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 1431 guid = vd->vdev_guid; 1432 spa_config_exit(spa, FTAG); 1433 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 1434 return; 1435 spa_config_enter(spa, RW_READER, FTAG); 1436 } 1437 1438 spa_config_exit(spa, FTAG); 1439 } 1440 1441 /* 1442 * Update the stored path for this vdev. Dirty the vdev configuration, relying 1443 * on spa_vdev_enter/exit() to synchronize the labels and cache. 1444 */ 1445 int 1446 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 1447 { 1448 vdev_t *rvd, *vd; 1449 uint64_t txg; 1450 1451 rvd = spa->spa_root_vdev; 1452 1453 txg = spa_vdev_enter(spa); 1454 1455 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1456 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 1457 1458 if (!vd->vdev_ops->vdev_op_leaf) 1459 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1460 1461 spa_strfree(vd->vdev_path); 1462 vd->vdev_path = spa_strdup(newpath); 1463 1464 vdev_config_dirty(vd->vdev_top); 1465 1466 return (spa_vdev_exit(spa, NULL, txg, 0)); 1467 } 1468 1469 /* 1470 * ========================================================================== 1471 * SPA Scrubbing 1472 * ========================================================================== 1473 */ 1474 1475 void 1476 spa_scrub_throttle(spa_t *spa, int direction) 1477 { 1478 mutex_enter(&spa->spa_scrub_lock); 1479 spa->spa_scrub_throttled += direction; 1480 ASSERT(spa->spa_scrub_throttled >= 0); 1481 if (spa->spa_scrub_throttled == 0) 1482 cv_broadcast(&spa->spa_scrub_io_cv); 1483 mutex_exit(&spa->spa_scrub_lock); 1484 } 1485 1486 static void 1487 spa_scrub_io_done(zio_t *zio) 1488 { 1489 spa_t *spa = zio->io_spa; 1490 1491 zio_buf_free(zio->io_data, zio->io_size); 1492 1493 mutex_enter(&spa->spa_scrub_lock); 1494 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1495 vdev_t *vd = zio->io_vd; 1496 spa->spa_scrub_errors++; 1497 mutex_enter(&vd->vdev_stat_lock); 1498 vd->vdev_stat.vs_scrub_errors++; 1499 mutex_exit(&vd->vdev_stat_lock); 1500 } 1501 if (--spa->spa_scrub_inflight == 0) { 1502 cv_broadcast(&spa->spa_scrub_io_cv); 1503 ASSERT(spa->spa_scrub_throttled == 0); 1504 } 1505 mutex_exit(&spa->spa_scrub_lock); 1506 } 1507 1508 static void 1509 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 1510 zbookmark_t *zb) 1511 { 1512 size_t size = BP_GET_LSIZE(bp); 1513 void *data = zio_buf_alloc(size); 1514 1515 mutex_enter(&spa->spa_scrub_lock); 1516 spa->spa_scrub_inflight++; 1517 mutex_exit(&spa->spa_scrub_lock); 1518 1519 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 1520 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 1521 1522 flags |= ZIO_FLAG_CANFAIL; 1523 1524 zio_nowait(zio_read(NULL, spa, bp, data, size, 1525 spa_scrub_io_done, NULL, priority, flags, zb)); 1526 } 1527 1528 /* ARGSUSED */ 1529 static int 1530 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1531 { 1532 blkptr_t *bp = &bc->bc_blkptr; 1533 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1534 1535 if (bc->bc_errno || vd == NULL) { 1536 /* 1537 * We can't scrub this block, but we can continue to scrub 1538 * the rest of the pool. Note the error and move along. 1539 */ 1540 mutex_enter(&spa->spa_scrub_lock); 1541 spa->spa_scrub_errors++; 1542 mutex_exit(&spa->spa_scrub_lock); 1543 1544 if (vd != NULL) { 1545 mutex_enter(&vd->vdev_stat_lock); 1546 vd->vdev_stat.vs_scrub_errors++; 1547 mutex_exit(&vd->vdev_stat_lock); 1548 } 1549 1550 return (ERESTART); 1551 } 1552 1553 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1554 1555 /* 1556 * Keep track of how much data we've examined so that 1557 * zpool(1M) status can make useful progress reports. 1558 */ 1559 mutex_enter(&vd->vdev_stat_lock); 1560 vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1561 mutex_exit(&vd->vdev_stat_lock); 1562 1563 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1564 if (DVA_GET_GANG(&bp->blk_dva[0])) { 1565 /* 1566 * Gang members may be spread across multiple vdevs, 1567 * so the best we can do is look at the pool-wide DTL. 1568 * XXX -- it would be better to change our allocation 1569 * policy to ensure that this can't happen. 1570 */ 1571 vd = spa->spa_root_vdev; 1572 } 1573 if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1574 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1575 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 1576 } 1577 } else { 1578 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1579 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 1580 } 1581 1582 return (0); 1583 } 1584 1585 static void 1586 spa_scrub_thread(spa_t *spa) 1587 { 1588 callb_cpr_t cprinfo; 1589 traverse_handle_t *th = spa->spa_scrub_th; 1590 vdev_t *rvd = spa->spa_root_vdev; 1591 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1592 int error = 0; 1593 boolean_t complete; 1594 1595 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1596 1597 /* 1598 * If we're restarting due to a snapshot create/delete, 1599 * wait for that to complete. 1600 */ 1601 txg_wait_synced(spa_get_dsl(spa), 0); 1602 1603 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 1604 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1605 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 1606 1607 spa_config_enter(spa, RW_WRITER, FTAG); 1608 vdev_reopen(rvd); /* purge all vdev caches */ 1609 vdev_config_dirty(rvd); /* rewrite all disk labels */ 1610 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1611 spa_config_exit(spa, FTAG); 1612 1613 mutex_enter(&spa->spa_scrub_lock); 1614 spa->spa_scrub_errors = 0; 1615 spa->spa_scrub_active = 1; 1616 ASSERT(spa->spa_scrub_inflight == 0); 1617 ASSERT(spa->spa_scrub_throttled == 0); 1618 1619 while (!spa->spa_scrub_stop) { 1620 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1621 while (spa->spa_scrub_suspended) { 1622 spa->spa_scrub_active = 0; 1623 cv_broadcast(&spa->spa_scrub_cv); 1624 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1625 spa->spa_scrub_active = 1; 1626 } 1627 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1628 1629 if (spa->spa_scrub_restart_txg != 0) 1630 break; 1631 1632 mutex_exit(&spa->spa_scrub_lock); 1633 error = traverse_more(th); 1634 mutex_enter(&spa->spa_scrub_lock); 1635 if (error != EAGAIN) 1636 break; 1637 1638 while (spa->spa_scrub_throttled > 0) 1639 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1640 } 1641 1642 while (spa->spa_scrub_inflight) 1643 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1644 1645 spa->spa_scrub_active = 0; 1646 cv_broadcast(&spa->spa_scrub_cv); 1647 1648 mutex_exit(&spa->spa_scrub_lock); 1649 1650 spa_config_enter(spa, RW_WRITER, FTAG); 1651 1652 mutex_enter(&spa->spa_scrub_lock); 1653 1654 /* 1655 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 1656 * AND the spa config lock to synchronize with any config changes 1657 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 1658 */ 1659 if (spa->spa_scrub_restart_txg != 0) 1660 error = ERESTART; 1661 1662 if (spa->spa_scrub_stop) 1663 error = EINTR; 1664 1665 /* 1666 * Even if there were uncorrectable errors, we consider the scrub 1667 * completed. The downside is that if there is a transient error during 1668 * a resilver, we won't resilver the data properly to the target. But 1669 * if the damage is permanent (more likely) we will resilver forever, 1670 * which isn't really acceptable. Since there is enough information for 1671 * the user to know what has failed and why, this seems like a more 1672 * tractable approach. 1673 */ 1674 complete = (error == 0); 1675 1676 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1677 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1678 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1679 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1680 1681 mutex_exit(&spa->spa_scrub_lock); 1682 1683 /* 1684 * If the scrub/resilver completed, update all DTLs to reflect this. 1685 * Whether it succeeded or not, vacate all temporary scrub DTLs. 1686 */ 1687 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1688 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1689 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1690 spa_errlog_rotate(spa); 1691 1692 spa_config_exit(spa, FTAG); 1693 1694 mutex_enter(&spa->spa_scrub_lock); 1695 1696 /* 1697 * We may have finished replacing a device. 1698 * Let the async thread assess this and handle the detach. 1699 */ 1700 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1701 1702 /* 1703 * If we were told to restart, our final act is to start a new scrub. 1704 */ 1705 if (error == ERESTART) 1706 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 1707 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 1708 1709 spa->spa_scrub_type = POOL_SCRUB_NONE; 1710 spa->spa_scrub_active = 0; 1711 spa->spa_scrub_thread = NULL; 1712 cv_broadcast(&spa->spa_scrub_cv); 1713 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1714 thread_exit(); 1715 } 1716 1717 void 1718 spa_scrub_suspend(spa_t *spa) 1719 { 1720 mutex_enter(&spa->spa_scrub_lock); 1721 spa->spa_scrub_suspended++; 1722 while (spa->spa_scrub_active) { 1723 cv_broadcast(&spa->spa_scrub_cv); 1724 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1725 } 1726 while (spa->spa_scrub_inflight) 1727 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1728 mutex_exit(&spa->spa_scrub_lock); 1729 } 1730 1731 void 1732 spa_scrub_resume(spa_t *spa) 1733 { 1734 mutex_enter(&spa->spa_scrub_lock); 1735 ASSERT(spa->spa_scrub_suspended != 0); 1736 if (--spa->spa_scrub_suspended == 0) 1737 cv_broadcast(&spa->spa_scrub_cv); 1738 mutex_exit(&spa->spa_scrub_lock); 1739 } 1740 1741 void 1742 spa_scrub_restart(spa_t *spa, uint64_t txg) 1743 { 1744 /* 1745 * Something happened (e.g. snapshot create/delete) that means 1746 * we must restart any in-progress scrubs. The itinerary will 1747 * fix this properly. 1748 */ 1749 mutex_enter(&spa->spa_scrub_lock); 1750 spa->spa_scrub_restart_txg = txg; 1751 mutex_exit(&spa->spa_scrub_lock); 1752 } 1753 1754 int 1755 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1756 { 1757 space_seg_t *ss; 1758 uint64_t mintxg, maxtxg; 1759 vdev_t *rvd = spa->spa_root_vdev; 1760 1761 if ((uint_t)type >= POOL_SCRUB_TYPES) 1762 return (ENOTSUP); 1763 1764 mutex_enter(&spa->spa_scrub_lock); 1765 1766 /* 1767 * If there's a scrub or resilver already in progress, stop it. 1768 */ 1769 while (spa->spa_scrub_thread != NULL) { 1770 /* 1771 * Don't stop a resilver unless forced. 1772 */ 1773 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 1774 mutex_exit(&spa->spa_scrub_lock); 1775 return (EBUSY); 1776 } 1777 spa->spa_scrub_stop = 1; 1778 cv_broadcast(&spa->spa_scrub_cv); 1779 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1780 } 1781 1782 /* 1783 * Terminate the previous traverse. 1784 */ 1785 if (spa->spa_scrub_th != NULL) { 1786 traverse_fini(spa->spa_scrub_th); 1787 spa->spa_scrub_th = NULL; 1788 } 1789 1790 if (rvd == NULL) { 1791 ASSERT(spa->spa_scrub_stop == 0); 1792 ASSERT(spa->spa_scrub_type == type); 1793 ASSERT(spa->spa_scrub_restart_txg == 0); 1794 mutex_exit(&spa->spa_scrub_lock); 1795 return (0); 1796 } 1797 1798 mintxg = TXG_INITIAL - 1; 1799 maxtxg = spa_last_synced_txg(spa) + 1; 1800 1801 mutex_enter(&rvd->vdev_dtl_lock); 1802 1803 if (rvd->vdev_dtl_map.sm_space == 0) { 1804 /* 1805 * The pool-wide DTL is empty. 1806 * If this is a resilver, there's nothing to do except 1807 * check whether any in-progress replacements have completed. 1808 */ 1809 if (type == POOL_SCRUB_RESILVER) { 1810 type = POOL_SCRUB_NONE; 1811 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1812 } 1813 } else { 1814 /* 1815 * The pool-wide DTL is non-empty. 1816 * If this is a normal scrub, upgrade to a resilver instead. 1817 */ 1818 if (type == POOL_SCRUB_EVERYTHING) 1819 type = POOL_SCRUB_RESILVER; 1820 } 1821 1822 if (type == POOL_SCRUB_RESILVER) { 1823 /* 1824 * Determine the resilvering boundaries. 1825 * 1826 * Note: (mintxg, maxtxg) is an open interval, 1827 * i.e. mintxg and maxtxg themselves are not included. 1828 * 1829 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1830 * so we don't claim to resilver a txg that's still changing. 1831 */ 1832 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1833 mintxg = ss->ss_start - 1; 1834 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1835 maxtxg = MIN(ss->ss_end, maxtxg); 1836 } 1837 1838 mutex_exit(&rvd->vdev_dtl_lock); 1839 1840 spa->spa_scrub_stop = 0; 1841 spa->spa_scrub_type = type; 1842 spa->spa_scrub_restart_txg = 0; 1843 1844 if (type != POOL_SCRUB_NONE) { 1845 spa->spa_scrub_mintxg = mintxg; 1846 spa->spa_scrub_maxtxg = maxtxg; 1847 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1848 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 1849 ZIO_FLAG_CANFAIL); 1850 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1851 spa->spa_scrub_thread = thread_create(NULL, 0, 1852 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1853 } 1854 1855 mutex_exit(&spa->spa_scrub_lock); 1856 1857 return (0); 1858 } 1859 1860 /* 1861 * ========================================================================== 1862 * SPA async task processing 1863 * ========================================================================== 1864 */ 1865 1866 static void 1867 spa_async_reopen(spa_t *spa) 1868 { 1869 vdev_t *rvd = spa->spa_root_vdev; 1870 vdev_t *tvd; 1871 int c; 1872 1873 spa_config_enter(spa, RW_WRITER, FTAG); 1874 1875 for (c = 0; c < rvd->vdev_children; c++) { 1876 tvd = rvd->vdev_child[c]; 1877 if (tvd->vdev_reopen_wanted) { 1878 tvd->vdev_reopen_wanted = 0; 1879 vdev_reopen(tvd); 1880 } 1881 } 1882 1883 spa_config_exit(spa, FTAG); 1884 } 1885 1886 static void 1887 spa_async_thread(spa_t *spa) 1888 { 1889 int tasks; 1890 1891 ASSERT(spa->spa_sync_on); 1892 1893 mutex_enter(&spa->spa_async_lock); 1894 tasks = spa->spa_async_tasks; 1895 spa->spa_async_tasks = 0; 1896 mutex_exit(&spa->spa_async_lock); 1897 1898 /* 1899 * See if the config needs to be updated. 1900 */ 1901 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 1902 mutex_enter(&spa_namespace_lock); 1903 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1904 mutex_exit(&spa_namespace_lock); 1905 } 1906 1907 /* 1908 * See if any devices need to be reopened. 1909 */ 1910 if (tasks & SPA_ASYNC_REOPEN) 1911 spa_async_reopen(spa); 1912 1913 /* 1914 * If any devices are done replacing, detach them. 1915 */ 1916 if (tasks & SPA_ASYNC_REPLACE_DONE) 1917 spa_vdev_replace_done(spa); 1918 1919 /* 1920 * Kick off a scrub. 1921 */ 1922 if (tasks & SPA_ASYNC_SCRUB) 1923 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 1924 1925 /* 1926 * Kick off a resilver. 1927 */ 1928 if (tasks & SPA_ASYNC_RESILVER) 1929 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1930 1931 /* 1932 * Let the world know that we're done. 1933 */ 1934 mutex_enter(&spa->spa_async_lock); 1935 spa->spa_async_thread = NULL; 1936 cv_broadcast(&spa->spa_async_cv); 1937 mutex_exit(&spa->spa_async_lock); 1938 thread_exit(); 1939 } 1940 1941 void 1942 spa_async_suspend(spa_t *spa) 1943 { 1944 mutex_enter(&spa->spa_async_lock); 1945 spa->spa_async_suspended++; 1946 while (spa->spa_async_thread != NULL) 1947 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 1948 mutex_exit(&spa->spa_async_lock); 1949 } 1950 1951 void 1952 spa_async_resume(spa_t *spa) 1953 { 1954 mutex_enter(&spa->spa_async_lock); 1955 ASSERT(spa->spa_async_suspended != 0); 1956 spa->spa_async_suspended--; 1957 mutex_exit(&spa->spa_async_lock); 1958 } 1959 1960 static void 1961 spa_async_dispatch(spa_t *spa) 1962 { 1963 mutex_enter(&spa->spa_async_lock); 1964 if (spa->spa_async_tasks && !spa->spa_async_suspended && 1965 spa->spa_async_thread == NULL && 1966 rootdir != NULL && !vn_is_readonly(rootdir)) 1967 spa->spa_async_thread = thread_create(NULL, 0, 1968 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 1969 mutex_exit(&spa->spa_async_lock); 1970 } 1971 1972 void 1973 spa_async_request(spa_t *spa, int task) 1974 { 1975 mutex_enter(&spa->spa_async_lock); 1976 spa->spa_async_tasks |= task; 1977 mutex_exit(&spa->spa_async_lock); 1978 } 1979 1980 /* 1981 * ========================================================================== 1982 * SPA syncing routines 1983 * ========================================================================== 1984 */ 1985 1986 static void 1987 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 1988 { 1989 bplist_t *bpl = &spa->spa_sync_bplist; 1990 dmu_tx_t *tx; 1991 blkptr_t blk; 1992 uint64_t itor = 0; 1993 zio_t *zio; 1994 int error; 1995 uint8_t c = 1; 1996 1997 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 1998 1999 while (bplist_iterate(bpl, &itor, &blk) == 0) 2000 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2001 2002 error = zio_wait(zio); 2003 ASSERT3U(error, ==, 0); 2004 2005 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2006 bplist_vacate(bpl, tx); 2007 2008 /* 2009 * Pre-dirty the first block so we sync to convergence faster. 2010 * (Usually only the first block is needed.) 2011 */ 2012 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2013 dmu_tx_commit(tx); 2014 } 2015 2016 static void 2017 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2018 { 2019 nvlist_t *config; 2020 char *packed = NULL; 2021 size_t nvsize = 0; 2022 dmu_buf_t *db; 2023 2024 if (list_is_empty(&spa->spa_dirty_list)) 2025 return; 2026 2027 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2028 2029 if (spa->spa_config_syncing) 2030 nvlist_free(spa->spa_config_syncing); 2031 spa->spa_config_syncing = config; 2032 2033 VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 2034 2035 packed = kmem_alloc(nvsize, KM_SLEEP); 2036 2037 VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 2038 KM_SLEEP) == 0); 2039 2040 dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 2041 packed, tx); 2042 2043 kmem_free(packed, nvsize); 2044 2045 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 2046 spa->spa_config_object, FTAG, &db)); 2047 dmu_buf_will_dirty(db, tx); 2048 *(uint64_t *)db->db_data = nvsize; 2049 dmu_buf_rele(db, FTAG); 2050 } 2051 2052 /* 2053 * Sync the specified transaction group. New blocks may be dirtied as 2054 * part of the process, so we iterate until it converges. 2055 */ 2056 void 2057 spa_sync(spa_t *spa, uint64_t txg) 2058 { 2059 dsl_pool_t *dp = spa->spa_dsl_pool; 2060 objset_t *mos = spa->spa_meta_objset; 2061 bplist_t *bpl = &spa->spa_sync_bplist; 2062 vdev_t *rvd = spa->spa_root_vdev; 2063 vdev_t *vd; 2064 dmu_tx_t *tx; 2065 int dirty_vdevs; 2066 2067 /* 2068 * Lock out configuration changes. 2069 */ 2070 spa_config_enter(spa, RW_READER, FTAG); 2071 2072 spa->spa_syncing_txg = txg; 2073 spa->spa_sync_pass = 0; 2074 2075 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2076 2077 /* 2078 * If anything has changed in this txg, push the deferred frees 2079 * from the previous txg. If not, leave them alone so that we 2080 * don't generate work on an otherwise idle system. 2081 */ 2082 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2083 !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2084 spa_sync_deferred_frees(spa, txg); 2085 2086 /* 2087 * Iterate to convergence. 2088 */ 2089 do { 2090 spa->spa_sync_pass++; 2091 2092 tx = dmu_tx_create_assigned(dp, txg); 2093 spa_sync_config_object(spa, tx); 2094 dmu_tx_commit(tx); 2095 2096 spa_errlog_sync(spa, txg); 2097 2098 dsl_pool_sync(dp, txg); 2099 2100 dirty_vdevs = 0; 2101 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2102 vdev_sync(vd, txg); 2103 dirty_vdevs++; 2104 } 2105 2106 tx = dmu_tx_create_assigned(dp, txg); 2107 bplist_sync(bpl, tx); 2108 dmu_tx_commit(tx); 2109 2110 } while (dirty_vdevs); 2111 2112 bplist_close(bpl); 2113 2114 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2115 2116 /* 2117 * Rewrite the vdev configuration (which includes the uberblock) 2118 * to commit the transaction group. 2119 * 2120 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2121 * Otherwise, pick a random top-level vdev that's known to be 2122 * visible in the config cache (see spa_vdev_add() for details). 2123 * If the write fails, try the next vdev until we're tried them all. 2124 */ 2125 if (!list_is_empty(&spa->spa_dirty_list)) { 2126 VERIFY(vdev_config_sync(rvd, txg) == 0); 2127 } else { 2128 int children = rvd->vdev_children; 2129 int c0 = spa_get_random(children); 2130 int c; 2131 2132 for (c = 0; c < children; c++) { 2133 vd = rvd->vdev_child[(c0 + c) % children]; 2134 if (vd->vdev_ms_array == 0) 2135 continue; 2136 if (vdev_config_sync(vd, txg) == 0) 2137 break; 2138 } 2139 if (c == children) 2140 VERIFY(vdev_config_sync(rvd, txg) == 0); 2141 } 2142 2143 /* 2144 * Clear the dirty config list. 2145 */ 2146 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2147 vdev_config_clean(vd); 2148 2149 /* 2150 * Now that the new config has synced transactionally, 2151 * let it become visible to the config cache. 2152 */ 2153 if (spa->spa_config_syncing != NULL) { 2154 spa_config_set(spa, spa->spa_config_syncing); 2155 spa->spa_config_txg = txg; 2156 spa->spa_config_syncing = NULL; 2157 } 2158 2159 /* 2160 * Make a stable copy of the fully synced uberblock. 2161 * We use this as the root for pool traversals. 2162 */ 2163 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2164 2165 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2166 2167 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2168 spa->spa_traverse_wanted = 0; 2169 spa->spa_ubsync = spa->spa_uberblock; 2170 rw_exit(&spa->spa_traverse_lock); 2171 2172 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2173 2174 /* 2175 * Clean up the ZIL records for the synced txg. 2176 */ 2177 dsl_pool_zil_clean(dp); 2178 2179 /* 2180 * Update usable space statistics. 2181 */ 2182 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2183 vdev_sync_done(vd, txg); 2184 2185 /* 2186 * It had better be the case that we didn't dirty anything 2187 * since spa_sync_labels(). 2188 */ 2189 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2190 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2191 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2192 ASSERT(bpl->bpl_queue == NULL); 2193 2194 spa_config_exit(spa, FTAG); 2195 2196 /* 2197 * If any async tasks have been requested, kick them off. 2198 */ 2199 spa_async_dispatch(spa); 2200 } 2201 2202 /* 2203 * Sync all pools. We don't want to hold the namespace lock across these 2204 * operations, so we take a reference on the spa_t and drop the lock during the 2205 * sync. 2206 */ 2207 void 2208 spa_sync_allpools(void) 2209 { 2210 spa_t *spa = NULL; 2211 mutex_enter(&spa_namespace_lock); 2212 while ((spa = spa_next(spa)) != NULL) { 2213 if (spa_state(spa) != POOL_STATE_ACTIVE) 2214 continue; 2215 spa_open_ref(spa, FTAG); 2216 mutex_exit(&spa_namespace_lock); 2217 txg_wait_synced(spa_get_dsl(spa), 0); 2218 mutex_enter(&spa_namespace_lock); 2219 spa_close(spa, FTAG); 2220 } 2221 mutex_exit(&spa_namespace_lock); 2222 } 2223 2224 /* 2225 * ========================================================================== 2226 * Miscellaneous routines 2227 * ========================================================================== 2228 */ 2229 2230 /* 2231 * Remove all pools in the system. 2232 */ 2233 void 2234 spa_evict_all(void) 2235 { 2236 spa_t *spa; 2237 2238 /* 2239 * Remove all cached state. All pools should be closed now, 2240 * so every spa in the AVL tree should be unreferenced. 2241 */ 2242 mutex_enter(&spa_namespace_lock); 2243 while ((spa = spa_next(NULL)) != NULL) { 2244 /* 2245 * Stop async tasks. The async thread may need to detach 2246 * a device that's been replaced, which requires grabbing 2247 * spa_namespace_lock, so we must drop it here. 2248 */ 2249 spa_open_ref(spa, FTAG); 2250 mutex_exit(&spa_namespace_lock); 2251 spa_async_suspend(spa); 2252 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2253 mutex_enter(&spa_namespace_lock); 2254 spa_close(spa, FTAG); 2255 2256 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2257 spa_unload(spa); 2258 spa_deactivate(spa); 2259 } 2260 spa_remove(spa); 2261 } 2262 mutex_exit(&spa_namespace_lock); 2263 } 2264 2265 vdev_t * 2266 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2267 { 2268 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2269 } 2270