/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
	    4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));
}
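
/*
 * Note on the taskqs above: spa_activate() creates, for every ZIO type, one
 * taskq for issuing I/O and one for handling I/O completions (interrupts),
 * plus a single retry taskq for vdev I/O.  spa_deactivate() below tears the
 * same structures down in the reverse order of creation, and asserts that
 * the pool has already been unloaded (no DSL pool, no root vdev, no syncing).
 */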

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	taskq_destroy(spa->spa_vdev_retry_taskq);
	spa->spa_vdev_retry_taskq = NULL;

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER);
	spa_config_exit(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev) {
		vdev_free(spa->spa_root_vdev);
		spa->spa_root_vdev = NULL;
	}
}
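
/*
 * The routines above, together with spa_load() below, form the basic
 * lifecycle used throughout this file: spa_activate() sets up in-core state,
 * spa_load() reads the pool in from disk, and the teardown path is
 * spa_unload() followed by spa_deactivate().  spa_open_common(), spa_import()
 * and spa_export_common() all follow this pattern.
 */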

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.  The 'readonly' flag will prevent us
 * from writing any updated state to disk, and can be used when testing a pool
 * for import.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (import && spa_guid_exists(pool_guid, 0))
		return (EEXIST);

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa);

	if (rvd == NULL)
		return (EINVAL);

	spa->spa_root_vdev = rvd;
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0)
		return (ENXIO);

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		dprintf("ub_txg is zero\n");
		return (ENXIO);
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
		    rvd->vdev_guid_sum, ub->ub_guid_sum);
		return (ENXIO);
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		db = dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object);
		dmu_buf_read(db);
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read_canfail(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error)
			return (ENXIO);

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
	}

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);

	/*
	 * Load the vdev state for all top level vdevs.
	 */
	if ((error = vdev_load(rvd, import)) != 0)
		return (error);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
		return (ENXIO);

	/*
	 * Claim log blocks that haven't been committed yet, and update all
	 * top-level vdevs to sync any config changes found in vdev_load().
	 * This must all happen in a single txg.
	 */
	if ((spa_mode & FWRITE) && !readonly) {
		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		vdev_config_dirty(rvd);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}

	return (0);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and
	 * ends up calling spa_open() again.  The real fix is to figure out how
	 * to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config,
		    B_FALSE, B_FALSE, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

int
spa_get_stats(const char *name, nvlist_t **config)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}
	spa = spa_add(pool);

	/*
	 * Allocate a new spa_t structure.
	 */
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	error = spa_vdev_add(spa, nvroot);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (altroot != NULL) {
		spa->spa_root = spa_strdup(altroot);
		atomic_add_32(&spa_active_count, 1);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}
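
/*
 * At this point the new pool's MOS contains two objects referenced from the
 * pool directory ZAP: DMU_POOL_CONFIG, a packed XDR nvlist whose length is
 * kept in the object's bonus buffer (written by spa_sync_config_object() and
 * read back by spa_load() when it is handed a non-MOS config), and
 * DMU_POOL_SYNC_BPLIST, the deferred-free bplist drained by
 * spa_sync_deferred_frees().
 */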

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for
	 * mosconfig so that we don't try to open the pool if the config is
	 * damaged.
	 */
	error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot != NULL) {
		atomic_add_32(&spa_active_count, 1);
		spa->spa_root = spa_strdup(altroot);
	}

	/*
	 * Initialize the config based on the in-core state.
	 */
	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

	spa_config_set(spa, config);

	/*
	 * Sync the configuration cache.
	 */
	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME);

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	/*
	 * Initialize the spa_t structure.
	 */
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for
	 * mosconfig so we don't try to open the pool if the config is damaged.
	 */
	(void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}
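
/*
 * A rough sketch of how a caller might use spa_tryimport() (the names below
 * are illustrative; in practice the config comes down the zpool import ioctl
 * path, assembled from on-disk labels):
 *
 *	nvlist_t *config = spa_tryimport(label_config);
 *	if (config != NULL) {
 *		... report pool name, state and per-vdev status to the user ...
 *		nvlist_free(config);
 *	}
 */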

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		if (!spa_refcount_zero(spa)) {
			spa_scrub_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * Update the pool state.
		 */
		spa->spa_state = new_state;

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		if (spa->spa_root != NULL)
			atomic_add_32(&spa_active_count, -1);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		vdev_config_dirty(spa->spa_root_vdev);
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	spa_remove(spa);
	spa_config_sync();
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (rvd == NULL)			/* spa_create() */
		spa->spa_root_vdev = rvd = vd;

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each top-level vdev from the temporary root
	 * to the spa's root and initialize its metaslabs.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *tvd = vd->vdev_child[c];
		if (vd != rvd) {
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = rvd->vdev_children;
			vdev_add_child(rvd, tvd);
		}
		vdev_init(tvd, txg);
		vdev_config_dirty(tvd);
	}

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	return (spa_vdev_exit(spa, vd, txg, 0));
}
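
/*
 * The nvroot passed to spa_create() and spa_vdev_add() is a vdev tree in
 * nvlist form: a root nvlist whose ZPOOL_CONFIG_CHILDREN array holds one
 * nvlist per top-level vdev, each with nested children of its own for
 * mirrors and raidz.  spa_config_parse() walks exactly this shape.  A rough
 * sketch of a single-disk tree (illustrative only; userland normally builds
 * it via libzfs, and additional fields may be required):
 *
 *	nvlist_t *disk, *root;
 *	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c0t0d0s0") == 0);
 *	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 *	    &disk, 1) == 0);
 */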

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_path(rvd, path);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

	dprintf("attached %s, replacing=%d\n", path, replacing);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_path(rvd, path);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (guid != 0 && vd->vdev_guid != guid)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd, NULL);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 */
	vdev_metaslab_init(tvd, txg);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	vd->vdev_detached = B_TRUE;
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

	dprintf("detached %s\n", path);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * If there are any replacing vdevs that have finished replacing, detach them.
 * We can't hold the config lock across detaches, so we lock the config,
 * build a list of candidates, unlock the config, and try each candidate.
 */
typedef struct vdev_detach_link {
	char		*vdl_path;
	uint64_t	vdl_guid;
	list_node_t	vdl_node;
} vdev_detach_link_t;

static void
spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		vdev_t *cvd0 = vd->vdev_child[0];
		vdev_t *cvd1 = vd->vdev_child[1];
		vdev_detach_link_t *vdl;
		int dirty1;

		mutex_enter(&cvd1->vdev_dtl_lock);
		dirty1 = cvd1->vdev_dtl_map.sm_space |
		    cvd1->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd1->vdev_dtl_lock);

		if (!dirty1) {
			vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
			vdl->vdl_path = spa_strdup(cvd0->vdev_path);
			vdl->vdl_guid = cvd0->vdev_guid;
			list_insert_tail(l, vdl);
		}
	}
}

void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_detach_link_t *vdl;
	list_t vdlist;

	list_create(&vdlist, sizeof (vdev_detach_link_t),
	    offsetof(vdev_detach_link_t, vdl_node));

	spa_config_enter(spa, RW_READER);
	spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
	spa_config_exit(spa);

	while ((vdl = list_head(&vdlist)) != NULL) {
		list_remove(&vdlist, vdl);
		(void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
		    B_TRUE);
		spa_strfree(vdl->vdl_path);
		kmem_free(vdl, sizeof (*vdl));
	}

	list_destroy(&vdlist);
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);
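
/*
 * Overview of the scrub machinery below: spa_scrub()/spa_scrub_locked() start
 * spa_scrub_thread(), which repeatedly calls traverse_more() on a pool-wide
 * traverse handle.  For each block, the traverse callback spa_scrub_cb()
 * decides whether to read it (always for a scrub, only if it falls in a DTL
 * for a resilver) and hands it to spa_scrub_io_start(), which issues an async
 * zio_read() with spa_scrub_io_done() as the completion callback.  The done
 * callback frees the buffer and maintains spa_scrub_inflight and the error
 * counts that the thread waits on before finishing up.
 */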

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error)
		spa->spa_scrub_errors++;
	if (--spa->spa_scrub_inflight == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);

	if (zio->io_error) {
		vdev_t *vd = zio->io_vd;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
			    ZIO_FLAG_RESILVER);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	spa_config_enter(spa, RW_WRITER);
	vdev_reopen(rvd, NULL);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspend) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If the traverse completed, and there were no errors,
	 * then the scrub was completely successful.
	 */
	complete = (error == 0 && spa->spa_scrub_errors == 0);

	dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	spa_config_exit(spa);

	spa_vdev_replace_done(spa);

	spa_config_enter(spa, RW_READER);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;

	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);

	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspend++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspend != 0);
	if (--spa->spa_scrub_suspend == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}
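
/*
 * Note that spa_scrub_suspend() and spa_scrub_resume() form a counted pair:
 * spa_scrub_suspend is a count, so nested suspenders (e.g. spa_sync() pausing
 * the scrub while it snapshots the uberblock, or spa_export_common() forcing
 * a sync before export) each get a matching resume, and the scrub thread only
 * continues once the count drops back to zero.
 */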

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

static int
spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;
	int advance = 0;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
			return (EBUSY);

		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	switch (type) {

	case POOL_SCRUB_NONE:
		break;

	case POOL_SCRUB_RESILVER:
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		mutex_enter(&rvd->vdev_dtl_lock);
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss ? ss->ss_start - 1 : 0;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = ss ? ss->ss_end : 0;
		maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
		mutex_exit(&rvd->vdev_dtl_lock);

		advance = ADVANCE_PRE | ADVANCE_PRUNE;
		break;

	case POOL_SCRUB_EVERYTHING:
		/*
		 * A scrub is like a resilver, but not pruned by DTL.
		 */
		advance = ADVANCE_PRE;
		break;
	}

	if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    advance, ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	return (0);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	int error;
	traverse_handle_t *th;

	mutex_enter(&spa->spa_scrub_lock);
	error = spa_scrub_locked(spa, type, force);
	th = spa->spa_scrub_th;
	mutex_exit(&spa->spa_scrub_lock);

	if (th == NULL && type != POOL_SCRUB_NONE)
		spa_vdev_replace_done(spa);

	return (error);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	spa_config_set(spa, config);

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db);
}
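
/*
 * The layout written by spa_sync_config_object() is the mirror image of what
 * spa_load() reads in the !mosconfig case: the object data holds the config
 * packed as an XDR-encoded nvlist, and the packed length is stored as a
 * uint64_t in the object's bonus buffer so the reader knows how many bytes
 * to unpack.
 */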

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	bplist_open(bpl, mos, spa->spa_sync_bplist_obj);

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 */
	while (spa_sync_labels(spa, txg)) {
		dprintf("waiting for devices to heal\n");
		delay(hz);
		vdev_reopen(rvd, NULL);
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since spa_sync_labels().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

int
spa_busy(void)
{
	return (spa_active_count != 0);
}

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop all scrub and resilver activity.  spa_scrub() needs to
		 * wait for the scrub thread, which may do a detach and sync
		 * the configs, which needs spa_namespace_lock.  Drop the lock
		 * while maintaining a hold on the spa_t.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}