/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
        int t;

        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        spa->spa_state = POOL_STATE_ACTIVE;

        spa->spa_normal_class = metaslab_class_create();

        spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
            4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

        for (t = 0; t < ZIO_TYPES; t++) {
                spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
                    8, maxclsyspri, 50, INT_MAX,
                    TASKQ_PREPOPULATE);
                spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
                    8, maxclsyspri, 50, INT_MAX,
                    TASKQ_PREPOPULATE);
        }

        rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

        list_create(&spa->spa_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_dirty_node));

        txg_list_create(&spa->spa_vdev_txg_list,
            offsetof(struct vdev, vdev_txg_node));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
        int t;

        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);

        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_dirty_list);

        rw_destroy(&spa->spa_traverse_lock);

        for (t = 0; t < ZIO_TYPES; t++) {
                taskq_destroy(spa->spa_zio_issue_taskq[t]);
                taskq_destroy(spa->spa_zio_intr_taskq[t]);
                spa->spa_zio_issue_taskq[t] = NULL;
                spa->spa_zio_intr_taskq[t] = NULL;
        }

        taskq_destroy(spa->spa_vdev_retry_taskq);
        spa->spa_vdev_retry_taskq = NULL;

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
        nvlist_t **child;
        uint_t c, children;
        vdev_t *vd;

        if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
                return (NULL);

        if (vd->vdev_ops->vdev_op_leaf)
                return (vd);

        if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children) != 0) {
                vdev_free(vd);
                return (NULL);
        }

        for (c = 0; c < children; c++) {
                if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
                        vdev_free(vd);
                        return (NULL);
                }
        }

        return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
        /*
         * Stop syncing.
         */
        if (spa->spa_sync_on) {
                txg_sync_stop(spa->spa_dsl_pool);
                spa->spa_sync_on = B_FALSE;
        }

        /*
         * Wait for any outstanding prefetch I/O to complete.
         */
        spa_config_enter(spa, RW_WRITER);
        spa_config_exit(spa);

        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
        }

        /*
         * Close all vdevs.
         */
        if (spa->spa_root_vdev) {
                vdev_free(spa->spa_root_vdev);
                spa->spa_root_vdev = NULL;
        }
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information. The 'readonly' flag will prevent us
 * from writing any updated state to disk, and can be used when testing a pool
 * for import.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
{
        int error = 0;
        nvlist_t *nvroot = NULL;
        vdev_t *rvd;
        uberblock_t *ub = &spa->spa_uberblock;
        uint64_t pool_guid;
        zio_t *zio;

        if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
            nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
                return (EINVAL);

        (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
            &spa->spa_config_txg);

        if (import && spa_guid_exists(pool_guid, 0))
                return (EEXIST);

        /*
         * Parse the configuration into a vdev tree.
         */
        spa_config_enter(spa, RW_WRITER);
        rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
        spa_config_exit(spa);

        if (rvd == NULL)
                return (EINVAL);

        spa->spa_root_vdev = rvd;
        ASSERT(spa_guid(spa) == pool_guid);

        /*
         * Try to open all vdevs, loading each label in the process.
         */
        if (vdev_open(rvd) != 0)
                return (ENXIO);

        /*
         * Find the best uberblock.
         */
        bzero(ub, sizeof (uberblock_t));

        zio = zio_root(spa, NULL, NULL,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
        vdev_uberblock_load(zio, rvd, ub);
        error = zio_wait(zio);

        /*
         * If we weren't able to find a single valid uberblock, return failure.
         */
        if (ub->ub_txg == 0) {
                dprintf("ub_txg is zero\n");
                return (ENXIO);
        }

        /*
         * If the vdev guid sum doesn't match the uberblock, we have an
         * incomplete configuration.
         */
        if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
                rvd->vdev_state = VDEV_STATE_CANT_OPEN;
                rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
                dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
                    rvd->vdev_guid_sum, ub->ub_guid_sum);
                return (ENXIO);
        }

        /*
         * Initialize internal SPA structures.
         */
        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_ubsync = spa->spa_uberblock;
        spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
        spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
        spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

        VERIFY(zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
            sizeof (uint64_t), 1, &spa->spa_config_object) == 0);

        if (!mosconfig) {
                dmu_buf_t *db;
                char *packed = NULL;
                size_t nvsize = 0;
                nvlist_t *newconfig = NULL;

                db = dmu_bonus_hold(spa->spa_meta_objset,
                    spa->spa_config_object);
                dmu_buf_read(db);
                nvsize = *(uint64_t *)db->db_data;
                dmu_buf_rele(db);

                packed = kmem_alloc(nvsize, KM_SLEEP);
                error = dmu_read_canfail(spa->spa_meta_objset,
                    spa->spa_config_object, 0, nvsize, packed);
                if (error == 0)
                        error = nvlist_unpack(packed, nvsize, &newconfig, 0);
                kmem_free(packed, nvsize);

                if (error)
                        return (ENXIO);

                spa_config_set(spa, newconfig);

                spa_unload(spa);
                spa_deactivate(spa);
                spa_activate(spa);

                return (spa_load(spa, newconfig, readonly, import, B_TRUE));
        }

        VERIFY(zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
            sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);

        /*
         * Load the vdev state for all top level vdevs.
         */
        if ((error = vdev_load(rvd, import)) != 0)
                return (error);

        /*
         * Propagate the leaf DTLs we just loaded all the way up the tree.
         */
        spa_config_enter(spa, RW_WRITER);
        vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
        spa_config_exit(spa);

        /*
         * Check the state of the root vdev. If it can't be opened, it
         * indicates one or more toplevel vdevs are faulted.
         */
        if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
                return (ENXIO);

        /*
         * Claim log blocks that haven't been committed yet, and update all
         * top-level vdevs to sync any config changes found in vdev_load().
         * This must all happen in a single txg.
         */
        if ((spa_mode & FWRITE) && !readonly) {
                dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
                    spa_first_txg(spa));
                dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
                vdev_config_dirty(rvd);
                dmu_tx_commit(tx);

                spa->spa_sync_on = B_TRUE;
                txg_sync_start(spa->spa_dsl_pool);

                /*
                 * Wait for all claims to sync.
                 */
                txg_wait_synced(spa->spa_dsl_pool, 0);
        }

        return (0);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
        spa_t *spa;
        int error;
        int loaded = B_FALSE;
        int locked = B_FALSE;

        *spapp = NULL;

        /*
         * As disgusting as this is, we need to support recursive calls to this
         * function because dsl_dir_open() is called during spa_load(), and ends
         * up calling spa_open() again. The real fix is to figure out how to
         * avoid dsl_dir_open() calling this in the first place.
         */
        if (mutex_owner(&spa_namespace_lock) != curthread) {
                mutex_enter(&spa_namespace_lock);
                locked = B_TRUE;
        }

        if ((spa = spa_lookup(pool)) == NULL) {
                if (locked)
                        mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

                spa_activate(spa);

                error = spa_load(spa, spa->spa_config,
                    B_FALSE, B_FALSE, B_FALSE);

                if (error == EBADF) {
                        /*
                         * If vdev_load() returns EBADF, it indicates that one
                         * of the vdevs indicates that the pool has been
                         * exported or destroyed. If this is the case, the
                         * config cache is out of sync and we should remove the
                         * pool from the namespace.
                         */
                        spa_unload(spa);
                        spa_deactivate(spa);
                        spa_remove(spa);
                        spa_config_sync();
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        return (ENOENT);
                } if (error) {
                        /*
                         * We can't open the pool, but we still have useful
                         * information: the state of each vdev after the
                         * attempted vdev_open(). Return this to the user.
                         */
                        if (config != NULL && spa->spa_root_vdev != NULL)
                                *config = spa_config_generate(spa, NULL, -1ULL,
                                    B_TRUE);
                        spa_unload(spa);
                        spa_deactivate(spa);
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        *spapp = NULL;
                        return (error);
                }

                loaded = B_TRUE;
        }

        spa_open_ref(spa, tag);
        if (locked)
                mutex_exit(&spa_namespace_lock);

        *spapp = spa;

        if (config != NULL) {
                spa_config_enter(spa, RW_READER);
                *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
                spa_config_exit(spa);
        }

        /*
         * If we just loaded the pool, resilver anything that's out of date.
         */
        if (loaded && (spa_mode & FWRITE))
                VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
        return (spa_open_common(name, spapp, tag, NULL));
}

int
spa_get_stats(const char *name, nvlist_t **config)
{
        int error;
        spa_t *spa;

        *config = NULL;
        error = spa_open_common(name, &spa, FTAG, config);

        if (spa != NULL)
                spa_close(spa, FTAG);

        return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
        spa_t *spa;
        dsl_pool_t *dp;
        dmu_tx_t *tx;
        int error;
        uint64_t txg = TXG_INITIAL;

        /*
         * If this pool already exists, return failure.
         */
        mutex_enter(&spa_namespace_lock);
        if (spa_lookup(pool) != NULL) {
                mutex_exit(&spa_namespace_lock);
                return (EEXIST);
        }
        spa = spa_add(pool);

        /*
         * Allocate a new spa_t structure.
         */
        spa_activate(spa);

        spa->spa_uberblock.ub_txg = txg - 1;
        spa->spa_ubsync = spa->spa_uberblock;

        error = spa_vdev_add(spa, nvroot);

        if (error) {
                spa_unload(spa);
                spa_deactivate(spa);
                spa_remove(spa);
                mutex_exit(&spa_namespace_lock);
                return (error);
        }

        if (altroot != NULL) {
                spa->spa_root = spa_strdup(altroot);
                atomic_add_32(&spa_active_count, 1);
        }

        spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
        spa->spa_meta_objset = dp->dp_meta_objset;

        tx = dmu_tx_create_assigned(dp, txg);

        /*
         * Create the pool config object.
         */
        spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
            DMU_OT_PACKED_NVLIST, 1 << 14,
            DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

        VERIFY(zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
            sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);

        /*
         * Create the deferred-free bplist object. Turn off compression
         * because sync-to-convergence takes longer if the blocksize
         * keeps changing.
         */
        spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
            1 << 14, tx);
        dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
            ZIO_COMPRESS_OFF, tx);

        VERIFY(zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
            sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);

        dmu_tx_commit(tx);

        spa->spa_sync_on = B_TRUE;
        txg_sync_start(spa->spa_dsl_pool);

        /*
         * We explicitly wait for the first transaction to complete so that our
         * bean counters are appropriately updated.
         */
        txg_wait_synced(spa->spa_dsl_pool, txg);

        spa_config_sync();

        mutex_exit(&spa_namespace_lock);

        return (0);
}

/*
 * Import the given pool into the system. We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
        spa_t *spa;
        int error;

        if (!(spa_mode & FWRITE))
                return (EROFS);

        /*
         * If a pool with this name exists, return failure.
         */
        mutex_enter(&spa_namespace_lock);
        if (spa_lookup(pool) != NULL) {
                mutex_exit(&spa_namespace_lock);
                return (EEXIST);
        }

        /*
         * Create and initialize the spa structure.
         */
        spa = spa_add(pool);
        spa_activate(spa);

        /*
         * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
         * so that we don't try to open the pool if the config is damaged.
         */
        error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);

        if (error) {
                spa_unload(spa);
                spa_deactivate(spa);
                spa_remove(spa);
                mutex_exit(&spa_namespace_lock);
                return (error);
        }

        /*
         * Set the alternate root, if there is one.
         */
        if (altroot != NULL) {
                atomic_add_32(&spa_active_count, 1);
                spa->spa_root = spa_strdup(altroot);
        }

        /*
         * Initialize the config based on the in-core state.
         */
        config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

        spa_config_set(spa, config);

        /*
         * Sync the configuration cache.
         */
        spa_config_sync();

        mutex_exit(&spa_namespace_lock);

        /*
         * Resilver anything that's out of date.
         */
        if (spa_mode & FWRITE)
                VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
        nvlist_t *config = NULL;
        char *poolname;
        spa_t *spa;
        uint64_t state;

        if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
                return (NULL);

        if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
                return (NULL);

        mutex_enter(&spa_namespace_lock);
        spa = spa_add(TRYIMPORT_NAME);

        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        /*
         * Initialize the spa_t structure.
         */
        spa_activate(spa);

        /*
         * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
         * so we don't try to open the pool if the config is damaged.
         */
        (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);

        /*
         * If 'tryconfig' was at least parsable, return the current config.
         */
        if (spa->spa_root_vdev != NULL) {
                config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
                VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
                    poolname) == 0);
                VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
                    state) == 0);
        }

        spa_unload(spa);
        spa_deactivate(spa);
        spa_remove(spa);
        mutex_exit(&spa_namespace_lock);

        return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
        spa_t *spa;

        if (!(spa_mode & FWRITE))
                return (EROFS);

        mutex_enter(&spa_namespace_lock);
        if ((spa = spa_lookup(pool)) == NULL) {
                mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }

        /*
         * The pool will be in core if it's openable,
         * in which case we can modify its state.
         */
        if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
                /*
                 * Objsets may be open only because they're dirty, so we
                 * have to force it to sync before checking spa_refcnt.
                 */
                spa_scrub_suspend(spa);
                txg_wait_synced(spa->spa_dsl_pool, 0);

                if (!spa_refcount_zero(spa)) {
                        spa_scrub_resume(spa);
                        mutex_exit(&spa_namespace_lock);
                        return (EBUSY);
                }

                /*
                 * Update the pool state.
                 */
                spa->spa_state = new_state;

                spa_scrub_resume(spa);
                VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

                if (spa->spa_root != NULL)
                        atomic_add_32(&spa_active_count, -1);

                /*
                 * We want this to be reflected on every label,
                 * so mark them all dirty. spa_unload() will do the
                 * final sync that pushes these changes out.
                 */
                vdev_config_dirty(spa->spa_root_vdev);
        }

        if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
                spa_unload(spa);
                spa_deactivate(spa);
        }

        spa_remove(spa);
        spa_config_sync();
        mutex_exit(&spa_namespace_lock);

        return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
        return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
        return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
        uint64_t txg;
        int c, error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd;

        txg = spa_vdev_enter(spa);

        vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

        if (vd == NULL)
                return (spa_vdev_exit(spa, vd, txg, EINVAL));

        if (rvd == NULL)		/* spa_create() */
                spa->spa_root_vdev = rvd = vd;

        if ((error = vdev_create(vd, txg)) != 0)
                return (spa_vdev_exit(spa, vd, txg, error));

        /*
         * Transfer each top-level vdev from the temporary root
         * to the spa's root and initialize its metaslabs.
         */
        for (c = 0; c < vd->vdev_children; c++) {
                vdev_t *tvd = vd->vdev_child[c];
                if (vd != rvd) {
                        vdev_remove_child(vd, tvd);
                        tvd->vdev_id = rvd->vdev_children;
                        vdev_add_child(rvd, tvd);
                }
                vdev_init(tvd, txg);
                vdev_config_dirty(tvd);
        }

        /*
         * Update the config based on the new in-core state.
         */
        spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

        return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Attach a device to a mirror. The arguments are the path to any device
 * in the mirror, and the nvroot for the new device. If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
{
        uint64_t txg, open_txg;
        int error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

        txg = spa_vdev_enter(spa);

        oldvd = vdev_lookup_by_path(rvd, path);

        if (oldvd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));

        pvd = oldvd->vdev_parent;

        /*
         * The parent must be a mirror or the root, unless we're replacing;
         * in that case, the parent can be anything but another replacing vdev.
         */
        if (pvd->vdev_ops != &vdev_mirror_ops &&
            pvd->vdev_ops != &vdev_root_ops &&
            (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

        newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

        if (newrootvd == NULL || newrootvd->vdev_children != 1)
                return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

        newvd = newrootvd->vdev_child[0];

        if (!newvd->vdev_ops->vdev_op_leaf)
                return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

        if ((error = vdev_create(newrootvd, txg)) != 0)
                return (spa_vdev_exit(spa, newrootvd, txg, error));

        if (newvd->vdev_psize < oldvd->vdev_psize)
                return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

        if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
                return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

        /*
         * If this is an in-place replacement, update oldvd's path and devid
         * to make it distinguishable from newvd, and unopenable from now on.
         */
        if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
                spa_strfree(oldvd->vdev_path);
                oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
                    KM_SLEEP);
                (void) sprintf(oldvd->vdev_path, "%s/%s",
                    newvd->vdev_path, "old");
                if (oldvd->vdev_devid != NULL) {
                        spa_strfree(oldvd->vdev_devid);
                        oldvd->vdev_devid = NULL;
                }
        }

        /*
         * If the parent is not a mirror, or if we're replacing,
         * insert the new mirror/replacing vdev above oldvd.
         */
        if (pvd->vdev_ops != pvops)
                pvd = vdev_add_parent(oldvd, pvops);

        ASSERT(pvd->vdev_top->vdev_parent == rvd);
        ASSERT(pvd->vdev_ops == pvops);
        ASSERT(oldvd->vdev_parent == pvd);

        /*
         * Extract the new device from its root and add it to pvd.
         */
        vdev_remove_child(newrootvd, newvd);
        newvd->vdev_id = pvd->vdev_children;
        vdev_add_child(pvd, newvd);

        tvd = newvd->vdev_top;
        ASSERT(pvd->vdev_top == tvd);
        ASSERT(tvd->vdev_parent == rvd);

        /*
         * Update the config based on the new in-core state.
         */
        spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

        vdev_config_dirty(tvd);

        /*
         * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
         * upward when spa_vdev_exit() calls vdev_dtl_reassess().
         */
        open_txg = txg + TXG_CONCURRENT_STATES - 1;

        mutex_enter(&newvd->vdev_dtl_lock);
        space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
            open_txg - TXG_INITIAL + 1);
        mutex_exit(&newvd->vdev_dtl_lock);

        /*
         * Mark newvd's DTL dirty in this txg.
         */
        vdev_dirty(tvd, VDD_DTL, txg);
        (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

        dprintf("attached %s, replacing=%d\n", path, replacing);

        (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

        /*
         * Kick off a resilver to update newvd.
         */
        VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
{
        uint64_t txg;
        int c, t, error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *pvd, *cvd, *tvd;

        txg = spa_vdev_enter(spa);

        vd = vdev_lookup_by_path(rvd, path);

        if (vd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));

        if (guid != 0 && vd->vdev_guid != guid)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));

        pvd = vd->vdev_parent;

        /*
         * If replace_done is specified, only remove this device if it's
         * the first child of a replacing vdev.
         */
        if (replace_done &&
            (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

        /*
         * Only mirror and replacing vdevs support detach.
         */
        if (pvd->vdev_ops != &vdev_replacing_ops &&
            pvd->vdev_ops != &vdev_mirror_ops)
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

        /*
         * If there's only one replica, you can't detach it.
         */
        if (pvd->vdev_children <= 1)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));

        /*
         * If all siblings have non-empty DTLs, this device may have the only
         * valid copy of the data, which means we cannot safely detach it.
         *
         * XXX -- as in the vdev_offline() case, we really want a more
         * precise DTL check.
         */
        for (c = 0; c < pvd->vdev_children; c++) {
                uint64_t dirty;

                cvd = pvd->vdev_child[c];
                if (cvd == vd)
                        continue;
                if (vdev_is_dead(cvd))
                        continue;
                mutex_enter(&cvd->vdev_dtl_lock);
                dirty = cvd->vdev_dtl_map.sm_space |
                    cvd->vdev_dtl_scrub.sm_space;
                mutex_exit(&cvd->vdev_dtl_lock);
                if (!dirty)
                        break;
        }
        if (c == pvd->vdev_children)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));

        /*
         * Erase the disk labels so the disk can be used for other things.
         * This must be done after all other error cases are handled,
         * but before we disembowel vd (so we can still do I/O to it).
         * But if we can't do it, don't treat the error as fatal --
         * it may be that the unwritability of the disk is the reason
         * it's being detached!
         */
        error = vdev_label_init(vd, 0);
        if (error)
                dprintf("unable to erase labels on %s\n", vdev_description(vd));

        /*
         * Remove vd from its parent and compact the parent's children.
         */
        vdev_remove_child(pvd, vd);
        vdev_compact_children(pvd);

        /*
         * Remember one of the remaining children so we can get tvd below.
         */
        cvd = pvd->vdev_child[0];

        /*
         * If the parent mirror/replacing vdev only has one child,
         * the parent is no longer needed. Remove it from the tree.
         */
        if (pvd->vdev_children == 1)
                vdev_remove_parent(cvd);

        /*
         * We don't set tvd until now because the parent we just removed
         * may have been the previous top-level vdev.
         */
        tvd = cvd->vdev_top;
        ASSERT(tvd->vdev_parent == rvd);

        /*
         * Reopen this top-level vdev to reassess health after detach.
         */
        vdev_reopen(tvd, NULL);

        /*
         * If the device we just detached was smaller than the others,
         * it may be possible to add metaslabs (i.e. grow the pool).
         */
        vdev_metaslab_init(tvd, txg);

        /*
         * Update the config based on the new in-core state.
         */
        spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

        vdev_config_dirty(tvd);

        /*
         * Mark vd's DTL as dirty in this txg.
         * vdev_dtl_sync() will see that vd->vdev_detached is set
         * and free vd's DTL object in syncing context.
         * But first make sure we're not on any *other* txg's DTL list,
         * to prevent vd from being accessed after it's freed.
         */
        vdev_dirty(tvd, VDD_DTL, txg);
        vd->vdev_detached = B_TRUE;
        for (t = 0; t < TXG_SIZE; t++)
                (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
        (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

        dprintf("detached %s\n", path);

        return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * If there are any replacing vdevs that have finished replacing, detach them.
 * We can't hold the config lock across detaches, so we lock the config,
 * build a list of candidates, unlock the config, and try each candidate.
 */
typedef struct vdev_detach_link {
        char		*vdl_path;
        uint64_t	vdl_guid;
        list_node_t	vdl_node;
} vdev_detach_link_t;

static void
spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
{
        int c;

        for (c = 0; c < vd->vdev_children; c++)
                spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);

        if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
                vdev_t *cvd0 = vd->vdev_child[0];
                vdev_t *cvd1 = vd->vdev_child[1];
                vdev_detach_link_t *vdl;
                int dirty1;

                mutex_enter(&cvd1->vdev_dtl_lock);
                dirty1 = cvd1->vdev_dtl_map.sm_space |
                    cvd1->vdev_dtl_scrub.sm_space;
                mutex_exit(&cvd1->vdev_dtl_lock);

                if (!dirty1) {
                        vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
                        vdl->vdl_path = spa_strdup(cvd0->vdev_path);
                        vdl->vdl_guid = cvd0->vdev_guid;
                        list_insert_tail(l, vdl);
                }
        }
}

void
spa_vdev_replace_done(spa_t *spa)
{
        vdev_detach_link_t *vdl;
        list_t vdlist;

        list_create(&vdlist, sizeof (vdev_detach_link_t),
            offsetof(vdev_detach_link_t, vdl_node));

        spa_config_enter(spa, RW_READER);
        spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
        spa_config_exit(spa);

        while ((vdl = list_head(&vdlist)) != NULL) {
                list_remove(&vdlist, vdl);
                (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
                    B_TRUE);
                spa_strfree(vdl->vdl_path);
                kmem_free(vdl, sizeof (*vdl));
        }

        list_destroy(&vdlist);
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);

static void
spa_scrub_io_done(zio_t *zio)
{
        spa_t *spa = zio->io_spa;

        zio_buf_free(zio->io_data, zio->io_size);

        mutex_enter(&spa->spa_scrub_lock);
        if (zio->io_error)
                spa->spa_scrub_errors++;
        if (--spa->spa_scrub_inflight == 0)
                cv_broadcast(&spa->spa_scrub_io_cv);
        mutex_exit(&spa->spa_scrub_lock);

        if (zio->io_error) {
                vdev_t *vd = zio->io_vd;
                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_scrub_errors++;
                mutex_exit(&vd->vdev_stat_lock);
        }
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
{
        size_t size = BP_GET_LSIZE(bp);
        void *data = zio_buf_alloc(size);

        mutex_enter(&spa->spa_scrub_lock);
        spa->spa_scrub_inflight++;
        mutex_exit(&spa->spa_scrub_lock);

        zio_nowait(zio_read(NULL, spa, bp, data, size,
            spa_scrub_io_done, NULL, priority, flags));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
        blkptr_t *bp = &bc->bc_blkptr;
        vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

        if (bc->bc_errno || vd == NULL) {
                /*
                 * We can't scrub this block, but we can continue to scrub
                 * the rest of the pool. Note the error and move along.
                 */
                mutex_enter(&spa->spa_scrub_lock);
                spa->spa_scrub_errors++;
                mutex_exit(&spa->spa_scrub_lock);

                if (vd != NULL) {
                        mutex_enter(&vd->vdev_stat_lock);
                        vd->vdev_stat.vs_scrub_errors++;
                        mutex_exit(&vd->vdev_stat_lock);
                }

                return (ERESTART);
        }

        ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

        /*
         * Keep track of how much data we've examined so that
         * zpool(1M) status can make useful progress reports.
         */
        mutex_enter(&vd->vdev_stat_lock);
        vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
        mutex_exit(&vd->vdev_stat_lock);

        if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
                if (DVA_GET_GANG(&bp->blk_dva[0])) {
                        /*
                         * Gang members may be spread across multiple vdevs,
                         * so the best we can do is look at the pool-wide DTL.
                         * XXX -- it would be better to change our allocation
                         * policy to ensure that this can't happen.
                         */
                        vd = spa->spa_root_vdev;
                }
                if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
                        spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
                            ZIO_FLAG_RESILVER);
                }
        } else {
                spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
                    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
        }

        return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
        callb_cpr_t cprinfo;
        traverse_handle_t *th = spa->spa_scrub_th;
        vdev_t *rvd = spa->spa_root_vdev;
        pool_scrub_type_t scrub_type = spa->spa_scrub_type;
        int error = 0;
        boolean_t complete;

        CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

        /*
         * If we're restarting due to a snapshot create/delete,
         * wait for that to complete.
         */
        txg_wait_synced(spa_get_dsl(spa), 0);

        spa_config_enter(spa, RW_WRITER);
        vdev_reopen(rvd, NULL);		/* purge all vdev caches */
        vdev_config_dirty(rvd);		/* rewrite all disk labels */
        vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
        spa_config_exit(spa);

        mutex_enter(&spa->spa_scrub_lock);
        spa->spa_scrub_errors = 0;
        spa->spa_scrub_active = 1;

        while (!spa->spa_scrub_stop) {
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                while (spa->spa_scrub_suspend) {
                        spa->spa_scrub_active = 0;
                        cv_broadcast(&spa->spa_scrub_cv);
                        cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
                        spa->spa_scrub_active = 1;
                }
                CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

                if (spa->spa_scrub_restart_txg != 0)
                        break;

                mutex_exit(&spa->spa_scrub_lock);
                error = traverse_more(th);
                mutex_enter(&spa->spa_scrub_lock);
                if (error != EAGAIN)
                        break;
        }

        while (spa->spa_scrub_inflight)
                cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

        if (spa->spa_scrub_restart_txg != 0)
                error = ERESTART;

        spa->spa_scrub_active = 0;
        cv_broadcast(&spa->spa_scrub_cv);

        /*
         * If the traverse completed, and there were no errors,
         * then the scrub was completely successful.
         */
        complete = (error == 0 && spa->spa_scrub_errors == 0);

        dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
            spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
            error, spa->spa_scrub_errors, spa->spa_scrub_stop);

        mutex_exit(&spa->spa_scrub_lock);

        /*
         * If the scrub/resilver completed, update all DTLs to reflect this.
         * Whether it succeeded or not, vacate all temporary scrub DTLs.
         */
        spa_config_enter(spa, RW_WRITER);
        vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
            complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
        spa_config_exit(spa);

        spa_vdev_replace_done(spa);

        spa_config_enter(spa, RW_READER);
        vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
        spa_config_exit(spa);

        mutex_enter(&spa->spa_scrub_lock);

        spa->spa_scrub_type = POOL_SCRUB_NONE;
        spa->spa_scrub_active = 0;
        spa->spa_scrub_thread = NULL;

        cv_broadcast(&spa->spa_scrub_cv);

        /*
         * If we were told to restart, our final act is to start a new scrub.
         */
        if (error == ERESTART)
                VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);

        CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
        thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
        mutex_enter(&spa->spa_scrub_lock);
        spa->spa_scrub_suspend++;
        while (spa->spa_scrub_active) {
                cv_broadcast(&spa->spa_scrub_cv);
                cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
        }
        while (spa->spa_scrub_inflight)
                cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
        mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
        mutex_enter(&spa->spa_scrub_lock);
        ASSERT(spa->spa_scrub_suspend != 0);
        if (--spa->spa_scrub_suspend == 0)
                cv_broadcast(&spa->spa_scrub_cv);
        mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
        /*
         * Something happened (e.g. snapshot create/delete) that means
         * we must restart any in-progress scrubs. The itinerary will
         * fix this properly.
         */
        mutex_enter(&spa->spa_scrub_lock);
        spa->spa_scrub_restart_txg = txg;
        mutex_exit(&spa->spa_scrub_lock);
}

static int
spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
        space_seg_t *ss;
        uint64_t mintxg, maxtxg;
        vdev_t *rvd = spa->spa_root_vdev;
        int advance = 0;

        if ((uint_t)type >= POOL_SCRUB_TYPES)
                return (ENOTSUP);

        /*
         * If there's a scrub or resilver already in progress, stop it.
         */
        while (spa->spa_scrub_thread != NULL) {
                /*
                 * Don't stop a resilver unless forced.
                 */
                if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
                        return (EBUSY);

                spa->spa_scrub_stop = 1;
                cv_broadcast(&spa->spa_scrub_cv);
                cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
        }

        /*
         * Terminate the previous traverse.
         */
        if (spa->spa_scrub_th != NULL) {
                traverse_fini(spa->spa_scrub_th);
                spa->spa_scrub_th = NULL;
        }

        spa->spa_scrub_stop = 0;
        spa->spa_scrub_type = type;
        spa->spa_scrub_restart_txg = 0;

        mintxg = TXG_INITIAL - 1;
        maxtxg = spa_last_synced_txg(spa) + 1;

        switch (type) {

        case POOL_SCRUB_NONE:
                break;

        case POOL_SCRUB_RESILVER:
                /*
                 * Determine the resilvering boundaries.
                 *
                 * Note: (mintxg, maxtxg) is an open interval,
                 * i.e. mintxg and maxtxg themselves are not included.
                 *
                 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
                 * so we don't claim to resilver a txg that's still changing.
                 */
                mutex_enter(&rvd->vdev_dtl_lock);
                ss = avl_first(&rvd->vdev_dtl_map.sm_root);
                mintxg = ss ? ss->ss_start - 1 : 0;
                ss = avl_last(&rvd->vdev_dtl_map.sm_root);
                maxtxg = ss ? ss->ss_end : 0;
                maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
                mutex_exit(&rvd->vdev_dtl_lock);

                advance = ADVANCE_PRE | ADVANCE_PRUNE;
                break;

        case POOL_SCRUB_EVERYTHING:
                /*
                 * A scrub is like a resilver, but not pruned by DTL.
                 */
                advance = ADVANCE_PRE;
                break;
        }

        if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
                spa->spa_scrub_maxtxg = maxtxg;
                spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
                    advance, ZIO_FLAG_CANFAIL);
                traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
                spa->spa_scrub_thread = thread_create(NULL, 0,
                    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
        }

        return (0);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
        int error;
        traverse_handle_t *th;

        mutex_enter(&spa->spa_scrub_lock);
        error = spa_scrub_locked(spa, type, force);
        th = spa->spa_scrub_th;
        mutex_exit(&spa->spa_scrub_lock);

        if (th == NULL && type != POOL_SCRUB_NONE)
                spa_vdev_replace_done(spa);

        return (error);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
        bplist_t *bpl = &spa->spa_sync_bplist;
        dmu_tx_t *tx;
        blkptr_t blk;
        uint64_t itor = 0;
        zio_t *zio;
        int error;
        uint8_t c = 1;

        zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

        while (bplist_iterate(bpl, &itor, &blk) == 0)
                zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

        error = zio_wait(zio);
        ASSERT3U(error, ==, 0);

        tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
        bplist_vacate(bpl, tx);

        /*
         * Pre-dirty the first block so we sync to convergence faster.
         * (Usually only the first block is needed.)
         */
        dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
        dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
        nvlist_t *config;
        char *packed = NULL;
        size_t nvsize = 0;
        dmu_buf_t *db;

        if (list_is_empty(&spa->spa_dirty_list))
                return;

        config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

        spa_config_set(spa, config);

        VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

        packed = kmem_alloc(nvsize, KM_SLEEP);

        VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);

        dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
            packed, tx);

        kmem_free(packed, nvsize);

        db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
        dmu_buf_will_dirty(db, tx);
        *(uint64_t *)db->db_data = nvsize;
        dmu_buf_rele(db);
}

/*
 * Sync the specified transaction group. New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
        dsl_pool_t *dp = spa->spa_dsl_pool;
        objset_t *mos = spa->spa_meta_objset;
        bplist_t *bpl = &spa->spa_sync_bplist;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd;
        dmu_tx_t *tx;
        int dirty_vdevs;

        /*
         * Lock out configuration changes.
         */
        spa_config_enter(spa, RW_READER);

        spa->spa_syncing_txg = txg;
        spa->spa_sync_pass = 0;

        bplist_open(bpl, mos, spa->spa_sync_bplist_obj);

        /*
         * If anything has changed in this txg, push the deferred frees
         * from the previous txg. If not, leave them alone so that we
         * don't generate work on an otherwise idle system.
         */
        if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
            !txg_list_empty(&dp->dp_dirty_dirs, txg))
                spa_sync_deferred_frees(spa, txg);

        /*
         * Iterate to convergence.
         */
        do {
                spa->spa_sync_pass++;

                tx = dmu_tx_create_assigned(dp, txg);
                spa_sync_config_object(spa, tx);
                dmu_tx_commit(tx);

                dsl_pool_sync(dp, txg);

                dirty_vdevs = 0;
                while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
                        vdev_sync(vd, txg);
                        dirty_vdevs++;
                }

                tx = dmu_tx_create_assigned(dp, txg);
                bplist_sync(bpl, tx);
                dmu_tx_commit(tx);

        } while (dirty_vdevs);

        bplist_close(bpl);

        dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

        /*
         * Rewrite the vdev configuration (which includes the uberblock)
         * to commit the transaction group.
         */
        while (spa_sync_labels(spa, txg)) {
                dprintf("waiting for devices to heal\n");
                delay(hz);
                vdev_reopen(rvd, NULL);
        }

        /*
         * Make a stable copy of the fully synced uberblock.
         * We use this as the root for pool traversals.
         */
        spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

        spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

        rw_enter(&spa->spa_traverse_lock, RW_WRITER);
        spa->spa_traverse_wanted = 0;
        spa->spa_ubsync = spa->spa_uberblock;
        rw_exit(&spa->spa_traverse_lock);

        spa_scrub_resume(spa);		/* resume scrub with new ubsync */

        /*
         * Clean up the ZIL records for the synced txg.
         */
        dsl_pool_zil_clean(dp);

        /*
         * Update usable space statistics.
         */
        while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
                vdev_sync_done(vd, txg);

        /*
         * It had better be the case that we didn't dirty anything
         * since spa_sync_labels().
         */
        ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
        ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
        ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
        ASSERT(bpl->bpl_queue == NULL);

        spa_config_exit(spa);
}

/*
 * Sync all pools. We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
        spa_t *spa = NULL;
        mutex_enter(&spa_namespace_lock);
        while ((spa = spa_next(spa)) != NULL) {
                if (spa_state(spa) != POOL_STATE_ACTIVE)
                        continue;
                spa_open_ref(spa, FTAG);
                mutex_exit(&spa_namespace_lock);
                txg_wait_synced(spa_get_dsl(spa), 0);
                mutex_enter(&spa_namespace_lock);
                spa_close(spa, FTAG);
        }
        mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

int
spa_busy(void)
{
        return (spa_active_count != 0);
}

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
        spa_t *spa;

        /*
         * Remove all cached state. All pools should be closed now,
         * so every spa in the AVL tree should be unreferenced.
         */
        mutex_enter(&spa_namespace_lock);
        while ((spa = spa_next(NULL)) != NULL) {
                /*
                 * Stop all scrub and resilver activity. spa_scrub() needs to
                 * wait for the scrub thread, which may do a detach and sync the
                 * configs, which needs spa_namespace_lock. Drop the lock while
                 * maintaining a hold on the spa_t.
                 */
                spa_open_ref(spa, FTAG);
                mutex_exit(&spa_namespace_lock);
                VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
                mutex_enter(&spa_namespace_lock);
                spa_close(spa, FTAG);

                if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
                        spa_unload(spa);
                        spa_deactivate(spa);
                }
                spa_remove(spa);
        }
        mutex_exit(&spa_namespace_lock);
}