/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
	    4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	taskq_destroy(spa->spa_vdev_retry_taskq);
	spa->spa_vdev_retry_taskq = NULL;

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER);
	spa_config_exit(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev) {
		vdev_free(spa->spa_root_vdev);
		spa->spa_root_vdev = NULL;
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.  The 'readonly' flag will prevent us
 * from writing any updated state to disk, and can be used when testing a pool
 * for import.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (import && spa_guid_exists(pool_guid, 0))
		return (EEXIST);

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa);

	if (rvd == NULL)
		return (EINVAL);

	spa->spa_root_vdev = rvd;
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0)
		return (ENXIO);

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		dprintf("ub_txg is zero\n");
		return (ENXIO);
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
		    rvd->vdev_guid_sum, ub->ub_guid_sum);
		return (ENXIO);
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		db = dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object);
		dmu_buf_read(db);
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read_canfail(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error)
			return (ENXIO);

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
	}

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);

	/*
	 * Load the vdev state for all top level vdevs.
	 */
	if ((error = vdev_load(rvd, import)) != 0)
		return (error);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
		return (ENXIO);

	/*
	 * Claim log blocks that haven't been committed yet, and update all
	 * top-level vdevs to sync any config changes found in vdev_load().
	 * This must all happen in a single txg.
	 */
	if ((spa_mode & FWRITE) && !readonly) {
		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		vdev_config_dirty(rvd);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}

	return (0);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config,
		    B_FALSE, B_FALSE, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		} else if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

int
spa_get_stats(const char *name, nvlist_t **config)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}
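
/*
 * Example (illustrative, with a made-up pool name): a typical consumer opens
 * a pool by name with a reference tag and drops the reference with the same
 * tag when done:
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... use spa, e.g. spa_guid(spa) ...
 *		spa_close(spa, FTAG);
 *	}
 */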

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}
	spa = spa_add(pool);

	/*
	 * Allocate a new spa_t structure.
	 */
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	error = spa_vdev_add(spa, nvroot);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (altroot != NULL) {
		spa->spa_root = spa_strdup(altroot);
		atomic_add_32(&spa_active_count, 1);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}
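
/*
 * For reference (an illustrative sketch, keys abbreviated): the 'nvroot'
 * nvlist handed to spa_create() and spa_vdev_add() is a vdev tree assembled
 * in userland.  A simple two-disk pool looks roughly like:
 *
 *	ZPOOL_CONFIG_TYPE	"root"
 *	ZPOOL_CONFIG_CHILDREN
 *		ZPOOL_CONFIG_TYPE "disk", ZPOOL_CONFIG_PATH "/dev/dsk/c0t0d0s0"
 *		ZPOOL_CONFIG_TYPE "disk", ZPOOL_CONFIG_PATH "/dev/dsk/c0t1d0s0"
 *
 * spa_config_parse() walks the ZPOOL_CONFIG_CHILDREN array recursively to
 * build the in-core vdev tree.
 */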

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so that we don't try to open the pool if the config is damaged.
	 */
	error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot != NULL) {
		atomic_add_32(&spa_active_count, 1);
		spa->spa_root = spa_strdup(altroot);
	}

	/*
	 * Initialize the config based on the in-core state.
	 */
	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

	spa_config_set(spa, config);

	/*
	 * Sync the configuration cache.
	 */
	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME);

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	/*
	 * Initialize the spa_t structure.
	 */
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so we don't try to open the pool if the config is damaged.
	 */
	(void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		if (!spa_refcount_zero(spa)) {
			spa_scrub_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * Update the pool state.
		 */
		spa->spa_state = new_state;

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		if (spa->spa_root != NULL)
			atomic_add_32(&spa_active_count, -1);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		vdev_config_dirty(spa->spa_root_vdev);
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	spa_remove(spa);
	spa_config_sync();
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (rvd == NULL)			/* spa_create() */
		spa->spa_root_vdev = rvd = vd;

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each top-level vdev from the temporary root
	 * to the spa's root and initialize its metaslabs.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *tvd = vd->vdev_child[c];
		if (vd != rvd) {
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = rvd->vdev_children;
			vdev_add_child(rvd, tvd);
		}
		vdev_init(tvd, txg);
		vdev_config_dirty(tvd);
	}

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
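 *
 * For example (illustrative device names): replacing c1t0d0 with c1t1d0 in a
 * two-way mirror temporarily yields the layout below; the 'replacing' vdev
 * collapses away once c1t1d0 has fully resilvered and c1t0d0 is detached.
 *
 *	mirror
 *	    c0t0d0
 *	    replacing
 *		c1t0d0		(device being replaced)
 *		c1t1d0		(new device)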
 */
int
spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_path(rvd, path);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

	dprintf("attached %s, replacing=%d\n", path, replacing);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_path(rvd, path);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (guid != 0 && vd->vdev_guid != guid)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd, NULL);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 */
	vdev_metaslab_init(tvd, txg);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	vd->vdev_detached = B_TRUE;
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

	dprintf("detached %s\n", path);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * If there are any replacing vdevs that have finished replacing, detach them.
 * We can't hold the config lock across detaches, so we lock the config,
 * build a list of candidates, unlock the config, and try each candidate.
 */
typedef struct vdev_detach_link {
	char		*vdl_path;
	uint64_t	vdl_guid;
	list_node_t	vdl_node;
} vdev_detach_link_t;

static void
spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		vdev_t *cvd0 = vd->vdev_child[0];
		vdev_t *cvd1 = vd->vdev_child[1];
		vdev_detach_link_t *vdl;
		int dirty1;

		mutex_enter(&cvd1->vdev_dtl_lock);
		dirty1 = cvd1->vdev_dtl_map.sm_space |
		    cvd1->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd1->vdev_dtl_lock);

		if (!dirty1) {
			vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
			vdl->vdl_path = spa_strdup(cvd0->vdev_path);
			vdl->vdl_guid = cvd0->vdev_guid;
			list_insert_tail(l, vdl);
		}
	}
}

void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_detach_link_t *vdl;
	list_t vdlist;

	list_create(&vdlist, sizeof (vdev_detach_link_t),
	    offsetof(vdev_detach_link_t, vdl_node));

	spa_config_enter(spa, RW_READER);
	spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
	spa_config_exit(spa);

	while ((vdl = list_head(&vdlist)) != NULL) {
		list_remove(&vdlist, vdl);
		(void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
		    B_TRUE);
		spa_strfree(vdl->vdl_path);
		kmem_free(vdl, sizeof (*vdl));
	}

	list_destroy(&vdlist);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error)
		spa->spa_scrub_errors++;
	if (--spa->spa_scrub_inflight == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);

	if (zio->io_error) {
		vdev_t *vd = zio->io_vd;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
			    ZIO_FLAG_RESILVER);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	spa_config_enter(spa, RW_WRITER);
	vdev_reopen(rvd, NULL);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspend) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If the traverse completed, and there were no errors,
	 * then the scrub was completely successful.
	 */
	complete = (error == 0 && spa->spa_scrub_errors == 0);

	dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	spa_config_exit(spa);

	spa_vdev_replace_done(spa);

	spa_config_enter(spa, RW_READER);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;

	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);

	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspend++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspend != 0);
	if (--spa->spa_scrub_suspend == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

static int
spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;
	int advance = 0;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
			return (EBUSY);

		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	switch (type) {

	case POOL_SCRUB_NONE:
		break;

	case POOL_SCRUB_RESILVER:
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		mutex_enter(&rvd->vdev_dtl_lock);
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss ? ss->ss_start - 1 : 0;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = ss ? ss->ss_end : 0;
		maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
		mutex_exit(&rvd->vdev_dtl_lock);

		advance = ADVANCE_PRE | ADVANCE_PRUNE;
		break;

	case POOL_SCRUB_EVERYTHING:
		/*
		 * A scrub is like a resilver, but not pruned by DTL.
		 */
		advance = ADVANCE_PRE;
		break;
	}

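	/*
	 * If this is a resilver and the root vdev's DTL turned out to be
	 * empty, mintxg and maxtxg are both still zero, so there is nothing
	 * to traverse and no scrub thread is created.
	 */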
	if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    advance, ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	return (0);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	int error;
	traverse_handle_t *th;

	mutex_enter(&spa->spa_scrub_lock);
	error = spa_scrub_locked(spa, type, force);
	th = spa->spa_scrub_th;
	mutex_exit(&spa->spa_scrub_lock);

	if (th == NULL && type != POOL_SCRUB_NONE)
		spa_vdev_replace_done(spa);

	return (error);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	spa_config_set(spa, config);

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db);
}
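
/*
 * Note on the layout used above: the packed XDR nvlist occupies the config
 * object's data blocks, while its length is stored in the object's bonus
 * buffer.  spa_load() reads the pair back in the opposite order (bonus
 * buffer first, then the packed nvlist) when !mosconfig.
 */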

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	bplist_open(bpl, mos, spa->spa_sync_bplist_obj);

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 */
	while (spa_sync_labels(spa, txg)) {
		dprintf("waiting for devices to heal\n");
		delay(hz);
		vdev_reopen(rvd, NULL);
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since spa_sync_labels().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

int
spa_busy(void)
{
	return (spa_active_count != 0);
}

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop all scrub and resilver activity.  spa_scrub() needs to
		 * wait for the scrub thread, which may do a detach and sync the
		 * configs, which needs spa_namespace_lock.  Drop the lock while
		 * maintaining a hold on the spa_t.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}