1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * This file contains all the routines used when modifying on-disk SPA state. 30 * This includes opening, importing, destroying, exporting a pool, and syncing a 31 * pool. 32 */ 33 34 #include <sys/zfs_context.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zio.h> 38 #include <sys/zio_checksum.h> 39 #include <sys/zio_compress.h> 40 #include <sys/dmu.h> 41 #include <sys/dmu_tx.h> 42 #include <sys/zap.h> 43 #include <sys/zil.h> 44 #include <sys/vdev_impl.h> 45 #include <sys/metaslab.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/txg.h> 48 #include <sys/avl.h> 49 #include <sys/dmu_traverse.h> 50 #include <sys/unique.h> 51 #include <sys/dsl_pool.h> 52 #include <sys/dsl_dir.h> 53 #include <sys/dsl_prop.h> 54 #include <sys/fs/zfs.h> 55 #include <sys/callb.h> 56 57 static uint32_t spa_active_count; 58 59 /* 60 * ========================================================================== 61 * SPA state manipulation (open/create/destroy/import/export) 62 * ========================================================================== 63 */ 64 65 static int 66 spa_error_entry_compare(const void *a, const void *b) 67 { 68 spa_error_entry_t *sa = (spa_error_entry_t *)a; 69 spa_error_entry_t *sb = (spa_error_entry_t *)b; 70 int ret; 71 72 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 73 sizeof (zbookmark_t)); 74 75 if (ret < 0) 76 return (-1); 77 else if (ret > 0) 78 return (1); 79 else 80 return (0); 81 } 82 83 /* 84 * Utility function which retrieves copies of the current logs and 85 * re-initializes them in the process. 86 */ 87 void 88 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 89 { 90 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 91 92 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 93 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 94 95 avl_create(&spa->spa_errlist_scrub, 96 spa_error_entry_compare, sizeof (spa_error_entry_t), 97 offsetof(spa_error_entry_t, se_avl)); 98 avl_create(&spa->spa_errlist_last, 99 spa_error_entry_compare, sizeof (spa_error_entry_t), 100 offsetof(spa_error_entry_t, se_avl)); 101 } 102 103 /* 104 * Activate an uninitialized pool. 105 */ 106 static void 107 spa_activate(spa_t *spa) 108 { 109 int t; 110 111 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 112 113 spa->spa_state = POOL_STATE_ACTIVE; 114 115 spa->spa_normal_class = metaslab_class_create(); 116 117 for (t = 0; t < ZIO_TYPES; t++) { 118 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 119 8, maxclsyspri, 50, INT_MAX, 120 TASKQ_PREPOPULATE); 121 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 122 8, maxclsyspri, 50, INT_MAX, 123 TASKQ_PREPOPULATE); 124 } 125 126 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 127 128 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 129 offsetof(vdev_t, vdev_dirty_node)); 130 131 txg_list_create(&spa->spa_vdev_txg_list, 132 offsetof(struct vdev, vdev_txg_node)); 133 134 avl_create(&spa->spa_errlist_scrub, 135 spa_error_entry_compare, sizeof (spa_error_entry_t), 136 offsetof(spa_error_entry_t, se_avl)); 137 avl_create(&spa->spa_errlist_last, 138 spa_error_entry_compare, sizeof (spa_error_entry_t), 139 offsetof(spa_error_entry_t, se_avl)); 140 } 141 142 /* 143 * Opposite of spa_activate(). 144 */ 145 static void 146 spa_deactivate(spa_t *spa) 147 { 148 int t; 149 150 ASSERT(spa->spa_sync_on == B_FALSE); 151 ASSERT(spa->spa_dsl_pool == NULL); 152 ASSERT(spa->spa_root_vdev == NULL); 153 154 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 155 156 txg_list_destroy(&spa->spa_vdev_txg_list); 157 158 list_destroy(&spa->spa_dirty_list); 159 160 rw_destroy(&spa->spa_traverse_lock); 161 162 for (t = 0; t < ZIO_TYPES; t++) { 163 taskq_destroy(spa->spa_zio_issue_taskq[t]); 164 taskq_destroy(spa->spa_zio_intr_taskq[t]); 165 spa->spa_zio_issue_taskq[t] = NULL; 166 spa->spa_zio_intr_taskq[t] = NULL; 167 } 168 169 metaslab_class_destroy(spa->spa_normal_class); 170 spa->spa_normal_class = NULL; 171 172 /* 173 * If this was part of an import or the open otherwise failed, we may 174 * still have errors left in the queues. Empty them just in case. 175 */ 176 spa_errlog_drain(spa); 177 178 avl_destroy(&spa->spa_errlist_scrub); 179 avl_destroy(&spa->spa_errlist_last); 180 181 spa->spa_state = POOL_STATE_UNINITIALIZED; 182 } 183 184 /* 185 * Verify a pool configuration, and construct the vdev tree appropriately. This 186 * will create all the necessary vdevs in the appropriate layout, with each vdev 187 * in the CLOSED state. This will prep the pool before open/creation/import. 188 * All vdev validation is done by the vdev_alloc() routine. 189 */ 190 static vdev_t * 191 spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 192 { 193 nvlist_t **child; 194 uint_t c, children; 195 vdev_t *vd; 196 197 if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 198 return (NULL); 199 200 if (vd->vdev_ops->vdev_op_leaf) 201 return (vd); 202 203 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 204 &child, &children) != 0) { 205 vdev_free(vd); 206 return (NULL); 207 } 208 209 for (c = 0; c < children; c++) { 210 if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 211 vdev_free(vd); 212 return (NULL); 213 } 214 } 215 216 return (vd); 217 } 218 219 /* 220 * Opposite of spa_load(). 221 */ 222 static void 223 spa_unload(spa_t *spa) 224 { 225 /* 226 * Stop async tasks. 227 */ 228 spa_async_suspend(spa); 229 230 /* 231 * Stop syncing. 232 */ 233 if (spa->spa_sync_on) { 234 txg_sync_stop(spa->spa_dsl_pool); 235 spa->spa_sync_on = B_FALSE; 236 } 237 238 /* 239 * Wait for any outstanding prefetch I/O to complete. 240 */ 241 spa_config_enter(spa, RW_WRITER, FTAG); 242 spa_config_exit(spa, FTAG); 243 244 /* 245 * Close the dsl pool. 246 */ 247 if (spa->spa_dsl_pool) { 248 dsl_pool_close(spa->spa_dsl_pool); 249 spa->spa_dsl_pool = NULL; 250 } 251 252 /* 253 * Close all vdevs. 254 */ 255 if (spa->spa_root_vdev) 256 vdev_free(spa->spa_root_vdev); 257 ASSERT(spa->spa_root_vdev == NULL); 258 259 spa->spa_async_suspended = 0; 260 } 261 262 /* 263 * Load an existing storage pool, using the pool's builtin spa_config as a 264 * source of configuration information. 265 */ 266 static int 267 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 268 { 269 int error = 0; 270 uint64_t config_cache_txg = spa->spa_config_txg; 271 nvlist_t *nvroot = NULL; 272 vdev_t *rvd; 273 uberblock_t *ub = &spa->spa_uberblock; 274 uint64_t pool_guid; 275 zio_t *zio; 276 277 spa->spa_load_state = state; 278 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 279 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 280 error = EINVAL; 281 goto out; 282 } 283 284 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 285 &spa->spa_config_txg); 286 287 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 288 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 289 spa_guid_exists(pool_guid, 0)) { 290 error = EEXIST; 291 goto out; 292 } 293 294 /* 295 * Parse the configuration into a vdev tree. 296 */ 297 spa_config_enter(spa, RW_WRITER, FTAG); 298 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 299 spa_config_exit(spa, FTAG); 300 301 if (rvd == NULL) { 302 error = EINVAL; 303 goto out; 304 } 305 306 ASSERT(spa->spa_root_vdev == rvd); 307 ASSERT(spa_guid(spa) == pool_guid); 308 309 /* 310 * Try to open all vdevs, loading each label in the process. 311 */ 312 if (vdev_open(rvd) != 0) { 313 error = ENXIO; 314 goto out; 315 } 316 317 /* 318 * Find the best uberblock. 319 */ 320 bzero(ub, sizeof (uberblock_t)); 321 322 zio = zio_root(spa, NULL, NULL, 323 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 324 vdev_uberblock_load(zio, rvd, ub); 325 error = zio_wait(zio); 326 327 /* 328 * If we weren't able to find a single valid uberblock, return failure. 329 */ 330 if (ub->ub_txg == 0) { 331 error = ENXIO; 332 goto out; 333 } 334 335 /* 336 * If the pool is newer than the code, we can't open it. 337 */ 338 if (ub->ub_version > UBERBLOCK_VERSION) { 339 error = ENOTSUP; 340 goto out; 341 } 342 343 /* 344 * If the vdev guid sum doesn't match the uberblock, we have an 345 * incomplete configuration. 346 */ 347 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 348 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 349 VDEV_AUX_BAD_GUID_SUM); 350 error = ENXIO; 351 goto out; 352 } 353 354 /* 355 * Initialize internal SPA structures. 356 */ 357 spa->spa_state = POOL_STATE_ACTIVE; 358 spa->spa_ubsync = spa->spa_uberblock; 359 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 360 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 361 if (error) { 362 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 363 VDEV_AUX_CORRUPT_DATA); 364 goto out; 365 } 366 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 367 368 if (zap_lookup(spa->spa_meta_objset, 369 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 370 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 371 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 372 VDEV_AUX_CORRUPT_DATA); 373 error = EIO; 374 goto out; 375 } 376 377 if (!mosconfig) { 378 dmu_buf_t *db; 379 char *packed = NULL; 380 size_t nvsize = 0; 381 nvlist_t *newconfig = NULL; 382 383 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 384 spa->spa_config_object, FTAG, &db)); 385 nvsize = *(uint64_t *)db->db_data; 386 dmu_buf_rele(db, FTAG); 387 388 packed = kmem_alloc(nvsize, KM_SLEEP); 389 error = dmu_read(spa->spa_meta_objset, 390 spa->spa_config_object, 0, nvsize, packed); 391 if (error == 0) 392 error = nvlist_unpack(packed, nvsize, &newconfig, 0); 393 kmem_free(packed, nvsize); 394 395 if (error) { 396 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 397 VDEV_AUX_CORRUPT_DATA); 398 error = EIO; 399 goto out; 400 } 401 402 spa_config_set(spa, newconfig); 403 404 spa_unload(spa); 405 spa_deactivate(spa); 406 spa_activate(spa); 407 408 return (spa_load(spa, newconfig, state, B_TRUE)); 409 } 410 411 if (zap_lookup(spa->spa_meta_objset, 412 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 413 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 414 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 415 VDEV_AUX_CORRUPT_DATA); 416 error = EIO; 417 goto out; 418 } 419 420 /* 421 * Load the persistent error log. If we have an older pool, this will 422 * not be present. 423 */ 424 error = zap_lookup(spa->spa_meta_objset, 425 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 426 sizeof (uint64_t), 1, &spa->spa_errlog_last); 427 if (error != 0 &&error != ENOENT) { 428 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 429 VDEV_AUX_CORRUPT_DATA); 430 error = EIO; 431 goto out; 432 } 433 434 error = zap_lookup(spa->spa_meta_objset, 435 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 436 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 437 if (error != 0 && error != ENOENT) { 438 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 439 VDEV_AUX_CORRUPT_DATA); 440 error = EIO; 441 goto out; 442 } 443 444 /* 445 * Load the vdev state for all top level vdevs. We need to grab the 446 * config lock because all label I/O is done with the 447 * ZIO_FLAG_CONFIG_HELD flag. 448 */ 449 spa_config_enter(spa, RW_READER, FTAG); 450 if ((error = vdev_load(rvd)) != 0) { 451 spa_config_exit(spa, FTAG); 452 goto out; 453 } 454 spa_config_exit(spa, FTAG); 455 456 /* 457 * Propagate the leaf DTLs we just loaded all the way up the tree. 458 */ 459 spa_config_enter(spa, RW_WRITER, FTAG); 460 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 461 spa_config_exit(spa, FTAG); 462 463 /* 464 * Check the state of the root vdev. If it can't be opened, it 465 * indicates one or more toplevel vdevs are faulted. 466 */ 467 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 468 error = ENXIO; 469 goto out; 470 } 471 472 /* 473 * Claim log blocks that haven't been committed yet, and update all 474 * top-level vdevs to sync any config changes found in vdev_load(). 475 * This must all happen in a single txg. 476 */ 477 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 478 int c; 479 dmu_tx_t *tx; 480 481 spa_config_enter(spa, RW_WRITER, FTAG); 482 vdev_config_dirty(rvd); 483 spa_config_exit(spa, FTAG); 484 485 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 486 spa_first_txg(spa)); 487 dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 488 dmu_tx_commit(tx); 489 490 spa->spa_sync_on = B_TRUE; 491 txg_sync_start(spa->spa_dsl_pool); 492 493 /* 494 * Wait for all claims to sync. 495 */ 496 txg_wait_synced(spa->spa_dsl_pool, 0); 497 498 /* 499 * If the config cache is stale relative to the mosconfig, 500 * sync the config cache. 501 */ 502 if (config_cache_txg != spa->spa_config_txg) { 503 uint64_t txg; 504 spa_config_enter(spa, RW_WRITER, FTAG); 505 txg = spa_last_synced_txg(spa) + 1; 506 spa_config_set(spa, 507 spa_config_generate(spa, rvd, txg, 0)); 508 spa_config_exit(spa, FTAG); 509 txg_wait_synced(spa->spa_dsl_pool, txg); 510 spa_config_sync(); 511 } 512 513 /* 514 * If we have top-level vdevs that were added but have 515 * not yet been prepared for allocation, do that now. 516 * (It's safe now because the config cache is up to date, 517 * so it will be able to translate the new DVAs.) 518 * See comments in spa_vdev_add() for full details. 519 */ 520 for (c = 0; c < rvd->vdev_children; c++) { 521 vdev_t *tvd = rvd->vdev_child[c]; 522 if (tvd->vdev_ms_array == 0) { 523 uint64_t txg; 524 ASSERT(tvd->vdev_ms_shift == 0); 525 spa_config_enter(spa, RW_WRITER, FTAG); 526 txg = spa_last_synced_txg(spa) + 1; 527 vdev_init(tvd, txg); 528 vdev_config_dirty(tvd); 529 spa_config_set(spa, 530 spa_config_generate(spa, rvd, txg, 0)); 531 spa_config_exit(spa, FTAG); 532 txg_wait_synced(spa->spa_dsl_pool, txg); 533 ASSERT(tvd->vdev_ms_shift != 0); 534 ASSERT(tvd->vdev_ms_array != 0); 535 spa_config_sync(); 536 } 537 } 538 } 539 540 error = 0; 541 out: 542 if (error) 543 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 544 spa->spa_load_state = SPA_LOAD_NONE; 545 spa->spa_ena = 0; 546 547 return (error); 548 } 549 550 /* 551 * Pool Open/Import 552 * 553 * The import case is identical to an open except that the configuration is sent 554 * down from userland, instead of grabbed from the configuration cache. For the 555 * case of an open, the pool configuration will exist in the 556 * POOL_STATE_UNITIALIZED state. 557 * 558 * The stats information (gen/count/ustats) is used to gather vdev statistics at 559 * the same time open the pool, without having to keep around the spa_t in some 560 * ambiguous state. 561 */ 562 static int 563 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 564 { 565 spa_t *spa; 566 int error; 567 int loaded = B_FALSE; 568 int locked = B_FALSE; 569 570 *spapp = NULL; 571 572 /* 573 * As disgusting as this is, we need to support recursive calls to this 574 * function because dsl_dir_open() is called during spa_load(), and ends 575 * up calling spa_open() again. The real fix is to figure out how to 576 * avoid dsl_dir_open() calling this in the first place. 577 */ 578 if (mutex_owner(&spa_namespace_lock) != curthread) { 579 mutex_enter(&spa_namespace_lock); 580 locked = B_TRUE; 581 } 582 583 if ((spa = spa_lookup(pool)) == NULL) { 584 if (locked) 585 mutex_exit(&spa_namespace_lock); 586 return (ENOENT); 587 } 588 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 589 590 spa_activate(spa); 591 592 error = spa_load(spa, spa->spa_config, 593 SPA_LOAD_OPEN, B_FALSE); 594 595 if (error == EBADF) { 596 /* 597 * If vdev_load() returns EBADF, it indicates that one 598 * of the vdevs indicates that the pool has been 599 * exported or destroyed. If this is the case, the 600 * config cache is out of sync and we should remove the 601 * pool from the namespace. 602 */ 603 spa_unload(spa); 604 spa_deactivate(spa); 605 spa_remove(spa); 606 spa_config_sync(); 607 if (locked) 608 mutex_exit(&spa_namespace_lock); 609 return (ENOENT); 610 } 611 612 if (error) { 613 /* 614 * We can't open the pool, but we still have useful 615 * information: the state of each vdev after the 616 * attempted vdev_open(). Return this to the user. 617 */ 618 if (config != NULL && spa->spa_root_vdev != NULL) 619 *config = spa_config_generate(spa, NULL, -1ULL, 620 B_TRUE); 621 spa_unload(spa); 622 spa_deactivate(spa); 623 spa->spa_last_open_failed = B_TRUE; 624 if (locked) 625 mutex_exit(&spa_namespace_lock); 626 *spapp = NULL; 627 return (error); 628 } else { 629 zfs_post_ok(spa, NULL); 630 spa->spa_last_open_failed = B_FALSE; 631 } 632 633 loaded = B_TRUE; 634 } 635 636 spa_open_ref(spa, tag); 637 if (locked) 638 mutex_exit(&spa_namespace_lock); 639 640 *spapp = spa; 641 642 if (config != NULL) { 643 spa_config_enter(spa, RW_READER, FTAG); 644 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 645 spa_config_exit(spa, FTAG); 646 } 647 648 /* 649 * If we just loaded the pool, resilver anything that's out of date. 650 */ 651 if (loaded && (spa_mode & FWRITE)) 652 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 653 654 return (0); 655 } 656 657 int 658 spa_open(const char *name, spa_t **spapp, void *tag) 659 { 660 return (spa_open_common(name, spapp, tag, NULL)); 661 } 662 663 /* 664 * Lookup the given spa_t, incrementing the inject count in the process, 665 * preventing it from being exported or destroyed. 666 */ 667 spa_t * 668 spa_inject_addref(char *name) 669 { 670 spa_t *spa; 671 672 mutex_enter(&spa_namespace_lock); 673 if ((spa = spa_lookup(name)) == NULL) { 674 mutex_exit(&spa_namespace_lock); 675 return (NULL); 676 } 677 spa->spa_inject_ref++; 678 mutex_exit(&spa_namespace_lock); 679 680 return (spa); 681 } 682 683 void 684 spa_inject_delref(spa_t *spa) 685 { 686 mutex_enter(&spa_namespace_lock); 687 spa->spa_inject_ref--; 688 mutex_exit(&spa_namespace_lock); 689 } 690 691 int 692 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 693 { 694 int error; 695 spa_t *spa; 696 697 *config = NULL; 698 error = spa_open_common(name, &spa, FTAG, config); 699 700 if (spa && *config != NULL) 701 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 702 spa_get_errlog_size(spa)) == 0); 703 704 /* 705 * We want to get the alternate root even for faulted pools, so we cheat 706 * and call spa_lookup() directly. 707 */ 708 if (altroot) { 709 if (spa == NULL) { 710 mutex_enter(&spa_namespace_lock); 711 spa = spa_lookup(name); 712 if (spa) 713 spa_altroot(spa, altroot, buflen); 714 else 715 altroot[0] = '\0'; 716 spa = NULL; 717 mutex_exit(&spa_namespace_lock); 718 } else { 719 spa_altroot(spa, altroot, buflen); 720 } 721 } 722 723 if (spa != NULL) 724 spa_close(spa, FTAG); 725 726 return (error); 727 } 728 729 /* 730 * Pool Creation 731 */ 732 int 733 spa_create(const char *pool, nvlist_t *nvroot, char *altroot) 734 { 735 spa_t *spa; 736 dsl_pool_t *dp; 737 dmu_tx_t *tx; 738 int error; 739 uint64_t txg = TXG_INITIAL; 740 741 /* 742 * If this pool already exists, return failure. 743 */ 744 mutex_enter(&spa_namespace_lock); 745 if (spa_lookup(pool) != NULL) { 746 mutex_exit(&spa_namespace_lock); 747 return (EEXIST); 748 } 749 spa = spa_add(pool); 750 751 /* 752 * Allocate a new spa_t structure. 753 */ 754 spa_activate(spa); 755 756 if (altroot != NULL) { 757 spa->spa_root = spa_strdup(altroot); 758 atomic_add_32(&spa_active_count, 1); 759 } 760 761 spa->spa_uberblock.ub_txg = txg - 1; 762 spa->spa_ubsync = spa->spa_uberblock; 763 764 error = spa_vdev_add(spa, nvroot); 765 766 if (error) { 767 spa_unload(spa); 768 spa_deactivate(spa); 769 spa_remove(spa); 770 mutex_exit(&spa_namespace_lock); 771 return (error); 772 } 773 774 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 775 spa->spa_meta_objset = dp->dp_meta_objset; 776 777 tx = dmu_tx_create_assigned(dp, txg); 778 779 /* 780 * Create the pool config object. 781 */ 782 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 783 DMU_OT_PACKED_NVLIST, 1 << 14, 784 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 785 786 if (zap_add(spa->spa_meta_objset, 787 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 788 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 789 cmn_err(CE_PANIC, "failed to add pool config"); 790 } 791 792 /* 793 * Create the deferred-free bplist object. Turn off compression 794 * because sync-to-convergence takes longer if the blocksize 795 * keeps changing. 796 */ 797 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 798 1 << 14, tx); 799 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 800 ZIO_COMPRESS_OFF, tx); 801 802 if (zap_add(spa->spa_meta_objset, 803 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 804 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 805 cmn_err(CE_PANIC, "failed to add bplist"); 806 } 807 808 dmu_tx_commit(tx); 809 810 spa->spa_sync_on = B_TRUE; 811 txg_sync_start(spa->spa_dsl_pool); 812 813 /* 814 * We explicitly wait for the first transaction to complete so that our 815 * bean counters are appropriately updated. 816 */ 817 txg_wait_synced(spa->spa_dsl_pool, txg); 818 819 spa_config_sync(); 820 821 mutex_exit(&spa_namespace_lock); 822 823 return (0); 824 } 825 826 /* 827 * Import the given pool into the system. We set up the necessary spa_t and 828 * then call spa_load() to do the dirty work. 829 */ 830 int 831 spa_import(const char *pool, nvlist_t *config, char *altroot) 832 { 833 spa_t *spa; 834 int error; 835 836 if (!(spa_mode & FWRITE)) 837 return (EROFS); 838 839 /* 840 * If a pool with this name exists, return failure. 841 */ 842 mutex_enter(&spa_namespace_lock); 843 if (spa_lookup(pool) != NULL) { 844 mutex_exit(&spa_namespace_lock); 845 return (EEXIST); 846 } 847 848 /* 849 * Create an initialize the spa structure 850 */ 851 spa = spa_add(pool); 852 spa_activate(spa); 853 854 /* 855 * Set the alternate root, if there is one. 856 */ 857 if (altroot != NULL) { 858 spa->spa_root = spa_strdup(altroot); 859 atomic_add_32(&spa_active_count, 1); 860 } 861 862 /* 863 * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 864 * so that we don't try to open the pool if the config is damaged. 865 * Note: on success, spa_load() will update and sync the config cache. 866 */ 867 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 868 869 if (error) { 870 spa_unload(spa); 871 spa_deactivate(spa); 872 spa_remove(spa); 873 mutex_exit(&spa_namespace_lock); 874 return (error); 875 } 876 877 mutex_exit(&spa_namespace_lock); 878 879 /* 880 * Resilver anything that's out of date. 881 */ 882 if (spa_mode & FWRITE) 883 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 884 885 return (0); 886 } 887 888 /* 889 * This (illegal) pool name is used when temporarily importing a spa_t in order 890 * to get the vdev stats associated with the imported devices. 891 */ 892 #define TRYIMPORT_NAME "$import" 893 894 nvlist_t * 895 spa_tryimport(nvlist_t *tryconfig) 896 { 897 nvlist_t *config = NULL; 898 char *poolname; 899 spa_t *spa; 900 uint64_t state; 901 902 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 903 return (NULL); 904 905 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 906 return (NULL); 907 908 mutex_enter(&spa_namespace_lock); 909 spa = spa_add(TRYIMPORT_NAME); 910 911 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 912 913 /* 914 * Initialize the spa_t structure. 915 */ 916 spa_activate(spa); 917 918 /* 919 * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 920 * so we don't try to open the pool if the config is damaged. 921 */ 922 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 923 924 /* 925 * If 'tryconfig' was at least parsable, return the current config. 926 */ 927 if (spa->spa_root_vdev != NULL) { 928 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 929 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 930 poolname) == 0); 931 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 932 state) == 0); 933 } 934 935 spa_unload(spa); 936 spa_deactivate(spa); 937 spa_remove(spa); 938 mutex_exit(&spa_namespace_lock); 939 940 return (config); 941 } 942 943 /* 944 * Pool export/destroy 945 * 946 * The act of destroying or exporting a pool is very simple. We make sure there 947 * is no more pending I/O and any references to the pool are gone. Then, we 948 * update the pool state and sync all the labels to disk, removing the 949 * configuration from the cache afterwards. 950 */ 951 static int 952 spa_export_common(char *pool, int new_state) 953 { 954 spa_t *spa; 955 956 if (!(spa_mode & FWRITE)) 957 return (EROFS); 958 959 mutex_enter(&spa_namespace_lock); 960 if ((spa = spa_lookup(pool)) == NULL) { 961 mutex_exit(&spa_namespace_lock); 962 return (ENOENT); 963 } 964 965 /* 966 * Put a hold on the pool, drop the namespace lock, stop async tasks, 967 * reacquire the namespace lock, and see if we can export. 968 */ 969 spa_open_ref(spa, FTAG); 970 mutex_exit(&spa_namespace_lock); 971 spa_async_suspend(spa); 972 mutex_enter(&spa_namespace_lock); 973 spa_close(spa, FTAG); 974 975 /* 976 * The pool will be in core if it's openable, 977 * in which case we can modify its state. 978 */ 979 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 980 /* 981 * Objsets may be open only because they're dirty, so we 982 * have to force it to sync before checking spa_refcnt. 983 */ 984 spa_scrub_suspend(spa); 985 txg_wait_synced(spa->spa_dsl_pool, 0); 986 987 /* 988 * A pool cannot be exported or destroyed if there are active 989 * references. If we are resetting a pool, allow references by 990 * fault injection handlers. 991 */ 992 if (!spa_refcount_zero(spa) || 993 (spa->spa_inject_ref != 0 && 994 new_state != POOL_STATE_UNINITIALIZED)) { 995 spa_scrub_resume(spa); 996 spa_async_resume(spa); 997 mutex_exit(&spa_namespace_lock); 998 return (EBUSY); 999 } 1000 1001 spa_scrub_resume(spa); 1002 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1003 1004 if (spa->spa_root != NULL) 1005 atomic_add_32(&spa_active_count, -1); 1006 1007 /* 1008 * We want this to be reflected on every label, 1009 * so mark them all dirty. spa_unload() will do the 1010 * final sync that pushes these changes out. 1011 */ 1012 if (new_state != POOL_STATE_UNINITIALIZED) { 1013 spa_config_enter(spa, RW_WRITER, FTAG); 1014 spa->spa_state = new_state; 1015 vdev_config_dirty(spa->spa_root_vdev); 1016 spa_config_exit(spa, FTAG); 1017 } 1018 } 1019 1020 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1021 spa_unload(spa); 1022 spa_deactivate(spa); 1023 } 1024 1025 if (new_state != POOL_STATE_UNINITIALIZED) { 1026 spa_remove(spa); 1027 spa_config_sync(); 1028 } 1029 mutex_exit(&spa_namespace_lock); 1030 1031 return (0); 1032 } 1033 1034 /* 1035 * Destroy a storage pool. 1036 */ 1037 int 1038 spa_destroy(char *pool) 1039 { 1040 return (spa_export_common(pool, POOL_STATE_DESTROYED)); 1041 } 1042 1043 /* 1044 * Export a storage pool. 1045 */ 1046 int 1047 spa_export(char *pool) 1048 { 1049 return (spa_export_common(pool, POOL_STATE_EXPORTED)); 1050 } 1051 1052 /* 1053 * Similar to spa_export(), this unloads the spa_t without actually removing it 1054 * from the namespace in any way. 1055 */ 1056 int 1057 spa_reset(char *pool) 1058 { 1059 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); 1060 } 1061 1062 1063 /* 1064 * ========================================================================== 1065 * Device manipulation 1066 * ========================================================================== 1067 */ 1068 1069 /* 1070 * Add capacity to a storage pool. 1071 */ 1072 int 1073 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1074 { 1075 uint64_t txg; 1076 int c, c0, children, error; 1077 vdev_t *rvd = spa->spa_root_vdev; 1078 vdev_t *vd, *tvd; 1079 1080 txg = spa_vdev_enter(spa); 1081 1082 vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1083 1084 if (vd == NULL) 1085 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1086 1087 if (rvd == NULL) { /* spa_create() */ 1088 rvd = vd; 1089 c0 = 0; 1090 } else { 1091 c0 = rvd->vdev_children; 1092 } 1093 1094 ASSERT(spa->spa_root_vdev == rvd); 1095 1096 if ((error = vdev_create(vd, txg)) != 0) 1097 return (spa_vdev_exit(spa, vd, txg, error)); 1098 1099 children = vd->vdev_children; 1100 1101 /* 1102 * Transfer each new top-level vdev from vd to rvd. 1103 */ 1104 for (c = 0; c < children; c++) { 1105 tvd = vd->vdev_child[c]; 1106 if (vd != rvd) { 1107 vdev_remove_child(vd, tvd); 1108 tvd->vdev_id = c0 + c; 1109 vdev_add_child(rvd, tvd); 1110 } 1111 vdev_config_dirty(tvd); 1112 } 1113 1114 /* 1115 * We have to be careful when adding new vdevs to an existing pool. 1116 * If other threads start allocating from these vdevs before we 1117 * sync the config cache, and we lose power, then upon reboot we may 1118 * fail to open the pool because there are DVAs that the config cache 1119 * can't translate. Therefore, we first add the vdevs without 1120 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1121 * initialize the metaslabs; and sync the config cache again. 1122 * 1123 * spa_load() checks for added-but-not-initialized vdevs, so that 1124 * if we lose power at any point in this sequence, the remaining 1125 * steps will be completed the next time we load the pool. 1126 */ 1127 if (vd != rvd) { 1128 (void) spa_vdev_exit(spa, vd, txg, 0); 1129 txg = spa_vdev_enter(spa); 1130 vd = NULL; 1131 } 1132 1133 /* 1134 * Now that the config is safely on disk, we can use the new space. 1135 */ 1136 for (c = 0; c < children; c++) { 1137 tvd = rvd->vdev_child[c0 + c]; 1138 ASSERT(tvd->vdev_ms_array == 0); 1139 vdev_init(tvd, txg); 1140 vdev_config_dirty(tvd); 1141 } 1142 1143 return (spa_vdev_exit(spa, vd, txg, 0)); 1144 } 1145 1146 /* 1147 * Attach a device to a mirror. The arguments are the path to any device 1148 * in the mirror, and the nvroot for the new device. If the path specifies 1149 * a device that is not mirrored, we automatically insert the mirror vdev. 1150 * 1151 * If 'replacing' is specified, the new device is intended to replace the 1152 * existing device; in this case the two devices are made into their own 1153 * mirror using the 'replacing' vdev, which is functionally idendical to 1154 * the mirror vdev (it actually reuses all the same ops) but has a few 1155 * extra rules: you can't attach to it after it's been created, and upon 1156 * completion of resilvering, the first disk (the one being replaced) 1157 * is automatically detached. 1158 */ 1159 int 1160 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1161 { 1162 uint64_t txg, open_txg; 1163 int error; 1164 vdev_t *rvd = spa->spa_root_vdev; 1165 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1166 vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 1167 1168 txg = spa_vdev_enter(spa); 1169 1170 oldvd = vdev_lookup_by_guid(rvd, guid); 1171 1172 if (oldvd == NULL) 1173 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1174 1175 if (!oldvd->vdev_ops->vdev_op_leaf) 1176 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1177 1178 pvd = oldvd->vdev_parent; 1179 1180 /* 1181 * The parent must be a mirror or the root, unless we're replacing; 1182 * in that case, the parent can be anything but another replacing vdev. 1183 */ 1184 if (pvd->vdev_ops != &vdev_mirror_ops && 1185 pvd->vdev_ops != &vdev_root_ops && 1186 (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 1187 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1188 1189 newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1190 1191 if (newrootvd == NULL || newrootvd->vdev_children != 1) 1192 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1193 1194 newvd = newrootvd->vdev_child[0]; 1195 1196 if (!newvd->vdev_ops->vdev_op_leaf) 1197 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1198 1199 if ((error = vdev_create(newrootvd, txg)) != 0) 1200 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1201 1202 /* 1203 * Compare the new device size with the replaceable/attachable 1204 * device size. 1205 */ 1206 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1207 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1208 1209 if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) 1210 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1211 1212 /* 1213 * If this is an in-place replacement, update oldvd's path and devid 1214 * to make it distinguishable from newvd, and unopenable from now on. 1215 */ 1216 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1217 spa_strfree(oldvd->vdev_path); 1218 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1219 KM_SLEEP); 1220 (void) sprintf(oldvd->vdev_path, "%s/%s", 1221 newvd->vdev_path, "old"); 1222 if (oldvd->vdev_devid != NULL) { 1223 spa_strfree(oldvd->vdev_devid); 1224 oldvd->vdev_devid = NULL; 1225 } 1226 } 1227 1228 /* 1229 * If the parent is not a mirror, or if we're replacing, 1230 * insert the new mirror/replacing vdev above oldvd. 1231 */ 1232 if (pvd->vdev_ops != pvops) 1233 pvd = vdev_add_parent(oldvd, pvops); 1234 1235 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1236 ASSERT(pvd->vdev_ops == pvops); 1237 ASSERT(oldvd->vdev_parent == pvd); 1238 1239 /* 1240 * Extract the new device from its root and add it to pvd. 1241 */ 1242 vdev_remove_child(newrootvd, newvd); 1243 newvd->vdev_id = pvd->vdev_children; 1244 vdev_add_child(pvd, newvd); 1245 1246 /* 1247 * If newvd is smaller than oldvd, but larger than its rsize, 1248 * the addition of newvd may have decreased our parent's asize. 1249 */ 1250 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1251 1252 tvd = newvd->vdev_top; 1253 ASSERT(pvd->vdev_top == tvd); 1254 ASSERT(tvd->vdev_parent == rvd); 1255 1256 vdev_config_dirty(tvd); 1257 1258 /* 1259 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1260 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1261 */ 1262 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1263 1264 mutex_enter(&newvd->vdev_dtl_lock); 1265 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1266 open_txg - TXG_INITIAL + 1); 1267 mutex_exit(&newvd->vdev_dtl_lock); 1268 1269 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1270 1271 /* 1272 * Mark newvd's DTL dirty in this txg. 1273 */ 1274 vdev_dirty(tvd, VDD_DTL, txg); 1275 (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); 1276 1277 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1278 1279 /* 1280 * Kick off a resilver to update newvd. 1281 */ 1282 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1283 1284 return (0); 1285 } 1286 1287 /* 1288 * Detach a device from a mirror or replacing vdev. 1289 * If 'replace_done' is specified, only detach if the parent 1290 * is a replacing vdev. 1291 */ 1292 int 1293 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1294 { 1295 uint64_t txg; 1296 int c, t, error; 1297 vdev_t *rvd = spa->spa_root_vdev; 1298 vdev_t *vd, *pvd, *cvd, *tvd; 1299 1300 txg = spa_vdev_enter(spa); 1301 1302 vd = vdev_lookup_by_guid(rvd, guid); 1303 1304 if (vd == NULL) 1305 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1306 1307 if (!vd->vdev_ops->vdev_op_leaf) 1308 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1309 1310 pvd = vd->vdev_parent; 1311 1312 /* 1313 * If replace_done is specified, only remove this device if it's 1314 * the first child of a replacing vdev. 1315 */ 1316 if (replace_done && 1317 (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1318 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1319 1320 /* 1321 * Only mirror and replacing vdevs support detach. 1322 */ 1323 if (pvd->vdev_ops != &vdev_replacing_ops && 1324 pvd->vdev_ops != &vdev_mirror_ops) 1325 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1326 1327 /* 1328 * If there's only one replica, you can't detach it. 1329 */ 1330 if (pvd->vdev_children <= 1) 1331 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1332 1333 /* 1334 * If all siblings have non-empty DTLs, this device may have the only 1335 * valid copy of the data, which means we cannot safely detach it. 1336 * 1337 * XXX -- as in the vdev_offline() case, we really want a more 1338 * precise DTL check. 1339 */ 1340 for (c = 0; c < pvd->vdev_children; c++) { 1341 uint64_t dirty; 1342 1343 cvd = pvd->vdev_child[c]; 1344 if (cvd == vd) 1345 continue; 1346 if (vdev_is_dead(cvd)) 1347 continue; 1348 mutex_enter(&cvd->vdev_dtl_lock); 1349 dirty = cvd->vdev_dtl_map.sm_space | 1350 cvd->vdev_dtl_scrub.sm_space; 1351 mutex_exit(&cvd->vdev_dtl_lock); 1352 if (!dirty) 1353 break; 1354 } 1355 if (c == pvd->vdev_children) 1356 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1357 1358 /* 1359 * Erase the disk labels so the disk can be used for other things. 1360 * This must be done after all other error cases are handled, 1361 * but before we disembowel vd (so we can still do I/O to it). 1362 * But if we can't do it, don't treat the error as fatal -- 1363 * it may be that the unwritability of the disk is the reason 1364 * it's being detached! 1365 */ 1366 error = vdev_label_init(vd, 0); 1367 if (error) 1368 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1369 1370 /* 1371 * Remove vd from its parent and compact the parent's children. 1372 */ 1373 vdev_remove_child(pvd, vd); 1374 vdev_compact_children(pvd); 1375 1376 /* 1377 * Remember one of the remaining children so we can get tvd below. 1378 */ 1379 cvd = pvd->vdev_child[0]; 1380 1381 /* 1382 * If the parent mirror/replacing vdev only has one child, 1383 * the parent is no longer needed. Remove it from the tree. 1384 */ 1385 if (pvd->vdev_children == 1) 1386 vdev_remove_parent(cvd); 1387 1388 /* 1389 * We don't set tvd until now because the parent we just removed 1390 * may have been the previous top-level vdev. 1391 */ 1392 tvd = cvd->vdev_top; 1393 ASSERT(tvd->vdev_parent == rvd); 1394 1395 /* 1396 * Reopen this top-level vdev to reassess health after detach. 1397 */ 1398 vdev_reopen(tvd); 1399 1400 /* 1401 * If the device we just detached was smaller than the others, 1402 * it may be possible to add metaslabs (i.e. grow the pool). We ignore 1403 * the error here because the detach still succeeded - we just weren't 1404 * able to reinitialize the metaslabs. This pool is in for a world of 1405 * hurt, in any case. 1406 */ 1407 (void) vdev_metaslab_init(tvd, txg); 1408 1409 vdev_config_dirty(tvd); 1410 1411 /* 1412 * Mark vd's DTL as dirty in this txg. 1413 * vdev_dtl_sync() will see that vd->vdev_detached is set 1414 * and free vd's DTL object in syncing context. 1415 * But first make sure we're not on any *other* txg's DTL list, 1416 * to prevent vd from being accessed after it's freed. 1417 */ 1418 vdev_dirty(tvd, VDD_DTL, txg); 1419 vd->vdev_detached = B_TRUE; 1420 for (t = 0; t < TXG_SIZE; t++) 1421 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1422 (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1423 1424 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1425 1426 return (spa_vdev_exit(spa, vd, txg, 0)); 1427 } 1428 1429 /* 1430 * Find any device that's done replacing, so we can detach it. 1431 */ 1432 static vdev_t * 1433 spa_vdev_replace_done_hunt(vdev_t *vd) 1434 { 1435 vdev_t *newvd, *oldvd; 1436 int c; 1437 1438 for (c = 0; c < vd->vdev_children; c++) { 1439 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1440 if (oldvd != NULL) 1441 return (oldvd); 1442 } 1443 1444 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1445 oldvd = vd->vdev_child[0]; 1446 newvd = vd->vdev_child[1]; 1447 1448 mutex_enter(&newvd->vdev_dtl_lock); 1449 if (newvd->vdev_dtl_map.sm_space == 0 && 1450 newvd->vdev_dtl_scrub.sm_space == 0) { 1451 mutex_exit(&newvd->vdev_dtl_lock); 1452 return (oldvd); 1453 } 1454 mutex_exit(&newvd->vdev_dtl_lock); 1455 } 1456 1457 return (NULL); 1458 } 1459 1460 static void 1461 spa_vdev_replace_done(spa_t *spa) 1462 { 1463 vdev_t *vd; 1464 uint64_t guid; 1465 1466 spa_config_enter(spa, RW_READER, FTAG); 1467 1468 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 1469 guid = vd->vdev_guid; 1470 spa_config_exit(spa, FTAG); 1471 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 1472 return; 1473 spa_config_enter(spa, RW_READER, FTAG); 1474 } 1475 1476 spa_config_exit(spa, FTAG); 1477 } 1478 1479 /* 1480 * Update the stored path for this vdev. Dirty the vdev configuration, relying 1481 * on spa_vdev_enter/exit() to synchronize the labels and cache. 1482 */ 1483 int 1484 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 1485 { 1486 vdev_t *rvd, *vd; 1487 uint64_t txg; 1488 1489 rvd = spa->spa_root_vdev; 1490 1491 txg = spa_vdev_enter(spa); 1492 1493 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1494 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 1495 1496 if (!vd->vdev_ops->vdev_op_leaf) 1497 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1498 1499 spa_strfree(vd->vdev_path); 1500 vd->vdev_path = spa_strdup(newpath); 1501 1502 vdev_config_dirty(vd->vdev_top); 1503 1504 return (spa_vdev_exit(spa, NULL, txg, 0)); 1505 } 1506 1507 /* 1508 * ========================================================================== 1509 * SPA Scrubbing 1510 * ========================================================================== 1511 */ 1512 1513 void 1514 spa_scrub_throttle(spa_t *spa, int direction) 1515 { 1516 mutex_enter(&spa->spa_scrub_lock); 1517 spa->spa_scrub_throttled += direction; 1518 ASSERT(spa->spa_scrub_throttled >= 0); 1519 if (spa->spa_scrub_throttled == 0) 1520 cv_broadcast(&spa->spa_scrub_io_cv); 1521 mutex_exit(&spa->spa_scrub_lock); 1522 } 1523 1524 static void 1525 spa_scrub_io_done(zio_t *zio) 1526 { 1527 spa_t *spa = zio->io_spa; 1528 1529 zio_buf_free(zio->io_data, zio->io_size); 1530 1531 mutex_enter(&spa->spa_scrub_lock); 1532 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1533 vdev_t *vd = zio->io_vd; 1534 spa->spa_scrub_errors++; 1535 mutex_enter(&vd->vdev_stat_lock); 1536 vd->vdev_stat.vs_scrub_errors++; 1537 mutex_exit(&vd->vdev_stat_lock); 1538 } 1539 if (--spa->spa_scrub_inflight == 0) { 1540 cv_broadcast(&spa->spa_scrub_io_cv); 1541 ASSERT(spa->spa_scrub_throttled == 0); 1542 } 1543 mutex_exit(&spa->spa_scrub_lock); 1544 } 1545 1546 static void 1547 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 1548 zbookmark_t *zb) 1549 { 1550 size_t size = BP_GET_LSIZE(bp); 1551 void *data = zio_buf_alloc(size); 1552 1553 mutex_enter(&spa->spa_scrub_lock); 1554 spa->spa_scrub_inflight++; 1555 mutex_exit(&spa->spa_scrub_lock); 1556 1557 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 1558 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 1559 1560 flags |= ZIO_FLAG_CANFAIL; 1561 1562 zio_nowait(zio_read(NULL, spa, bp, data, size, 1563 spa_scrub_io_done, NULL, priority, flags, zb)); 1564 } 1565 1566 /* ARGSUSED */ 1567 static int 1568 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1569 { 1570 blkptr_t *bp = &bc->bc_blkptr; 1571 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1572 1573 if (bc->bc_errno || vd == NULL) { 1574 /* 1575 * We can't scrub this block, but we can continue to scrub 1576 * the rest of the pool. Note the error and move along. 1577 */ 1578 mutex_enter(&spa->spa_scrub_lock); 1579 spa->spa_scrub_errors++; 1580 mutex_exit(&spa->spa_scrub_lock); 1581 1582 if (vd != NULL) { 1583 mutex_enter(&vd->vdev_stat_lock); 1584 vd->vdev_stat.vs_scrub_errors++; 1585 mutex_exit(&vd->vdev_stat_lock); 1586 } 1587 1588 return (ERESTART); 1589 } 1590 1591 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1592 1593 /* 1594 * Keep track of how much data we've examined so that 1595 * zpool(1M) status can make useful progress reports. 1596 */ 1597 mutex_enter(&vd->vdev_stat_lock); 1598 vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1599 mutex_exit(&vd->vdev_stat_lock); 1600 1601 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1602 if (DVA_GET_GANG(&bp->blk_dva[0])) { 1603 /* 1604 * Gang members may be spread across multiple vdevs, 1605 * so the best we can do is look at the pool-wide DTL. 1606 * XXX -- it would be better to change our allocation 1607 * policy to ensure that this can't happen. 1608 */ 1609 vd = spa->spa_root_vdev; 1610 } 1611 if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1612 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1613 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 1614 } 1615 } else { 1616 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1617 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 1618 } 1619 1620 return (0); 1621 } 1622 1623 static void 1624 spa_scrub_thread(spa_t *spa) 1625 { 1626 callb_cpr_t cprinfo; 1627 traverse_handle_t *th = spa->spa_scrub_th; 1628 vdev_t *rvd = spa->spa_root_vdev; 1629 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1630 int error = 0; 1631 boolean_t complete; 1632 1633 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1634 1635 /* 1636 * If we're restarting due to a snapshot create/delete, 1637 * wait for that to complete. 1638 */ 1639 txg_wait_synced(spa_get_dsl(spa), 0); 1640 1641 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 1642 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1643 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 1644 1645 spa_config_enter(spa, RW_WRITER, FTAG); 1646 vdev_reopen(rvd); /* purge all vdev caches */ 1647 vdev_config_dirty(rvd); /* rewrite all disk labels */ 1648 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1649 spa_config_exit(spa, FTAG); 1650 1651 mutex_enter(&spa->spa_scrub_lock); 1652 spa->spa_scrub_errors = 0; 1653 spa->spa_scrub_active = 1; 1654 ASSERT(spa->spa_scrub_inflight == 0); 1655 ASSERT(spa->spa_scrub_throttled == 0); 1656 1657 while (!spa->spa_scrub_stop) { 1658 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1659 while (spa->spa_scrub_suspended) { 1660 spa->spa_scrub_active = 0; 1661 cv_broadcast(&spa->spa_scrub_cv); 1662 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1663 spa->spa_scrub_active = 1; 1664 } 1665 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1666 1667 if (spa->spa_scrub_restart_txg != 0) 1668 break; 1669 1670 mutex_exit(&spa->spa_scrub_lock); 1671 error = traverse_more(th); 1672 mutex_enter(&spa->spa_scrub_lock); 1673 if (error != EAGAIN) 1674 break; 1675 1676 while (spa->spa_scrub_throttled > 0) 1677 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1678 } 1679 1680 while (spa->spa_scrub_inflight) 1681 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1682 1683 spa->spa_scrub_active = 0; 1684 cv_broadcast(&spa->spa_scrub_cv); 1685 1686 mutex_exit(&spa->spa_scrub_lock); 1687 1688 spa_config_enter(spa, RW_WRITER, FTAG); 1689 1690 mutex_enter(&spa->spa_scrub_lock); 1691 1692 /* 1693 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 1694 * AND the spa config lock to synchronize with any config changes 1695 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 1696 */ 1697 if (spa->spa_scrub_restart_txg != 0) 1698 error = ERESTART; 1699 1700 if (spa->spa_scrub_stop) 1701 error = EINTR; 1702 1703 /* 1704 * Even if there were uncorrectable errors, we consider the scrub 1705 * completed. The downside is that if there is a transient error during 1706 * a resilver, we won't resilver the data properly to the target. But 1707 * if the damage is permanent (more likely) we will resilver forever, 1708 * which isn't really acceptable. Since there is enough information for 1709 * the user to know what has failed and why, this seems like a more 1710 * tractable approach. 1711 */ 1712 complete = (error == 0); 1713 1714 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1715 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1716 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1717 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1718 1719 mutex_exit(&spa->spa_scrub_lock); 1720 1721 /* 1722 * If the scrub/resilver completed, update all DTLs to reflect this. 1723 * Whether it succeeded or not, vacate all temporary scrub DTLs. 1724 */ 1725 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1726 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1727 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1728 spa_errlog_rotate(spa); 1729 1730 spa_config_exit(spa, FTAG); 1731 1732 mutex_enter(&spa->spa_scrub_lock); 1733 1734 /* 1735 * We may have finished replacing a device. 1736 * Let the async thread assess this and handle the detach. 1737 */ 1738 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1739 1740 /* 1741 * If we were told to restart, our final act is to start a new scrub. 1742 */ 1743 if (error == ERESTART) 1744 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 1745 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 1746 1747 spa->spa_scrub_type = POOL_SCRUB_NONE; 1748 spa->spa_scrub_active = 0; 1749 spa->spa_scrub_thread = NULL; 1750 cv_broadcast(&spa->spa_scrub_cv); 1751 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1752 thread_exit(); 1753 } 1754 1755 void 1756 spa_scrub_suspend(spa_t *spa) 1757 { 1758 mutex_enter(&spa->spa_scrub_lock); 1759 spa->spa_scrub_suspended++; 1760 while (spa->spa_scrub_active) { 1761 cv_broadcast(&spa->spa_scrub_cv); 1762 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1763 } 1764 while (spa->spa_scrub_inflight) 1765 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1766 mutex_exit(&spa->spa_scrub_lock); 1767 } 1768 1769 void 1770 spa_scrub_resume(spa_t *spa) 1771 { 1772 mutex_enter(&spa->spa_scrub_lock); 1773 ASSERT(spa->spa_scrub_suspended != 0); 1774 if (--spa->spa_scrub_suspended == 0) 1775 cv_broadcast(&spa->spa_scrub_cv); 1776 mutex_exit(&spa->spa_scrub_lock); 1777 } 1778 1779 void 1780 spa_scrub_restart(spa_t *spa, uint64_t txg) 1781 { 1782 /* 1783 * Something happened (e.g. snapshot create/delete) that means 1784 * we must restart any in-progress scrubs. The itinerary will 1785 * fix this properly. 1786 */ 1787 mutex_enter(&spa->spa_scrub_lock); 1788 spa->spa_scrub_restart_txg = txg; 1789 mutex_exit(&spa->spa_scrub_lock); 1790 } 1791 1792 int 1793 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1794 { 1795 space_seg_t *ss; 1796 uint64_t mintxg, maxtxg; 1797 vdev_t *rvd = spa->spa_root_vdev; 1798 int advance = ADVANCE_PRE | ADVANCE_ZIL; 1799 1800 if ((uint_t)type >= POOL_SCRUB_TYPES) 1801 return (ENOTSUP); 1802 1803 mutex_enter(&spa->spa_scrub_lock); 1804 1805 /* 1806 * If there's a scrub or resilver already in progress, stop it. 1807 */ 1808 while (spa->spa_scrub_thread != NULL) { 1809 /* 1810 * Don't stop a resilver unless forced. 1811 */ 1812 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 1813 mutex_exit(&spa->spa_scrub_lock); 1814 return (EBUSY); 1815 } 1816 spa->spa_scrub_stop = 1; 1817 cv_broadcast(&spa->spa_scrub_cv); 1818 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1819 } 1820 1821 /* 1822 * Terminate the previous traverse. 1823 */ 1824 if (spa->spa_scrub_th != NULL) { 1825 traverse_fini(spa->spa_scrub_th); 1826 spa->spa_scrub_th = NULL; 1827 } 1828 1829 if (rvd == NULL) { 1830 ASSERT(spa->spa_scrub_stop == 0); 1831 ASSERT(spa->spa_scrub_type == type); 1832 ASSERT(spa->spa_scrub_restart_txg == 0); 1833 mutex_exit(&spa->spa_scrub_lock); 1834 return (0); 1835 } 1836 1837 mintxg = TXG_INITIAL - 1; 1838 maxtxg = spa_last_synced_txg(spa) + 1; 1839 1840 mutex_enter(&rvd->vdev_dtl_lock); 1841 1842 if (rvd->vdev_dtl_map.sm_space == 0) { 1843 /* 1844 * The pool-wide DTL is empty. 1845 * If this is a resilver, there's nothing to do. 1846 */ 1847 if (type == POOL_SCRUB_RESILVER) 1848 type = POOL_SCRUB_NONE; 1849 } else { 1850 /* 1851 * The pool-wide DTL is non-empty. 1852 * If this is a normal scrub, upgrade to a resilver instead. 1853 */ 1854 if (type == POOL_SCRUB_EVERYTHING) 1855 type = POOL_SCRUB_RESILVER; 1856 } 1857 1858 if (type == POOL_SCRUB_RESILVER) { 1859 /* 1860 * Determine the resilvering boundaries. 1861 * 1862 * Note: (mintxg, maxtxg) is an open interval, 1863 * i.e. mintxg and maxtxg themselves are not included. 1864 * 1865 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1866 * so we don't claim to resilver a txg that's still changing. 1867 */ 1868 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1869 mintxg = ss->ss_start - 1; 1870 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1871 maxtxg = MIN(ss->ss_end, maxtxg); 1872 1873 advance |= ADVANCE_PRUNE; 1874 } 1875 1876 mutex_exit(&rvd->vdev_dtl_lock); 1877 1878 spa->spa_scrub_stop = 0; 1879 spa->spa_scrub_type = type; 1880 spa->spa_scrub_restart_txg = 0; 1881 1882 if (type != POOL_SCRUB_NONE) { 1883 spa->spa_scrub_mintxg = mintxg; 1884 spa->spa_scrub_maxtxg = maxtxg; 1885 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1886 advance, ZIO_FLAG_CANFAIL); 1887 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1888 spa->spa_scrub_thread = thread_create(NULL, 0, 1889 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1890 } 1891 1892 mutex_exit(&spa->spa_scrub_lock); 1893 1894 return (0); 1895 } 1896 1897 /* 1898 * ========================================================================== 1899 * SPA async task processing 1900 * ========================================================================== 1901 */ 1902 1903 static void 1904 spa_async_reopen(spa_t *spa) 1905 { 1906 vdev_t *rvd = spa->spa_root_vdev; 1907 vdev_t *tvd; 1908 int c; 1909 1910 spa_config_enter(spa, RW_WRITER, FTAG); 1911 1912 for (c = 0; c < rvd->vdev_children; c++) { 1913 tvd = rvd->vdev_child[c]; 1914 if (tvd->vdev_reopen_wanted) { 1915 tvd->vdev_reopen_wanted = 0; 1916 vdev_reopen(tvd); 1917 } 1918 } 1919 1920 spa_config_exit(spa, FTAG); 1921 } 1922 1923 static void 1924 spa_async_thread(spa_t *spa) 1925 { 1926 int tasks; 1927 1928 ASSERT(spa->spa_sync_on); 1929 1930 mutex_enter(&spa->spa_async_lock); 1931 tasks = spa->spa_async_tasks; 1932 spa->spa_async_tasks = 0; 1933 mutex_exit(&spa->spa_async_lock); 1934 1935 /* 1936 * See if any devices need to be reopened. 1937 */ 1938 if (tasks & SPA_ASYNC_REOPEN) 1939 spa_async_reopen(spa); 1940 1941 /* 1942 * If any devices are done replacing, detach them. 1943 */ 1944 if (tasks & SPA_ASYNC_REPLACE_DONE) 1945 spa_vdev_replace_done(spa); 1946 1947 /* 1948 * Kick off a scrub. 1949 */ 1950 if (tasks & SPA_ASYNC_SCRUB) 1951 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 1952 1953 /* 1954 * Kick off a resilver. 1955 */ 1956 if (tasks & SPA_ASYNC_RESILVER) 1957 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1958 1959 /* 1960 * Let the world know that we're done. 1961 */ 1962 mutex_enter(&spa->spa_async_lock); 1963 spa->spa_async_thread = NULL; 1964 cv_broadcast(&spa->spa_async_cv); 1965 mutex_exit(&spa->spa_async_lock); 1966 thread_exit(); 1967 } 1968 1969 void 1970 spa_async_suspend(spa_t *spa) 1971 { 1972 mutex_enter(&spa->spa_async_lock); 1973 spa->spa_async_suspended++; 1974 while (spa->spa_async_thread != NULL) 1975 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 1976 mutex_exit(&spa->spa_async_lock); 1977 } 1978 1979 void 1980 spa_async_resume(spa_t *spa) 1981 { 1982 mutex_enter(&spa->spa_async_lock); 1983 ASSERT(spa->spa_async_suspended != 0); 1984 spa->spa_async_suspended--; 1985 mutex_exit(&spa->spa_async_lock); 1986 } 1987 1988 static void 1989 spa_async_dispatch(spa_t *spa) 1990 { 1991 mutex_enter(&spa->spa_async_lock); 1992 if (spa->spa_async_tasks && !spa->spa_async_suspended && 1993 spa->spa_async_thread == NULL) 1994 spa->spa_async_thread = thread_create(NULL, 0, 1995 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 1996 mutex_exit(&spa->spa_async_lock); 1997 } 1998 1999 void 2000 spa_async_request(spa_t *spa, int task) 2001 { 2002 mutex_enter(&spa->spa_async_lock); 2003 spa->spa_async_tasks |= task; 2004 mutex_exit(&spa->spa_async_lock); 2005 } 2006 2007 /* 2008 * ========================================================================== 2009 * SPA syncing routines 2010 * ========================================================================== 2011 */ 2012 2013 static void 2014 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2015 { 2016 bplist_t *bpl = &spa->spa_sync_bplist; 2017 dmu_tx_t *tx; 2018 blkptr_t blk; 2019 uint64_t itor = 0; 2020 zio_t *zio; 2021 int error; 2022 uint8_t c = 1; 2023 2024 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2025 2026 while (bplist_iterate(bpl, &itor, &blk) == 0) 2027 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2028 2029 error = zio_wait(zio); 2030 ASSERT3U(error, ==, 0); 2031 2032 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2033 bplist_vacate(bpl, tx); 2034 2035 /* 2036 * Pre-dirty the first block so we sync to convergence faster. 2037 * (Usually only the first block is needed.) 2038 */ 2039 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2040 dmu_tx_commit(tx); 2041 } 2042 2043 static void 2044 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2045 { 2046 nvlist_t *config; 2047 char *packed = NULL; 2048 size_t nvsize = 0; 2049 dmu_buf_t *db; 2050 2051 if (list_is_empty(&spa->spa_dirty_list)) 2052 return; 2053 2054 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2055 2056 spa_config_set(spa, config); 2057 2058 VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 2059 2060 packed = kmem_alloc(nvsize, KM_SLEEP); 2061 2062 VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 2063 KM_SLEEP) == 0); 2064 2065 dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 2066 packed, tx); 2067 2068 kmem_free(packed, nvsize); 2069 2070 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 2071 spa->spa_config_object, FTAG, &db)); 2072 dmu_buf_will_dirty(db, tx); 2073 *(uint64_t *)db->db_data = nvsize; 2074 dmu_buf_rele(db, FTAG); 2075 } 2076 2077 /* 2078 * Sync the specified transaction group. New blocks may be dirtied as 2079 * part of the process, so we iterate until it converges. 2080 */ 2081 void 2082 spa_sync(spa_t *spa, uint64_t txg) 2083 { 2084 dsl_pool_t *dp = spa->spa_dsl_pool; 2085 objset_t *mos = spa->spa_meta_objset; 2086 bplist_t *bpl = &spa->spa_sync_bplist; 2087 vdev_t *vd; 2088 dmu_tx_t *tx; 2089 int dirty_vdevs; 2090 2091 /* 2092 * Lock out configuration changes. 2093 */ 2094 spa_config_enter(spa, RW_READER, FTAG); 2095 2096 spa->spa_syncing_txg = txg; 2097 spa->spa_sync_pass = 0; 2098 2099 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2100 2101 /* 2102 * If anything has changed in this txg, push the deferred frees 2103 * from the previous txg. If not, leave them alone so that we 2104 * don't generate work on an otherwise idle system. 2105 */ 2106 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2107 !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2108 spa_sync_deferred_frees(spa, txg); 2109 2110 /* 2111 * Iterate to convergence. 2112 */ 2113 do { 2114 spa->spa_sync_pass++; 2115 2116 tx = dmu_tx_create_assigned(dp, txg); 2117 spa_sync_config_object(spa, tx); 2118 dmu_tx_commit(tx); 2119 2120 spa_errlog_sync(spa, txg); 2121 2122 dsl_pool_sync(dp, txg); 2123 2124 dirty_vdevs = 0; 2125 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2126 vdev_sync(vd, txg); 2127 dirty_vdevs++; 2128 } 2129 2130 tx = dmu_tx_create_assigned(dp, txg); 2131 bplist_sync(bpl, tx); 2132 dmu_tx_commit(tx); 2133 2134 } while (dirty_vdevs); 2135 2136 bplist_close(bpl); 2137 2138 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2139 2140 /* 2141 * Rewrite the vdev configuration (which includes the uberblock) 2142 * to commit the transaction group. 2143 */ 2144 VERIFY(0 == spa_sync_labels(spa, txg)); 2145 2146 /* 2147 * Make a stable copy of the fully synced uberblock. 2148 * We use this as the root for pool traversals. 2149 */ 2150 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2151 2152 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2153 2154 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2155 spa->spa_traverse_wanted = 0; 2156 spa->spa_ubsync = spa->spa_uberblock; 2157 rw_exit(&spa->spa_traverse_lock); 2158 2159 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2160 2161 /* 2162 * Clean up the ZIL records for the synced txg. 2163 */ 2164 dsl_pool_zil_clean(dp); 2165 2166 /* 2167 * Update usable space statistics. 2168 */ 2169 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2170 vdev_sync_done(vd, txg); 2171 2172 /* 2173 * It had better be the case that we didn't dirty anything 2174 * since spa_sync_labels(). 2175 */ 2176 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2177 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2178 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2179 ASSERT(bpl->bpl_queue == NULL); 2180 2181 spa_config_exit(spa, FTAG); 2182 2183 /* 2184 * If any async tasks have been requested, kick them off. 2185 */ 2186 spa_async_dispatch(spa); 2187 } 2188 2189 /* 2190 * Sync all pools. We don't want to hold the namespace lock across these 2191 * operations, so we take a reference on the spa_t and drop the lock during the 2192 * sync. 2193 */ 2194 void 2195 spa_sync_allpools(void) 2196 { 2197 spa_t *spa = NULL; 2198 mutex_enter(&spa_namespace_lock); 2199 while ((spa = spa_next(spa)) != NULL) { 2200 if (spa_state(spa) != POOL_STATE_ACTIVE) 2201 continue; 2202 spa_open_ref(spa, FTAG); 2203 mutex_exit(&spa_namespace_lock); 2204 txg_wait_synced(spa_get_dsl(spa), 0); 2205 mutex_enter(&spa_namespace_lock); 2206 spa_close(spa, FTAG); 2207 } 2208 mutex_exit(&spa_namespace_lock); 2209 } 2210 2211 /* 2212 * ========================================================================== 2213 * Miscellaneous routines 2214 * ========================================================================== 2215 */ 2216 2217 int 2218 spa_busy(void) 2219 { 2220 return (spa_active_count != 0); 2221 } 2222 2223 /* 2224 * Remove all pools in the system. 2225 */ 2226 void 2227 spa_evict_all(void) 2228 { 2229 spa_t *spa; 2230 2231 /* 2232 * Remove all cached state. All pools should be closed now, 2233 * so every spa in the AVL tree should be unreferenced. 2234 */ 2235 mutex_enter(&spa_namespace_lock); 2236 while ((spa = spa_next(NULL)) != NULL) { 2237 /* 2238 * Stop async tasks. The async thread may need to detach 2239 * a device that's been replaced, which requires grabbing 2240 * spa_namespace_lock, so we must drop it here. 2241 */ 2242 spa_open_ref(spa, FTAG); 2243 mutex_exit(&spa_namespace_lock); 2244 spa_async_suspend(spa); 2245 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2246 mutex_enter(&spa_namespace_lock); 2247 spa_close(spa, FTAG); 2248 2249 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2250 spa_unload(spa); 2251 spa_deactivate(spa); 2252 } 2253 spa_remove(spa); 2254 } 2255 mutex_exit(&spa_namespace_lock); 2256 } 2257 2258 vdev_t * 2259 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2260 { 2261 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2262 } 2263