/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

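/*
 * spa_activate() and spa_deactivate() below bracket everything a pool
 * needs while it is in core: the normal metaslab class, one issue and
 * one interrupt taskq per ZIO type, the traverse lock, the dirty-vdev
 * and per-txg vdev lists, and the scrub/last error-entry AVL trees.
 */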
/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

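/*
 * Note that spa_config_parse() cleans up after itself: if any child fails
 * to parse, vdev_free() tears down the partially constructed subtree and
 * NULL is returned, so callers only need to check the top-level result.
 */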
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	spa->spa_async_suspended = 0;
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (rvd == NULL) {
		error = EINVAL;
		goto out;
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db));
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the vdev state for all top level vdevs.  We need to grab the
	 * config lock because all label I/O is done with the
	 * ZIO_FLAG_CONFIG_HELD flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_load(rvd);
	spa_config_exit(spa, FTAG);

	if (error)
		goto out;

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

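	/*
	 * The remaining steps write to the pool (intent-log claims, the
	 * first txg sync, and any config cache update), so they are
	 * skipped for read-only opens and for SPA_LOAD_TRYIMPORT.
	 */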
	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL)
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(spa->spa_root_vdev == rvd);

	if (rvd == NULL) {
		error = EINVAL;
	} else {
		if ((error = vdev_create(rvd, txg)) == 0) {
			for (c = 0; c < rvd->vdev_children; c++)
				vdev_init(rvd->vdev_child[c], txg);
			vdev_config_dirty(rvd);
		}
	}

	spa_config_exit(spa, FTAG);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 * vdev_metaslab_init() can't fail because the existing metaslabs
	 * are already in core, so there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

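/*
 * The replace-done machinery below reuses spa_vdev_detach(): once the new
 * half of a 'replacing' vdev has an empty DTL, the async thread detaches
 * the old half (replace_done == B_TRUE), and the single-child replacing
 * vdev is then collapsed by vdev_remove_parent() above.
 */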
/*
 * Find any device that's done replacing, so we can detach it.
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	uint64_t guid;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

void
spa_scrub_throttle(spa_t *spa, int direction)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_throttled += direction;
	ASSERT(spa->spa_scrub_throttled >= 0);
	if (spa->spa_scrub_throttled == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
	if (--spa->spa_scrub_inflight == 0) {
		cv_broadcast(&spa->spa_scrub_io_cv);
		ASSERT(spa->spa_scrub_throttled == 0);
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

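	/*
	 * The SPECULATIVE flag set above keeps spa_scrub_io_done() from
	 * counting read failures against intent log blocks as scrub
	 * errors; such blocks may have been freed since the traversal
	 * began, so failures there are expected.
	 */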
	flags |= ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);
	ASSERT(spa->spa_scrub_throttled == 0);

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;

		while (spa->spa_scrub_throttled > 0)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

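/*
 * The on-disk config written above is a packed XDR nvlist; its length is
 * stored in the bonus buffer of the config object so that spa_load() (the
 * !mosconfig path) knows how many bytes to read back and unpack.
 */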
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		spa_errlog_sync(spa, txg);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
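/*
 * Note that spa_sync() is not called directly by consumers of the SPA;
 * it is driven by the pool's txg sync machinery (see txg_sync_start()
 * and txg_sync_stop()), once per transaction group.  Code that needs its
 * changes on stable storage simply waits for the relevant txg to sync,
 * as an illustrative sketch:
 *
 *	txg_wait_synced(spa_get_dsl(spa), txg);
 *
 * spa_sync_allpools() below uses exactly this pattern for every pool.
 */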
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during
 * the sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Find a vdev within this pool's vdev tree by guid.
 */
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

/*
 * Upgrade the pool's on-disk format to the current ZFS_VERSION by bumping
 * the uberblock version and dirtying the vdev config so the change is
 * written out.
 */
void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * pool from a future on-disk version could not have been opened
	 * in the first place, the current version should never exceed
	 * ZFS_VERSION.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);
}