/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

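/*
 * Example (illustrative, not part of the original code): a typical consumer
 * snapshots both error trees while holding spa_errlist_lock, drops the lock,
 * and then owns the private copies it was handed:
 *
 *	avl_tree_t last, scrub;
 *
 *	mutex_enter(&spa->spa_errlist_lock);
 *	spa_get_errlists(spa, &last, &scrub);
 *	mutex_exit(&spa->spa_errlist_lock);
 *
 *	... walk the trees, free each entry, then avl_destroy() both ...
 */
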
/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

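/*
 * Note (illustrative): the nvlist handed to spa_config_parse() is a vdev
 * tree in which interior vdevs carry a ZPOOL_CONFIG_CHILDREN array of nested
 * vdev nvlists, roughly:
 *
 *	root
 *	    children[0]: mirror
 *	        children[0]: disk (path=/dev/dsk/...)
 *	        children[1]: disk (path=/dev/dsk/...)
 *
 * The per-vdev nvpairs themselves (type, path, guid, ...) are validated by
 * vdev_alloc(), not here.
 */
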
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	spa->spa_async_suspended = 0;
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (rvd == NULL) {
		error = EINVAL;
		goto out;
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		error = EBADF;
		goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

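	/*
	 * Note: on the first pass of spa_load() the config comes from the
	 * cache file (or from userland for an import), so mosconfig is false.
	 * In that case we read the authoritative packed nvlist stored in the
	 * MOS config object below and restart spa_load() with it, this time
	 * with mosconfig set to B_TRUE.
	 */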
	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db));
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

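	/*
	 * Note: the block below only runs for writable opens that are not
	 * TRYIMPORTs.  dmu_objset_find() walks every dataset in the pool and
	 * applies zil_claim() to each, so that any intent-log blocks written
	 * before the last crash are claimed in a single txg before the pool
	 * starts syncing.
	 */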
	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable
		 * yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and
	 * ends up calling spa_open() again.  The real fix is to figure out how
	 * to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

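/*
 * Illustrative usage (not part of the original code): spa_open() takes a
 * reference on the pool which the caller must drop with spa_close() using
 * the same tag, e.g.:
 *
 *	spa_t *spa;
 *	int err;
 *
 *	if ((err = spa_open("tank", &spa, FTAG)) != 0)
 *		return (err);
 *	... use spa ...
 *	spa_close(spa, FTAG);
 */
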
/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL)
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(spa->spa_root_vdev == rvd);

	if (rvd == NULL) {
		error = EINVAL;
	} else {
		if ((error = vdev_create(rvd, txg)) == 0) {
			for (c = 0; c < rvd->vdev_children; c++)
				vdev_init(rvd->vdev_child[c], txg);
			vdev_config_dirty(rvd);
		}
	}

	spa_config_exit(spa, FTAG);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

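/*
 * Note: the nvlist returned by spa_tryimport() is freshly generated by
 * spa_config_generate(); the caller is expected to free it with
 * nvlist_free() once it is done with it.
 */
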
/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}

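/*
 * Note: the three wrappers above differ only in the new_state they pass to
 * spa_export_common(): POOL_STATE_DESTROYED for spa_destroy(),
 * POOL_STATE_EXPORTED for spa_export(), and POOL_STATE_UNINITIALIZED for
 * spa_reset(), which keeps the pool in the namespace so it can be reloaded
 * (used by fault injection).
 */
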

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

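/*
 * Note: when 'replacing' is set, the resulting replacing vdev is temporary.
 * Once the resilver finishes and the new child's DTLs are empty,
 * spa_vdev_replace_done() (driven by the SPA_ASYNC_REPLACE_DONE task)
 * detaches the old device automatically.
 */
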
/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 * vdev_metaslab_init() can't fail because the existing metaslabs
	 * are already in core, so there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Find any device that's done replacing, so we can detach it.
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	uint64_t guid;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

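/*
 * Note: the scrub code below throttles itself via two counters protected by
 * spa_scrub_lock: spa_scrub_inflight counts outstanding scrub/resilver reads
 * issued by spa_scrub_io_start(), and spa_scrub_throttled is raised and
 * lowered by spa_scrub_throttle().  The scrub thread waits on
 * spa_scrub_io_cv whenever it is throttled or needs all in-flight I/O to
 * drain.
 */
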
/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

void
spa_scrub_throttle(spa_t *spa, int direction)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_throttled += direction;
	ASSERT(spa->spa_scrub_throttled >= 0);
	if (spa->spa_scrub_throttled == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
	if (--spa->spa_scrub_inflight == 0) {
		cv_broadcast(&spa->spa_scrub_io_cv);
		ASSERT(spa->spa_scrub_throttled == 0);
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);
	ASSERT(spa->spa_scrub_throttled == 0);

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;

		while (spa->spa_scrub_throttled > 0)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error
	 * during a resilver, we won't resilver the data properly to the
	 * target.  But if the damage is permanent (more likely) we will
	 * resilver forever, which isn't really acceptable.  Since there
	 * is enough information for the user to know what has failed and
	 * why, this seems like a more tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

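/*
 * Note: spa_scrub(POOL_SCRUB_EVERYTHING, ...) starts a full scrub (or
 * upgrades itself to a resilver when the pool-wide DTL is non-empty),
 * POOL_SCRUB_RESILVER resilvers only what the DTLs say is missing, and
 * POOL_SCRUB_NONE simply stops whatever is running.  A resilver already in
 * progress is only stopped when 'force' is set; otherwise the call returns
 * EBUSY.
 */
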
/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

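/*
 * Illustrative usage (not part of the original code): callers simply set a
 * task bit and let the async machinery pick it up later, e.g.
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER);
 *
 * The request is serviced by spa_async_thread() the next time
 * spa_async_dispatch() runs, which spa_sync() does at the end of every txg.
 */
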
/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

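/*
 * Note: the on-disk format written above is simply the config nvlist packed
 * with NV_ENCODE_XDR into the pool config object, with the packed size
 * stored in the object's bonus buffer.  spa_load() reads it back the same
 * way: dmu_bonus_hold() for the size, then dmu_read() and nvlist_unpack().
 */
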
/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		spa_errlog_sync(spa, txg);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
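	/*
	 * A hedged note on the handshake above: consumers of spa_ubsync
	 * (pool traversals such as the scrub) are expected to hold
	 * spa_traverse_lock as a reader for the duration of their walk,
	 * roughly:
	 *
	 *	rw_enter(&spa->spa_traverse_lock, RW_READER);
	 *	... traverse the pool rooted at &spa->spa_ubsync ...
	 *	rw_exit(&spa->spa_traverse_lock);
	 *
	 * Setting spa_traverse_wanted and suspending the scrub first asks
	 * those readers to drop the lock, so this writer isn't stalled
	 * behind a long-running traversal.
	 */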
	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a pool
	 * from a future version would never have been opened in the first
	 * place, the on-disk version should never exceed ZFS_VERSION here.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);
}
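/*
 * Usage sketch, hedged: a hypothetical wrapper, not the actual ioctl path.
 * spa_upgrade() only updates the in-core uberblock and dirties the config;
 * the new version reaches disk when the current txg syncs and spa_sync()
 * rewrites the vdev labels, so a caller that needs the upgrade to be
 * durable would wait for that explicitly.
 */
static void
spa_example_upgrade_wait(spa_t *spa)
{
	spa_upgrade(spa);
	txg_wait_synced(spa_get_dsl(spa), 0);	/* push the dirtied config out */
}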