1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * This file contains all the routines used when modifying on-disk SPA state. 30 * This includes opening, importing, destroying, exporting a pool, and syncing a 31 * pool. 32 */ 33 34 #include <sys/zfs_context.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/spa_impl.h> 37 #include <sys/zio.h> 38 #include <sys/zio_checksum.h> 39 #include <sys/zio_compress.h> 40 #include <sys/dmu.h> 41 #include <sys/dmu_tx.h> 42 #include <sys/zap.h> 43 #include <sys/zil.h> 44 #include <sys/vdev_impl.h> 45 #include <sys/metaslab.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/txg.h> 48 #include <sys/avl.h> 49 #include <sys/dmu_traverse.h> 50 #include <sys/unique.h> 51 #include <sys/dsl_pool.h> 52 #include <sys/dsl_dir.h> 53 #include <sys/dsl_prop.h> 54 #include <sys/fs/zfs.h> 55 #include <sys/callb.h> 56 57 /* 58 * ========================================================================== 59 * SPA state manipulation (open/create/destroy/import/export) 60 * ========================================================================== 61 */ 62 63 static int 64 spa_error_entry_compare(const void *a, const void *b) 65 { 66 spa_error_entry_t *sa = (spa_error_entry_t *)a; 67 spa_error_entry_t *sb = (spa_error_entry_t *)b; 68 int ret; 69 70 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 71 sizeof (zbookmark_t)); 72 73 if (ret < 0) 74 return (-1); 75 else if (ret > 0) 76 return (1); 77 else 78 return (0); 79 } 80 81 /* 82 * Utility function which retrieves copies of the current logs and 83 * re-initializes them in the process. 84 */ 85 void 86 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 87 { 88 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 89 90 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 91 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 92 93 avl_create(&spa->spa_errlist_scrub, 94 spa_error_entry_compare, sizeof (spa_error_entry_t), 95 offsetof(spa_error_entry_t, se_avl)); 96 avl_create(&spa->spa_errlist_last, 97 spa_error_entry_compare, sizeof (spa_error_entry_t), 98 offsetof(spa_error_entry_t, se_avl)); 99 } 100 101 /* 102 * Activate an uninitialized pool. 103 */ 104 static void 105 spa_activate(spa_t *spa) 106 { 107 int t; 108 109 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 110 111 spa->spa_state = POOL_STATE_ACTIVE; 112 113 spa->spa_normal_class = metaslab_class_create(); 114 115 for (t = 0; t < ZIO_TYPES; t++) { 116 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 117 8, maxclsyspri, 50, INT_MAX, 118 TASKQ_PREPOPULATE); 119 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 120 8, maxclsyspri, 50, INT_MAX, 121 TASKQ_PREPOPULATE); 122 } 123 124 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 125 126 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 127 offsetof(vdev_t, vdev_dirty_node)); 128 129 txg_list_create(&spa->spa_vdev_txg_list, 130 offsetof(struct vdev, vdev_txg_node)); 131 132 avl_create(&spa->spa_errlist_scrub, 133 spa_error_entry_compare, sizeof (spa_error_entry_t), 134 offsetof(spa_error_entry_t, se_avl)); 135 avl_create(&spa->spa_errlist_last, 136 spa_error_entry_compare, sizeof (spa_error_entry_t), 137 offsetof(spa_error_entry_t, se_avl)); 138 } 139 140 /* 141 * Opposite of spa_activate(). 142 */ 143 static void 144 spa_deactivate(spa_t *spa) 145 { 146 int t; 147 148 ASSERT(spa->spa_sync_on == B_FALSE); 149 ASSERT(spa->spa_dsl_pool == NULL); 150 ASSERT(spa->spa_root_vdev == NULL); 151 152 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 153 154 txg_list_destroy(&spa->spa_vdev_txg_list); 155 156 list_destroy(&spa->spa_dirty_list); 157 158 rw_destroy(&spa->spa_traverse_lock); 159 160 for (t = 0; t < ZIO_TYPES; t++) { 161 taskq_destroy(spa->spa_zio_issue_taskq[t]); 162 taskq_destroy(spa->spa_zio_intr_taskq[t]); 163 spa->spa_zio_issue_taskq[t] = NULL; 164 spa->spa_zio_intr_taskq[t] = NULL; 165 } 166 167 metaslab_class_destroy(spa->spa_normal_class); 168 spa->spa_normal_class = NULL; 169 170 /* 171 * If this was part of an import or the open otherwise failed, we may 172 * still have errors left in the queues. Empty them just in case. 173 */ 174 spa_errlog_drain(spa); 175 176 avl_destroy(&spa->spa_errlist_scrub); 177 avl_destroy(&spa->spa_errlist_last); 178 179 spa->spa_state = POOL_STATE_UNINITIALIZED; 180 } 181 182 /* 183 * Verify a pool configuration, and construct the vdev tree appropriately. This 184 * will create all the necessary vdevs in the appropriate layout, with each vdev 185 * in the CLOSED state. This will prep the pool before open/creation/import. 186 * All vdev validation is done by the vdev_alloc() routine. 187 */ 188 static vdev_t * 189 spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 190 { 191 nvlist_t **child; 192 uint_t c, children; 193 vdev_t *vd; 194 195 if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 196 return (NULL); 197 198 if (vd->vdev_ops->vdev_op_leaf) 199 return (vd); 200 201 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 202 &child, &children) != 0) { 203 vdev_free(vd); 204 return (NULL); 205 } 206 207 for (c = 0; c < children; c++) { 208 if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 209 vdev_free(vd); 210 return (NULL); 211 } 212 } 213 214 return (vd); 215 } 216 217 /* 218 * Opposite of spa_load(). 219 */ 220 static void 221 spa_unload(spa_t *spa) 222 { 223 /* 224 * Stop async tasks. 225 */ 226 spa_async_suspend(spa); 227 228 /* 229 * Stop syncing. 230 */ 231 if (spa->spa_sync_on) { 232 txg_sync_stop(spa->spa_dsl_pool); 233 spa->spa_sync_on = B_FALSE; 234 } 235 236 /* 237 * Wait for any outstanding prefetch I/O to complete. 238 */ 239 spa_config_enter(spa, RW_WRITER, FTAG); 240 spa_config_exit(spa, FTAG); 241 242 /* 243 * Close the dsl pool. 244 */ 245 if (spa->spa_dsl_pool) { 246 dsl_pool_close(spa->spa_dsl_pool); 247 spa->spa_dsl_pool = NULL; 248 } 249 250 /* 251 * Close all vdevs. 252 */ 253 if (spa->spa_root_vdev) 254 vdev_free(spa->spa_root_vdev); 255 ASSERT(spa->spa_root_vdev == NULL); 256 257 spa->spa_async_suspended = 0; 258 } 259 260 /* 261 * Load an existing storage pool, using the pool's builtin spa_config as a 262 * source of configuration information. 263 */ 264 static int 265 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 266 { 267 int error = 0; 268 nvlist_t *nvroot = NULL; 269 vdev_t *rvd; 270 uberblock_t *ub = &spa->spa_uberblock; 271 uint64_t config_cache_txg = spa->spa_config_txg; 272 uint64_t pool_guid; 273 zio_t *zio; 274 275 spa->spa_load_state = state; 276 277 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 278 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 279 error = EINVAL; 280 goto out; 281 } 282 283 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 284 &spa->spa_config_txg); 285 286 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 287 spa_guid_exists(pool_guid, 0)) { 288 error = EEXIST; 289 goto out; 290 } 291 292 /* 293 * Parse the configuration into a vdev tree. 294 */ 295 spa_config_enter(spa, RW_WRITER, FTAG); 296 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 297 spa_config_exit(spa, FTAG); 298 299 if (rvd == NULL) { 300 error = EINVAL; 301 goto out; 302 } 303 304 ASSERT(spa->spa_root_vdev == rvd); 305 ASSERT(spa_guid(spa) == pool_guid); 306 307 /* 308 * Try to open all vdevs, loading each label in the process. 309 */ 310 if (vdev_open(rvd) != 0) { 311 error = ENXIO; 312 goto out; 313 } 314 315 /* 316 * Find the best uberblock. 317 */ 318 bzero(ub, sizeof (uberblock_t)); 319 320 zio = zio_root(spa, NULL, NULL, 321 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 322 vdev_uberblock_load(zio, rvd, ub); 323 error = zio_wait(zio); 324 325 /* 326 * If we weren't able to find a single valid uberblock, return failure. 327 */ 328 if (ub->ub_txg == 0) { 329 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 330 VDEV_AUX_CORRUPT_DATA); 331 error = ENXIO; 332 goto out; 333 } 334 335 /* 336 * If the pool is newer than the code, we can't open it. 337 */ 338 if (ub->ub_version > ZFS_VERSION) { 339 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 340 VDEV_AUX_VERSION_NEWER); 341 error = ENOTSUP; 342 goto out; 343 } 344 345 /* 346 * If the vdev guid sum doesn't match the uberblock, we have an 347 * incomplete configuration. 348 */ 349 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 350 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 351 VDEV_AUX_BAD_GUID_SUM); 352 error = ENXIO; 353 goto out; 354 } 355 356 /* 357 * Initialize internal SPA structures. 358 */ 359 spa->spa_state = POOL_STATE_ACTIVE; 360 spa->spa_ubsync = spa->spa_uberblock; 361 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 362 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 363 if (error) { 364 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 365 VDEV_AUX_CORRUPT_DATA); 366 goto out; 367 } 368 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 369 370 if (zap_lookup(spa->spa_meta_objset, 371 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 372 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 373 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 374 VDEV_AUX_CORRUPT_DATA); 375 error = EIO; 376 goto out; 377 } 378 379 if (!mosconfig) { 380 dmu_buf_t *db; 381 char *packed = NULL; 382 size_t nvsize = 0; 383 nvlist_t *newconfig = NULL; 384 385 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 386 spa->spa_config_object, FTAG, &db)); 387 nvsize = *(uint64_t *)db->db_data; 388 dmu_buf_rele(db, FTAG); 389 390 packed = kmem_alloc(nvsize, KM_SLEEP); 391 error = dmu_read(spa->spa_meta_objset, 392 spa->spa_config_object, 0, nvsize, packed); 393 if (error == 0) 394 error = nvlist_unpack(packed, nvsize, &newconfig, 0); 395 kmem_free(packed, nvsize); 396 397 if (error) { 398 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 399 VDEV_AUX_CORRUPT_DATA); 400 error = EIO; 401 goto out; 402 } 403 404 spa_config_set(spa, newconfig); 405 406 spa_unload(spa); 407 spa_deactivate(spa); 408 spa_activate(spa); 409 410 return (spa_load(spa, newconfig, state, B_TRUE)); 411 } 412 413 if (zap_lookup(spa->spa_meta_objset, 414 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 415 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 416 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 417 VDEV_AUX_CORRUPT_DATA); 418 error = EIO; 419 goto out; 420 } 421 422 /* 423 * Load the persistent error log. If we have an older pool, this will 424 * not be present. 425 */ 426 error = zap_lookup(spa->spa_meta_objset, 427 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 428 sizeof (uint64_t), 1, &spa->spa_errlog_last); 429 if (error != 0 &&error != ENOENT) { 430 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 431 VDEV_AUX_CORRUPT_DATA); 432 error = EIO; 433 goto out; 434 } 435 436 error = zap_lookup(spa->spa_meta_objset, 437 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 438 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 439 if (error != 0 && error != ENOENT) { 440 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 441 VDEV_AUX_CORRUPT_DATA); 442 error = EIO; 443 goto out; 444 } 445 446 /* 447 * Load the vdev state for all top level vdevs. We need to grab the 448 * config lock because all label I/O is done with the 449 * ZIO_FLAG_CONFIG_HELD flag. 450 */ 451 spa_config_enter(spa, RW_READER, FTAG); 452 error = vdev_load(rvd); 453 spa_config_exit(spa, FTAG); 454 455 if (error) 456 goto out; 457 458 /* 459 * Propagate the leaf DTLs we just loaded all the way up the tree. 460 */ 461 spa_config_enter(spa, RW_WRITER, FTAG); 462 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 463 spa_config_exit(spa, FTAG); 464 465 /* 466 * Check the state of the root vdev. If it can't be opened, it 467 * indicates one or more toplevel vdevs are faulted. 468 */ 469 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 470 error = ENXIO; 471 goto out; 472 } 473 474 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 475 dmu_tx_t *tx; 476 int need_update = B_FALSE; 477 int c; 478 479 /* 480 * Claim log blocks that haven't been committed yet. 481 * This must all happen in a single txg. 482 */ 483 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 484 spa_first_txg(spa)); 485 dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 486 dmu_tx_commit(tx); 487 488 spa->spa_sync_on = B_TRUE; 489 txg_sync_start(spa->spa_dsl_pool); 490 491 /* 492 * Wait for all claims to sync. 493 */ 494 txg_wait_synced(spa->spa_dsl_pool, 0); 495 496 /* 497 * If the config cache is stale, or we have uninitialized 498 * metaslabs (see spa_vdev_add()), then update the config. 499 */ 500 if (config_cache_txg != spa->spa_config_txg || 501 state == SPA_LOAD_IMPORT) 502 need_update = B_TRUE; 503 504 for (c = 0; c < rvd->vdev_children; c++) 505 if (rvd->vdev_child[c]->vdev_ms_array == 0) 506 need_update = B_TRUE; 507 508 /* 509 * Update the config cache asychronously in case we're the 510 * root pool, in which case the config cache isn't writable yet. 511 */ 512 if (need_update) 513 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 514 } 515 516 error = 0; 517 out: 518 if (error) 519 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 520 spa->spa_load_state = SPA_LOAD_NONE; 521 spa->spa_ena = 0; 522 523 return (error); 524 } 525 526 /* 527 * Pool Open/Import 528 * 529 * The import case is identical to an open except that the configuration is sent 530 * down from userland, instead of grabbed from the configuration cache. For the 531 * case of an open, the pool configuration will exist in the 532 * POOL_STATE_UNITIALIZED state. 533 * 534 * The stats information (gen/count/ustats) is used to gather vdev statistics at 535 * the same time open the pool, without having to keep around the spa_t in some 536 * ambiguous state. 537 */ 538 static int 539 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 540 { 541 spa_t *spa; 542 int error; 543 int loaded = B_FALSE; 544 int locked = B_FALSE; 545 546 *spapp = NULL; 547 548 /* 549 * As disgusting as this is, we need to support recursive calls to this 550 * function because dsl_dir_open() is called during spa_load(), and ends 551 * up calling spa_open() again. The real fix is to figure out how to 552 * avoid dsl_dir_open() calling this in the first place. 553 */ 554 if (mutex_owner(&spa_namespace_lock) != curthread) { 555 mutex_enter(&spa_namespace_lock); 556 locked = B_TRUE; 557 } 558 559 if ((spa = spa_lookup(pool)) == NULL) { 560 if (locked) 561 mutex_exit(&spa_namespace_lock); 562 return (ENOENT); 563 } 564 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 565 566 spa_activate(spa); 567 568 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 569 570 if (error == EBADF) { 571 /* 572 * If vdev_load() returns EBADF, it indicates that one 573 * of the vdevs indicates that the pool has been 574 * exported or destroyed. If this is the case, the 575 * config cache is out of sync and we should remove the 576 * pool from the namespace. 577 */ 578 spa_unload(spa); 579 spa_deactivate(spa); 580 spa_remove(spa); 581 spa_config_sync(); 582 if (locked) 583 mutex_exit(&spa_namespace_lock); 584 return (ENOENT); 585 } 586 587 if (error) { 588 /* 589 * We can't open the pool, but we still have useful 590 * information: the state of each vdev after the 591 * attempted vdev_open(). Return this to the user. 592 */ 593 if (config != NULL && spa->spa_root_vdev != NULL) { 594 spa_config_enter(spa, RW_READER, FTAG); 595 *config = spa_config_generate(spa, NULL, -1ULL, 596 B_TRUE); 597 spa_config_exit(spa, FTAG); 598 } 599 spa_unload(spa); 600 spa_deactivate(spa); 601 spa->spa_last_open_failed = B_TRUE; 602 if (locked) 603 mutex_exit(&spa_namespace_lock); 604 *spapp = NULL; 605 return (error); 606 } else { 607 zfs_post_ok(spa, NULL); 608 spa->spa_last_open_failed = B_FALSE; 609 } 610 611 loaded = B_TRUE; 612 } 613 614 spa_open_ref(spa, tag); 615 if (locked) 616 mutex_exit(&spa_namespace_lock); 617 618 *spapp = spa; 619 620 if (config != NULL) { 621 spa_config_enter(spa, RW_READER, FTAG); 622 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 623 spa_config_exit(spa, FTAG); 624 } 625 626 /* 627 * If we just loaded the pool, resilver anything that's out of date. 628 */ 629 if (loaded && (spa_mode & FWRITE)) 630 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 631 632 return (0); 633 } 634 635 int 636 spa_open(const char *name, spa_t **spapp, void *tag) 637 { 638 return (spa_open_common(name, spapp, tag, NULL)); 639 } 640 641 /* 642 * Lookup the given spa_t, incrementing the inject count in the process, 643 * preventing it from being exported or destroyed. 644 */ 645 spa_t * 646 spa_inject_addref(char *name) 647 { 648 spa_t *spa; 649 650 mutex_enter(&spa_namespace_lock); 651 if ((spa = spa_lookup(name)) == NULL) { 652 mutex_exit(&spa_namespace_lock); 653 return (NULL); 654 } 655 spa->spa_inject_ref++; 656 mutex_exit(&spa_namespace_lock); 657 658 return (spa); 659 } 660 661 void 662 spa_inject_delref(spa_t *spa) 663 { 664 mutex_enter(&spa_namespace_lock); 665 spa->spa_inject_ref--; 666 mutex_exit(&spa_namespace_lock); 667 } 668 669 int 670 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 671 { 672 int error; 673 spa_t *spa; 674 675 *config = NULL; 676 error = spa_open_common(name, &spa, FTAG, config); 677 678 if (spa && *config != NULL) 679 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 680 spa_get_errlog_size(spa)) == 0); 681 682 /* 683 * We want to get the alternate root even for faulted pools, so we cheat 684 * and call spa_lookup() directly. 685 */ 686 if (altroot) { 687 if (spa == NULL) { 688 mutex_enter(&spa_namespace_lock); 689 spa = spa_lookup(name); 690 if (spa) 691 spa_altroot(spa, altroot, buflen); 692 else 693 altroot[0] = '\0'; 694 spa = NULL; 695 mutex_exit(&spa_namespace_lock); 696 } else { 697 spa_altroot(spa, altroot, buflen); 698 } 699 } 700 701 if (spa != NULL) 702 spa_close(spa, FTAG); 703 704 return (error); 705 } 706 707 /* 708 * Pool Creation 709 */ 710 int 711 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 712 { 713 spa_t *spa; 714 vdev_t *rvd; 715 dsl_pool_t *dp; 716 dmu_tx_t *tx; 717 int c, error; 718 uint64_t txg = TXG_INITIAL; 719 720 /* 721 * If this pool already exists, return failure. 722 */ 723 mutex_enter(&spa_namespace_lock); 724 if (spa_lookup(pool) != NULL) { 725 mutex_exit(&spa_namespace_lock); 726 return (EEXIST); 727 } 728 729 /* 730 * Allocate a new spa_t structure. 731 */ 732 spa = spa_add(pool, altroot); 733 spa_activate(spa); 734 735 spa->spa_uberblock.ub_txg = txg - 1; 736 spa->spa_uberblock.ub_version = ZFS_VERSION; 737 spa->spa_ubsync = spa->spa_uberblock; 738 739 /* 740 * Create the root vdev. 741 */ 742 spa_config_enter(spa, RW_WRITER, FTAG); 743 744 rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 745 746 ASSERT(spa->spa_root_vdev == rvd); 747 748 if (rvd == NULL) { 749 error = EINVAL; 750 } else { 751 if ((error = vdev_create(rvd, txg)) == 0) { 752 for (c = 0; c < rvd->vdev_children; c++) 753 vdev_init(rvd->vdev_child[c], txg); 754 vdev_config_dirty(rvd); 755 } 756 } 757 758 spa_config_exit(spa, FTAG); 759 760 if (error) { 761 spa_unload(spa); 762 spa_deactivate(spa); 763 spa_remove(spa); 764 mutex_exit(&spa_namespace_lock); 765 return (error); 766 } 767 768 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 769 spa->spa_meta_objset = dp->dp_meta_objset; 770 771 tx = dmu_tx_create_assigned(dp, txg); 772 773 /* 774 * Create the pool config object. 775 */ 776 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 777 DMU_OT_PACKED_NVLIST, 1 << 14, 778 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 779 780 if (zap_add(spa->spa_meta_objset, 781 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 782 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 783 cmn_err(CE_PANIC, "failed to add pool config"); 784 } 785 786 /* 787 * Create the deferred-free bplist object. Turn off compression 788 * because sync-to-convergence takes longer if the blocksize 789 * keeps changing. 790 */ 791 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 792 1 << 14, tx); 793 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 794 ZIO_COMPRESS_OFF, tx); 795 796 if (zap_add(spa->spa_meta_objset, 797 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 798 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 799 cmn_err(CE_PANIC, "failed to add bplist"); 800 } 801 802 dmu_tx_commit(tx); 803 804 spa->spa_sync_on = B_TRUE; 805 txg_sync_start(spa->spa_dsl_pool); 806 807 /* 808 * We explicitly wait for the first transaction to complete so that our 809 * bean counters are appropriately updated. 810 */ 811 txg_wait_synced(spa->spa_dsl_pool, txg); 812 813 spa_config_sync(); 814 815 mutex_exit(&spa_namespace_lock); 816 817 return (0); 818 } 819 820 /* 821 * Import the given pool into the system. We set up the necessary spa_t and 822 * then call spa_load() to do the dirty work. 823 */ 824 int 825 spa_import(const char *pool, nvlist_t *config, const char *altroot) 826 { 827 spa_t *spa; 828 int error; 829 830 if (!(spa_mode & FWRITE)) 831 return (EROFS); 832 833 /* 834 * If a pool with this name exists, return failure. 835 */ 836 mutex_enter(&spa_namespace_lock); 837 if (spa_lookup(pool) != NULL) { 838 mutex_exit(&spa_namespace_lock); 839 return (EEXIST); 840 } 841 842 /* 843 * Create and initialize the spa structure. 844 */ 845 spa = spa_add(pool, altroot); 846 spa_activate(spa); 847 848 /* 849 * Pass off the heavy lifting to spa_load(). 850 * Pass TRUE for mosconfig because the user-supplied config 851 * is actually the one to trust when doing an import. 852 */ 853 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 854 855 if (error) { 856 spa_unload(spa); 857 spa_deactivate(spa); 858 spa_remove(spa); 859 mutex_exit(&spa_namespace_lock); 860 return (error); 861 } 862 863 /* 864 * Update the config cache to include the newly-imported pool. 865 */ 866 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 867 868 mutex_exit(&spa_namespace_lock); 869 870 /* 871 * Resilver anything that's out of date. 872 */ 873 if (spa_mode & FWRITE) 874 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 875 876 return (0); 877 } 878 879 /* 880 * This (illegal) pool name is used when temporarily importing a spa_t in order 881 * to get the vdev stats associated with the imported devices. 882 */ 883 #define TRYIMPORT_NAME "$import" 884 885 nvlist_t * 886 spa_tryimport(nvlist_t *tryconfig) 887 { 888 nvlist_t *config = NULL; 889 char *poolname; 890 spa_t *spa; 891 uint64_t state; 892 893 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 894 return (NULL); 895 896 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 897 return (NULL); 898 899 /* 900 * Create and initialize the spa structure. 901 */ 902 mutex_enter(&spa_namespace_lock); 903 spa = spa_add(TRYIMPORT_NAME, NULL); 904 spa_activate(spa); 905 906 /* 907 * Pass off the heavy lifting to spa_load(). 908 * Pass TRUE for mosconfig because the user-supplied config 909 * is actually the one to trust when doing an import. 910 */ 911 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 912 913 /* 914 * If 'tryconfig' was at least parsable, return the current config. 915 */ 916 if (spa->spa_root_vdev != NULL) { 917 spa_config_enter(spa, RW_READER, FTAG); 918 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 919 spa_config_exit(spa, FTAG); 920 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 921 poolname) == 0); 922 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 923 state) == 0); 924 } 925 926 spa_unload(spa); 927 spa_deactivate(spa); 928 spa_remove(spa); 929 mutex_exit(&spa_namespace_lock); 930 931 return (config); 932 } 933 934 /* 935 * Pool export/destroy 936 * 937 * The act of destroying or exporting a pool is very simple. We make sure there 938 * is no more pending I/O and any references to the pool are gone. Then, we 939 * update the pool state and sync all the labels to disk, removing the 940 * configuration from the cache afterwards. 941 */ 942 static int 943 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 944 { 945 spa_t *spa; 946 947 if (oldconfig) 948 *oldconfig = NULL; 949 950 if (!(spa_mode & FWRITE)) 951 return (EROFS); 952 953 mutex_enter(&spa_namespace_lock); 954 if ((spa = spa_lookup(pool)) == NULL) { 955 mutex_exit(&spa_namespace_lock); 956 return (ENOENT); 957 } 958 959 /* 960 * Put a hold on the pool, drop the namespace lock, stop async tasks, 961 * reacquire the namespace lock, and see if we can export. 962 */ 963 spa_open_ref(spa, FTAG); 964 mutex_exit(&spa_namespace_lock); 965 spa_async_suspend(spa); 966 mutex_enter(&spa_namespace_lock); 967 spa_close(spa, FTAG); 968 969 /* 970 * The pool will be in core if it's openable, 971 * in which case we can modify its state. 972 */ 973 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 974 /* 975 * Objsets may be open only because they're dirty, so we 976 * have to force it to sync before checking spa_refcnt. 977 */ 978 spa_scrub_suspend(spa); 979 txg_wait_synced(spa->spa_dsl_pool, 0); 980 981 /* 982 * A pool cannot be exported or destroyed if there are active 983 * references. If we are resetting a pool, allow references by 984 * fault injection handlers. 985 */ 986 if (!spa_refcount_zero(spa) || 987 (spa->spa_inject_ref != 0 && 988 new_state != POOL_STATE_UNINITIALIZED)) { 989 spa_scrub_resume(spa); 990 spa_async_resume(spa); 991 mutex_exit(&spa_namespace_lock); 992 return (EBUSY); 993 } 994 995 spa_scrub_resume(spa); 996 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 997 998 /* 999 * We want this to be reflected on every label, 1000 * so mark them all dirty. spa_unload() will do the 1001 * final sync that pushes these changes out. 1002 */ 1003 if (new_state != POOL_STATE_UNINITIALIZED) { 1004 spa_config_enter(spa, RW_WRITER, FTAG); 1005 spa->spa_state = new_state; 1006 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1007 vdev_config_dirty(spa->spa_root_vdev); 1008 spa_config_exit(spa, FTAG); 1009 } 1010 } 1011 1012 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1013 spa_unload(spa); 1014 spa_deactivate(spa); 1015 } 1016 1017 if (oldconfig && spa->spa_config) 1018 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1019 1020 if (new_state != POOL_STATE_UNINITIALIZED) { 1021 spa_remove(spa); 1022 spa_config_sync(); 1023 } 1024 mutex_exit(&spa_namespace_lock); 1025 1026 return (0); 1027 } 1028 1029 /* 1030 * Destroy a storage pool. 1031 */ 1032 int 1033 spa_destroy(char *pool) 1034 { 1035 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1036 } 1037 1038 /* 1039 * Export a storage pool. 1040 */ 1041 int 1042 spa_export(char *pool, nvlist_t **oldconfig) 1043 { 1044 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1045 } 1046 1047 /* 1048 * Similar to spa_export(), this unloads the spa_t without actually removing it 1049 * from the namespace in any way. 1050 */ 1051 int 1052 spa_reset(char *pool) 1053 { 1054 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1055 } 1056 1057 1058 /* 1059 * ========================================================================== 1060 * Device manipulation 1061 * ========================================================================== 1062 */ 1063 1064 /* 1065 * Add capacity to a storage pool. 1066 */ 1067 int 1068 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1069 { 1070 uint64_t txg; 1071 int c, error; 1072 vdev_t *rvd = spa->spa_root_vdev; 1073 vdev_t *vd, *tvd; 1074 1075 txg = spa_vdev_enter(spa); 1076 1077 vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1078 1079 if (vd == NULL) 1080 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1081 1082 if ((error = vdev_create(vd, txg)) != 0) 1083 return (spa_vdev_exit(spa, vd, txg, error)); 1084 1085 /* 1086 * Transfer each new top-level vdev from vd to rvd. 1087 */ 1088 for (c = 0; c < vd->vdev_children; c++) { 1089 tvd = vd->vdev_child[c]; 1090 vdev_remove_child(vd, tvd); 1091 tvd->vdev_id = rvd->vdev_children; 1092 vdev_add_child(rvd, tvd); 1093 vdev_config_dirty(tvd); 1094 } 1095 1096 /* 1097 * We have to be careful when adding new vdevs to an existing pool. 1098 * If other threads start allocating from these vdevs before we 1099 * sync the config cache, and we lose power, then upon reboot we may 1100 * fail to open the pool because there are DVAs that the config cache 1101 * can't translate. Therefore, we first add the vdevs without 1102 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1103 * and then let spa_config_update() initialize the new metaslabs. 1104 * 1105 * spa_load() checks for added-but-not-initialized vdevs, so that 1106 * if we lose power at any point in this sequence, the remaining 1107 * steps will be completed the next time we load the pool. 1108 */ 1109 (void) spa_vdev_exit(spa, vd, txg, 0); 1110 1111 mutex_enter(&spa_namespace_lock); 1112 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1113 mutex_exit(&spa_namespace_lock); 1114 1115 return (0); 1116 } 1117 1118 /* 1119 * Attach a device to a mirror. The arguments are the path to any device 1120 * in the mirror, and the nvroot for the new device. If the path specifies 1121 * a device that is not mirrored, we automatically insert the mirror vdev. 1122 * 1123 * If 'replacing' is specified, the new device is intended to replace the 1124 * existing device; in this case the two devices are made into their own 1125 * mirror using the 'replacing' vdev, which is functionally idendical to 1126 * the mirror vdev (it actually reuses all the same ops) but has a few 1127 * extra rules: you can't attach to it after it's been created, and upon 1128 * completion of resilvering, the first disk (the one being replaced) 1129 * is automatically detached. 1130 */ 1131 int 1132 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1133 { 1134 uint64_t txg, open_txg; 1135 int error; 1136 vdev_t *rvd = spa->spa_root_vdev; 1137 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1138 vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 1139 1140 txg = spa_vdev_enter(spa); 1141 1142 oldvd = vdev_lookup_by_guid(rvd, guid); 1143 1144 if (oldvd == NULL) 1145 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1146 1147 if (!oldvd->vdev_ops->vdev_op_leaf) 1148 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1149 1150 pvd = oldvd->vdev_parent; 1151 1152 /* 1153 * The parent must be a mirror or the root, unless we're replacing; 1154 * in that case, the parent can be anything but another replacing vdev. 1155 */ 1156 if (pvd->vdev_ops != &vdev_mirror_ops && 1157 pvd->vdev_ops != &vdev_root_ops && 1158 (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 1159 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1160 1161 newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1162 1163 if (newrootvd == NULL || newrootvd->vdev_children != 1) 1164 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1165 1166 newvd = newrootvd->vdev_child[0]; 1167 1168 if (!newvd->vdev_ops->vdev_op_leaf) 1169 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1170 1171 if ((error = vdev_create(newrootvd, txg)) != 0) 1172 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1173 1174 /* 1175 * Compare the new device size with the replaceable/attachable 1176 * device size. 1177 */ 1178 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1179 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1180 1181 /* 1182 * The new device cannot have a higher alignment requirement 1183 * than the top-level vdev. 1184 */ 1185 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1186 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1187 1188 /* 1189 * If this is an in-place replacement, update oldvd's path and devid 1190 * to make it distinguishable from newvd, and unopenable from now on. 1191 */ 1192 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1193 spa_strfree(oldvd->vdev_path); 1194 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1195 KM_SLEEP); 1196 (void) sprintf(oldvd->vdev_path, "%s/%s", 1197 newvd->vdev_path, "old"); 1198 if (oldvd->vdev_devid != NULL) { 1199 spa_strfree(oldvd->vdev_devid); 1200 oldvd->vdev_devid = NULL; 1201 } 1202 } 1203 1204 /* 1205 * If the parent is not a mirror, or if we're replacing, 1206 * insert the new mirror/replacing vdev above oldvd. 1207 */ 1208 if (pvd->vdev_ops != pvops) 1209 pvd = vdev_add_parent(oldvd, pvops); 1210 1211 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1212 ASSERT(pvd->vdev_ops == pvops); 1213 ASSERT(oldvd->vdev_parent == pvd); 1214 1215 /* 1216 * Extract the new device from its root and add it to pvd. 1217 */ 1218 vdev_remove_child(newrootvd, newvd); 1219 newvd->vdev_id = pvd->vdev_children; 1220 vdev_add_child(pvd, newvd); 1221 1222 /* 1223 * If newvd is smaller than oldvd, but larger than its rsize, 1224 * the addition of newvd may have decreased our parent's asize. 1225 */ 1226 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1227 1228 tvd = newvd->vdev_top; 1229 ASSERT(pvd->vdev_top == tvd); 1230 ASSERT(tvd->vdev_parent == rvd); 1231 1232 vdev_config_dirty(tvd); 1233 1234 /* 1235 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1236 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1237 */ 1238 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1239 1240 mutex_enter(&newvd->vdev_dtl_lock); 1241 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1242 open_txg - TXG_INITIAL + 1); 1243 mutex_exit(&newvd->vdev_dtl_lock); 1244 1245 dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1246 1247 /* 1248 * Mark newvd's DTL dirty in this txg. 1249 */ 1250 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1251 1252 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1253 1254 /* 1255 * Kick off a resilver to update newvd. 1256 */ 1257 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1258 1259 return (0); 1260 } 1261 1262 /* 1263 * Detach a device from a mirror or replacing vdev. 1264 * If 'replace_done' is specified, only detach if the parent 1265 * is a replacing vdev. 1266 */ 1267 int 1268 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1269 { 1270 uint64_t txg; 1271 int c, t, error; 1272 vdev_t *rvd = spa->spa_root_vdev; 1273 vdev_t *vd, *pvd, *cvd, *tvd; 1274 1275 txg = spa_vdev_enter(spa); 1276 1277 vd = vdev_lookup_by_guid(rvd, guid); 1278 1279 if (vd == NULL) 1280 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1281 1282 if (!vd->vdev_ops->vdev_op_leaf) 1283 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1284 1285 pvd = vd->vdev_parent; 1286 1287 /* 1288 * If replace_done is specified, only remove this device if it's 1289 * the first child of a replacing vdev. 1290 */ 1291 if (replace_done && 1292 (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1293 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1294 1295 /* 1296 * Only mirror and replacing vdevs support detach. 1297 */ 1298 if (pvd->vdev_ops != &vdev_replacing_ops && 1299 pvd->vdev_ops != &vdev_mirror_ops) 1300 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1301 1302 /* 1303 * If there's only one replica, you can't detach it. 1304 */ 1305 if (pvd->vdev_children <= 1) 1306 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1307 1308 /* 1309 * If all siblings have non-empty DTLs, this device may have the only 1310 * valid copy of the data, which means we cannot safely detach it. 1311 * 1312 * XXX -- as in the vdev_offline() case, we really want a more 1313 * precise DTL check. 1314 */ 1315 for (c = 0; c < pvd->vdev_children; c++) { 1316 uint64_t dirty; 1317 1318 cvd = pvd->vdev_child[c]; 1319 if (cvd == vd) 1320 continue; 1321 if (vdev_is_dead(cvd)) 1322 continue; 1323 mutex_enter(&cvd->vdev_dtl_lock); 1324 dirty = cvd->vdev_dtl_map.sm_space | 1325 cvd->vdev_dtl_scrub.sm_space; 1326 mutex_exit(&cvd->vdev_dtl_lock); 1327 if (!dirty) 1328 break; 1329 } 1330 if (c == pvd->vdev_children) 1331 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1332 1333 /* 1334 * Erase the disk labels so the disk can be used for other things. 1335 * This must be done after all other error cases are handled, 1336 * but before we disembowel vd (so we can still do I/O to it). 1337 * But if we can't do it, don't treat the error as fatal -- 1338 * it may be that the unwritability of the disk is the reason 1339 * it's being detached! 1340 */ 1341 error = vdev_label_init(vd, 0); 1342 if (error) 1343 dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1344 1345 /* 1346 * Remove vd from its parent and compact the parent's children. 1347 */ 1348 vdev_remove_child(pvd, vd); 1349 vdev_compact_children(pvd); 1350 1351 /* 1352 * Remember one of the remaining children so we can get tvd below. 1353 */ 1354 cvd = pvd->vdev_child[0]; 1355 1356 /* 1357 * If the parent mirror/replacing vdev only has one child, 1358 * the parent is no longer needed. Remove it from the tree. 1359 */ 1360 if (pvd->vdev_children == 1) 1361 vdev_remove_parent(cvd); 1362 1363 /* 1364 * We don't set tvd until now because the parent we just removed 1365 * may have been the previous top-level vdev. 1366 */ 1367 tvd = cvd->vdev_top; 1368 ASSERT(tvd->vdev_parent == rvd); 1369 1370 /* 1371 * Reopen this top-level vdev to reassess health after detach. 1372 */ 1373 vdev_reopen(tvd); 1374 1375 /* 1376 * If the device we just detached was smaller than the others, 1377 * it may be possible to add metaslabs (i.e. grow the pool). 1378 * vdev_metaslab_init() can't fail because the existing metaslabs 1379 * are already in core, so there's nothing to read from disk. 1380 */ 1381 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1382 1383 vdev_config_dirty(tvd); 1384 1385 /* 1386 * Mark vd's DTL as dirty in this txg. 1387 * vdev_dtl_sync() will see that vd->vdev_detached is set 1388 * and free vd's DTL object in syncing context. 1389 * But first make sure we're not on any *other* txg's DTL list, 1390 * to prevent vd from being accessed after it's freed. 1391 */ 1392 for (t = 0; t < TXG_SIZE; t++) 1393 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1394 vd->vdev_detached = B_TRUE; 1395 vdev_dirty(tvd, VDD_DTL, vd, txg); 1396 1397 dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1398 1399 return (spa_vdev_exit(spa, vd, txg, 0)); 1400 } 1401 1402 /* 1403 * Find any device that's done replacing, so we can detach it. 1404 */ 1405 static vdev_t * 1406 spa_vdev_replace_done_hunt(vdev_t *vd) 1407 { 1408 vdev_t *newvd, *oldvd; 1409 int c; 1410 1411 for (c = 0; c < vd->vdev_children; c++) { 1412 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1413 if (oldvd != NULL) 1414 return (oldvd); 1415 } 1416 1417 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1418 oldvd = vd->vdev_child[0]; 1419 newvd = vd->vdev_child[1]; 1420 1421 mutex_enter(&newvd->vdev_dtl_lock); 1422 if (newvd->vdev_dtl_map.sm_space == 0 && 1423 newvd->vdev_dtl_scrub.sm_space == 0) { 1424 mutex_exit(&newvd->vdev_dtl_lock); 1425 return (oldvd); 1426 } 1427 mutex_exit(&newvd->vdev_dtl_lock); 1428 } 1429 1430 return (NULL); 1431 } 1432 1433 static void 1434 spa_vdev_replace_done(spa_t *spa) 1435 { 1436 vdev_t *vd; 1437 uint64_t guid; 1438 1439 spa_config_enter(spa, RW_READER, FTAG); 1440 1441 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 1442 guid = vd->vdev_guid; 1443 spa_config_exit(spa, FTAG); 1444 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 1445 return; 1446 spa_config_enter(spa, RW_READER, FTAG); 1447 } 1448 1449 spa_config_exit(spa, FTAG); 1450 } 1451 1452 /* 1453 * Update the stored path for this vdev. Dirty the vdev configuration, relying 1454 * on spa_vdev_enter/exit() to synchronize the labels and cache. 1455 */ 1456 int 1457 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 1458 { 1459 vdev_t *rvd, *vd; 1460 uint64_t txg; 1461 1462 rvd = spa->spa_root_vdev; 1463 1464 txg = spa_vdev_enter(spa); 1465 1466 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1467 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 1468 1469 if (!vd->vdev_ops->vdev_op_leaf) 1470 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1471 1472 spa_strfree(vd->vdev_path); 1473 vd->vdev_path = spa_strdup(newpath); 1474 1475 vdev_config_dirty(vd->vdev_top); 1476 1477 return (spa_vdev_exit(spa, NULL, txg, 0)); 1478 } 1479 1480 /* 1481 * ========================================================================== 1482 * SPA Scrubbing 1483 * ========================================================================== 1484 */ 1485 1486 void 1487 spa_scrub_throttle(spa_t *spa, int direction) 1488 { 1489 mutex_enter(&spa->spa_scrub_lock); 1490 spa->spa_scrub_throttled += direction; 1491 ASSERT(spa->spa_scrub_throttled >= 0); 1492 if (spa->spa_scrub_throttled == 0) 1493 cv_broadcast(&spa->spa_scrub_io_cv); 1494 mutex_exit(&spa->spa_scrub_lock); 1495 } 1496 1497 static void 1498 spa_scrub_io_done(zio_t *zio) 1499 { 1500 spa_t *spa = zio->io_spa; 1501 1502 zio_buf_free(zio->io_data, zio->io_size); 1503 1504 mutex_enter(&spa->spa_scrub_lock); 1505 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1506 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 1507 spa->spa_scrub_errors++; 1508 mutex_enter(&vd->vdev_stat_lock); 1509 vd->vdev_stat.vs_scrub_errors++; 1510 mutex_exit(&vd->vdev_stat_lock); 1511 } 1512 if (--spa->spa_scrub_inflight == 0) { 1513 cv_broadcast(&spa->spa_scrub_io_cv); 1514 ASSERT(spa->spa_scrub_throttled == 0); 1515 } 1516 mutex_exit(&spa->spa_scrub_lock); 1517 } 1518 1519 static void 1520 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 1521 zbookmark_t *zb) 1522 { 1523 size_t size = BP_GET_LSIZE(bp); 1524 void *data = zio_buf_alloc(size); 1525 1526 mutex_enter(&spa->spa_scrub_lock); 1527 spa->spa_scrub_inflight++; 1528 mutex_exit(&spa->spa_scrub_lock); 1529 1530 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 1531 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 1532 1533 flags |= ZIO_FLAG_CANFAIL; 1534 1535 zio_nowait(zio_read(NULL, spa, bp, data, size, 1536 spa_scrub_io_done, NULL, priority, flags, zb)); 1537 } 1538 1539 /* ARGSUSED */ 1540 static int 1541 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1542 { 1543 blkptr_t *bp = &bc->bc_blkptr; 1544 vdev_t *vd = spa->spa_root_vdev; 1545 dva_t *dva = bp->blk_dva; 1546 int needs_resilver = B_FALSE; 1547 int d; 1548 1549 if (bc->bc_errno) { 1550 /* 1551 * We can't scrub this block, but we can continue to scrub 1552 * the rest of the pool. Note the error and move along. 1553 */ 1554 mutex_enter(&spa->spa_scrub_lock); 1555 spa->spa_scrub_errors++; 1556 mutex_exit(&spa->spa_scrub_lock); 1557 1558 mutex_enter(&vd->vdev_stat_lock); 1559 vd->vdev_stat.vs_scrub_errors++; 1560 mutex_exit(&vd->vdev_stat_lock); 1561 1562 return (ERESTART); 1563 } 1564 1565 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1566 1567 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 1568 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 1569 1570 ASSERT(vd != NULL); 1571 1572 /* 1573 * Keep track of how much data we've examined so that 1574 * zpool(1M) status can make useful progress reports. 1575 */ 1576 mutex_enter(&vd->vdev_stat_lock); 1577 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 1578 mutex_exit(&vd->vdev_stat_lock); 1579 1580 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1581 if (DVA_GET_GANG(&dva[d])) { 1582 /* 1583 * Gang members may be spread across multiple 1584 * vdevs, so the best we can do is look at the 1585 * pool-wide DTL. 1586 * XXX -- it would be better to change our 1587 * allocation policy to ensure that this can't 1588 * happen. 1589 */ 1590 vd = spa->spa_root_vdev; 1591 } 1592 if (vdev_dtl_contains(&vd->vdev_dtl_map, 1593 bp->blk_birth, 1)) 1594 needs_resilver = B_TRUE; 1595 } 1596 } 1597 1598 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 1599 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1600 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 1601 else if (needs_resilver) 1602 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1603 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 1604 1605 return (0); 1606 } 1607 1608 static void 1609 spa_scrub_thread(spa_t *spa) 1610 { 1611 callb_cpr_t cprinfo; 1612 traverse_handle_t *th = spa->spa_scrub_th; 1613 vdev_t *rvd = spa->spa_root_vdev; 1614 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1615 int error = 0; 1616 boolean_t complete; 1617 1618 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1619 1620 /* 1621 * If we're restarting due to a snapshot create/delete, 1622 * wait for that to complete. 1623 */ 1624 txg_wait_synced(spa_get_dsl(spa), 0); 1625 1626 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 1627 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1628 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 1629 1630 spa_config_enter(spa, RW_WRITER, FTAG); 1631 vdev_reopen(rvd); /* purge all vdev caches */ 1632 vdev_config_dirty(rvd); /* rewrite all disk labels */ 1633 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1634 spa_config_exit(spa, FTAG); 1635 1636 mutex_enter(&spa->spa_scrub_lock); 1637 spa->spa_scrub_errors = 0; 1638 spa->spa_scrub_active = 1; 1639 ASSERT(spa->spa_scrub_inflight == 0); 1640 ASSERT(spa->spa_scrub_throttled == 0); 1641 1642 while (!spa->spa_scrub_stop) { 1643 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1644 while (spa->spa_scrub_suspended) { 1645 spa->spa_scrub_active = 0; 1646 cv_broadcast(&spa->spa_scrub_cv); 1647 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1648 spa->spa_scrub_active = 1; 1649 } 1650 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1651 1652 if (spa->spa_scrub_restart_txg != 0) 1653 break; 1654 1655 mutex_exit(&spa->spa_scrub_lock); 1656 error = traverse_more(th); 1657 mutex_enter(&spa->spa_scrub_lock); 1658 if (error != EAGAIN) 1659 break; 1660 1661 while (spa->spa_scrub_throttled > 0) 1662 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1663 } 1664 1665 while (spa->spa_scrub_inflight) 1666 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1667 1668 spa->spa_scrub_active = 0; 1669 cv_broadcast(&spa->spa_scrub_cv); 1670 1671 mutex_exit(&spa->spa_scrub_lock); 1672 1673 spa_config_enter(spa, RW_WRITER, FTAG); 1674 1675 mutex_enter(&spa->spa_scrub_lock); 1676 1677 /* 1678 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 1679 * AND the spa config lock to synchronize with any config changes 1680 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 1681 */ 1682 if (spa->spa_scrub_restart_txg != 0) 1683 error = ERESTART; 1684 1685 if (spa->spa_scrub_stop) 1686 error = EINTR; 1687 1688 /* 1689 * Even if there were uncorrectable errors, we consider the scrub 1690 * completed. The downside is that if there is a transient error during 1691 * a resilver, we won't resilver the data properly to the target. But 1692 * if the damage is permanent (more likely) we will resilver forever, 1693 * which isn't really acceptable. Since there is enough information for 1694 * the user to know what has failed and why, this seems like a more 1695 * tractable approach. 1696 */ 1697 complete = (error == 0); 1698 1699 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1700 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1701 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1702 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1703 1704 mutex_exit(&spa->spa_scrub_lock); 1705 1706 /* 1707 * If the scrub/resilver completed, update all DTLs to reflect this. 1708 * Whether it succeeded or not, vacate all temporary scrub DTLs. 1709 */ 1710 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1711 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1712 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1713 spa_errlog_rotate(spa); 1714 1715 spa_config_exit(spa, FTAG); 1716 1717 mutex_enter(&spa->spa_scrub_lock); 1718 1719 /* 1720 * We may have finished replacing a device. 1721 * Let the async thread assess this and handle the detach. 1722 */ 1723 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1724 1725 /* 1726 * If we were told to restart, our final act is to start a new scrub. 1727 */ 1728 if (error == ERESTART) 1729 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 1730 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 1731 1732 spa->spa_scrub_type = POOL_SCRUB_NONE; 1733 spa->spa_scrub_active = 0; 1734 spa->spa_scrub_thread = NULL; 1735 cv_broadcast(&spa->spa_scrub_cv); 1736 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1737 thread_exit(); 1738 } 1739 1740 void 1741 spa_scrub_suspend(spa_t *spa) 1742 { 1743 mutex_enter(&spa->spa_scrub_lock); 1744 spa->spa_scrub_suspended++; 1745 while (spa->spa_scrub_active) { 1746 cv_broadcast(&spa->spa_scrub_cv); 1747 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1748 } 1749 while (spa->spa_scrub_inflight) 1750 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1751 mutex_exit(&spa->spa_scrub_lock); 1752 } 1753 1754 void 1755 spa_scrub_resume(spa_t *spa) 1756 { 1757 mutex_enter(&spa->spa_scrub_lock); 1758 ASSERT(spa->spa_scrub_suspended != 0); 1759 if (--spa->spa_scrub_suspended == 0) 1760 cv_broadcast(&spa->spa_scrub_cv); 1761 mutex_exit(&spa->spa_scrub_lock); 1762 } 1763 1764 void 1765 spa_scrub_restart(spa_t *spa, uint64_t txg) 1766 { 1767 /* 1768 * Something happened (e.g. snapshot create/delete) that means 1769 * we must restart any in-progress scrubs. The itinerary will 1770 * fix this properly. 1771 */ 1772 mutex_enter(&spa->spa_scrub_lock); 1773 spa->spa_scrub_restart_txg = txg; 1774 mutex_exit(&spa->spa_scrub_lock); 1775 } 1776 1777 int 1778 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1779 { 1780 space_seg_t *ss; 1781 uint64_t mintxg, maxtxg; 1782 vdev_t *rvd = spa->spa_root_vdev; 1783 1784 if ((uint_t)type >= POOL_SCRUB_TYPES) 1785 return (ENOTSUP); 1786 1787 mutex_enter(&spa->spa_scrub_lock); 1788 1789 /* 1790 * If there's a scrub or resilver already in progress, stop it. 1791 */ 1792 while (spa->spa_scrub_thread != NULL) { 1793 /* 1794 * Don't stop a resilver unless forced. 1795 */ 1796 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 1797 mutex_exit(&spa->spa_scrub_lock); 1798 return (EBUSY); 1799 } 1800 spa->spa_scrub_stop = 1; 1801 cv_broadcast(&spa->spa_scrub_cv); 1802 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1803 } 1804 1805 /* 1806 * Terminate the previous traverse. 1807 */ 1808 if (spa->spa_scrub_th != NULL) { 1809 traverse_fini(spa->spa_scrub_th); 1810 spa->spa_scrub_th = NULL; 1811 } 1812 1813 if (rvd == NULL) { 1814 ASSERT(spa->spa_scrub_stop == 0); 1815 ASSERT(spa->spa_scrub_type == type); 1816 ASSERT(spa->spa_scrub_restart_txg == 0); 1817 mutex_exit(&spa->spa_scrub_lock); 1818 return (0); 1819 } 1820 1821 mintxg = TXG_INITIAL - 1; 1822 maxtxg = spa_last_synced_txg(spa) + 1; 1823 1824 mutex_enter(&rvd->vdev_dtl_lock); 1825 1826 if (rvd->vdev_dtl_map.sm_space == 0) { 1827 /* 1828 * The pool-wide DTL is empty. 1829 * If this is a resilver, there's nothing to do except 1830 * check whether any in-progress replacements have completed. 1831 */ 1832 if (type == POOL_SCRUB_RESILVER) { 1833 type = POOL_SCRUB_NONE; 1834 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1835 } 1836 } else { 1837 /* 1838 * The pool-wide DTL is non-empty. 1839 * If this is a normal scrub, upgrade to a resilver instead. 1840 */ 1841 if (type == POOL_SCRUB_EVERYTHING) 1842 type = POOL_SCRUB_RESILVER; 1843 } 1844 1845 if (type == POOL_SCRUB_RESILVER) { 1846 /* 1847 * Determine the resilvering boundaries. 1848 * 1849 * Note: (mintxg, maxtxg) is an open interval, 1850 * i.e. mintxg and maxtxg themselves are not included. 1851 * 1852 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1853 * so we don't claim to resilver a txg that's still changing. 1854 */ 1855 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1856 mintxg = ss->ss_start - 1; 1857 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1858 maxtxg = MIN(ss->ss_end, maxtxg); 1859 } 1860 1861 mutex_exit(&rvd->vdev_dtl_lock); 1862 1863 spa->spa_scrub_stop = 0; 1864 spa->spa_scrub_type = type; 1865 spa->spa_scrub_restart_txg = 0; 1866 1867 if (type != POOL_SCRUB_NONE) { 1868 spa->spa_scrub_mintxg = mintxg; 1869 spa->spa_scrub_maxtxg = maxtxg; 1870 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1871 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 1872 ZIO_FLAG_CANFAIL); 1873 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1874 spa->spa_scrub_thread = thread_create(NULL, 0, 1875 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1876 } 1877 1878 mutex_exit(&spa->spa_scrub_lock); 1879 1880 return (0); 1881 } 1882 1883 /* 1884 * ========================================================================== 1885 * SPA async task processing 1886 * ========================================================================== 1887 */ 1888 1889 static void 1890 spa_async_reopen(spa_t *spa) 1891 { 1892 vdev_t *rvd = spa->spa_root_vdev; 1893 vdev_t *tvd; 1894 int c; 1895 1896 spa_config_enter(spa, RW_WRITER, FTAG); 1897 1898 for (c = 0; c < rvd->vdev_children; c++) { 1899 tvd = rvd->vdev_child[c]; 1900 if (tvd->vdev_reopen_wanted) { 1901 tvd->vdev_reopen_wanted = 0; 1902 vdev_reopen(tvd); 1903 } 1904 } 1905 1906 spa_config_exit(spa, FTAG); 1907 } 1908 1909 static void 1910 spa_async_thread(spa_t *spa) 1911 { 1912 int tasks; 1913 1914 ASSERT(spa->spa_sync_on); 1915 1916 mutex_enter(&spa->spa_async_lock); 1917 tasks = spa->spa_async_tasks; 1918 spa->spa_async_tasks = 0; 1919 mutex_exit(&spa->spa_async_lock); 1920 1921 /* 1922 * See if the config needs to be updated. 1923 */ 1924 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 1925 mutex_enter(&spa_namespace_lock); 1926 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1927 mutex_exit(&spa_namespace_lock); 1928 } 1929 1930 /* 1931 * See if any devices need to be reopened. 1932 */ 1933 if (tasks & SPA_ASYNC_REOPEN) 1934 spa_async_reopen(spa); 1935 1936 /* 1937 * If any devices are done replacing, detach them. 1938 */ 1939 if (tasks & SPA_ASYNC_REPLACE_DONE) 1940 spa_vdev_replace_done(spa); 1941 1942 /* 1943 * Kick off a scrub. 1944 */ 1945 if (tasks & SPA_ASYNC_SCRUB) 1946 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 1947 1948 /* 1949 * Kick off a resilver. 1950 */ 1951 if (tasks & SPA_ASYNC_RESILVER) 1952 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1953 1954 /* 1955 * Let the world know that we're done. 1956 */ 1957 mutex_enter(&spa->spa_async_lock); 1958 spa->spa_async_thread = NULL; 1959 cv_broadcast(&spa->spa_async_cv); 1960 mutex_exit(&spa->spa_async_lock); 1961 thread_exit(); 1962 } 1963 1964 void 1965 spa_async_suspend(spa_t *spa) 1966 { 1967 mutex_enter(&spa->spa_async_lock); 1968 spa->spa_async_suspended++; 1969 while (spa->spa_async_thread != NULL) 1970 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 1971 mutex_exit(&spa->spa_async_lock); 1972 } 1973 1974 void 1975 spa_async_resume(spa_t *spa) 1976 { 1977 mutex_enter(&spa->spa_async_lock); 1978 ASSERT(spa->spa_async_suspended != 0); 1979 spa->spa_async_suspended--; 1980 mutex_exit(&spa->spa_async_lock); 1981 } 1982 1983 static void 1984 spa_async_dispatch(spa_t *spa) 1985 { 1986 mutex_enter(&spa->spa_async_lock); 1987 if (spa->spa_async_tasks && !spa->spa_async_suspended && 1988 spa->spa_async_thread == NULL && 1989 rootdir != NULL && !vn_is_readonly(rootdir)) 1990 spa->spa_async_thread = thread_create(NULL, 0, 1991 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 1992 mutex_exit(&spa->spa_async_lock); 1993 } 1994 1995 void 1996 spa_async_request(spa_t *spa, int task) 1997 { 1998 mutex_enter(&spa->spa_async_lock); 1999 spa->spa_async_tasks |= task; 2000 mutex_exit(&spa->spa_async_lock); 2001 } 2002 2003 /* 2004 * ========================================================================== 2005 * SPA syncing routines 2006 * ========================================================================== 2007 */ 2008 2009 static void 2010 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2011 { 2012 bplist_t *bpl = &spa->spa_sync_bplist; 2013 dmu_tx_t *tx; 2014 blkptr_t blk; 2015 uint64_t itor = 0; 2016 zio_t *zio; 2017 int error; 2018 uint8_t c = 1; 2019 2020 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2021 2022 while (bplist_iterate(bpl, &itor, &blk) == 0) 2023 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2024 2025 error = zio_wait(zio); 2026 ASSERT3U(error, ==, 0); 2027 2028 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2029 bplist_vacate(bpl, tx); 2030 2031 /* 2032 * Pre-dirty the first block so we sync to convergence faster. 2033 * (Usually only the first block is needed.) 2034 */ 2035 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2036 dmu_tx_commit(tx); 2037 } 2038 2039 static void 2040 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2041 { 2042 nvlist_t *config; 2043 char *packed = NULL; 2044 size_t nvsize = 0; 2045 dmu_buf_t *db; 2046 2047 if (list_is_empty(&spa->spa_dirty_list)) 2048 return; 2049 2050 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2051 2052 if (spa->spa_config_syncing) 2053 nvlist_free(spa->spa_config_syncing); 2054 spa->spa_config_syncing = config; 2055 2056 VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 2057 2058 packed = kmem_alloc(nvsize, KM_SLEEP); 2059 2060 VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 2061 KM_SLEEP) == 0); 2062 2063 dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 2064 packed, tx); 2065 2066 kmem_free(packed, nvsize); 2067 2068 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 2069 spa->spa_config_object, FTAG, &db)); 2070 dmu_buf_will_dirty(db, tx); 2071 *(uint64_t *)db->db_data = nvsize; 2072 dmu_buf_rele(db, FTAG); 2073 } 2074 2075 /* 2076 * Sync the specified transaction group. New blocks may be dirtied as 2077 * part of the process, so we iterate until it converges. 2078 */ 2079 void 2080 spa_sync(spa_t *spa, uint64_t txg) 2081 { 2082 dsl_pool_t *dp = spa->spa_dsl_pool; 2083 objset_t *mos = spa->spa_meta_objset; 2084 bplist_t *bpl = &spa->spa_sync_bplist; 2085 vdev_t *rvd = spa->spa_root_vdev; 2086 vdev_t *vd; 2087 dmu_tx_t *tx; 2088 int dirty_vdevs; 2089 2090 /* 2091 * Lock out configuration changes. 2092 */ 2093 spa_config_enter(spa, RW_READER, FTAG); 2094 2095 spa->spa_syncing_txg = txg; 2096 spa->spa_sync_pass = 0; 2097 2098 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2099 2100 /* 2101 * If anything has changed in this txg, push the deferred frees 2102 * from the previous txg. If not, leave them alone so that we 2103 * don't generate work on an otherwise idle system. 2104 */ 2105 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2106 !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2107 spa_sync_deferred_frees(spa, txg); 2108 2109 /* 2110 * Iterate to convergence. 2111 */ 2112 do { 2113 spa->spa_sync_pass++; 2114 2115 tx = dmu_tx_create_assigned(dp, txg); 2116 spa_sync_config_object(spa, tx); 2117 dmu_tx_commit(tx); 2118 2119 spa_errlog_sync(spa, txg); 2120 2121 dsl_pool_sync(dp, txg); 2122 2123 dirty_vdevs = 0; 2124 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2125 vdev_sync(vd, txg); 2126 dirty_vdevs++; 2127 } 2128 2129 tx = dmu_tx_create_assigned(dp, txg); 2130 bplist_sync(bpl, tx); 2131 dmu_tx_commit(tx); 2132 2133 } while (dirty_vdevs); 2134 2135 bplist_close(bpl); 2136 2137 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2138 2139 /* 2140 * Rewrite the vdev configuration (which includes the uberblock) 2141 * to commit the transaction group. 2142 * 2143 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2144 * Otherwise, pick a random top-level vdev that's known to be 2145 * visible in the config cache (see spa_vdev_add() for details). 2146 * If the write fails, try the next vdev until we're tried them all. 2147 */ 2148 if (!list_is_empty(&spa->spa_dirty_list)) { 2149 VERIFY(vdev_config_sync(rvd, txg) == 0); 2150 } else { 2151 int children = rvd->vdev_children; 2152 int c0 = spa_get_random(children); 2153 int c; 2154 2155 for (c = 0; c < children; c++) { 2156 vd = rvd->vdev_child[(c0 + c) % children]; 2157 if (vd->vdev_ms_array == 0) 2158 continue; 2159 if (vdev_config_sync(vd, txg) == 0) 2160 break; 2161 } 2162 if (c == children) 2163 VERIFY(vdev_config_sync(rvd, txg) == 0); 2164 } 2165 2166 /* 2167 * Clear the dirty config list. 2168 */ 2169 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2170 vdev_config_clean(vd); 2171 2172 /* 2173 * Now that the new config has synced transactionally, 2174 * let it become visible to the config cache. 2175 */ 2176 if (spa->spa_config_syncing != NULL) { 2177 spa_config_set(spa, spa->spa_config_syncing); 2178 spa->spa_config_txg = txg; 2179 spa->spa_config_syncing = NULL; 2180 } 2181 2182 /* 2183 * Make a stable copy of the fully synced uberblock. 2184 * We use this as the root for pool traversals. 2185 */ 2186 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2187 2188 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2189 2190 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2191 spa->spa_traverse_wanted = 0; 2192 spa->spa_ubsync = spa->spa_uberblock; 2193 rw_exit(&spa->spa_traverse_lock); 2194 2195 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2196 2197 /* 2198 * Clean up the ZIL records for the synced txg. 2199 */ 2200 dsl_pool_zil_clean(dp); 2201 2202 /* 2203 * Update usable space statistics. 2204 */ 2205 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2206 vdev_sync_done(vd, txg); 2207 2208 /* 2209 * It had better be the case that we didn't dirty anything 2210 * since spa_sync_labels(). 2211 */ 2212 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2213 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2214 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2215 ASSERT(bpl->bpl_queue == NULL); 2216 2217 spa_config_exit(spa, FTAG); 2218 2219 /* 2220 * If any async tasks have been requested, kick them off. 2221 */ 2222 spa_async_dispatch(spa); 2223 } 2224 2225 /* 2226 * Sync all pools. We don't want to hold the namespace lock across these 2227 * operations, so we take a reference on the spa_t and drop the lock during the 2228 * sync. 2229 */ 2230 void 2231 spa_sync_allpools(void) 2232 { 2233 spa_t *spa = NULL; 2234 mutex_enter(&spa_namespace_lock); 2235 while ((spa = spa_next(spa)) != NULL) { 2236 if (spa_state(spa) != POOL_STATE_ACTIVE) 2237 continue; 2238 spa_open_ref(spa, FTAG); 2239 mutex_exit(&spa_namespace_lock); 2240 txg_wait_synced(spa_get_dsl(spa), 0); 2241 mutex_enter(&spa_namespace_lock); 2242 spa_close(spa, FTAG); 2243 } 2244 mutex_exit(&spa_namespace_lock); 2245 } 2246 2247 /* 2248 * ========================================================================== 2249 * Miscellaneous routines 2250 * ========================================================================== 2251 */ 2252 2253 /* 2254 * Remove all pools in the system. 2255 */ 2256 void 2257 spa_evict_all(void) 2258 { 2259 spa_t *spa; 2260 2261 /* 2262 * Remove all cached state. All pools should be closed now, 2263 * so every spa in the AVL tree should be unreferenced. 2264 */ 2265 mutex_enter(&spa_namespace_lock); 2266 while ((spa = spa_next(NULL)) != NULL) { 2267 /* 2268 * Stop async tasks. The async thread may need to detach 2269 * a device that's been replaced, which requires grabbing 2270 * spa_namespace_lock, so we must drop it here. 2271 */ 2272 spa_open_ref(spa, FTAG); 2273 mutex_exit(&spa_namespace_lock); 2274 spa_async_suspend(spa); 2275 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2276 mutex_enter(&spa_namespace_lock); 2277 spa_close(spa, FTAG); 2278 2279 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2280 spa_unload(spa); 2281 spa_deactivate(spa); 2282 } 2283 spa_remove(spa); 2284 } 2285 mutex_exit(&spa_namespace_lock); 2286 } 2287 2288 vdev_t * 2289 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2290 { 2291 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2292 } 2293 2294 void 2295 spa_upgrade(spa_t *spa) 2296 { 2297 spa_config_enter(spa, RW_WRITER, FTAG); 2298 2299 /* 2300 * This should only be called for a non-faulted pool, and since a 2301 * future version would result in an unopenable pool, this shouldn't be 2302 * possible. 2303 */ 2304 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 2305 2306 spa->spa_uberblock.ub_version = ZFS_VERSION; 2307 vdev_config_dirty(spa->spa_root_vdev); 2308 2309 spa_config_exit(spa, FTAG); 2310 } 2311