/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

#include "zfs_prop.h"

int zio_taskq_threads = 8;

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static int
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;
	int err = 0;

	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
		goto out;

	if (strval != NULL) {
		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
			goto out;
	} else {
		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
			goto out;
	}

	err = nvlist_add_nvlist(nvl, propname, propval);
out:
	nvlist_free(propval);
	return (err);
}

/*
 * Get property values from the spa configuration.
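 *
 * Note on the resulting nvlist shape (illustrative only, property names and
 * values are hypothetical): spa_prop_add_list() above stores each property as
 * a nested nvlist keyed by the property name, carrying a ZPROP_SOURCE entry
 * and a ZPROP_VALUE entry (string or uint64), e.g.
 *
 *	"size"    -> { ZPROP_SOURCE = ZPROP_SRC_NONE,  ZPROP_VALUE = <uint64> }
 *	"altroot" -> { ZPROP_SOURCE = ZPROP_SRC_LOCAL, ZPROP_VALUE = <string> }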
 */
static int
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size = spa_get_space(spa);
	uint64_t used = spa_get_alloc(spa);
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	int err;

	/*
	 * readonly properties
	 */
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
	    0, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
	    size - used, src))
		return (err);

	cap = (size == 0) ? 0 : (used * 100 / size);
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
	    spa_guid(spa), src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
	    spa->spa_root_vdev->vdev_state, src))
		return (err);

	/*
	 * settable properties that are not stored in the pool property object.
	 */
	version = spa_version(spa);
	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
	    version, src))
		return (err);

	if (spa->spa_root != NULL) {
		src = ZPROP_SRC_LOCAL;
		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
		    spa->spa_root, 0, src))
			return (err);
	}

	if (spa->spa_temporary ==
	    zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_TEMPORARY, NULL,
	    spa->spa_temporary, src))
		return (err);

	return (0);
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	/*
	 * Get properties from the spa config.
	 */
	if (err = spa_prop_get_config(spa, nvp))
		goto out;

	mutex_enter(&spa->spa_props_lock);
	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
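	 *
	 * In the ZAP below, integer-valued properties are stored as a single
	 * 8-byte integer and string-valued properties as an array of 1-byte
	 * entries, which is what the switch on za_integer_length in the loop
	 * distinguishes.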
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			err = spa_prop_add_list(*nvp, prop, strval,
			    intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		vdev_t *rvdev;
		char *vdev_type;
		objset_t *os;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * A bootable filesystem can not be on a RAIDZ pool
			 * nor a striped pool with more than 1 device.
			 */
			rvdev = spa->spa_root_vdev;
			vdev_type =
			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
			if (rvdev->vdev_children > 1 ||
			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
					break;
				objnum = dmu_objset_id(os);
				dmu_objset_close(os);
			}
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
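 *
 * A note on the taskqs created below (a description of the existing behavior,
 * not a change): for every zio type we create one "issue" and one "intr"
 * taskq, each with zio_taskq_threads worker threads (tunable at the top of
 * this file), so raising that value increases I/O pipeline concurrency for
 * all zio types at once.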
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different
	 * vdev_t structures associated with it: one in the list of spares
	 * (used only for basic validation purposes) and one in the active
	 * vdev configuration (if it's spared in).  During this phase we open
	 * and validate each vdev on the spare list.  If the vdev also exists
	 * in the active configuration, then we also mark this vdev as an
	 * active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;
	uint64_t autoreplace = 0;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if (hostid != 0 && myhostid != 0 &&
			    (unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
" 951 "See: http://www.sun.com/msg/ZFS-8000-EY", 952 spa->spa_name, hostname, 953 (unsigned long)hostid); 954 error = EBADF; 955 goto out; 956 } 957 } 958 959 spa_config_set(spa, newconfig); 960 spa_unload(spa); 961 spa_deactivate(spa); 962 spa_activate(spa); 963 964 return (spa_load(spa, newconfig, state, B_TRUE)); 965 } 966 967 if (zap_lookup(spa->spa_meta_objset, 968 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 969 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 970 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 971 VDEV_AUX_CORRUPT_DATA); 972 error = EIO; 973 goto out; 974 } 975 976 /* 977 * Load the bit that tells us to use the new accounting function 978 * (raid-z deflation). If we have an older pool, this will not 979 * be present. 980 */ 981 error = zap_lookup(spa->spa_meta_objset, 982 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 983 sizeof (uint64_t), 1, &spa->spa_deflate); 984 if (error != 0 && error != ENOENT) { 985 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 986 VDEV_AUX_CORRUPT_DATA); 987 error = EIO; 988 goto out; 989 } 990 991 /* 992 * Load the persistent error log. If we have an older pool, this will 993 * not be present. 994 */ 995 error = zap_lookup(spa->spa_meta_objset, 996 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 997 sizeof (uint64_t), 1, &spa->spa_errlog_last); 998 if (error != 0 && error != ENOENT) { 999 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1000 VDEV_AUX_CORRUPT_DATA); 1001 error = EIO; 1002 goto out; 1003 } 1004 1005 error = zap_lookup(spa->spa_meta_objset, 1006 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1007 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1008 if (error != 0 && error != ENOENT) { 1009 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1010 VDEV_AUX_CORRUPT_DATA); 1011 error = EIO; 1012 goto out; 1013 } 1014 1015 /* 1016 * Load the history object. If we have an older pool, this 1017 * will not be present. 1018 */ 1019 error = zap_lookup(spa->spa_meta_objset, 1020 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1021 sizeof (uint64_t), 1, &spa->spa_history); 1022 if (error != 0 && error != ENOENT) { 1023 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1024 VDEV_AUX_CORRUPT_DATA); 1025 error = EIO; 1026 goto out; 1027 } 1028 1029 /* 1030 * Load any hot spares for this pool. 
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each which describes a valid leaf vdev.  If this is an import (mode
 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
 * as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < SPA_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;
	uint64_t version;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_temporary = zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY);
	if (props)
		spa_sync_props(spa, props, CRED(), tx);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
{
	spa_t *spa;
	char *altroot = NULL;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity anymore,
	 * and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0) {
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	}
	spa_config_exit(spa, FTAG);

	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	if (spa_mode & FWRITE)
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares;
	uint_t i, nspares;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		nspares = 0;

	if (vd->vdev_children == 0 && nspares == 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, EINVAL));
	}

	if (vd->vdev_children != 0) {
		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
			spa->spa_pending_vdev = NULL;
			return (spa_vdev_exit(spa, vd, txg, error));
		}
	}

	/*
	 * We must validate the spares after checking the children.  Otherwise,
	 * vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) != 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	spa->spa_pending_vdev = NULL;

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		if (spa->spa_sparelist != NULL) {
			nvlist_t **oldspares;
			uint_t oldnspares;
			nvlist_t **newspares;

			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

			newspares = kmem_alloc(sizeof (void *) *
			    (nspares + oldnspares), KM_SLEEP);
			for (i = 0; i < oldnspares; i++)
				VERIFY(nvlist_dup(oldspares[i],
				    &newspares[i], KM_SLEEP) == 0);
			for (i = 0; i < nspares; i++)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[i + oldnspares],
				    KM_SLEEP) == 0);

			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, newspares,
			    nspares + oldnspares) == 0);
			for (i = 0; i < oldnspares + nspares; i++)
				nvlist_free(newspares[i]);
			kmem_free(newspares, (oldnspares + nspares) *
			    sizeof (void *));
		} else {
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		}

		spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
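 *
 * For illustration (hypothetical device names), attaching a new disk c1t1d0
 * to a plain, non-mirrored disk c1t0d0 inserts a mirror vdev above both:
 *
 *	    root                      root
 *	     |             =>          |
 *	  c1t0d0                     mirror
 *	                             /    \
 *	                         c1t0d0  c1t1d0
 *
 * With 'replacing' set, a 'replacing' vdev takes the place of the mirror in
 * the sketch above, and c1t0d0 is detached once resilvering completes.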
2075 */ 2076 int 2077 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2078 { 2079 uint64_t txg, open_txg; 2080 int error; 2081 vdev_t *rvd = spa->spa_root_vdev; 2082 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2083 vdev_ops_t *pvops; 2084 int is_log; 2085 2086 txg = spa_vdev_enter(spa); 2087 2088 oldvd = vdev_lookup_by_guid(rvd, guid); 2089 2090 if (oldvd == NULL) 2091 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2092 2093 if (!oldvd->vdev_ops->vdev_op_leaf) 2094 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2095 2096 pvd = oldvd->vdev_parent; 2097 2098 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2099 VDEV_ALLOC_ADD)) != 0) 2100 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2101 2102 if (newrootvd->vdev_children != 1) 2103 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2104 2105 newvd = newrootvd->vdev_child[0]; 2106 2107 if (!newvd->vdev_ops->vdev_op_leaf) 2108 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2109 2110 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2111 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2112 2113 /* 2114 * Spares can't replace logs 2115 */ 2116 is_log = oldvd->vdev_islog; 2117 if (is_log && newvd->vdev_isspare) 2118 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2119 2120 if (!replacing) { 2121 /* 2122 * For attach, the only allowable parent is a mirror or the root 2123 * vdev. 2124 */ 2125 if (pvd->vdev_ops != &vdev_mirror_ops && 2126 pvd->vdev_ops != &vdev_root_ops) 2127 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2128 2129 pvops = &vdev_mirror_ops; 2130 } else { 2131 /* 2132 * Active hot spares can only be replaced by inactive hot 2133 * spares. 2134 */ 2135 if (pvd->vdev_ops == &vdev_spare_ops && 2136 pvd->vdev_child[1] == oldvd && 2137 !spa_has_spare(spa, newvd->vdev_guid)) 2138 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2139 2140 /* 2141 * If the source is a hot spare, and the parent isn't already a 2142 * spare, then we want to create a new hot spare. Otherwise, we 2143 * want to create a replacing vdev. The user is not allowed to 2144 * attach to a spared vdev child unless the 'isspare' state is 2145 * the same (spare replaces spare, non-spare replaces 2146 * non-spare). 2147 */ 2148 if (pvd->vdev_ops == &vdev_replacing_ops) 2149 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2150 else if (pvd->vdev_ops == &vdev_spare_ops && 2151 newvd->vdev_isspare != oldvd->vdev_isspare) 2152 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2153 else if (pvd->vdev_ops != &vdev_spare_ops && 2154 newvd->vdev_isspare) 2155 pvops = &vdev_spare_ops; 2156 else 2157 pvops = &vdev_replacing_ops; 2158 } 2159 2160 /* 2161 * Compare the new device size with the replaceable/attachable 2162 * device size. 2163 */ 2164 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2165 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2166 2167 /* 2168 * The new device cannot have a higher alignment requirement 2169 * than the top-level vdev. 2170 */ 2171 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2172 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2173 2174 /* 2175 * If this is an in-place replacement, update oldvd's path and devid 2176 * to make it distinguishable from newvd, and unopenable from now on. 
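	 * For example, replacing /dev/dsk/c0t0d0s0 in place leaves the old
	 * leaf with the path /dev/dsk/c0t0d0s0/old (which cannot be opened)
	 * and no devid, so only newvd can match the real device from now on.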
2177 */ 2178 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2179 spa_strfree(oldvd->vdev_path); 2180 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2181 KM_SLEEP); 2182 (void) sprintf(oldvd->vdev_path, "%s/%s", 2183 newvd->vdev_path, "old"); 2184 if (oldvd->vdev_devid != NULL) { 2185 spa_strfree(oldvd->vdev_devid); 2186 oldvd->vdev_devid = NULL; 2187 } 2188 } 2189 2190 /* 2191 * If the parent is not a mirror, or if we're replacing, insert the new 2192 * mirror/replacing/spare vdev above oldvd. 2193 */ 2194 if (pvd->vdev_ops != pvops) 2195 pvd = vdev_add_parent(oldvd, pvops); 2196 2197 ASSERT(pvd->vdev_top->vdev_parent == rvd); 2198 ASSERT(pvd->vdev_ops == pvops); 2199 ASSERT(oldvd->vdev_parent == pvd); 2200 2201 /* 2202 * Extract the new device from its root and add it to pvd. 2203 */ 2204 vdev_remove_child(newrootvd, newvd); 2205 newvd->vdev_id = pvd->vdev_children; 2206 vdev_add_child(pvd, newvd); 2207 2208 /* 2209 * If newvd is smaller than oldvd, but larger than its rsize, 2210 * the addition of newvd may have decreased our parent's asize. 2211 */ 2212 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 2213 2214 tvd = newvd->vdev_top; 2215 ASSERT(pvd->vdev_top == tvd); 2216 ASSERT(tvd->vdev_parent == rvd); 2217 2218 vdev_config_dirty(tvd); 2219 2220 /* 2221 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2222 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2223 */ 2224 open_txg = txg + TXG_CONCURRENT_STATES - 1; 2225 2226 mutex_enter(&newvd->vdev_dtl_lock); 2227 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2228 open_txg - TXG_INITIAL + 1); 2229 mutex_exit(&newvd->vdev_dtl_lock); 2230 2231 if (newvd->vdev_isspare) 2232 spa_spare_activate(newvd); 2233 2234 /* 2235 * Mark newvd's DTL dirty in this txg. 2236 */ 2237 vdev_dirty(tvd, VDD_DTL, newvd, txg); 2238 2239 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2240 2241 /* 2242 * Kick off a resilver to update newvd. We need to grab the namespace 2243 * lock because spa_scrub() needs to post a sysevent with the pool name. 2244 */ 2245 mutex_enter(&spa_namespace_lock); 2246 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2247 mutex_exit(&spa_namespace_lock); 2248 2249 return (0); 2250 } 2251 2252 /* 2253 * Detach a device from a mirror or replacing vdev. 2254 * If 'replace_done' is specified, only detach if the parent 2255 * is a replacing vdev. 2256 */ 2257 int 2258 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2259 { 2260 uint64_t txg; 2261 int c, t, error; 2262 vdev_t *rvd = spa->spa_root_vdev; 2263 vdev_t *vd, *pvd, *cvd, *tvd; 2264 boolean_t unspare = B_FALSE; 2265 uint64_t unspare_guid; 2266 2267 txg = spa_vdev_enter(spa); 2268 2269 vd = vdev_lookup_by_guid(rvd, guid); 2270 2271 if (vd == NULL) 2272 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2273 2274 if (!vd->vdev_ops->vdev_op_leaf) 2275 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2276 2277 pvd = vd->vdev_parent; 2278 2279 /* 2280 * If replace_done is specified, only remove this device if it's 2281 * the first child of a replacing vdev. For the 'spare' vdev, either 2282 * disk can be removed. 
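	 * (This is how spa_vdev_resilver_done() requests the automatic
	 * detach: with replace_done set, the call fails with ENOTSUP unless
	 * the parent really is a 'replacing' or 'spare' vdev.)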
2283 */ 2284 if (replace_done) { 2285 if (pvd->vdev_ops == &vdev_replacing_ops) { 2286 if (vd->vdev_id != 0) 2287 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2288 } else if (pvd->vdev_ops != &vdev_spare_ops) { 2289 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2290 } 2291 } 2292 2293 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 2294 spa_version(spa) >= SPA_VERSION_SPARES); 2295 2296 /* 2297 * Only mirror, replacing, and spare vdevs support detach. 2298 */ 2299 if (pvd->vdev_ops != &vdev_replacing_ops && 2300 pvd->vdev_ops != &vdev_mirror_ops && 2301 pvd->vdev_ops != &vdev_spare_ops) 2302 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2303 2304 /* 2305 * If there's only one replica, you can't detach it. 2306 */ 2307 if (pvd->vdev_children <= 1) 2308 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2309 2310 /* 2311 * If all siblings have non-empty DTLs, this device may have the only 2312 * valid copy of the data, which means we cannot safely detach it. 2313 * 2314 * XXX -- as in the vdev_offline() case, we really want a more 2315 * precise DTL check. 2316 */ 2317 for (c = 0; c < pvd->vdev_children; c++) { 2318 uint64_t dirty; 2319 2320 cvd = pvd->vdev_child[c]; 2321 if (cvd == vd) 2322 continue; 2323 if (vdev_is_dead(cvd)) 2324 continue; 2325 mutex_enter(&cvd->vdev_dtl_lock); 2326 dirty = cvd->vdev_dtl_map.sm_space | 2327 cvd->vdev_dtl_scrub.sm_space; 2328 mutex_exit(&cvd->vdev_dtl_lock); 2329 if (!dirty) 2330 break; 2331 } 2332 2333 /* 2334 * If we are a replacing or spare vdev, then we can always detach the 2335 * latter child, as that is how one cancels the operation. 2336 */ 2337 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 2338 c == pvd->vdev_children) 2339 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2340 2341 /* 2342 * If we are detaching the original disk from a spare, then it implies 2343 * that the spare should become a real disk, and be removed from the 2344 * active spare list for the pool. 2345 */ 2346 if (pvd->vdev_ops == &vdev_spare_ops && 2347 vd->vdev_id == 0) 2348 unspare = B_TRUE; 2349 2350 /* 2351 * Erase the disk labels so the disk can be used for other things. 2352 * This must be done after all other error cases are handled, 2353 * but before we disembowel vd (so we can still do I/O to it). 2354 * But if we can't do it, don't treat the error as fatal -- 2355 * it may be that the unwritability of the disk is the reason 2356 * it's being detached! 2357 */ 2358 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 2359 2360 /* 2361 * Remove vd from its parent and compact the parent's children. 2362 */ 2363 vdev_remove_child(pvd, vd); 2364 vdev_compact_children(pvd); 2365 2366 /* 2367 * Remember one of the remaining children so we can get tvd below. 2368 */ 2369 cvd = pvd->vdev_child[0]; 2370 2371 /* 2372 * If we need to remove the remaining child from the list of hot spares, 2373 * do it now, marking the vdev as no longer a spare in the process. We 2374 * must do this before vdev_remove_parent(), because that can change the 2375 * GUID if it creates a new toplevel GUID. 2376 */ 2377 if (unspare) { 2378 ASSERT(cvd->vdev_isspare); 2379 spa_spare_remove(cvd); 2380 unspare_guid = cvd->vdev_guid; 2381 } 2382 2383 /* 2384 * If the parent mirror/replacing vdev only has one child, 2385 * the parent is no longer needed. Remove it from the tree. 2386 */ 2387 if (pvd->vdev_children == 1) 2388 vdev_remove_parent(cvd); 2389 2390 /* 2391 * We don't set tvd until now because the parent we just removed 2392 * may have been the previous top-level vdev. 
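	 * For example, detaching B from a two-way top-level mirror(A, B)
	 * collapses the mirror, and A itself becomes the top-level vdev.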
2393 */ 2394 tvd = cvd->vdev_top; 2395 ASSERT(tvd->vdev_parent == rvd); 2396 2397 /* 2398 * Reevaluate the parent vdev state. 2399 */ 2400 vdev_propagate_state(cvd); 2401 2402 /* 2403 * If the device we just detached was smaller than the others, it may be 2404 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2405 * can't fail because the existing metaslabs are already in core, so 2406 * there's nothing to read from disk. 2407 */ 2408 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2409 2410 vdev_config_dirty(tvd); 2411 2412 /* 2413 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2414 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2415 * But first make sure we're not on any *other* txg's DTL list, to 2416 * prevent vd from being accessed after it's freed. 2417 */ 2418 for (t = 0; t < TXG_SIZE; t++) 2419 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2420 vd->vdev_detached = B_TRUE; 2421 vdev_dirty(tvd, VDD_DTL, vd, txg); 2422 2423 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 2424 2425 error = spa_vdev_exit(spa, vd, txg, 0); 2426 2427 /* 2428 * If this was the removal of the original device in a hot spare vdev, 2429 * then we want to go through and remove the device from the hot spare 2430 * list of every other pool. 2431 */ 2432 if (unspare) { 2433 spa = NULL; 2434 mutex_enter(&spa_namespace_lock); 2435 while ((spa = spa_next(spa)) != NULL) { 2436 if (spa->spa_state != POOL_STATE_ACTIVE) 2437 continue; 2438 2439 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2440 } 2441 mutex_exit(&spa_namespace_lock); 2442 } 2443 2444 return (error); 2445 } 2446 2447 /* 2448 * Remove a device from the pool. Currently, this supports removing only hot 2449 * spares. 2450 */ 2451 int 2452 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2453 { 2454 vdev_t *vd; 2455 nvlist_t **spares, *nv, **newspares; 2456 uint_t i, j, nspares; 2457 int ret = 0; 2458 2459 spa_config_enter(spa, RW_WRITER, FTAG); 2460 2461 vd = spa_lookup_by_guid(spa, guid); 2462 2463 nv = NULL; 2464 if (spa->spa_spares != NULL && 2465 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2466 &spares, &nspares) == 0) { 2467 for (i = 0; i < nspares; i++) { 2468 uint64_t theguid; 2469 2470 VERIFY(nvlist_lookup_uint64(spares[i], 2471 ZPOOL_CONFIG_GUID, &theguid) == 0); 2472 if (theguid == guid) { 2473 nv = spares[i]; 2474 break; 2475 } 2476 } 2477 } 2478 2479 /* 2480 * We only support removing a hot spare, and only if it's not currently 2481 * in use in this pool. 
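	 * The checks below sort out the (nv, vd) combinations: the guid
	 * matches nothing at all (ENOENT), it names an ordinary pool vdev
	 * rather than a spare (ENOTSUP), or it names a spare that is
	 * currently wired into this pool and 'unspare' was not requested
	 * (EBUSY).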
2482	 */
2483	if (nv == NULL && vd == NULL) {
2484		ret = ENOENT;
2485		goto out;
2486	}
2487
2488	if (nv == NULL && vd != NULL) {
2489		ret = ENOTSUP;
2490		goto out;
2491	}
2492
2493	if (!unspare && nv != NULL && vd != NULL) {
2494		ret = EBUSY;
2495		goto out;
2496	}
2497
2498	if (nspares == 1) {
2499		newspares = NULL;
2500	} else {
2501		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
2502		    KM_SLEEP);
2503		for (i = 0, j = 0; i < nspares; i++) {
2504			if (spares[i] != nv)
2505				VERIFY(nvlist_dup(spares[i],
2506				    &newspares[j++], KM_SLEEP) == 0);
2507		}
2508	}
2509
2510	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2511	    DATA_TYPE_NVLIST_ARRAY) == 0);
2512	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2513	    newspares, nspares - 1) == 0);
2514	for (i = 0; i < nspares - 1; i++)
2515		nvlist_free(newspares[i]);
2516	kmem_free(newspares, (nspares - 1) * sizeof (void *));
2517	spa_load_spares(spa);
2518	spa->spa_sync_spares = B_TRUE;
2519
2520 out:
2521	spa_config_exit(spa, FTAG);
2522
2523	return (ret);
2524 }
2525
2526 /*
2527  * Find any device that's done replacing, or a vdev marked 'unspare' that's
2528  * currently spared, so we can detach it.
2529  */
2530 static vdev_t *
2531 spa_vdev_resilver_done_hunt(vdev_t *vd)
2532 {
2533	vdev_t *newvd, *oldvd;
2534	int c;
2535
2536	for (c = 0; c < vd->vdev_children; c++) {
2537		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
2538		if (oldvd != NULL)
2539			return (oldvd);
2540	}
2541
2542	/*
2543	 * Check for a completed replacement.
2544	 */
2545	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
2546		oldvd = vd->vdev_child[0];
2547		newvd = vd->vdev_child[1];
2548
2549		mutex_enter(&newvd->vdev_dtl_lock);
2550		if (newvd->vdev_dtl_map.sm_space == 0 &&
2551		    newvd->vdev_dtl_scrub.sm_space == 0) {
2552			mutex_exit(&newvd->vdev_dtl_lock);
2553			return (oldvd);
2554		}
2555		mutex_exit(&newvd->vdev_dtl_lock);
2556	}
2557
2558	/*
2559	 * Check for a completed resilver with the 'unspare' flag set.
2560	 */
2561	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
2562		newvd = vd->vdev_child[0];
2563		oldvd = vd->vdev_child[1];
2564
2565		mutex_enter(&newvd->vdev_dtl_lock);
2566		if (newvd->vdev_unspare &&
2567		    newvd->vdev_dtl_map.sm_space == 0 &&
2568		    newvd->vdev_dtl_scrub.sm_space == 0) {
2569			newvd->vdev_unspare = 0;
2570			mutex_exit(&newvd->vdev_dtl_lock);
2571			return (oldvd);
2572		}
2573		mutex_exit(&newvd->vdev_dtl_lock);
2574	}
2575
2576	return (NULL);
2577 }
2578
2579 static void
2580 spa_vdev_resilver_done(spa_t *spa)
2581 {
2582	vdev_t *vd;
2583	vdev_t *pvd;
2584	uint64_t guid;
2585	uint64_t pguid = 0;
2586
2587	spa_config_enter(spa, RW_READER, FTAG);
2588
2589	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
2590		guid = vd->vdev_guid;
2591		/*
2592		 * If we have just finished replacing a hot spared device, then
2593		 * we need to detach the spare vdev's other child (the original
2594		 * hot spare) as well.
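		 *
		 * For example, if a failed disk was hot-spared and then
		 * administratively replaced, the tree looks like
		 * spare(replacing(old, new), sparedev).  When 'new' finishes
		 * resilvering we detach 'old' here, and then detach
		 * 'sparedev' as well so it goes back to being an available
		 * hot spare.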
2595 */ 2596 pvd = vd->vdev_parent; 2597 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2598 pvd->vdev_id == 0) { 2599 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2600 ASSERT(pvd->vdev_parent->vdev_children == 2); 2601 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2602 } 2603 spa_config_exit(spa, FTAG); 2604 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2605 return; 2606 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2607 return; 2608 spa_config_enter(spa, RW_READER, FTAG); 2609 } 2610 2611 spa_config_exit(spa, FTAG); 2612 } 2613 2614 /* 2615 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2616 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2617 */ 2618 int 2619 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2620 { 2621 vdev_t *rvd, *vd; 2622 uint64_t txg; 2623 2624 rvd = spa->spa_root_vdev; 2625 2626 txg = spa_vdev_enter(spa); 2627 2628 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2629 /* 2630 * Determine if this is a reference to a hot spare. In that 2631 * case, update the path as stored in the spare list. 2632 */ 2633 nvlist_t **spares; 2634 uint_t i, nspares; 2635 if (spa->spa_sparelist != NULL) { 2636 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2637 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2638 for (i = 0; i < nspares; i++) { 2639 uint64_t theguid; 2640 VERIFY(nvlist_lookup_uint64(spares[i], 2641 ZPOOL_CONFIG_GUID, &theguid) == 0); 2642 if (theguid == guid) 2643 break; 2644 } 2645 2646 if (i == nspares) 2647 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2648 2649 VERIFY(nvlist_add_string(spares[i], 2650 ZPOOL_CONFIG_PATH, newpath) == 0); 2651 spa_load_spares(spa); 2652 spa->spa_sync_spares = B_TRUE; 2653 return (spa_vdev_exit(spa, NULL, txg, 0)); 2654 } else { 2655 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2656 } 2657 } 2658 2659 if (!vd->vdev_ops->vdev_op_leaf) 2660 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2661 2662 spa_strfree(vd->vdev_path); 2663 vd->vdev_path = spa_strdup(newpath); 2664 2665 vdev_config_dirty(vd->vdev_top); 2666 2667 return (spa_vdev_exit(spa, NULL, txg, 0)); 2668 } 2669 2670 /* 2671 * ========================================================================== 2672 * SPA Scrubbing 2673 * ========================================================================== 2674 */ 2675 2676 static void 2677 spa_scrub_io_done(zio_t *zio) 2678 { 2679 spa_t *spa = zio->io_spa; 2680 2681 arc_data_buf_free(zio->io_data, zio->io_size); 2682 2683 mutex_enter(&spa->spa_scrub_lock); 2684 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2685 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2686 spa->spa_scrub_errors++; 2687 mutex_enter(&vd->vdev_stat_lock); 2688 vd->vdev_stat.vs_scrub_errors++; 2689 mutex_exit(&vd->vdev_stat_lock); 2690 } 2691 2692 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2693 cv_broadcast(&spa->spa_scrub_io_cv); 2694 2695 ASSERT(spa->spa_scrub_inflight >= 0); 2696 2697 mutex_exit(&spa->spa_scrub_lock); 2698 } 2699 2700 static void 2701 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2702 zbookmark_t *zb) 2703 { 2704 size_t size = BP_GET_LSIZE(bp); 2705 void *data; 2706 2707 mutex_enter(&spa->spa_scrub_lock); 2708 /* 2709 * Do not give too much work to vdev(s). 
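	 * spa_scrub_maxinflight bounds the number of scrub reads outstanding
	 * at once; spa_scrub_io_done() signals spa_scrub_io_cv as soon as
	 * the count drops back below that limit.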
2710 */ 2711 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2712 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2713 } 2714 spa->spa_scrub_inflight++; 2715 mutex_exit(&spa->spa_scrub_lock); 2716 2717 data = arc_data_buf_alloc(size); 2718 2719 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2720 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2721 2722 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2723 2724 zio_nowait(zio_read(NULL, spa, bp, data, size, 2725 spa_scrub_io_done, NULL, priority, flags, zb)); 2726 } 2727 2728 /* ARGSUSED */ 2729 static int 2730 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2731 { 2732 blkptr_t *bp = &bc->bc_blkptr; 2733 vdev_t *vd = spa->spa_root_vdev; 2734 dva_t *dva = bp->blk_dva; 2735 int needs_resilver = B_FALSE; 2736 int d; 2737 2738 if (bc->bc_errno) { 2739 /* 2740 * We can't scrub this block, but we can continue to scrub 2741 * the rest of the pool. Note the error and move along. 2742 */ 2743 mutex_enter(&spa->spa_scrub_lock); 2744 spa->spa_scrub_errors++; 2745 mutex_exit(&spa->spa_scrub_lock); 2746 2747 mutex_enter(&vd->vdev_stat_lock); 2748 vd->vdev_stat.vs_scrub_errors++; 2749 mutex_exit(&vd->vdev_stat_lock); 2750 2751 return (ERESTART); 2752 } 2753 2754 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2755 2756 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2757 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2758 2759 ASSERT(vd != NULL); 2760 2761 /* 2762 * Keep track of how much data we've examined so that 2763 * zpool(1M) status can make useful progress reports. 2764 */ 2765 mutex_enter(&vd->vdev_stat_lock); 2766 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2767 mutex_exit(&vd->vdev_stat_lock); 2768 2769 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2770 if (DVA_GET_GANG(&dva[d])) { 2771 /* 2772 * Gang members may be spread across multiple 2773 * vdevs, so the best we can do is look at the 2774 * pool-wide DTL. 2775 * XXX -- it would be better to change our 2776 * allocation policy to ensure that this can't 2777 * happen. 2778 */ 2779 vd = spa->spa_root_vdev; 2780 } 2781 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2782 bp->blk_birth, 1)) 2783 needs_resilver = B_TRUE; 2784 } 2785 } 2786 2787 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2788 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2789 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2790 else if (needs_resilver) 2791 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2792 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2793 2794 return (0); 2795 } 2796 2797 static void 2798 spa_scrub_thread(spa_t *spa) 2799 { 2800 callb_cpr_t cprinfo; 2801 traverse_handle_t *th = spa->spa_scrub_th; 2802 vdev_t *rvd = spa->spa_root_vdev; 2803 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2804 int error = 0; 2805 boolean_t complete; 2806 2807 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2808 2809 /* 2810 * If we're restarting due to a snapshot create/delete, 2811 * wait for that to complete. 2812 */ 2813 txg_wait_synced(spa_get_dsl(spa), 0); 2814 2815 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2816 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2817 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2818 2819 spa_config_enter(spa, RW_WRITER, FTAG); 2820 vdev_reopen(rvd); /* purge all vdev caches */ 2821 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2822 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2823 spa_config_exit(spa, FTAG); 2824 2825 mutex_enter(&spa->spa_scrub_lock); 2826 spa->spa_scrub_errors = 0; 2827 spa->spa_scrub_active = 1; 2828 ASSERT(spa->spa_scrub_inflight == 0); 2829 2830 while (!spa->spa_scrub_stop) { 2831 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2832 while (spa->spa_scrub_suspended) { 2833 spa->spa_scrub_active = 0; 2834 cv_broadcast(&spa->spa_scrub_cv); 2835 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2836 spa->spa_scrub_active = 1; 2837 } 2838 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2839 2840 if (spa->spa_scrub_restart_txg != 0) 2841 break; 2842 2843 mutex_exit(&spa->spa_scrub_lock); 2844 error = traverse_more(th); 2845 mutex_enter(&spa->spa_scrub_lock); 2846 if (error != EAGAIN) 2847 break; 2848 } 2849 2850 while (spa->spa_scrub_inflight) 2851 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2852 2853 spa->spa_scrub_active = 0; 2854 cv_broadcast(&spa->spa_scrub_cv); 2855 2856 mutex_exit(&spa->spa_scrub_lock); 2857 2858 spa_config_enter(spa, RW_WRITER, FTAG); 2859 2860 mutex_enter(&spa->spa_scrub_lock); 2861 2862 /* 2863 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2864 * AND the spa config lock to synchronize with any config changes 2865 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2866 */ 2867 if (spa->spa_scrub_restart_txg != 0) 2868 error = ERESTART; 2869 2870 if (spa->spa_scrub_stop) 2871 error = EINTR; 2872 2873 /* 2874 * Even if there were uncorrectable errors, we consider the scrub 2875 * completed. The downside is that if there is a transient error during 2876 * a resilver, we won't resilver the data properly to the target. But 2877 * if the damage is permanent (more likely) we will resilver forever, 2878 * which isn't really acceptable. Since there is enough information for 2879 * the user to know what has failed and why, this seems like a more 2880 * tractable approach. 2881 */ 2882 complete = (error == 0); 2883 2884 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2885 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2886 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2887 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2888 2889 mutex_exit(&spa->spa_scrub_lock); 2890 2891 /* 2892 * If the scrub/resilver completed, update all DTLs to reflect this. 2893 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2894 */ 2895 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2896 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2897 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2898 spa_errlog_rotate(spa); 2899 2900 if (scrub_type == POOL_SCRUB_RESILVER && complete) 2901 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 2902 2903 spa_config_exit(spa, FTAG); 2904 2905 mutex_enter(&spa->spa_scrub_lock); 2906 2907 /* 2908 * We may have finished replacing a device. 2909 * Let the async thread assess this and handle the detach. 2910 */ 2911 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2912 2913 /* 2914 * If we were told to restart, our final act is to start a new scrub. 2915 */ 2916 if (error == ERESTART) 2917 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2918 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2919 2920 spa->spa_scrub_type = POOL_SCRUB_NONE; 2921 spa->spa_scrub_active = 0; 2922 spa->spa_scrub_thread = NULL; 2923 cv_broadcast(&spa->spa_scrub_cv); 2924 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2925 thread_exit(); 2926 } 2927 2928 void 2929 spa_scrub_suspend(spa_t *spa) 2930 { 2931 mutex_enter(&spa->spa_scrub_lock); 2932 spa->spa_scrub_suspended++; 2933 while (spa->spa_scrub_active) { 2934 cv_broadcast(&spa->spa_scrub_cv); 2935 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2936 } 2937 while (spa->spa_scrub_inflight) 2938 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2939 mutex_exit(&spa->spa_scrub_lock); 2940 } 2941 2942 void 2943 spa_scrub_resume(spa_t *spa) 2944 { 2945 mutex_enter(&spa->spa_scrub_lock); 2946 ASSERT(spa->spa_scrub_suspended != 0); 2947 if (--spa->spa_scrub_suspended == 0) 2948 cv_broadcast(&spa->spa_scrub_cv); 2949 mutex_exit(&spa->spa_scrub_lock); 2950 } 2951 2952 void 2953 spa_scrub_restart(spa_t *spa, uint64_t txg) 2954 { 2955 /* 2956 * Something happened (e.g. snapshot create/delete) that means 2957 * we must restart any in-progress scrubs. The itinerary will 2958 * fix this properly. 2959 */ 2960 mutex_enter(&spa->spa_scrub_lock); 2961 spa->spa_scrub_restart_txg = txg; 2962 mutex_exit(&spa->spa_scrub_lock); 2963 } 2964 2965 int 2966 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2967 { 2968 space_seg_t *ss; 2969 uint64_t mintxg, maxtxg; 2970 vdev_t *rvd = spa->spa_root_vdev; 2971 2972 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2973 ASSERT(!spa_config_held(spa, RW_WRITER)); 2974 2975 if ((uint_t)type >= POOL_SCRUB_TYPES) 2976 return (ENOTSUP); 2977 2978 mutex_enter(&spa->spa_scrub_lock); 2979 2980 /* 2981 * If there's a scrub or resilver already in progress, stop it. 2982 */ 2983 while (spa->spa_scrub_thread != NULL) { 2984 /* 2985 * Don't stop a resilver unless forced. 2986 */ 2987 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2988 mutex_exit(&spa->spa_scrub_lock); 2989 return (EBUSY); 2990 } 2991 spa->spa_scrub_stop = 1; 2992 cv_broadcast(&spa->spa_scrub_cv); 2993 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2994 } 2995 2996 /* 2997 * Terminate the previous traverse. 2998 */ 2999 if (spa->spa_scrub_th != NULL) { 3000 traverse_fini(spa->spa_scrub_th); 3001 spa->spa_scrub_th = NULL; 3002 } 3003 3004 if (rvd == NULL) { 3005 ASSERT(spa->spa_scrub_stop == 0); 3006 ASSERT(spa->spa_scrub_type == type); 3007 ASSERT(spa->spa_scrub_restart_txg == 0); 3008 mutex_exit(&spa->spa_scrub_lock); 3009 return (0); 3010 } 3011 3012 mintxg = TXG_INITIAL - 1; 3013 maxtxg = spa_last_synced_txg(spa) + 1; 3014 3015 mutex_enter(&rvd->vdev_dtl_lock); 3016 3017 if (rvd->vdev_dtl_map.sm_space == 0) { 3018 /* 3019 * The pool-wide DTL is empty. 3020 * If this is a resilver, there's nothing to do except 3021 * check whether any in-progress replacements have completed. 3022 */ 3023 if (type == POOL_SCRUB_RESILVER) { 3024 type = POOL_SCRUB_NONE; 3025 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3026 } 3027 } else { 3028 /* 3029 * The pool-wide DTL is non-empty. 3030 * If this is a normal scrub, upgrade to a resilver instead. 3031 */ 3032 if (type == POOL_SCRUB_EVERYTHING) 3033 type = POOL_SCRUB_RESILVER; 3034 } 3035 3036 if (type == POOL_SCRUB_RESILVER) { 3037 /* 3038 * Determine the resilvering boundaries. 3039 * 3040 * Note: (mintxg, maxtxg) is an open interval, 3041 * i.e. mintxg and maxtxg themselves are not included. 
3042 * 3043 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 3044 * so we don't claim to resilver a txg that's still changing. 3045 */ 3046 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 3047 mintxg = ss->ss_start - 1; 3048 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 3049 maxtxg = MIN(ss->ss_end, maxtxg); 3050 3051 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 3052 } 3053 3054 mutex_exit(&rvd->vdev_dtl_lock); 3055 3056 spa->spa_scrub_stop = 0; 3057 spa->spa_scrub_type = type; 3058 spa->spa_scrub_restart_txg = 0; 3059 3060 if (type != POOL_SCRUB_NONE) { 3061 spa->spa_scrub_mintxg = mintxg; 3062 spa->spa_scrub_maxtxg = maxtxg; 3063 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 3064 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 3065 ZIO_FLAG_CANFAIL); 3066 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 3067 spa->spa_scrub_thread = thread_create(NULL, 0, 3068 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 3069 } 3070 3071 mutex_exit(&spa->spa_scrub_lock); 3072 3073 return (0); 3074 } 3075 3076 /* 3077 * ========================================================================== 3078 * SPA async task processing 3079 * ========================================================================== 3080 */ 3081 3082 static void 3083 spa_async_remove(spa_t *spa, vdev_t *vd) 3084 { 3085 vdev_t *tvd; 3086 int c; 3087 3088 for (c = 0; c < vd->vdev_children; c++) { 3089 tvd = vd->vdev_child[c]; 3090 if (tvd->vdev_remove_wanted) { 3091 tvd->vdev_remove_wanted = 0; 3092 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 3093 VDEV_AUX_NONE); 3094 vdev_clear(spa, tvd); 3095 vdev_config_dirty(tvd->vdev_top); 3096 } 3097 spa_async_remove(spa, tvd); 3098 } 3099 } 3100 3101 static void 3102 spa_async_thread(spa_t *spa) 3103 { 3104 int tasks; 3105 uint64_t txg; 3106 3107 ASSERT(spa->spa_sync_on); 3108 3109 mutex_enter(&spa->spa_async_lock); 3110 tasks = spa->spa_async_tasks; 3111 spa->spa_async_tasks = 0; 3112 mutex_exit(&spa->spa_async_lock); 3113 3114 /* 3115 * See if the config needs to be updated. 3116 */ 3117 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3118 mutex_enter(&spa_namespace_lock); 3119 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3120 mutex_exit(&spa_namespace_lock); 3121 } 3122 3123 /* 3124 * See if any devices need to be marked REMOVED. 3125 */ 3126 if (tasks & SPA_ASYNC_REMOVE) { 3127 txg = spa_vdev_enter(spa); 3128 spa_async_remove(spa, spa->spa_root_vdev); 3129 (void) spa_vdev_exit(spa, NULL, txg, 0); 3130 } 3131 3132 /* 3133 * If any devices are done replacing, detach them. 3134 */ 3135 if (tasks & SPA_ASYNC_RESILVER_DONE) 3136 spa_vdev_resilver_done(spa); 3137 3138 /* 3139 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 3140 * scrub which can become a resilver), we need to hold 3141 * spa_namespace_lock() because the sysevent we post via 3142 * spa_event_notify() needs to get the name of the pool. 3143 */ 3144 if (tasks & SPA_ASYNC_SCRUB) { 3145 mutex_enter(&spa_namespace_lock); 3146 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 3147 mutex_exit(&spa_namespace_lock); 3148 } 3149 3150 /* 3151 * Kick off a resilver. 3152 */ 3153 if (tasks & SPA_ASYNC_RESILVER) { 3154 mutex_enter(&spa_namespace_lock); 3155 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 3156 mutex_exit(&spa_namespace_lock); 3157 } 3158 3159 /* 3160 * Let the world know that we're done. 
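	 * (i.e. wake anyone blocked in spa_async_suspend() waiting for this
	 * thread to finish.)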
3161 */ 3162 mutex_enter(&spa->spa_async_lock); 3163 spa->spa_async_thread = NULL; 3164 cv_broadcast(&spa->spa_async_cv); 3165 mutex_exit(&spa->spa_async_lock); 3166 thread_exit(); 3167 } 3168 3169 void 3170 spa_async_suspend(spa_t *spa) 3171 { 3172 mutex_enter(&spa->spa_async_lock); 3173 spa->spa_async_suspended++; 3174 while (spa->spa_async_thread != NULL) 3175 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3176 mutex_exit(&spa->spa_async_lock); 3177 } 3178 3179 void 3180 spa_async_resume(spa_t *spa) 3181 { 3182 mutex_enter(&spa->spa_async_lock); 3183 ASSERT(spa->spa_async_suspended != 0); 3184 spa->spa_async_suspended--; 3185 mutex_exit(&spa->spa_async_lock); 3186 } 3187 3188 static void 3189 spa_async_dispatch(spa_t *spa) 3190 { 3191 mutex_enter(&spa->spa_async_lock); 3192 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3193 spa->spa_async_thread == NULL && 3194 rootdir != NULL && !vn_is_readonly(rootdir)) 3195 spa->spa_async_thread = thread_create(NULL, 0, 3196 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3197 mutex_exit(&spa->spa_async_lock); 3198 } 3199 3200 void 3201 spa_async_request(spa_t *spa, int task) 3202 { 3203 mutex_enter(&spa->spa_async_lock); 3204 spa->spa_async_tasks |= task; 3205 mutex_exit(&spa->spa_async_lock); 3206 } 3207 3208 /* 3209 * ========================================================================== 3210 * SPA syncing routines 3211 * ========================================================================== 3212 */ 3213 3214 static void 3215 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3216 { 3217 bplist_t *bpl = &spa->spa_sync_bplist; 3218 dmu_tx_t *tx; 3219 blkptr_t blk; 3220 uint64_t itor = 0; 3221 zio_t *zio; 3222 int error; 3223 uint8_t c = 1; 3224 3225 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3226 3227 while (bplist_iterate(bpl, &itor, &blk) == 0) 3228 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3229 3230 error = zio_wait(zio); 3231 ASSERT3U(error, ==, 0); 3232 3233 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3234 bplist_vacate(bpl, tx); 3235 3236 /* 3237 * Pre-dirty the first block so we sync to convergence faster. 3238 * (Usually only the first block is needed.) 3239 */ 3240 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3241 dmu_tx_commit(tx); 3242 } 3243 3244 static void 3245 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3246 { 3247 char *packed = NULL; 3248 size_t nvsize = 0; 3249 dmu_buf_t *db; 3250 3251 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3252 3253 packed = kmem_alloc(nvsize, KM_SLEEP); 3254 3255 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3256 KM_SLEEP) == 0); 3257 3258 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 3259 3260 kmem_free(packed, nvsize); 3261 3262 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3263 dmu_buf_will_dirty(db, tx); 3264 *(uint64_t *)db->db_data = nvsize; 3265 dmu_buf_rele(db, FTAG); 3266 } 3267 3268 static void 3269 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 3270 { 3271 nvlist_t *nvroot; 3272 nvlist_t **spares; 3273 int i; 3274 3275 if (!spa->spa_sync_spares) 3276 return; 3277 3278 /* 3279 * Update the MOS nvlist describing the list of available spares. 3280 * spa_validate_spares() will have already made sure this nvlist is 3281 * valid and the vdevs are labeled appropriately. 
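	 * The list is stored as a packed XDR nvlist in a DMU_OT_PACKED_NVLIST
	 * object referenced from the pool directory under DMU_POOL_SPARES
	 * (see spa_sync_nvlist()).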
3282	 */
3283	if (spa->spa_spares_object == 0) {
3284		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
3285		    DMU_OT_PACKED_NVLIST, 1 << 14,
3286		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3287		VERIFY(zap_update(spa->spa_meta_objset,
3288		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
3289		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
3290	}
3291
3292	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3293	if (spa->spa_nspares == 0) {
3294		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3295		    NULL, 0) == 0);
3296	} else {
3297		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
3298		    KM_SLEEP);
3299		for (i = 0; i < spa->spa_nspares; i++)
3300			spares[i] = vdev_config_generate(spa,
3301			    spa->spa_spares[i], B_FALSE, B_TRUE);
3302		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3303		    spares, spa->spa_nspares) == 0);
3304		for (i = 0; i < spa->spa_nspares; i++)
3305			nvlist_free(spares[i]);
3306		kmem_free(spares, spa->spa_nspares * sizeof (void *));
3307	}
3308
3309	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
3310	nvlist_free(nvroot);
3311
3312	spa->spa_sync_spares = B_FALSE;
3313 }
3314
3315 static void
3316 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3317 {
3318	nvlist_t *config;
3319
3320	if (list_is_empty(&spa->spa_dirty_list))
3321		return;
3322
3323	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
3324
3325	if (spa->spa_config_syncing)
3326		nvlist_free(spa->spa_config_syncing);
3327	spa->spa_config_syncing = config;
3328
3329	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
3330 }
3331
3332 /*
3333  * Set zpool properties.
3334  */
3335 static void
3336 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3337 {
3338	spa_t *spa = arg1;
3339	objset_t *mos = spa->spa_meta_objset;
3340	nvlist_t *nvp = arg2;
3341	nvpair_t *elem;
3342	uint64_t intval;
3343	char *strval;
3344	zpool_prop_t prop;
3345	const char *propname;
3346	zprop_type_t proptype;
3347
3348	elem = NULL;
3349	while ((elem = nvlist_next_nvpair(nvp, elem))) {
3350		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3351		case ZPOOL_PROP_VERSION:
3352			/*
3353			 * Only set version for non-zpool-creation cases
3354			 * (set/import). spa_create() needs special care
3355			 * for version setting.
3356			 */
3357			if (tx->tx_txg != TXG_INITIAL) {
3358				VERIFY(nvpair_value_uint64(elem,
3359				    &intval) == 0);
3360				ASSERT(intval <= SPA_VERSION);
3361				ASSERT(intval >= spa_version(spa));
3362				spa->spa_uberblock.ub_version = intval;
3363				vdev_config_dirty(spa->spa_root_vdev);
3364			}
3365			break;
3366
3367		case ZPOOL_PROP_ALTROOT:
3368			/*
3369			 * 'altroot' is a non-persistent property. It should
3370			 * have been set temporarily at creation or import time.
3371			 */
3372			ASSERT(spa->spa_root != NULL);
3373			break;
3374
3375		case ZPOOL_PROP_TEMPORARY:
3376			/*
3377			 * 'temporary' is a non-persistent property.
3378			 */
3379			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
3380			spa->spa_temporary = intval;
3381			break;
3382
3383		default:
3384			/*
3385			 * Set pool property values in the poolprops mos object.
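			 * The properties live in a ZAP object
			 * (DMU_OT_POOL_PROPS) hung off the pool directory;
			 * strings are stored as string attributes and numbers
			 * as single uint64 attributes, with index properties
			 * validated against their legal values first.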
3386 */ 3387 mutex_enter(&spa->spa_props_lock); 3388 if (spa->spa_pool_props_object == 0) { 3389 objset_t *mos = spa->spa_meta_objset; 3390 3391 VERIFY((spa->spa_pool_props_object = 3392 zap_create(mos, DMU_OT_POOL_PROPS, 3393 DMU_OT_NONE, 0, tx)) > 0); 3394 3395 VERIFY(zap_update(mos, 3396 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 3397 8, 1, &spa->spa_pool_props_object, tx) 3398 == 0); 3399 } 3400 mutex_exit(&spa->spa_props_lock); 3401 3402 /* normalize the property name */ 3403 propname = zpool_prop_to_name(prop); 3404 proptype = zpool_prop_get_type(prop); 3405 3406 if (nvpair_type(elem) == DATA_TYPE_STRING) { 3407 ASSERT(proptype == PROP_TYPE_STRING); 3408 VERIFY(nvpair_value_string(elem, &strval) == 0); 3409 VERIFY(zap_update(mos, 3410 spa->spa_pool_props_object, propname, 3411 1, strlen(strval) + 1, strval, tx) == 0); 3412 3413 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 3414 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 3415 3416 if (proptype == PROP_TYPE_INDEX) { 3417 const char *unused; 3418 VERIFY(zpool_prop_index_to_string( 3419 prop, intval, &unused) == 0); 3420 } 3421 VERIFY(zap_update(mos, 3422 spa->spa_pool_props_object, propname, 3423 8, 1, &intval, tx) == 0); 3424 } else { 3425 ASSERT(0); /* not allowed */ 3426 } 3427 3428 if (prop == ZPOOL_PROP_DELEGATION) 3429 spa->spa_delegation = intval; 3430 3431 if (prop == ZPOOL_PROP_BOOTFS) 3432 spa->spa_bootfs = intval; 3433 } 3434 3435 /* log internal history if this is not a zpool create */ 3436 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 3437 tx->tx_txg != TXG_INITIAL) { 3438 spa_history_internal_log(LOG_POOL_PROPSET, 3439 spa, tx, cr, "%s %lld %s", 3440 nvpair_name(elem), intval, spa->spa_name); 3441 } 3442 } 3443 } 3444 3445 /* 3446 * Sync the specified transaction group. New blocks may be dirtied as 3447 * part of the process, so we iterate until it converges. 3448 */ 3449 void 3450 spa_sync(spa_t *spa, uint64_t txg) 3451 { 3452 dsl_pool_t *dp = spa->spa_dsl_pool; 3453 objset_t *mos = spa->spa_meta_objset; 3454 bplist_t *bpl = &spa->spa_sync_bplist; 3455 vdev_t *rvd = spa->spa_root_vdev; 3456 vdev_t *vd; 3457 dmu_tx_t *tx; 3458 int dirty_vdevs; 3459 3460 /* 3461 * Lock out configuration changes. 3462 */ 3463 spa_config_enter(spa, RW_READER, FTAG); 3464 3465 spa->spa_syncing_txg = txg; 3466 spa->spa_sync_pass = 0; 3467 3468 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3469 3470 tx = dmu_tx_create_assigned(dp, txg); 3471 3472 /* 3473 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 3474 * set spa_deflate if we have no raid-z vdevs. 3475 */ 3476 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 3477 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 3478 int i; 3479 3480 for (i = 0; i < rvd->vdev_children; i++) { 3481 vd = rvd->vdev_child[i]; 3482 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 3483 break; 3484 } 3485 if (i == rvd->vdev_children) { 3486 spa->spa_deflate = TRUE; 3487 VERIFY(0 == zap_add(spa->spa_meta_objset, 3488 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3489 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 3490 } 3491 } 3492 3493 /* 3494 * If anything has changed in this txg, push the deferred frees 3495 * from the previous txg. If not, leave them alone so that we 3496 * don't generate work on an otherwise idle system. 
3497	 */
3498	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
3499	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
3500	    !txg_list_empty(&dp->dp_sync_tasks, txg))
3501		spa_sync_deferred_frees(spa, txg);
3502
3503	/*
3504	 * Iterate to convergence.
3505	 */
3506	do {
3507		spa->spa_sync_pass++;
3508
3509		spa_sync_config_object(spa, tx);
3510		spa_sync_spares(spa, tx);
3511		spa_errlog_sync(spa, txg);
3512		dsl_pool_sync(dp, txg);
3513
3514		dirty_vdevs = 0;
3515		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
3516			vdev_sync(vd, txg);
3517			dirty_vdevs++;
3518		}
3519
3520		bplist_sync(bpl, tx);
3521	} while (dirty_vdevs);
3522
3523	bplist_close(bpl);
3524
3525	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
3526
3527	/*
3528	 * Rewrite the vdev configuration (which includes the uberblock)
3529	 * to commit the transaction group.
3530	 *
3531	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
3532	 * Otherwise, pick a random top-level vdev that's known to be
3533	 * visible in the config cache (see spa_vdev_add() for details).
3534	 * If the write fails, try the next vdev until we've tried them all.
3535	 */
3536	if (!list_is_empty(&spa->spa_dirty_list)) {
3537		VERIFY(vdev_config_sync(rvd, txg) == 0);
3538	} else {
3539		int children = rvd->vdev_children;
3540		int c0 = spa_get_random(children);
3541		int c;
3542
3543		for (c = 0; c < children; c++) {
3544			vd = rvd->vdev_child[(c0 + c) % children];
3545			if (vd->vdev_ms_array == 0)
3546				continue;
3547			if (vdev_config_sync(vd, txg) == 0)
3548				break;
3549		}
3550		if (c == children)
3551			VERIFY(vdev_config_sync(rvd, txg) == 0);
3552	}
3553
3554	dmu_tx_commit(tx);
3555
3556	/*
3557	 * Clear the dirty config list.
3558	 */
3559	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
3560		vdev_config_clean(vd);
3561
3562	/*
3563	 * Now that the new config has synced transactionally,
3564	 * let it become visible to the config cache.
3565	 */
3566	if (spa->spa_config_syncing != NULL) {
3567		spa_config_set(spa, spa->spa_config_syncing);
3568		spa->spa_config_txg = txg;
3569		spa->spa_config_syncing = NULL;
3570	}
3571
3572	/*
3573	 * Make a stable copy of the fully synced uberblock.
3574	 * We use this as the root for pool traversals.
3575	 */
3576	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
3577
3578	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
3579
3580	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
3581	spa->spa_traverse_wanted = 0;
3582	spa->spa_ubsync = spa->spa_uberblock;
3583	rw_exit(&spa->spa_traverse_lock);
3584
3585	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
3586
3587	/*
3588	 * Clean up the ZIL records for the synced txg.
3589	 */
3590	dsl_pool_zil_clean(dp);
3591
3592	/*
3593	 * Update usable space statistics.
3594	 */
3595	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
3596		vdev_sync_done(vd, txg);
3597
3598	/*
3599	 * It had better be the case that we didn't dirty anything
3600	 * since vdev_config_sync().
3601	 */
3602	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
3603	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
3604	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
3605	ASSERT(bpl->bpl_queue == NULL);
3606
3607	spa_config_exit(spa, FTAG);
3608
3609	/*
3610	 * If any async tasks have been requested, kick them off.
3611	 */
3612	spa_async_dispatch(spa);
3613 }
3614
3615 /*
3616  * Sync all pools. We don't want to hold the namespace lock across these
3617  * operations, so we take a reference on the spa_t and drop the lock during the
3618  * sync.
3619 */ 3620 void 3621 spa_sync_allpools(void) 3622 { 3623 spa_t *spa = NULL; 3624 mutex_enter(&spa_namespace_lock); 3625 while ((spa = spa_next(spa)) != NULL) { 3626 if (spa_state(spa) != POOL_STATE_ACTIVE) 3627 continue; 3628 spa_open_ref(spa, FTAG); 3629 mutex_exit(&spa_namespace_lock); 3630 txg_wait_synced(spa_get_dsl(spa), 0); 3631 mutex_enter(&spa_namespace_lock); 3632 spa_close(spa, FTAG); 3633 } 3634 mutex_exit(&spa_namespace_lock); 3635 } 3636 3637 /* 3638 * ========================================================================== 3639 * Miscellaneous routines 3640 * ========================================================================== 3641 */ 3642 3643 /* 3644 * Remove all pools in the system. 3645 */ 3646 void 3647 spa_evict_all(void) 3648 { 3649 spa_t *spa; 3650 3651 /* 3652 * Remove all cached state. All pools should be closed now, 3653 * so every spa in the AVL tree should be unreferenced. 3654 */ 3655 mutex_enter(&spa_namespace_lock); 3656 while ((spa = spa_next(NULL)) != NULL) { 3657 /* 3658 * Stop async tasks. The async thread may need to detach 3659 * a device that's been replaced, which requires grabbing 3660 * spa_namespace_lock, so we must drop it here. 3661 */ 3662 spa_open_ref(spa, FTAG); 3663 mutex_exit(&spa_namespace_lock); 3664 spa_async_suspend(spa); 3665 mutex_enter(&spa_namespace_lock); 3666 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3667 spa_close(spa, FTAG); 3668 3669 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3670 spa_unload(spa); 3671 spa_deactivate(spa); 3672 } 3673 spa_remove(spa); 3674 } 3675 mutex_exit(&spa_namespace_lock); 3676 } 3677 3678 vdev_t * 3679 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3680 { 3681 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3682 } 3683 3684 void 3685 spa_upgrade(spa_t *spa, uint64_t version) 3686 { 3687 spa_config_enter(spa, RW_WRITER, FTAG); 3688 3689 /* 3690 * This should only be called for a non-faulted pool, and since a 3691 * future version would result in an unopenable pool, this shouldn't be 3692 * possible. 3693 */ 3694 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 3695 ASSERT(version >= spa->spa_uberblock.ub_version); 3696 3697 spa->spa_uberblock.ub_version = version; 3698 vdev_config_dirty(spa->spa_root_vdev); 3699 3700 spa_config_exit(spa, FTAG); 3701 3702 txg_wait_synced(spa_get_dsl(spa), 0); 3703 } 3704 3705 boolean_t 3706 spa_has_spare(spa_t *spa, uint64_t guid) 3707 { 3708 int i; 3709 uint64_t spareguid; 3710 3711 for (i = 0; i < spa->spa_nspares; i++) 3712 if (spa->spa_spares[i]->vdev_guid == guid) 3713 return (B_TRUE); 3714 3715 for (i = 0; i < spa->spa_pending_nspares; i++) { 3716 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3717 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3718 spareguid == guid) 3719 return (B_TRUE); 3720 } 3721 3722 return (B_FALSE); 3723 } 3724 3725 /* 3726 * Post a sysevent corresponding to the given event. The 'name' must be one of 3727 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 3728 * filled in from the spa and (optionally) the vdev. This doesn't do anything 3729 * in the userland libzpool, as we don't want consumers to misinterpret ztest 3730 * or zdb as real changes. 
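 * The payload carries the pool name and guid, plus the vdev guid and path
 * when a vdev is supplied.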
3731 */ 3732 void 3733 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 3734 { 3735 #ifdef _KERNEL 3736 sysevent_t *ev; 3737 sysevent_attr_list_t *attr = NULL; 3738 sysevent_value_t value; 3739 sysevent_id_t eid; 3740 3741 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 3742 SE_SLEEP); 3743 3744 value.value_type = SE_DATA_TYPE_STRING; 3745 value.value.sv_string = spa_name(spa); 3746 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 3747 goto done; 3748 3749 value.value_type = SE_DATA_TYPE_UINT64; 3750 value.value.sv_uint64 = spa_guid(spa); 3751 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 3752 goto done; 3753 3754 if (vd) { 3755 value.value_type = SE_DATA_TYPE_UINT64; 3756 value.value.sv_uint64 = vd->vdev_guid; 3757 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 3758 SE_SLEEP) != 0) 3759 goto done; 3760 3761 if (vd->vdev_path) { 3762 value.value_type = SE_DATA_TYPE_STRING; 3763 value.value.sv_string = vd->vdev_path; 3764 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 3765 &value, SE_SLEEP) != 0) 3766 goto done; 3767 } 3768 } 3769 3770 (void) log_sysevent(ev, SE_SLEEP, &eid); 3771 3772 done: 3773 if (attr) 3774 sysevent_free_attr(attr); 3775 sysevent_free(ev); 3776 #endif 3777 } 3778