/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

#include "zfs_prop.h"

int zio_taskq_threads = 8;

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static int
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;
	int err = 0;

	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
		goto out;

	if (strval != NULL) {
		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
			goto out;
	} else {
		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
			goto out;
	}

	err = nvlist_add_nvlist(nvl, propname, propval);
out:
	nvlist_free(propval);
	return (err);
}
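/*
 * Added note (illustrative sketch, not part of the original code): for a
 * call such as
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE);
 *
 * the entry added to 'nvl' is a nested nvlist of the form
 *
 *	"capacity" -> {
 *		ZPROP_SOURCE = ZPROP_SRC_NONE,
 *		ZPROP_VALUE  = 42
 *	}
 *
 * String-valued properties store ZPROP_VALUE as a string instead of a
 * uint64.  The key name shown here assumes zpool_prop_to_name() maps
 * ZPOOL_PROP_CAPACITY to "capacity".
 */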
/*
 * Get property values from the spa configuration.
 */
static int
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size = spa_get_space(spa);
	uint64_t used = spa_get_alloc(spa);
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	int err;
	char *cachefile;
	size_t len;

	/*
	 * readonly properties
	 */
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
	    0, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
	    size - used, src))
		return (err);

	cap = (size == 0) ? 0 : (used * 100 / size);
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
	    spa_guid(spa), src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
	    spa->spa_root_vdev->vdev_state, src))
		return (err);

	/*
	 * settable properties that are not stored in the pool property object.
	 */
	version = spa_version(spa);
	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
	    version, src))
		return (err);

	if (spa->spa_root != NULL) {
		src = ZPROP_SRC_LOCAL;
		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
		    spa->spa_root, 0, src))
			return (err);
	}

	if (spa->spa_config_dir != NULL) {
		if (strcmp(spa->spa_config_dir, "none") == 0) {
			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
		} else {
			len = strlen(spa->spa_config_dir) +
			    strlen(spa->spa_config_file) + 2;
			cachefile = kmem_alloc(len, KM_SLEEP);
			(void) snprintf(cachefile, len, "%s/%s",
			    spa->spa_config_dir, spa->spa_config_file);
			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    cachefile, 0, ZPROP_SRC_LOCAL);
			kmem_free(cachefile, len);
		}

		if (err)
			return (err);
	}

	return (0);
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	/*
	 * Get properties from the spa config.
	 */
	if (err = spa_prop_get_config(spa, nvp))
		goto out;

	mutex_enter(&spa->spa_props_lock);
	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}
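	/*
	 * Added note (illustrative): explicitly-set pool properties are
	 * persisted as entries in the MOS pool-props ZAP object.  Integer
	 * properties appear as single uint64 attributes
	 * (za_integer_length == 8); string properties appear as byte arrays
	 * (za_integer_length == 1) and are fetched with zap_lookup().  The
	 * loop below walks that object with the usual zap_cursor_init() /
	 * zap_cursor_retrieve() / zap_cursor_advance() idiom.
	 */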
	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			err = spa_prop_add_list(*nvp, prop, strval,
			    intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		vdev_t *rvdev;
		char *vdev_type;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}
			/*
			 * A bootable filesystem can not be on a RAIDZ pool
			 * nor a striped pool with more than 1 device.
			 */
			rvdev = spa->spa_root_vdev;
			vdev_type =
			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
			if (rvdev->vdev_children > 1 ||
			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
					break;
				objnum = dmu_objset_id(os);
				dmu_objset_close(os);
			}
			break;
		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}
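/*
 * Usage sketch (illustrative only, not part of the original code; error
 * handling omitted): a caller, such as the ioctl path, builds an nvlist of
 * pool properties and hands it to spa_prop_set(), e.g.
 *
 *	nvlist_t *props;
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1) == 0);
 *	error = spa_prop_set(spa, props);
 *	nvlist_free(props);
 *
 * spa_prop_validate() checks (and possibly rewrites) the values, and
 * spa_sync_props() then pushes them out in syncing context via
 * dsl_sync_task_do().
 */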
/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));
	list_create(&spa->spa_zio_list, sizeof (zio_t),
	    offsetof(zio_t, zio_link_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
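/*
 * Added note (descriptive commentary): spa_activate() above creates one
 * "issue" and one "intr" taskq per zio type, each sized by the
 * zio_taskq_threads global declared near the top of this file (8 threads
 * by default).  spa_deactivate() below tears those taskqs down again and
 * returns the pool to POOL_STATE_UNINITIALIZED.
 */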
/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);
	list_destroy(&spa->spa_zio_list);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}
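/*
 * Added note (illustrative): 'spa_sparelist' is an nvlist of the form
 *
 *	ZPOOL_CONFIG_SPARES -> [ <leaf vdev nvlist>, <leaf vdev nvlist>, ... ]
 *
 * where each element looks like a leaf entry of the pool config (type,
 * path, guid) and, once regenerated by spa_load_spares() below via
 * vdev_config_generate(), also carries status information.
 */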
/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;
	uint64_t autoreplace = 0;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
900 */ 901 error = vdev_open(rvd); 902 if (error != 0) 903 goto out; 904 905 /* 906 * Validate the labels for all leaf vdevs. We need to grab the config 907 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 908 * flag. 909 */ 910 spa_config_enter(spa, RW_READER, FTAG); 911 error = vdev_validate(rvd); 912 spa_config_exit(spa, FTAG); 913 914 if (error != 0) 915 goto out; 916 917 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 918 error = ENXIO; 919 goto out; 920 } 921 922 /* 923 * Find the best uberblock. 924 */ 925 bzero(ub, sizeof (uberblock_t)); 926 927 zio = zio_root(spa, NULL, NULL, 928 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 929 vdev_uberblock_load(zio, rvd, ub); 930 error = zio_wait(zio); 931 932 /* 933 * If we weren't able to find a single valid uberblock, return failure. 934 */ 935 if (ub->ub_txg == 0) { 936 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 937 VDEV_AUX_CORRUPT_DATA); 938 error = ENXIO; 939 goto out; 940 } 941 942 /* 943 * If the pool is newer than the code, we can't open it. 944 */ 945 if (ub->ub_version > SPA_VERSION) { 946 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 947 VDEV_AUX_VERSION_NEWER); 948 error = ENOTSUP; 949 goto out; 950 } 951 952 /* 953 * If the vdev guid sum doesn't match the uberblock, we have an 954 * incomplete configuration. 955 */ 956 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 957 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 958 VDEV_AUX_BAD_GUID_SUM); 959 error = ENXIO; 960 goto out; 961 } 962 963 /* 964 * Initialize internal SPA structures. 965 */ 966 spa->spa_state = POOL_STATE_ACTIVE; 967 spa->spa_ubsync = spa->spa_uberblock; 968 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 969 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 970 if (error) { 971 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 972 VDEV_AUX_CORRUPT_DATA); 973 goto out; 974 } 975 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 976 977 if (zap_lookup(spa->spa_meta_objset, 978 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 979 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 980 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 981 VDEV_AUX_CORRUPT_DATA); 982 error = EIO; 983 goto out; 984 } 985 986 if (!mosconfig) { 987 nvlist_t *newconfig; 988 uint64_t hostid; 989 990 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 991 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 992 VDEV_AUX_CORRUPT_DATA); 993 error = EIO; 994 goto out; 995 } 996 997 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 998 &hostid) == 0) { 999 char *hostname; 1000 unsigned long myhostid = 0; 1001 1002 VERIFY(nvlist_lookup_string(newconfig, 1003 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1004 1005 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1006 if (hostid != 0 && myhostid != 0 && 1007 (unsigned long)hostid != myhostid) { 1008 cmn_err(CE_WARN, "pool '%s' could not be " 1009 "loaded as it was last accessed by " 1010 "another system (host: %s hostid: 0x%lx). 
" 1011 "See: http://www.sun.com/msg/ZFS-8000-EY", 1012 spa->spa_name, hostname, 1013 (unsigned long)hostid); 1014 error = EBADF; 1015 goto out; 1016 } 1017 } 1018 1019 spa_config_set(spa, newconfig); 1020 spa_unload(spa); 1021 spa_deactivate(spa); 1022 spa_activate(spa); 1023 1024 return (spa_load(spa, newconfig, state, B_TRUE)); 1025 } 1026 1027 if (zap_lookup(spa->spa_meta_objset, 1028 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1029 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1030 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1031 VDEV_AUX_CORRUPT_DATA); 1032 error = EIO; 1033 goto out; 1034 } 1035 1036 /* 1037 * Load the bit that tells us to use the new accounting function 1038 * (raid-z deflation). If we have an older pool, this will not 1039 * be present. 1040 */ 1041 error = zap_lookup(spa->spa_meta_objset, 1042 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1043 sizeof (uint64_t), 1, &spa->spa_deflate); 1044 if (error != 0 && error != ENOENT) { 1045 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1046 VDEV_AUX_CORRUPT_DATA); 1047 error = EIO; 1048 goto out; 1049 } 1050 1051 /* 1052 * Load the persistent error log. If we have an older pool, this will 1053 * not be present. 1054 */ 1055 error = zap_lookup(spa->spa_meta_objset, 1056 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1057 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1058 if (error != 0 && error != ENOENT) { 1059 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1060 VDEV_AUX_CORRUPT_DATA); 1061 error = EIO; 1062 goto out; 1063 } 1064 1065 error = zap_lookup(spa->spa_meta_objset, 1066 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1067 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1068 if (error != 0 && error != ENOENT) { 1069 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1070 VDEV_AUX_CORRUPT_DATA); 1071 error = EIO; 1072 goto out; 1073 } 1074 1075 /* 1076 * Load the history object. If we have an older pool, this 1077 * will not be present. 1078 */ 1079 error = zap_lookup(spa->spa_meta_objset, 1080 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1081 sizeof (uint64_t), 1, &spa->spa_history); 1082 if (error != 0 && error != ENOENT) { 1083 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1084 VDEV_AUX_CORRUPT_DATA); 1085 error = EIO; 1086 goto out; 1087 } 1088 1089 /* 1090 * Load any hot spares for this pool. 
1091 */ 1092 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1093 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 1094 if (error != 0 && error != ENOENT) { 1095 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1096 VDEV_AUX_CORRUPT_DATA); 1097 error = EIO; 1098 goto out; 1099 } 1100 if (error == 0) { 1101 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1102 if (load_nvlist(spa, spa->spa_spares_object, 1103 &spa->spa_sparelist) != 0) { 1104 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1105 VDEV_AUX_CORRUPT_DATA); 1106 error = EIO; 1107 goto out; 1108 } 1109 1110 spa_config_enter(spa, RW_WRITER, FTAG); 1111 spa_load_spares(spa); 1112 spa_config_exit(spa, FTAG); 1113 } 1114 1115 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1116 1117 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1118 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1119 1120 if (error && error != ENOENT) { 1121 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1122 VDEV_AUX_CORRUPT_DATA); 1123 error = EIO; 1124 goto out; 1125 } 1126 1127 if (error == 0) { 1128 (void) zap_lookup(spa->spa_meta_objset, 1129 spa->spa_pool_props_object, 1130 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1131 sizeof (uint64_t), 1, &spa->spa_bootfs); 1132 (void) zap_lookup(spa->spa_meta_objset, 1133 spa->spa_pool_props_object, 1134 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1135 sizeof (uint64_t), 1, &autoreplace); 1136 (void) zap_lookup(spa->spa_meta_objset, 1137 spa->spa_pool_props_object, 1138 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1139 sizeof (uint64_t), 1, &spa->spa_delegation); 1140 (void) zap_lookup(spa->spa_meta_objset, 1141 spa->spa_pool_props_object, 1142 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1143 sizeof (uint64_t), 1, &spa->spa_failmode); 1144 } 1145 1146 /* 1147 * If the 'autoreplace' property is set, then post a resource notifying 1148 * the ZFS DE that it should not issue any faults for unopenable 1149 * devices. We also iterate over the vdevs, and post a sysevent for any 1150 * unopenable vdevs so that the normal autoreplace handler can take 1151 * over. 1152 */ 1153 if (autoreplace) 1154 spa_check_removed(spa->spa_root_vdev); 1155 1156 /* 1157 * Load the vdev state for all toplevel vdevs. 1158 */ 1159 vdev_load(rvd); 1160 1161 /* 1162 * Propagate the leaf DTLs we just loaded all the way up the tree. 1163 */ 1164 spa_config_enter(spa, RW_WRITER, FTAG); 1165 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1166 spa_config_exit(spa, FTAG); 1167 1168 /* 1169 * Check the state of the root vdev. If it can't be opened, it 1170 * indicates one or more toplevel vdevs are faulted. 1171 */ 1172 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1173 error = ENXIO; 1174 goto out; 1175 } 1176 1177 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 1178 dmu_tx_t *tx; 1179 int need_update = B_FALSE; 1180 int c; 1181 1182 /* 1183 * Claim log blocks that haven't been committed yet. 1184 * This must all happen in a single txg. 1185 */ 1186 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1187 spa_first_txg(spa)); 1188 (void) dmu_objset_find(spa->spa_name, 1189 zil_claim, tx, DS_FIND_CHILDREN); 1190 dmu_tx_commit(tx); 1191 1192 spa->spa_sync_on = B_TRUE; 1193 txg_sync_start(spa->spa_dsl_pool); 1194 1195 /* 1196 * Wait for all claims to sync. 1197 */ 1198 txg_wait_synced(spa->spa_dsl_pool, 0); 1199 1200 /* 1201 * If the config cache is stale, or we have uninitialized 1202 * metaslabs (see spa_vdev_add()), then update the config. 
1203 */ 1204 if (config_cache_txg != spa->spa_config_txg || 1205 state == SPA_LOAD_IMPORT) 1206 need_update = B_TRUE; 1207 1208 for (c = 0; c < rvd->vdev_children; c++) 1209 if (rvd->vdev_child[c]->vdev_ms_array == 0) 1210 need_update = B_TRUE; 1211 1212 /* 1213 * Update the config cache asychronously in case we're the 1214 * root pool, in which case the config cache isn't writable yet. 1215 */ 1216 if (need_update) 1217 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1218 } 1219 1220 error = 0; 1221 out: 1222 if (error && error != EBADF) 1223 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 1224 spa->spa_load_state = SPA_LOAD_NONE; 1225 spa->spa_ena = 0; 1226 1227 return (error); 1228 } 1229 1230 /* 1231 * Pool Open/Import 1232 * 1233 * The import case is identical to an open except that the configuration is sent 1234 * down from userland, instead of grabbed from the configuration cache. For the 1235 * case of an open, the pool configuration will exist in the 1236 * POOL_STATE_UNINITIALIZED state. 1237 * 1238 * The stats information (gen/count/ustats) is used to gather vdev statistics at 1239 * the same time open the pool, without having to keep around the spa_t in some 1240 * ambiguous state. 1241 */ 1242 static int 1243 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1244 { 1245 spa_t *spa; 1246 int error; 1247 int loaded = B_FALSE; 1248 int locked = B_FALSE; 1249 1250 *spapp = NULL; 1251 1252 /* 1253 * As disgusting as this is, we need to support recursive calls to this 1254 * function because dsl_dir_open() is called during spa_load(), and ends 1255 * up calling spa_open() again. The real fix is to figure out how to 1256 * avoid dsl_dir_open() calling this in the first place. 1257 */ 1258 if (mutex_owner(&spa_namespace_lock) != curthread) { 1259 mutex_enter(&spa_namespace_lock); 1260 locked = B_TRUE; 1261 } 1262 1263 if ((spa = spa_lookup(pool)) == NULL) { 1264 if (locked) 1265 mutex_exit(&spa_namespace_lock); 1266 return (ENOENT); 1267 } 1268 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1269 1270 spa_activate(spa); 1271 1272 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1273 1274 if (error == EBADF) { 1275 /* 1276 * If vdev_validate() returns failure (indicated by 1277 * EBADF), it indicates that one of the vdevs indicates 1278 * that the pool has been exported or destroyed. If 1279 * this is the case, the config cache is out of sync and 1280 * we should remove the pool from the namespace. 1281 */ 1282 zfs_post_ok(spa, NULL); 1283 spa_unload(spa); 1284 spa_deactivate(spa); 1285 spa_remove(spa); 1286 spa_config_sync(); 1287 if (locked) 1288 mutex_exit(&spa_namespace_lock); 1289 return (ENOENT); 1290 } 1291 1292 if (error) { 1293 /* 1294 * We can't open the pool, but we still have useful 1295 * information: the state of each vdev after the 1296 * attempted vdev_open(). Return this to the user. 
1297 */ 1298 if (config != NULL && spa->spa_root_vdev != NULL) { 1299 spa_config_enter(spa, RW_READER, FTAG); 1300 *config = spa_config_generate(spa, NULL, -1ULL, 1301 B_TRUE); 1302 spa_config_exit(spa, FTAG); 1303 } 1304 spa_unload(spa); 1305 spa_deactivate(spa); 1306 spa->spa_last_open_failed = B_TRUE; 1307 if (locked) 1308 mutex_exit(&spa_namespace_lock); 1309 *spapp = NULL; 1310 return (error); 1311 } else { 1312 zfs_post_ok(spa, NULL); 1313 spa->spa_last_open_failed = B_FALSE; 1314 } 1315 1316 loaded = B_TRUE; 1317 } 1318 1319 spa_open_ref(spa, tag); 1320 1321 /* 1322 * If we just loaded the pool, resilver anything that's out of date. 1323 */ 1324 if (loaded && (spa_mode & FWRITE)) 1325 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1326 1327 if (locked) 1328 mutex_exit(&spa_namespace_lock); 1329 1330 *spapp = spa; 1331 1332 if (config != NULL) { 1333 spa_config_enter(spa, RW_READER, FTAG); 1334 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1335 spa_config_exit(spa, FTAG); 1336 } 1337 1338 return (0); 1339 } 1340 1341 int 1342 spa_open(const char *name, spa_t **spapp, void *tag) 1343 { 1344 return (spa_open_common(name, spapp, tag, NULL)); 1345 } 1346 1347 /* 1348 * Lookup the given spa_t, incrementing the inject count in the process, 1349 * preventing it from being exported or destroyed. 1350 */ 1351 spa_t * 1352 spa_inject_addref(char *name) 1353 { 1354 spa_t *spa; 1355 1356 mutex_enter(&spa_namespace_lock); 1357 if ((spa = spa_lookup(name)) == NULL) { 1358 mutex_exit(&spa_namespace_lock); 1359 return (NULL); 1360 } 1361 spa->spa_inject_ref++; 1362 mutex_exit(&spa_namespace_lock); 1363 1364 return (spa); 1365 } 1366 1367 void 1368 spa_inject_delref(spa_t *spa) 1369 { 1370 mutex_enter(&spa_namespace_lock); 1371 spa->spa_inject_ref--; 1372 mutex_exit(&spa_namespace_lock); 1373 } 1374 1375 static void 1376 spa_add_spares(spa_t *spa, nvlist_t *config) 1377 { 1378 nvlist_t **spares; 1379 uint_t i, nspares; 1380 nvlist_t *nvroot; 1381 uint64_t guid; 1382 vdev_stat_t *vs; 1383 uint_t vsc; 1384 uint64_t pool; 1385 1386 if (spa->spa_nspares == 0) 1387 return; 1388 1389 VERIFY(nvlist_lookup_nvlist(config, 1390 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1391 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1392 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1393 if (nspares != 0) { 1394 VERIFY(nvlist_add_nvlist_array(nvroot, 1395 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1396 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1397 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1398 1399 /* 1400 * Go through and find any spares which have since been 1401 * repurposed as an active spare. If this is the case, update 1402 * their status appropriately. 
1403 */ 1404 for (i = 0; i < nspares; i++) { 1405 VERIFY(nvlist_lookup_uint64(spares[i], 1406 ZPOOL_CONFIG_GUID, &guid) == 0); 1407 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 1408 VERIFY(nvlist_lookup_uint64_array( 1409 spares[i], ZPOOL_CONFIG_STATS, 1410 (uint64_t **)&vs, &vsc) == 0); 1411 vs->vs_state = VDEV_STATE_CANT_OPEN; 1412 vs->vs_aux = VDEV_AUX_SPARED; 1413 } 1414 } 1415 } 1416 } 1417 1418 int 1419 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1420 { 1421 int error; 1422 spa_t *spa; 1423 1424 *config = NULL; 1425 error = spa_open_common(name, &spa, FTAG, config); 1426 1427 if (spa && *config != NULL) { 1428 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1429 spa_get_errlog_size(spa)) == 0); 1430 1431 spa_add_spares(spa, *config); 1432 } 1433 1434 /* 1435 * We want to get the alternate root even for faulted pools, so we cheat 1436 * and call spa_lookup() directly. 1437 */ 1438 if (altroot) { 1439 if (spa == NULL) { 1440 mutex_enter(&spa_namespace_lock); 1441 spa = spa_lookup(name); 1442 if (spa) 1443 spa_altroot(spa, altroot, buflen); 1444 else 1445 altroot[0] = '\0'; 1446 spa = NULL; 1447 mutex_exit(&spa_namespace_lock); 1448 } else { 1449 spa_altroot(spa, altroot, buflen); 1450 } 1451 } 1452 1453 if (spa != NULL) 1454 spa_close(spa, FTAG); 1455 1456 return (error); 1457 } 1458 1459 /* 1460 * Validate that the 'spares' array is well formed. We must have an array of 1461 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1462 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1463 * as they are well-formed. 1464 */ 1465 static int 1466 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1467 { 1468 nvlist_t **spares; 1469 uint_t i, nspares; 1470 vdev_t *vd; 1471 int error; 1472 1473 /* 1474 * It's acceptable to have no spares specified. 1475 */ 1476 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1477 &spares, &nspares) != 0) 1478 return (0); 1479 1480 if (nspares == 0) 1481 return (EINVAL); 1482 1483 /* 1484 * Make sure the pool is formatted with a version that supports hot 1485 * spares. 1486 */ 1487 if (spa_version(spa) < SPA_VERSION_SPARES) 1488 return (ENOTSUP); 1489 1490 /* 1491 * Set the pending spare list so we correctly handle device in-use 1492 * checking. 
1493 */ 1494 spa->spa_pending_spares = spares; 1495 spa->spa_pending_nspares = nspares; 1496 1497 for (i = 0; i < nspares; i++) { 1498 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1499 mode)) != 0) 1500 goto out; 1501 1502 if (!vd->vdev_ops->vdev_op_leaf) { 1503 vdev_free(vd); 1504 error = EINVAL; 1505 goto out; 1506 } 1507 1508 vd->vdev_top = vd; 1509 1510 if ((error = vdev_open(vd)) == 0 && 1511 (error = vdev_label_init(vd, crtxg, 1512 VDEV_LABEL_SPARE)) == 0) { 1513 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1514 vd->vdev_guid) == 0); 1515 } 1516 1517 vdev_free(vd); 1518 1519 if (error && mode != VDEV_ALLOC_SPARE) 1520 goto out; 1521 else 1522 error = 0; 1523 } 1524 1525 out: 1526 spa->spa_pending_spares = NULL; 1527 spa->spa_pending_nspares = 0; 1528 return (error); 1529 } 1530 1531 /* 1532 * Pool Creation 1533 */ 1534 int 1535 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 1536 const char *history_str) 1537 { 1538 spa_t *spa; 1539 char *altroot = NULL; 1540 vdev_t *rvd; 1541 dsl_pool_t *dp; 1542 dmu_tx_t *tx; 1543 int c, error = 0; 1544 uint64_t txg = TXG_INITIAL; 1545 nvlist_t **spares; 1546 uint_t nspares; 1547 uint64_t version; 1548 1549 /* 1550 * If this pool already exists, return failure. 1551 */ 1552 mutex_enter(&spa_namespace_lock); 1553 if (spa_lookup(pool) != NULL) { 1554 mutex_exit(&spa_namespace_lock); 1555 return (EEXIST); 1556 } 1557 1558 /* 1559 * Allocate a new spa_t structure. 1560 */ 1561 (void) nvlist_lookup_string(props, 1562 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 1563 spa = spa_add(pool, altroot); 1564 spa_activate(spa); 1565 1566 spa->spa_uberblock.ub_txg = txg - 1; 1567 1568 if (props && (error = spa_prop_validate(spa, props))) { 1569 spa_unload(spa); 1570 spa_deactivate(spa); 1571 spa_remove(spa); 1572 return (error); 1573 } 1574 1575 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 1576 &version) != 0) 1577 version = SPA_VERSION; 1578 ASSERT(version <= SPA_VERSION); 1579 spa->spa_uberblock.ub_version = version; 1580 spa->spa_ubsync = spa->spa_uberblock; 1581 1582 /* 1583 * Create the root vdev. 1584 */ 1585 spa_config_enter(spa, RW_WRITER, FTAG); 1586 1587 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1588 1589 ASSERT(error != 0 || rvd != NULL); 1590 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1591 1592 if (error == 0 && rvd->vdev_children == 0) 1593 error = EINVAL; 1594 1595 if (error == 0 && 1596 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1597 (error = spa_validate_spares(spa, nvroot, txg, 1598 VDEV_ALLOC_ADD)) == 0) { 1599 for (c = 0; c < rvd->vdev_children; c++) 1600 vdev_init(rvd->vdev_child[c], txg); 1601 vdev_config_dirty(rvd); 1602 } 1603 1604 spa_config_exit(spa, FTAG); 1605 1606 if (error != 0) { 1607 spa_unload(spa); 1608 spa_deactivate(spa); 1609 spa_remove(spa); 1610 mutex_exit(&spa_namespace_lock); 1611 return (error); 1612 } 1613 1614 /* 1615 * Get the list of spares, if specified. 
1616 */ 1617 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1618 &spares, &nspares) == 0) { 1619 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1620 KM_SLEEP) == 0); 1621 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1622 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1623 spa_config_enter(spa, RW_WRITER, FTAG); 1624 spa_load_spares(spa); 1625 spa_config_exit(spa, FTAG); 1626 spa->spa_sync_spares = B_TRUE; 1627 } 1628 1629 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1630 spa->spa_meta_objset = dp->dp_meta_objset; 1631 1632 tx = dmu_tx_create_assigned(dp, txg); 1633 1634 /* 1635 * Create the pool config object. 1636 */ 1637 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1638 DMU_OT_PACKED_NVLIST, 1 << 14, 1639 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1640 1641 if (zap_add(spa->spa_meta_objset, 1642 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1643 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1644 cmn_err(CE_PANIC, "failed to add pool config"); 1645 } 1646 1647 /* Newly created pools with the right version are always deflated. */ 1648 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 1649 spa->spa_deflate = TRUE; 1650 if (zap_add(spa->spa_meta_objset, 1651 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1652 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1653 cmn_err(CE_PANIC, "failed to add deflate"); 1654 } 1655 } 1656 1657 /* 1658 * Create the deferred-free bplist object. Turn off compression 1659 * because sync-to-convergence takes longer if the blocksize 1660 * keeps changing. 1661 */ 1662 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1663 1 << 14, tx); 1664 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1665 ZIO_COMPRESS_OFF, tx); 1666 1667 if (zap_add(spa->spa_meta_objset, 1668 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1669 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1670 cmn_err(CE_PANIC, "failed to add bplist"); 1671 } 1672 1673 /* 1674 * Create the pool's history object. 1675 */ 1676 if (version >= SPA_VERSION_ZPOOL_HISTORY) 1677 spa_history_create_obj(spa, tx); 1678 1679 /* 1680 * Set pool properties. 1681 */ 1682 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 1683 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1684 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 1685 if (props) 1686 spa_sync_props(spa, props, CRED(), tx); 1687 1688 dmu_tx_commit(tx); 1689 1690 spa->spa_sync_on = B_TRUE; 1691 txg_sync_start(spa->spa_dsl_pool); 1692 1693 /* 1694 * We explicitly wait for the first transaction to complete so that our 1695 * bean counters are appropriately updated. 1696 */ 1697 txg_wait_synced(spa->spa_dsl_pool, txg); 1698 1699 spa_config_sync(); 1700 1701 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 1702 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 1703 1704 mutex_exit(&spa_namespace_lock); 1705 1706 return (0); 1707 } 1708 1709 /* 1710 * Import the given pool into the system. We set up the necessary spa_t and 1711 * then call spa_load() to do the dirty work. 1712 */ 1713 int 1714 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 1715 { 1716 spa_t *spa; 1717 char *altroot = NULL; 1718 int error; 1719 nvlist_t *nvroot; 1720 nvlist_t **spares; 1721 uint_t nspares; 1722 1723 /* 1724 * If a pool with this name exists, return failure. 
1725 */ 1726 mutex_enter(&spa_namespace_lock); 1727 if (spa_lookup(pool) != NULL) { 1728 mutex_exit(&spa_namespace_lock); 1729 return (EEXIST); 1730 } 1731 1732 /* 1733 * Create and initialize the spa structure. 1734 */ 1735 (void) nvlist_lookup_string(props, 1736 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 1737 spa = spa_add(pool, altroot); 1738 spa_activate(spa); 1739 1740 /* 1741 * Pass off the heavy lifting to spa_load(). 1742 * Pass TRUE for mosconfig because the user-supplied config 1743 * is actually the one to trust when doing an import. 1744 */ 1745 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1746 1747 spa_config_enter(spa, RW_WRITER, FTAG); 1748 /* 1749 * Toss any existing sparelist, as it doesn't have any validity anymore, 1750 * and conflicts with spa_has_spare(). 1751 */ 1752 if (spa->spa_sparelist) { 1753 nvlist_free(spa->spa_sparelist); 1754 spa->spa_sparelist = NULL; 1755 spa_load_spares(spa); 1756 } 1757 1758 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1759 &nvroot) == 0); 1760 if (error == 0) { 1761 error = spa_validate_spares(spa, nvroot, -1ULL, 1762 VDEV_ALLOC_SPARE); 1763 } 1764 spa_config_exit(spa, FTAG); 1765 1766 if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 1767 spa_unload(spa); 1768 spa_deactivate(spa); 1769 spa_remove(spa); 1770 mutex_exit(&spa_namespace_lock); 1771 return (error); 1772 } 1773 1774 /* 1775 * Override any spares as specified by the user, as these may have 1776 * correct device names/devids, etc. 1777 */ 1778 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1779 &spares, &nspares) == 0) { 1780 if (spa->spa_sparelist) 1781 VERIFY(nvlist_remove(spa->spa_sparelist, 1782 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1783 else 1784 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1785 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1786 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1787 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1788 spa_config_enter(spa, RW_WRITER, FTAG); 1789 spa_load_spares(spa); 1790 spa_config_exit(spa, FTAG); 1791 spa->spa_sync_spares = B_TRUE; 1792 } 1793 1794 /* 1795 * Update the config cache to include the newly-imported pool. 1796 */ 1797 if (spa_mode & FWRITE) 1798 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1799 1800 /* 1801 * Resilver anything that's out of date. 1802 */ 1803 if (spa_mode & FWRITE) 1804 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1805 1806 mutex_exit(&spa_namespace_lock); 1807 1808 return (0); 1809 } 1810 1811 /* 1812 * This (illegal) pool name is used when temporarily importing a spa_t in order 1813 * to get the vdev stats associated with the imported devices. 1814 */ 1815 #define TRYIMPORT_NAME "$import" 1816 1817 nvlist_t * 1818 spa_tryimport(nvlist_t *tryconfig) 1819 { 1820 nvlist_t *config = NULL; 1821 char *poolname; 1822 spa_t *spa; 1823 uint64_t state; 1824 1825 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1826 return (NULL); 1827 1828 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1829 return (NULL); 1830 1831 /* 1832 * Create and initialize the spa structure. 1833 */ 1834 mutex_enter(&spa_namespace_lock); 1835 spa = spa_add(TRYIMPORT_NAME, NULL); 1836 spa_activate(spa); 1837 1838 /* 1839 * Pass off the heavy lifting to spa_load(). 1840 * Pass TRUE for mosconfig because the user-supplied config 1841 * is actually the one to trust when doing an import. 
1842 */ 1843 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1844 1845 /* 1846 * If 'tryconfig' was at least parsable, return the current config. 1847 */ 1848 if (spa->spa_root_vdev != NULL) { 1849 spa_config_enter(spa, RW_READER, FTAG); 1850 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1851 spa_config_exit(spa, FTAG); 1852 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1853 poolname) == 0); 1854 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1855 state) == 0); 1856 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1857 spa->spa_uberblock.ub_timestamp) == 0); 1858 1859 /* 1860 * Add the list of hot spares. 1861 */ 1862 spa_add_spares(spa, config); 1863 } 1864 1865 spa_unload(spa); 1866 spa_deactivate(spa); 1867 spa_remove(spa); 1868 mutex_exit(&spa_namespace_lock); 1869 1870 return (config); 1871 } 1872 1873 /* 1874 * Pool export/destroy 1875 * 1876 * The act of destroying or exporting a pool is very simple. We make sure there 1877 * is no more pending I/O and any references to the pool are gone. Then, we 1878 * update the pool state and sync all the labels to disk, removing the 1879 * configuration from the cache afterwards. 1880 */ 1881 static int 1882 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1883 { 1884 spa_t *spa; 1885 1886 if (oldconfig) 1887 *oldconfig = NULL; 1888 1889 if (!(spa_mode & FWRITE)) 1890 return (EROFS); 1891 1892 mutex_enter(&spa_namespace_lock); 1893 if ((spa = spa_lookup(pool)) == NULL) { 1894 mutex_exit(&spa_namespace_lock); 1895 return (ENOENT); 1896 } 1897 1898 /* 1899 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1900 * reacquire the namespace lock, and see if we can export. 1901 */ 1902 spa_open_ref(spa, FTAG); 1903 mutex_exit(&spa_namespace_lock); 1904 spa_async_suspend(spa); 1905 mutex_enter(&spa_namespace_lock); 1906 spa_close(spa, FTAG); 1907 1908 /* 1909 * The pool will be in core if it's openable, 1910 * in which case we can modify its state. 1911 */ 1912 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1913 /* 1914 * Objsets may be open only because they're dirty, so we 1915 * have to force it to sync before checking spa_refcnt. 1916 */ 1917 spa_scrub_suspend(spa); 1918 txg_wait_synced(spa->spa_dsl_pool, 0); 1919 1920 /* 1921 * A pool cannot be exported or destroyed if there are active 1922 * references. If we are resetting a pool, allow references by 1923 * fault injection handlers. 1924 */ 1925 if (!spa_refcount_zero(spa) || 1926 (spa->spa_inject_ref != 0 && 1927 new_state != POOL_STATE_UNINITIALIZED)) { 1928 spa_scrub_resume(spa); 1929 spa_async_resume(spa); 1930 mutex_exit(&spa_namespace_lock); 1931 return (EBUSY); 1932 } 1933 1934 spa_scrub_resume(spa); 1935 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1936 1937 /* 1938 * We want this to be reflected on every label, 1939 * so mark them all dirty. spa_unload() will do the 1940 * final sync that pushes these changes out. 
1941 */ 1942 if (new_state != POOL_STATE_UNINITIALIZED) { 1943 spa_config_enter(spa, RW_WRITER, FTAG); 1944 spa->spa_state = new_state; 1945 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1946 vdev_config_dirty(spa->spa_root_vdev); 1947 spa_config_exit(spa, FTAG); 1948 } 1949 } 1950 1951 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 1952 1953 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1954 spa_unload(spa); 1955 spa_deactivate(spa); 1956 } 1957 1958 if (oldconfig && spa->spa_config) 1959 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1960 1961 if (new_state != POOL_STATE_UNINITIALIZED) { 1962 spa_config_check(spa->spa_config_dir, 1963 spa->spa_config_file); 1964 spa_remove(spa); 1965 spa_config_sync(); 1966 } 1967 mutex_exit(&spa_namespace_lock); 1968 1969 return (0); 1970 } 1971 1972 /* 1973 * Destroy a storage pool. 1974 */ 1975 int 1976 spa_destroy(char *pool) 1977 { 1978 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1979 } 1980 1981 /* 1982 * Export a storage pool. 1983 */ 1984 int 1985 spa_export(char *pool, nvlist_t **oldconfig) 1986 { 1987 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1988 } 1989 1990 /* 1991 * Similar to spa_export(), this unloads the spa_t without actually removing it 1992 * from the namespace in any way. 1993 */ 1994 int 1995 spa_reset(char *pool) 1996 { 1997 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1998 } 1999 2000 2001 /* 2002 * ========================================================================== 2003 * Device manipulation 2004 * ========================================================================== 2005 */ 2006 2007 /* 2008 * Add a device to a storage pool. 2009 */ 2010 int 2011 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2012 { 2013 uint64_t txg; 2014 int c, error; 2015 vdev_t *rvd = spa->spa_root_vdev; 2016 vdev_t *vd, *tvd; 2017 nvlist_t **spares; 2018 uint_t i, nspares; 2019 2020 txg = spa_vdev_enter(spa); 2021 2022 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2023 VDEV_ALLOC_ADD)) != 0) 2024 return (spa_vdev_exit(spa, NULL, txg, error)); 2025 2026 spa->spa_pending_vdev = vd; 2027 2028 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2029 &spares, &nspares) != 0) 2030 nspares = 0; 2031 2032 if (vd->vdev_children == 0 && nspares == 0) { 2033 spa->spa_pending_vdev = NULL; 2034 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2035 } 2036 2037 if (vd->vdev_children != 0) { 2038 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 2039 spa->spa_pending_vdev = NULL; 2040 return (spa_vdev_exit(spa, vd, txg, error)); 2041 } 2042 } 2043 2044 /* 2045 * We must validate the spares after checking the children. Otherwise, 2046 * vdev_inuse() will blindly overwrite the spare. 2047 */ 2048 if ((error = spa_validate_spares(spa, nvroot, txg, 2049 VDEV_ALLOC_ADD)) != 0) { 2050 spa->spa_pending_vdev = NULL; 2051 return (spa_vdev_exit(spa, vd, txg, error)); 2052 } 2053 2054 spa->spa_pending_vdev = NULL; 2055 2056 /* 2057 * Transfer each new top-level vdev from vd to rvd. 
2058 */ 2059 for (c = 0; c < vd->vdev_children; c++) { 2060 tvd = vd->vdev_child[c]; 2061 vdev_remove_child(vd, tvd); 2062 tvd->vdev_id = rvd->vdev_children; 2063 vdev_add_child(rvd, tvd); 2064 vdev_config_dirty(tvd); 2065 } 2066 2067 if (nspares != 0) { 2068 if (spa->spa_sparelist != NULL) { 2069 nvlist_t **oldspares; 2070 uint_t oldnspares; 2071 nvlist_t **newspares; 2072 2073 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2074 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 2075 2076 newspares = kmem_alloc(sizeof (void *) * 2077 (nspares + oldnspares), KM_SLEEP); 2078 for (i = 0; i < oldnspares; i++) 2079 VERIFY(nvlist_dup(oldspares[i], 2080 &newspares[i], KM_SLEEP) == 0); 2081 for (i = 0; i < nspares; i++) 2082 VERIFY(nvlist_dup(spares[i], 2083 &newspares[i + oldnspares], 2084 KM_SLEEP) == 0); 2085 2086 VERIFY(nvlist_remove(spa->spa_sparelist, 2087 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2088 2089 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 2090 ZPOOL_CONFIG_SPARES, newspares, 2091 nspares + oldnspares) == 0); 2092 for (i = 0; i < oldnspares + nspares; i++) 2093 nvlist_free(newspares[i]); 2094 kmem_free(newspares, (oldnspares + nspares) * 2095 sizeof (void *)); 2096 } else { 2097 VERIFY(nvlist_alloc(&spa->spa_sparelist, 2098 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2099 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 2100 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2101 } 2102 2103 spa_load_spares(spa); 2104 spa->spa_sync_spares = B_TRUE; 2105 } 2106 2107 /* 2108 * We have to be careful when adding new vdevs to an existing pool. 2109 * If other threads start allocating from these vdevs before we 2110 * sync the config cache, and we lose power, then upon reboot we may 2111 * fail to open the pool because there are DVAs that the config cache 2112 * can't translate. Therefore, we first add the vdevs without 2113 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2114 * and then let spa_config_update() initialize the new metaslabs. 2115 * 2116 * spa_load() checks for added-but-not-initialized vdevs, so that 2117 * if we lose power at any point in this sequence, the remaining 2118 * steps will be completed the next time we load the pool. 2119 */ 2120 (void) spa_vdev_exit(spa, vd, txg, 0); 2121 2122 mutex_enter(&spa_namespace_lock); 2123 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2124 mutex_exit(&spa_namespace_lock); 2125 2126 return (0); 2127 } 2128 2129 /* 2130 * Attach a device to a mirror. The arguments are the path to any device 2131 * in the mirror, and the nvroot for the new device. If the path specifies 2132 * a device that is not mirrored, we automatically insert the mirror vdev. 2133 * 2134 * If 'replacing' is specified, the new device is intended to replace the 2135 * existing device; in this case the two devices are made into their own 2136 * mirror using the 'replacing' vdev, which is functionally identical to 2137 * the mirror vdev (it actually reuses all the same ops) but has a few 2138 * extra rules: you can't attach to it after it's been created, and upon 2139 * completion of resilvering, the first disk (the one being replaced) 2140 * is automatically detached. 
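 *
 * Illustrative sketch (added for clarity, not part of the original comment)
 * of the resulting vdev trees:
 *
 *	attach (replacing == 0):	replace (replacing != 0):
 *
 *	      mirror			      replacing
 *	     /      \			     /         \
 *	  oldvd    newvd		  oldvd        newvd
 *
 * In the replacing case, once newvd's DTL drains to empty,
 * spa_vdev_resilver_done() detaches oldvd automatically.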
2141 */ 2142 int 2143 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2144 { 2145 uint64_t txg, open_txg; 2146 int error; 2147 vdev_t *rvd = spa->spa_root_vdev; 2148 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2149 vdev_ops_t *pvops; 2150 int is_log; 2151 2152 txg = spa_vdev_enter(spa); 2153 2154 oldvd = vdev_lookup_by_guid(rvd, guid); 2155 2156 if (oldvd == NULL) 2157 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2158 2159 if (!oldvd->vdev_ops->vdev_op_leaf) 2160 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2161 2162 pvd = oldvd->vdev_parent; 2163 2164 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2165 VDEV_ALLOC_ADD)) != 0) 2166 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2167 2168 if (newrootvd->vdev_children != 1) 2169 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2170 2171 newvd = newrootvd->vdev_child[0]; 2172 2173 if (!newvd->vdev_ops->vdev_op_leaf) 2174 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2175 2176 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2177 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2178 2179 /* 2180 * Spares can't replace logs 2181 */ 2182 is_log = oldvd->vdev_islog; 2183 if (is_log && newvd->vdev_isspare) 2184 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2185 2186 if (!replacing) { 2187 /* 2188 * For attach, the only allowable parent is a mirror or the root 2189 * vdev. 2190 */ 2191 if (pvd->vdev_ops != &vdev_mirror_ops && 2192 pvd->vdev_ops != &vdev_root_ops) 2193 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2194 2195 pvops = &vdev_mirror_ops; 2196 } else { 2197 /* 2198 * Active hot spares can only be replaced by inactive hot 2199 * spares. 2200 */ 2201 if (pvd->vdev_ops == &vdev_spare_ops && 2202 pvd->vdev_child[1] == oldvd && 2203 !spa_has_spare(spa, newvd->vdev_guid)) 2204 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2205 2206 /* 2207 * If the source is a hot spare, and the parent isn't already a 2208 * spare, then we want to create a new hot spare. Otherwise, we 2209 * want to create a replacing vdev. The user is not allowed to 2210 * attach to a spared vdev child unless the 'isspare' state is 2211 * the same (spare replaces spare, non-spare replaces 2212 * non-spare). 2213 */ 2214 if (pvd->vdev_ops == &vdev_replacing_ops) 2215 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2216 else if (pvd->vdev_ops == &vdev_spare_ops && 2217 newvd->vdev_isspare != oldvd->vdev_isspare) 2218 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2219 else if (pvd->vdev_ops != &vdev_spare_ops && 2220 newvd->vdev_isspare) 2221 pvops = &vdev_spare_ops; 2222 else 2223 pvops = &vdev_replacing_ops; 2224 } 2225 2226 /* 2227 * Compare the new device size with the replaceable/attachable 2228 * device size. 2229 */ 2230 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2231 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2232 2233 /* 2234 * The new device cannot have a higher alignment requirement 2235 * than the top-level vdev. 2236 */ 2237 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2238 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2239 2240 /* 2241 * If this is an in-place replacement, update oldvd's path and devid 2242 * to make it distinguishable from newvd, and unopenable from now on. 
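	 *
	 * (Worked example added for clarity: replacing /dev/dsk/c1t0d0s0
	 * with itself leaves newvd at ".../c1t0d0s0" and renames oldvd's
	 * path to ".../c1t0d0s0/old", which no longer names a real device
	 * node, so oldvd cannot be reopened by mistake.)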
2243 */ 2244 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2245 spa_strfree(oldvd->vdev_path); 2246 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2247 KM_SLEEP); 2248 (void) sprintf(oldvd->vdev_path, "%s/%s", 2249 newvd->vdev_path, "old"); 2250 if (oldvd->vdev_devid != NULL) { 2251 spa_strfree(oldvd->vdev_devid); 2252 oldvd->vdev_devid = NULL; 2253 } 2254 } 2255 2256 /* 2257 * If the parent is not a mirror, or if we're replacing, insert the new 2258 * mirror/replacing/spare vdev above oldvd. 2259 */ 2260 if (pvd->vdev_ops != pvops) 2261 pvd = vdev_add_parent(oldvd, pvops); 2262 2263 ASSERT(pvd->vdev_top->vdev_parent == rvd); 2264 ASSERT(pvd->vdev_ops == pvops); 2265 ASSERT(oldvd->vdev_parent == pvd); 2266 2267 /* 2268 * Extract the new device from its root and add it to pvd. 2269 */ 2270 vdev_remove_child(newrootvd, newvd); 2271 newvd->vdev_id = pvd->vdev_children; 2272 vdev_add_child(pvd, newvd); 2273 2274 /* 2275 * If newvd is smaller than oldvd, but larger than its rsize, 2276 * the addition of newvd may have decreased our parent's asize. 2277 */ 2278 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 2279 2280 tvd = newvd->vdev_top; 2281 ASSERT(pvd->vdev_top == tvd); 2282 ASSERT(tvd->vdev_parent == rvd); 2283 2284 vdev_config_dirty(tvd); 2285 2286 /* 2287 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2288 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2289 */ 2290 open_txg = txg + TXG_CONCURRENT_STATES - 1; 2291 2292 mutex_enter(&newvd->vdev_dtl_lock); 2293 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2294 open_txg - TXG_INITIAL + 1); 2295 mutex_exit(&newvd->vdev_dtl_lock); 2296 2297 if (newvd->vdev_isspare) 2298 spa_spare_activate(newvd); 2299 2300 /* 2301 * Mark newvd's DTL dirty in this txg. 2302 */ 2303 vdev_dirty(tvd, VDD_DTL, newvd, txg); 2304 2305 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2306 2307 /* 2308 * Kick off a resilver to update newvd. We need to grab the namespace 2309 * lock because spa_scrub() needs to post a sysevent with the pool name. 2310 */ 2311 mutex_enter(&spa_namespace_lock); 2312 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2313 mutex_exit(&spa_namespace_lock); 2314 2315 return (0); 2316 } 2317 2318 /* 2319 * Detach a device from a mirror or replacing vdev. 2320 * If 'replace_done' is specified, only detach if the parent 2321 * is a replacing vdev. 2322 */ 2323 int 2324 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2325 { 2326 uint64_t txg; 2327 int c, t, error; 2328 vdev_t *rvd = spa->spa_root_vdev; 2329 vdev_t *vd, *pvd, *cvd, *tvd; 2330 boolean_t unspare = B_FALSE; 2331 uint64_t unspare_guid; 2332 2333 txg = spa_vdev_enter(spa); 2334 2335 vd = vdev_lookup_by_guid(rvd, guid); 2336 2337 if (vd == NULL) 2338 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2339 2340 if (!vd->vdev_ops->vdev_op_leaf) 2341 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2342 2343 pvd = vd->vdev_parent; 2344 2345 /* 2346 * If replace_done is specified, only remove this device if it's 2347 * the first child of a replacing vdev. For the 'spare' vdev, either 2348 * disk can be removed. 
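	 *
	 * (Note added for clarity: "first child" here means vd->vdev_id == 0,
	 * i.e. the original device being replaced; the check below returns
	 * ENOTSUP for any other child of a replacing vdev.)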
2349 */ 2350 if (replace_done) { 2351 if (pvd->vdev_ops == &vdev_replacing_ops) { 2352 if (vd->vdev_id != 0) 2353 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2354 } else if (pvd->vdev_ops != &vdev_spare_ops) { 2355 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2356 } 2357 } 2358 2359 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 2360 spa_version(spa) >= SPA_VERSION_SPARES); 2361 2362 /* 2363 * Only mirror, replacing, and spare vdevs support detach. 2364 */ 2365 if (pvd->vdev_ops != &vdev_replacing_ops && 2366 pvd->vdev_ops != &vdev_mirror_ops && 2367 pvd->vdev_ops != &vdev_spare_ops) 2368 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2369 2370 /* 2371 * If there's only one replica, you can't detach it. 2372 */ 2373 if (pvd->vdev_children <= 1) 2374 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2375 2376 /* 2377 * If all siblings have non-empty DTLs, this device may have the only 2378 * valid copy of the data, which means we cannot safely detach it. 2379 * 2380 * XXX -- as in the vdev_offline() case, we really want a more 2381 * precise DTL check. 2382 */ 2383 for (c = 0; c < pvd->vdev_children; c++) { 2384 uint64_t dirty; 2385 2386 cvd = pvd->vdev_child[c]; 2387 if (cvd == vd) 2388 continue; 2389 if (vdev_is_dead(cvd)) 2390 continue; 2391 mutex_enter(&cvd->vdev_dtl_lock); 2392 dirty = cvd->vdev_dtl_map.sm_space | 2393 cvd->vdev_dtl_scrub.sm_space; 2394 mutex_exit(&cvd->vdev_dtl_lock); 2395 if (!dirty) 2396 break; 2397 } 2398 2399 /* 2400 * If we are a replacing or spare vdev, then we can always detach the 2401 * latter child, as that is how one cancels the operation. 2402 */ 2403 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 2404 c == pvd->vdev_children) 2405 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2406 2407 /* 2408 * If we are detaching the original disk from a spare, then it implies 2409 * that the spare should become a real disk, and be removed from the 2410 * active spare list for the pool. 2411 */ 2412 if (pvd->vdev_ops == &vdev_spare_ops && 2413 vd->vdev_id == 0) 2414 unspare = B_TRUE; 2415 2416 /* 2417 * Erase the disk labels so the disk can be used for other things. 2418 * This must be done after all other error cases are handled, 2419 * but before we disembowel vd (so we can still do I/O to it). 2420 * But if we can't do it, don't treat the error as fatal -- 2421 * it may be that the unwritability of the disk is the reason 2422 * it's being detached! 2423 */ 2424 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 2425 2426 /* 2427 * Remove vd from its parent and compact the parent's children. 2428 */ 2429 vdev_remove_child(pvd, vd); 2430 vdev_compact_children(pvd); 2431 2432 /* 2433 * Remember one of the remaining children so we can get tvd below. 2434 */ 2435 cvd = pvd->vdev_child[0]; 2436 2437 /* 2438 * If we need to remove the remaining child from the list of hot spares, 2439 * do it now, marking the vdev as no longer a spare in the process. We 2440 * must do this before vdev_remove_parent(), because that can change the 2441 * GUID if it creates a new toplevel GUID. 2442 */ 2443 if (unspare) { 2444 ASSERT(cvd->vdev_isspare); 2445 spa_spare_remove(cvd); 2446 unspare_guid = cvd->vdev_guid; 2447 } 2448 2449 /* 2450 * If the parent mirror/replacing vdev only has one child, 2451 * the parent is no longer needed. Remove it from the tree. 2452 */ 2453 if (pvd->vdev_children == 1) 2454 vdev_remove_parent(cvd); 2455 2456 /* 2457 * We don't set tvd until now because the parent we just removed 2458 * may have been the previous top-level vdev. 
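	 *
	 * (Illustrative note added for clarity: detaching one side of a
	 * two-way mirror is the common case here -- vdev_remove_parent()
	 * above collapses the now single-child mirror, leaving cvd itself
	 * as the top-level vdev.)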
2459 */ 2460 tvd = cvd->vdev_top; 2461 ASSERT(tvd->vdev_parent == rvd); 2462 2463 /* 2464 * Reevaluate the parent vdev state. 2465 */ 2466 vdev_propagate_state(cvd); 2467 2468 /* 2469 * If the device we just detached was smaller than the others, it may be 2470 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2471 * can't fail because the existing metaslabs are already in core, so 2472 * there's nothing to read from disk. 2473 */ 2474 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2475 2476 vdev_config_dirty(tvd); 2477 2478 /* 2479 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2480 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2481 * But first make sure we're not on any *other* txg's DTL list, to 2482 * prevent vd from being accessed after it's freed. 2483 */ 2484 for (t = 0; t < TXG_SIZE; t++) 2485 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2486 vd->vdev_detached = B_TRUE; 2487 vdev_dirty(tvd, VDD_DTL, vd, txg); 2488 2489 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 2490 2491 error = spa_vdev_exit(spa, vd, txg, 0); 2492 2493 /* 2494 * If this was the removal of the original device in a hot spare vdev, 2495 * then we want to go through and remove the device from the hot spare 2496 * list of every other pool. 2497 */ 2498 if (unspare) { 2499 spa = NULL; 2500 mutex_enter(&spa_namespace_lock); 2501 while ((spa = spa_next(spa)) != NULL) { 2502 if (spa->spa_state != POOL_STATE_ACTIVE) 2503 continue; 2504 2505 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2506 } 2507 mutex_exit(&spa_namespace_lock); 2508 } 2509 2510 return (error); 2511 } 2512 2513 /* 2514 * Remove a device from the pool. Currently, this supports removing only hot 2515 * spares. 2516 */ 2517 int 2518 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2519 { 2520 vdev_t *vd; 2521 nvlist_t **spares, *nv, **newspares; 2522 uint_t i, j, nspares; 2523 int ret = 0; 2524 2525 spa_config_enter(spa, RW_WRITER, FTAG); 2526 2527 vd = spa_lookup_by_guid(spa, guid); 2528 2529 nv = NULL; 2530 if (spa->spa_spares != NULL && 2531 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2532 &spares, &nspares) == 0) { 2533 for (i = 0; i < nspares; i++) { 2534 uint64_t theguid; 2535 2536 VERIFY(nvlist_lookup_uint64(spares[i], 2537 ZPOOL_CONFIG_GUID, &theguid) == 0); 2538 if (theguid == guid) { 2539 nv = spares[i]; 2540 break; 2541 } 2542 } 2543 } 2544 2545 /* 2546 * We only support removing a hot spare, and only if it's not currently 2547 * in use in this pool. 
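	 *
	 * (Summary added for clarity; it restates the checks that follow:
	 *	nv == NULL, vd == NULL:  ENOENT, no such device at all;
	 *	nv == NULL, vd != NULL:  ENOTSUP, device is not a spare;
	 *	nv != NULL, vd != NULL:  EBUSY unless 'unspare' is set,
	 *				 because the spare is currently in use;
	 *	nv != NULL, vd == NULL:  the spare is inactive and is removed.)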
 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_unspare &&
		    newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			newvd->vdev_unspare = 0;
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the hot spare itself (the other child of
		 * the parent 'spare' vdev) as well.
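		 *
		 * (Illustrative tree, added for clarity:
		 *
		 *	        spare
		 *	       /     \
		 *	  replacing   sparedisk		<- pguid, detached too
		 *	   /     \
		 *	olddisk   newdisk		<- olddisk is this vd
		 *
		 * leaving newdisk alone in the slot once both detaches finish.)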
2661 */ 2662 pvd = vd->vdev_parent; 2663 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2664 pvd->vdev_id == 0) { 2665 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2666 ASSERT(pvd->vdev_parent->vdev_children == 2); 2667 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2668 } 2669 spa_config_exit(spa, FTAG); 2670 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2671 return; 2672 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2673 return; 2674 spa_config_enter(spa, RW_READER, FTAG); 2675 } 2676 2677 spa_config_exit(spa, FTAG); 2678 } 2679 2680 /* 2681 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2682 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2683 */ 2684 int 2685 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2686 { 2687 vdev_t *rvd, *vd; 2688 uint64_t txg; 2689 2690 rvd = spa->spa_root_vdev; 2691 2692 txg = spa_vdev_enter(spa); 2693 2694 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2695 /* 2696 * Determine if this is a reference to a hot spare. In that 2697 * case, update the path as stored in the spare list. 2698 */ 2699 nvlist_t **spares; 2700 uint_t i, nspares; 2701 if (spa->spa_sparelist != NULL) { 2702 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2703 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2704 for (i = 0; i < nspares; i++) { 2705 uint64_t theguid; 2706 VERIFY(nvlist_lookup_uint64(spares[i], 2707 ZPOOL_CONFIG_GUID, &theguid) == 0); 2708 if (theguid == guid) 2709 break; 2710 } 2711 2712 if (i == nspares) 2713 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2714 2715 VERIFY(nvlist_add_string(spares[i], 2716 ZPOOL_CONFIG_PATH, newpath) == 0); 2717 spa_load_spares(spa); 2718 spa->spa_sync_spares = B_TRUE; 2719 return (spa_vdev_exit(spa, NULL, txg, 0)); 2720 } else { 2721 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2722 } 2723 } 2724 2725 if (!vd->vdev_ops->vdev_op_leaf) 2726 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2727 2728 spa_strfree(vd->vdev_path); 2729 vd->vdev_path = spa_strdup(newpath); 2730 2731 vdev_config_dirty(vd->vdev_top); 2732 2733 return (spa_vdev_exit(spa, NULL, txg, 0)); 2734 } 2735 2736 /* 2737 * ========================================================================== 2738 * SPA Scrubbing 2739 * ========================================================================== 2740 */ 2741 2742 static void 2743 spa_scrub_io_done(zio_t *zio) 2744 { 2745 spa_t *spa = zio->io_spa; 2746 2747 arc_data_buf_free(zio->io_data, zio->io_size); 2748 2749 mutex_enter(&spa->spa_scrub_lock); 2750 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2751 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2752 spa->spa_scrub_errors++; 2753 mutex_enter(&vd->vdev_stat_lock); 2754 vd->vdev_stat.vs_scrub_errors++; 2755 mutex_exit(&vd->vdev_stat_lock); 2756 } 2757 2758 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2759 cv_broadcast(&spa->spa_scrub_io_cv); 2760 2761 ASSERT(spa->spa_scrub_inflight >= 0); 2762 2763 mutex_exit(&spa->spa_scrub_lock); 2764 } 2765 2766 static void 2767 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2768 zbookmark_t *zb) 2769 { 2770 size_t size = BP_GET_LSIZE(bp); 2771 void *data; 2772 2773 mutex_enter(&spa->spa_scrub_lock); 2774 /* 2775 * Do not give too much work to vdev(s). 
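	 *
	 * (Note added for clarity: spa_scrub_inflight counts scrub reads
	 * that have been issued but not yet completed; once it reaches
	 * spa_scrub_maxinflight we sleep here until spa_scrub_io_done()
	 * drops the count below the limit and broadcasts the cv.)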
2776 */ 2777 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2778 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2779 } 2780 spa->spa_scrub_inflight++; 2781 mutex_exit(&spa->spa_scrub_lock); 2782 2783 data = arc_data_buf_alloc(size); 2784 2785 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2786 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2787 2788 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2789 2790 zio_nowait(zio_read(NULL, spa, bp, data, size, 2791 spa_scrub_io_done, NULL, priority, flags, zb)); 2792 } 2793 2794 /* ARGSUSED */ 2795 static int 2796 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2797 { 2798 blkptr_t *bp = &bc->bc_blkptr; 2799 vdev_t *vd = spa->spa_root_vdev; 2800 dva_t *dva = bp->blk_dva; 2801 int needs_resilver = B_FALSE; 2802 int d; 2803 2804 if (bc->bc_errno) { 2805 /* 2806 * We can't scrub this block, but we can continue to scrub 2807 * the rest of the pool. Note the error and move along. 2808 */ 2809 mutex_enter(&spa->spa_scrub_lock); 2810 spa->spa_scrub_errors++; 2811 mutex_exit(&spa->spa_scrub_lock); 2812 2813 mutex_enter(&vd->vdev_stat_lock); 2814 vd->vdev_stat.vs_scrub_errors++; 2815 mutex_exit(&vd->vdev_stat_lock); 2816 2817 return (ERESTART); 2818 } 2819 2820 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2821 2822 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2823 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2824 2825 ASSERT(vd != NULL); 2826 2827 /* 2828 * Keep track of how much data we've examined so that 2829 * zpool(1M) status can make useful progress reports. 2830 */ 2831 mutex_enter(&vd->vdev_stat_lock); 2832 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2833 mutex_exit(&vd->vdev_stat_lock); 2834 2835 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2836 if (DVA_GET_GANG(&dva[d])) { 2837 /* 2838 * Gang members may be spread across multiple 2839 * vdevs, so the best we can do is look at the 2840 * pool-wide DTL. 2841 * XXX -- it would be better to change our 2842 * allocation policy to ensure that this can't 2843 * happen. 2844 */ 2845 vd = spa->spa_root_vdev; 2846 } 2847 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2848 bp->blk_birth, 1)) 2849 needs_resilver = B_TRUE; 2850 } 2851 } 2852 2853 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2854 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2855 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2856 else if (needs_resilver) 2857 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2858 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2859 2860 return (0); 2861 } 2862 2863 static void 2864 spa_scrub_thread(spa_t *spa) 2865 { 2866 callb_cpr_t cprinfo; 2867 traverse_handle_t *th = spa->spa_scrub_th; 2868 vdev_t *rvd = spa->spa_root_vdev; 2869 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2870 int error = 0; 2871 boolean_t complete; 2872 2873 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2874 2875 /* 2876 * If we're restarting due to a snapshot create/delete, 2877 * wait for that to complete. 2878 */ 2879 txg_wait_synced(spa_get_dsl(spa), 0); 2880 2881 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2882 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2883 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2884 2885 spa_config_enter(spa, RW_WRITER, FTAG); 2886 vdev_reopen(rvd); /* purge all vdev caches */ 2887 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2888 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2889 spa_config_exit(spa, FTAG); 2890 2891 mutex_enter(&spa->spa_scrub_lock); 2892 spa->spa_scrub_errors = 0; 2893 spa->spa_scrub_active = 1; 2894 ASSERT(spa->spa_scrub_inflight == 0); 2895 2896 while (!spa->spa_scrub_stop) { 2897 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2898 while (spa->spa_scrub_suspended) { 2899 spa->spa_scrub_active = 0; 2900 cv_broadcast(&spa->spa_scrub_cv); 2901 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2902 spa->spa_scrub_active = 1; 2903 } 2904 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2905 2906 if (spa->spa_scrub_restart_txg != 0) 2907 break; 2908 2909 mutex_exit(&spa->spa_scrub_lock); 2910 error = traverse_more(th); 2911 mutex_enter(&spa->spa_scrub_lock); 2912 if (error != EAGAIN) 2913 break; 2914 } 2915 2916 while (spa->spa_scrub_inflight) 2917 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2918 2919 spa->spa_scrub_active = 0; 2920 cv_broadcast(&spa->spa_scrub_cv); 2921 2922 mutex_exit(&spa->spa_scrub_lock); 2923 2924 spa_config_enter(spa, RW_WRITER, FTAG); 2925 2926 mutex_enter(&spa->spa_scrub_lock); 2927 2928 /* 2929 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2930 * AND the spa config lock to synchronize with any config changes 2931 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2932 */ 2933 if (spa->spa_scrub_restart_txg != 0) 2934 error = ERESTART; 2935 2936 if (spa->spa_scrub_stop) 2937 error = EINTR; 2938 2939 /* 2940 * Even if there were uncorrectable errors, we consider the scrub 2941 * completed. The downside is that if there is a transient error during 2942 * a resilver, we won't resilver the data properly to the target. But 2943 * if the damage is permanent (more likely) we will resilver forever, 2944 * which isn't really acceptable. Since there is enough information for 2945 * the user to know what has failed and why, this seems like a more 2946 * tractable approach. 2947 */ 2948 complete = (error == 0); 2949 2950 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2951 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2952 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2953 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2954 2955 mutex_exit(&spa->spa_scrub_lock); 2956 2957 /* 2958 * If the scrub/resilver completed, update all DTLs to reflect this. 2959 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2960 */ 2961 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2962 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2963 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2964 spa_errlog_rotate(spa); 2965 2966 if (scrub_type == POOL_SCRUB_RESILVER && complete) 2967 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 2968 2969 spa_config_exit(spa, FTAG); 2970 2971 mutex_enter(&spa->spa_scrub_lock); 2972 2973 /* 2974 * We may have finished replacing a device. 2975 * Let the async thread assess this and handle the detach. 2976 */ 2977 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2978 2979 /* 2980 * If we were told to restart, our final act is to start a new scrub. 2981 */ 2982 if (error == ERESTART) 2983 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2984 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2985 2986 spa->spa_scrub_type = POOL_SCRUB_NONE; 2987 spa->spa_scrub_active = 0; 2988 spa->spa_scrub_thread = NULL; 2989 cv_broadcast(&spa->spa_scrub_cv); 2990 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2991 thread_exit(); 2992 } 2993 2994 void 2995 spa_scrub_suspend(spa_t *spa) 2996 { 2997 mutex_enter(&spa->spa_scrub_lock); 2998 spa->spa_scrub_suspended++; 2999 while (spa->spa_scrub_active) { 3000 cv_broadcast(&spa->spa_scrub_cv); 3001 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3002 } 3003 while (spa->spa_scrub_inflight) 3004 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3005 mutex_exit(&spa->spa_scrub_lock); 3006 } 3007 3008 void 3009 spa_scrub_resume(spa_t *spa) 3010 { 3011 mutex_enter(&spa->spa_scrub_lock); 3012 ASSERT(spa->spa_scrub_suspended != 0); 3013 if (--spa->spa_scrub_suspended == 0) 3014 cv_broadcast(&spa->spa_scrub_cv); 3015 mutex_exit(&spa->spa_scrub_lock); 3016 } 3017 3018 void 3019 spa_scrub_restart(spa_t *spa, uint64_t txg) 3020 { 3021 /* 3022 * Something happened (e.g. snapshot create/delete) that means 3023 * we must restart any in-progress scrubs. The itinerary will 3024 * fix this properly. 3025 */ 3026 mutex_enter(&spa->spa_scrub_lock); 3027 spa->spa_scrub_restart_txg = txg; 3028 mutex_exit(&spa->spa_scrub_lock); 3029 } 3030 3031 int 3032 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 3033 { 3034 space_seg_t *ss; 3035 uint64_t mintxg, maxtxg; 3036 vdev_t *rvd = spa->spa_root_vdev; 3037 3038 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3039 ASSERT(!spa_config_held(spa, RW_WRITER)); 3040 3041 if ((uint_t)type >= POOL_SCRUB_TYPES) 3042 return (ENOTSUP); 3043 3044 mutex_enter(&spa->spa_scrub_lock); 3045 3046 /* 3047 * If there's a scrub or resilver already in progress, stop it. 3048 */ 3049 while (spa->spa_scrub_thread != NULL) { 3050 /* 3051 * Don't stop a resilver unless forced. 3052 */ 3053 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 3054 mutex_exit(&spa->spa_scrub_lock); 3055 return (EBUSY); 3056 } 3057 spa->spa_scrub_stop = 1; 3058 cv_broadcast(&spa->spa_scrub_cv); 3059 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3060 } 3061 3062 /* 3063 * Terminate the previous traverse. 3064 */ 3065 if (spa->spa_scrub_th != NULL) { 3066 traverse_fini(spa->spa_scrub_th); 3067 spa->spa_scrub_th = NULL; 3068 } 3069 3070 if (rvd == NULL) { 3071 ASSERT(spa->spa_scrub_stop == 0); 3072 ASSERT(spa->spa_scrub_type == type); 3073 ASSERT(spa->spa_scrub_restart_txg == 0); 3074 mutex_exit(&spa->spa_scrub_lock); 3075 return (0); 3076 } 3077 3078 mintxg = TXG_INITIAL - 1; 3079 maxtxg = spa_last_synced_txg(spa) + 1; 3080 3081 mutex_enter(&rvd->vdev_dtl_lock); 3082 3083 if (rvd->vdev_dtl_map.sm_space == 0) { 3084 /* 3085 * The pool-wide DTL is empty. 3086 * If this is a resilver, there's nothing to do except 3087 * check whether any in-progress replacements have completed. 3088 */ 3089 if (type == POOL_SCRUB_RESILVER) { 3090 type = POOL_SCRUB_NONE; 3091 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3092 } 3093 } else { 3094 /* 3095 * The pool-wide DTL is non-empty. 3096 * If this is a normal scrub, upgrade to a resilver instead. 3097 */ 3098 if (type == POOL_SCRUB_EVERYTHING) 3099 type = POOL_SCRUB_RESILVER; 3100 } 3101 3102 if (type == POOL_SCRUB_RESILVER) { 3103 /* 3104 * Determine the resilvering boundaries. 3105 * 3106 * Note: (mintxg, maxtxg) is an open interval, 3107 * i.e. mintxg and maxtxg themselves are not included. 
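		 *
		 * (Worked example added for clarity: if the oldest DTL
		 * segment starts at txg 100, then mintxg = 99 and the
		 * first txg actually resilvered is 100.)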
3108 * 3109 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 3110 * so we don't claim to resilver a txg that's still changing. 3111 */ 3112 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 3113 mintxg = ss->ss_start - 1; 3114 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 3115 maxtxg = MIN(ss->ss_end, maxtxg); 3116 3117 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 3118 } 3119 3120 mutex_exit(&rvd->vdev_dtl_lock); 3121 3122 spa->spa_scrub_stop = 0; 3123 spa->spa_scrub_type = type; 3124 spa->spa_scrub_restart_txg = 0; 3125 3126 if (type != POOL_SCRUB_NONE) { 3127 spa->spa_scrub_mintxg = mintxg; 3128 spa->spa_scrub_maxtxg = maxtxg; 3129 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 3130 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 3131 ZIO_FLAG_CANFAIL); 3132 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 3133 spa->spa_scrub_thread = thread_create(NULL, 0, 3134 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 3135 } 3136 3137 mutex_exit(&spa->spa_scrub_lock); 3138 3139 return (0); 3140 } 3141 3142 /* 3143 * ========================================================================== 3144 * SPA async task processing 3145 * ========================================================================== 3146 */ 3147 3148 static void 3149 spa_async_remove(spa_t *spa, vdev_t *vd) 3150 { 3151 vdev_t *tvd; 3152 int c; 3153 3154 for (c = 0; c < vd->vdev_children; c++) { 3155 tvd = vd->vdev_child[c]; 3156 if (tvd->vdev_remove_wanted) { 3157 tvd->vdev_remove_wanted = 0; 3158 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 3159 VDEV_AUX_NONE); 3160 vdev_clear(spa, tvd, B_TRUE); 3161 vdev_config_dirty(tvd->vdev_top); 3162 } 3163 spa_async_remove(spa, tvd); 3164 } 3165 } 3166 3167 static void 3168 spa_async_thread(spa_t *spa) 3169 { 3170 int tasks; 3171 uint64_t txg; 3172 3173 ASSERT(spa->spa_sync_on); 3174 3175 mutex_enter(&spa->spa_async_lock); 3176 tasks = spa->spa_async_tasks; 3177 spa->spa_async_tasks = 0; 3178 mutex_exit(&spa->spa_async_lock); 3179 3180 /* 3181 * See if the config needs to be updated. 3182 */ 3183 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3184 mutex_enter(&spa_namespace_lock); 3185 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3186 mutex_exit(&spa_namespace_lock); 3187 } 3188 3189 /* 3190 * See if any devices need to be marked REMOVED. 3191 * 3192 * XXX - We avoid doing this when we are in 3193 * I/O failure state since spa_vdev_enter() grabs 3194 * the namespace lock and would not be able to obtain 3195 * the writer config lock. 3196 */ 3197 if (tasks & SPA_ASYNC_REMOVE && 3198 spa_state(spa) != POOL_STATE_IO_FAILURE) { 3199 txg = spa_vdev_enter(spa); 3200 spa_async_remove(spa, spa->spa_root_vdev); 3201 (void) spa_vdev_exit(spa, NULL, txg, 0); 3202 } 3203 3204 /* 3205 * If any devices are done replacing, detach them. 3206 */ 3207 if (tasks & SPA_ASYNC_RESILVER_DONE) 3208 spa_vdev_resilver_done(spa); 3209 3210 /* 3211 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 3212 * scrub which can become a resilver), we need to hold 3213 * spa_namespace_lock() because the sysevent we post via 3214 * spa_event_notify() needs to get the name of the pool. 3215 */ 3216 if (tasks & SPA_ASYNC_SCRUB) { 3217 mutex_enter(&spa_namespace_lock); 3218 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 3219 mutex_exit(&spa_namespace_lock); 3220 } 3221 3222 /* 3223 * Kick off a resilver. 
3224 */ 3225 if (tasks & SPA_ASYNC_RESILVER) { 3226 mutex_enter(&spa_namespace_lock); 3227 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 3228 mutex_exit(&spa_namespace_lock); 3229 } 3230 3231 /* 3232 * Let the world know that we're done. 3233 */ 3234 mutex_enter(&spa->spa_async_lock); 3235 spa->spa_async_thread = NULL; 3236 cv_broadcast(&spa->spa_async_cv); 3237 mutex_exit(&spa->spa_async_lock); 3238 thread_exit(); 3239 } 3240 3241 void 3242 spa_async_suspend(spa_t *spa) 3243 { 3244 mutex_enter(&spa->spa_async_lock); 3245 spa->spa_async_suspended++; 3246 while (spa->spa_async_thread != NULL) 3247 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3248 mutex_exit(&spa->spa_async_lock); 3249 } 3250 3251 void 3252 spa_async_resume(spa_t *spa) 3253 { 3254 mutex_enter(&spa->spa_async_lock); 3255 ASSERT(spa->spa_async_suspended != 0); 3256 spa->spa_async_suspended--; 3257 mutex_exit(&spa->spa_async_lock); 3258 } 3259 3260 static void 3261 spa_async_dispatch(spa_t *spa) 3262 { 3263 mutex_enter(&spa->spa_async_lock); 3264 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3265 spa->spa_async_thread == NULL && 3266 rootdir != NULL && !vn_is_readonly(rootdir)) 3267 spa->spa_async_thread = thread_create(NULL, 0, 3268 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3269 mutex_exit(&spa->spa_async_lock); 3270 } 3271 3272 void 3273 spa_async_request(spa_t *spa, int task) 3274 { 3275 mutex_enter(&spa->spa_async_lock); 3276 spa->spa_async_tasks |= task; 3277 mutex_exit(&spa->spa_async_lock); 3278 } 3279 3280 /* 3281 * ========================================================================== 3282 * SPA syncing routines 3283 * ========================================================================== 3284 */ 3285 3286 static void 3287 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3288 { 3289 bplist_t *bpl = &spa->spa_sync_bplist; 3290 dmu_tx_t *tx; 3291 blkptr_t blk; 3292 uint64_t itor = 0; 3293 zio_t *zio; 3294 int error; 3295 uint8_t c = 1; 3296 3297 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3298 3299 while (bplist_iterate(bpl, &itor, &blk) == 0) 3300 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3301 3302 error = zio_wait(zio); 3303 ASSERT3U(error, ==, 0); 3304 3305 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3306 bplist_vacate(bpl, tx); 3307 3308 /* 3309 * Pre-dirty the first block so we sync to convergence faster. 3310 * (Usually only the first block is needed.) 3311 */ 3312 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3313 dmu_tx_commit(tx); 3314 } 3315 3316 static void 3317 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3318 { 3319 char *packed = NULL; 3320 size_t nvsize = 0; 3321 dmu_buf_t *db; 3322 3323 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3324 3325 packed = kmem_alloc(nvsize, KM_SLEEP); 3326 3327 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3328 KM_SLEEP) == 0); 3329 3330 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 3331 3332 kmem_free(packed, nvsize); 3333 3334 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3335 dmu_buf_will_dirty(db, tx); 3336 *(uint64_t *)db->db_data = nvsize; 3337 dmu_buf_rele(db, FTAG); 3338 } 3339 3340 static void 3341 spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 3342 { 3343 nvlist_t *nvroot; 3344 nvlist_t **spares; 3345 int i; 3346 3347 if (!spa->spa_sync_spares) 3348 return; 3349 3350 /* 3351 * Update the MOS nvlist describing the list of available spares. 
 * spa_validate_spares() will have already made sure this nvlist is
 * valid and the vdevs are labeled appropriately.
 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
	nvlist_free(nvroot);

	spa->spa_sync_spares = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval, *slash;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import). spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'cachefile' is a non-persistent property, but note
			 * an async request that the config cache needs to be
			 * updated.
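			 *
			 * (Examples added for clarity, matching the parsing
			 * below: "" clears both spa_config_dir and
			 * spa_config_file, "none" keeps the pool out of any
			 * cache file, and "/etc/zfs/zpool.cache" splits into
			 * dir "/etc/zfs" and file "zpool.cache".)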
3452 */ 3453 VERIFY(nvpair_value_string(elem, &strval) == 0); 3454 if (spa->spa_config_dir) 3455 spa_strfree(spa->spa_config_dir); 3456 if (spa->spa_config_file) 3457 spa_strfree(spa->spa_config_file); 3458 3459 if (strval[0] == '\0') { 3460 spa->spa_config_dir = NULL; 3461 spa->spa_config_file = NULL; 3462 } else if (strcmp(strval, "none") == 0) { 3463 spa->spa_config_dir = spa_strdup(strval); 3464 spa->spa_config_file = NULL; 3465 } else { 3466 slash = strrchr(strval, '/'); 3467 ASSERT(slash != NULL); 3468 *slash = '\0'; 3469 spa->spa_config_dir = spa_strdup(strval); 3470 spa->spa_config_file = spa_strdup(slash + 1); 3471 } 3472 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3473 break; 3474 default: 3475 /* 3476 * Set pool property values in the poolprops mos object. 3477 */ 3478 mutex_enter(&spa->spa_props_lock); 3479 if (spa->spa_pool_props_object == 0) { 3480 objset_t *mos = spa->spa_meta_objset; 3481 3482 VERIFY((spa->spa_pool_props_object = 3483 zap_create(mos, DMU_OT_POOL_PROPS, 3484 DMU_OT_NONE, 0, tx)) > 0); 3485 3486 VERIFY(zap_update(mos, 3487 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 3488 8, 1, &spa->spa_pool_props_object, tx) 3489 == 0); 3490 } 3491 mutex_exit(&spa->spa_props_lock); 3492 3493 /* normalize the property name */ 3494 propname = zpool_prop_to_name(prop); 3495 proptype = zpool_prop_get_type(prop); 3496 3497 if (nvpair_type(elem) == DATA_TYPE_STRING) { 3498 ASSERT(proptype == PROP_TYPE_STRING); 3499 VERIFY(nvpair_value_string(elem, &strval) == 0); 3500 VERIFY(zap_update(mos, 3501 spa->spa_pool_props_object, propname, 3502 1, strlen(strval) + 1, strval, tx) == 0); 3503 3504 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 3505 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 3506 3507 if (proptype == PROP_TYPE_INDEX) { 3508 const char *unused; 3509 VERIFY(zpool_prop_index_to_string( 3510 prop, intval, &unused) == 0); 3511 } 3512 VERIFY(zap_update(mos, 3513 spa->spa_pool_props_object, propname, 3514 8, 1, &intval, tx) == 0); 3515 } else { 3516 ASSERT(0); /* not allowed */ 3517 } 3518 3519 switch (prop) { 3520 case ZPOOL_PROP_DELEGATION: 3521 spa->spa_delegation = intval; 3522 break; 3523 case ZPOOL_PROP_BOOTFS: 3524 spa->spa_bootfs = intval; 3525 break; 3526 case ZPOOL_PROP_FAILUREMODE: 3527 spa->spa_failmode = intval; 3528 break; 3529 default: 3530 break; 3531 } 3532 } 3533 3534 /* log internal history if this is not a zpool create */ 3535 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 3536 tx->tx_txg != TXG_INITIAL) { 3537 spa_history_internal_log(LOG_POOL_PROPSET, 3538 spa, tx, cr, "%s %lld %s", 3539 nvpair_name(elem), intval, spa->spa_name); 3540 } 3541 } 3542 } 3543 3544 /* 3545 * Sync the specified transaction group. New blocks may be dirtied as 3546 * part of the process, so we iterate until it converges. 3547 */ 3548 void 3549 spa_sync(spa_t *spa, uint64_t txg) 3550 { 3551 dsl_pool_t *dp = spa->spa_dsl_pool; 3552 objset_t *mos = spa->spa_meta_objset; 3553 bplist_t *bpl = &spa->spa_sync_bplist; 3554 vdev_t *rvd = spa->spa_root_vdev; 3555 vdev_t *vd; 3556 dmu_tx_t *tx; 3557 int dirty_vdevs; 3558 3559 /* 3560 * Lock out configuration changes. 3561 */ 3562 spa_config_enter(spa, RW_READER, FTAG); 3563 3564 spa->spa_syncing_txg = txg; 3565 spa->spa_sync_pass = 0; 3566 3567 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3568 3569 tx = dmu_tx_create_assigned(dp, txg); 3570 3571 /* 3572 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 3573 * set spa_deflate if we have no raid-z vdevs. 
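	 *
	 * (Note added for clarity: the loop below treats any top-level vdev
	 * whose vdev_deflate_ratio differs from SPA_MINBLOCKSIZE as a raid-z
	 * style vdev, and only sets spa_deflate when no such vdev is found.)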
 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg. If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
3693 */ 3694 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3695 vdev_sync_done(vd, txg); 3696 3697 /* 3698 * It had better be the case that we didn't dirty anything 3699 * since vdev_config_sync(). 3700 */ 3701 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3702 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3703 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3704 ASSERT(bpl->bpl_queue == NULL); 3705 3706 spa_config_exit(spa, FTAG); 3707 3708 /* 3709 * If any async tasks have been requested, kick them off. 3710 */ 3711 spa_async_dispatch(spa); 3712 } 3713 3714 /* 3715 * Sync all pools. We don't want to hold the namespace lock across these 3716 * operations, so we take a reference on the spa_t and drop the lock during the 3717 * sync. 3718 */ 3719 void 3720 spa_sync_allpools(void) 3721 { 3722 spa_t *spa = NULL; 3723 mutex_enter(&spa_namespace_lock); 3724 while ((spa = spa_next(spa)) != NULL) { 3725 if (spa_state(spa) != POOL_STATE_ACTIVE) 3726 continue; 3727 spa_open_ref(spa, FTAG); 3728 mutex_exit(&spa_namespace_lock); 3729 txg_wait_synced(spa_get_dsl(spa), 0); 3730 mutex_enter(&spa_namespace_lock); 3731 spa_close(spa, FTAG); 3732 } 3733 mutex_exit(&spa_namespace_lock); 3734 } 3735 3736 /* 3737 * ========================================================================== 3738 * Miscellaneous routines 3739 * ========================================================================== 3740 */ 3741 3742 /* 3743 * Remove all pools in the system. 3744 */ 3745 void 3746 spa_evict_all(void) 3747 { 3748 spa_t *spa; 3749 3750 /* 3751 * Remove all cached state. All pools should be closed now, 3752 * so every spa in the AVL tree should be unreferenced. 3753 */ 3754 mutex_enter(&spa_namespace_lock); 3755 while ((spa = spa_next(NULL)) != NULL) { 3756 /* 3757 * Stop async tasks. The async thread may need to detach 3758 * a device that's been replaced, which requires grabbing 3759 * spa_namespace_lock, so we must drop it here. 3760 */ 3761 spa_open_ref(spa, FTAG); 3762 mutex_exit(&spa_namespace_lock); 3763 spa_async_suspend(spa); 3764 mutex_enter(&spa_namespace_lock); 3765 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3766 spa_close(spa, FTAG); 3767 3768 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3769 spa_unload(spa); 3770 spa_deactivate(spa); 3771 } 3772 spa_remove(spa); 3773 } 3774 mutex_exit(&spa_namespace_lock); 3775 } 3776 3777 vdev_t * 3778 spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3779 { 3780 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3781 } 3782 3783 void 3784 spa_upgrade(spa_t *spa, uint64_t version) 3785 { 3786 spa_config_enter(spa, RW_WRITER, FTAG); 3787 3788 /* 3789 * This should only be called for a non-faulted pool, and since a 3790 * future version would result in an unopenable pool, this shouldn't be 3791 * possible. 
3792 */ 3793 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 3794 ASSERT(version >= spa->spa_uberblock.ub_version); 3795 3796 spa->spa_uberblock.ub_version = version; 3797 vdev_config_dirty(spa->spa_root_vdev); 3798 3799 spa_config_exit(spa, FTAG); 3800 3801 txg_wait_synced(spa_get_dsl(spa), 0); 3802 } 3803 3804 boolean_t 3805 spa_has_spare(spa_t *spa, uint64_t guid) 3806 { 3807 int i; 3808 uint64_t spareguid; 3809 3810 for (i = 0; i < spa->spa_nspares; i++) 3811 if (spa->spa_spares[i]->vdev_guid == guid) 3812 return (B_TRUE); 3813 3814 for (i = 0; i < spa->spa_pending_nspares; i++) { 3815 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3816 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3817 spareguid == guid) 3818 return (B_TRUE); 3819 } 3820 3821 return (B_FALSE); 3822 } 3823 3824 /* 3825 * Post a sysevent corresponding to the given event. The 'name' must be one of 3826 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 3827 * filled in from the spa and (optionally) the vdev. This doesn't do anything 3828 * in the userland libzpool, as we don't want consumers to misinterpret ztest 3829 * or zdb as real changes. 3830 */ 3831 void 3832 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 3833 { 3834 #ifdef _KERNEL 3835 sysevent_t *ev; 3836 sysevent_attr_list_t *attr = NULL; 3837 sysevent_value_t value; 3838 sysevent_id_t eid; 3839 3840 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 3841 SE_SLEEP); 3842 3843 value.value_type = SE_DATA_TYPE_STRING; 3844 value.value.sv_string = spa_name(spa); 3845 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 3846 goto done; 3847 3848 value.value_type = SE_DATA_TYPE_UINT64; 3849 value.value.sv_uint64 = spa_guid(spa); 3850 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 3851 goto done; 3852 3853 if (vd) { 3854 value.value_type = SE_DATA_TYPE_UINT64; 3855 value.value.sv_uint64 = vd->vdev_guid; 3856 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 3857 SE_SLEEP) != 0) 3858 goto done; 3859 3860 if (vd->vdev_path) { 3861 value.value_type = SE_DATA_TYPE_STRING; 3862 value.value.sv_string = vd->vdev_path; 3863 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 3864 &value, SE_SLEEP) != 0) 3865 goto done; 3866 } 3867 } 3868 3869 (void) log_sysevent(ev, SE_SLEEP, &eid); 3870 3871 done: 3872 if (attr) 3873 sysevent_free_attr(attr); 3874 sysevent_free(ev); 3875 #endif 3876 } 3877