/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

int zio_taskq_threads = 8;

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
102 */ 103 static void 104 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 105 { 106 uint64_t size = spa_get_space(spa); 107 uint64_t used = spa_get_alloc(spa); 108 uint64_t cap, version; 109 zprop_source_t src = ZPROP_SRC_NONE; 110 spa_config_dirent_t *dp; 111 112 /* 113 * readonly properties 114 */ 115 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name, 0, src); 116 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 117 spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); 118 spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); 119 120 cap = (size == 0) ? 0 : (used * 100 / size); 121 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 122 123 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 124 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 125 spa->spa_root_vdev->vdev_state, src); 126 127 /* 128 * settable properties that are not stored in the pool property object. 129 */ 130 version = spa_version(spa); 131 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 132 src = ZPROP_SRC_DEFAULT; 133 else 134 src = ZPROP_SRC_LOCAL; 135 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 136 137 if (spa->spa_root != NULL) 138 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 139 0, ZPROP_SRC_LOCAL); 140 141 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 142 if (dp->scd_path == NULL) { 143 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 144 "none", 0, ZPROP_SRC_LOCAL); 145 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 146 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 147 dp->scd_path, 0, ZPROP_SRC_LOCAL); 148 } 149 } 150 } 151 152 /* 153 * Get zpool property values. 154 */ 155 int 156 spa_prop_get(spa_t *spa, nvlist_t **nvp) 157 { 158 zap_cursor_t zc; 159 zap_attribute_t za; 160 objset_t *mos = spa->spa_meta_objset; 161 int err; 162 163 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 164 165 /* 166 * Get properties from the spa config. 167 */ 168 spa_prop_get_config(spa, nvp); 169 170 mutex_enter(&spa->spa_props_lock); 171 /* If no pool property object, no more prop to get. */ 172 if (spa->spa_pool_props_object == 0) { 173 mutex_exit(&spa->spa_props_lock); 174 return (0); 175 } 176 177 /* 178 * Get properties from the MOS pool property object. 
179 */ 180 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 181 (err = zap_cursor_retrieve(&zc, &za)) == 0; 182 zap_cursor_advance(&zc)) { 183 uint64_t intval = 0; 184 char *strval = NULL; 185 zprop_source_t src = ZPROP_SRC_DEFAULT; 186 zpool_prop_t prop; 187 188 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 189 continue; 190 191 switch (za.za_integer_length) { 192 case 8: 193 /* integer property */ 194 if (za.za_first_integer != 195 zpool_prop_default_numeric(prop)) 196 src = ZPROP_SRC_LOCAL; 197 198 if (prop == ZPOOL_PROP_BOOTFS) { 199 dsl_pool_t *dp; 200 dsl_dataset_t *ds = NULL; 201 202 dp = spa_get_dsl(spa); 203 rw_enter(&dp->dp_config_rwlock, RW_READER); 204 if (err = dsl_dataset_open_obj(dp, 205 za.za_first_integer, NULL, DS_MODE_NONE, 206 FTAG, &ds)) { 207 rw_exit(&dp->dp_config_rwlock); 208 break; 209 } 210 211 strval = kmem_alloc( 212 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 213 KM_SLEEP); 214 dsl_dataset_name(ds, strval); 215 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 216 rw_exit(&dp->dp_config_rwlock); 217 } else { 218 strval = NULL; 219 intval = za.za_first_integer; 220 } 221 222 spa_prop_add_list(*nvp, prop, strval, intval, src); 223 224 if (strval != NULL) 225 kmem_free(strval, 226 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 227 228 break; 229 230 case 1: 231 /* string property */ 232 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 233 err = zap_lookup(mos, spa->spa_pool_props_object, 234 za.za_name, 1, za.za_num_integers, strval); 235 if (err) { 236 kmem_free(strval, za.za_num_integers); 237 break; 238 } 239 spa_prop_add_list(*nvp, prop, strval, 0, src); 240 kmem_free(strval, za.za_num_integers); 241 break; 242 243 default: 244 break; 245 } 246 } 247 zap_cursor_fini(&zc); 248 mutex_exit(&spa->spa_props_lock); 249 out: 250 if (err && err != ENOENT) { 251 nvlist_free(*nvp); 252 *nvp = NULL; 253 return (err); 254 } 255 256 return (0); 257 } 258 259 /* 260 * Validate the given pool properties nvlist and modify the list 261 * for the property values to be set. 262 */ 263 static int 264 spa_prop_validate(spa_t *spa, nvlist_t *props) 265 { 266 nvpair_t *elem; 267 int error = 0, reset_bootfs = 0; 268 uint64_t objnum; 269 270 elem = NULL; 271 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 272 zpool_prop_t prop; 273 char *propname, *strval; 274 uint64_t intval; 275 vdev_t *rvdev; 276 char *vdev_type; 277 objset_t *os; 278 char *slash; 279 280 propname = nvpair_name(elem); 281 282 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 283 return (EINVAL); 284 285 switch (prop) { 286 case ZPOOL_PROP_VERSION: 287 error = nvpair_value_uint64(elem, &intval); 288 if (!error && 289 (intval < spa_version(spa) || intval > SPA_VERSION)) 290 error = EINVAL; 291 break; 292 293 case ZPOOL_PROP_DELEGATION: 294 case ZPOOL_PROP_AUTOREPLACE: 295 error = nvpair_value_uint64(elem, &intval); 296 if (!error && intval > 1) 297 error = EINVAL; 298 break; 299 300 case ZPOOL_PROP_BOOTFS: 301 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 302 error = ENOTSUP; 303 break; 304 } 305 306 /* 307 * A bootable filesystem can not be on a RAIDZ pool 308 * nor a striped pool with more than 1 device. 
309 */ 310 rvdev = spa->spa_root_vdev; 311 vdev_type = 312 rvdev->vdev_child[0]->vdev_ops->vdev_op_type; 313 if (rvdev->vdev_children > 1 || 314 strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 315 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 316 error = ENOTSUP; 317 break; 318 } 319 320 reset_bootfs = 1; 321 322 error = nvpair_value_string(elem, &strval); 323 324 if (!error) { 325 if (strval == NULL || strval[0] == '\0') { 326 objnum = zpool_prop_default_numeric( 327 ZPOOL_PROP_BOOTFS); 328 break; 329 } 330 331 if (error = dmu_objset_open(strval, DMU_OST_ZFS, 332 DS_MODE_STANDARD | DS_MODE_READONLY, &os)) 333 break; 334 objnum = dmu_objset_id(os); 335 dmu_objset_close(os); 336 } 337 break; 338 case ZPOOL_PROP_FAILUREMODE: 339 error = nvpair_value_uint64(elem, &intval); 340 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 341 intval > ZIO_FAILURE_MODE_PANIC)) 342 error = EINVAL; 343 344 /* 345 * This is a special case which only occurs when 346 * the pool has completely failed. This allows 347 * the user to change the in-core failmode property 348 * without syncing it out to disk (I/Os might 349 * currently be blocked). We do this by returning 350 * EIO to the caller (spa_prop_set) to trick it 351 * into thinking we encountered a property validation 352 * error. 353 */ 354 if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) { 355 spa->spa_failmode = intval; 356 error = EIO; 357 } 358 break; 359 360 case ZPOOL_PROP_CACHEFILE: 361 if ((error = nvpair_value_string(elem, &strval)) != 0) 362 break; 363 364 if (strval[0] == '\0') 365 break; 366 367 if (strcmp(strval, "none") == 0) 368 break; 369 370 if (strval[0] != '/') { 371 error = EINVAL; 372 break; 373 } 374 375 slash = strrchr(strval, '/'); 376 ASSERT(slash != NULL); 377 378 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 379 strcmp(slash, "/..") == 0) 380 error = EINVAL; 381 break; 382 } 383 384 if (error) 385 break; 386 } 387 388 if (!error && reset_bootfs) { 389 error = nvlist_remove(props, 390 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 391 392 if (!error) { 393 error = nvlist_add_uint64(props, 394 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 395 } 396 } 397 398 return (error); 399 } 400 401 int 402 spa_prop_set(spa_t *spa, nvlist_t *nvp) 403 { 404 int error; 405 406 if ((error = spa_prop_validate(spa, nvp)) != 0) 407 return (error); 408 409 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 410 spa, nvp, 3)); 411 } 412 413 /* 414 * If the bootfs property value is dsobj, clear it. 
415 */ 416 void 417 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 418 { 419 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 420 VERIFY(zap_remove(spa->spa_meta_objset, 421 spa->spa_pool_props_object, 422 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 423 spa->spa_bootfs = 0; 424 } 425 } 426 427 /* 428 * ========================================================================== 429 * SPA state manipulation (open/create/destroy/import/export) 430 * ========================================================================== 431 */ 432 433 static int 434 spa_error_entry_compare(const void *a, const void *b) 435 { 436 spa_error_entry_t *sa = (spa_error_entry_t *)a; 437 spa_error_entry_t *sb = (spa_error_entry_t *)b; 438 int ret; 439 440 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 441 sizeof (zbookmark_t)); 442 443 if (ret < 0) 444 return (-1); 445 else if (ret > 0) 446 return (1); 447 else 448 return (0); 449 } 450 451 /* 452 * Utility function which retrieves copies of the current logs and 453 * re-initializes them in the process. 454 */ 455 void 456 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 457 { 458 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 459 460 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 461 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 462 463 avl_create(&spa->spa_errlist_scrub, 464 spa_error_entry_compare, sizeof (spa_error_entry_t), 465 offsetof(spa_error_entry_t, se_avl)); 466 avl_create(&spa->spa_errlist_last, 467 spa_error_entry_compare, sizeof (spa_error_entry_t), 468 offsetof(spa_error_entry_t, se_avl)); 469 } 470 471 /* 472 * Activate an uninitialized pool. 473 */ 474 static void 475 spa_activate(spa_t *spa) 476 { 477 int t; 478 479 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 480 481 spa->spa_state = POOL_STATE_ACTIVE; 482 483 spa->spa_normal_class = metaslab_class_create(); 484 spa->spa_log_class = metaslab_class_create(); 485 486 for (t = 0; t < ZIO_TYPES; t++) { 487 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 488 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 489 TASKQ_PREPOPULATE); 490 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 491 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 492 TASKQ_PREPOPULATE); 493 } 494 495 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 496 offsetof(vdev_t, vdev_dirty_node)); 497 list_create(&spa->spa_zio_list, sizeof (zio_t), 498 offsetof(zio_t, zio_link_node)); 499 500 txg_list_create(&spa->spa_vdev_txg_list, 501 offsetof(struct vdev, vdev_txg_node)); 502 503 avl_create(&spa->spa_errlist_scrub, 504 spa_error_entry_compare, sizeof (spa_error_entry_t), 505 offsetof(spa_error_entry_t, se_avl)); 506 avl_create(&spa->spa_errlist_last, 507 spa_error_entry_compare, sizeof (spa_error_entry_t), 508 offsetof(spa_error_entry_t, se_avl)); 509 } 510 511 /* 512 * Opposite of spa_activate(). 
513 */ 514 static void 515 spa_deactivate(spa_t *spa) 516 { 517 int t; 518 519 ASSERT(spa->spa_sync_on == B_FALSE); 520 ASSERT(spa->spa_dsl_pool == NULL); 521 ASSERT(spa->spa_root_vdev == NULL); 522 523 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 524 525 txg_list_destroy(&spa->spa_vdev_txg_list); 526 527 list_destroy(&spa->spa_dirty_list); 528 list_destroy(&spa->spa_zio_list); 529 530 for (t = 0; t < ZIO_TYPES; t++) { 531 taskq_destroy(spa->spa_zio_issue_taskq[t]); 532 taskq_destroy(spa->spa_zio_intr_taskq[t]); 533 spa->spa_zio_issue_taskq[t] = NULL; 534 spa->spa_zio_intr_taskq[t] = NULL; 535 } 536 537 metaslab_class_destroy(spa->spa_normal_class); 538 spa->spa_normal_class = NULL; 539 540 metaslab_class_destroy(spa->spa_log_class); 541 spa->spa_log_class = NULL; 542 543 /* 544 * If this was part of an import or the open otherwise failed, we may 545 * still have errors left in the queues. Empty them just in case. 546 */ 547 spa_errlog_drain(spa); 548 549 avl_destroy(&spa->spa_errlist_scrub); 550 avl_destroy(&spa->spa_errlist_last); 551 552 spa->spa_state = POOL_STATE_UNINITIALIZED; 553 } 554 555 /* 556 * Verify a pool configuration, and construct the vdev tree appropriately. This 557 * will create all the necessary vdevs in the appropriate layout, with each vdev 558 * in the CLOSED state. This will prep the pool before open/creation/import. 559 * All vdev validation is done by the vdev_alloc() routine. 560 */ 561 static int 562 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 563 uint_t id, int atype) 564 { 565 nvlist_t **child; 566 uint_t c, children; 567 int error; 568 569 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 570 return (error); 571 572 if ((*vdp)->vdev_ops->vdev_op_leaf) 573 return (0); 574 575 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 576 &child, &children) != 0) { 577 vdev_free(*vdp); 578 *vdp = NULL; 579 return (EINVAL); 580 } 581 582 for (c = 0; c < children; c++) { 583 vdev_t *vd; 584 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 585 atype)) != 0) { 586 vdev_free(*vdp); 587 *vdp = NULL; 588 return (error); 589 } 590 } 591 592 ASSERT(*vdp != NULL); 593 594 return (0); 595 } 596 597 /* 598 * Opposite of spa_load(). 599 */ 600 static void 601 spa_unload(spa_t *spa) 602 { 603 int i; 604 605 /* 606 * Stop async tasks. 607 */ 608 spa_async_suspend(spa); 609 610 /* 611 * Stop syncing. 612 */ 613 if (spa->spa_sync_on) { 614 txg_sync_stop(spa->spa_dsl_pool); 615 spa->spa_sync_on = B_FALSE; 616 } 617 618 /* 619 * Wait for any outstanding prefetch I/O to complete. 620 */ 621 spa_config_enter(spa, RW_WRITER, FTAG); 622 spa_config_exit(spa, FTAG); 623 624 /* 625 * Drop and purge level 2 cache 626 */ 627 spa_l2cache_drop(spa); 628 629 /* 630 * Close the dsl pool. 631 */ 632 if (spa->spa_dsl_pool) { 633 dsl_pool_close(spa->spa_dsl_pool); 634 spa->spa_dsl_pool = NULL; 635 } 636 637 /* 638 * Close all vdevs. 
639 */ 640 if (spa->spa_root_vdev) 641 vdev_free(spa->spa_root_vdev); 642 ASSERT(spa->spa_root_vdev == NULL); 643 644 for (i = 0; i < spa->spa_spares.sav_count; i++) 645 vdev_free(spa->spa_spares.sav_vdevs[i]); 646 if (spa->spa_spares.sav_vdevs) { 647 kmem_free(spa->spa_spares.sav_vdevs, 648 spa->spa_spares.sav_count * sizeof (void *)); 649 spa->spa_spares.sav_vdevs = NULL; 650 } 651 if (spa->spa_spares.sav_config) { 652 nvlist_free(spa->spa_spares.sav_config); 653 spa->spa_spares.sav_config = NULL; 654 } 655 656 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 657 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 658 if (spa->spa_l2cache.sav_vdevs) { 659 kmem_free(spa->spa_l2cache.sav_vdevs, 660 spa->spa_l2cache.sav_count * sizeof (void *)); 661 spa->spa_l2cache.sav_vdevs = NULL; 662 } 663 if (spa->spa_l2cache.sav_config) { 664 nvlist_free(spa->spa_l2cache.sav_config); 665 spa->spa_l2cache.sav_config = NULL; 666 } 667 668 spa->spa_async_suspended = 0; 669 } 670 671 /* 672 * Load (or re-load) the current list of vdevs describing the active spares for 673 * this pool. When this is called, we have some form of basic information in 674 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 675 * then re-generate a more complete list including status information. 676 */ 677 static void 678 spa_load_spares(spa_t *spa) 679 { 680 nvlist_t **spares; 681 uint_t nspares; 682 int i; 683 vdev_t *vd, *tvd; 684 685 /* 686 * First, close and free any existing spare vdevs. 687 */ 688 for (i = 0; i < spa->spa_spares.sav_count; i++) { 689 vd = spa->spa_spares.sav_vdevs[i]; 690 691 /* Undo the call to spa_activate() below */ 692 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 693 B_FALSE)) != NULL && tvd->vdev_isspare) 694 spa_spare_remove(tvd); 695 vdev_close(vd); 696 vdev_free(vd); 697 } 698 699 if (spa->spa_spares.sav_vdevs) 700 kmem_free(spa->spa_spares.sav_vdevs, 701 spa->spa_spares.sav_count * sizeof (void *)); 702 703 if (spa->spa_spares.sav_config == NULL) 704 nspares = 0; 705 else 706 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 707 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 708 709 spa->spa_spares.sav_count = (int)nspares; 710 spa->spa_spares.sav_vdevs = NULL; 711 712 if (nspares == 0) 713 return; 714 715 /* 716 * Construct the array of vdevs, opening them to get status in the 717 * process. For each spare, there is potentially two different vdev_t 718 * structures associated with it: one in the list of spares (used only 719 * for basic validation purposes) and one in the active vdev 720 * configuration (if it's spared in). During this phase we open and 721 * validate each vdev on the spare list. If the vdev also exists in the 722 * active configuration, then we also mark this vdev as an active spare. 723 */ 724 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 725 KM_SLEEP); 726 for (i = 0; i < spa->spa_spares.sav_count; i++) { 727 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 728 VDEV_ALLOC_SPARE) == 0); 729 ASSERT(vd != NULL); 730 731 spa->spa_spares.sav_vdevs[i] = vd; 732 733 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 734 B_FALSE)) != NULL) { 735 if (!tvd->vdev_isspare) 736 spa_spare_add(tvd); 737 738 /* 739 * We only mark the spare active if we were successfully 740 * able to load the vdev. Otherwise, importing a pool 741 * with a bad active spare would result in strange 742 * behavior, because multiple pool would think the spare 743 * is actively in use. 
744 * 745 * There is a vulnerability here to an equally bizarre 746 * circumstance, where a dead active spare is later 747 * brought back to life (onlined or otherwise). Given 748 * the rarity of this scenario, and the extra complexity 749 * it adds, we ignore the possibility. 750 */ 751 if (!vdev_is_dead(tvd)) 752 spa_spare_activate(tvd); 753 } 754 755 if (vdev_open(vd) != 0) 756 continue; 757 758 vd->vdev_top = vd; 759 if (vdev_validate_aux(vd) == 0) 760 spa_spare_add(vd); 761 } 762 763 /* 764 * Recompute the stashed list of spares, with status information 765 * this time. 766 */ 767 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 768 DATA_TYPE_NVLIST_ARRAY) == 0); 769 770 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 771 KM_SLEEP); 772 for (i = 0; i < spa->spa_spares.sav_count; i++) 773 spares[i] = vdev_config_generate(spa, 774 spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 775 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 776 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 777 for (i = 0; i < spa->spa_spares.sav_count; i++) 778 nvlist_free(spares[i]); 779 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 780 } 781 782 /* 783 * Load (or re-load) the current list of vdevs describing the active l2cache for 784 * this pool. When this is called, we have some form of basic information in 785 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 786 * then re-generate a more complete list including status information. 787 * Devices which are already active have their details maintained, and are 788 * not re-opened. 789 */ 790 static void 791 spa_load_l2cache(spa_t *spa) 792 { 793 nvlist_t **l2cache; 794 uint_t nl2cache; 795 int i, j, oldnvdevs; 796 uint64_t guid, size; 797 vdev_t *vd, **oldvdevs, **newvdevs; 798 spa_aux_vdev_t *sav = &spa->spa_l2cache; 799 800 if (sav->sav_config != NULL) { 801 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 802 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 803 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 804 } else { 805 nl2cache = 0; 806 } 807 808 oldvdevs = sav->sav_vdevs; 809 oldnvdevs = sav->sav_count; 810 sav->sav_vdevs = NULL; 811 sav->sav_count = 0; 812 813 /* 814 * Process new nvlist of vdevs. 815 */ 816 for (i = 0; i < nl2cache; i++) { 817 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 818 &guid) == 0); 819 820 newvdevs[i] = NULL; 821 for (j = 0; j < oldnvdevs; j++) { 822 vd = oldvdevs[j]; 823 if (vd != NULL && guid == vd->vdev_guid) { 824 /* 825 * Retain previous vdev for add/remove ops. 826 */ 827 newvdevs[i] = vd; 828 oldvdevs[j] = NULL; 829 break; 830 } 831 } 832 833 if (newvdevs[i] == NULL) { 834 /* 835 * Create new vdev 836 */ 837 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 838 VDEV_ALLOC_L2CACHE) == 0); 839 ASSERT(vd != NULL); 840 newvdevs[i] = vd; 841 842 /* 843 * Commit this vdev as an l2cache device, 844 * even if it fails to open. 
845 */ 846 spa_l2cache_add(vd); 847 848 vd->vdev_top = vd; 849 vd->vdev_aux = sav; 850 851 spa_l2cache_activate(vd); 852 853 if (vdev_open(vd) != 0) 854 continue; 855 856 (void) vdev_validate_aux(vd); 857 858 if (!vdev_is_dead(vd)) { 859 size = vdev_get_rsize(vd); 860 l2arc_add_vdev(spa, vd, 861 VDEV_LABEL_START_SIZE, 862 size - VDEV_LABEL_START_SIZE); 863 } 864 } 865 } 866 867 /* 868 * Purge vdevs that were dropped 869 */ 870 for (i = 0; i < oldnvdevs; i++) { 871 uint64_t pool; 872 873 vd = oldvdevs[i]; 874 if (vd != NULL) { 875 if (spa_mode & FWRITE && 876 spa_l2cache_exists(vd->vdev_guid, &pool) && 877 pool != 0ULL && 878 l2arc_vdev_present(vd)) { 879 l2arc_remove_vdev(vd); 880 } 881 (void) vdev_close(vd); 882 spa_l2cache_remove(vd); 883 } 884 } 885 886 if (oldvdevs) 887 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 888 889 if (sav->sav_config == NULL) 890 goto out; 891 892 sav->sav_vdevs = newvdevs; 893 sav->sav_count = (int)nl2cache; 894 895 /* 896 * Recompute the stashed list of l2cache devices, with status 897 * information this time. 898 */ 899 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 900 DATA_TYPE_NVLIST_ARRAY) == 0); 901 902 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 903 for (i = 0; i < sav->sav_count; i++) 904 l2cache[i] = vdev_config_generate(spa, 905 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 906 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 907 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 908 out: 909 for (i = 0; i < sav->sav_count; i++) 910 nvlist_free(l2cache[i]); 911 if (sav->sav_count) 912 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 913 } 914 915 static int 916 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 917 { 918 dmu_buf_t *db; 919 char *packed = NULL; 920 size_t nvsize = 0; 921 int error; 922 *value = NULL; 923 924 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 925 nvsize = *(uint64_t *)db->db_data; 926 dmu_buf_rele(db, FTAG); 927 928 packed = kmem_alloc(nvsize, KM_SLEEP); 929 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 930 if (error == 0) 931 error = nvlist_unpack(packed, nvsize, value, 0); 932 kmem_free(packed, nvsize); 933 934 return (error); 935 } 936 937 /* 938 * Checks to see if the given vdev could not be opened, in which case we post a 939 * sysevent to notify the autoreplace code that the device has been removed. 940 */ 941 static void 942 spa_check_removed(vdev_t *vd) 943 { 944 int c; 945 946 for (c = 0; c < vd->vdev_children; c++) 947 spa_check_removed(vd->vdev_child[c]); 948 949 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 950 zfs_post_autoreplace(vd->vdev_spa, vd); 951 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 952 } 953 } 954 955 /* 956 * Load an existing storage pool, using the pool's builtin spa_config as a 957 * source of configuration information. 
958 */ 959 static int 960 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 961 { 962 int error = 0; 963 nvlist_t *nvroot = NULL; 964 vdev_t *rvd; 965 uberblock_t *ub = &spa->spa_uberblock; 966 uint64_t config_cache_txg = spa->spa_config_txg; 967 uint64_t pool_guid; 968 uint64_t version; 969 zio_t *zio; 970 uint64_t autoreplace = 0; 971 972 spa->spa_load_state = state; 973 974 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 975 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 976 error = EINVAL; 977 goto out; 978 } 979 980 /* 981 * Versioning wasn't explicitly added to the label until later, so if 982 * it's not present treat it as the initial version. 983 */ 984 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 985 version = SPA_VERSION_INITIAL; 986 987 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 988 &spa->spa_config_txg); 989 990 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 991 spa_guid_exists(pool_guid, 0)) { 992 error = EEXIST; 993 goto out; 994 } 995 996 spa->spa_load_guid = pool_guid; 997 998 /* 999 * Parse the configuration into a vdev tree. We explicitly set the 1000 * value that will be returned by spa_version() since parsing the 1001 * configuration requires knowing the version number. 1002 */ 1003 spa_config_enter(spa, RW_WRITER, FTAG); 1004 spa->spa_ubsync.ub_version = version; 1005 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1006 spa_config_exit(spa, FTAG); 1007 1008 if (error != 0) 1009 goto out; 1010 1011 ASSERT(spa->spa_root_vdev == rvd); 1012 ASSERT(spa_guid(spa) == pool_guid); 1013 1014 /* 1015 * Try to open all vdevs, loading each label in the process. 1016 */ 1017 error = vdev_open(rvd); 1018 if (error != 0) 1019 goto out; 1020 1021 /* 1022 * Validate the labels for all leaf vdevs. We need to grab the config 1023 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 1024 * flag. 1025 */ 1026 spa_config_enter(spa, RW_READER, FTAG); 1027 error = vdev_validate(rvd); 1028 spa_config_exit(spa, FTAG); 1029 1030 if (error != 0) 1031 goto out; 1032 1033 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1034 error = ENXIO; 1035 goto out; 1036 } 1037 1038 /* 1039 * Find the best uberblock. 1040 */ 1041 bzero(ub, sizeof (uberblock_t)); 1042 1043 zio = zio_root(spa, NULL, NULL, 1044 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1045 vdev_uberblock_load(zio, rvd, ub); 1046 error = zio_wait(zio); 1047 1048 /* 1049 * If we weren't able to find a single valid uberblock, return failure. 1050 */ 1051 if (ub->ub_txg == 0) { 1052 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1053 VDEV_AUX_CORRUPT_DATA); 1054 error = ENXIO; 1055 goto out; 1056 } 1057 1058 /* 1059 * If the pool is newer than the code, we can't open it. 1060 */ 1061 if (ub->ub_version > SPA_VERSION) { 1062 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1063 VDEV_AUX_VERSION_NEWER); 1064 error = ENOTSUP; 1065 goto out; 1066 } 1067 1068 /* 1069 * If the vdev guid sum doesn't match the uberblock, we have an 1070 * incomplete configuration. 1071 */ 1072 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1073 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1074 VDEV_AUX_BAD_GUID_SUM); 1075 error = ENXIO; 1076 goto out; 1077 } 1078 1079 /* 1080 * Initialize internal SPA structures. 
1081 */ 1082 spa->spa_state = POOL_STATE_ACTIVE; 1083 spa->spa_ubsync = spa->spa_uberblock; 1084 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1085 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1086 if (error) { 1087 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1088 VDEV_AUX_CORRUPT_DATA); 1089 goto out; 1090 } 1091 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1092 1093 if (zap_lookup(spa->spa_meta_objset, 1094 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1095 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1096 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1097 VDEV_AUX_CORRUPT_DATA); 1098 error = EIO; 1099 goto out; 1100 } 1101 1102 if (!mosconfig) { 1103 nvlist_t *newconfig; 1104 uint64_t hostid; 1105 1106 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 1107 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1108 VDEV_AUX_CORRUPT_DATA); 1109 error = EIO; 1110 goto out; 1111 } 1112 1113 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 1114 &hostid) == 0) { 1115 char *hostname; 1116 unsigned long myhostid = 0; 1117 1118 VERIFY(nvlist_lookup_string(newconfig, 1119 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1120 1121 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1122 if (hostid != 0 && myhostid != 0 && 1123 (unsigned long)hostid != myhostid) { 1124 cmn_err(CE_WARN, "pool '%s' could not be " 1125 "loaded as it was last accessed by " 1126 "another system (host: %s hostid: 0x%lx). " 1127 "See: http://www.sun.com/msg/ZFS-8000-EY", 1128 spa->spa_name, hostname, 1129 (unsigned long)hostid); 1130 error = EBADF; 1131 goto out; 1132 } 1133 } 1134 1135 spa_config_set(spa, newconfig); 1136 spa_unload(spa); 1137 spa_deactivate(spa); 1138 spa_activate(spa); 1139 1140 return (spa_load(spa, newconfig, state, B_TRUE)); 1141 } 1142 1143 if (zap_lookup(spa->spa_meta_objset, 1144 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1145 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1146 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1147 VDEV_AUX_CORRUPT_DATA); 1148 error = EIO; 1149 goto out; 1150 } 1151 1152 /* 1153 * Load the bit that tells us to use the new accounting function 1154 * (raid-z deflation). If we have an older pool, this will not 1155 * be present. 1156 */ 1157 error = zap_lookup(spa->spa_meta_objset, 1158 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1159 sizeof (uint64_t), 1, &spa->spa_deflate); 1160 if (error != 0 && error != ENOENT) { 1161 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1162 VDEV_AUX_CORRUPT_DATA); 1163 error = EIO; 1164 goto out; 1165 } 1166 1167 /* 1168 * Load the persistent error log. If we have an older pool, this will 1169 * not be present. 1170 */ 1171 error = zap_lookup(spa->spa_meta_objset, 1172 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1173 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1174 if (error != 0 && error != ENOENT) { 1175 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1176 VDEV_AUX_CORRUPT_DATA); 1177 error = EIO; 1178 goto out; 1179 } 1180 1181 error = zap_lookup(spa->spa_meta_objset, 1182 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1183 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1184 if (error != 0 && error != ENOENT) { 1185 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1186 VDEV_AUX_CORRUPT_DATA); 1187 error = EIO; 1188 goto out; 1189 } 1190 1191 /* 1192 * Load the history object. If we have an older pool, this 1193 * will not be present. 
1194 */ 1195 error = zap_lookup(spa->spa_meta_objset, 1196 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1197 sizeof (uint64_t), 1, &spa->spa_history); 1198 if (error != 0 && error != ENOENT) { 1199 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1200 VDEV_AUX_CORRUPT_DATA); 1201 error = EIO; 1202 goto out; 1203 } 1204 1205 /* 1206 * Load any hot spares for this pool. 1207 */ 1208 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1209 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1210 if (error != 0 && error != ENOENT) { 1211 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1212 VDEV_AUX_CORRUPT_DATA); 1213 error = EIO; 1214 goto out; 1215 } 1216 if (error == 0) { 1217 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1218 if (load_nvlist(spa, spa->spa_spares.sav_object, 1219 &spa->spa_spares.sav_config) != 0) { 1220 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1221 VDEV_AUX_CORRUPT_DATA); 1222 error = EIO; 1223 goto out; 1224 } 1225 1226 spa_config_enter(spa, RW_WRITER, FTAG); 1227 spa_load_spares(spa); 1228 spa_config_exit(spa, FTAG); 1229 } 1230 1231 /* 1232 * Load any level 2 ARC devices for this pool. 1233 */ 1234 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1235 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1236 &spa->spa_l2cache.sav_object); 1237 if (error != 0 && error != ENOENT) { 1238 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1239 VDEV_AUX_CORRUPT_DATA); 1240 error = EIO; 1241 goto out; 1242 } 1243 if (error == 0) { 1244 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1245 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1246 &spa->spa_l2cache.sav_config) != 0) { 1247 vdev_set_state(rvd, B_TRUE, 1248 VDEV_STATE_CANT_OPEN, 1249 VDEV_AUX_CORRUPT_DATA); 1250 error = EIO; 1251 goto out; 1252 } 1253 1254 spa_config_enter(spa, RW_WRITER, FTAG); 1255 spa_load_l2cache(spa); 1256 spa_config_exit(spa, FTAG); 1257 } 1258 1259 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1260 1261 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1262 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1263 1264 if (error && error != ENOENT) { 1265 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1266 VDEV_AUX_CORRUPT_DATA); 1267 error = EIO; 1268 goto out; 1269 } 1270 1271 if (error == 0) { 1272 (void) zap_lookup(spa->spa_meta_objset, 1273 spa->spa_pool_props_object, 1274 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1275 sizeof (uint64_t), 1, &spa->spa_bootfs); 1276 (void) zap_lookup(spa->spa_meta_objset, 1277 spa->spa_pool_props_object, 1278 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1279 sizeof (uint64_t), 1, &autoreplace); 1280 (void) zap_lookup(spa->spa_meta_objset, 1281 spa->spa_pool_props_object, 1282 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1283 sizeof (uint64_t), 1, &spa->spa_delegation); 1284 (void) zap_lookup(spa->spa_meta_objset, 1285 spa->spa_pool_props_object, 1286 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1287 sizeof (uint64_t), 1, &spa->spa_failmode); 1288 } 1289 1290 /* 1291 * If the 'autoreplace' property is set, then post a resource notifying 1292 * the ZFS DE that it should not issue any faults for unopenable 1293 * devices. We also iterate over the vdevs, and post a sysevent for any 1294 * unopenable vdevs so that the normal autoreplace handler can take 1295 * over. 1296 */ 1297 if (autoreplace && state != SPA_LOAD_TRYIMPORT) 1298 spa_check_removed(spa->spa_root_vdev); 1299 1300 /* 1301 * Load the vdev state for all toplevel vdevs. 
1302 */ 1303 vdev_load(rvd); 1304 1305 /* 1306 * Propagate the leaf DTLs we just loaded all the way up the tree. 1307 */ 1308 spa_config_enter(spa, RW_WRITER, FTAG); 1309 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1310 spa_config_exit(spa, FTAG); 1311 1312 /* 1313 * Check the state of the root vdev. If it can't be opened, it 1314 * indicates one or more toplevel vdevs are faulted. 1315 */ 1316 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1317 error = ENXIO; 1318 goto out; 1319 } 1320 1321 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 1322 dmu_tx_t *tx; 1323 int need_update = B_FALSE; 1324 int c; 1325 1326 /* 1327 * Claim log blocks that haven't been committed yet. 1328 * This must all happen in a single txg. 1329 */ 1330 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1331 spa_first_txg(spa)); 1332 (void) dmu_objset_find(spa->spa_name, 1333 zil_claim, tx, DS_FIND_CHILDREN); 1334 dmu_tx_commit(tx); 1335 1336 spa->spa_sync_on = B_TRUE; 1337 txg_sync_start(spa->spa_dsl_pool); 1338 1339 /* 1340 * Wait for all claims to sync. 1341 */ 1342 txg_wait_synced(spa->spa_dsl_pool, 0); 1343 1344 /* 1345 * If the config cache is stale, or we have uninitialized 1346 * metaslabs (see spa_vdev_add()), then update the config. 1347 */ 1348 if (config_cache_txg != spa->spa_config_txg || 1349 state == SPA_LOAD_IMPORT) 1350 need_update = B_TRUE; 1351 1352 for (c = 0; c < rvd->vdev_children; c++) 1353 if (rvd->vdev_child[c]->vdev_ms_array == 0) 1354 need_update = B_TRUE; 1355 1356 /* 1357 * Update the config cache asychronously in case we're the 1358 * root pool, in which case the config cache isn't writable yet. 1359 */ 1360 if (need_update) 1361 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1362 } 1363 1364 error = 0; 1365 out: 1366 if (error && error != EBADF) 1367 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 1368 spa->spa_load_state = SPA_LOAD_NONE; 1369 spa->spa_ena = 0; 1370 1371 return (error); 1372 } 1373 1374 /* 1375 * Pool Open/Import 1376 * 1377 * The import case is identical to an open except that the configuration is sent 1378 * down from userland, instead of grabbed from the configuration cache. For the 1379 * case of an open, the pool configuration will exist in the 1380 * POOL_STATE_UNINITIALIZED state. 1381 * 1382 * The stats information (gen/count/ustats) is used to gather vdev statistics at 1383 * the same time open the pool, without having to keep around the spa_t in some 1384 * ambiguous state. 1385 */ 1386 static int 1387 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1388 { 1389 spa_t *spa; 1390 int error; 1391 int loaded = B_FALSE; 1392 int locked = B_FALSE; 1393 1394 *spapp = NULL; 1395 1396 /* 1397 * As disgusting as this is, we need to support recursive calls to this 1398 * function because dsl_dir_open() is called during spa_load(), and ends 1399 * up calling spa_open() again. The real fix is to figure out how to 1400 * avoid dsl_dir_open() calling this in the first place. 
1401 */ 1402 if (mutex_owner(&spa_namespace_lock) != curthread) { 1403 mutex_enter(&spa_namespace_lock); 1404 locked = B_TRUE; 1405 } 1406 1407 if ((spa = spa_lookup(pool)) == NULL) { 1408 if (locked) 1409 mutex_exit(&spa_namespace_lock); 1410 return (ENOENT); 1411 } 1412 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1413 1414 spa_activate(spa); 1415 1416 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1417 1418 if (error == EBADF) { 1419 /* 1420 * If vdev_validate() returns failure (indicated by 1421 * EBADF), it indicates that one of the vdevs indicates 1422 * that the pool has been exported or destroyed. If 1423 * this is the case, the config cache is out of sync and 1424 * we should remove the pool from the namespace. 1425 */ 1426 spa_unload(spa); 1427 spa_deactivate(spa); 1428 spa_config_sync(spa, B_TRUE, B_TRUE); 1429 spa_remove(spa); 1430 if (locked) 1431 mutex_exit(&spa_namespace_lock); 1432 return (ENOENT); 1433 } 1434 1435 if (error) { 1436 /* 1437 * We can't open the pool, but we still have useful 1438 * information: the state of each vdev after the 1439 * attempted vdev_open(). Return this to the user. 1440 */ 1441 if (config != NULL && spa->spa_root_vdev != NULL) { 1442 spa_config_enter(spa, RW_READER, FTAG); 1443 *config = spa_config_generate(spa, NULL, -1ULL, 1444 B_TRUE); 1445 spa_config_exit(spa, FTAG); 1446 } 1447 spa_unload(spa); 1448 spa_deactivate(spa); 1449 spa->spa_last_open_failed = B_TRUE; 1450 if (locked) 1451 mutex_exit(&spa_namespace_lock); 1452 *spapp = NULL; 1453 return (error); 1454 } else { 1455 spa->spa_last_open_failed = B_FALSE; 1456 } 1457 1458 loaded = B_TRUE; 1459 } 1460 1461 spa_open_ref(spa, tag); 1462 1463 /* 1464 * If we just loaded the pool, resilver anything that's out of date. 1465 */ 1466 if (loaded && (spa_mode & FWRITE)) 1467 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1468 1469 if (locked) 1470 mutex_exit(&spa_namespace_lock); 1471 1472 *spapp = spa; 1473 1474 if (config != NULL) { 1475 spa_config_enter(spa, RW_READER, FTAG); 1476 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1477 spa_config_exit(spa, FTAG); 1478 } 1479 1480 return (0); 1481 } 1482 1483 int 1484 spa_open(const char *name, spa_t **spapp, void *tag) 1485 { 1486 return (spa_open_common(name, spapp, tag, NULL)); 1487 } 1488 1489 /* 1490 * Lookup the given spa_t, incrementing the inject count in the process, 1491 * preventing it from being exported or destroyed. 1492 */ 1493 spa_t * 1494 spa_inject_addref(char *name) 1495 { 1496 spa_t *spa; 1497 1498 mutex_enter(&spa_namespace_lock); 1499 if ((spa = spa_lookup(name)) == NULL) { 1500 mutex_exit(&spa_namespace_lock); 1501 return (NULL); 1502 } 1503 spa->spa_inject_ref++; 1504 mutex_exit(&spa_namespace_lock); 1505 1506 return (spa); 1507 } 1508 1509 void 1510 spa_inject_delref(spa_t *spa) 1511 { 1512 mutex_enter(&spa_namespace_lock); 1513 spa->spa_inject_ref--; 1514 mutex_exit(&spa_namespace_lock); 1515 } 1516 1517 /* 1518 * Add spares device information to the nvlist. 
1519 */ 1520 static void 1521 spa_add_spares(spa_t *spa, nvlist_t *config) 1522 { 1523 nvlist_t **spares; 1524 uint_t i, nspares; 1525 nvlist_t *nvroot; 1526 uint64_t guid; 1527 vdev_stat_t *vs; 1528 uint_t vsc; 1529 uint64_t pool; 1530 1531 if (spa->spa_spares.sav_count == 0) 1532 return; 1533 1534 VERIFY(nvlist_lookup_nvlist(config, 1535 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1536 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1537 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1538 if (nspares != 0) { 1539 VERIFY(nvlist_add_nvlist_array(nvroot, 1540 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1541 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1542 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1543 1544 /* 1545 * Go through and find any spares which have since been 1546 * repurposed as an active spare. If this is the case, update 1547 * their status appropriately. 1548 */ 1549 for (i = 0; i < nspares; i++) { 1550 VERIFY(nvlist_lookup_uint64(spares[i], 1551 ZPOOL_CONFIG_GUID, &guid) == 0); 1552 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 1553 VERIFY(nvlist_lookup_uint64_array( 1554 spares[i], ZPOOL_CONFIG_STATS, 1555 (uint64_t **)&vs, &vsc) == 0); 1556 vs->vs_state = VDEV_STATE_CANT_OPEN; 1557 vs->vs_aux = VDEV_AUX_SPARED; 1558 } 1559 } 1560 } 1561 } 1562 1563 /* 1564 * Add l2cache device information to the nvlist, including vdev stats. 1565 */ 1566 static void 1567 spa_add_l2cache(spa_t *spa, nvlist_t *config) 1568 { 1569 nvlist_t **l2cache; 1570 uint_t i, j, nl2cache; 1571 nvlist_t *nvroot; 1572 uint64_t guid; 1573 vdev_t *vd; 1574 vdev_stat_t *vs; 1575 uint_t vsc; 1576 1577 if (spa->spa_l2cache.sav_count == 0) 1578 return; 1579 1580 spa_config_enter(spa, RW_READER, FTAG); 1581 1582 VERIFY(nvlist_lookup_nvlist(config, 1583 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1584 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1585 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1586 if (nl2cache != 0) { 1587 VERIFY(nvlist_add_nvlist_array(nvroot, 1588 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1589 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1590 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1591 1592 /* 1593 * Update level 2 cache device stats. 1594 */ 1595 1596 for (i = 0; i < nl2cache; i++) { 1597 VERIFY(nvlist_lookup_uint64(l2cache[i], 1598 ZPOOL_CONFIG_GUID, &guid) == 0); 1599 1600 vd = NULL; 1601 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1602 if (guid == 1603 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1604 vd = spa->spa_l2cache.sav_vdevs[j]; 1605 break; 1606 } 1607 } 1608 ASSERT(vd != NULL); 1609 1610 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1611 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1612 vdev_get_stats(vd, vs); 1613 } 1614 } 1615 1616 spa_config_exit(spa, FTAG); 1617 } 1618 1619 int 1620 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1621 { 1622 int error; 1623 spa_t *spa; 1624 1625 *config = NULL; 1626 error = spa_open_common(name, &spa, FTAG, config); 1627 1628 if (spa && *config != NULL) { 1629 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1630 spa_get_errlog_size(spa)) == 0); 1631 1632 spa_add_spares(spa, *config); 1633 spa_add_l2cache(spa, *config); 1634 } 1635 1636 /* 1637 * We want to get the alternate root even for faulted pools, so we cheat 1638 * and call spa_lookup() directly. 
1639 */ 1640 if (altroot) { 1641 if (spa == NULL) { 1642 mutex_enter(&spa_namespace_lock); 1643 spa = spa_lookup(name); 1644 if (spa) 1645 spa_altroot(spa, altroot, buflen); 1646 else 1647 altroot[0] = '\0'; 1648 spa = NULL; 1649 mutex_exit(&spa_namespace_lock); 1650 } else { 1651 spa_altroot(spa, altroot, buflen); 1652 } 1653 } 1654 1655 if (spa != NULL) 1656 spa_close(spa, FTAG); 1657 1658 return (error); 1659 } 1660 1661 /* 1662 * Validate that the auxiliary device array is well formed. We must have an 1663 * array of nvlists, each which describes a valid leaf vdev. If this is an 1664 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 1665 * specified, as long as they are well-formed. 1666 */ 1667 static int 1668 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 1669 spa_aux_vdev_t *sav, const char *config, uint64_t version, 1670 vdev_labeltype_t label) 1671 { 1672 nvlist_t **dev; 1673 uint_t i, ndev; 1674 vdev_t *vd; 1675 int error; 1676 1677 /* 1678 * It's acceptable to have no devs specified. 1679 */ 1680 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 1681 return (0); 1682 1683 if (ndev == 0) 1684 return (EINVAL); 1685 1686 /* 1687 * Make sure the pool is formatted with a version that supports this 1688 * device type. 1689 */ 1690 if (spa_version(spa) < version) 1691 return (ENOTSUP); 1692 1693 /* 1694 * Set the pending device list so we correctly handle device in-use 1695 * checking. 1696 */ 1697 sav->sav_pending = dev; 1698 sav->sav_npending = ndev; 1699 1700 for (i = 0; i < ndev; i++) { 1701 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 1702 mode)) != 0) 1703 goto out; 1704 1705 if (!vd->vdev_ops->vdev_op_leaf) { 1706 vdev_free(vd); 1707 error = EINVAL; 1708 goto out; 1709 } 1710 1711 /* 1712 * The L2ARC currently only supports disk devices. 1713 */ 1714 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 1715 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 1716 error = ENOTBLK; 1717 goto out; 1718 } 1719 1720 vd->vdev_top = vd; 1721 1722 if ((error = vdev_open(vd)) == 0 && 1723 (error = vdev_label_init(vd, crtxg, label)) == 0) { 1724 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 1725 vd->vdev_guid) == 0); 1726 } 1727 1728 vdev_free(vd); 1729 1730 if (error && 1731 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 1732 goto out; 1733 else 1734 error = 0; 1735 } 1736 1737 out: 1738 sav->sav_pending = NULL; 1739 sav->sav_npending = 0; 1740 return (error); 1741 } 1742 1743 static int 1744 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1745 { 1746 int error; 1747 1748 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 1749 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 1750 VDEV_LABEL_SPARE)) != 0) { 1751 return (error); 1752 } 1753 1754 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 1755 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 1756 VDEV_LABEL_L2CACHE)); 1757 } 1758 1759 static void 1760 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 1761 const char *config) 1762 { 1763 int i; 1764 1765 if (sav->sav_config != NULL) { 1766 nvlist_t **olddevs; 1767 uint_t oldndevs; 1768 nvlist_t **newdevs; 1769 1770 /* 1771 * Generate new dev list by concatentating with the 1772 * current dev list. 
1773 */ 1774 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 1775 &olddevs, &oldndevs) == 0); 1776 1777 newdevs = kmem_alloc(sizeof (void *) * 1778 (ndevs + oldndevs), KM_SLEEP); 1779 for (i = 0; i < oldndevs; i++) 1780 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 1781 KM_SLEEP) == 0); 1782 for (i = 0; i < ndevs; i++) 1783 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 1784 KM_SLEEP) == 0); 1785 1786 VERIFY(nvlist_remove(sav->sav_config, config, 1787 DATA_TYPE_NVLIST_ARRAY) == 0); 1788 1789 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1790 config, newdevs, ndevs + oldndevs) == 0); 1791 for (i = 0; i < oldndevs + ndevs; i++) 1792 nvlist_free(newdevs[i]); 1793 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 1794 } else { 1795 /* 1796 * Generate a new dev list. 1797 */ 1798 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 1799 KM_SLEEP) == 0); 1800 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 1801 devs, ndevs) == 0); 1802 } 1803 } 1804 1805 /* 1806 * Stop and drop level 2 ARC devices 1807 */ 1808 void 1809 spa_l2cache_drop(spa_t *spa) 1810 { 1811 vdev_t *vd; 1812 int i; 1813 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1814 1815 for (i = 0; i < sav->sav_count; i++) { 1816 uint64_t pool; 1817 1818 vd = sav->sav_vdevs[i]; 1819 ASSERT(vd != NULL); 1820 1821 if (spa_mode & FWRITE && 1822 spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && 1823 l2arc_vdev_present(vd)) { 1824 l2arc_remove_vdev(vd); 1825 } 1826 if (vd->vdev_isl2cache) 1827 spa_l2cache_remove(vd); 1828 vdev_clear_stats(vd); 1829 (void) vdev_close(vd); 1830 } 1831 } 1832 1833 /* 1834 * Pool Creation 1835 */ 1836 int 1837 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 1838 const char *history_str) 1839 { 1840 spa_t *spa; 1841 char *altroot = NULL; 1842 vdev_t *rvd; 1843 dsl_pool_t *dp; 1844 dmu_tx_t *tx; 1845 int c, error = 0; 1846 uint64_t txg = TXG_INITIAL; 1847 nvlist_t **spares, **l2cache; 1848 uint_t nspares, nl2cache; 1849 uint64_t version; 1850 1851 /* 1852 * If this pool already exists, return failure. 1853 */ 1854 mutex_enter(&spa_namespace_lock); 1855 if (spa_lookup(pool) != NULL) { 1856 mutex_exit(&spa_namespace_lock); 1857 return (EEXIST); 1858 } 1859 1860 /* 1861 * Allocate a new spa_t structure. 1862 */ 1863 (void) nvlist_lookup_string(props, 1864 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 1865 spa = spa_add(pool, altroot); 1866 spa_activate(spa); 1867 1868 spa->spa_uberblock.ub_txg = txg - 1; 1869 1870 if (props && (error = spa_prop_validate(spa, props))) { 1871 spa_unload(spa); 1872 spa_deactivate(spa); 1873 spa_remove(spa); 1874 mutex_exit(&spa_namespace_lock); 1875 return (error); 1876 } 1877 1878 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 1879 &version) != 0) 1880 version = SPA_VERSION; 1881 ASSERT(version <= SPA_VERSION); 1882 spa->spa_uberblock.ub_version = version; 1883 spa->spa_ubsync = spa->spa_uberblock; 1884 1885 /* 1886 * Create the root vdev. 
1887 */ 1888 spa_config_enter(spa, RW_WRITER, FTAG); 1889 1890 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1891 1892 ASSERT(error != 0 || rvd != NULL); 1893 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1894 1895 if (error == 0 && !zfs_allocatable_devs(nvroot)) 1896 error = EINVAL; 1897 1898 if (error == 0 && 1899 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1900 (error = spa_validate_aux(spa, nvroot, txg, 1901 VDEV_ALLOC_ADD)) == 0) { 1902 for (c = 0; c < rvd->vdev_children; c++) 1903 vdev_init(rvd->vdev_child[c], txg); 1904 vdev_config_dirty(rvd); 1905 } 1906 1907 spa_config_exit(spa, FTAG); 1908 1909 if (error != 0) { 1910 spa_unload(spa); 1911 spa_deactivate(spa); 1912 spa_remove(spa); 1913 mutex_exit(&spa_namespace_lock); 1914 return (error); 1915 } 1916 1917 /* 1918 * Get the list of spares, if specified. 1919 */ 1920 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1921 &spares, &nspares) == 0) { 1922 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 1923 KM_SLEEP) == 0); 1924 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1925 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1926 spa_config_enter(spa, RW_WRITER, FTAG); 1927 spa_load_spares(spa); 1928 spa_config_exit(spa, FTAG); 1929 spa->spa_spares.sav_sync = B_TRUE; 1930 } 1931 1932 /* 1933 * Get the list of level 2 cache devices, if specified. 1934 */ 1935 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1936 &l2cache, &nl2cache) == 0) { 1937 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 1938 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1939 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 1940 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1941 spa_config_enter(spa, RW_WRITER, FTAG); 1942 spa_load_l2cache(spa); 1943 spa_config_exit(spa, FTAG); 1944 spa->spa_l2cache.sav_sync = B_TRUE; 1945 } 1946 1947 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1948 spa->spa_meta_objset = dp->dp_meta_objset; 1949 1950 tx = dmu_tx_create_assigned(dp, txg); 1951 1952 /* 1953 * Create the pool config object. 1954 */ 1955 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1956 DMU_OT_PACKED_NVLIST, 1 << 14, 1957 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1958 1959 if (zap_add(spa->spa_meta_objset, 1960 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1961 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1962 cmn_err(CE_PANIC, "failed to add pool config"); 1963 } 1964 1965 /* Newly created pools with the right version are always deflated. */ 1966 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 1967 spa->spa_deflate = TRUE; 1968 if (zap_add(spa->spa_meta_objset, 1969 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1970 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1971 cmn_err(CE_PANIC, "failed to add deflate"); 1972 } 1973 } 1974 1975 /* 1976 * Create the deferred-free bplist object. Turn off compression 1977 * because sync-to-convergence takes longer if the blocksize 1978 * keeps changing. 1979 */ 1980 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1981 1 << 14, tx); 1982 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1983 ZIO_COMPRESS_OFF, tx); 1984 1985 if (zap_add(spa->spa_meta_objset, 1986 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1987 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1988 cmn_err(CE_PANIC, "failed to add bplist"); 1989 } 1990 1991 /* 1992 * Create the pool's history object. 
1993 */ 1994 if (version >= SPA_VERSION_ZPOOL_HISTORY) 1995 spa_history_create_obj(spa, tx); 1996 1997 /* 1998 * Set pool properties. 1999 */ 2000 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2001 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2002 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2003 if (props) 2004 spa_sync_props(spa, props, CRED(), tx); 2005 2006 dmu_tx_commit(tx); 2007 2008 spa->spa_sync_on = B_TRUE; 2009 txg_sync_start(spa->spa_dsl_pool); 2010 2011 /* 2012 * We explicitly wait for the first transaction to complete so that our 2013 * bean counters are appropriately updated. 2014 */ 2015 txg_wait_synced(spa->spa_dsl_pool, txg); 2016 2017 spa_config_sync(spa, B_FALSE, B_TRUE); 2018 2019 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2020 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2021 2022 mutex_exit(&spa_namespace_lock); 2023 2024 return (0); 2025 } 2026 2027 /* 2028 * Import the given pool into the system. We set up the necessary spa_t and 2029 * then call spa_load() to do the dirty work. 2030 */ 2031 static int 2032 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, 2033 boolean_t isroot, boolean_t allowfaulted) 2034 { 2035 spa_t *spa; 2036 char *altroot = NULL; 2037 int error, loaderr; 2038 nvlist_t *nvroot; 2039 nvlist_t **spares, **l2cache; 2040 uint_t nspares, nl2cache; 2041 int mosconfig = isroot? B_FALSE : B_TRUE; 2042 2043 /* 2044 * If a pool with this name exists, return failure. 2045 */ 2046 mutex_enter(&spa_namespace_lock); 2047 if (spa_lookup(pool) != NULL) { 2048 mutex_exit(&spa_namespace_lock); 2049 return (EEXIST); 2050 } 2051 2052 /* 2053 * Create and initialize the spa structure. 2054 */ 2055 (void) nvlist_lookup_string(props, 2056 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2057 spa = spa_add(pool, altroot); 2058 spa_activate(spa); 2059 2060 if (allowfaulted) 2061 spa->spa_import_faulted = B_TRUE; 2062 2063 /* 2064 * Pass off the heavy lifting to spa_load(). 2065 * Pass TRUE for mosconfig because the user-supplied config 2066 * is actually the one to trust when doing an import. 2067 */ 2068 loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, mosconfig); 2069 2070 spa_config_enter(spa, RW_WRITER, FTAG); 2071 /* 2072 * Toss any existing sparelist, as it doesn't have any validity anymore, 2073 * and conflicts with spa_has_spare(). 2074 */ 2075 if (!isroot && spa->spa_spares.sav_config) { 2076 nvlist_free(spa->spa_spares.sav_config); 2077 spa->spa_spares.sav_config = NULL; 2078 spa_load_spares(spa); 2079 } 2080 if (!isroot && spa->spa_l2cache.sav_config) { 2081 nvlist_free(spa->spa_l2cache.sav_config); 2082 spa->spa_l2cache.sav_config = NULL; 2083 spa_load_l2cache(spa); 2084 } 2085 2086 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2087 &nvroot) == 0); 2088 if (error == 0) 2089 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); 2090 if (error == 0) 2091 error = spa_validate_aux(spa, nvroot, -1ULL, 2092 VDEV_ALLOC_L2CACHE); 2093 spa_config_exit(spa, FTAG); 2094 2095 if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 2096 if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { 2097 /* 2098 * If we failed to load the pool, but 'allowfaulted' is 2099 * set, then manually set the config as if the config 2100 * passed in was specified in the cache file. 
2101 */ 2102 error = 0; 2103 spa->spa_import_faulted = B_FALSE; 2104 if (spa->spa_config == NULL) { 2105 spa_config_enter(spa, RW_READER, FTAG); 2106 spa->spa_config = spa_config_generate(spa, 2107 NULL, -1ULL, B_TRUE); 2108 spa_config_exit(spa, FTAG); 2109 } 2110 spa_unload(spa); 2111 spa_deactivate(spa); 2112 spa_config_sync(spa, B_FALSE, B_TRUE); 2113 } else { 2114 spa_unload(spa); 2115 spa_deactivate(spa); 2116 spa_remove(spa); 2117 } 2118 mutex_exit(&spa_namespace_lock); 2119 return (error); 2120 } 2121 2122 /* 2123 * Override any spares and level 2 cache devices as specified by 2124 * the user, as these may have correct device names/devids, etc. 2125 */ 2126 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2127 &spares, &nspares) == 0) { 2128 if (spa->spa_spares.sav_config) 2129 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2130 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2131 else 2132 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2133 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2134 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2135 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2136 spa_config_enter(spa, RW_WRITER, FTAG); 2137 spa_load_spares(spa); 2138 spa_config_exit(spa, FTAG); 2139 spa->spa_spares.sav_sync = B_TRUE; 2140 } 2141 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2142 &l2cache, &nl2cache) == 0) { 2143 if (spa->spa_l2cache.sav_config) 2144 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2145 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2146 else 2147 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2148 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2149 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2150 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2151 spa_config_enter(spa, RW_WRITER, FTAG); 2152 spa_load_l2cache(spa); 2153 spa_config_exit(spa, FTAG); 2154 spa->spa_l2cache.sav_sync = B_TRUE; 2155 } 2156 2157 if (spa_mode & FWRITE) { 2158 /* 2159 * Update the config cache to include the newly-imported pool. 2160 */ 2161 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); 2162 2163 /* 2164 * Resilver anything that's out of date. 2165 */ 2166 if (!isroot) 2167 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, 2168 B_TRUE) == 0); 2169 } 2170 2171 spa->spa_import_faulted = B_FALSE; 2172 mutex_exit(&spa_namespace_lock); 2173 2174 return (0); 2175 } 2176 2177 #ifdef _KERNEL 2178 /* 2179 * Build a "root" vdev for a top level vdev read in from a rootpool 2180 * device label. 2181 */ 2182 static void 2183 spa_build_rootpool_config(nvlist_t *config) 2184 { 2185 nvlist_t *nvtop, *nvroot; 2186 uint64_t pgid; 2187 2188 /* 2189 * Add this top-level vdev to the child array. 2190 */ 2191 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2192 == 0); 2193 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2194 == 0); 2195 2196 /* 2197 * Put this pool's top-level vdevs into a root vdev. 2198 */ 2199 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2200 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2201 == 0); 2202 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2203 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2204 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2205 &nvtop, 1) == 0); 2206 2207 /* 2208 * Replace the existing vdev_tree with the new root vdev in 2209 * this pool's configuration (remove the old, add the new). 
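 * Roughly, a label config of the form
 *	vdev_tree: { <top-level vdev> }
 * becomes
 *	vdev_tree: { type=root, id=0, guid=<pool guid>,
 *	    children=[ { <top-level vdev> } ] }
 * which matches the shape of a full pool config consumed by the import
 * path below.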
2210 */ 2211 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2212 nvlist_free(nvroot); 2213 } 2214 2215 /* 2216 * Get the root pool information from the root disk, then import the root pool 2217 * at system boot time. 2218 */ 2219 extern nvlist_t *vdev_disk_read_rootlabel(char *); 2220 2221 void 2222 spa_check_rootconf(char *devpath, char **bestdev, nvlist_t **bestconf, 2223 uint64_t *besttxg) 2224 { 2225 nvlist_t *config; 2226 uint64_t txg; 2227 2228 if ((config = vdev_disk_read_rootlabel(devpath)) == NULL) 2229 return; 2230 2231 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2232 2233 if (txg > *besttxg) { 2234 *besttxg = txg; 2235 if (*bestconf != NULL) 2236 nvlist_free(*bestconf); 2237 *bestconf = config; 2238 *bestdev = devpath; 2239 } 2240 } 2241 2242 boolean_t 2243 spa_rootdev_validate(nvlist_t *nv) 2244 { 2245 uint64_t ival; 2246 2247 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2248 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2249 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &ival) == 0 || 2250 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2251 return (B_FALSE); 2252 2253 return (B_TRUE); 2254 } 2255 2256 /* 2257 * Import a root pool. 2258 * 2259 * For x86, devpath_list will consist of the physpath name of the vdev in a single 2260 * disk root pool or a list of physnames for the vdevs in a mirrored rootpool. 2261 * e.g. 2262 * "/pci@1f,0/ide@d/disk@0,0:a /pci@1f,0/ide@d/disk@2,0:a" 2263 * 2264 * For SPARC, devpath_list consists of the physpath name of the booting device, 2265 * whether the rootpool is a single device pool or a mirrored pool. 2266 * e.g. 2267 * "/pci@1f,0/ide@d/disk@0,0:a" 2268 */ 2269 int 2270 spa_import_rootpool(char *devpath_list) 2271 { 2272 nvlist_t *conf = NULL; 2273 char *dev = NULL; 2274 char *pname; 2275 int error; 2276 2277 /* 2278 * Get the vdev pathname and configuration from the most 2279 * recently updated vdev (highest txg). 2280 */ 2281 if (error = spa_get_rootconf(devpath_list, &dev, &conf)) 2282 goto msg_out; 2283 2284 /* 2285 * Add type "root" vdev to the config. 2286 */ 2287 spa_build_rootpool_config(conf); 2288 2289 VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); 2290 2291 error = spa_import_common(pname, conf, NULL, B_TRUE, B_FALSE); 2292 if (error == EEXIST) 2293 error = 0; 2294 2295 nvlist_free(conf); 2296 return (error); 2297 2298 msg_out: 2299 cmn_err(CE_NOTE, "\n\n" 2300 " *************************************************** \n" 2301 " * This device is not bootable! * \n" 2302 " * It is either offlined or detached or faulted. * \n" 2303 " * Please try to boot from a different device. * \n" 2304 " *************************************************** \n\n"); 2305 2306 return (error); 2307 } 2308 #endif 2309 2310 /* 2311 * Import a non-root pool into the system. 2312 */ 2313 int 2314 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2315 { 2316 return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); 2317 } 2318 2319 int 2320 spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) 2321 { 2322 return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); 2323 } 2324 2325 2326 /* 2327 * This (illegal) pool name is used when temporarily importing a spa_t in order 2328 * to get the vdev stats associated with the imported devices.
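 * A minimal caller sketch (the tryconfig here is assumed to have been
 * assembled from on-disk labels by the import discovery code):
 *
 *	if ((config = spa_tryimport(tryconfig)) != NULL) {
 *		... examine pool name/state/vdev stats in 'config' ...
 *		nvlist_free(config);
 *	}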
2329 */ 2330 #define TRYIMPORT_NAME "$import" 2331 2332 nvlist_t * 2333 spa_tryimport(nvlist_t *tryconfig) 2334 { 2335 nvlist_t *config = NULL; 2336 char *poolname; 2337 spa_t *spa; 2338 uint64_t state; 2339 2340 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2341 return (NULL); 2342 2343 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2344 return (NULL); 2345 2346 /* 2347 * Create and initialize the spa structure. 2348 */ 2349 mutex_enter(&spa_namespace_lock); 2350 spa = spa_add(TRYIMPORT_NAME, NULL); 2351 spa_activate(spa); 2352 2353 /* 2354 * Pass off the heavy lifting to spa_load(). 2355 * Pass TRUE for mosconfig because the user-supplied config 2356 * is actually the one to trust when doing an import. 2357 */ 2358 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2359 2360 /* 2361 * If 'tryconfig' was at least parsable, return the current config. 2362 */ 2363 if (spa->spa_root_vdev != NULL) { 2364 spa_config_enter(spa, RW_READER, FTAG); 2365 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2366 spa_config_exit(spa, FTAG); 2367 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2368 poolname) == 0); 2369 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2370 state) == 0); 2371 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2372 spa->spa_uberblock.ub_timestamp) == 0); 2373 2374 /* 2375 * If the bootfs property exists on this pool then we 2376 * copy it out so that external consumers can tell which 2377 * pools are bootable. 2378 */ 2379 if (spa->spa_bootfs) { 2380 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2381 2382 /* 2383 * We have to play games with the name since the 2384 * pool was opened as TRYIMPORT_NAME. 2385 */ 2386 if (dsl_dsobj_to_dsname(spa->spa_name, 2387 spa->spa_bootfs, tmpname) == 0) { 2388 char *cp; 2389 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2390 2391 cp = strchr(tmpname, '/'); 2392 if (cp == NULL) { 2393 (void) strlcpy(dsname, tmpname, 2394 MAXPATHLEN); 2395 } else { 2396 (void) snprintf(dsname, MAXPATHLEN, 2397 "%s/%s", poolname, ++cp); 2398 } 2399 VERIFY(nvlist_add_string(config, 2400 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2401 kmem_free(dsname, MAXPATHLEN); 2402 } 2403 kmem_free(tmpname, MAXPATHLEN); 2404 } 2405 2406 /* 2407 * Add the list of hot spares and level 2 cache devices. 2408 */ 2409 spa_add_spares(spa, config); 2410 spa_add_l2cache(spa, config); 2411 } 2412 2413 spa_unload(spa); 2414 spa_deactivate(spa); 2415 spa_remove(spa); 2416 mutex_exit(&spa_namespace_lock); 2417 2418 return (config); 2419 } 2420 2421 /* 2422 * Pool export/destroy 2423 * 2424 * The act of destroying or exporting a pool is very simple. We make sure there 2425 * is no more pending I/O and any references to the pool are gone. Then, we 2426 * update the pool state and sync all the labels to disk, removing the 2427 * configuration from the cache afterwards. 2428 */ 2429 static int 2430 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 2431 { 2432 spa_t *spa; 2433 2434 if (oldconfig) 2435 *oldconfig = NULL; 2436 2437 if (!(spa_mode & FWRITE)) 2438 return (EROFS); 2439 2440 mutex_enter(&spa_namespace_lock); 2441 if ((spa = spa_lookup(pool)) == NULL) { 2442 mutex_exit(&spa_namespace_lock); 2443 return (ENOENT); 2444 } 2445 2446 /* 2447 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2448 * reacquire the namespace lock, and see if we can export. 
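 * The hold (spa_open_ref()) keeps the spa_t from being removed while the
 * namespace lock is dropped around spa_async_suspend(); it is released again
 * with spa_close() once the lock has been reacquired.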
2449 */ 2450 spa_open_ref(spa, FTAG); 2451 mutex_exit(&spa_namespace_lock); 2452 spa_async_suspend(spa); 2453 mutex_enter(&spa_namespace_lock); 2454 spa_close(spa, FTAG); 2455 2456 /* 2457 * The pool will be in core if it's openable, 2458 * in which case we can modify its state. 2459 */ 2460 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2461 /* 2462 * Objsets may be open only because they're dirty, so we 2463 * have to force it to sync before checking spa_refcnt. 2464 */ 2465 spa_scrub_suspend(spa); 2466 txg_wait_synced(spa->spa_dsl_pool, 0); 2467 2468 /* 2469 * A pool cannot be exported or destroyed if there are active 2470 * references. If we are resetting a pool, allow references by 2471 * fault injection handlers. 2472 */ 2473 if (!spa_refcount_zero(spa) || 2474 (spa->spa_inject_ref != 0 && 2475 new_state != POOL_STATE_UNINITIALIZED)) { 2476 spa_scrub_resume(spa); 2477 spa_async_resume(spa); 2478 mutex_exit(&spa_namespace_lock); 2479 return (EBUSY); 2480 } 2481 2482 spa_scrub_resume(spa); 2483 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2484 2485 /* 2486 * We want this to be reflected on every label, 2487 * so mark them all dirty. spa_unload() will do the 2488 * final sync that pushes these changes out. 2489 */ 2490 if (new_state != POOL_STATE_UNINITIALIZED) { 2491 spa_config_enter(spa, RW_WRITER, FTAG); 2492 spa->spa_state = new_state; 2493 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2494 vdev_config_dirty(spa->spa_root_vdev); 2495 spa_config_exit(spa, FTAG); 2496 } 2497 } 2498 2499 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2500 2501 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2502 spa_unload(spa); 2503 spa_deactivate(spa); 2504 } 2505 2506 if (oldconfig && spa->spa_config) 2507 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2508 2509 if (new_state != POOL_STATE_UNINITIALIZED) { 2510 spa_config_sync(spa, B_TRUE, B_TRUE); 2511 spa_remove(spa); 2512 } 2513 mutex_exit(&spa_namespace_lock); 2514 2515 return (0); 2516 } 2517 2518 /* 2519 * Destroy a storage pool. 2520 */ 2521 int 2522 spa_destroy(char *pool) 2523 { 2524 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 2525 } 2526 2527 /* 2528 * Export a storage pool. 2529 */ 2530 int 2531 spa_export(char *pool, nvlist_t **oldconfig) 2532 { 2533 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 2534 } 2535 2536 /* 2537 * Similar to spa_export(), this unloads the spa_t without actually removing it 2538 * from the namespace in any way. 2539 */ 2540 int 2541 spa_reset(char *pool) 2542 { 2543 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 2544 } 2545 2546 2547 /* 2548 * ========================================================================== 2549 * Device manipulation 2550 * ========================================================================== 2551 */ 2552 2553 /* 2554 * Add a device to a storage pool. 
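 * The nvroot has the same shape used at pool creation time: a root vdev whose
 * ZPOOL_CONFIG_CHILDREN are the new top-level vdevs, optionally accompanied
 * by ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE arrays.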
2555 */ 2556 int 2557 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2558 { 2559 uint64_t txg; 2560 int c, error; 2561 vdev_t *rvd = spa->spa_root_vdev; 2562 vdev_t *vd, *tvd; 2563 nvlist_t **spares, **l2cache; 2564 uint_t nspares, nl2cache; 2565 2566 txg = spa_vdev_enter(spa); 2567 2568 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2569 VDEV_ALLOC_ADD)) != 0) 2570 return (spa_vdev_exit(spa, NULL, txg, error)); 2571 2572 spa->spa_pending_vdev = vd; 2573 2574 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2575 &nspares) != 0) 2576 nspares = 0; 2577 2578 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2579 &nl2cache) != 0) 2580 nl2cache = 0; 2581 2582 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) { 2583 spa->spa_pending_vdev = NULL; 2584 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2585 } 2586 2587 if (vd->vdev_children != 0) { 2588 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 2589 spa->spa_pending_vdev = NULL; 2590 return (spa_vdev_exit(spa, vd, txg, error)); 2591 } 2592 } 2593 2594 /* 2595 * We must validate the spares and l2cache devices after checking the 2596 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2597 */ 2598 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) { 2599 spa->spa_pending_vdev = NULL; 2600 return (spa_vdev_exit(spa, vd, txg, error)); 2601 } 2602 2603 spa->spa_pending_vdev = NULL; 2604 2605 /* 2606 * Transfer each new top-level vdev from vd to rvd. 2607 */ 2608 for (c = 0; c < vd->vdev_children; c++) { 2609 tvd = vd->vdev_child[c]; 2610 vdev_remove_child(vd, tvd); 2611 tvd->vdev_id = rvd->vdev_children; 2612 vdev_add_child(rvd, tvd); 2613 vdev_config_dirty(tvd); 2614 } 2615 2616 if (nspares != 0) { 2617 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2618 ZPOOL_CONFIG_SPARES); 2619 spa_load_spares(spa); 2620 spa->spa_spares.sav_sync = B_TRUE; 2621 } 2622 2623 if (nl2cache != 0) { 2624 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2625 ZPOOL_CONFIG_L2CACHE); 2626 spa_load_l2cache(spa); 2627 spa->spa_l2cache.sav_sync = B_TRUE; 2628 } 2629 2630 /* 2631 * We have to be careful when adding new vdevs to an existing pool. 2632 * If other threads start allocating from these vdevs before we 2633 * sync the config cache, and we lose power, then upon reboot we may 2634 * fail to open the pool because there are DVAs that the config cache 2635 * can't translate. Therefore, we first add the vdevs without 2636 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2637 * and then let spa_config_update() initialize the new metaslabs. 2638 * 2639 * spa_load() checks for added-but-not-initialized vdevs, so that 2640 * if we lose power at any point in this sequence, the remaining 2641 * steps will be completed the next time we load the pool. 2642 */ 2643 (void) spa_vdev_exit(spa, vd, txg, 0); 2644 2645 mutex_enter(&spa_namespace_lock); 2646 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2647 mutex_exit(&spa_namespace_lock); 2648 2649 return (0); 2650 } 2651 2652 /* 2653 * Attach a device to a mirror. The arguments are the path to any device 2654 * in the mirror, and the nvroot for the new device. If the path specifies 2655 * a device that is not mirrored, we automatically insert the mirror vdev. 
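 * (That is, attaching to a plain disk converts that top-level vdev into a
 * two-way mirror, while attaching to an existing mirror simply widens it.)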
2656 * 2657 * If 'replacing' is specified, the new device is intended to replace the 2658 * existing device; in this case the two devices are made into their own 2659 * mirror using the 'replacing' vdev, which is functionally identical to 2660 * the mirror vdev (it actually reuses all the same ops) but has a few 2661 * extra rules: you can't attach to it after it's been created, and upon 2662 * completion of resilvering, the first disk (the one being replaced) 2663 * is automatically detached. 2664 */ 2665 int 2666 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2667 { 2668 uint64_t txg, open_txg; 2669 int error; 2670 vdev_t *rvd = spa->spa_root_vdev; 2671 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2672 vdev_ops_t *pvops; 2673 int is_log; 2674 2675 txg = spa_vdev_enter(spa); 2676 2677 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2678 2679 if (oldvd == NULL) 2680 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2681 2682 if (!oldvd->vdev_ops->vdev_op_leaf) 2683 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2684 2685 pvd = oldvd->vdev_parent; 2686 2687 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2688 VDEV_ALLOC_ADD)) != 0) 2689 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2690 2691 if (newrootvd->vdev_children != 1) 2692 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2693 2694 newvd = newrootvd->vdev_child[0]; 2695 2696 if (!newvd->vdev_ops->vdev_op_leaf) 2697 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2698 2699 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2700 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2701 2702 /* 2703 * Spares can't replace logs 2704 */ 2705 is_log = oldvd->vdev_islog; 2706 if (is_log && newvd->vdev_isspare) 2707 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2708 2709 if (!replacing) { 2710 /* 2711 * For attach, the only allowable parent is a mirror or the root 2712 * vdev. 2713 */ 2714 if (pvd->vdev_ops != &vdev_mirror_ops && 2715 pvd->vdev_ops != &vdev_root_ops) 2716 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2717 2718 pvops = &vdev_mirror_ops; 2719 } else { 2720 /* 2721 * Active hot spares can only be replaced by inactive hot 2722 * spares. 2723 */ 2724 if (pvd->vdev_ops == &vdev_spare_ops && 2725 pvd->vdev_child[1] == oldvd && 2726 !spa_has_spare(spa, newvd->vdev_guid)) 2727 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2728 2729 /* 2730 * If the source is a hot spare, and the parent isn't already a 2731 * spare, then we want to create a new hot spare. Otherwise, we 2732 * want to create a replacing vdev. The user is not allowed to 2733 * attach to a spared vdev child unless the 'isspare' state is 2734 * the same (spare replaces spare, non-spare replaces 2735 * non-spare). 2736 */ 2737 if (pvd->vdev_ops == &vdev_replacing_ops) 2738 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2739 else if (pvd->vdev_ops == &vdev_spare_ops && 2740 newvd->vdev_isspare != oldvd->vdev_isspare) 2741 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2742 else if (pvd->vdev_ops != &vdev_spare_ops && 2743 newvd->vdev_isspare) 2744 pvops = &vdev_spare_ops; 2745 else 2746 pvops = &vdev_replacing_ops; 2747 } 2748 2749 /* 2750 * Compare the new device size with the replaceable/attachable 2751 * device size. 2752 */ 2753 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2754 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2755 2756 /* 2757 * The new device cannot have a higher alignment requirement 2758 * than the top-level vdev. 
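 * For example, with ashift expressed as log2 of the device's sector size, a
 * 4096-byte-sector disk (ashift 12) cannot be attached beneath a top-level
 * vdev built from 512-byte-sector disks (ashift 9).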
2759 */ 2760 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2761 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2762 2763 /* 2764 * If this is an in-place replacement, update oldvd's path and devid 2765 * to make it distinguishable from newvd, and unopenable from now on. 2766 */ 2767 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2768 spa_strfree(oldvd->vdev_path); 2769 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2770 KM_SLEEP); 2771 (void) sprintf(oldvd->vdev_path, "%s/%s", 2772 newvd->vdev_path, "old"); 2773 if (oldvd->vdev_devid != NULL) { 2774 spa_strfree(oldvd->vdev_devid); 2775 oldvd->vdev_devid = NULL; 2776 } 2777 } 2778 2779 /* 2780 * If the parent is not a mirror, or if we're replacing, insert the new 2781 * mirror/replacing/spare vdev above oldvd. 2782 */ 2783 if (pvd->vdev_ops != pvops) 2784 pvd = vdev_add_parent(oldvd, pvops); 2785 2786 ASSERT(pvd->vdev_top->vdev_parent == rvd); 2787 ASSERT(pvd->vdev_ops == pvops); 2788 ASSERT(oldvd->vdev_parent == pvd); 2789 2790 /* 2791 * Extract the new device from its root and add it to pvd. 2792 */ 2793 vdev_remove_child(newrootvd, newvd); 2794 newvd->vdev_id = pvd->vdev_children; 2795 vdev_add_child(pvd, newvd); 2796 2797 /* 2798 * If newvd is smaller than oldvd, but larger than its rsize, 2799 * the addition of newvd may have decreased our parent's asize. 2800 */ 2801 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 2802 2803 tvd = newvd->vdev_top; 2804 ASSERT(pvd->vdev_top == tvd); 2805 ASSERT(tvd->vdev_parent == rvd); 2806 2807 vdev_config_dirty(tvd); 2808 2809 /* 2810 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2811 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2812 */ 2813 open_txg = txg + TXG_CONCURRENT_STATES - 1; 2814 2815 mutex_enter(&newvd->vdev_dtl_lock); 2816 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2817 open_txg - TXG_INITIAL + 1); 2818 mutex_exit(&newvd->vdev_dtl_lock); 2819 2820 if (newvd->vdev_isspare) 2821 spa_spare_activate(newvd); 2822 2823 /* 2824 * Mark newvd's DTL dirty in this txg. 2825 */ 2826 vdev_dirty(tvd, VDD_DTL, newvd, txg); 2827 2828 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2829 2830 /* 2831 * Kick off a resilver to update newvd. We need to grab the namespace 2832 * lock because spa_scrub() needs to post a sysevent with the pool name. 2833 */ 2834 mutex_enter(&spa_namespace_lock); 2835 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2836 mutex_exit(&spa_namespace_lock); 2837 2838 return (0); 2839 } 2840 2841 /* 2842 * Detach a device from a mirror or replacing vdev. 2843 * If 'replace_done' is specified, only detach if the parent 2844 * is a replacing vdev. 2845 */ 2846 int 2847 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2848 { 2849 uint64_t txg; 2850 int c, t, error; 2851 vdev_t *rvd = spa->spa_root_vdev; 2852 vdev_t *vd, *pvd, *cvd, *tvd; 2853 boolean_t unspare = B_FALSE; 2854 uint64_t unspare_guid; 2855 2856 txg = spa_vdev_enter(spa); 2857 2858 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 2859 2860 if (vd == NULL) 2861 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2862 2863 if (!vd->vdev_ops->vdev_op_leaf) 2864 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2865 2866 pvd = vd->vdev_parent; 2867 2868 /* 2869 * If replace_done is specified, only remove this device if it's 2870 * the first child of a replacing vdev. For the 'spare' vdev, either 2871 * disk can be removed. 
2872 */ 2873 if (replace_done) { 2874 if (pvd->vdev_ops == &vdev_replacing_ops) { 2875 if (vd->vdev_id != 0) 2876 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2877 } else if (pvd->vdev_ops != &vdev_spare_ops) { 2878 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2879 } 2880 } 2881 2882 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 2883 spa_version(spa) >= SPA_VERSION_SPARES); 2884 2885 /* 2886 * Only mirror, replacing, and spare vdevs support detach. 2887 */ 2888 if (pvd->vdev_ops != &vdev_replacing_ops && 2889 pvd->vdev_ops != &vdev_mirror_ops && 2890 pvd->vdev_ops != &vdev_spare_ops) 2891 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2892 2893 /* 2894 * If there's only one replica, you can't detach it. 2895 */ 2896 if (pvd->vdev_children <= 1) 2897 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2898 2899 /* 2900 * If all siblings have non-empty DTLs, this device may have the only 2901 * valid copy of the data, which means we cannot safely detach it. 2902 * 2903 * XXX -- as in the vdev_offline() case, we really want a more 2904 * precise DTL check. 2905 */ 2906 for (c = 0; c < pvd->vdev_children; c++) { 2907 uint64_t dirty; 2908 2909 cvd = pvd->vdev_child[c]; 2910 if (cvd == vd) 2911 continue; 2912 if (vdev_is_dead(cvd)) 2913 continue; 2914 mutex_enter(&cvd->vdev_dtl_lock); 2915 dirty = cvd->vdev_dtl_map.sm_space | 2916 cvd->vdev_dtl_scrub.sm_space; 2917 mutex_exit(&cvd->vdev_dtl_lock); 2918 if (!dirty) 2919 break; 2920 } 2921 2922 /* 2923 * If we are a replacing or spare vdev, then we can always detach the 2924 * latter child, as that is how one cancels the operation. 2925 */ 2926 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 2927 c == pvd->vdev_children) 2928 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2929 2930 /* 2931 * If we are detaching the original disk from a spare, then it implies 2932 * that the spare should become a real disk, and be removed from the 2933 * active spare list for the pool. 2934 */ 2935 if (pvd->vdev_ops == &vdev_spare_ops && 2936 vd->vdev_id == 0) 2937 unspare = B_TRUE; 2938 2939 /* 2940 * Erase the disk labels so the disk can be used for other things. 2941 * This must be done after all other error cases are handled, 2942 * but before we disembowel vd (so we can still do I/O to it). 2943 * But if we can't do it, don't treat the error as fatal -- 2944 * it may be that the unwritability of the disk is the reason 2945 * it's being detached! 2946 */ 2947 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 2948 2949 /* 2950 * Remove vd from its parent and compact the parent's children. 2951 */ 2952 vdev_remove_child(pvd, vd); 2953 vdev_compact_children(pvd); 2954 2955 /* 2956 * Remember one of the remaining children so we can get tvd below. 2957 */ 2958 cvd = pvd->vdev_child[0]; 2959 2960 /* 2961 * If we need to remove the remaining child from the list of hot spares, 2962 * do it now, marking the vdev as no longer a spare in the process. We 2963 * must do this before vdev_remove_parent(), because that can change the 2964 * GUID if it creates a new toplevel GUID. 2965 */ 2966 if (unspare) { 2967 ASSERT(cvd->vdev_isspare); 2968 spa_spare_remove(cvd); 2969 unspare_guid = cvd->vdev_guid; 2970 } 2971 2972 /* 2973 * If the parent mirror/replacing vdev only has one child, 2974 * the parent is no longer needed. Remove it from the tree. 2975 */ 2976 if (pvd->vdev_children == 1) 2977 vdev_remove_parent(cvd); 2978 2979 /* 2980 * We don't set tvd until now because the parent we just removed 2981 * may have been the previous top-level vdev. 
2982 */ 2983 tvd = cvd->vdev_top; 2984 ASSERT(tvd->vdev_parent == rvd); 2985 2986 /* 2987 * Reevaluate the parent vdev state. 2988 */ 2989 vdev_propagate_state(cvd); 2990 2991 /* 2992 * If the device we just detached was smaller than the others, it may be 2993 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2994 * can't fail because the existing metaslabs are already in core, so 2995 * there's nothing to read from disk. 2996 */ 2997 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2998 2999 vdev_config_dirty(tvd); 3000 3001 /* 3002 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3003 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3004 * But first make sure we're not on any *other* txg's DTL list, to 3005 * prevent vd from being accessed after it's freed. 3006 */ 3007 for (t = 0; t < TXG_SIZE; t++) 3008 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3009 vd->vdev_detached = B_TRUE; 3010 vdev_dirty(tvd, VDD_DTL, vd, txg); 3011 3012 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3013 3014 error = spa_vdev_exit(spa, vd, txg, 0); 3015 3016 /* 3017 * If this was the removal of the original device in a hot spare vdev, 3018 * then we want to go through and remove the device from the hot spare 3019 * list of every other pool. 3020 */ 3021 if (unspare) { 3022 spa = NULL; 3023 mutex_enter(&spa_namespace_lock); 3024 while ((spa = spa_next(spa)) != NULL) { 3025 if (spa->spa_state != POOL_STATE_ACTIVE) 3026 continue; 3027 3028 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3029 } 3030 mutex_exit(&spa_namespace_lock); 3031 } 3032 3033 return (error); 3034 } 3035 3036 /* 3037 * Remove a spares vdev from the nvlist config. 3038 */ 3039 static int 3040 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare, 3041 nvlist_t **spares, int nspares, vdev_t *vd) 3042 { 3043 nvlist_t *nv, **newspares; 3044 int i, j; 3045 3046 nv = NULL; 3047 for (i = 0; i < nspares; i++) { 3048 uint64_t theguid; 3049 3050 VERIFY(nvlist_lookup_uint64(spares[i], 3051 ZPOOL_CONFIG_GUID, &theguid) == 0); 3052 if (theguid == guid) { 3053 nv = spares[i]; 3054 break; 3055 } 3056 } 3057 3058 /* 3059 * Only remove the hot spare if it's not currently in use in this pool. 3060 */ 3061 if (nv == NULL && vd == NULL) 3062 return (ENOENT); 3063 3064 if (nv == NULL && vd != NULL) 3065 return (ENOTSUP); 3066 3067 if (!unspare && nv != NULL && vd != NULL) 3068 return (EBUSY); 3069 3070 if (nspares == 1) { 3071 newspares = NULL; 3072 } else { 3073 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 3074 KM_SLEEP); 3075 for (i = 0, j = 0; i < nspares; i++) { 3076 if (spares[i] != nv) 3077 VERIFY(nvlist_dup(spares[i], 3078 &newspares[j++], KM_SLEEP) == 0); 3079 } 3080 } 3081 3082 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES, 3083 DATA_TYPE_NVLIST_ARRAY) == 0); 3084 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3085 ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0); 3086 for (i = 0; i < nspares - 1; i++) 3087 nvlist_free(newspares[i]); 3088 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 3089 3090 return (0); 3091 } 3092 3093 /* 3094 * Remove an l2cache vdev from the nvlist config. 
3095 */ 3096 static int 3097 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache, 3098 int nl2cache, vdev_t *vd) 3099 { 3100 nvlist_t *nv, **newl2cache; 3101 int i, j; 3102 3103 nv = NULL; 3104 for (i = 0; i < nl2cache; i++) { 3105 uint64_t theguid; 3106 3107 VERIFY(nvlist_lookup_uint64(l2cache[i], 3108 ZPOOL_CONFIG_GUID, &theguid) == 0); 3109 if (theguid == guid) { 3110 nv = l2cache[i]; 3111 break; 3112 } 3113 } 3114 3115 if (vd == NULL) { 3116 for (i = 0; i < nl2cache; i++) { 3117 if (sav->sav_vdevs[i]->vdev_guid == guid) { 3118 vd = sav->sav_vdevs[i]; 3119 break; 3120 } 3121 } 3122 } 3123 3124 if (nv == NULL && vd == NULL) 3125 return (ENOENT); 3126 3127 if (nv == NULL && vd != NULL) 3128 return (ENOTSUP); 3129 3130 if (nl2cache == 1) { 3131 newl2cache = NULL; 3132 } else { 3133 newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *), 3134 KM_SLEEP); 3135 for (i = 0, j = 0; i < nl2cache; i++) { 3136 if (l2cache[i] != nv) 3137 VERIFY(nvlist_dup(l2cache[i], 3138 &newl2cache[j++], KM_SLEEP) == 0); 3139 } 3140 } 3141 3142 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 3143 DATA_TYPE_NVLIST_ARRAY) == 0); 3144 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3145 ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0); 3146 for (i = 0; i < nl2cache - 1; i++) 3147 nvlist_free(newl2cache[i]); 3148 kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *)); 3149 3150 return (0); 3151 } 3152 3153 /* 3154 * Remove a device from the pool. Currently, this supports removing only hot 3155 * spares and level 2 ARC devices. 3156 */ 3157 int 3158 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3159 { 3160 vdev_t *vd; 3161 nvlist_t **spares, **l2cache; 3162 uint_t nspares, nl2cache; 3163 int error = 0; 3164 3165 spa_config_enter(spa, RW_WRITER, FTAG); 3166 3167 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3168 3169 if (spa->spa_spares.sav_vdevs != NULL && 3170 spa_spare_exists(guid, NULL) && 3171 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3172 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { 3173 if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare, 3174 spares, nspares, vd)) != 0) 3175 goto out; 3176 spa_load_spares(spa); 3177 spa->spa_spares.sav_sync = B_TRUE; 3178 goto out; 3179 } 3180 3181 if (spa->spa_l2cache.sav_vdevs != NULL && 3182 spa_l2cache_exists(guid, NULL) && 3183 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3184 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { 3185 if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid, 3186 l2cache, nl2cache, vd)) != 0) 3187 goto out; 3188 spa_load_l2cache(spa); 3189 spa->spa_l2cache.sav_sync = B_TRUE; 3190 } 3191 3192 out: 3193 spa_config_exit(spa, FTAG); 3194 return (error); 3195 } 3196 3197 /* 3198 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3199 * currently spared, so we can detach it. 3200 */ 3201 static vdev_t * 3202 spa_vdev_resilver_done_hunt(vdev_t *vd) 3203 { 3204 vdev_t *newvd, *oldvd; 3205 int c; 3206 3207 for (c = 0; c < vd->vdev_children; c++) { 3208 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3209 if (oldvd != NULL) 3210 return (oldvd); 3211 } 3212 3213 /* 3214 * Check for a completed replacement.
3215 */ 3216 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3217 oldvd = vd->vdev_child[0]; 3218 newvd = vd->vdev_child[1]; 3219 3220 mutex_enter(&newvd->vdev_dtl_lock); 3221 if (newvd->vdev_dtl_map.sm_space == 0 && 3222 newvd->vdev_dtl_scrub.sm_space == 0) { 3223 mutex_exit(&newvd->vdev_dtl_lock); 3224 return (oldvd); 3225 } 3226 mutex_exit(&newvd->vdev_dtl_lock); 3227 } 3228 3229 /* 3230 * Check for a completed resilver with the 'unspare' flag set. 3231 */ 3232 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3233 newvd = vd->vdev_child[0]; 3234 oldvd = vd->vdev_child[1]; 3235 3236 mutex_enter(&newvd->vdev_dtl_lock); 3237 if (newvd->vdev_unspare && 3238 newvd->vdev_dtl_map.sm_space == 0 && 3239 newvd->vdev_dtl_scrub.sm_space == 0) { 3240 newvd->vdev_unspare = 0; 3241 mutex_exit(&newvd->vdev_dtl_lock); 3242 return (oldvd); 3243 } 3244 mutex_exit(&newvd->vdev_dtl_lock); 3245 } 3246 3247 return (NULL); 3248 } 3249 3250 static void 3251 spa_vdev_resilver_done(spa_t *spa) 3252 { 3253 vdev_t *vd; 3254 vdev_t *pvd; 3255 uint64_t guid; 3256 uint64_t pguid = 0; 3257 3258 spa_config_enter(spa, RW_READER, FTAG); 3259 3260 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3261 guid = vd->vdev_guid; 3262 /* 3263 * If we have just finished replacing a hot spared device, then 3264 * we need to detach the parent's first child (the original hot 3265 * spare) as well. 3266 */ 3267 pvd = vd->vdev_parent; 3268 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3269 pvd->vdev_id == 0) { 3270 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3271 ASSERT(pvd->vdev_parent->vdev_children == 2); 3272 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 3273 } 3274 spa_config_exit(spa, FTAG); 3275 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 3276 return; 3277 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 3278 return; 3279 spa_config_enter(spa, RW_READER, FTAG); 3280 } 3281 3282 spa_config_exit(spa, FTAG); 3283 } 3284 3285 /* 3286 * Update the stored path for this vdev. Dirty the vdev configuration, relying 3287 * on spa_vdev_enter/exit() to synchronize the labels and cache. 3288 */ 3289 int 3290 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3291 { 3292 vdev_t *vd; 3293 uint64_t txg; 3294 3295 txg = spa_vdev_enter(spa); 3296 3297 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { 3298 /* 3299 * Determine if this is a reference to a hot spare device. If 3300 * it is, update the path manually as there is no associated 3301 * vdev_t that can be synced to disk. 
3302 */ 3303 nvlist_t **spares; 3304 uint_t i, nspares; 3305 3306 if (spa->spa_spares.sav_config != NULL) { 3307 VERIFY(nvlist_lookup_nvlist_array( 3308 spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 3309 &spares, &nspares) == 0); 3310 for (i = 0; i < nspares; i++) { 3311 uint64_t theguid; 3312 VERIFY(nvlist_lookup_uint64(spares[i], 3313 ZPOOL_CONFIG_GUID, &theguid) == 0); 3314 if (theguid == guid) { 3315 VERIFY(nvlist_add_string(spares[i], 3316 ZPOOL_CONFIG_PATH, newpath) == 0); 3317 spa_load_spares(spa); 3318 spa->spa_spares.sav_sync = B_TRUE; 3319 return (spa_vdev_exit(spa, NULL, txg, 3320 0)); 3321 } 3322 } 3323 } 3324 3325 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3326 } 3327 3328 if (!vd->vdev_ops->vdev_op_leaf) 3329 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3330 3331 spa_strfree(vd->vdev_path); 3332 vd->vdev_path = spa_strdup(newpath); 3333 3334 vdev_config_dirty(vd->vdev_top); 3335 3336 return (spa_vdev_exit(spa, NULL, txg, 0)); 3337 } 3338 3339 /* 3340 * ========================================================================== 3341 * SPA Scrubbing 3342 * ========================================================================== 3343 */ 3344 3345 static void 3346 spa_scrub_io_done(zio_t *zio) 3347 { 3348 spa_t *spa = zio->io_spa; 3349 3350 arc_data_buf_free(zio->io_data, zio->io_size); 3351 3352 mutex_enter(&spa->spa_scrub_lock); 3353 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3354 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 3355 spa->spa_scrub_errors++; 3356 mutex_enter(&vd->vdev_stat_lock); 3357 vd->vdev_stat.vs_scrub_errors++; 3358 mutex_exit(&vd->vdev_stat_lock); 3359 } 3360 3361 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 3362 cv_broadcast(&spa->spa_scrub_io_cv); 3363 3364 ASSERT(spa->spa_scrub_inflight >= 0); 3365 3366 mutex_exit(&spa->spa_scrub_lock); 3367 } 3368 3369 static void 3370 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 3371 zbookmark_t *zb) 3372 { 3373 size_t size = BP_GET_LSIZE(bp); 3374 void *data; 3375 3376 mutex_enter(&spa->spa_scrub_lock); 3377 /* 3378 * Do not give too much work to vdev(s). 3379 */ 3380 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 3381 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3382 } 3383 spa->spa_scrub_inflight++; 3384 mutex_exit(&spa->spa_scrub_lock); 3385 3386 data = arc_data_buf_alloc(size); 3387 3388 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 3389 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 3390 3391 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 3392 3393 zio_nowait(zio_read(NULL, spa, bp, data, size, 3394 spa_scrub_io_done, NULL, priority, flags, zb)); 3395 } 3396 3397 /* ARGSUSED */ 3398 static int 3399 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 3400 { 3401 blkptr_t *bp = &bc->bc_blkptr; 3402 vdev_t *vd = spa->spa_root_vdev; 3403 dva_t *dva = bp->blk_dva; 3404 int needs_resilver = B_FALSE; 3405 int d; 3406 3407 if (bc->bc_errno) { 3408 /* 3409 * We can't scrub this block, but we can continue to scrub 3410 * the rest of the pool. Note the error and move along. 
3411 */ 3412 mutex_enter(&spa->spa_scrub_lock); 3413 spa->spa_scrub_errors++; 3414 mutex_exit(&spa->spa_scrub_lock); 3415 3416 mutex_enter(&vd->vdev_stat_lock); 3417 vd->vdev_stat.vs_scrub_errors++; 3418 mutex_exit(&vd->vdev_stat_lock); 3419 3420 return (ERESTART); 3421 } 3422 3423 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 3424 3425 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 3426 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 3427 3428 ASSERT(vd != NULL); 3429 3430 /* 3431 * Keep track of how much data we've examined so that 3432 * zpool(1M) status can make useful progress reports. 3433 */ 3434 mutex_enter(&vd->vdev_stat_lock); 3435 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 3436 mutex_exit(&vd->vdev_stat_lock); 3437 3438 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 3439 if (DVA_GET_GANG(&dva[d])) { 3440 /* 3441 * Gang members may be spread across multiple 3442 * vdevs, so the best we can do is look at the 3443 * pool-wide DTL. 3444 * XXX -- it would be better to change our 3445 * allocation policy to ensure that this can't 3446 * happen. 3447 */ 3448 vd = spa->spa_root_vdev; 3449 } 3450 if (vdev_dtl_contains(&vd->vdev_dtl_map, 3451 bp->blk_birth, 1)) 3452 needs_resilver = B_TRUE; 3453 } 3454 } 3455 3456 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 3457 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 3458 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 3459 else if (needs_resilver) 3460 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 3461 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 3462 3463 return (0); 3464 } 3465 3466 static void 3467 spa_scrub_thread(spa_t *spa) 3468 { 3469 callb_cpr_t cprinfo; 3470 traverse_handle_t *th = spa->spa_scrub_th; 3471 vdev_t *rvd = spa->spa_root_vdev; 3472 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 3473 int error = 0; 3474 boolean_t complete; 3475 3476 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 3477 3478 /* 3479 * If we're restarting due to a snapshot create/delete, 3480 * wait for that to complete. 3481 */ 3482 txg_wait_synced(spa_get_dsl(spa), 0); 3483 3484 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 3485 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 3486 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 3487 3488 spa_config_enter(spa, RW_WRITER, FTAG); 3489 vdev_reopen(rvd); /* purge all vdev caches */ 3490 vdev_config_dirty(rvd); /* rewrite all disk labels */ 3491 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 3492 spa_config_exit(spa, FTAG); 3493 3494 mutex_enter(&spa->spa_scrub_lock); 3495 spa->spa_scrub_errors = 0; 3496 spa->spa_scrub_active = 1; 3497 ASSERT(spa->spa_scrub_inflight == 0); 3498 3499 while (!spa->spa_scrub_stop) { 3500 CALLB_CPR_SAFE_BEGIN(&cprinfo); 3501 while (spa->spa_scrub_suspended) { 3502 spa->spa_scrub_active = 0; 3503 cv_broadcast(&spa->spa_scrub_cv); 3504 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3505 spa->spa_scrub_active = 1; 3506 } 3507 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 3508 3509 if (spa->spa_scrub_restart_txg != 0) 3510 break; 3511 3512 mutex_exit(&spa->spa_scrub_lock); 3513 error = traverse_more(th); 3514 mutex_enter(&spa->spa_scrub_lock); 3515 if (error != EAGAIN) 3516 break; 3517 } 3518 3519 while (spa->spa_scrub_inflight) 3520 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3521 3522 spa->spa_scrub_active = 0; 3523 cv_broadcast(&spa->spa_scrub_cv); 3524 3525 mutex_exit(&spa->spa_scrub_lock); 3526 3527 spa_config_enter(spa, RW_WRITER, FTAG); 3528 3529 mutex_enter(&spa->spa_scrub_lock); 3530 3531 /* 3532 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 3533 * AND the spa config lock to synchronize with any config changes 3534 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 3535 */ 3536 if (spa->spa_scrub_restart_txg != 0) 3537 error = ERESTART; 3538 3539 if (spa->spa_scrub_stop) 3540 error = EINTR; 3541 3542 /* 3543 * Even if there were uncorrectable errors, we consider the scrub 3544 * completed. The downside is that if there is a transient error during 3545 * a resilver, we won't resilver the data properly to the target. But 3546 * if the damage is permanent (more likely) we will resilver forever, 3547 * which isn't really acceptable. Since there is enough information for 3548 * the user to know what has failed and why, this seems like a more 3549 * tractable approach. 3550 */ 3551 complete = (error == 0); 3552 3553 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 3554 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 3555 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 3556 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 3557 3558 mutex_exit(&spa->spa_scrub_lock); 3559 3560 /* 3561 * If the scrub/resilver completed, update all DTLs to reflect this. 3562 * Whether it succeeded or not, vacate all temporary scrub DTLs. 3563 */ 3564 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 3565 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 3566 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 3567 spa_errlog_rotate(spa); 3568 3569 if (scrub_type == POOL_SCRUB_RESILVER && complete) 3570 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 3571 3572 spa_config_exit(spa, FTAG); 3573 3574 mutex_enter(&spa->spa_scrub_lock); 3575 3576 /* 3577 * We may have finished replacing a device. 3578 * Let the async thread assess this and handle the detach. 3579 */ 3580 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3581 3582 /* 3583 * If we were told to restart, our final act is to start a new scrub. 3584 */ 3585 if (error == ERESTART) 3586 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
3587 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 3588 3589 spa->spa_scrub_type = POOL_SCRUB_NONE; 3590 spa->spa_scrub_active = 0; 3591 spa->spa_scrub_thread = NULL; 3592 cv_broadcast(&spa->spa_scrub_cv); 3593 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 3594 thread_exit(); 3595 } 3596 3597 void 3598 spa_scrub_suspend(spa_t *spa) 3599 { 3600 mutex_enter(&spa->spa_scrub_lock); 3601 spa->spa_scrub_suspended++; 3602 while (spa->spa_scrub_active) { 3603 cv_broadcast(&spa->spa_scrub_cv); 3604 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3605 } 3606 while (spa->spa_scrub_inflight) 3607 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3608 mutex_exit(&spa->spa_scrub_lock); 3609 } 3610 3611 void 3612 spa_scrub_resume(spa_t *spa) 3613 { 3614 mutex_enter(&spa->spa_scrub_lock); 3615 ASSERT(spa->spa_scrub_suspended != 0); 3616 if (--spa->spa_scrub_suspended == 0) 3617 cv_broadcast(&spa->spa_scrub_cv); 3618 mutex_exit(&spa->spa_scrub_lock); 3619 } 3620 3621 void 3622 spa_scrub_restart(spa_t *spa, uint64_t txg) 3623 { 3624 /* 3625 * Something happened (e.g. snapshot create/delete) that means 3626 * we must restart any in-progress scrubs. The itinerary will 3627 * fix this properly. 3628 */ 3629 mutex_enter(&spa->spa_scrub_lock); 3630 spa->spa_scrub_restart_txg = txg; 3631 mutex_exit(&spa->spa_scrub_lock); 3632 } 3633 3634 int 3635 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 3636 { 3637 space_seg_t *ss; 3638 uint64_t mintxg, maxtxg; 3639 vdev_t *rvd = spa->spa_root_vdev; 3640 3641 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3642 ASSERT(!spa_config_held(spa, RW_WRITER)); 3643 3644 if ((uint_t)type >= POOL_SCRUB_TYPES) 3645 return (ENOTSUP); 3646 3647 mutex_enter(&spa->spa_scrub_lock); 3648 3649 /* 3650 * If there's a scrub or resilver already in progress, stop it. 3651 */ 3652 while (spa->spa_scrub_thread != NULL) { 3653 /* 3654 * Don't stop a resilver unless forced. 3655 */ 3656 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 3657 mutex_exit(&spa->spa_scrub_lock); 3658 return (EBUSY); 3659 } 3660 spa->spa_scrub_stop = 1; 3661 cv_broadcast(&spa->spa_scrub_cv); 3662 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3663 } 3664 3665 /* 3666 * Terminate the previous traverse. 3667 */ 3668 if (spa->spa_scrub_th != NULL) { 3669 traverse_fini(spa->spa_scrub_th); 3670 spa->spa_scrub_th = NULL; 3671 } 3672 3673 if (rvd == NULL) { 3674 ASSERT(spa->spa_scrub_stop == 0); 3675 ASSERT(spa->spa_scrub_type == type); 3676 ASSERT(spa->spa_scrub_restart_txg == 0); 3677 mutex_exit(&spa->spa_scrub_lock); 3678 return (0); 3679 } 3680 3681 mintxg = TXG_INITIAL - 1; 3682 maxtxg = spa_last_synced_txg(spa) + 1; 3683 3684 mutex_enter(&rvd->vdev_dtl_lock); 3685 3686 if (rvd->vdev_dtl_map.sm_space == 0) { 3687 /* 3688 * The pool-wide DTL is empty. 3689 * If this is a resilver, there's nothing to do except 3690 * check whether any in-progress replacements have completed. 3691 */ 3692 if (type == POOL_SCRUB_RESILVER) { 3693 type = POOL_SCRUB_NONE; 3694 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3695 } 3696 } else { 3697 /* 3698 * The pool-wide DTL is non-empty. 3699 * If this is a normal scrub, upgrade to a resilver instead. 3700 */ 3701 if (type == POOL_SCRUB_EVERYTHING) 3702 type = POOL_SCRUB_RESILVER; 3703 } 3704 3705 if (type == POOL_SCRUB_RESILVER) { 3706 /* 3707 * Determine the resilvering boundaries. 3708 * 3709 * Note: (mintxg, maxtxg) is an open interval, 3710 * i.e. mintxg and maxtxg themselves are not included. 
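 * For example, mintxg = 9 and maxtxg = 20 means txgs 10 through 19 are
 * covered by the resilver.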
3711 * 3712 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 3713 * so we don't claim to resilver a txg that's still changing. 3714 */ 3715 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 3716 mintxg = ss->ss_start - 1; 3717 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 3718 maxtxg = MIN(ss->ss_end, maxtxg); 3719 3720 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 3721 } 3722 3723 mutex_exit(&rvd->vdev_dtl_lock); 3724 3725 spa->spa_scrub_stop = 0; 3726 spa->spa_scrub_type = type; 3727 spa->spa_scrub_restart_txg = 0; 3728 3729 if (type != POOL_SCRUB_NONE) { 3730 spa->spa_scrub_mintxg = mintxg; 3731 spa->spa_scrub_maxtxg = maxtxg; 3732 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 3733 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 3734 ZIO_FLAG_CANFAIL); 3735 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 3736 spa->spa_scrub_thread = thread_create(NULL, 0, 3737 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 3738 } 3739 3740 mutex_exit(&spa->spa_scrub_lock); 3741 3742 return (0); 3743 } 3744 3745 /* 3746 * ========================================================================== 3747 * SPA async task processing 3748 * ========================================================================== 3749 */ 3750 3751 static void 3752 spa_async_remove(spa_t *spa, vdev_t *vd) 3753 { 3754 vdev_t *tvd; 3755 int c; 3756 3757 for (c = 0; c < vd->vdev_children; c++) { 3758 tvd = vd->vdev_child[c]; 3759 if (tvd->vdev_remove_wanted) { 3760 tvd->vdev_remove_wanted = 0; 3761 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 3762 VDEV_AUX_NONE); 3763 vdev_clear(spa, tvd, B_TRUE); 3764 vdev_config_dirty(tvd->vdev_top); 3765 } 3766 spa_async_remove(spa, tvd); 3767 } 3768 } 3769 3770 static void 3771 spa_async_thread(spa_t *spa) 3772 { 3773 int tasks; 3774 uint64_t txg; 3775 3776 ASSERT(spa->spa_sync_on); 3777 3778 mutex_enter(&spa->spa_async_lock); 3779 tasks = spa->spa_async_tasks; 3780 spa->spa_async_tasks = 0; 3781 mutex_exit(&spa->spa_async_lock); 3782 3783 /* 3784 * See if the config needs to be updated. 3785 */ 3786 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3787 mutex_enter(&spa_namespace_lock); 3788 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3789 mutex_exit(&spa_namespace_lock); 3790 } 3791 3792 /* 3793 * See if any devices need to be marked REMOVED. 3794 * 3795 * XXX - We avoid doing this when we are in 3796 * I/O failure state since spa_vdev_enter() grabs 3797 * the namespace lock and would not be able to obtain 3798 * the writer config lock. 3799 */ 3800 if (tasks & SPA_ASYNC_REMOVE && 3801 spa_state(spa) != POOL_STATE_IO_FAILURE) { 3802 txg = spa_vdev_enter(spa); 3803 spa_async_remove(spa, spa->spa_root_vdev); 3804 (void) spa_vdev_exit(spa, NULL, txg, 0); 3805 } 3806 3807 /* 3808 * If any devices are done replacing, detach them. 3809 */ 3810 if (tasks & SPA_ASYNC_RESILVER_DONE) 3811 spa_vdev_resilver_done(spa); 3812 3813 /* 3814 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 3815 * scrub which can become a resilver), we need to hold 3816 * spa_namespace_lock() because the sysevent we post via 3817 * spa_event_notify() needs to get the name of the pool. 3818 */ 3819 if (tasks & SPA_ASYNC_SCRUB) { 3820 mutex_enter(&spa_namespace_lock); 3821 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 3822 mutex_exit(&spa_namespace_lock); 3823 } 3824 3825 /* 3826 * Kick off a resilver. 
3827 */ 3828 if (tasks & SPA_ASYNC_RESILVER) { 3829 mutex_enter(&spa_namespace_lock); 3830 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 3831 mutex_exit(&spa_namespace_lock); 3832 } 3833 3834 /* 3835 * Let the world know that we're done. 3836 */ 3837 mutex_enter(&spa->spa_async_lock); 3838 spa->spa_async_thread = NULL; 3839 cv_broadcast(&spa->spa_async_cv); 3840 mutex_exit(&spa->spa_async_lock); 3841 thread_exit(); 3842 } 3843 3844 void 3845 spa_async_suspend(spa_t *spa) 3846 { 3847 mutex_enter(&spa->spa_async_lock); 3848 spa->spa_async_suspended++; 3849 while (spa->spa_async_thread != NULL) 3850 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3851 mutex_exit(&spa->spa_async_lock); 3852 } 3853 3854 void 3855 spa_async_resume(spa_t *spa) 3856 { 3857 mutex_enter(&spa->spa_async_lock); 3858 ASSERT(spa->spa_async_suspended != 0); 3859 spa->spa_async_suspended--; 3860 mutex_exit(&spa->spa_async_lock); 3861 } 3862 3863 static void 3864 spa_async_dispatch(spa_t *spa) 3865 { 3866 mutex_enter(&spa->spa_async_lock); 3867 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3868 spa->spa_async_thread == NULL && 3869 rootdir != NULL && !vn_is_readonly(rootdir)) 3870 spa->spa_async_thread = thread_create(NULL, 0, 3871 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3872 mutex_exit(&spa->spa_async_lock); 3873 } 3874 3875 void 3876 spa_async_request(spa_t *spa, int task) 3877 { 3878 mutex_enter(&spa->spa_async_lock); 3879 spa->spa_async_tasks |= task; 3880 mutex_exit(&spa->spa_async_lock); 3881 } 3882 3883 /* 3884 * ========================================================================== 3885 * SPA syncing routines 3886 * ========================================================================== 3887 */ 3888 3889 static void 3890 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3891 { 3892 bplist_t *bpl = &spa->spa_sync_bplist; 3893 dmu_tx_t *tx; 3894 blkptr_t blk; 3895 uint64_t itor = 0; 3896 zio_t *zio; 3897 int error; 3898 uint8_t c = 1; 3899 3900 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3901 3902 while (bplist_iterate(bpl, &itor, &blk) == 0) 3903 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3904 3905 error = zio_wait(zio); 3906 ASSERT3U(error, ==, 0); 3907 3908 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3909 bplist_vacate(bpl, tx); 3910 3911 /* 3912 * Pre-dirty the first block so we sync to convergence faster. 3913 * (Usually only the first block is needed.) 
3914 */ 3915 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3916 dmu_tx_commit(tx); 3917 } 3918 3919 static void 3920 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3921 { 3922 char *packed = NULL; 3923 size_t nvsize = 0; 3924 dmu_buf_t *db; 3925 3926 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3927 3928 packed = kmem_alloc(nvsize, KM_SLEEP); 3929 3930 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3931 KM_SLEEP) == 0); 3932 3933 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 3934 3935 kmem_free(packed, nvsize); 3936 3937 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3938 dmu_buf_will_dirty(db, tx); 3939 *(uint64_t *)db->db_data = nvsize; 3940 dmu_buf_rele(db, FTAG); 3941 } 3942 3943 static void 3944 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3945 const char *config, const char *entry) 3946 { 3947 nvlist_t *nvroot; 3948 nvlist_t **list; 3949 int i; 3950 3951 if (!sav->sav_sync) 3952 return; 3953 3954 /* 3955 * Update the MOS nvlist describing the list of available devices. 3956 * spa_validate_aux() will have already made sure this nvlist is 3957 * valid and the vdevs are labeled appropriately. 3958 */ 3959 if (sav->sav_object == 0) { 3960 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3961 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3962 sizeof (uint64_t), tx); 3963 VERIFY(zap_update(spa->spa_meta_objset, 3964 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3965 &sav->sav_object, tx) == 0); 3966 } 3967 3968 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3969 if (sav->sav_count == 0) { 3970 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3971 } else { 3972 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3973 for (i = 0; i < sav->sav_count; i++) 3974 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3975 B_FALSE, B_FALSE, B_TRUE); 3976 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3977 sav->sav_count) == 0); 3978 for (i = 0; i < sav->sav_count; i++) 3979 nvlist_free(list[i]); 3980 kmem_free(list, sav->sav_count * sizeof (void *)); 3981 } 3982 3983 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3984 nvlist_free(nvroot); 3985 3986 sav->sav_sync = B_FALSE; 3987 } 3988 3989 static void 3990 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3991 { 3992 nvlist_t *config; 3993 3994 if (list_is_empty(&spa->spa_dirty_list)) 3995 return; 3996 3997 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 3998 3999 if (spa->spa_config_syncing) 4000 nvlist_free(spa->spa_config_syncing); 4001 spa->spa_config_syncing = config; 4002 4003 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 4004 } 4005 4006 /* 4007 * Set zpool properties. 4008 */ 4009 static void 4010 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 4011 { 4012 spa_t *spa = arg1; 4013 objset_t *mos = spa->spa_meta_objset; 4014 nvlist_t *nvp = arg2; 4015 nvpair_t *elem; 4016 uint64_t intval; 4017 char *strval; 4018 zpool_prop_t prop; 4019 const char *propname; 4020 zprop_type_t proptype; 4021 spa_config_dirent_t *dp; 4022 4023 elem = NULL; 4024 while ((elem = nvlist_next_nvpair(nvp, elem))) { 4025 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 4026 case ZPOOL_PROP_VERSION: 4027 /* 4028 * Only set version for non-zpool-creation cases 4029 * (set/import). spa_create() needs special care 4030 * for version setting. 
/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;
	spa_config_dirent_t *dp;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import). spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'cachefile' is a non-persistent property, but we
			 * post an async request so that the config cache
			 * gets updated.
			 */
			VERIFY(nvpair_value_string(elem, &strval) == 0);

			dp = kmem_alloc(sizeof (spa_config_dirent_t),
			    KM_SLEEP);

			if (strval[0] == '\0')
				dp->scd_path = spa_strdup(spa_config_path);
			else if (strcmp(strval, "none") == 0)
				dp->scd_path = NULL;
			else
				dp->scd_path = spa_strdup(strval);

			list_insert_head(&spa->spa_config_list, dp);
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			mutex_enter(&spa->spa_props_lock);
			if (spa->spa_pool_props_object == 0) {
				objset_t *mos = spa->spa_meta_objset;

				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}
			mutex_exit(&spa->spa_props_lock);

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa->spa_name);
		}
	}
}
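
/*
 * A note on the ZAP encoding used by spa_sync_props() above: each property is
 * a ZAP entry keyed by its name.  String values are stored as (strlen + 1)
 * one-byte integers -- i.e. the NUL-terminated string itself -- and numeric
 * values as a single 64-bit integer.  Setting a numeric property therefore
 * boils down to a call of the form (illustrative only):
 *
 *	uint64_t intval = 1;
 *	VERIFY(zap_update(mos, spa->spa_pool_props_object,
 *	    zpool_prop_to_name(prop), 8, 1, &intval, tx) == 0);
 *
 * which matches the shape of the zap_update() calls in the default case.
 */
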
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
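
	/*
	 * Each pass of the loop above can allocate new blocks (for the MOS,
	 * space maps and config objects), which in turn dirties more vdevs;
	 * the loop therefore repeats until a pass completes without dirtying
	 * any vdev, and spa_sync_pass records how many passes that took.
	 */
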
	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for details).  If there *are*
	 * dirty vdevs -- or if the sync to our random subset fails --
	 * then sync the uberblock to all vdevs.
	 */
	if (list_is_empty(&spa->spa_dirty_list)) {
		vdev_t *svd[SPA_DVAS_PER_BP];
		int svdcount = 0;
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
				continue;
			svd[svdcount++] = vd;
			if (svdcount == SPA_DVAS_PER_BP)
				break;
		}
		vdev_config_sync(svd, svdcount, txg);
	} else {
		vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
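
/*
 * txg_wait_synced(spa_get_dsl(spa), 0) is the building block here: passing 0
 * waits for whatever txg is currently open to reach stable storage.
 * spa_upgrade() below relies on the same call to guarantee that the bumped
 * on-disk version is durable before it returns.
 */
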
/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (l2cache) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a pool
	 * from a future version would never have been openable in the first
	 * place, ub_version can never exceed SPA_VERSION here.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}
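
/*
 * Example (illustrative sketch; the caller shown is hypothetical): a typical
 * upgrade path checks the running version and then lets spa_upgrade() dirty
 * the config:
 *
 *	if (spa_version(spa) < SPA_VERSION)
 *		spa_upgrade(spa, SPA_VERSION);
 *
 * spa_upgrade() bumps ub_version under the config lock held as writer, marks
 * the root vdev's config dirty, and waits for the txg to sync so the new
 * version is on disk before it returns.
 */
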
/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
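
/*
 * Example (illustrative sketch; the ESC_ZFS_* names are taken from
 * sys/sysevent/eventdefs.h and shown here only as examples): callers pass one
 * of the EC_ZFS subclass names, with the vdev argument optional:
 *
 *	spa_event_notify(spa, NULL, ESC_ZFS_CONFIG_SYNC);	(pool-wide event)
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);		(vdev-specific event)
 */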