/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#ifdef	_KERNEL
#include <sys/zone.h>
#include <sys/bootprops.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_tune,			/* fill from zio_taskq_tune_* */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
};

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_TUNE	{ zti_mode_tune, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */

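/*
 * For example, with the defaults above the WRITE/issue and READ/intr entries
 * use ZTI_TUNE, which resolves to zti_mode_online_percent with a value of 80,
 * i.e. a taskq whose thread count tracks 80% of the online CPUs, while
 * READ/issue gets a fixed taskq of 8 threads.
 */
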
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t alloc;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = ENOTSUP;
				} else if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = ENOTSUP;
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

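	/*
	 * Validate the request first; of the properties that pass validation,
	 * only cachefile and altroot are applied without a sync task.
	 */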
	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			if (mode == zti_mode_tune) {
				mode = zio_taskq_tune_mode;
				value = zio_taskq_tune_value;
				if (mode == zti_mode_tune)
					mode = zti_mode_online_percent;
			}

			switch (mode) {
			case zti_mode_fixed:
				ASSERT3U(value, >=, 1);
				value = MAX(value, 1);

				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE);
				break;

			case zti_mode_online_percent:
				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
				break;

			case zti_mode_null:
				spa->spa_zio_taskq[t][q] = NULL;
				break;

			case zti_mode_tune:
			default:
				panic("unrecognized mode for "
				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
				    "in spa_activate()",
				    t, q, mode, value);
				break;
			}
		}
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load the slog device state from the config object since it's possible
 * that the label does not contain the most up-to-date information.
 */
void
spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
	vdev_t *ovd, *rvd = spa->spa_root_vdev;

	/*
	 * Load the original root vdev tree from the passed config.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *cvd = rvd->vdev_child[c];
		if (cvd->vdev_islog)
			vdev_load_log_state(cvd, ovd->vdev_child[c]);
	}
	vdev_free(ovd);
	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;
	}
	return (0);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	for (int i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_metadata_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_metadata_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	zpool_get_rewind_policy(spa->spa_config, &policy);

	spa->spa_load_meta_errors = sle.sle_metadata_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_metadata_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

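	/*
	 * Pass ENXIO and EIO from the traversal through unchanged; any other
	 * traversal failure is reported to the caller as EIO.
	 */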
	if (error) {
		if (error != ENXIO && error != EIO)
			error = EIO;
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvconfig, *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	uint64_t autoreplace = 0;
	int orig_mode = spa->spa_mode;
	char *ereport = FM_EREPORT_ZFS_POOL;
	nvlist_t *config = spa->spa_config;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig. If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config.  Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_validate(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
	    TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE;
	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
	spa->spa_claim_max_txg = spa->spa_first_txg;

	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		uint64_t hostid;

		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(nvconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

#ifdef	_KERNEL
			myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
			/*
			 * We're emulating the system's hostid in userland, so
			 * we can't use zone_get_hostid().
			 */
			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */
			if (hostid != 0 && myhostid != 0 &&
			    hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa_name(spa), hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, nvconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa, orig_mode);

		return (spa_load(spa, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		spa->spa_autoreplace = (autoreplace != 0);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
		    sizeof (uint64_t), 1, &spa->spa_autoexpand);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO),
		    sizeof (uint64_t), 1, &spa->spa_dedup_ditto);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Load the DDTs (dedup tables).
	 */
	error = ddt_load(spa);
	if (error != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	spa_update_dspace(spa);

	if (state != SPA_LOAD_TRYIMPORT) {
		error = spa_load_verify(spa);
		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			goto out;
		}
	}

	/*
	 * Load the intent log state and check log integrity.
	 */
	VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	spa_load_log_state(spa, nvroot);
	nvlist_free(nvconfig);

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
		 * invoked from zil_claim_log_block()'s i/o done callback.
		 * Price of rollback is that we abandon the log.
		 */
		spa->spa_claiming = B_TRUE;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_claiming = B_FALSE;

		spa->spa_log_state = SPA_LOG_GOOD;
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by either zil_check_log_chain()
		 * (invoked from spa_check_logs()) or zil_claim() above.
		 */
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If spa_load_verbatim is true, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
		    state == SPA_LOAD_RECOVER)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	error = 0;
out:

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);

	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	spa_unload(spa);
	spa_deactivate(spa);

	spa->spa_load_max_txg--;

	spa_activate(spa, spa_mode_global);
	spa_async_suspend(spa);

	return (spa_load(spa, state, mosconfig));
}

static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, boolean_t extreme)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rollback_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa->spa_log_state = SPA_LOG_CLEAR;
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, mosconfig);
	if (load_error == 0)
		return (0);

	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	/* specific txg requested */
	if (spa->spa_load_max_txg != UINT64_MAX && !extreme) {
		nvlist_free(config);
		return (load_error);
	}

	/* Price of rolling back is discarding txgs, including log */
	if (state == SPA_LOAD_RECOVER)
		spa->spa_log_state = SPA_LOG_CLEAR;

	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE;

	min_txg = extreme ? TXG_INITIAL : safe_rollback_txg;
	while (rewind_error && (spa->spa_uberblock.ub_txg >= min_txg)) {
		if (spa->spa_load_max_txg < safe_rollback_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	if (config)
		spa_rewind_data_to_nvlist(spa, config);

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
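/*
 * A rewind request in the caller-supplied policy (ZPOOL_DO_REWIND) turns the
 * open below into a SPA_LOAD_RECOVER load, and ZPOOL_EXTREME_REWIND allows
 * spa_load_best() to roll back beyond the normally safe txg range.
 */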
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	boolean_t norewind;
	boolean_t extreme;
	zpool_rewind_policy_t policy;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	zpool_get_rewind_policy(nvpolicy, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;
	norewind = (policy.zrp_request == ZPOOL_NO_REWIND);
	extreme = ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0);

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		if (spa->spa_last_open_failed && norewind) {
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config,
				    config, KM_SLEEP) == 0);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (spa->spa_last_open_failed);
		}

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    extreme);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
1976 */ 1977 if (config != NULL && spa->spa_config) 1978 VERIFY(nvlist_dup(spa->spa_config, config, 1979 KM_SLEEP) == 0); 1980 spa_unload(spa); 1981 spa_deactivate(spa); 1982 spa->spa_last_open_failed = error; 1983 if (locked) 1984 mutex_exit(&spa_namespace_lock); 1985 *spapp = NULL; 1986 return (error); 1987 } 1988 1989 } 1990 1991 spa_open_ref(spa, tag); 1992 1993 1994 if (config != NULL) 1995 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1996 1997 if (locked) { 1998 spa->spa_last_open_failed = 0; 1999 spa->spa_last_ubsync_txg = 0; 2000 spa->spa_load_txg = 0; 2001 mutex_exit(&spa_namespace_lock); 2002 } 2003 2004 *spapp = spa; 2005 2006 return (0); 2007 } 2008 2009 int 2010 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2011 nvlist_t **config) 2012 { 2013 return (spa_open_common(name, spapp, tag, policy, config)); 2014 } 2015 2016 int 2017 spa_open(const char *name, spa_t **spapp, void *tag) 2018 { 2019 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2020 } 2021 2022 /* 2023 * Lookup the given spa_t, incrementing the inject count in the process, 2024 * preventing it from being exported or destroyed. 2025 */ 2026 spa_t * 2027 spa_inject_addref(char *name) 2028 { 2029 spa_t *spa; 2030 2031 mutex_enter(&spa_namespace_lock); 2032 if ((spa = spa_lookup(name)) == NULL) { 2033 mutex_exit(&spa_namespace_lock); 2034 return (NULL); 2035 } 2036 spa->spa_inject_ref++; 2037 mutex_exit(&spa_namespace_lock); 2038 2039 return (spa); 2040 } 2041 2042 void 2043 spa_inject_delref(spa_t *spa) 2044 { 2045 mutex_enter(&spa_namespace_lock); 2046 spa->spa_inject_ref--; 2047 mutex_exit(&spa_namespace_lock); 2048 } 2049 2050 /* 2051 * Add spares device information to the nvlist. 2052 */ 2053 static void 2054 spa_add_spares(spa_t *spa, nvlist_t *config) 2055 { 2056 nvlist_t **spares; 2057 uint_t i, nspares; 2058 nvlist_t *nvroot; 2059 uint64_t guid; 2060 vdev_stat_t *vs; 2061 uint_t vsc; 2062 uint64_t pool; 2063 2064 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2065 2066 if (spa->spa_spares.sav_count == 0) 2067 return; 2068 2069 VERIFY(nvlist_lookup_nvlist(config, 2070 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2071 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2072 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2073 if (nspares != 0) { 2074 VERIFY(nvlist_add_nvlist_array(nvroot, 2075 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2076 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2077 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2078 2079 /* 2080 * Go through and find any spares which have since been 2081 * repurposed as an active spare. If this is the case, update 2082 * their status appropriately. 2083 */ 2084 for (i = 0; i < nspares; i++) { 2085 VERIFY(nvlist_lookup_uint64(spares[i], 2086 ZPOOL_CONFIG_GUID, &guid) == 0); 2087 if (spa_spare_exists(guid, &pool, NULL) && 2088 pool != 0ULL) { 2089 VERIFY(nvlist_lookup_uint64_array( 2090 spares[i], ZPOOL_CONFIG_STATS, 2091 (uint64_t **)&vs, &vsc) == 0); 2092 vs->vs_state = VDEV_STATE_CANT_OPEN; 2093 vs->vs_aux = VDEV_AUX_SPARED; 2094 } 2095 } 2096 } 2097 } 2098 2099 /* 2100 * Add l2cache device information to the nvlist, including vdev stats. 
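 *
 * For orientation, a rough sketch of the nvlist being filled in below
 * (informal layout, not a formal schema; the keys are the ZPOOL_CONFIG_*
 * names used in the code):
 *
 *	config
 *	    vdev_tree (nvroot)
 *	        l2cache[]	array of leaf-vdev nvlists
 *	            guid	uint64, matched against sav_vdevs[]
 *	            stats	uint64 array, refreshed via vdev_get_stats()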
2101 */ 2102 static void 2103 spa_add_l2cache(spa_t *spa, nvlist_t *config) 2104 { 2105 nvlist_t **l2cache; 2106 uint_t i, j, nl2cache; 2107 nvlist_t *nvroot; 2108 uint64_t guid; 2109 vdev_t *vd; 2110 vdev_stat_t *vs; 2111 uint_t vsc; 2112 2113 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2114 2115 if (spa->spa_l2cache.sav_count == 0) 2116 return; 2117 2118 VERIFY(nvlist_lookup_nvlist(config, 2119 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2120 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2121 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2122 if (nl2cache != 0) { 2123 VERIFY(nvlist_add_nvlist_array(nvroot, 2124 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2125 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2126 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2127 2128 /* 2129 * Update level 2 cache device stats. 2130 */ 2131 2132 for (i = 0; i < nl2cache; i++) { 2133 VERIFY(nvlist_lookup_uint64(l2cache[i], 2134 ZPOOL_CONFIG_GUID, &guid) == 0); 2135 2136 vd = NULL; 2137 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2138 if (guid == 2139 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2140 vd = spa->spa_l2cache.sav_vdevs[j]; 2141 break; 2142 } 2143 } 2144 ASSERT(vd != NULL); 2145 2146 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2147 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 2148 vdev_get_stats(vd, vs); 2149 } 2150 } 2151 } 2152 2153 int 2154 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2155 { 2156 int error; 2157 spa_t *spa; 2158 2159 *config = NULL; 2160 error = spa_open_common(name, &spa, FTAG, NULL, config); 2161 2162 if (spa != NULL) { 2163 /* 2164 * This still leaves a window of inconsistency where the spares 2165 * or l2cache devices could change and the config would be 2166 * self-inconsistent. 2167 */ 2168 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2169 2170 if (*config != NULL) { 2171 VERIFY(nvlist_add_uint64(*config, 2172 ZPOOL_CONFIG_ERRCOUNT, 2173 spa_get_errlog_size(spa)) == 0); 2174 2175 if (spa_suspended(spa)) 2176 VERIFY(nvlist_add_uint64(*config, 2177 ZPOOL_CONFIG_SUSPENDED, 2178 spa->spa_failmode) == 0); 2179 2180 spa_add_spares(spa, *config); 2181 spa_add_l2cache(spa, *config); 2182 } 2183 } 2184 2185 /* 2186 * We want to get the alternate root even for faulted pools, so we cheat 2187 * and call spa_lookup() directly. 2188 */ 2189 if (altroot) { 2190 if (spa == NULL) { 2191 mutex_enter(&spa_namespace_lock); 2192 spa = spa_lookup(name); 2193 if (spa) 2194 spa_altroot(spa, altroot, buflen); 2195 else 2196 altroot[0] = '\0'; 2197 spa = NULL; 2198 mutex_exit(&spa_namespace_lock); 2199 } else { 2200 spa_altroot(spa, altroot, buflen); 2201 } 2202 } 2203 2204 if (spa != NULL) { 2205 spa_config_exit(spa, SCL_CONFIG, FTAG); 2206 spa_close(spa, FTAG); 2207 } 2208 2209 return (error); 2210 } 2211 2212 /* 2213 * Validate that the auxiliary device array is well formed. We must have an 2214 * array of nvlists, each which describes a valid leaf vdev. If this is an 2215 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2216 * specified, as long as they are well-formed. 2217 */ 2218 static int 2219 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2220 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2221 vdev_labeltype_t label) 2222 { 2223 nvlist_t **dev; 2224 uint_t i, ndev; 2225 vdev_t *vd; 2226 int error; 2227 2228 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2229 2230 /* 2231 * It's acceptable to have no devs specified. 
2232 */ 2233 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2234 return (0); 2235 2236 if (ndev == 0) 2237 return (EINVAL); 2238 2239 /* 2240 * Make sure the pool is formatted with a version that supports this 2241 * device type. 2242 */ 2243 if (spa_version(spa) < version) 2244 return (ENOTSUP); 2245 2246 /* 2247 * Set the pending device list so we correctly handle device in-use 2248 * checking. 2249 */ 2250 sav->sav_pending = dev; 2251 sav->sav_npending = ndev; 2252 2253 for (i = 0; i < ndev; i++) { 2254 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2255 mode)) != 0) 2256 goto out; 2257 2258 if (!vd->vdev_ops->vdev_op_leaf) { 2259 vdev_free(vd); 2260 error = EINVAL; 2261 goto out; 2262 } 2263 2264 /* 2265 * The L2ARC currently only supports disk devices in 2266 * kernel context. For user-level testing, we allow it. 2267 */ 2268 #ifdef _KERNEL 2269 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2270 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2271 error = ENOTBLK; 2272 goto out; 2273 } 2274 #endif 2275 vd->vdev_top = vd; 2276 2277 if ((error = vdev_open(vd)) == 0 && 2278 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2279 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2280 vd->vdev_guid) == 0); 2281 } 2282 2283 vdev_free(vd); 2284 2285 if (error && 2286 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2287 goto out; 2288 else 2289 error = 0; 2290 } 2291 2292 out: 2293 sav->sav_pending = NULL; 2294 sav->sav_npending = 0; 2295 return (error); 2296 } 2297 2298 static int 2299 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2300 { 2301 int error; 2302 2303 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2304 2305 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2306 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2307 VDEV_LABEL_SPARE)) != 0) { 2308 return (error); 2309 } 2310 2311 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2312 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2313 VDEV_LABEL_L2CACHE)); 2314 } 2315 2316 static void 2317 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2318 const char *config) 2319 { 2320 int i; 2321 2322 if (sav->sav_config != NULL) { 2323 nvlist_t **olddevs; 2324 uint_t oldndevs; 2325 nvlist_t **newdevs; 2326 2327 /* 2328 * Generate new dev list by concatentating with the 2329 * current dev list. 2330 */ 2331 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2332 &olddevs, &oldndevs) == 0); 2333 2334 newdevs = kmem_alloc(sizeof (void *) * 2335 (ndevs + oldndevs), KM_SLEEP); 2336 for (i = 0; i < oldndevs; i++) 2337 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2338 KM_SLEEP) == 0); 2339 for (i = 0; i < ndevs; i++) 2340 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2341 KM_SLEEP) == 0); 2342 2343 VERIFY(nvlist_remove(sav->sav_config, config, 2344 DATA_TYPE_NVLIST_ARRAY) == 0); 2345 2346 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2347 config, newdevs, ndevs + oldndevs) == 0); 2348 for (i = 0; i < oldndevs + ndevs; i++) 2349 nvlist_free(newdevs[i]); 2350 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2351 } else { 2352 /* 2353 * Generate a new dev list. 
2354 */ 2355 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2356 KM_SLEEP) == 0); 2357 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2358 devs, ndevs) == 0); 2359 } 2360 } 2361 2362 /* 2363 * Stop and drop level 2 ARC devices 2364 */ 2365 void 2366 spa_l2cache_drop(spa_t *spa) 2367 { 2368 vdev_t *vd; 2369 int i; 2370 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2371 2372 for (i = 0; i < sav->sav_count; i++) { 2373 uint64_t pool; 2374 2375 vd = sav->sav_vdevs[i]; 2376 ASSERT(vd != NULL); 2377 2378 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2379 pool != 0ULL && l2arc_vdev_present(vd)) 2380 l2arc_remove_vdev(vd); 2381 if (vd->vdev_isl2cache) 2382 spa_l2cache_remove(vd); 2383 vdev_clear_stats(vd); 2384 (void) vdev_close(vd); 2385 } 2386 } 2387 2388 /* 2389 * Pool Creation 2390 */ 2391 int 2392 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2393 const char *history_str, nvlist_t *zplprops) 2394 { 2395 spa_t *spa; 2396 char *altroot = NULL; 2397 vdev_t *rvd; 2398 dsl_pool_t *dp; 2399 dmu_tx_t *tx; 2400 int error = 0; 2401 uint64_t txg = TXG_INITIAL; 2402 nvlist_t **spares, **l2cache; 2403 uint_t nspares, nl2cache; 2404 uint64_t version; 2405 2406 /* 2407 * If this pool already exists, return failure. 2408 */ 2409 mutex_enter(&spa_namespace_lock); 2410 if (spa_lookup(pool) != NULL) { 2411 mutex_exit(&spa_namespace_lock); 2412 return (EEXIST); 2413 } 2414 2415 /* 2416 * Allocate a new spa_t structure. 2417 */ 2418 (void) nvlist_lookup_string(props, 2419 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2420 spa = spa_add(pool, NULL, altroot); 2421 spa_activate(spa, spa_mode_global); 2422 2423 if (props && (error = spa_prop_validate(spa, props))) { 2424 spa_deactivate(spa); 2425 spa_remove(spa); 2426 mutex_exit(&spa_namespace_lock); 2427 return (error); 2428 } 2429 2430 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2431 &version) != 0) 2432 version = SPA_VERSION; 2433 ASSERT(version <= SPA_VERSION); 2434 2435 spa->spa_first_txg = txg; 2436 spa->spa_uberblock.ub_txg = txg - 1; 2437 spa->spa_uberblock.ub_version = version; 2438 spa->spa_ubsync = spa->spa_uberblock; 2439 2440 /* 2441 * Create "The Godfather" zio to hold all async IOs 2442 */ 2443 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2444 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2445 2446 /* 2447 * Create the root vdev. 2448 */ 2449 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2450 2451 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2452 2453 ASSERT(error != 0 || rvd != NULL); 2454 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2455 2456 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2457 error = EINVAL; 2458 2459 if (error == 0 && 2460 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2461 (error = spa_validate_aux(spa, nvroot, txg, 2462 VDEV_ALLOC_ADD)) == 0) { 2463 for (int c = 0; c < rvd->vdev_children; c++) { 2464 vdev_metaslab_set_size(rvd->vdev_child[c]); 2465 vdev_expand(rvd->vdev_child[c], txg); 2466 } 2467 } 2468 2469 spa_config_exit(spa, SCL_ALL, FTAG); 2470 2471 if (error != 0) { 2472 spa_unload(spa); 2473 spa_deactivate(spa); 2474 spa_remove(spa); 2475 mutex_exit(&spa_namespace_lock); 2476 return (error); 2477 } 2478 2479 /* 2480 * Get the list of spares, if specified. 
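 *
 * For illustration only: the caller-supplied nvroot may carry an optional
 * ZPOOL_CONFIG_SPARES entry, an array of leaf-vdev nvlists, roughly:
 *
 *	nvroot
 *	    type = "root"
 *	    children[]	top-level vdevs
 *	    spares[]	leaf vdevs to register as hot spares (optional)
 *	    l2cache[]	leaf vdevs to use as cache devices (optional)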
2481 */ 2482 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2483 &spares, &nspares) == 0) { 2484 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2485 KM_SLEEP) == 0); 2486 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2487 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2488 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2489 spa_load_spares(spa); 2490 spa_config_exit(spa, SCL_ALL, FTAG); 2491 spa->spa_spares.sav_sync = B_TRUE; 2492 } 2493 2494 /* 2495 * Get the list of level 2 cache devices, if specified. 2496 */ 2497 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2498 &l2cache, &nl2cache) == 0) { 2499 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2500 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2501 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2502 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2503 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2504 spa_load_l2cache(spa); 2505 spa_config_exit(spa, SCL_ALL, FTAG); 2506 spa->spa_l2cache.sav_sync = B_TRUE; 2507 } 2508 2509 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2510 spa->spa_meta_objset = dp->dp_meta_objset; 2511 2512 /* 2513 * Create DDTs (dedup tables). 2514 */ 2515 ddt_create(spa); 2516 2517 spa_update_dspace(spa); 2518 2519 tx = dmu_tx_create_assigned(dp, txg); 2520 2521 /* 2522 * Create the pool config object. 2523 */ 2524 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2525 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2526 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2527 2528 if (zap_add(spa->spa_meta_objset, 2529 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2530 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2531 cmn_err(CE_PANIC, "failed to add pool config"); 2532 } 2533 2534 /* Newly created pools with the right version are always deflated. */ 2535 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2536 spa->spa_deflate = TRUE; 2537 if (zap_add(spa->spa_meta_objset, 2538 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2539 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2540 cmn_err(CE_PANIC, "failed to add deflate"); 2541 } 2542 } 2543 2544 /* 2545 * Create the deferred-free bplist object. Turn off compression 2546 * because sync-to-convergence takes longer if the blocksize 2547 * keeps changing. 2548 */ 2549 spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, 2550 1 << 14, tx); 2551 dmu_object_set_compress(spa->spa_meta_objset, 2552 spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); 2553 2554 if (zap_add(spa->spa_meta_objset, 2555 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2556 sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { 2557 cmn_err(CE_PANIC, "failed to add bplist"); 2558 } 2559 2560 /* 2561 * Create the pool's history object. 2562 */ 2563 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2564 spa_history_create_obj(spa, tx); 2565 2566 /* 2567 * Set pool properties. 
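 *
 * A minimal sketch of a caller-supplied 'props' nvlist (property names come
 * from zpool_prop_to_name(); the values shown are examples only):
 *
 *	props
 *	    "altroot"    = "/a"
 *	    "autoexpand" = 1
 *
 * Any property not supplied falls back to the zpool_prop_default_numeric()
 * defaults assigned below.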
2568 */ 2569 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2570 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2571 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2572 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2573 2574 if (props != NULL) { 2575 spa_configfile_set(spa, props, B_FALSE); 2576 spa_sync_props(spa, props, CRED(), tx); 2577 } 2578 2579 dmu_tx_commit(tx); 2580 2581 spa->spa_sync_on = B_TRUE; 2582 txg_sync_start(spa->spa_dsl_pool); 2583 2584 /* 2585 * We explicitly wait for the first transaction to complete so that our 2586 * bean counters are appropriately updated. 2587 */ 2588 txg_wait_synced(spa->spa_dsl_pool, txg); 2589 2590 spa_config_sync(spa, B_FALSE, B_TRUE); 2591 2592 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2593 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2594 spa_history_log_version(spa, LOG_POOL_CREATE); 2595 2596 spa->spa_minref = refcount_count(&spa->spa_refcount); 2597 2598 mutex_exit(&spa_namespace_lock); 2599 2600 return (0); 2601 } 2602 2603 #ifdef _KERNEL 2604 /* 2605 * Get the root pool information from the root disk, then import the root pool 2606 * during the system boot up time. 2607 */ 2608 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2609 2610 static nvlist_t * 2611 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 2612 { 2613 nvlist_t *config; 2614 nvlist_t *nvtop, *nvroot; 2615 uint64_t pgid; 2616 2617 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 2618 return (NULL); 2619 2620 /* 2621 * Add this top-level vdev to the child array. 2622 */ 2623 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2624 &nvtop) == 0); 2625 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 2626 &pgid) == 0); 2627 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 2628 2629 /* 2630 * Put this pool's top-level vdevs into a root vdev. 2631 */ 2632 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2633 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 2634 VDEV_TYPE_ROOT) == 0); 2635 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2636 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2637 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2638 &nvtop, 1) == 0); 2639 2640 /* 2641 * Replace the existing vdev_tree with the new root vdev in 2642 * this pool's configuration (remove the old, add the new). 2643 */ 2644 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2645 nvlist_free(nvroot); 2646 return (config); 2647 } 2648 2649 /* 2650 * Walk the vdev tree and see if we can find a device with "better" 2651 * configuration. A configuration is "better" if the label on that 2652 * device has a more recent txg. 2653 */ 2654 static void 2655 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 2656 { 2657 for (int c = 0; c < vd->vdev_children; c++) 2658 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 2659 2660 if (vd->vdev_ops->vdev_op_leaf) { 2661 nvlist_t *label; 2662 uint64_t label_txg; 2663 2664 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 2665 &label) != 0) 2666 return; 2667 2668 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 2669 &label_txg) == 0); 2670 2671 /* 2672 * Do we have a better boot device? 
2673 */ 2674 if (label_txg > *txg) { 2675 *txg = label_txg; 2676 *avd = vd; 2677 } 2678 nvlist_free(label); 2679 } 2680 } 2681 2682 /* 2683 * Import a root pool. 2684 * 2685 * For x86, devpath_list will consist of the devid and/or physpath name of 2686 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2687 * The GRUB "findroot" command will return the vdev we should boot. 2688 * 2689 * For SPARC, devpath_list consists of the physpath name of the booting device, 2690 * whether the root pool is a single-device pool or a mirrored pool, 2691 * e.g. 2692 * "/pci@1f,0/ide@d/disk@0,0:a" 2693 */ 2694 int 2695 spa_import_rootpool(char *devpath, char *devid) 2696 { 2697 spa_t *spa; 2698 vdev_t *rvd, *bvd, *avd = NULL; 2699 nvlist_t *config, *nvtop; 2700 uint64_t guid, txg; 2701 char *pname; 2702 int error; 2703 2704 /* 2705 * Read the label from the boot device and generate a configuration. 2706 */ 2707 config = spa_generate_rootconf(devpath, devid, &guid); 2708 #if defined(_OBP) && defined(_KERNEL) 2709 if (config == NULL) { 2710 if (strstr(devpath, "/iscsi/ssd") != NULL) { 2711 /* iscsi boot */ 2712 get_iscsi_bootpath_phy(devpath); 2713 config = spa_generate_rootconf(devpath, devid, &guid); 2714 } 2715 } 2716 #endif 2717 if (config == NULL) { 2718 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 2719 devpath); 2720 return (EIO); 2721 } 2722 2723 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 2724 &pname) == 0); 2725 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2726 2727 mutex_enter(&spa_namespace_lock); 2728 if ((spa = spa_lookup(pname)) != NULL) { 2729 /* 2730 * Remove the existing root pool from the namespace so that we 2731 * can replace it with the correct config we just read in. 2732 */ 2733 spa_remove(spa); 2734 } 2735 2736 spa = spa_add(pname, config, NULL); 2737 spa->spa_is_root = B_TRUE; 2738 spa->spa_load_verbatim = B_TRUE; 2739 2740 /* 2741 * Build up a vdev tree based on the boot device's label config. 2742 */ 2743 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2744 &nvtop) == 0); 2745 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2746 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 2747 VDEV_ALLOC_ROOTPOOL); 2748 spa_config_exit(spa, SCL_ALL, FTAG); 2749 if (error) { 2750 mutex_exit(&spa_namespace_lock); 2751 nvlist_free(config); 2752 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 2753 pname); 2754 return (error); 2755 } 2756 2757 /* 2758 * Get the boot vdev. 2759 */ 2760 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2761 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 2762 (u_longlong_t)guid); 2763 error = ENOENT; 2764 goto out; 2765 } 2766 2767 /* 2768 * Determine if there is a better boot device. 2769 */ 2770 avd = bvd; 2771 spa_alt_rootvdev(rvd, &avd, &txg); 2772 if (avd != bvd) { 2773 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 2774 "try booting from '%s'", avd->vdev_path); 2775 error = EINVAL; 2776 goto out; 2777 } 2778 2779 /* 2780 * If the boot device is part of a spare vdev then ensure that 2781 * we're booting off the active spare. 2782 */ 2783 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2784 !bvd->vdev_isspare) { 2785 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 2786 "try booting from '%s'", 2787 bvd->vdev_parent->vdev_child[1]->vdev_path); 2788 error = EINVAL; 2789 goto out; 2790 } 2791 2792 error = 0; 2793 spa_history_log_version(spa, LOG_POOL_IMPORT); 2794 out: 2795 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2796 vdev_free(rvd); 2797 spa_config_exit(spa, SCL_ALL, FTAG); 2798 mutex_exit(&spa_namespace_lock); 2799 2800 nvlist_free(config); 2801 return (error); 2802 } 2803 2804 #endif 2805 2806 /* 2807 * Take a pool and insert it into the namespace as if it had been loaded at 2808 * boot. 2809 */ 2810 int 2811 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2812 { 2813 spa_t *spa; 2814 zpool_rewind_policy_t policy; 2815 char *altroot = NULL; 2816 2817 mutex_enter(&spa_namespace_lock); 2818 if (spa_lookup(pool) != NULL) { 2819 mutex_exit(&spa_namespace_lock); 2820 return (EEXIST); 2821 } 2822 2823 (void) nvlist_lookup_string(props, 2824 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2825 spa = spa_add(pool, config, altroot); 2826 2827 zpool_get_rewind_policy(config, &policy); 2828 spa->spa_load_max_txg = policy.zrp_txg; 2829 2830 spa->spa_load_verbatim = B_TRUE; 2831 2832 if (props != NULL) 2833 spa_configfile_set(spa, props, B_FALSE); 2834 2835 spa_config_sync(spa, B_FALSE, B_TRUE); 2836 2837 mutex_exit(&spa_namespace_lock); 2838 spa_history_log_version(spa, LOG_POOL_IMPORT); 2839 2840 return (0); 2841 } 2842 2843 /* 2844 * Import a non-root pool into the system. 2845 */ 2846 int 2847 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2848 { 2849 spa_t *spa; 2850 char *altroot = NULL; 2851 spa_load_state_t state = SPA_LOAD_IMPORT; 2852 zpool_rewind_policy_t policy; 2853 int error; 2854 nvlist_t *nvroot; 2855 nvlist_t **spares, **l2cache; 2856 uint_t nspares, nl2cache; 2857 2858 /* 2859 * If a pool with this name exists, return failure. 2860 */ 2861 mutex_enter(&spa_namespace_lock); 2862 if ((spa = spa_lookup(pool)) != NULL) { 2863 mutex_exit(&spa_namespace_lock); 2864 return (EEXIST); 2865 } 2866 2867 zpool_get_rewind_policy(config, &policy); 2868 if (policy.zrp_request & ZPOOL_DO_REWIND) 2869 state = SPA_LOAD_RECOVER; 2870 2871 /* 2872 * Create and initialize the spa structure. 2873 */ 2874 (void) nvlist_lookup_string(props, 2875 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2876 spa = spa_add(pool, config, altroot); 2877 spa_activate(spa, spa_mode_global); 2878 2879 /* 2880 * Don't start async tasks until we know everything is healthy. 2881 */ 2882 spa_async_suspend(spa); 2883 2884 /* 2885 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2886 * because the user-supplied config is actually the one to trust when 2887 * doing an import. 2888 */ 2889 if (state != SPA_LOAD_RECOVER) 2890 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2891 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 2892 ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0)); 2893 2894 /* 2895 * Propagate anything learned about failing or best txgs 2896 * back to caller 2897 */ 2898 spa_rewind_data_to_nvlist(spa, config); 2899 2900 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2901 /* 2902 * Toss any existing sparelist, as it doesn't have any validity 2903 * anymore, and conflicts with spa_has_spare(). 
2904 */ 2905 if (spa->spa_spares.sav_config) { 2906 nvlist_free(spa->spa_spares.sav_config); 2907 spa->spa_spares.sav_config = NULL; 2908 spa_load_spares(spa); 2909 } 2910 if (spa->spa_l2cache.sav_config) { 2911 nvlist_free(spa->spa_l2cache.sav_config); 2912 spa->spa_l2cache.sav_config = NULL; 2913 spa_load_l2cache(spa); 2914 } 2915 2916 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2917 &nvroot) == 0); 2918 if (error == 0) 2919 error = spa_validate_aux(spa, nvroot, -1ULL, 2920 VDEV_ALLOC_SPARE); 2921 if (error == 0) 2922 error = spa_validate_aux(spa, nvroot, -1ULL, 2923 VDEV_ALLOC_L2CACHE); 2924 spa_config_exit(spa, SCL_ALL, FTAG); 2925 2926 if (props != NULL) 2927 spa_configfile_set(spa, props, B_FALSE); 2928 2929 if (error != 0 || (props && spa_writeable(spa) && 2930 (error = spa_prop_set(spa, props)))) { 2931 spa_unload(spa); 2932 spa_deactivate(spa); 2933 spa_remove(spa); 2934 mutex_exit(&spa_namespace_lock); 2935 return (error); 2936 } 2937 2938 spa_async_resume(spa); 2939 2940 /* 2941 * Override any spares and level 2 cache devices as specified by 2942 * the user, as these may have correct device names/devids, etc. 2943 */ 2944 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2945 &spares, &nspares) == 0) { 2946 if (spa->spa_spares.sav_config) 2947 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2948 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2949 else 2950 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2951 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2952 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2953 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2954 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2955 spa_load_spares(spa); 2956 spa_config_exit(spa, SCL_ALL, FTAG); 2957 spa->spa_spares.sav_sync = B_TRUE; 2958 } 2959 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2960 &l2cache, &nl2cache) == 0) { 2961 if (spa->spa_l2cache.sav_config) 2962 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2963 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2964 else 2965 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2966 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2967 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2968 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2969 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2970 spa_load_l2cache(spa); 2971 spa_config_exit(spa, SCL_ALL, FTAG); 2972 spa->spa_l2cache.sav_sync = B_TRUE; 2973 } 2974 2975 /* 2976 * Check for any removed devices. 2977 */ 2978 if (spa->spa_autoreplace) { 2979 spa_aux_check_removed(&spa->spa_spares); 2980 spa_aux_check_removed(&spa->spa_l2cache); 2981 } 2982 2983 if (spa_writeable(spa)) { 2984 /* 2985 * Update the config cache to include the newly-imported pool. 2986 */ 2987 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2988 } 2989 2990 /* 2991 * It's possible that the pool was expanded while it was exported. 2992 * We kick off an async task to handle this for us. 2993 */ 2994 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 2995 2996 mutex_exit(&spa_namespace_lock); 2997 spa_history_log_version(spa, LOG_POOL_IMPORT); 2998 2999 return (0); 3000 } 3001 3002 3003 /* 3004 * This (illegal) pool name is used when temporarily importing a spa_t in order 3005 * to get the vdev stats associated with the imported devices. 
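 *
 * Sketch of the intended calling pattern (illustrative, not taken from the
 * original file): the caller passes in a config it discovered on disk and
 * gets back a freshly generated one, or NULL if the config was unusable:
 *
 *	nvlist_t *newconfig = spa_tryimport(tryconfig);
 *	if (newconfig != NULL) {
 *		... inspect pool name, state, spares, l2cache ...
 *		nvlist_free(newconfig);
 *	}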
3006 */ 3007 #define TRYIMPORT_NAME "$import" 3008 3009 nvlist_t * 3010 spa_tryimport(nvlist_t *tryconfig) 3011 { 3012 nvlist_t *config = NULL; 3013 char *poolname; 3014 spa_t *spa; 3015 uint64_t state; 3016 int error; 3017 3018 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3019 return (NULL); 3020 3021 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3022 return (NULL); 3023 3024 /* 3025 * Create and initialize the spa structure. 3026 */ 3027 mutex_enter(&spa_namespace_lock); 3028 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3029 spa_activate(spa, FREAD); 3030 3031 /* 3032 * Pass off the heavy lifting to spa_load(). 3033 * Pass TRUE for mosconfig because the user-supplied config 3034 * is actually the one to trust when doing an import. 3035 */ 3036 error = spa_load(spa, SPA_LOAD_TRYIMPORT, B_TRUE); 3037 3038 /* 3039 * If 'tryconfig' was at least parsable, return the current config. 3040 */ 3041 if (spa->spa_root_vdev != NULL) { 3042 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3043 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3044 poolname) == 0); 3045 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3046 state) == 0); 3047 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3048 spa->spa_uberblock.ub_timestamp) == 0); 3049 3050 /* 3051 * If the bootfs property exists on this pool then we 3052 * copy it out so that external consumers can tell which 3053 * pools are bootable. 3054 */ 3055 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3056 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3057 3058 /* 3059 * We have to play games with the name since the 3060 * pool was opened as TRYIMPORT_NAME. 3061 */ 3062 if (dsl_dsobj_to_dsname(spa_name(spa), 3063 spa->spa_bootfs, tmpname) == 0) { 3064 char *cp; 3065 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3066 3067 cp = strchr(tmpname, '/'); 3068 if (cp == NULL) { 3069 (void) strlcpy(dsname, tmpname, 3070 MAXPATHLEN); 3071 } else { 3072 (void) snprintf(dsname, MAXPATHLEN, 3073 "%s/%s", poolname, ++cp); 3074 } 3075 VERIFY(nvlist_add_string(config, 3076 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3077 kmem_free(dsname, MAXPATHLEN); 3078 } 3079 kmem_free(tmpname, MAXPATHLEN); 3080 } 3081 3082 /* 3083 * Add the list of hot spares and level 2 cache devices. 3084 */ 3085 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3086 spa_add_spares(spa, config); 3087 spa_add_l2cache(spa, config); 3088 spa_config_exit(spa, SCL_CONFIG, FTAG); 3089 } 3090 3091 spa_unload(spa); 3092 spa_deactivate(spa); 3093 spa_remove(spa); 3094 mutex_exit(&spa_namespace_lock); 3095 3096 return (config); 3097 } 3098 3099 /* 3100 * Pool export/destroy 3101 * 3102 * The act of destroying or exporting a pool is very simple. We make sure there 3103 * is no more pending I/O and any references to the pool are gone. Then, we 3104 * update the pool state and sync all the labels to disk, removing the 3105 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3106 * we don't sync the labels or remove the configuration cache. 
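 *
 * For reference, the thin wrappers below map onto this common routine as
 * follows (a summary of the code, not new behavior):
 *
 *	spa_destroy(pool)		new_state = POOL_STATE_DESTROYED
 *	spa_export(pool, ...)		new_state = POOL_STATE_EXPORTED
 *	spa_reset(pool)			new_state = POOL_STATE_UNINITIALIZED
 *
 * EBUSY is returned while active references remain, and EXDEV when the
 * pool still has an active shared spare and 'force' is not set.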
3107 */ 3108 static int 3109 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3110 boolean_t force, boolean_t hardforce) 3111 { 3112 spa_t *spa; 3113 3114 if (oldconfig) 3115 *oldconfig = NULL; 3116 3117 if (!(spa_mode_global & FWRITE)) 3118 return (EROFS); 3119 3120 mutex_enter(&spa_namespace_lock); 3121 if ((spa = spa_lookup(pool)) == NULL) { 3122 mutex_exit(&spa_namespace_lock); 3123 return (ENOENT); 3124 } 3125 3126 /* 3127 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3128 * reacquire the namespace lock, and see if we can export. 3129 */ 3130 spa_open_ref(spa, FTAG); 3131 mutex_exit(&spa_namespace_lock); 3132 spa_async_suspend(spa); 3133 mutex_enter(&spa_namespace_lock); 3134 spa_close(spa, FTAG); 3135 3136 /* 3137 * The pool will be in core if it's openable, 3138 * in which case we can modify its state. 3139 */ 3140 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3141 /* 3142 * Objsets may be open only because they're dirty, so we 3143 * have to force it to sync before checking spa_refcnt. 3144 */ 3145 txg_wait_synced(spa->spa_dsl_pool, 0); 3146 3147 /* 3148 * A pool cannot be exported or destroyed if there are active 3149 * references. If we are resetting a pool, allow references by 3150 * fault injection handlers. 3151 */ 3152 if (!spa_refcount_zero(spa) || 3153 (spa->spa_inject_ref != 0 && 3154 new_state != POOL_STATE_UNINITIALIZED)) { 3155 spa_async_resume(spa); 3156 mutex_exit(&spa_namespace_lock); 3157 return (EBUSY); 3158 } 3159 3160 /* 3161 * A pool cannot be exported if it has an active shared spare. 3162 * This is to prevent other pools stealing the active spare 3163 * from an exported pool. At user's own will, such pool can 3164 * be forcedly exported. 3165 */ 3166 if (!force && new_state == POOL_STATE_EXPORTED && 3167 spa_has_active_shared_spare(spa)) { 3168 spa_async_resume(spa); 3169 mutex_exit(&spa_namespace_lock); 3170 return (EXDEV); 3171 } 3172 3173 /* 3174 * We want this to be reflected on every label, 3175 * so mark them all dirty. spa_unload() will do the 3176 * final sync that pushes these changes out. 3177 */ 3178 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3179 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3180 spa->spa_state = new_state; 3181 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 3182 vdev_config_dirty(spa->spa_root_vdev); 3183 spa_config_exit(spa, SCL_ALL, FTAG); 3184 } 3185 } 3186 3187 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3188 3189 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3190 spa_unload(spa); 3191 spa_deactivate(spa); 3192 } 3193 3194 if (oldconfig && spa->spa_config) 3195 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3196 3197 if (new_state != POOL_STATE_UNINITIALIZED) { 3198 if (!hardforce) 3199 spa_config_sync(spa, B_TRUE, B_TRUE); 3200 spa_remove(spa); 3201 } 3202 mutex_exit(&spa_namespace_lock); 3203 3204 return (0); 3205 } 3206 3207 /* 3208 * Destroy a storage pool. 3209 */ 3210 int 3211 spa_destroy(char *pool) 3212 { 3213 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3214 B_FALSE, B_FALSE)); 3215 } 3216 3217 /* 3218 * Export a storage pool. 3219 */ 3220 int 3221 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3222 boolean_t hardforce) 3223 { 3224 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3225 force, hardforce)); 3226 } 3227 3228 /* 3229 * Similar to spa_export(), this unloads the spa_t without actually removing it 3230 * from the namespace in any way. 
3231 */ 3232 int 3233 spa_reset(char *pool) 3234 { 3235 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3236 B_FALSE, B_FALSE)); 3237 } 3238 3239 /* 3240 * ========================================================================== 3241 * Device manipulation 3242 * ========================================================================== 3243 */ 3244 3245 /* 3246 * Add a device to a storage pool. 3247 */ 3248 int 3249 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3250 { 3251 uint64_t txg, id; 3252 int error; 3253 vdev_t *rvd = spa->spa_root_vdev; 3254 vdev_t *vd, *tvd; 3255 nvlist_t **spares, **l2cache; 3256 uint_t nspares, nl2cache; 3257 3258 txg = spa_vdev_enter(spa); 3259 3260 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3261 VDEV_ALLOC_ADD)) != 0) 3262 return (spa_vdev_exit(spa, NULL, txg, error)); 3263 3264 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3265 3266 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3267 &nspares) != 0) 3268 nspares = 0; 3269 3270 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3271 &nl2cache) != 0) 3272 nl2cache = 0; 3273 3274 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3275 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3276 3277 if (vd->vdev_children != 0 && 3278 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3279 return (spa_vdev_exit(spa, vd, txg, error)); 3280 3281 /* 3282 * We must validate the spares and l2cache devices after checking the 3283 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3284 */ 3285 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3286 return (spa_vdev_exit(spa, vd, txg, error)); 3287 3288 /* 3289 * Transfer each new top-level vdev from vd to rvd. 3290 */ 3291 for (int c = 0; c < vd->vdev_children; c++) { 3292 3293 /* 3294 * Set the vdev id to the first hole, if one exists. 3295 */ 3296 for (id = 0; id < rvd->vdev_children; id++) { 3297 if (rvd->vdev_child[id]->vdev_ishole) { 3298 vdev_free(rvd->vdev_child[id]); 3299 break; 3300 } 3301 } 3302 tvd = vd->vdev_child[c]; 3303 vdev_remove_child(vd, tvd); 3304 tvd->vdev_id = id; 3305 vdev_add_child(rvd, tvd); 3306 vdev_config_dirty(tvd); 3307 } 3308 3309 if (nspares != 0) { 3310 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3311 ZPOOL_CONFIG_SPARES); 3312 spa_load_spares(spa); 3313 spa->spa_spares.sav_sync = B_TRUE; 3314 } 3315 3316 if (nl2cache != 0) { 3317 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3318 ZPOOL_CONFIG_L2CACHE); 3319 spa_load_l2cache(spa); 3320 spa->spa_l2cache.sav_sync = B_TRUE; 3321 } 3322 3323 /* 3324 * We have to be careful when adding new vdevs to an existing pool. 3325 * If other threads start allocating from these vdevs before we 3326 * sync the config cache, and we lose power, then upon reboot we may 3327 * fail to open the pool because there are DVAs that the config cache 3328 * can't translate. Therefore, we first add the vdevs without 3329 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3330 * and then let spa_config_update() initialize the new metaslabs. 3331 * 3332 * spa_load() checks for added-but-not-initialized vdevs, so that 3333 * if we lose power at any point in this sequence, the remaining 3334 * steps will be completed the next time we load the pool. 
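 *
 * A hedged usage sketch (how the nvroot describing the new top-level vdevs
 * is built is up to the caller and not shown here):
 *
 *	if ((error = spa_vdev_add(spa, nvroot)) != 0)
 *		... report the error ...
 *
 * The spa_vdev_enter()/spa_vdev_exit() calls inside the function provide
 * the locking and config syncing described above, so the caller does not
 * take the config lock itself.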
3335 */ 3336 (void) spa_vdev_exit(spa, vd, txg, 0); 3337 3338 mutex_enter(&spa_namespace_lock); 3339 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3340 mutex_exit(&spa_namespace_lock); 3341 3342 return (0); 3343 } 3344 3345 /* 3346 * Attach a device to a mirror. The arguments are the path to any device 3347 * in the mirror, and the nvroot for the new device. If the path specifies 3348 * a device that is not mirrored, we automatically insert the mirror vdev. 3349 * 3350 * If 'replacing' is specified, the new device is intended to replace the 3351 * existing device; in this case the two devices are made into their own 3352 * mirror using the 'replacing' vdev, which is functionally identical to 3353 * the mirror vdev (it actually reuses all the same ops) but has a few 3354 * extra rules: you can't attach to it after it's been created, and upon 3355 * completion of resilvering, the first disk (the one being replaced) 3356 * is automatically detached. 3357 */ 3358 int 3359 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3360 { 3361 uint64_t txg, open_txg; 3362 vdev_t *rvd = spa->spa_root_vdev; 3363 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3364 vdev_ops_t *pvops; 3365 char *oldvdpath, *newvdpath; 3366 int newvd_isspare; 3367 int error; 3368 3369 txg = spa_vdev_enter(spa); 3370 3371 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3372 3373 if (oldvd == NULL) 3374 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3375 3376 if (!oldvd->vdev_ops->vdev_op_leaf) 3377 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3378 3379 pvd = oldvd->vdev_parent; 3380 3381 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3382 VDEV_ALLOC_ADD)) != 0) 3383 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3384 3385 if (newrootvd->vdev_children != 1) 3386 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3387 3388 newvd = newrootvd->vdev_child[0]; 3389 3390 if (!newvd->vdev_ops->vdev_op_leaf) 3391 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3392 3393 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3394 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3395 3396 /* 3397 * Spares can't replace logs 3398 */ 3399 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3400 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3401 3402 if (!replacing) { 3403 /* 3404 * For attach, the only allowable parent is a mirror or the root 3405 * vdev. 3406 */ 3407 if (pvd->vdev_ops != &vdev_mirror_ops && 3408 pvd->vdev_ops != &vdev_root_ops) 3409 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3410 3411 pvops = &vdev_mirror_ops; 3412 } else { 3413 /* 3414 * Active hot spares can only be replaced by inactive hot 3415 * spares. 3416 */ 3417 if (pvd->vdev_ops == &vdev_spare_ops && 3418 pvd->vdev_child[1] == oldvd && 3419 !spa_has_spare(spa, newvd->vdev_guid)) 3420 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3421 3422 /* 3423 * If the source is a hot spare, and the parent isn't already a 3424 * spare, then we want to create a new hot spare. Otherwise, we 3425 * want to create a replacing vdev. The user is not allowed to 3426 * attach to a spared vdev child unless the 'isspare' state is 3427 * the same (spare replaces spare, non-spare replaces 3428 * non-spare). 
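 *
 * As a quick reference, the checks carried out just below for the
 * replacing case reduce to:
 *
 *	parent is a replacing vdev		reject (ENOTSUP)
 *	parent is a spare, isspare differs	reject (ENOTSUP)
 *	parent not a spare, newvd is a spare	pvops = vdev_spare_ops
 *	otherwise				pvops = vdev_replacing_ops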
3429 */ 3430 if (pvd->vdev_ops == &vdev_replacing_ops) 3431 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3432 else if (pvd->vdev_ops == &vdev_spare_ops && 3433 newvd->vdev_isspare != oldvd->vdev_isspare) 3434 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3435 else if (pvd->vdev_ops != &vdev_spare_ops && 3436 newvd->vdev_isspare) 3437 pvops = &vdev_spare_ops; 3438 else 3439 pvops = &vdev_replacing_ops; 3440 } 3441 3442 /* 3443 * Make sure the new device is big enough. 3444 */ 3445 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3446 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3447 3448 /* 3449 * The new device cannot have a higher alignment requirement 3450 * than the top-level vdev. 3451 */ 3452 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3453 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3454 3455 /* 3456 * If this is an in-place replacement, update oldvd's path and devid 3457 * to make it distinguishable from newvd, and unopenable from now on. 3458 */ 3459 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3460 spa_strfree(oldvd->vdev_path); 3461 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3462 KM_SLEEP); 3463 (void) sprintf(oldvd->vdev_path, "%s/%s", 3464 newvd->vdev_path, "old"); 3465 if (oldvd->vdev_devid != NULL) { 3466 spa_strfree(oldvd->vdev_devid); 3467 oldvd->vdev_devid = NULL; 3468 } 3469 } 3470 3471 /* 3472 * If the parent is not a mirror, or if we're replacing, insert the new 3473 * mirror/replacing/spare vdev above oldvd. 3474 */ 3475 if (pvd->vdev_ops != pvops) 3476 pvd = vdev_add_parent(oldvd, pvops); 3477 3478 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3479 ASSERT(pvd->vdev_ops == pvops); 3480 ASSERT(oldvd->vdev_parent == pvd); 3481 3482 /* 3483 * Extract the new device from its root and add it to pvd. 3484 */ 3485 vdev_remove_child(newrootvd, newvd); 3486 newvd->vdev_id = pvd->vdev_children; 3487 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3488 vdev_add_child(pvd, newvd); 3489 3490 tvd = newvd->vdev_top; 3491 ASSERT(pvd->vdev_top == tvd); 3492 ASSERT(tvd->vdev_parent == rvd); 3493 3494 vdev_config_dirty(tvd); 3495 3496 /* 3497 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3498 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3499 */ 3500 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3501 3502 vdev_dtl_dirty(newvd, DTL_MISSING, 3503 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3504 3505 if (newvd->vdev_isspare) { 3506 spa_spare_activate(newvd); 3507 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3508 } 3509 3510 oldvdpath = spa_strdup(oldvd->vdev_path); 3511 newvdpath = spa_strdup(newvd->vdev_path); 3512 newvd_isspare = newvd->vdev_isspare; 3513 3514 /* 3515 * Mark newvd's DTL dirty in this txg. 3516 */ 3517 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3518 3519 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3520 3521 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3522 CRED(), "%s vdev=%s %s vdev=%s", 3523 replacing && newvd_isspare ? "spare in" : 3524 replacing ? "replace" : "attach", newvdpath, 3525 replacing ? "for" : "to", oldvdpath); 3526 3527 spa_strfree(oldvdpath); 3528 spa_strfree(newvdpath); 3529 3530 /* 3531 * Kick off a resilver to update newvd. 3532 */ 3533 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3534 3535 return (0); 3536 } 3537 3538 /* 3539 * Detach a device from a mirror or replacing vdev. 3540 * If 'replace_done' is specified, only detach if the parent 3541 * is a replacing vdev. 
3542 */ 3543 int 3544 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3545 { 3546 uint64_t txg; 3547 int error; 3548 vdev_t *rvd = spa->spa_root_vdev; 3549 vdev_t *vd, *pvd, *cvd, *tvd; 3550 boolean_t unspare = B_FALSE; 3551 uint64_t unspare_guid; 3552 size_t len; 3553 3554 txg = spa_vdev_enter(spa); 3555 3556 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3557 3558 if (vd == NULL) 3559 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3560 3561 if (!vd->vdev_ops->vdev_op_leaf) 3562 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3563 3564 pvd = vd->vdev_parent; 3565 3566 /* 3567 * If the parent/child relationship is not as expected, don't do it. 3568 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3569 * vdev that's replacing B with C. The user's intent in replacing 3570 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3571 * the replace by detaching C, the expected behavior is to end up 3572 * M(A,B). But suppose that right after deciding to detach C, 3573 * the replacement of B completes. We would have M(A,C), and then 3574 * ask to detach C, which would leave us with just A -- not what 3575 * the user wanted. To prevent this, we make sure that the 3576 * parent/child relationship hasn't changed -- in this example, 3577 * that C's parent is still the replacing vdev R. 3578 */ 3579 if (pvd->vdev_guid != pguid && pguid != 0) 3580 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3581 3582 /* 3583 * If replace_done is specified, only remove this device if it's 3584 * the first child of a replacing vdev. For the 'spare' vdev, either 3585 * disk can be removed. 3586 */ 3587 if (replace_done) { 3588 if (pvd->vdev_ops == &vdev_replacing_ops) { 3589 if (vd->vdev_id != 0) 3590 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3591 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3592 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3593 } 3594 } 3595 3596 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3597 spa_version(spa) >= SPA_VERSION_SPARES); 3598 3599 /* 3600 * Only mirror, replacing, and spare vdevs support detach. 3601 */ 3602 if (pvd->vdev_ops != &vdev_replacing_ops && 3603 pvd->vdev_ops != &vdev_mirror_ops && 3604 pvd->vdev_ops != &vdev_spare_ops) 3605 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3606 3607 /* 3608 * If this device has the only valid copy of some data, 3609 * we cannot safely detach it. 3610 */ 3611 if (vdev_dtl_required(vd)) 3612 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3613 3614 ASSERT(pvd->vdev_children >= 2); 3615 3616 /* 3617 * If we are detaching the second disk from a replacing vdev, then 3618 * check to see if we changed the original vdev's path to have "/old" 3619 * at the end in spa_vdev_attach(). If so, undo that change now. 3620 */ 3621 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3622 pvd->vdev_child[0]->vdev_path != NULL && 3623 pvd->vdev_child[1]->vdev_path != NULL) { 3624 ASSERT(pvd->vdev_child[1] == vd); 3625 cvd = pvd->vdev_child[0]; 3626 len = strlen(vd->vdev_path); 3627 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3628 strcmp(cvd->vdev_path + len, "/old") == 0) { 3629 spa_strfree(cvd->vdev_path); 3630 cvd->vdev_path = spa_strdup(vd->vdev_path); 3631 } 3632 } 3633 3634 /* 3635 * If we are detaching the original disk from a spare, then it implies 3636 * that the spare should become a real disk, and be removed from the 3637 * active spare list for the pool. 
3638 */ 3639 if (pvd->vdev_ops == &vdev_spare_ops && 3640 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3641 unspare = B_TRUE; 3642 3643 /* 3644 * Erase the disk labels so the disk can be used for other things. 3645 * This must be done after all other error cases are handled, 3646 * but before we disembowel vd (so we can still do I/O to it). 3647 * But if we can't do it, don't treat the error as fatal -- 3648 * it may be that the unwritability of the disk is the reason 3649 * it's being detached! 3650 */ 3651 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3652 3653 /* 3654 * Remove vd from its parent and compact the parent's children. 3655 */ 3656 vdev_remove_child(pvd, vd); 3657 vdev_compact_children(pvd); 3658 3659 /* 3660 * Remember one of the remaining children so we can get tvd below. 3661 */ 3662 cvd = pvd->vdev_child[0]; 3663 3664 /* 3665 * If we need to remove the remaining child from the list of hot spares, 3666 * do it now, marking the vdev as no longer a spare in the process. 3667 * We must do this before vdev_remove_parent(), because that can 3668 * change the GUID if it creates a new toplevel GUID. For a similar 3669 * reason, we must remove the spare now, in the same txg as the detach; 3670 * otherwise someone could attach a new sibling, change the GUID, and 3671 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3672 */ 3673 if (unspare) { 3674 ASSERT(cvd->vdev_isspare); 3675 spa_spare_remove(cvd); 3676 unspare_guid = cvd->vdev_guid; 3677 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3678 } 3679 3680 /* 3681 * If the parent mirror/replacing vdev only has one child, 3682 * the parent is no longer needed. Remove it from the tree. 3683 */ 3684 if (pvd->vdev_children == 1) 3685 vdev_remove_parent(cvd); 3686 3687 /* 3688 * We don't set tvd until now because the parent we just removed 3689 * may have been the previous top-level vdev. 3690 */ 3691 tvd = cvd->vdev_top; 3692 ASSERT(tvd->vdev_parent == rvd); 3693 3694 /* 3695 * Reevaluate the parent vdev state. 3696 */ 3697 vdev_propagate_state(cvd); 3698 3699 /* 3700 * If the 'autoexpand' property is set on the pool then automatically 3701 * try to expand the size of the pool. For example if the device we 3702 * just detached was smaller than the others, it may be possible to 3703 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3704 * first so that we can obtain the updated sizes of the leaf vdevs. 3705 */ 3706 if (spa->spa_autoexpand) { 3707 vdev_reopen(tvd); 3708 vdev_expand(tvd, txg); 3709 } 3710 3711 vdev_config_dirty(tvd); 3712 3713 /* 3714 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3715 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3716 * But first make sure we're not on any *other* txg's DTL list, to 3717 * prevent vd from being accessed after it's freed. 3718 */ 3719 for (int t = 0; t < TXG_SIZE; t++) 3720 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3721 vd->vdev_detached = B_TRUE; 3722 vdev_dirty(tvd, VDD_DTL, vd, txg); 3723 3724 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3725 3726 error = spa_vdev_exit(spa, vd, txg, 0); 3727 3728 /* 3729 * If this was the removal of the original device in a hot spare vdev, 3730 * then we want to go through and remove the device from the hot spare 3731 * list of every other pool. 
3732 */ 3733 if (unspare) { 3734 spa_t *myspa = spa; 3735 spa = NULL; 3736 mutex_enter(&spa_namespace_lock); 3737 while ((spa = spa_next(spa)) != NULL) { 3738 if (spa->spa_state != POOL_STATE_ACTIVE) 3739 continue; 3740 if (spa == myspa) 3741 continue; 3742 spa_open_ref(spa, FTAG); 3743 mutex_exit(&spa_namespace_lock); 3744 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3745 mutex_enter(&spa_namespace_lock); 3746 spa_close(spa, FTAG); 3747 } 3748 mutex_exit(&spa_namespace_lock); 3749 } 3750 3751 return (error); 3752 } 3753 3754 static nvlist_t * 3755 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3756 { 3757 for (int i = 0; i < count; i++) { 3758 uint64_t guid; 3759 3760 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3761 &guid) == 0); 3762 3763 if (guid == target_guid) 3764 return (nvpp[i]); 3765 } 3766 3767 return (NULL); 3768 } 3769 3770 static void 3771 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3772 nvlist_t *dev_to_remove) 3773 { 3774 nvlist_t **newdev = NULL; 3775 3776 if (count > 1) 3777 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3778 3779 for (int i = 0, j = 0; i < count; i++) { 3780 if (dev[i] == dev_to_remove) 3781 continue; 3782 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3783 } 3784 3785 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3786 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3787 3788 for (int i = 0; i < count - 1; i++) 3789 nvlist_free(newdev[i]); 3790 3791 if (count > 1) 3792 kmem_free(newdev, (count - 1) * sizeof (void *)); 3793 } 3794 3795 /* 3796 * Removing a device from the vdev namespace requires several steps 3797 * and can take a significant amount of time. As a result we use 3798 * the spa_vdev_config_[enter/exit] functions which allow us to 3799 * grab and release the spa_config_lock while still holding the namespace 3800 * lock. During each step the configuration is synced out. 3801 */ 3802 3803 /* 3804 * Evacuate the device. 3805 */ 3806 int 3807 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 3808 { 3809 int error = 0; 3810 uint64_t txg; 3811 3812 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3813 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3814 ASSERT(vd == vd->vdev_top); 3815 3816 /* 3817 * Evacuate the device. We don't hold the config lock as writer 3818 * since we need to do I/O but we do keep the 3819 * spa_namespace_lock held. Once this completes the device 3820 * should no longer have any blocks allocated on it. 3821 */ 3822 if (vd->vdev_islog) { 3823 error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 3824 NULL, DS_FIND_CHILDREN); 3825 } else { 3826 error = ENOTSUP; /* until we have bp rewrite */ 3827 } 3828 3829 txg_wait_synced(spa_get_dsl(spa), 0); 3830 3831 if (error) 3832 return (error); 3833 3834 /* 3835 * The evacuation succeeded. Remove any remaining MOS metadata 3836 * associated with this vdev, and wait for these changes to sync. 3837 */ 3838 txg = spa_vdev_config_enter(spa); 3839 vd->vdev_removing = B_TRUE; 3840 vdev_dirty(vd, 0, NULL, txg); 3841 vdev_config_dirty(vd); 3842 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 3843 3844 return (0); 3845 } 3846 3847 /* 3848 * Complete the removal by cleaning up the namespace. 
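 *
 * For orientation, the log-device removal path in spa_vdev_remove() uses
 * the two helpers in sequence, roughly (simplified; the real code also
 * bounces the vdev config lock between the steps, as described above):
 *
 *	metaslab_group_passivate(mg);
 *	error = spa_vdev_remove_evacuate(spa, vd);
 *	if (error == 0)
 *		spa_vdev_remove_from_namespace(spa, vd);
 *	else
 *		metaslab_group_activate(mg);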
3849 */ 3850 void 3851 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 3852 { 3853 vdev_t *rvd = spa->spa_root_vdev; 3854 uint64_t id = vd->vdev_id; 3855 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 3856 3857 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3858 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3859 ASSERT(vd == vd->vdev_top); 3860 3861 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3862 3863 if (list_link_active(&vd->vdev_state_dirty_node)) 3864 vdev_state_clean(vd); 3865 if (list_link_active(&vd->vdev_config_dirty_node)) 3866 vdev_config_clean(vd); 3867 3868 vdev_free(vd); 3869 3870 if (last_vdev) { 3871 vdev_compact_children(rvd); 3872 } else { 3873 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 3874 vdev_add_child(rvd, vd); 3875 } 3876 vdev_config_dirty(rvd); 3877 3878 /* 3879 * Reassess the health of our root vdev. 3880 */ 3881 vdev_reopen(rvd); 3882 } 3883 3884 /* 3885 * Remove a device from the pool. Currently, this supports removing only hot 3886 * spares, slogs, and level 2 ARC devices. 3887 */ 3888 int 3889 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3890 { 3891 vdev_t *vd; 3892 metaslab_group_t *mg; 3893 nvlist_t **spares, **l2cache, *nv; 3894 uint64_t txg = 0; 3895 uint_t nspares, nl2cache; 3896 int error = 0; 3897 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3898 3899 if (!locked) 3900 txg = spa_vdev_enter(spa); 3901 3902 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3903 3904 if (spa->spa_spares.sav_vdevs != NULL && 3905 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3906 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3907 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3908 /* 3909 * Only remove the hot spare if it's not currently in use 3910 * in this pool. 3911 */ 3912 if (vd == NULL || unspare) { 3913 spa_vdev_remove_aux(spa->spa_spares.sav_config, 3914 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3915 spa_load_spares(spa); 3916 spa->spa_spares.sav_sync = B_TRUE; 3917 } else { 3918 error = EBUSY; 3919 } 3920 } else if (spa->spa_l2cache.sav_vdevs != NULL && 3921 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3922 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3923 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3924 /* 3925 * Cache devices can always be removed. 3926 */ 3927 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3928 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3929 spa_load_l2cache(spa); 3930 spa->spa_l2cache.sav_sync = B_TRUE; 3931 } else if (vd != NULL && vd->vdev_islog) { 3932 ASSERT(!locked); 3933 ASSERT(vd == vd->vdev_top); 3934 3935 /* 3936 * XXX - Once we have bp-rewrite this should 3937 * become the common case. 3938 */ 3939 3940 mg = vd->vdev_mg; 3941 3942 /* 3943 * Stop allocating from this vdev. 3944 */ 3945 metaslab_group_passivate(mg); 3946 3947 /* 3948 * Wait for the youngest allocations and frees to sync, 3949 * and then wait for the deferral of those frees to finish. 3950 */ 3951 spa_vdev_config_exit(spa, NULL, 3952 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 3953 3954 /* 3955 * Attempt to evacuate the vdev. 3956 */ 3957 error = spa_vdev_remove_evacuate(spa, vd); 3958 3959 txg = spa_vdev_config_enter(spa); 3960 3961 /* 3962 * If we couldn't evacuate the vdev, unwind. 3963 */ 3964 if (error) { 3965 metaslab_group_activate(mg); 3966 return (spa_vdev_exit(spa, NULL, txg, error)); 3967 } 3968 3969 /* 3970 * Clean up the vdev namespace. 

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;

	for (int c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		if (newvd->vdev_unspare &&
		    vdev_dtl_empty(newvd, DTL_MISSING) &&
		    !vdev_dtl_required(oldvd)) {
			newvd->vdev_unspare = 0;
			return (oldvd);
		}
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's other child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(ppvd->vdev_children == 2);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}
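
/*
 * For reference, the two vdev shapes that spa_vdev_resilver_done_hunt()
 * recognizes look roughly like this (a sketch, not generated output):
 *
 *	completed replacement:		completed 'unspare' resilver:
 *
 *	    replacing vdev		    spare vdev
 *	      |-- old device (detach)	      |-- original device (keep)
 *	      |-- new device (keep)	      |-- hot spare       (detach)
 *
 * In the hot-spared-then-replaced case handled by spa_vdev_resilver_done()
 * above, the replacing vdev itself sits under a spare vdev; once the old
 * device has been detached, the spare vdev's other child (the hot spare)
 * is detached as well.
 */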

/*
 * Update the stored path or FRU for this vdev.
 */
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
    boolean_t ispath)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_ALL);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	if (ispath) {
		spa_strfree(vd->vdev_path);
		vd->vdev_path = spa_strdup(value);
	} else {
		if (vd->vdev_fru != NULL)
			spa_strfree(vd->vdev_fru);
		vd->vdev_fru = spa_strdup(value);
	}

	return (spa_vdev_state_exit(spa, vd, 0));
}

int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

int
spa_scrub(spa_t *spa, pool_scrub_type_t type)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	/*
	 * If a resilver was requested, but there is no DTL on a
	 * writeable leaf device, we have nothing to do.
	 */
	if (type == POOL_SCRUB_RESILVER &&
	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		return (0);
	}

	if (type == POOL_SCRUB_EVERYTHING &&
	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
	    spa->spa_dsl_pool->dp_scrub_isresilver)
		return (EBUSY);

	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
	} else if (type == POOL_SCRUB_NONE) {
		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
	} else {
		return (EINVAL);
	}
}
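
/*
 * Resilvers are also requested asynchronously: a caller posts
 * SPA_ASYNC_RESILVER and the async machinery below eventually calls back
 * into spa_scrub().  Sketch of that path (simplified):
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER);
 *	...
 *	spa_async_dispatch(spa);		<- called from spa_sync()
 *	    -> spa_async_thread()
 *	        -> VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0)
 *
 * Full scrubs and scrub cancellation ('zpool scrub' and 'zpool scrub -s')
 * arrive from userland as POOL_SCRUB_EVERYTHING and POOL_SCRUB_NONE.
 */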

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = 0;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

		/*
		 * We want to clear the stats, but we don't want to do a full
		 * vdev_clear() as that will cause us to throw away
		 * degraded/faulted state as well as attempt to reopen the
		 * device, all of which is a waste.
		 */
		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		vdev_state_dirty(vd->vdev_top);
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c]);
}

static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_probe_wanted) {
		vd->vdev_probe_wanted = 0;
		vdev_reopen(vd);	/* vdev_open() does the actual probe */
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_probe(spa, vd->vdev_child[c]);
}

static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
	sysevent_id_t eid;
	nvlist_t *attr;
	char *physpath;

	if (!spa->spa_autoexpand)
		return;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		spa_async_autoexpand(spa, cvd);
	}

	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
		return;

	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
	    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

	nvlist_free(attr);
	kmem_free(physpath, MAXPATHLEN);
}
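
/*
 * The async task machinery below is intentionally simple: requests are a
 * bitmask of SPA_ASYNC_* flags and at most one worker thread exists at a
 * time.  Roughly:
 *
 *	spa_async_request()	ors a task bit into spa_async_tasks
 *	spa_async_dispatch()	creates spa_async_thread() if there is work,
 *				no thread is running, we aren't suspended,
 *				and the root filesystem is writable
 *	spa_async_thread()	snapshots and clears spa_async_tasks, does
 *				the work, then exits
 *	spa_async_suspend()	bumps spa_async_suspended and waits on
 *				spa_async_cv for any running thread to finish
 *	spa_async_resume()	drops spa_async_suspended
 *
 * Because the thread may need spa_namespace_lock for some tasks, callers
 * such as spa_evict_all() drop that lock before calling spa_async_suspend().
 */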

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
			    spa, NULL, CRED(),
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */
static void
spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
{
	blkptr_t blk;
	uint64_t itor = 0;
	uint8_t c = 1;

	while (bplist_iterate(bpl, &itor, &blk) == 0) {
		ASSERT(blk.blk_birth < txg);
		zio_free(spa, txg, &blk);
	}

	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
}

static void
spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zio_t *zio = arg;

	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
	    zio->io_flags));
}
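
/*
 * spa_sync_nvlist() below stores an XDR-encoded nvlist in a plain DMU
 * object, recording the packed length in the object's bonus buffer.
 * On-disk layout, roughly:
 *
 *	object data:	XDR-packed nvlist, zero-padded out to a multiple
 *			of SPA_CONFIG_BLOCKSIZE
 *	bonus buffer:	uint64_t holding the exact packed size
 *
 * The load path is assumed to read the bonus buffer first so it knows how
 * many bytes of object data to unpack.
 */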

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information. This avoids the dbuf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = kmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	bzero(packed + nvsize, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	kmem_free(packed, bufsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_config_dirty_list))
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	spa_config_exit(spa, SCL_STATE, FTAG);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}
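
/*
 * spa_sync_props() below persists pool properties in a ZAP object
 * (spa_pool_props_object), which is created on first use and referenced
 * from the MOS directory under DMU_POOL_PROPS.  Resulting layout, roughly:
 *
 *	DMU_POOL_DIRECTORY_OBJECT
 *	    DMU_POOL_PROPS -> spa_pool_props_object (ZAP)
 *		<propname> -> uint64 value	(number and index properties)
 *		<propname> -> string value	(string properties)
 *
 * Non-persistent properties (altroot, cachefile) and the specially handled
 * version property never land in this object.
 */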

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	mutex_enter(&spa->spa_props_lock);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import). spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'cachefile' is also a non-persistent property.
			 */
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
				break;
			case ZPOOL_PROP_DEDUPDITTO:
				spa->spa_dedup_ditto = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa_name(spa));
		}
	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Sync the specified transaction group. New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *defer_bpl = &spa->spa_deferred_bplist;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg. If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass <= SYNC_PASS_DEFERRED_FREE) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_sync(free_bpl, spa_sync_free, zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx);
		}

		ddt_sync(spa, txg);

		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
			vdev_sync(vd, txg);

	} while (dmu_objset_is_dirty(mos, txg));

	ASSERT(free_bpl->bpl_queue == NULL);

	bplist_close(defer_bpl);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(defer_bpl->bpl_queue == NULL);
	ASSERT(free_bpl->bpl_queue == NULL);

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
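
/*
 * Summary of the spa_sync() flow above, as a navigation aid (a prose
 * outline only, not additional logic):
 *
 *	1. Take SCL_CONFIG as reader and fold any pending vdev state
 *	   changes into config changes for this txg.
 *	2. Handle one-shot version upgrades (deflate, origin, next-clones)
 *	   and, if there is other work in this txg, push the previous txg's
 *	   deferred frees.
 *	3. Iterate to convergence: sync the config object, aux devices,
 *	   error log, DSL pool, frees (deferred after
 *	   SYNC_PASS_DEFERRED_FREE), the DDT, and any dirty vdevs.
 *	4. Write the vdev labels and uberblock, retrying (and suspending
 *	   zio activity if necessary) until vdev_config_sync() succeeds.
 *	5. Publish the new config, run vdev_sync_done() on clean vdevs,
 *	   update space statistics, and dispatch any pending async tasks.
 */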

/*
 * Sync all pools. We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state. All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks. The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: reference count of an active spare is 2, as a spare and as a
 * replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event. The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
 * filled in from the spa and (optionally) the vdev. This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
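
/*
 * For consumers of the events posted by spa_event_notify() above: the
 * sysevent class is EC_ZFS, the subclass is the 'name' argument, the
 * publisher is SUNW_KERN_PUB "zfs", and the attribute list carries at most
 * the following members:
 *
 *	ZFS_EV_POOL_NAME	string	spa_name(spa)
 *	ZFS_EV_POOL_GUID	uint64	spa_guid(spa)
 *	ZFS_EV_VDEV_GUID	uint64	vd->vdev_guid	(if a vdev was given)
 *	ZFS_EV_VDEV_PATH	string	vd->vdev_path	(if it has a path)
 */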