/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#ifdef	_KERNEL
#include <sys/zone.h>
#include <sys/bootprops.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_tune,			/* fill from zio_taskq_tune_* */
	zti_nmodes
};

#define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }

#define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)

typedef struct zio_taskq_info {
	const char *zti_name;
	struct {
		enum zti_modes zti_mode;
		uint_t zti_value;
	} zti_nthreads[ZIO_TASKQ_TYPES];
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "intr"
};

const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
	/*			ISSUE			INTR		*/
	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
};

enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
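
/*
 * Note: the zio_taskqs table above is consumed by spa_activate(), which
 * creates one taskq per (zio type, taskq type) pair, named "<name>_<type>".
 * For example, with the default tuning above, the read path gets a fixed
 * eight-thread "spa_zio_read_issue" taskq, while "spa_zio_read_intr" is
 * sized to zio_taskq_tune_value percent (80 by default) of the online CPUs.
 */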

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t used;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		size = spa_get_space(spa);
		used = spa_get_alloc(spa);
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
		    size - used, src);

		cap = (size == 0) ? 0 : (used * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, there are no more props to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
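	 * Each property found in the ZAP is added to *nvp in the same shape
	 * that spa_prop_add_list() builds above, i.e.:
	 *	propname -> { ZPROP_SOURCE, ZPROP_VALUE (string or uint64) }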
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = ENOTSUP;
				} else if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT)
			continue;
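
		/*
		 * Neither of the properties skipped above is stored in the
		 * MOS: 'altroot' exists only in core, and 'cachefile' is
		 * applied separately via spa_configfile_set(), so they never
		 * require the property sync task requested below.
		 */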

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	for (int t = 0; t < ZIO_TYPES; t++) {
		const zio_taskq_info_t *ztip = &zio_taskqs[t];
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
			uint_t value = ztip->zti_nthreads[q].zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", ztip->zti_name, zio_taskq_types[q]);

			if (mode == zti_mode_tune) {
				mode = zio_taskq_tune_mode;
				value = zio_taskq_tune_value;
				if (mode == zti_mode_tune)
					mode = zti_mode_online_percent;
			}

			switch (mode) {
			case zti_mode_fixed:
				ASSERT3U(value, >=, 1);
				value = MAX(value, 1);

				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE);
				break;

			case zti_mode_online_percent:
				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
				break;

			case zti_mode_tune:
			default:
				panic("unrecognized mode for "
				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
				    "in spa_activate()",
				    t, q, mode, value);
				break;
			}
		}
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
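	 * Each of these may also be referenced from the active vdev tree if
	 * the spare is currently spared in; spa_lookup_by_guid() finds that
	 * copy so its spare status can be cleared before the list entry is
	 * closed and freed.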
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load the slog device state from the config object since it's possible
 * that the label does not contain the most up-to-date information.
 */
void
spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
	vdev_t *ovd, *rvd = spa->spa_root_vdev;

	/*
	 * Load the original root vdev tree from the passed config.
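	 * The caller passes the ZPOOL_CONFIG_VDEV_TREE taken from the MOS
	 * config, so the children of the temporary tree parsed here line up
	 * with the children of the in-core root vdev walked below.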
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *cvd = rvd->vdev_child[c];
		if (cvd->vdev_islog)
			vdev_load_log_state(cvd, ovd->vdev_child[c]);
	}
	vdev_free(ovd);
	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;
	}
	return (0);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

typedef struct spa_load_error {
	uint64_t	sle_metadata_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_metadata_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
    const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa_load_verify_cb, rio,
	    spa->spa_verify_min_txg);

	(void) zio_wait(rio);

	zpool_get_rewind_policy(spa->spa_config, &policy);

	spa->spa_load_meta_errors = sle.sle_metadata_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_metadata_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = EIO;
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
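 *
 * In outline: parse and open the vdev tree described by the config, select
 * the best uberblock, open the DSL pool rooted at it, and then (for anything
 * other than a TRYIMPORT) verify the pool against the rewind policy and, if
 * the pool is writeable, claim any intent-log blocks that have not yet been
 * committed.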
 */
static int
spa_load(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvconfig, *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	uint64_t autoreplace = 0;
	int orig_mode = spa->spa_mode;
	char *ereport = FM_EREPORT_ZFS_POOL;
	nvlist_t *config = spa->spa_config;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig. If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config.  Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_validate(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
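	 * ("Best" above means the highest txg, with ties broken by timestamp,
	 * among the label uberblocks whose checksums verified; if none was
	 * found, ub_txg is still zero.)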
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
	    TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE;
	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		uint64_t hostid;

		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(nvconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

#ifdef	_KERNEL
			myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
			/*
			 * We're emulating the system's hostid in userland, so
			 * we can't use zone_get_hostid().
			 */
			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */
			if (hostid != 0 && myhostid != 0 &&
			    hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa_name(spa), hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, nvconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa, orig_mode);

		return (spa_load(spa, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	spa_load_log_state(spa, nvroot);
	nvlist_free(nvconfig);

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		spa->spa_autoreplace = (autoreplace != 0);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
		    sizeof (uint64_t), 1, &spa->spa_autoexpand);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
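	 * (A DTL, or "dirty time log", records the txgs for which a vdev has
	 * missing data; vdev_dtl_reassess() rolls the leaf DTLs up so that
	 * interior vdevs know whether any children still need resilvering.)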
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if (state != SPA_LOAD_TRYIMPORT) {
		error = spa_load_verify(spa);
		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			goto out;
		}
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Price of rollback is that we abandon the log.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_log_state = SPA_LOG_GOOD;
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If spa_load_verbatim is true, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
		    state == SPA_LOAD_RECOVER)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	error = 0;
out:

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	spa_unload(spa);
	spa_deactivate(spa);

	spa->spa_load_max_txg--;

	spa_activate(spa, spa_mode_global);
	spa_async_suspend(spa);

	return (spa_load(spa, state, mosconfig));
}

static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, boolean_t extreme)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rollback_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER)
		spa->spa_load_max_txg = spa->spa_load_txg;
	else
		spa->spa_load_max_txg = max_request;

	load_error = rewind_error = spa_load(spa, state, mosconfig);
	if (load_error == 0)
		return (0);

	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	/* specific txg requested */
	if (spa->spa_load_max_txg != UINT64_MAX && !extreme) {
		nvlist_free(config);
		return (load_error);
	}

	/* Price of rolling back is discarding txgs, including log */
	if (state == SPA_LOAD_RECOVER)
		spa->spa_log_state = SPA_LOG_CLEAR;

	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE;

	min_txg = extreme ? TXG_INITIAL : safe_rollback_txg;
	while (rewind_error && (spa->spa_uberblock.ub_txg >= min_txg)) {
		if (spa->spa_load_max_txg < safe_rollback_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	if (config)
		spa_rewind_data_to_nvlist(spa, config);

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
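 *
 * Rewind handling, in brief: spa_load_best() first tries the newest (or the
 * requested) txg; if that load fails and the rewind policy permits it,
 * spa_load_retry() walks spa_load_max_txg backwards, switching to extreme
 * rewind once it moves past the TXG_DEFER_SIZE safety window.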
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	boolean_t norewind;
	boolean_t extreme;
	zpool_rewind_policy_t policy;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	zpool_get_rewind_policy(nvpolicy, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;
	norewind = (policy.zrp_request == ZPOOL_NO_REWIND);
	extreme = ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0);

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		if (spa->spa_last_open_failed && norewind) {
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config,
				    config, KM_SLEEP) == 0);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (spa->spa_last_open_failed);
		}

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    extreme);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config, config,
				    KM_SLEEP) == 0);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

	}

	spa_open_ref(spa, tag);

	spa->spa_last_open_failed = 0;

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = 0;
	spa->spa_load_txg = 0;

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	return (0);
}

int
spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
    nvlist_t **config)
{
	return (spa_open_common(name, spapp, tag, policy, config));
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
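 * Like spa_add_spares() above, this edits the caller's config in place,
 * underneath its ZPOOL_CONFIG_VDEV_TREE nvlist, so that consumers of
 * spa_get_stats() see the auxiliary devices alongside the vdev tree.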
2031 */ 2032 static void 2033 spa_add_l2cache(spa_t *spa, nvlist_t *config) 2034 { 2035 nvlist_t **l2cache; 2036 uint_t i, j, nl2cache; 2037 nvlist_t *nvroot; 2038 uint64_t guid; 2039 vdev_t *vd; 2040 vdev_stat_t *vs; 2041 uint_t vsc; 2042 2043 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2044 2045 if (spa->spa_l2cache.sav_count == 0) 2046 return; 2047 2048 VERIFY(nvlist_lookup_nvlist(config, 2049 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2050 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2051 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2052 if (nl2cache != 0) { 2053 VERIFY(nvlist_add_nvlist_array(nvroot, 2054 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2055 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2056 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2057 2058 /* 2059 * Update level 2 cache device stats. 2060 */ 2061 2062 for (i = 0; i < nl2cache; i++) { 2063 VERIFY(nvlist_lookup_uint64(l2cache[i], 2064 ZPOOL_CONFIG_GUID, &guid) == 0); 2065 2066 vd = NULL; 2067 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2068 if (guid == 2069 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2070 vd = spa->spa_l2cache.sav_vdevs[j]; 2071 break; 2072 } 2073 } 2074 ASSERT(vd != NULL); 2075 2076 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2077 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 2078 vdev_get_stats(vd, vs); 2079 } 2080 } 2081 } 2082 2083 int 2084 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2085 { 2086 int error; 2087 spa_t *spa; 2088 2089 *config = NULL; 2090 error = spa_open_common(name, &spa, FTAG, NULL, config); 2091 2092 if (spa != NULL) { 2093 /* 2094 * This still leaves a window of inconsistency where the spares 2095 * or l2cache devices could change and the config would be 2096 * self-inconsistent. 2097 */ 2098 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2099 2100 if (*config != NULL) { 2101 VERIFY(nvlist_add_uint64(*config, 2102 ZPOOL_CONFIG_ERRCOUNT, 2103 spa_get_errlog_size(spa)) == 0); 2104 2105 if (spa_suspended(spa)) 2106 VERIFY(nvlist_add_uint64(*config, 2107 ZPOOL_CONFIG_SUSPENDED, 2108 spa->spa_failmode) == 0); 2109 2110 spa_add_spares(spa, *config); 2111 spa_add_l2cache(spa, *config); 2112 } 2113 } 2114 2115 /* 2116 * We want to get the alternate root even for faulted pools, so we cheat 2117 * and call spa_lookup() directly. 2118 */ 2119 if (altroot) { 2120 if (spa == NULL) { 2121 mutex_enter(&spa_namespace_lock); 2122 spa = spa_lookup(name); 2123 if (spa) 2124 spa_altroot(spa, altroot, buflen); 2125 else 2126 altroot[0] = '\0'; 2127 spa = NULL; 2128 mutex_exit(&spa_namespace_lock); 2129 } else { 2130 spa_altroot(spa, altroot, buflen); 2131 } 2132 } 2133 2134 if (spa != NULL) { 2135 spa_config_exit(spa, SCL_CONFIG, FTAG); 2136 spa_close(spa, FTAG); 2137 } 2138 2139 return (error); 2140 } 2141 2142 /* 2143 * Validate that the auxiliary device array is well formed. We must have an 2144 * array of nvlists, each which describes a valid leaf vdev. If this is an 2145 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2146 * specified, as long as they are well-formed. 2147 */ 2148 static int 2149 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2150 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2151 vdev_labeltype_t label) 2152 { 2153 nvlist_t **dev; 2154 uint_t i, ndev; 2155 vdev_t *vd; 2156 int error; 2157 2158 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2159 2160 /* 2161 * It's acceptable to have no devs specified. 
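 *
 * For illustration only (hypothetical caller-side code): the nvroot we
 * are handed may simply omit the array, or may carry one built along
 * the lines of
 *
 *     nvlist_t *spares[1];
 *     ... fill spares[0] with a leaf vdev description ...
 *     VERIFY(nvlist_add_nvlist_array(nvroot,
 *         ZPOOL_CONFIG_SPARES, spares, 1) == 0);
 *
 * in which case every entry must parse as a leaf vdev below.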
2162 */ 2163 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2164 return (0); 2165 2166 if (ndev == 0) 2167 return (EINVAL); 2168 2169 /* 2170 * Make sure the pool is formatted with a version that supports this 2171 * device type. 2172 */ 2173 if (spa_version(spa) < version) 2174 return (ENOTSUP); 2175 2176 /* 2177 * Set the pending device list so we correctly handle device in-use 2178 * checking. 2179 */ 2180 sav->sav_pending = dev; 2181 sav->sav_npending = ndev; 2182 2183 for (i = 0; i < ndev; i++) { 2184 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2185 mode)) != 0) 2186 goto out; 2187 2188 if (!vd->vdev_ops->vdev_op_leaf) { 2189 vdev_free(vd); 2190 error = EINVAL; 2191 goto out; 2192 } 2193 2194 /* 2195 * The L2ARC currently only supports disk devices in 2196 * kernel context. For user-level testing, we allow it. 2197 */ 2198 #ifdef _KERNEL 2199 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2200 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2201 error = ENOTBLK; 2202 goto out; 2203 } 2204 #endif 2205 vd->vdev_top = vd; 2206 2207 if ((error = vdev_open(vd)) == 0 && 2208 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2209 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2210 vd->vdev_guid) == 0); 2211 } 2212 2213 vdev_free(vd); 2214 2215 if (error && 2216 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2217 goto out; 2218 else 2219 error = 0; 2220 } 2221 2222 out: 2223 sav->sav_pending = NULL; 2224 sav->sav_npending = 0; 2225 return (error); 2226 } 2227 2228 static int 2229 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2230 { 2231 int error; 2232 2233 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2234 2235 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2236 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2237 VDEV_LABEL_SPARE)) != 0) { 2238 return (error); 2239 } 2240 2241 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2242 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2243 VDEV_LABEL_L2CACHE)); 2244 } 2245 2246 static void 2247 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2248 const char *config) 2249 { 2250 int i; 2251 2252 if (sav->sav_config != NULL) { 2253 nvlist_t **olddevs; 2254 uint_t oldndevs; 2255 nvlist_t **newdevs; 2256 2257 /* 2258 * Generate new dev list by concatentating with the 2259 * current dev list. 2260 */ 2261 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2262 &olddevs, &oldndevs) == 0); 2263 2264 newdevs = kmem_alloc(sizeof (void *) * 2265 (ndevs + oldndevs), KM_SLEEP); 2266 for (i = 0; i < oldndevs; i++) 2267 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2268 KM_SLEEP) == 0); 2269 for (i = 0; i < ndevs; i++) 2270 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2271 KM_SLEEP) == 0); 2272 2273 VERIFY(nvlist_remove(sav->sav_config, config, 2274 DATA_TYPE_NVLIST_ARRAY) == 0); 2275 2276 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2277 config, newdevs, ndevs + oldndevs) == 0); 2278 for (i = 0; i < oldndevs + ndevs; i++) 2279 nvlist_free(newdevs[i]); 2280 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2281 } else { 2282 /* 2283 * Generate a new dev list. 
2284 */ 2285 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2286 KM_SLEEP) == 0); 2287 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2288 devs, ndevs) == 0); 2289 } 2290 } 2291 2292 /* 2293 * Stop and drop level 2 ARC devices 2294 */ 2295 void 2296 spa_l2cache_drop(spa_t *spa) 2297 { 2298 vdev_t *vd; 2299 int i; 2300 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2301 2302 for (i = 0; i < sav->sav_count; i++) { 2303 uint64_t pool; 2304 2305 vd = sav->sav_vdevs[i]; 2306 ASSERT(vd != NULL); 2307 2308 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2309 pool != 0ULL && l2arc_vdev_present(vd)) 2310 l2arc_remove_vdev(vd); 2311 if (vd->vdev_isl2cache) 2312 spa_l2cache_remove(vd); 2313 vdev_clear_stats(vd); 2314 (void) vdev_close(vd); 2315 } 2316 } 2317 2318 /* 2319 * Pool Creation 2320 */ 2321 int 2322 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2323 const char *history_str, nvlist_t *zplprops) 2324 { 2325 spa_t *spa; 2326 char *altroot = NULL; 2327 vdev_t *rvd; 2328 dsl_pool_t *dp; 2329 dmu_tx_t *tx; 2330 int error = 0; 2331 uint64_t txg = TXG_INITIAL; 2332 nvlist_t **spares, **l2cache; 2333 uint_t nspares, nl2cache; 2334 uint64_t version; 2335 2336 /* 2337 * If this pool already exists, return failure. 2338 */ 2339 mutex_enter(&spa_namespace_lock); 2340 if (spa_lookup(pool) != NULL) { 2341 mutex_exit(&spa_namespace_lock); 2342 return (EEXIST); 2343 } 2344 2345 /* 2346 * Allocate a new spa_t structure. 2347 */ 2348 (void) nvlist_lookup_string(props, 2349 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2350 spa = spa_add(pool, NULL, altroot); 2351 spa_activate(spa, spa_mode_global); 2352 2353 spa->spa_uberblock.ub_txg = txg - 1; 2354 2355 if (props && (error = spa_prop_validate(spa, props))) { 2356 spa_deactivate(spa); 2357 spa_remove(spa); 2358 mutex_exit(&spa_namespace_lock); 2359 return (error); 2360 } 2361 2362 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2363 &version) != 0) 2364 version = SPA_VERSION; 2365 ASSERT(version <= SPA_VERSION); 2366 spa->spa_uberblock.ub_version = version; 2367 spa->spa_ubsync = spa->spa_uberblock; 2368 2369 /* 2370 * Create "The Godfather" zio to hold all async IOs 2371 */ 2372 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2373 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2374 2375 /* 2376 * Create the root vdev. 2377 */ 2378 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2379 2380 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2381 2382 ASSERT(error != 0 || rvd != NULL); 2383 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2384 2385 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2386 error = EINVAL; 2387 2388 if (error == 0 && 2389 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2390 (error = spa_validate_aux(spa, nvroot, txg, 2391 VDEV_ALLOC_ADD)) == 0) { 2392 for (int c = 0; c < rvd->vdev_children; c++) { 2393 vdev_metaslab_set_size(rvd->vdev_child[c]); 2394 vdev_expand(rvd->vdev_child[c], txg); 2395 } 2396 } 2397 2398 spa_config_exit(spa, SCL_ALL, FTAG); 2399 2400 if (error != 0) { 2401 spa_unload(spa); 2402 spa_deactivate(spa); 2403 spa_remove(spa); 2404 mutex_exit(&spa_namespace_lock); 2405 return (error); 2406 } 2407 2408 /* 2409 * Get the list of spares, if specified. 
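 *
 * Hedged example of where this array comes from (an assumption about
 * the caller, shown only to make the lookup below concrete): the
 * caller-supplied nvroot may carry an optional ZPOOL_CONFIG_SPARES
 * nvlist array alongside its child vdevs, added with something like
 *
 *     VERIFY(nvlist_add_nvlist_array(nvroot,
 *         ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 *
 * When present, we copy it into spa_spares.sav_config and mark it for
 * sync.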
2410 */ 2411 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2412 &spares, &nspares) == 0) { 2413 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2414 KM_SLEEP) == 0); 2415 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2416 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2417 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2418 spa_load_spares(spa); 2419 spa_config_exit(spa, SCL_ALL, FTAG); 2420 spa->spa_spares.sav_sync = B_TRUE; 2421 } 2422 2423 /* 2424 * Get the list of level 2 cache devices, if specified. 2425 */ 2426 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2427 &l2cache, &nl2cache) == 0) { 2428 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2429 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2430 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2431 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2432 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2433 spa_load_l2cache(spa); 2434 spa_config_exit(spa, SCL_ALL, FTAG); 2435 spa->spa_l2cache.sav_sync = B_TRUE; 2436 } 2437 2438 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2439 spa->spa_meta_objset = dp->dp_meta_objset; 2440 2441 tx = dmu_tx_create_assigned(dp, txg); 2442 2443 /* 2444 * Create the pool config object. 2445 */ 2446 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2447 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2448 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2449 2450 if (zap_add(spa->spa_meta_objset, 2451 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2452 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2453 cmn_err(CE_PANIC, "failed to add pool config"); 2454 } 2455 2456 /* Newly created pools with the right version are always deflated. */ 2457 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2458 spa->spa_deflate = TRUE; 2459 if (zap_add(spa->spa_meta_objset, 2460 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2461 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2462 cmn_err(CE_PANIC, "failed to add deflate"); 2463 } 2464 } 2465 2466 /* 2467 * Create the deferred-free bplist object. Turn off compression 2468 * because sync-to-convergence takes longer if the blocksize 2469 * keeps changing. 2470 */ 2471 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2472 1 << 14, tx); 2473 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2474 ZIO_COMPRESS_OFF, tx); 2475 2476 if (zap_add(spa->spa_meta_objset, 2477 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2478 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2479 cmn_err(CE_PANIC, "failed to add bplist"); 2480 } 2481 2482 /* 2483 * Create the pool's history object. 2484 */ 2485 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2486 spa_history_create_obj(spa, tx); 2487 2488 /* 2489 * Set pool properties. 2490 */ 2491 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2492 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2493 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2494 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2495 if (props != NULL) { 2496 spa_configfile_set(spa, props, B_FALSE); 2497 spa_sync_props(spa, props, CRED(), tx); 2498 } 2499 2500 dmu_tx_commit(tx); 2501 2502 spa->spa_sync_on = B_TRUE; 2503 txg_sync_start(spa->spa_dsl_pool); 2504 2505 /* 2506 * We explicitly wait for the first transaction to complete so that our 2507 * bean counters are appropriately updated. 
2508 */
2509 txg_wait_synced(spa->spa_dsl_pool, txg);
2510
2511 spa_config_sync(spa, B_FALSE, B_TRUE);
2512
2513 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2514 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2515 spa_history_log_version(spa, LOG_POOL_CREATE);
2516
2517 spa->spa_minref = refcount_count(&spa->spa_refcount);
2518
2519 mutex_exit(&spa_namespace_lock);
2520
2521 return (0);
2522 }
2523
2524 #ifdef _KERNEL
2525 /*
2526 * Get the root pool information from the root disk, then import the root pool
2527 * during system boot.
2528 */
2529 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
2530
2531 static nvlist_t *
2532 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
2533 {
2534 nvlist_t *config;
2535 nvlist_t *nvtop, *nvroot;
2536 uint64_t pgid;
2537
2538 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
2539 return (NULL);
2540
2541 /*
2542 * Add this top-level vdev to the child array.
2543 */
2544 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2545 &nvtop) == 0);
2546 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
2547 &pgid) == 0);
2548 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
2549
2550 /*
2551 * Put this pool's top-level vdevs into a root vdev.
2552 */
2553 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2554 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
2555 VDEV_TYPE_ROOT) == 0);
2556 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
2557 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
2558 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
2559 &nvtop, 1) == 0);
2560
2561 /*
2562 * Replace the existing vdev_tree with the new root vdev in
2563 * this pool's configuration (remove the old, add the new).
2564 */
2565 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
2566 nvlist_free(nvroot);
2567 return (config);
2568 }
2569
2570 /*
2571 * Walk the vdev tree and see if we can find a device with "better"
2572 * configuration. A configuration is "better" if the label on that
2573 * device has a more recent txg.
2574 */
2575 static void
2576 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
2577 {
2578 for (int c = 0; c < vd->vdev_children; c++)
2579 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
2580
2581 if (vd->vdev_ops->vdev_op_leaf) {
2582 nvlist_t *label;
2583 uint64_t label_txg;
2584
2585 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
2586 &label) != 0)
2587 return;
2588
2589 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
2590 &label_txg) == 0);
2591
2592 /*
2593 * Do we have a better boot device?
2594 */
2595 if (label_txg > *txg) {
2596 *txg = label_txg;
2597 *avd = vd;
2598 }
2599 nvlist_free(label);
2600 }
2601 }
2602
2603 /*
2604 * Import a root pool.
2605 *
2606 * For x86, devpath_list will consist of devid and/or physpath name of
2607 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
2608 * The GRUB "findroot" command will return the vdev we should boot.
2609 *
2610 * For SPARC, devpath_list consists of the physpath name of the booting device,
2611 * no matter whether the root pool is a single-device pool or a mirrored pool.
2612 * e.g.
2613 * "/pci@1f,0/ide@d/disk@0,0:a" 2614 */ 2615 int 2616 spa_import_rootpool(char *devpath, char *devid) 2617 { 2618 spa_t *spa; 2619 vdev_t *rvd, *bvd, *avd = NULL; 2620 nvlist_t *config, *nvtop; 2621 uint64_t guid, txg; 2622 char *pname; 2623 int error; 2624 2625 /* 2626 * Read the label from the boot device and generate a configuration. 2627 */ 2628 config = spa_generate_rootconf(devpath, devid, &guid); 2629 #if defined(_OBP) && defined(_KERNEL) 2630 if (config == NULL) { 2631 if (strstr(devpath, "/iscsi/ssd") != NULL) { 2632 /* iscsi boot */ 2633 get_iscsi_bootpath_phy(devpath); 2634 config = spa_generate_rootconf(devpath, devid, &guid); 2635 } 2636 } 2637 #endif 2638 if (config == NULL) { 2639 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 2640 devpath); 2641 return (EIO); 2642 } 2643 2644 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 2645 &pname) == 0); 2646 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2647 2648 mutex_enter(&spa_namespace_lock); 2649 if ((spa = spa_lookup(pname)) != NULL) { 2650 /* 2651 * Remove the existing root pool from the namespace so that we 2652 * can replace it with the correct config we just read in. 2653 */ 2654 spa_remove(spa); 2655 } 2656 2657 spa = spa_add(pname, config, NULL); 2658 spa->spa_is_root = B_TRUE; 2659 spa->spa_load_verbatim = B_TRUE; 2660 2661 /* 2662 * Build up a vdev tree based on the boot device's label config. 2663 */ 2664 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2665 &nvtop) == 0); 2666 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2667 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 2668 VDEV_ALLOC_ROOTPOOL); 2669 spa_config_exit(spa, SCL_ALL, FTAG); 2670 if (error) { 2671 mutex_exit(&spa_namespace_lock); 2672 nvlist_free(config); 2673 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 2674 pname); 2675 return (error); 2676 } 2677 2678 /* 2679 * Get the boot vdev. 2680 */ 2681 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2682 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 2683 (u_longlong_t)guid); 2684 error = ENOENT; 2685 goto out; 2686 } 2687 2688 /* 2689 * Determine if there is a better boot device. 2690 */ 2691 avd = bvd; 2692 spa_alt_rootvdev(rvd, &avd, &txg); 2693 if (avd != bvd) { 2694 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 2695 "try booting from '%s'", avd->vdev_path); 2696 error = EINVAL; 2697 goto out; 2698 } 2699 2700 /* 2701 * If the boot device is part of a spare vdev then ensure that 2702 * we're booting off the active spare. 2703 */ 2704 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2705 !bvd->vdev_isspare) { 2706 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 2707 "try booting from '%s'", 2708 bvd->vdev_parent->vdev_child[1]->vdev_path); 2709 error = EINVAL; 2710 goto out; 2711 } 2712 2713 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2714 error = 0; 2715 spa_history_log_version(spa, LOG_POOL_IMPORT); 2716 out: 2717 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2718 vdev_free(rvd); 2719 spa_config_exit(spa, SCL_ALL, FTAG); 2720 mutex_exit(&spa_namespace_lock); 2721 2722 nvlist_free(config); 2723 return (error); 2724 } 2725 2726 #endif 2727 2728 /* 2729 * Take a pool and insert it into the namespace as if it had been loaded at 2730 * boot. 
2731 */ 2732 int 2733 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2734 { 2735 spa_t *spa; 2736 zpool_rewind_policy_t policy; 2737 char *altroot = NULL; 2738 2739 mutex_enter(&spa_namespace_lock); 2740 if (spa_lookup(pool) != NULL) { 2741 mutex_exit(&spa_namespace_lock); 2742 return (EEXIST); 2743 } 2744 2745 (void) nvlist_lookup_string(props, 2746 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2747 spa = spa_add(pool, config, altroot); 2748 2749 zpool_get_rewind_policy(config, &policy); 2750 spa->spa_load_max_txg = policy.zrp_txg; 2751 2752 spa->spa_load_verbatim = B_TRUE; 2753 2754 if (props != NULL) 2755 spa_configfile_set(spa, props, B_FALSE); 2756 2757 spa_config_sync(spa, B_FALSE, B_TRUE); 2758 2759 mutex_exit(&spa_namespace_lock); 2760 spa_history_log_version(spa, LOG_POOL_IMPORT); 2761 2762 return (0); 2763 } 2764 2765 /* 2766 * Import a non-root pool into the system. 2767 */ 2768 int 2769 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2770 { 2771 spa_t *spa; 2772 char *altroot = NULL; 2773 spa_load_state_t state = SPA_LOAD_IMPORT; 2774 zpool_rewind_policy_t policy; 2775 int error; 2776 nvlist_t *nvroot; 2777 nvlist_t **spares, **l2cache; 2778 uint_t nspares, nl2cache; 2779 2780 /* 2781 * If a pool with this name exists, return failure. 2782 */ 2783 mutex_enter(&spa_namespace_lock); 2784 if ((spa = spa_lookup(pool)) != NULL) { 2785 mutex_exit(&spa_namespace_lock); 2786 return (EEXIST); 2787 } 2788 2789 zpool_get_rewind_policy(config, &policy); 2790 if (policy.zrp_request & ZPOOL_DO_REWIND) 2791 state = SPA_LOAD_RECOVER; 2792 2793 /* 2794 * Create and initialize the spa structure. 2795 */ 2796 (void) nvlist_lookup_string(props, 2797 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2798 spa = spa_add(pool, config, altroot); 2799 spa_activate(spa, spa_mode_global); 2800 2801 /* 2802 * Don't start async tasks until we know everything is healthy. 2803 */ 2804 spa_async_suspend(spa); 2805 2806 /* 2807 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2808 * because the user-supplied config is actually the one to trust when 2809 * doing an import. 2810 */ 2811 if (state != SPA_LOAD_RECOVER) 2812 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2813 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 2814 ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0)); 2815 2816 /* 2817 * Propagate anything learned about failing or best txgs 2818 * back to caller 2819 */ 2820 spa_rewind_data_to_nvlist(spa, config); 2821 2822 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2823 /* 2824 * Toss any existing sparelist, as it doesn't have any validity 2825 * anymore, and conflicts with spa_has_spare(). 
2826 */ 2827 if (spa->spa_spares.sav_config) { 2828 nvlist_free(spa->spa_spares.sav_config); 2829 spa->spa_spares.sav_config = NULL; 2830 spa_load_spares(spa); 2831 } 2832 if (spa->spa_l2cache.sav_config) { 2833 nvlist_free(spa->spa_l2cache.sav_config); 2834 spa->spa_l2cache.sav_config = NULL; 2835 spa_load_l2cache(spa); 2836 } 2837 2838 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2839 &nvroot) == 0); 2840 if (error == 0) 2841 error = spa_validate_aux(spa, nvroot, -1ULL, 2842 VDEV_ALLOC_SPARE); 2843 if (error == 0) 2844 error = spa_validate_aux(spa, nvroot, -1ULL, 2845 VDEV_ALLOC_L2CACHE); 2846 spa_config_exit(spa, SCL_ALL, FTAG); 2847 2848 if (props != NULL) 2849 spa_configfile_set(spa, props, B_FALSE); 2850 2851 if (error != 0 || (props && spa_writeable(spa) && 2852 (error = spa_prop_set(spa, props)))) { 2853 spa_unload(spa); 2854 spa_deactivate(spa); 2855 spa_remove(spa); 2856 mutex_exit(&spa_namespace_lock); 2857 return (error); 2858 } 2859 2860 spa_async_resume(spa); 2861 2862 /* 2863 * Override any spares and level 2 cache devices as specified by 2864 * the user, as these may have correct device names/devids, etc. 2865 */ 2866 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2867 &spares, &nspares) == 0) { 2868 if (spa->spa_spares.sav_config) 2869 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2870 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2871 else 2872 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2873 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2874 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2875 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2876 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2877 spa_load_spares(spa); 2878 spa_config_exit(spa, SCL_ALL, FTAG); 2879 spa->spa_spares.sav_sync = B_TRUE; 2880 } 2881 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2882 &l2cache, &nl2cache) == 0) { 2883 if (spa->spa_l2cache.sav_config) 2884 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2885 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2886 else 2887 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2888 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2889 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2890 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2891 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2892 spa_load_l2cache(spa); 2893 spa_config_exit(spa, SCL_ALL, FTAG); 2894 spa->spa_l2cache.sav_sync = B_TRUE; 2895 } 2896 2897 /* 2898 * Check for any removed devices. 2899 */ 2900 if (spa->spa_autoreplace) { 2901 spa_aux_check_removed(&spa->spa_spares); 2902 spa_aux_check_removed(&spa->spa_l2cache); 2903 } 2904 2905 if (spa_writeable(spa)) { 2906 /* 2907 * Update the config cache to include the newly-imported pool. 2908 */ 2909 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2910 } 2911 2912 /* 2913 * It's possible that the pool was expanded while it was exported. 2914 * We kick off an async task to handle this for us. 2915 */ 2916 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 2917 2918 mutex_exit(&spa_namespace_lock); 2919 spa_history_log_version(spa, LOG_POOL_IMPORT); 2920 2921 return (0); 2922 } 2923 2924 2925 /* 2926 * This (illegal) pool name is used when temporarily importing a spa_t in order 2927 * to get the vdev stats associated with the imported devices. 
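 *
 * Illustrative call sequence (hypothetical caller, shown as an
 * assumption): given a config assembled from on-disk labels,
 *
 *     nvlist_t *newconfig = spa_tryimport(tryconfig);
 *     if (newconfig != NULL) {
 *         ... inspect pool name, state, spares, l2cache, bootfs ...
 *         nvlist_free(newconfig);
 *     }
 *
 * The temporary spa_t is unloaded and removed before we return, so the
 * caller only ever sees the generated config.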
2928 */ 2929 #define TRYIMPORT_NAME "$import" 2930 2931 nvlist_t * 2932 spa_tryimport(nvlist_t *tryconfig) 2933 { 2934 nvlist_t *config = NULL; 2935 char *poolname; 2936 spa_t *spa; 2937 uint64_t state; 2938 int error; 2939 2940 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2941 return (NULL); 2942 2943 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2944 return (NULL); 2945 2946 /* 2947 * Create and initialize the spa structure. 2948 */ 2949 mutex_enter(&spa_namespace_lock); 2950 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 2951 spa_activate(spa, FREAD); 2952 2953 /* 2954 * Pass off the heavy lifting to spa_load(). 2955 * Pass TRUE for mosconfig because the user-supplied config 2956 * is actually the one to trust when doing an import. 2957 */ 2958 error = spa_load(spa, SPA_LOAD_TRYIMPORT, B_TRUE); 2959 2960 /* 2961 * If 'tryconfig' was at least parsable, return the current config. 2962 */ 2963 if (spa->spa_root_vdev != NULL) { 2964 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2965 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2966 poolname) == 0); 2967 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2968 state) == 0); 2969 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2970 spa->spa_uberblock.ub_timestamp) == 0); 2971 2972 /* 2973 * If the bootfs property exists on this pool then we 2974 * copy it out so that external consumers can tell which 2975 * pools are bootable. 2976 */ 2977 if ((!error || error == EEXIST) && spa->spa_bootfs) { 2978 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2979 2980 /* 2981 * We have to play games with the name since the 2982 * pool was opened as TRYIMPORT_NAME. 2983 */ 2984 if (dsl_dsobj_to_dsname(spa_name(spa), 2985 spa->spa_bootfs, tmpname) == 0) { 2986 char *cp; 2987 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2988 2989 cp = strchr(tmpname, '/'); 2990 if (cp == NULL) { 2991 (void) strlcpy(dsname, tmpname, 2992 MAXPATHLEN); 2993 } else { 2994 (void) snprintf(dsname, MAXPATHLEN, 2995 "%s/%s", poolname, ++cp); 2996 } 2997 VERIFY(nvlist_add_string(config, 2998 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2999 kmem_free(dsname, MAXPATHLEN); 3000 } 3001 kmem_free(tmpname, MAXPATHLEN); 3002 } 3003 3004 /* 3005 * Add the list of hot spares and level 2 cache devices. 3006 */ 3007 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3008 spa_add_spares(spa, config); 3009 spa_add_l2cache(spa, config); 3010 spa_config_exit(spa, SCL_CONFIG, FTAG); 3011 } 3012 3013 spa_unload(spa); 3014 spa_deactivate(spa); 3015 spa_remove(spa); 3016 mutex_exit(&spa_namespace_lock); 3017 3018 return (config); 3019 } 3020 3021 /* 3022 * Pool export/destroy 3023 * 3024 * The act of destroying or exporting a pool is very simple. We make sure there 3025 * is no more pending I/O and any references to the pool are gone. Then, we 3026 * update the pool state and sync all the labels to disk, removing the 3027 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3028 * we don't sync the labels or remove the configuration cache. 
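 *
 * The public entry points below are thin wrappers around
 * spa_export_common(); for example (illustrative only, "tank" being a
 * stand-in for any pool name):
 *
 *     (void) spa_destroy("tank");
 *     (void) spa_export("tank", &oldconfig, B_FALSE, B_FALSE);
 *     (void) spa_reset("tank");
 *
 * which destroy the pool, export it, and unload it without removing it
 * from the namespace, respectively.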
3029 */ 3030 static int 3031 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3032 boolean_t force, boolean_t hardforce) 3033 { 3034 spa_t *spa; 3035 3036 if (oldconfig) 3037 *oldconfig = NULL; 3038 3039 if (!(spa_mode_global & FWRITE)) 3040 return (EROFS); 3041 3042 mutex_enter(&spa_namespace_lock); 3043 if ((spa = spa_lookup(pool)) == NULL) { 3044 mutex_exit(&spa_namespace_lock); 3045 return (ENOENT); 3046 } 3047 3048 /* 3049 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3050 * reacquire the namespace lock, and see if we can export. 3051 */ 3052 spa_open_ref(spa, FTAG); 3053 mutex_exit(&spa_namespace_lock); 3054 spa_async_suspend(spa); 3055 mutex_enter(&spa_namespace_lock); 3056 spa_close(spa, FTAG); 3057 3058 /* 3059 * The pool will be in core if it's openable, 3060 * in which case we can modify its state. 3061 */ 3062 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3063 /* 3064 * Objsets may be open only because they're dirty, so we 3065 * have to force it to sync before checking spa_refcnt. 3066 */ 3067 txg_wait_synced(spa->spa_dsl_pool, 0); 3068 3069 /* 3070 * A pool cannot be exported or destroyed if there are active 3071 * references. If we are resetting a pool, allow references by 3072 * fault injection handlers. 3073 */ 3074 if (!spa_refcount_zero(spa) || 3075 (spa->spa_inject_ref != 0 && 3076 new_state != POOL_STATE_UNINITIALIZED)) { 3077 spa_async_resume(spa); 3078 mutex_exit(&spa_namespace_lock); 3079 return (EBUSY); 3080 } 3081 3082 /* 3083 * A pool cannot be exported if it has an active shared spare. 3084 * This is to prevent other pools stealing the active spare 3085 * from an exported pool. At user's own will, such pool can 3086 * be forcedly exported. 3087 */ 3088 if (!force && new_state == POOL_STATE_EXPORTED && 3089 spa_has_active_shared_spare(spa)) { 3090 spa_async_resume(spa); 3091 mutex_exit(&spa_namespace_lock); 3092 return (EXDEV); 3093 } 3094 3095 /* 3096 * We want this to be reflected on every label, 3097 * so mark them all dirty. spa_unload() will do the 3098 * final sync that pushes these changes out. 3099 */ 3100 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3101 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3102 spa->spa_state = new_state; 3103 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 3104 vdev_config_dirty(spa->spa_root_vdev); 3105 spa_config_exit(spa, SCL_ALL, FTAG); 3106 } 3107 } 3108 3109 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3110 3111 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3112 spa_unload(spa); 3113 spa_deactivate(spa); 3114 } 3115 3116 if (oldconfig && spa->spa_config) 3117 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3118 3119 if (new_state != POOL_STATE_UNINITIALIZED) { 3120 if (!hardforce) 3121 spa_config_sync(spa, B_TRUE, B_TRUE); 3122 spa_remove(spa); 3123 } 3124 mutex_exit(&spa_namespace_lock); 3125 3126 return (0); 3127 } 3128 3129 /* 3130 * Destroy a storage pool. 3131 */ 3132 int 3133 spa_destroy(char *pool) 3134 { 3135 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3136 B_FALSE, B_FALSE)); 3137 } 3138 3139 /* 3140 * Export a storage pool. 3141 */ 3142 int 3143 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3144 boolean_t hardforce) 3145 { 3146 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3147 force, hardforce)); 3148 } 3149 3150 /* 3151 * Similar to spa_export(), this unloads the spa_t without actually removing it 3152 * from the namespace in any way. 
3153 */ 3154 int 3155 spa_reset(char *pool) 3156 { 3157 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3158 B_FALSE, B_FALSE)); 3159 } 3160 3161 /* 3162 * ========================================================================== 3163 * Device manipulation 3164 * ========================================================================== 3165 */ 3166 3167 /* 3168 * Add a device to a storage pool. 3169 */ 3170 int 3171 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3172 { 3173 uint64_t txg, id; 3174 int error; 3175 vdev_t *rvd = spa->spa_root_vdev; 3176 vdev_t *vd, *tvd; 3177 nvlist_t **spares, **l2cache; 3178 uint_t nspares, nl2cache; 3179 3180 txg = spa_vdev_enter(spa); 3181 3182 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3183 VDEV_ALLOC_ADD)) != 0) 3184 return (spa_vdev_exit(spa, NULL, txg, error)); 3185 3186 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3187 3188 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3189 &nspares) != 0) 3190 nspares = 0; 3191 3192 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3193 &nl2cache) != 0) 3194 nl2cache = 0; 3195 3196 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3197 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3198 3199 if (vd->vdev_children != 0 && 3200 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3201 return (spa_vdev_exit(spa, vd, txg, error)); 3202 3203 /* 3204 * We must validate the spares and l2cache devices after checking the 3205 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3206 */ 3207 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3208 return (spa_vdev_exit(spa, vd, txg, error)); 3209 3210 /* 3211 * Transfer each new top-level vdev from vd to rvd. 3212 */ 3213 for (int c = 0; c < vd->vdev_children; c++) { 3214 3215 /* 3216 * Set the vdev id to the first hole, if one exists. 3217 */ 3218 for (id = 0; id < rvd->vdev_children; id++) { 3219 if (rvd->vdev_child[id]->vdev_ishole) { 3220 vdev_free(rvd->vdev_child[id]); 3221 break; 3222 } 3223 } 3224 tvd = vd->vdev_child[c]; 3225 vdev_remove_child(vd, tvd); 3226 tvd->vdev_id = id; 3227 vdev_add_child(rvd, tvd); 3228 vdev_config_dirty(tvd); 3229 } 3230 3231 if (nspares != 0) { 3232 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3233 ZPOOL_CONFIG_SPARES); 3234 spa_load_spares(spa); 3235 spa->spa_spares.sav_sync = B_TRUE; 3236 } 3237 3238 if (nl2cache != 0) { 3239 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3240 ZPOOL_CONFIG_L2CACHE); 3241 spa_load_l2cache(spa); 3242 spa->spa_l2cache.sav_sync = B_TRUE; 3243 } 3244 3245 /* 3246 * We have to be careful when adding new vdevs to an existing pool. 3247 * If other threads start allocating from these vdevs before we 3248 * sync the config cache, and we lose power, then upon reboot we may 3249 * fail to open the pool because there are DVAs that the config cache 3250 * can't translate. Therefore, we first add the vdevs without 3251 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3252 * and then let spa_config_update() initialize the new metaslabs. 3253 * 3254 * spa_load() checks for added-but-not-initialized vdevs, so that 3255 * if we lose power at any point in this sequence, the remaining 3256 * steps will be completed the next time we load the pool. 
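 *
 * Sketched as a sequence (this simply restates the calls that follow):
 *
 *     (void) spa_vdev_exit(spa, vd, txg, 0);
 *     mutex_enter(&spa_namespace_lock);
 *     spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 *     mutex_exit(&spa_namespace_lock);
 *
 * i.e. sync the config cache first, then let the update pass create
 * the new metaslabs.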
3257 */ 3258 (void) spa_vdev_exit(spa, vd, txg, 0); 3259 3260 mutex_enter(&spa_namespace_lock); 3261 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3262 mutex_exit(&spa_namespace_lock); 3263 3264 return (0); 3265 } 3266 3267 /* 3268 * Attach a device to a mirror. The arguments are the path to any device 3269 * in the mirror, and the nvroot for the new device. If the path specifies 3270 * a device that is not mirrored, we automatically insert the mirror vdev. 3271 * 3272 * If 'replacing' is specified, the new device is intended to replace the 3273 * existing device; in this case the two devices are made into their own 3274 * mirror using the 'replacing' vdev, which is functionally identical to 3275 * the mirror vdev (it actually reuses all the same ops) but has a few 3276 * extra rules: you can't attach to it after it's been created, and upon 3277 * completion of resilvering, the first disk (the one being replaced) 3278 * is automatically detached. 3279 */ 3280 int 3281 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3282 { 3283 uint64_t txg, open_txg; 3284 vdev_t *rvd = spa->spa_root_vdev; 3285 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3286 vdev_ops_t *pvops; 3287 char *oldvdpath, *newvdpath; 3288 int newvd_isspare; 3289 int error; 3290 3291 txg = spa_vdev_enter(spa); 3292 3293 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3294 3295 if (oldvd == NULL) 3296 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3297 3298 if (!oldvd->vdev_ops->vdev_op_leaf) 3299 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3300 3301 pvd = oldvd->vdev_parent; 3302 3303 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3304 VDEV_ALLOC_ADD)) != 0) 3305 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3306 3307 if (newrootvd->vdev_children != 1) 3308 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3309 3310 newvd = newrootvd->vdev_child[0]; 3311 3312 if (!newvd->vdev_ops->vdev_op_leaf) 3313 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3314 3315 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3316 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3317 3318 /* 3319 * Spares can't replace logs 3320 */ 3321 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3322 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3323 3324 if (!replacing) { 3325 /* 3326 * For attach, the only allowable parent is a mirror or the root 3327 * vdev. 3328 */ 3329 if (pvd->vdev_ops != &vdev_mirror_ops && 3330 pvd->vdev_ops != &vdev_root_ops) 3331 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3332 3333 pvops = &vdev_mirror_ops; 3334 } else { 3335 /* 3336 * Active hot spares can only be replaced by inactive hot 3337 * spares. 3338 */ 3339 if (pvd->vdev_ops == &vdev_spare_ops && 3340 pvd->vdev_child[1] == oldvd && 3341 !spa_has_spare(spa, newvd->vdev_guid)) 3342 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3343 3344 /* 3345 * If the source is a hot spare, and the parent isn't already a 3346 * spare, then we want to create a new hot spare. Otherwise, we 3347 * want to create a replacing vdev. The user is not allowed to 3348 * attach to a spared vdev child unless the 'isspare' state is 3349 * the same (spare replaces spare, non-spare replaces 3350 * non-spare). 
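 *
 * Summarized for reference (derived from the checks that follow, after
 * the earlier error cases have been handled):
 *
 *     parent vdev ops      newvd->vdev_isspare      result
 *     replacing            any                      ENOTSUP
 *     spare                differs from oldvd's     ENOTSUP
 *     not spare            B_TRUE                   pvops = spare
 *     anything else        -                        pvops = replacing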
3351 */ 3352 if (pvd->vdev_ops == &vdev_replacing_ops) 3353 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3354 else if (pvd->vdev_ops == &vdev_spare_ops && 3355 newvd->vdev_isspare != oldvd->vdev_isspare) 3356 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3357 else if (pvd->vdev_ops != &vdev_spare_ops && 3358 newvd->vdev_isspare) 3359 pvops = &vdev_spare_ops; 3360 else 3361 pvops = &vdev_replacing_ops; 3362 } 3363 3364 /* 3365 * Make sure the new device is big enough. 3366 */ 3367 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3368 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3369 3370 /* 3371 * The new device cannot have a higher alignment requirement 3372 * than the top-level vdev. 3373 */ 3374 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3375 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3376 3377 /* 3378 * If this is an in-place replacement, update oldvd's path and devid 3379 * to make it distinguishable from newvd, and unopenable from now on. 3380 */ 3381 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3382 spa_strfree(oldvd->vdev_path); 3383 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3384 KM_SLEEP); 3385 (void) sprintf(oldvd->vdev_path, "%s/%s", 3386 newvd->vdev_path, "old"); 3387 if (oldvd->vdev_devid != NULL) { 3388 spa_strfree(oldvd->vdev_devid); 3389 oldvd->vdev_devid = NULL; 3390 } 3391 } 3392 3393 /* 3394 * If the parent is not a mirror, or if we're replacing, insert the new 3395 * mirror/replacing/spare vdev above oldvd. 3396 */ 3397 if (pvd->vdev_ops != pvops) 3398 pvd = vdev_add_parent(oldvd, pvops); 3399 3400 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3401 ASSERT(pvd->vdev_ops == pvops); 3402 ASSERT(oldvd->vdev_parent == pvd); 3403 3404 /* 3405 * Extract the new device from its root and add it to pvd. 3406 */ 3407 vdev_remove_child(newrootvd, newvd); 3408 newvd->vdev_id = pvd->vdev_children; 3409 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3410 vdev_add_child(pvd, newvd); 3411 3412 tvd = newvd->vdev_top; 3413 ASSERT(pvd->vdev_top == tvd); 3414 ASSERT(tvd->vdev_parent == rvd); 3415 3416 vdev_config_dirty(tvd); 3417 3418 /* 3419 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3420 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3421 */ 3422 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3423 3424 vdev_dtl_dirty(newvd, DTL_MISSING, 3425 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3426 3427 if (newvd->vdev_isspare) { 3428 spa_spare_activate(newvd); 3429 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3430 } 3431 3432 oldvdpath = spa_strdup(oldvd->vdev_path); 3433 newvdpath = spa_strdup(newvd->vdev_path); 3434 newvd_isspare = newvd->vdev_isspare; 3435 3436 /* 3437 * Mark newvd's DTL dirty in this txg. 3438 */ 3439 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3440 3441 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3442 3443 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3444 CRED(), "%s vdev=%s %s vdev=%s", 3445 replacing && newvd_isspare ? "spare in" : 3446 replacing ? "replace" : "attach", newvdpath, 3447 replacing ? "for" : "to", oldvdpath); 3448 3449 spa_strfree(oldvdpath); 3450 spa_strfree(newvdpath); 3451 3452 /* 3453 * Kick off a resilver to update newvd. 3454 */ 3455 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3456 3457 return (0); 3458 } 3459 3460 /* 3461 * Detach a device from a mirror or replacing vdev. 3462 * If 'replace_done' is specified, only detach if the parent 3463 * is a replacing vdev. 
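 *
 * Example invocation (illustrative; spa_vdev_resilver_done() below is
 * the in-tree caller that passes replace_done):
 *
 *     error = spa_vdev_detach(spa, guid, pguid, B_TRUE);
 *
 * 'pguid' may be passed as 0 to skip the parent-guid sanity check.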
3464 */ 3465 int 3466 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3467 { 3468 uint64_t txg; 3469 int error; 3470 vdev_t *rvd = spa->spa_root_vdev; 3471 vdev_t *vd, *pvd, *cvd, *tvd; 3472 boolean_t unspare = B_FALSE; 3473 uint64_t unspare_guid; 3474 size_t len; 3475 3476 txg = spa_vdev_enter(spa); 3477 3478 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3479 3480 if (vd == NULL) 3481 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3482 3483 if (!vd->vdev_ops->vdev_op_leaf) 3484 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3485 3486 pvd = vd->vdev_parent; 3487 3488 /* 3489 * If the parent/child relationship is not as expected, don't do it. 3490 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3491 * vdev that's replacing B with C. The user's intent in replacing 3492 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3493 * the replace by detaching C, the expected behavior is to end up 3494 * M(A,B). But suppose that right after deciding to detach C, 3495 * the replacement of B completes. We would have M(A,C), and then 3496 * ask to detach C, which would leave us with just A -- not what 3497 * the user wanted. To prevent this, we make sure that the 3498 * parent/child relationship hasn't changed -- in this example, 3499 * that C's parent is still the replacing vdev R. 3500 */ 3501 if (pvd->vdev_guid != pguid && pguid != 0) 3502 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3503 3504 /* 3505 * If replace_done is specified, only remove this device if it's 3506 * the first child of a replacing vdev. For the 'spare' vdev, either 3507 * disk can be removed. 3508 */ 3509 if (replace_done) { 3510 if (pvd->vdev_ops == &vdev_replacing_ops) { 3511 if (vd->vdev_id != 0) 3512 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3513 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3514 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3515 } 3516 } 3517 3518 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3519 spa_version(spa) >= SPA_VERSION_SPARES); 3520 3521 /* 3522 * Only mirror, replacing, and spare vdevs support detach. 3523 */ 3524 if (pvd->vdev_ops != &vdev_replacing_ops && 3525 pvd->vdev_ops != &vdev_mirror_ops && 3526 pvd->vdev_ops != &vdev_spare_ops) 3527 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3528 3529 /* 3530 * If this device has the only valid copy of some data, 3531 * we cannot safely detach it. 3532 */ 3533 if (vdev_dtl_required(vd)) 3534 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3535 3536 ASSERT(pvd->vdev_children >= 2); 3537 3538 /* 3539 * If we are detaching the second disk from a replacing vdev, then 3540 * check to see if we changed the original vdev's path to have "/old" 3541 * at the end in spa_vdev_attach(). If so, undo that change now. 3542 */ 3543 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3544 pvd->vdev_child[0]->vdev_path != NULL && 3545 pvd->vdev_child[1]->vdev_path != NULL) { 3546 ASSERT(pvd->vdev_child[1] == vd); 3547 cvd = pvd->vdev_child[0]; 3548 len = strlen(vd->vdev_path); 3549 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3550 strcmp(cvd->vdev_path + len, "/old") == 0) { 3551 spa_strfree(cvd->vdev_path); 3552 cvd->vdev_path = spa_strdup(vd->vdev_path); 3553 } 3554 } 3555 3556 /* 3557 * If we are detaching the original disk from a spare, then it implies 3558 * that the spare should become a real disk, and be removed from the 3559 * active spare list for the pool. 
3560 */ 3561 if (pvd->vdev_ops == &vdev_spare_ops && 3562 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3563 unspare = B_TRUE; 3564 3565 /* 3566 * Erase the disk labels so the disk can be used for other things. 3567 * This must be done after all other error cases are handled, 3568 * but before we disembowel vd (so we can still do I/O to it). 3569 * But if we can't do it, don't treat the error as fatal -- 3570 * it may be that the unwritability of the disk is the reason 3571 * it's being detached! 3572 */ 3573 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3574 3575 /* 3576 * Remove vd from its parent and compact the parent's children. 3577 */ 3578 vdev_remove_child(pvd, vd); 3579 vdev_compact_children(pvd); 3580 3581 /* 3582 * Remember one of the remaining children so we can get tvd below. 3583 */ 3584 cvd = pvd->vdev_child[0]; 3585 3586 /* 3587 * If we need to remove the remaining child from the list of hot spares, 3588 * do it now, marking the vdev as no longer a spare in the process. 3589 * We must do this before vdev_remove_parent(), because that can 3590 * change the GUID if it creates a new toplevel GUID. For a similar 3591 * reason, we must remove the spare now, in the same txg as the detach; 3592 * otherwise someone could attach a new sibling, change the GUID, and 3593 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3594 */ 3595 if (unspare) { 3596 ASSERT(cvd->vdev_isspare); 3597 spa_spare_remove(cvd); 3598 unspare_guid = cvd->vdev_guid; 3599 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3600 } 3601 3602 /* 3603 * If the parent mirror/replacing vdev only has one child, 3604 * the parent is no longer needed. Remove it from the tree. 3605 */ 3606 if (pvd->vdev_children == 1) 3607 vdev_remove_parent(cvd); 3608 3609 /* 3610 * We don't set tvd until now because the parent we just removed 3611 * may have been the previous top-level vdev. 3612 */ 3613 tvd = cvd->vdev_top; 3614 ASSERT(tvd->vdev_parent == rvd); 3615 3616 /* 3617 * Reevaluate the parent vdev state. 3618 */ 3619 vdev_propagate_state(cvd); 3620 3621 /* 3622 * If the 'autoexpand' property is set on the pool then automatically 3623 * try to expand the size of the pool. For example if the device we 3624 * just detached was smaller than the others, it may be possible to 3625 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3626 * first so that we can obtain the updated sizes of the leaf vdevs. 3627 */ 3628 if (spa->spa_autoexpand) { 3629 vdev_reopen(tvd); 3630 vdev_expand(tvd, txg); 3631 } 3632 3633 vdev_config_dirty(tvd); 3634 3635 /* 3636 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3637 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3638 * But first make sure we're not on any *other* txg's DTL list, to 3639 * prevent vd from being accessed after it's freed. 3640 */ 3641 for (int t = 0; t < TXG_SIZE; t++) 3642 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3643 vd->vdev_detached = B_TRUE; 3644 vdev_dirty(tvd, VDD_DTL, vd, txg); 3645 3646 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3647 3648 error = spa_vdev_exit(spa, vd, txg, 0); 3649 3650 /* 3651 * If this was the removal of the original device in a hot spare vdev, 3652 * then we want to go through and remove the device from the hot spare 3653 * list of every other pool. 
3654 */ 3655 if (unspare) { 3656 spa_t *myspa = spa; 3657 spa = NULL; 3658 mutex_enter(&spa_namespace_lock); 3659 while ((spa = spa_next(spa)) != NULL) { 3660 if (spa->spa_state != POOL_STATE_ACTIVE) 3661 continue; 3662 if (spa == myspa) 3663 continue; 3664 spa_open_ref(spa, FTAG); 3665 mutex_exit(&spa_namespace_lock); 3666 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3667 mutex_enter(&spa_namespace_lock); 3668 spa_close(spa, FTAG); 3669 } 3670 mutex_exit(&spa_namespace_lock); 3671 } 3672 3673 return (error); 3674 } 3675 3676 static nvlist_t * 3677 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3678 { 3679 for (int i = 0; i < count; i++) { 3680 uint64_t guid; 3681 3682 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3683 &guid) == 0); 3684 3685 if (guid == target_guid) 3686 return (nvpp[i]); 3687 } 3688 3689 return (NULL); 3690 } 3691 3692 static void 3693 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3694 nvlist_t *dev_to_remove) 3695 { 3696 nvlist_t **newdev = NULL; 3697 3698 if (count > 1) 3699 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3700 3701 for (int i = 0, j = 0; i < count; i++) { 3702 if (dev[i] == dev_to_remove) 3703 continue; 3704 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3705 } 3706 3707 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3708 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3709 3710 for (int i = 0; i < count - 1; i++) 3711 nvlist_free(newdev[i]); 3712 3713 if (count > 1) 3714 kmem_free(newdev, (count - 1) * sizeof (void *)); 3715 } 3716 3717 /* 3718 * Removing a device from the vdev namespace requires several steps 3719 * and can take a significant amount of time. As a result we use 3720 * the spa_vdev_config_[enter/exit] functions which allow us to 3721 * grab and release the spa_config_lock while still holding the namespace 3722 * lock. During each step the configuration is synced out. 3723 */ 3724 3725 /* 3726 * Initial phase of device removal - stop future allocations from this device. 3727 */ 3728 void 3729 spa_vdev_remove_start(spa_t *spa, vdev_t *vd) 3730 { 3731 metaslab_group_t *mg = vd->vdev_mg; 3732 3733 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3734 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3735 3736 /* 3737 * Remove our vdev from the allocatable vdevs 3738 */ 3739 if (mg) 3740 metaslab_class_remove(mg->mg_class, mg); 3741 } 3742 3743 /* 3744 * Evacuate the device. 3745 */ 3746 int 3747 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 3748 { 3749 uint64_t txg; 3750 int error; 3751 3752 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3753 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3754 3755 /* 3756 * Evacuate the device. We don't hold the config lock as writer 3757 * since we need to do I/O but we do keep the 3758 * spa_namespace_lock held. Once this completes the device 3759 * should no longer have any blocks allocated on it. 3760 */ 3761 if (vd->vdev_islog) { 3762 /* 3763 * Evacuate the device. 3764 */ 3765 if (error = dmu_objset_find(spa_name(spa), 3766 zil_vdev_offline, NULL, DS_FIND_CHILDREN)) { 3767 uint64_t txg; 3768 3769 txg = spa_vdev_config_enter(spa); 3770 metaslab_class_add(spa->spa_log_class, 3771 vd->vdev_mg); 3772 return (spa_vdev_exit(spa, NULL, txg, error)); 3773 } 3774 txg_wait_synced(spa_get_dsl(spa), 0); 3775 } 3776 3777 /* 3778 * Remove any remaining MOS metadata associated with the device. 
3779 */
3780 txg = spa_vdev_config_enter(spa);
3781 vd->vdev_removing = B_TRUE;
3782 vdev_dirty(vd, 0, NULL, txg);
3783 vdev_config_dirty(vd);
3784 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
3785
3786 return (0);
3787 }
3788
3789 /*
3790 * Complete the removal by cleaning up the namespace.
3791 */
3792 void
3793 spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
3794 {
3795 vdev_t *rvd = spa->spa_root_vdev;
3796 metaslab_group_t *mg = vd->vdev_mg;
3797 uint64_t id = vd->vdev_id;
3798 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
3799
3800 ASSERT(MUTEX_HELD(&spa_namespace_lock));
3801 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3802
3803 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
3804 vdev_free(vd);
3805
3806 /*
3807 * It's possible that another thread is trying to do a spa_vdev_add()
3808 * at the same time we're trying to remove it. As a result the
3809 * added vdev may not have initialized its metaslabs yet.
3810 */
3811 if (mg != NULL)
3812 metaslab_group_destroy(mg);
3813
3814 if (last_vdev) {
3815 vdev_compact_children(rvd);
3816 } else {
3817 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
3818 vdev_add_child(rvd, vd);
3819 }
3820 vdev_config_dirty(rvd);
3821
3822 /*
3823 * Reassess the health of our root vdev.
3824 */
3825 vdev_reopen(rvd);
3826 }
3827
3828 /*
3829 * Remove a device from the pool. Currently, this supports removing only hot
3830 * spares, slogs, and level 2 ARC devices.
3831 */
3832 int
3833 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
3834 {
3835 vdev_t *vd;
3836 nvlist_t **spares, **l2cache, *nv;
3837 uint64_t txg = 0;
3838 uint_t nspares, nl2cache;
3839 int error = 0;
3840 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
3841
3842 if (!locked)
3843 txg = spa_vdev_enter(spa);
3844
3845 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3846
3847 if (spa->spa_spares.sav_vdevs != NULL &&
3848 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3849 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
3850 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
3851 /*
3852 * Only remove the hot spare if it's not currently in use
3853 * in this pool.
3854 */
3855 if (vd == NULL || unspare) {
3856 spa_vdev_remove_aux(spa->spa_spares.sav_config,
3857 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
3858 spa_load_spares(spa);
3859 spa->spa_spares.sav_sync = B_TRUE;
3860 } else {
3861 error = EBUSY;
3862 }
3863 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
3864 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3865 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
3866 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
3867 /*
3868 * Cache devices can always be removed.
3869 */
3870 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
3871 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
3872 spa_load_l2cache(spa);
3873 spa->spa_l2cache.sav_sync = B_TRUE;
3874 } else if (vd != NULL && vd->vdev_islog) {
3875 ASSERT(!locked);
3876
3877 /*
3878 * XXX - Once we have bp-rewrite this should
3879 * become the common case.
3880 */
3881
3882 /*
3883 * 1. Stop allocations
3884 * 2. Evacuate the device (i.e. kill off stubby and
3885 * metadata) and wait for it to complete (i.e. sync).
3886 * 3. Clean up the vdev namespace.
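 *
 * These steps map onto the helpers above, roughly (restating the calls
 * that follow):
 *
 *     spa_vdev_remove_start(spa, vd);              step 1
 *     spa_vdev_remove_evacuate(spa, vd);           step 2
 *     spa_vdev_remove_done(spa, vd);               step 3
 *
 * with the config lock dropped and retaken around the evacuation so
 * that we can issue I/O while holding only spa_namespace_lock.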
3887 */ 3888 spa_vdev_remove_start(spa, vd); 3889 3890 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 3891 if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0) 3892 return (error); 3893 txg = spa_vdev_config_enter(spa); 3894 3895 spa_vdev_remove_done(spa, vd); 3896 3897 } else if (vd != NULL) { 3898 /* 3899 * Normal vdevs cannot be removed (yet). 3900 */ 3901 error = ENOTSUP; 3902 } else { 3903 /* 3904 * There is no vdev of any kind with the specified guid. 3905 */ 3906 error = ENOENT; 3907 } 3908 3909 if (!locked) 3910 return (spa_vdev_exit(spa, NULL, txg, error)); 3911 3912 return (error); 3913 } 3914 3915 /* 3916 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3917 * current spared, so we can detach it. 3918 */ 3919 static vdev_t * 3920 spa_vdev_resilver_done_hunt(vdev_t *vd) 3921 { 3922 vdev_t *newvd, *oldvd; 3923 3924 for (int c = 0; c < vd->vdev_children; c++) { 3925 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3926 if (oldvd != NULL) 3927 return (oldvd); 3928 } 3929 3930 /* 3931 * Check for a completed replacement. 3932 */ 3933 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3934 oldvd = vd->vdev_child[0]; 3935 newvd = vd->vdev_child[1]; 3936 3937 if (vdev_dtl_empty(newvd, DTL_MISSING) && 3938 !vdev_dtl_required(oldvd)) 3939 return (oldvd); 3940 } 3941 3942 /* 3943 * Check for a completed resilver with the 'unspare' flag set. 3944 */ 3945 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3946 newvd = vd->vdev_child[0]; 3947 oldvd = vd->vdev_child[1]; 3948 3949 if (newvd->vdev_unspare && 3950 vdev_dtl_empty(newvd, DTL_MISSING) && 3951 !vdev_dtl_required(oldvd)) { 3952 newvd->vdev_unspare = 0; 3953 return (oldvd); 3954 } 3955 } 3956 3957 return (NULL); 3958 } 3959 3960 static void 3961 spa_vdev_resilver_done(spa_t *spa) 3962 { 3963 vdev_t *vd, *pvd, *ppvd; 3964 uint64_t guid, sguid, pguid, ppguid; 3965 3966 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3967 3968 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3969 pvd = vd->vdev_parent; 3970 ppvd = pvd->vdev_parent; 3971 guid = vd->vdev_guid; 3972 pguid = pvd->vdev_guid; 3973 ppguid = ppvd->vdev_guid; 3974 sguid = 0; 3975 /* 3976 * If we have just finished replacing a hot spared device, then 3977 * we need to detach the parent's first child (the original hot 3978 * spare) as well. 3979 */ 3980 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 3981 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3982 ASSERT(ppvd->vdev_children == 2); 3983 sguid = ppvd->vdev_child[1]->vdev_guid; 3984 } 3985 spa_config_exit(spa, SCL_ALL, FTAG); 3986 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 3987 return; 3988 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 3989 return; 3990 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3991 } 3992 3993 spa_config_exit(spa, SCL_ALL, FTAG); 3994 } 3995 3996 /* 3997 * Update the stored path or FRU for this vdev. Dirty the vdev configuration, 3998 * relying on spa_vdev_enter/exit() to synchronize the labels and cache. 
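 *
 * Callers use the spa_vdev_setpath()/spa_vdev_setfru() wrappers below;
 * for example (hypothetical device path):
 *
 *     error = spa_vdev_setpath(spa, guid, "/dev/dsk/c1t0d0s0");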
3999 */ 4000 int 4001 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4002 boolean_t ispath) 4003 { 4004 vdev_t *vd; 4005 uint64_t txg; 4006 4007 txg = spa_vdev_enter(spa); 4008 4009 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4010 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 4011 4012 if (!vd->vdev_ops->vdev_op_leaf) 4013 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4014 4015 if (ispath) { 4016 spa_strfree(vd->vdev_path); 4017 vd->vdev_path = spa_strdup(value); 4018 } else { 4019 if (vd->vdev_fru != NULL) 4020 spa_strfree(vd->vdev_fru); 4021 vd->vdev_fru = spa_strdup(value); 4022 } 4023 4024 vdev_config_dirty(vd->vdev_top); 4025 4026 return (spa_vdev_exit(spa, NULL, txg, 0)); 4027 } 4028 4029 int 4030 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 4031 { 4032 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 4033 } 4034 4035 int 4036 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 4037 { 4038 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 4039 } 4040 4041 /* 4042 * ========================================================================== 4043 * SPA Scrubbing 4044 * ========================================================================== 4045 */ 4046 4047 int 4048 spa_scrub(spa_t *spa, pool_scrub_type_t type) 4049 { 4050 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4051 4052 if ((uint_t)type >= POOL_SCRUB_TYPES) 4053 return (ENOTSUP); 4054 4055 /* 4056 * If a resilver was requested, but there is no DTL on a 4057 * writeable leaf device, we have nothing to do. 4058 */ 4059 if (type == POOL_SCRUB_RESILVER && 4060 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4061 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 4062 return (0); 4063 } 4064 4065 if (type == POOL_SCRUB_EVERYTHING && 4066 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 4067 spa->spa_dsl_pool->dp_scrub_isresilver) 4068 return (EBUSY); 4069 4070 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 4071 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 4072 } else if (type == POOL_SCRUB_NONE) { 4073 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 4074 } else { 4075 return (EINVAL); 4076 } 4077 } 4078 4079 /* 4080 * ========================================================================== 4081 * SPA async task processing 4082 * ========================================================================== 4083 */ 4084 4085 static void 4086 spa_async_remove(spa_t *spa, vdev_t *vd) 4087 { 4088 if (vd->vdev_remove_wanted) { 4089 vd->vdev_remove_wanted = 0; 4090 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 4091 4092 /* 4093 * We want to clear the stats, but we don't want to do a full 4094 * vdev_clear() as that will cause us to throw away 4095 * degraded/faulted state as well as attempt to reopen the 4096 * device, all of which is a waste. 
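 *
 * In other words (a sketch of the intent only, assuming vdev_clear()
 * keeps its current behavior of clearing error state and reopening the
 * device), what we deliberately avoid here is the heavier path:
 *
 *	vdev_clear(spa, vd);
 *
 * and instead we reset just the error counters below.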
4097 */ 4098 vd->vdev_stat.vs_read_errors = 0; 4099 vd->vdev_stat.vs_write_errors = 0; 4100 vd->vdev_stat.vs_checksum_errors = 0; 4101 4102 vdev_state_dirty(vd->vdev_top); 4103 } 4104 4105 for (int c = 0; c < vd->vdev_children; c++) 4106 spa_async_remove(spa, vd->vdev_child[c]); 4107 } 4108 4109 static void 4110 spa_async_probe(spa_t *spa, vdev_t *vd) 4111 { 4112 if (vd->vdev_probe_wanted) { 4113 vd->vdev_probe_wanted = 0; 4114 vdev_reopen(vd); /* vdev_open() does the actual probe */ 4115 } 4116 4117 for (int c = 0; c < vd->vdev_children; c++) 4118 spa_async_probe(spa, vd->vdev_child[c]); 4119 } 4120 4121 static void 4122 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 4123 { 4124 sysevent_id_t eid; 4125 nvlist_t *attr; 4126 char *physpath; 4127 4128 if (!spa->spa_autoexpand) 4129 return; 4130 4131 for (int c = 0; c < vd->vdev_children; c++) { 4132 vdev_t *cvd = vd->vdev_child[c]; 4133 spa_async_autoexpand(spa, cvd); 4134 } 4135 4136 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 4137 return; 4138 4139 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 4140 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 4141 4142 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4143 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 4144 4145 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 4146 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 4147 4148 nvlist_free(attr); 4149 kmem_free(physpath, MAXPATHLEN); 4150 } 4151 4152 static void 4153 spa_async_thread(spa_t *spa) 4154 { 4155 int tasks; 4156 4157 ASSERT(spa->spa_sync_on); 4158 4159 mutex_enter(&spa->spa_async_lock); 4160 tasks = spa->spa_async_tasks; 4161 spa->spa_async_tasks = 0; 4162 mutex_exit(&spa->spa_async_lock); 4163 4164 /* 4165 * See if the config needs to be updated. 4166 */ 4167 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 4168 uint64_t oldsz, space_update; 4169 4170 mutex_enter(&spa_namespace_lock); 4171 oldsz = spa_get_space(spa); 4172 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4173 space_update = spa_get_space(spa) - oldsz; 4174 mutex_exit(&spa_namespace_lock); 4175 4176 /* 4177 * If the pool grew as a result of the config update, 4178 * then log an internal history event. 4179 */ 4180 if (space_update) { 4181 spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 4182 spa, NULL, CRED(), 4183 "pool '%s' size: %llu(+%llu)", 4184 spa_name(spa), spa_get_space(spa), 4185 space_update); 4186 } 4187 } 4188 4189 /* 4190 * See if any devices need to be marked REMOVED. 4191 */ 4192 if (tasks & SPA_ASYNC_REMOVE) { 4193 spa_vdev_state_enter(spa, SCL_NONE); 4194 spa_async_remove(spa, spa->spa_root_vdev); 4195 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 4196 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 4197 for (int i = 0; i < spa->spa_spares.sav_count; i++) 4198 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 4199 (void) spa_vdev_state_exit(spa, NULL, 0); 4200 } 4201 4202 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 4203 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4204 spa_async_autoexpand(spa, spa->spa_root_vdev); 4205 spa_config_exit(spa, SCL_CONFIG, FTAG); 4206 } 4207 4208 /* 4209 * See if any devices need to be probed. 4210 */ 4211 if (tasks & SPA_ASYNC_PROBE) { 4212 spa_vdev_state_enter(spa, SCL_NONE); 4213 spa_async_probe(spa, spa->spa_root_vdev); 4214 (void) spa_vdev_state_exit(spa, NULL, 0); 4215 } 4216 4217 /* 4218 * If any devices are done replacing, detach them. 
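 *
 * (Illustrative note: this task is queued like any other async task,
 * for example by spa_scrub() above when a requested resilver finds
 * nothing to do:
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 *
 * which is what eventually lands us here.)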
4219 */ 4220 if (tasks & SPA_ASYNC_RESILVER_DONE) 4221 spa_vdev_resilver_done(spa); 4222 4223 /* 4224 * Kick off a resilver. 4225 */ 4226 if (tasks & SPA_ASYNC_RESILVER) 4227 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 4228 4229 /* 4230 * Let the world know that we're done. 4231 */ 4232 mutex_enter(&spa->spa_async_lock); 4233 spa->spa_async_thread = NULL; 4234 cv_broadcast(&spa->spa_async_cv); 4235 mutex_exit(&spa->spa_async_lock); 4236 thread_exit(); 4237 } 4238 4239 void 4240 spa_async_suspend(spa_t *spa) 4241 { 4242 mutex_enter(&spa->spa_async_lock); 4243 spa->spa_async_suspended++; 4244 while (spa->spa_async_thread != NULL) 4245 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 4246 mutex_exit(&spa->spa_async_lock); 4247 } 4248 4249 void 4250 spa_async_resume(spa_t *spa) 4251 { 4252 mutex_enter(&spa->spa_async_lock); 4253 ASSERT(spa->spa_async_suspended != 0); 4254 spa->spa_async_suspended--; 4255 mutex_exit(&spa->spa_async_lock); 4256 } 4257 4258 static void 4259 spa_async_dispatch(spa_t *spa) 4260 { 4261 mutex_enter(&spa->spa_async_lock); 4262 if (spa->spa_async_tasks && !spa->spa_async_suspended && 4263 spa->spa_async_thread == NULL && 4264 rootdir != NULL && !vn_is_readonly(rootdir)) 4265 spa->spa_async_thread = thread_create(NULL, 0, 4266 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 4267 mutex_exit(&spa->spa_async_lock); 4268 } 4269 4270 void 4271 spa_async_request(spa_t *spa, int task) 4272 { 4273 mutex_enter(&spa->spa_async_lock); 4274 spa->spa_async_tasks |= task; 4275 mutex_exit(&spa->spa_async_lock); 4276 } 4277 4278 /* 4279 * ========================================================================== 4280 * SPA syncing routines 4281 * ========================================================================== 4282 */ 4283 4284 static void 4285 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 4286 { 4287 bplist_t *bpl = &spa->spa_sync_bplist; 4288 dmu_tx_t *tx; 4289 blkptr_t blk; 4290 uint64_t itor = 0; 4291 zio_t *zio; 4292 int error; 4293 uint8_t c = 1; 4294 4295 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4296 4297 while (bplist_iterate(bpl, &itor, &blk) == 0) { 4298 ASSERT(blk.blk_birth < txg); 4299 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 4300 ZIO_FLAG_MUSTSUCCEED)); 4301 } 4302 4303 error = zio_wait(zio); 4304 ASSERT3U(error, ==, 0); 4305 4306 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 4307 bplist_vacate(bpl, tx); 4308 4309 /* 4310 * Pre-dirty the first block so we sync to convergence faster. 4311 * (Usually only the first block is needed.) 4312 */ 4313 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 4314 dmu_tx_commit(tx); 4315 } 4316 4317 static void 4318 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 4319 { 4320 char *packed = NULL; 4321 size_t bufsize; 4322 size_t nvsize = 0; 4323 dmu_buf_t *db; 4324 4325 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 4326 4327 /* 4328 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 4329 * information. This avoids the dbuf_will_dirty() path and 4330 * saves us a pre-read to get data we don't actually care about. 
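 *
 * A worked example, assuming SPA_CONFIG_BLOCKSIZE is still 1 << 14:
 * a 5120-byte packed nvlist becomes a single 16K write, since
 *
 *	bufsize = P2ROUNDUP(5120, 16384) = 16384
 *
 * and the tail beyond nvsize is simply zeroed before the dmu_write().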
4331 */ 4332 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 4333 packed = kmem_alloc(bufsize, KM_SLEEP); 4334 4335 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 4336 KM_SLEEP) == 0); 4337 bzero(packed + nvsize, bufsize - nvsize); 4338 4339 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 4340 4341 kmem_free(packed, bufsize); 4342 4343 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4344 dmu_buf_will_dirty(db, tx); 4345 *(uint64_t *)db->db_data = nvsize; 4346 dmu_buf_rele(db, FTAG); 4347 } 4348 4349 static void 4350 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 4351 const char *config, const char *entry) 4352 { 4353 nvlist_t *nvroot; 4354 nvlist_t **list; 4355 int i; 4356 4357 if (!sav->sav_sync) 4358 return; 4359 4360 /* 4361 * Update the MOS nvlist describing the list of available devices. 4362 * spa_validate_aux() will have already made sure this nvlist is 4363 * valid and the vdevs are labeled appropriately. 4364 */ 4365 if (sav->sav_object == 0) { 4366 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 4367 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 4368 sizeof (uint64_t), tx); 4369 VERIFY(zap_update(spa->spa_meta_objset, 4370 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 4371 &sav->sav_object, tx) == 0); 4372 } 4373 4374 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4375 if (sav->sav_count == 0) { 4376 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 4377 } else { 4378 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 4379 for (i = 0; i < sav->sav_count; i++) 4380 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 4381 B_FALSE, B_FALSE, B_TRUE); 4382 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 4383 sav->sav_count) == 0); 4384 for (i = 0; i < sav->sav_count; i++) 4385 nvlist_free(list[i]); 4386 kmem_free(list, sav->sav_count * sizeof (void *)); 4387 } 4388 4389 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 4390 nvlist_free(nvroot); 4391 4392 sav->sav_sync = B_FALSE; 4393 } 4394 4395 static void 4396 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 4397 { 4398 nvlist_t *config; 4399 4400 if (list_is_empty(&spa->spa_config_dirty_list)) 4401 return; 4402 4403 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4404 4405 config = spa_config_generate(spa, spa->spa_root_vdev, 4406 dmu_tx_get_txg(tx), B_FALSE); 4407 4408 spa_config_exit(spa, SCL_STATE, FTAG); 4409 4410 if (spa->spa_config_syncing) 4411 nvlist_free(spa->spa_config_syncing); 4412 spa->spa_config_syncing = config; 4413 4414 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 4415 } 4416 4417 /* 4418 * Set zpool properties. 4419 */ 4420 static void 4421 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 4422 { 4423 spa_t *spa = arg1; 4424 objset_t *mos = spa->spa_meta_objset; 4425 nvlist_t *nvp = arg2; 4426 nvpair_t *elem; 4427 uint64_t intval; 4428 char *strval; 4429 zpool_prop_t prop; 4430 const char *propname; 4431 zprop_type_t proptype; 4432 4433 mutex_enter(&spa->spa_props_lock); 4434 4435 elem = NULL; 4436 while ((elem = nvlist_next_nvpair(nvp, elem))) { 4437 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 4438 case ZPOOL_PROP_VERSION: 4439 /* 4440 * Only set version for non-zpool-creation cases 4441 * (set/import). spa_create() needs special care 4442 * for version setting. 
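 *
 * (Compare spa_upgrade() near the end of this file, which performs the
 * same version bump directly under SCL_ALL; in both places the effect
 * boils down to the two statements used just below:
 *
 *	spa->spa_uberblock.ub_version = intval;
 *	vdev_config_dirty(spa->spa_root_vdev);
 *
 * with the new version written out by the next config sync.)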
4443 */ 4444 if (tx->tx_txg != TXG_INITIAL) { 4445 VERIFY(nvpair_value_uint64(elem, 4446 &intval) == 0); 4447 ASSERT(intval <= SPA_VERSION); 4448 ASSERT(intval >= spa_version(spa)); 4449 spa->spa_uberblock.ub_version = intval; 4450 vdev_config_dirty(spa->spa_root_vdev); 4451 } 4452 break; 4453 4454 case ZPOOL_PROP_ALTROOT: 4455 /* 4456 * 'altroot' is a non-persistent property. It should 4457 * have been set temporarily at creation or import time. 4458 */ 4459 ASSERT(spa->spa_root != NULL); 4460 break; 4461 4462 case ZPOOL_PROP_CACHEFILE: 4463 /* 4464 * 'cachefile' is also a non-persistent property. 4465 */ 4466 break; 4467 default: 4468 /* 4469 * Set pool property values in the poolprops mos object. 4470 */ 4471 if (spa->spa_pool_props_object == 0) { 4472 objset_t *mos = spa->spa_meta_objset; 4473 4474 VERIFY((spa->spa_pool_props_object = 4475 zap_create(mos, DMU_OT_POOL_PROPS, 4476 DMU_OT_NONE, 0, tx)) > 0); 4477 4478 VERIFY(zap_update(mos, 4479 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 4480 8, 1, &spa->spa_pool_props_object, tx) 4481 == 0); 4482 } 4483 4484 /* normalize the property name */ 4485 propname = zpool_prop_to_name(prop); 4486 proptype = zpool_prop_get_type(prop); 4487 4488 if (nvpair_type(elem) == DATA_TYPE_STRING) { 4489 ASSERT(proptype == PROP_TYPE_STRING); 4490 VERIFY(nvpair_value_string(elem, &strval) == 0); 4491 VERIFY(zap_update(mos, 4492 spa->spa_pool_props_object, propname, 4493 1, strlen(strval) + 1, strval, tx) == 0); 4494 4495 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 4496 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 4497 4498 if (proptype == PROP_TYPE_INDEX) { 4499 const char *unused; 4500 VERIFY(zpool_prop_index_to_string( 4501 prop, intval, &unused) == 0); 4502 } 4503 VERIFY(zap_update(mos, 4504 spa->spa_pool_props_object, propname, 4505 8, 1, &intval, tx) == 0); 4506 } else { 4507 ASSERT(0); /* not allowed */ 4508 } 4509 4510 switch (prop) { 4511 case ZPOOL_PROP_DELEGATION: 4512 spa->spa_delegation = intval; 4513 break; 4514 case ZPOOL_PROP_BOOTFS: 4515 spa->spa_bootfs = intval; 4516 break; 4517 case ZPOOL_PROP_FAILUREMODE: 4518 spa->spa_failmode = intval; 4519 break; 4520 case ZPOOL_PROP_AUTOEXPAND: 4521 spa->spa_autoexpand = intval; 4522 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4523 break; 4524 default: 4525 break; 4526 } 4527 } 4528 4529 /* log internal history if this is not a zpool create */ 4530 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 4531 tx->tx_txg != TXG_INITIAL) { 4532 spa_history_internal_log(LOG_POOL_PROPSET, 4533 spa, tx, cr, "%s %lld %s", 4534 nvpair_name(elem), intval, spa_name(spa)); 4535 } 4536 } 4537 4538 mutex_exit(&spa->spa_props_lock); 4539 } 4540 4541 /* 4542 * Sync the specified transaction group. New blocks may be dirtied as 4543 * part of the process, so we iterate until it converges. 4544 */ 4545 void 4546 spa_sync(spa_t *spa, uint64_t txg) 4547 { 4548 dsl_pool_t *dp = spa->spa_dsl_pool; 4549 objset_t *mos = spa->spa_meta_objset; 4550 bplist_t *bpl = &spa->spa_sync_bplist; 4551 vdev_t *rvd = spa->spa_root_vdev; 4552 vdev_t *vd; 4553 dmu_tx_t *tx; 4554 int dirty_vdevs; 4555 int error; 4556 4557 /* 4558 * Lock out configuration changes. 4559 */ 4560 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4561 4562 spa->spa_syncing_txg = txg; 4563 spa->spa_sync_pass = 0; 4564 4565 /* 4566 * If there are any pending vdev state changes, convert them 4567 * into config changes that go out with this transaction group.
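 *
 * A condensed sketch of the conversion loop below (nothing beyond what
 * the code itself does): while the state-dirty list is non-empty,
 * upgrade to the writer lock and, for each dirty vdev,
 *
 *	vdev_state_clean(vd);
 *	vdev_config_dirty(vd);
 *
 * then drop back to the reader lock and re-check the list.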
4568 */ 4569 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4570 while (list_head(&spa->spa_state_dirty_list) != NULL) { 4571 /* 4572 * We need the write lock here because, for aux vdevs, 4573 * calling vdev_config_dirty() modifies sav_config. 4574 * This is ugly and will become unnecessary when we 4575 * eliminate the aux vdev wart by integrating all vdevs 4576 * into the root vdev tree. 4577 */ 4578 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4579 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 4580 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 4581 vdev_state_clean(vd); 4582 vdev_config_dirty(vd); 4583 } 4584 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4585 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 4586 } 4587 spa_config_exit(spa, SCL_STATE, FTAG); 4588 4589 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 4590 4591 tx = dmu_tx_create_assigned(dp, txg); 4592 4593 /* 4594 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 4595 * set spa_deflate if we have no raid-z vdevs. 4596 */ 4597 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 4598 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 4599 int i; 4600 4601 for (i = 0; i < rvd->vdev_children; i++) { 4602 vd = rvd->vdev_child[i]; 4603 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 4604 break; 4605 } 4606 if (i == rvd->vdev_children) { 4607 spa->spa_deflate = TRUE; 4608 VERIFY(0 == zap_add(spa->spa_meta_objset, 4609 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4610 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 4611 } 4612 } 4613 4614 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 4615 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 4616 dsl_pool_create_origin(dp, tx); 4617 4618 /* Keeping the origin open increases spa_minref */ 4619 spa->spa_minref += 3; 4620 } 4621 4622 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 4623 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 4624 dsl_pool_upgrade_clones(dp, tx); 4625 } 4626 4627 /* 4628 * If anything has changed in this txg, push the deferred frees 4629 * from the previous txg. If not, leave them alone so that we 4630 * don't generate work on an otherwise idle system. 4631 */ 4632 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 4633 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 4634 !txg_list_empty(&dp->dp_sync_tasks, txg)) 4635 spa_sync_deferred_frees(spa, txg); 4636 4637 /* 4638 * Iterate to convergence. 4639 */ 4640 do { 4641 spa->spa_sync_pass++; 4642 4643 spa_sync_config_object(spa, tx); 4644 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 4645 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4646 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4647 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4648 spa_errlog_sync(spa, txg); 4649 dsl_pool_sync(dp, txg); 4650 4651 dirty_vdevs = 0; 4652 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 4653 vdev_sync(vd, txg); 4654 dirty_vdevs++; 4655 } 4656 4657 bplist_sync(bpl, tx); 4658 } while (dirty_vdevs); 4659 4660 bplist_close(bpl); 4661 4662 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 4663 4664 /* 4665 * Rewrite the vdev configuration (which includes the uberblock) 4666 * to commit the transaction group. 4667 * 4668 * If there are no dirty vdevs, we sync the uberblock to a few 4669 * random top-level vdevs that are known to be visible in the 4670 * config cache (see spa_vdev_add() for a complete description). 4671 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 
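 *
 * A worked example of the selection below, assuming SPA_DVAS_PER_BP is
 * still 3: with 6 top-level vdevs and a random start of c0 = 4, the
 * children are examined in the order 4, 5, 0, 1, 2, 3 via
 *
 *	vd = rvd->vdev_child[(c0 + c) % children];
 *
 * skipping vdevs with no metaslab array and slogs, until three
 * suitable vdevs have been collected.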
4672 */ 4673 for (;;) { 4674 /* 4675 * We hold SCL_STATE to prevent vdev open/close/etc. 4676 * while we're attempting to write the vdev labels. 4677 */ 4678 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4679 4680 if (list_is_empty(&spa->spa_config_dirty_list)) { 4681 vdev_t *svd[SPA_DVAS_PER_BP]; 4682 int svdcount = 0; 4683 int children = rvd->vdev_children; 4684 int c0 = spa_get_random(children); 4685 4686 for (int c = 0; c < children; c++) { 4687 vd = rvd->vdev_child[(c0 + c) % children]; 4688 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 4689 continue; 4690 svd[svdcount++] = vd; 4691 if (svdcount == SPA_DVAS_PER_BP) 4692 break; 4693 } 4694 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 4695 if (error != 0) 4696 error = vdev_config_sync(svd, svdcount, txg, 4697 B_TRUE); 4698 } else { 4699 error = vdev_config_sync(rvd->vdev_child, 4700 rvd->vdev_children, txg, B_FALSE); 4701 if (error != 0) 4702 error = vdev_config_sync(rvd->vdev_child, 4703 rvd->vdev_children, txg, B_TRUE); 4704 } 4705 4706 spa_config_exit(spa, SCL_STATE, FTAG); 4707 4708 if (error == 0) 4709 break; 4710 zio_suspend(spa, NULL); 4711 zio_resume_wait(spa); 4712 } 4713 dmu_tx_commit(tx); 4714 4715 /* 4716 * Clear the dirty config list. 4717 */ 4718 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 4719 vdev_config_clean(vd); 4720 4721 /* 4722 * Now that the new config has synced transactionally, 4723 * let it become visible to the config cache. 4724 */ 4725 if (spa->spa_config_syncing != NULL) { 4726 spa_config_set(spa, spa->spa_config_syncing); 4727 spa->spa_config_txg = txg; 4728 spa->spa_config_syncing = NULL; 4729 } 4730 4731 spa->spa_ubsync = spa->spa_uberblock; 4732 4733 /* 4734 * Clean up the ZIL records for the synced txg. 4735 */ 4736 dsl_pool_zil_clean(dp); 4737 4738 /* 4739 * Update usable space statistics. 4740 */ 4741 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 4742 vdev_sync_done(vd, txg); 4743 4744 /* 4745 * It had better be the case that we didn't dirty anything 4746 * since vdev_config_sync(). 4747 */ 4748 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 4749 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 4750 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 4751 ASSERT(bpl->bpl_queue == NULL); 4752 4753 spa_config_exit(spa, SCL_CONFIG, FTAG); 4754 4755 spa_handle_ignored_writes(spa); 4756 4757 /* 4758 * If any async tasks have been requested, kick them off. 4759 */ 4760 spa_async_dispatch(spa); 4761 } 4762 4763 /* 4764 * Sync all pools. We don't want to hold the namespace lock across these 4765 * operations, so we take a reference on the spa_t and drop the lock during the 4766 * sync. 4767 */ 4768 void 4769 spa_sync_allpools(void) 4770 { 4771 spa_t *spa = NULL; 4772 mutex_enter(&spa_namespace_lock); 4773 while ((spa = spa_next(spa)) != NULL) { 4774 if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) 4775 continue; 4776 spa_open_ref(spa, FTAG); 4777 mutex_exit(&spa_namespace_lock); 4778 txg_wait_synced(spa_get_dsl(spa), 0); 4779 mutex_enter(&spa_namespace_lock); 4780 spa_close(spa, FTAG); 4781 } 4782 mutex_exit(&spa_namespace_lock); 4783 } 4784 4785 /* 4786 * ========================================================================== 4787 * Miscellaneous routines 4788 * ========================================================================== 4789 */ 4790 4791 /* 4792 * Remove all pools in the system. 4793 */ 4794 void 4795 spa_evict_all(void) 4796 { 4797 spa_t *spa; 4798 4799 /* 4800 * Remove all cached state. 
All pools should be closed now, 4801 * so every spa in the AVL tree should be unreferenced. 4802 */ 4803 mutex_enter(&spa_namespace_lock); 4804 while ((spa = spa_next(NULL)) != NULL) { 4805 /* 4806 * Stop async tasks. The async thread may need to detach 4807 * a device that's been replaced, which requires grabbing 4808 * spa_namespace_lock, so we must drop it here. 4809 */ 4810 spa_open_ref(spa, FTAG); 4811 mutex_exit(&spa_namespace_lock); 4812 spa_async_suspend(spa); 4813 mutex_enter(&spa_namespace_lock); 4814 spa_close(spa, FTAG); 4815 4816 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4817 spa_unload(spa); 4818 spa_deactivate(spa); 4819 } 4820 spa_remove(spa); 4821 } 4822 mutex_exit(&spa_namespace_lock); 4823 } 4824 4825 vdev_t * 4826 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 4827 { 4828 vdev_t *vd; 4829 int i; 4830 4831 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 4832 return (vd); 4833 4834 if (aux) { 4835 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 4836 vd = spa->spa_l2cache.sav_vdevs[i]; 4837 if (vd->vdev_guid == guid) 4838 return (vd); 4839 } 4840 4841 for (i = 0; i < spa->spa_spares.sav_count; i++) { 4842 vd = spa->spa_spares.sav_vdevs[i]; 4843 if (vd->vdev_guid == guid) 4844 return (vd); 4845 } 4846 } 4847 4848 return (NULL); 4849 } 4850 4851 void 4852 spa_upgrade(spa_t *spa, uint64_t version) 4853 { 4854 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4855 4856 /* 4857 * This should only be called for a non-faulted pool, and since a 4858 * future version would result in an unopenable pool, this shouldn't be 4859 * possible. 4860 */ 4861 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 4862 ASSERT(version >= spa->spa_uberblock.ub_version); 4863 4864 spa->spa_uberblock.ub_version = version; 4865 vdev_config_dirty(spa->spa_root_vdev); 4866 4867 spa_config_exit(spa, SCL_ALL, FTAG); 4868 4869 txg_wait_synced(spa_get_dsl(spa), 0); 4870 } 4871 4872 boolean_t 4873 spa_has_spare(spa_t *spa, uint64_t guid) 4874 { 4875 int i; 4876 uint64_t spareguid; 4877 spa_aux_vdev_t *sav = &spa->spa_spares; 4878 4879 for (i = 0; i < sav->sav_count; i++) 4880 if (sav->sav_vdevs[i]->vdev_guid == guid) 4881 return (B_TRUE); 4882 4883 for (i = 0; i < sav->sav_npending; i++) { 4884 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 4885 &spareguid) == 0 && spareguid == guid) 4886 return (B_TRUE); 4887 } 4888 4889 return (B_FALSE); 4890 } 4891 4892 /* 4893 * Check if a pool has an active shared spare device. 4894 * Note: reference count of an active spare is 2, as a spare and as a replace 4895 */ 4896 static boolean_t 4897 spa_has_active_shared_spare(spa_t *spa) 4898 { 4899 int i, refcnt; 4900 uint64_t pool; 4901 spa_aux_vdev_t *sav = &spa->spa_spares; 4902 4903 for (i = 0; i < sav->sav_count; i++) { 4904 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 4905 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 4906 refcnt > 2) 4907 return (B_TRUE); 4908 } 4909 4910 return (B_FALSE); 4911 } 4912 4913 /* 4914 * Post a sysevent corresponding to the given event. The 'name' must be one of 4915 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 4916 * filled in from the spa and (optionally) the vdev. This doesn't do anything 4917 * in the userland libzpool, as we don't want consumers to misinterpret ztest 4918 * or zdb as real changes. 
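 *
 * Typical usage looks like the following; the event class shown,
 * ESC_ZFS_VDEV_REMOVE, is one of the ESC_ZFS_* definitions in
 * eventdefs.h and is meant purely as an illustration here:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * Passing a NULL vdev posts a pool-level event with no vdev payload.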
4919 */ 4920 void 4921 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 4922 { 4923 #ifdef _KERNEL 4924 sysevent_t *ev; 4925 sysevent_attr_list_t *attr = NULL; 4926 sysevent_value_t value; 4927 sysevent_id_t eid; 4928 4929 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 4930 SE_SLEEP); 4931 4932 value.value_type = SE_DATA_TYPE_STRING; 4933 value.value.sv_string = spa_name(spa); 4934 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 4935 goto done; 4936 4937 value.value_type = SE_DATA_TYPE_UINT64; 4938 value.value.sv_uint64 = spa_guid(spa); 4939 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 4940 goto done; 4941 4942 if (vd) { 4943 value.value_type = SE_DATA_TYPE_UINT64; 4944 value.value.sv_uint64 = vd->vdev_guid; 4945 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 4946 SE_SLEEP) != 0) 4947 goto done; 4948 4949 if (vd->vdev_path) { 4950 value.value_type = SE_DATA_TYPE_STRING; 4951 value.value.sv_string = vd->vdev_path; 4952 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 4953 &value, SE_SLEEP) != 0) 4954 goto done; 4955 } 4956 } 4957 4958 if (sysevent_attach_attributes(ev, attr) != 0) 4959 goto done; 4960 attr = NULL; 4961 4962 (void) log_sysevent(ev, SE_SLEEP, &eid); 4963 4964 done: 4965 if (attr) 4966 sysevent_free_attr(attr); 4967 sysevent_free(ev); 4968 #endif 4969 } 4970