/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#ifdef	_KERNEL
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_tune,			/* fill from zio_taskq_tune_* */
	zti_nmodes
};

#define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }

#define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)

typedef struct zio_taskq_info {
	const char *zti_name;
	struct {
		enum zti_modes zti_mode;
		uint_t zti_value;
	} zti_nthreads[ZIO_TASKQ_TYPES];
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "intr"
};

const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
	/*			ISSUE			INTR		*/
	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
};

enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t
*tx); 111 static boolean_t spa_has_active_shared_spare(spa_t *spa); 112 113 /* 114 * ========================================================================== 115 * SPA properties routines 116 * ========================================================================== 117 */ 118 119 /* 120 * Add a (source=src, propname=propval) list to an nvlist. 121 */ 122 static void 123 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 124 uint64_t intval, zprop_source_t src) 125 { 126 const char *propname = zpool_prop_to_name(prop); 127 nvlist_t *propval; 128 129 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 130 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 131 132 if (strval != NULL) 133 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 134 else 135 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 136 137 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 138 nvlist_free(propval); 139 } 140 141 /* 142 * Get property values from the spa configuration. 143 */ 144 static void 145 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 146 { 147 uint64_t size; 148 uint64_t used; 149 uint64_t cap, version; 150 zprop_source_t src = ZPROP_SRC_NONE; 151 spa_config_dirent_t *dp; 152 153 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 154 155 if (spa->spa_root_vdev != NULL) { 156 size = spa_get_space(spa); 157 used = spa_get_alloc(spa); 158 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 159 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 160 spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); 161 spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, 162 size - used, src); 163 164 cap = (size == 0) ? 0 : (used * 100 / size); 165 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 166 167 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 168 spa->spa_root_vdev->vdev_state, src); 169 170 version = spa_version(spa); 171 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 172 src = ZPROP_SRC_DEFAULT; 173 else 174 src = ZPROP_SRC_LOCAL; 175 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 176 } 177 178 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 179 180 if (spa->spa_root != NULL) 181 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 182 0, ZPROP_SRC_LOCAL); 183 184 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 185 if (dp->scd_path == NULL) { 186 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 187 "none", 0, ZPROP_SRC_LOCAL); 188 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 189 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 190 dp->scd_path, 0, ZPROP_SRC_LOCAL); 191 } 192 } 193 } 194 195 /* 196 * Get zpool property values. 197 */ 198 int 199 spa_prop_get(spa_t *spa, nvlist_t **nvp) 200 { 201 zap_cursor_t zc; 202 zap_attribute_t za; 203 objset_t *mos = spa->spa_meta_objset; 204 int err; 205 206 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 207 208 mutex_enter(&spa->spa_props_lock); 209 210 /* 211 * Get properties from the spa config. 212 */ 213 spa_prop_get_config(spa, nvp); 214 215 /* If no pool property object, no more prop to get. */ 216 if (spa->spa_pool_props_object == 0) { 217 mutex_exit(&spa->spa_props_lock); 218 return (0); 219 } 220 221 /* 222 * Get properties from the MOS pool property object. 
223 */ 224 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 225 (err = zap_cursor_retrieve(&zc, &za)) == 0; 226 zap_cursor_advance(&zc)) { 227 uint64_t intval = 0; 228 char *strval = NULL; 229 zprop_source_t src = ZPROP_SRC_DEFAULT; 230 zpool_prop_t prop; 231 232 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 233 continue; 234 235 switch (za.za_integer_length) { 236 case 8: 237 /* integer property */ 238 if (za.za_first_integer != 239 zpool_prop_default_numeric(prop)) 240 src = ZPROP_SRC_LOCAL; 241 242 if (prop == ZPOOL_PROP_BOOTFS) { 243 dsl_pool_t *dp; 244 dsl_dataset_t *ds = NULL; 245 246 dp = spa_get_dsl(spa); 247 rw_enter(&dp->dp_config_rwlock, RW_READER); 248 if (err = dsl_dataset_hold_obj(dp, 249 za.za_first_integer, FTAG, &ds)) { 250 rw_exit(&dp->dp_config_rwlock); 251 break; 252 } 253 254 strval = kmem_alloc( 255 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 256 KM_SLEEP); 257 dsl_dataset_name(ds, strval); 258 dsl_dataset_rele(ds, FTAG); 259 rw_exit(&dp->dp_config_rwlock); 260 } else { 261 strval = NULL; 262 intval = za.za_first_integer; 263 } 264 265 spa_prop_add_list(*nvp, prop, strval, intval, src); 266 267 if (strval != NULL) 268 kmem_free(strval, 269 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 270 271 break; 272 273 case 1: 274 /* string property */ 275 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 276 err = zap_lookup(mos, spa->spa_pool_props_object, 277 za.za_name, 1, za.za_num_integers, strval); 278 if (err) { 279 kmem_free(strval, za.za_num_integers); 280 break; 281 } 282 spa_prop_add_list(*nvp, prop, strval, 0, src); 283 kmem_free(strval, za.za_num_integers); 284 break; 285 286 default: 287 break; 288 } 289 } 290 zap_cursor_fini(&zc); 291 mutex_exit(&spa->spa_props_lock); 292 out: 293 if (err && err != ENOENT) { 294 nvlist_free(*nvp); 295 *nvp = NULL; 296 return (err); 297 } 298 299 return (0); 300 } 301 302 /* 303 * Validate the given pool properties nvlist and modify the list 304 * for the property values to be set. 305 */ 306 static int 307 spa_prop_validate(spa_t *spa, nvlist_t *props) 308 { 309 nvpair_t *elem; 310 int error = 0, reset_bootfs = 0; 311 uint64_t objnum; 312 313 elem = NULL; 314 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 315 zpool_prop_t prop; 316 char *propname, *strval; 317 uint64_t intval; 318 objset_t *os; 319 char *slash; 320 321 propname = nvpair_name(elem); 322 323 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 324 return (EINVAL); 325 326 switch (prop) { 327 case ZPOOL_PROP_VERSION: 328 error = nvpair_value_uint64(elem, &intval); 329 if (!error && 330 (intval < spa_version(spa) || intval > SPA_VERSION)) 331 error = EINVAL; 332 break; 333 334 case ZPOOL_PROP_DELEGATION: 335 case ZPOOL_PROP_AUTOREPLACE: 336 case ZPOOL_PROP_LISTSNAPS: 337 case ZPOOL_PROP_AUTOEXPAND: 338 error = nvpair_value_uint64(elem, &intval); 339 if (!error && intval > 1) 340 error = EINVAL; 341 break; 342 343 case ZPOOL_PROP_BOOTFS: 344 /* 345 * If the pool version is less than SPA_VERSION_BOOTFS, 346 * or the pool is still being created (version == 0), 347 * the bootfs property cannot be set. 
348 */ 349 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 350 error = ENOTSUP; 351 break; 352 } 353 354 /* 355 * Make sure the vdev config is bootable 356 */ 357 if (!vdev_is_bootable(spa->spa_root_vdev)) { 358 error = ENOTSUP; 359 break; 360 } 361 362 reset_bootfs = 1; 363 364 error = nvpair_value_string(elem, &strval); 365 366 if (!error) { 367 uint64_t compress; 368 369 if (strval == NULL || strval[0] == '\0') { 370 objnum = zpool_prop_default_numeric( 371 ZPOOL_PROP_BOOTFS); 372 break; 373 } 374 375 if (error = dmu_objset_hold(strval, FTAG, &os)) 376 break; 377 378 /* Must be ZPL and not gzip compressed. */ 379 380 if (dmu_objset_type(os) != DMU_OST_ZFS) { 381 error = ENOTSUP; 382 } else if ((error = dsl_prop_get_integer(strval, 383 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 384 &compress, NULL)) == 0 && 385 !BOOTFS_COMPRESS_VALID(compress)) { 386 error = ENOTSUP; 387 } else { 388 objnum = dmu_objset_id(os); 389 } 390 dmu_objset_rele(os, FTAG); 391 } 392 break; 393 394 case ZPOOL_PROP_FAILUREMODE: 395 error = nvpair_value_uint64(elem, &intval); 396 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 397 intval > ZIO_FAILURE_MODE_PANIC)) 398 error = EINVAL; 399 400 /* 401 * This is a special case which only occurs when 402 * the pool has completely failed. This allows 403 * the user to change the in-core failmode property 404 * without syncing it out to disk (I/Os might 405 * currently be blocked). We do this by returning 406 * EIO to the caller (spa_prop_set) to trick it 407 * into thinking we encountered a property validation 408 * error. 409 */ 410 if (!error && spa_suspended(spa)) { 411 spa->spa_failmode = intval; 412 error = EIO; 413 } 414 break; 415 416 case ZPOOL_PROP_CACHEFILE: 417 if ((error = nvpair_value_string(elem, &strval)) != 0) 418 break; 419 420 if (strval[0] == '\0') 421 break; 422 423 if (strcmp(strval, "none") == 0) 424 break; 425 426 if (strval[0] != '/') { 427 error = EINVAL; 428 break; 429 } 430 431 slash = strrchr(strval, '/'); 432 ASSERT(slash != NULL); 433 434 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 435 strcmp(slash, "/..") == 0) 436 error = EINVAL; 437 break; 438 } 439 440 if (error) 441 break; 442 } 443 444 if (!error && reset_bootfs) { 445 error = nvlist_remove(props, 446 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 447 448 if (!error) { 449 error = nvlist_add_uint64(props, 450 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 451 } 452 } 453 454 return (error); 455 } 456 457 void 458 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 459 { 460 char *cachefile; 461 spa_config_dirent_t *dp; 462 463 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 464 &cachefile) != 0) 465 return; 466 467 dp = kmem_alloc(sizeof (spa_config_dirent_t), 468 KM_SLEEP); 469 470 if (cachefile[0] == '\0') 471 dp->scd_path = spa_strdup(spa_config_path); 472 else if (strcmp(cachefile, "none") == 0) 473 dp->scd_path = NULL; 474 else 475 dp->scd_path = spa_strdup(cachefile); 476 477 list_insert_head(&spa->spa_config_list, dp); 478 if (need_sync) 479 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 480 } 481 482 int 483 spa_prop_set(spa_t *spa, nvlist_t *nvp) 484 { 485 int error; 486 nvpair_t *elem; 487 boolean_t need_sync = B_FALSE; 488 zpool_prop_t prop; 489 490 if ((error = spa_prop_validate(spa, nvp)) != 0) 491 return (error); 492 493 elem = NULL; 494 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 495 if ((prop = zpool_name_to_prop( 496 nvpair_name(elem))) == ZPROP_INVAL) 497 return (EINVAL); 498 499 if (prop == 
ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 500 continue; 501 502 need_sync = B_TRUE; 503 break; 504 } 505 506 if (need_sync) 507 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 508 spa, nvp, 3)); 509 else 510 return (0); 511 } 512 513 /* 514 * If the bootfs property value is dsobj, clear it. 515 */ 516 void 517 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 518 { 519 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 520 VERIFY(zap_remove(spa->spa_meta_objset, 521 spa->spa_pool_props_object, 522 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 523 spa->spa_bootfs = 0; 524 } 525 } 526 527 /* 528 * ========================================================================== 529 * SPA state manipulation (open/create/destroy/import/export) 530 * ========================================================================== 531 */ 532 533 static int 534 spa_error_entry_compare(const void *a, const void *b) 535 { 536 spa_error_entry_t *sa = (spa_error_entry_t *)a; 537 spa_error_entry_t *sb = (spa_error_entry_t *)b; 538 int ret; 539 540 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 541 sizeof (zbookmark_t)); 542 543 if (ret < 0) 544 return (-1); 545 else if (ret > 0) 546 return (1); 547 else 548 return (0); 549 } 550 551 /* 552 * Utility function which retrieves copies of the current logs and 553 * re-initializes them in the process. 554 */ 555 void 556 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 557 { 558 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 559 560 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 561 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 562 563 avl_create(&spa->spa_errlist_scrub, 564 spa_error_entry_compare, sizeof (spa_error_entry_t), 565 offsetof(spa_error_entry_t, se_avl)); 566 avl_create(&spa->spa_errlist_last, 567 spa_error_entry_compare, sizeof (spa_error_entry_t), 568 offsetof(spa_error_entry_t, se_avl)); 569 } 570 571 /* 572 * Activate an uninitialized pool. 
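 * This sets up the in-core machinery needed before the pool can be used:
 * the normal and log metaslab classes, one taskq per (zio type, taskq type)
 * pair sized from the zio_taskqs table above, the dirty config/state vdev
 * lists, the per-txg vdev list, and the two error-list AVL trees.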
573 */ 574 static void 575 spa_activate(spa_t *spa, int mode) 576 { 577 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 578 579 spa->spa_state = POOL_STATE_ACTIVE; 580 spa->spa_mode = mode; 581 582 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 583 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 584 585 for (int t = 0; t < ZIO_TYPES; t++) { 586 const zio_taskq_info_t *ztip = &zio_taskqs[t]; 587 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 588 enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; 589 uint_t value = ztip->zti_nthreads[q].zti_value; 590 char name[32]; 591 592 (void) snprintf(name, sizeof (name), 593 "%s_%s", ztip->zti_name, zio_taskq_types[q]); 594 595 if (mode == zti_mode_tune) { 596 mode = zio_taskq_tune_mode; 597 value = zio_taskq_tune_value; 598 if (mode == zti_mode_tune) 599 mode = zti_mode_online_percent; 600 } 601 602 switch (mode) { 603 case zti_mode_fixed: 604 ASSERT3U(value, >=, 1); 605 value = MAX(value, 1); 606 607 spa->spa_zio_taskq[t][q] = taskq_create(name, 608 value, maxclsyspri, 50, INT_MAX, 609 TASKQ_PREPOPULATE); 610 break; 611 612 case zti_mode_online_percent: 613 spa->spa_zio_taskq[t][q] = taskq_create(name, 614 value, maxclsyspri, 50, INT_MAX, 615 TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); 616 break; 617 618 case zti_mode_tune: 619 default: 620 panic("unrecognized mode for " 621 "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " 622 "in spa_activate()", 623 t, q, mode, value); 624 break; 625 } 626 } 627 } 628 629 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 630 offsetof(vdev_t, vdev_config_dirty_node)); 631 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 632 offsetof(vdev_t, vdev_state_dirty_node)); 633 634 txg_list_create(&spa->spa_vdev_txg_list, 635 offsetof(struct vdev, vdev_txg_node)); 636 637 avl_create(&spa->spa_errlist_scrub, 638 spa_error_entry_compare, sizeof (spa_error_entry_t), 639 offsetof(spa_error_entry_t, se_avl)); 640 avl_create(&spa->spa_errlist_last, 641 spa_error_entry_compare, sizeof (spa_error_entry_t), 642 offsetof(spa_error_entry_t, se_avl)); 643 } 644 645 /* 646 * Opposite of spa_activate(). 647 */ 648 static void 649 spa_deactivate(spa_t *spa) 650 { 651 ASSERT(spa->spa_sync_on == B_FALSE); 652 ASSERT(spa->spa_dsl_pool == NULL); 653 ASSERT(spa->spa_root_vdev == NULL); 654 ASSERT(spa->spa_async_zio_root == NULL); 655 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 656 657 txg_list_destroy(&spa->spa_vdev_txg_list); 658 659 list_destroy(&spa->spa_config_dirty_list); 660 list_destroy(&spa->spa_state_dirty_list); 661 662 for (int t = 0; t < ZIO_TYPES; t++) { 663 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 664 taskq_destroy(spa->spa_zio_taskq[t][q]); 665 spa->spa_zio_taskq[t][q] = NULL; 666 } 667 } 668 669 metaslab_class_destroy(spa->spa_normal_class); 670 spa->spa_normal_class = NULL; 671 672 metaslab_class_destroy(spa->spa_log_class); 673 spa->spa_log_class = NULL; 674 675 /* 676 * If this was part of an import or the open otherwise failed, we may 677 * still have errors left in the queues. Empty them just in case. 678 */ 679 spa_errlog_drain(spa); 680 681 avl_destroy(&spa->spa_errlist_scrub); 682 avl_destroy(&spa->spa_errlist_last); 683 684 spa->spa_state = POOL_STATE_UNINITIALIZED; 685 } 686 687 /* 688 * Verify a pool configuration, and construct the vdev tree appropriately. This 689 * will create all the necessary vdevs in the appropriate layout, with each vdev 690 * in the CLOSED state. This will prep the pool before open/creation/import. 
691 * All vdev validation is done by the vdev_alloc() routine. 692 */ 693 static int 694 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 695 uint_t id, int atype) 696 { 697 nvlist_t **child; 698 uint_t children; 699 int error; 700 701 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 702 return (error); 703 704 if ((*vdp)->vdev_ops->vdev_op_leaf) 705 return (0); 706 707 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 708 &child, &children); 709 710 if (error == ENOENT) 711 return (0); 712 713 if (error) { 714 vdev_free(*vdp); 715 *vdp = NULL; 716 return (EINVAL); 717 } 718 719 for (int c = 0; c < children; c++) { 720 vdev_t *vd; 721 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 722 atype)) != 0) { 723 vdev_free(*vdp); 724 *vdp = NULL; 725 return (error); 726 } 727 } 728 729 ASSERT(*vdp != NULL); 730 731 return (0); 732 } 733 734 /* 735 * Opposite of spa_load(). 736 */ 737 static void 738 spa_unload(spa_t *spa) 739 { 740 int i; 741 742 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 743 744 /* 745 * Stop async tasks. 746 */ 747 spa_async_suspend(spa); 748 749 /* 750 * Stop syncing. 751 */ 752 if (spa->spa_sync_on) { 753 txg_sync_stop(spa->spa_dsl_pool); 754 spa->spa_sync_on = B_FALSE; 755 } 756 757 /* 758 * Wait for any outstanding async I/O to complete. 759 */ 760 if (spa->spa_async_zio_root != NULL) { 761 (void) zio_wait(spa->spa_async_zio_root); 762 spa->spa_async_zio_root = NULL; 763 } 764 765 /* 766 * Close the dsl pool. 767 */ 768 if (spa->spa_dsl_pool) { 769 dsl_pool_close(spa->spa_dsl_pool); 770 spa->spa_dsl_pool = NULL; 771 } 772 773 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 774 775 /* 776 * Drop and purge level 2 cache 777 */ 778 spa_l2cache_drop(spa); 779 780 /* 781 * Close all vdevs. 782 */ 783 if (spa->spa_root_vdev) 784 vdev_free(spa->spa_root_vdev); 785 ASSERT(spa->spa_root_vdev == NULL); 786 787 for (i = 0; i < spa->spa_spares.sav_count; i++) 788 vdev_free(spa->spa_spares.sav_vdevs[i]); 789 if (spa->spa_spares.sav_vdevs) { 790 kmem_free(spa->spa_spares.sav_vdevs, 791 spa->spa_spares.sav_count * sizeof (void *)); 792 spa->spa_spares.sav_vdevs = NULL; 793 } 794 if (spa->spa_spares.sav_config) { 795 nvlist_free(spa->spa_spares.sav_config); 796 spa->spa_spares.sav_config = NULL; 797 } 798 spa->spa_spares.sav_count = 0; 799 800 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 801 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 802 if (spa->spa_l2cache.sav_vdevs) { 803 kmem_free(spa->spa_l2cache.sav_vdevs, 804 spa->spa_l2cache.sav_count * sizeof (void *)); 805 spa->spa_l2cache.sav_vdevs = NULL; 806 } 807 if (spa->spa_l2cache.sav_config) { 808 nvlist_free(spa->spa_l2cache.sav_config); 809 spa->spa_l2cache.sav_config = NULL; 810 } 811 spa->spa_l2cache.sav_count = 0; 812 813 spa->spa_async_suspended = 0; 814 815 spa_config_exit(spa, SCL_ALL, FTAG); 816 } 817 818 /* 819 * Load (or re-load) the current list of vdevs describing the active spares for 820 * this pool. When this is called, we have some form of basic information in 821 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 822 * then re-generate a more complete list including status information. 823 */ 824 static void 825 spa_load_spares(spa_t *spa) 826 { 827 nvlist_t **spares; 828 uint_t nspares; 829 int i; 830 vdev_t *vd, *tvd; 831 832 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 833 834 /* 835 * First, close and free any existing spare vdevs. 
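 * If a spare is currently spared into the active vdev tree, the matching
 * active vdev's spare registration is dropped (spa_spare_remove()) before
 * the validation-only copy is closed and freed.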
 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_spare_add() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.
We parse this into vdevs, try to open them, and 937 * then re-generate a more complete list including status information. 938 * Devices which are already active have their details maintained, and are 939 * not re-opened. 940 */ 941 static void 942 spa_load_l2cache(spa_t *spa) 943 { 944 nvlist_t **l2cache; 945 uint_t nl2cache; 946 int i, j, oldnvdevs; 947 uint64_t guid; 948 vdev_t *vd, **oldvdevs, **newvdevs; 949 spa_aux_vdev_t *sav = &spa->spa_l2cache; 950 951 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 952 953 if (sav->sav_config != NULL) { 954 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 955 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 956 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 957 } else { 958 nl2cache = 0; 959 } 960 961 oldvdevs = sav->sav_vdevs; 962 oldnvdevs = sav->sav_count; 963 sav->sav_vdevs = NULL; 964 sav->sav_count = 0; 965 966 /* 967 * Process new nvlist of vdevs. 968 */ 969 for (i = 0; i < nl2cache; i++) { 970 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 971 &guid) == 0); 972 973 newvdevs[i] = NULL; 974 for (j = 0; j < oldnvdevs; j++) { 975 vd = oldvdevs[j]; 976 if (vd != NULL && guid == vd->vdev_guid) { 977 /* 978 * Retain previous vdev for add/remove ops. 979 */ 980 newvdevs[i] = vd; 981 oldvdevs[j] = NULL; 982 break; 983 } 984 } 985 986 if (newvdevs[i] == NULL) { 987 /* 988 * Create new vdev 989 */ 990 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 991 VDEV_ALLOC_L2CACHE) == 0); 992 ASSERT(vd != NULL); 993 newvdevs[i] = vd; 994 995 /* 996 * Commit this vdev as an l2cache device, 997 * even if it fails to open. 998 */ 999 spa_l2cache_add(vd); 1000 1001 vd->vdev_top = vd; 1002 vd->vdev_aux = sav; 1003 1004 spa_l2cache_activate(vd); 1005 1006 if (vdev_open(vd) != 0) 1007 continue; 1008 1009 (void) vdev_validate_aux(vd); 1010 1011 if (!vdev_is_dead(vd)) 1012 l2arc_add_vdev(spa, vd); 1013 } 1014 } 1015 1016 /* 1017 * Purge vdevs that were dropped 1018 */ 1019 for (i = 0; i < oldnvdevs; i++) { 1020 uint64_t pool; 1021 1022 vd = oldvdevs[i]; 1023 if (vd != NULL) { 1024 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1025 pool != 0ULL && l2arc_vdev_present(vd)) 1026 l2arc_remove_vdev(vd); 1027 (void) vdev_close(vd); 1028 spa_l2cache_remove(vd); 1029 } 1030 } 1031 1032 if (oldvdevs) 1033 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1034 1035 if (sav->sav_config == NULL) 1036 goto out; 1037 1038 sav->sav_vdevs = newvdevs; 1039 sav->sav_count = (int)nl2cache; 1040 1041 /* 1042 * Recompute the stashed list of l2cache devices, with status 1043 * information this time. 
1044 */ 1045 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1046 DATA_TYPE_NVLIST_ARRAY) == 0); 1047 1048 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1049 for (i = 0; i < sav->sav_count; i++) 1050 l2cache[i] = vdev_config_generate(spa, 1051 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 1052 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1053 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1054 out: 1055 for (i = 0; i < sav->sav_count; i++) 1056 nvlist_free(l2cache[i]); 1057 if (sav->sav_count) 1058 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1059 } 1060 1061 static int 1062 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1063 { 1064 dmu_buf_t *db; 1065 char *packed = NULL; 1066 size_t nvsize = 0; 1067 int error; 1068 *value = NULL; 1069 1070 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1071 nvsize = *(uint64_t *)db->db_data; 1072 dmu_buf_rele(db, FTAG); 1073 1074 packed = kmem_alloc(nvsize, KM_SLEEP); 1075 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1076 DMU_READ_PREFETCH); 1077 if (error == 0) 1078 error = nvlist_unpack(packed, nvsize, value, 0); 1079 kmem_free(packed, nvsize); 1080 1081 return (error); 1082 } 1083 1084 /* 1085 * Checks to see if the given vdev could not be opened, in which case we post a 1086 * sysevent to notify the autoreplace code that the device has been removed. 1087 */ 1088 static void 1089 spa_check_removed(vdev_t *vd) 1090 { 1091 for (int c = 0; c < vd->vdev_children; c++) 1092 spa_check_removed(vd->vdev_child[c]); 1093 1094 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1095 zfs_post_autoreplace(vd->vdev_spa, vd); 1096 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1097 } 1098 } 1099 1100 /* 1101 * Load the slog device state from the config object since it's possible 1102 * that the label does not contain the most up-to-date information. 1103 */ 1104 void 1105 spa_load_log_state(spa_t *spa, nvlist_t *nv) 1106 { 1107 vdev_t *ovd, *rvd = spa->spa_root_vdev; 1108 1109 /* 1110 * Load the original root vdev tree from the passed config. 1111 */ 1112 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1113 VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1114 1115 for (int c = 0; c < rvd->vdev_children; c++) { 1116 vdev_t *cvd = rvd->vdev_child[c]; 1117 if (cvd->vdev_islog) 1118 vdev_load_log_state(cvd, ovd->vdev_child[c]); 1119 } 1120 vdev_free(ovd); 1121 spa_config_exit(spa, SCL_ALL, FTAG); 1122 } 1123 1124 /* 1125 * Check for missing log devices 1126 */ 1127 int 1128 spa_check_logs(spa_t *spa) 1129 { 1130 switch (spa->spa_log_state) { 1131 case SPA_LOG_MISSING: 1132 /* need to recheck in case slog has been restored */ 1133 case SPA_LOG_UNKNOWN: 1134 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1135 DS_FIND_CHILDREN)) { 1136 spa->spa_log_state = SPA_LOG_MISSING; 1137 return (1); 1138 } 1139 break; 1140 } 1141 return (0); 1142 } 1143 1144 static void 1145 spa_aux_check_removed(spa_aux_vdev_t *sav) 1146 { 1147 int i; 1148 1149 for (i = 0; i < sav->sav_count; i++) 1150 spa_check_removed(sav->sav_vdevs[i]); 1151 } 1152 1153 /* 1154 * Load an existing storage pool, using the pool's builtin spa_config as a 1155 * source of configuration information. 
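 * Loading proceeds roughly as follows: parse and open the vdev tree, validate
 * the labels, pick the best uberblock, open the DSL pool, read the MOS
 * directory objects (config, deflate flag, error logs, history, spares,
 * l2cache, properties), and, if the pool is writable, claim outstanding ZIL
 * blocks and start the sync thread.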
1156 */ 1157 static int 1158 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 1159 { 1160 int error = 0; 1161 nvlist_t *nvconfig, *nvroot = NULL; 1162 vdev_t *rvd; 1163 uberblock_t *ub = &spa->spa_uberblock; 1164 uint64_t config_cache_txg = spa->spa_config_txg; 1165 uint64_t pool_guid; 1166 uint64_t version; 1167 uint64_t autoreplace = 0; 1168 int orig_mode = spa->spa_mode; 1169 char *ereport = FM_EREPORT_ZFS_POOL; 1170 1171 /* 1172 * If this is an untrusted config, access the pool in read-only mode. 1173 * This prevents things like resilvering recently removed devices. 1174 */ 1175 if (!mosconfig) 1176 spa->spa_mode = FREAD; 1177 1178 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1179 1180 spa->spa_load_state = state; 1181 1182 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 1183 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 1184 error = EINVAL; 1185 goto out; 1186 } 1187 1188 /* 1189 * Versioning wasn't explicitly added to the label until later, so if 1190 * it's not present treat it as the initial version. 1191 */ 1192 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 1193 version = SPA_VERSION_INITIAL; 1194 1195 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1196 &spa->spa_config_txg); 1197 1198 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1199 spa_guid_exists(pool_guid, 0)) { 1200 error = EEXIST; 1201 goto out; 1202 } 1203 1204 spa->spa_load_guid = pool_guid; 1205 1206 /* 1207 * Create "The Godfather" zio to hold all async IOs 1208 */ 1209 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1210 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1211 1212 /* 1213 * Parse the configuration into a vdev tree. We explicitly set the 1214 * value that will be returned by spa_version() since parsing the 1215 * configuration requires knowing the version number. 1216 */ 1217 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1218 spa->spa_ubsync.ub_version = version; 1219 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1220 spa_config_exit(spa, SCL_ALL, FTAG); 1221 1222 if (error != 0) 1223 goto out; 1224 1225 ASSERT(spa->spa_root_vdev == rvd); 1226 ASSERT(spa_guid(spa) == pool_guid); 1227 1228 /* 1229 * Try to open all vdevs, loading each label in the process. 1230 */ 1231 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1232 error = vdev_open(rvd); 1233 spa_config_exit(spa, SCL_ALL, FTAG); 1234 if (error != 0) 1235 goto out; 1236 1237 /* 1238 * We need to validate the vdev labels against the configuration that 1239 * we have in hand, which is dependent on the setting of mosconfig. If 1240 * mosconfig is true then we're validating the vdev labels based on 1241 * that config. Otherwise, we're validating against the cached config 1242 * (zpool.cache) that was read when we loaded the zfs module, and then 1243 * later we will recursively call spa_load() and validate against 1244 * the vdev config. 1245 */ 1246 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1247 error = vdev_validate(rvd); 1248 spa_config_exit(spa, SCL_ALL, FTAG); 1249 if (error != 0) 1250 goto out; 1251 1252 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1253 error = ENXIO; 1254 goto out; 1255 } 1256 1257 /* 1258 * Find the best uberblock. 1259 */ 1260 vdev_uberblock_load(NULL, rvd, ub); 1261 1262 /* 1263 * If we weren't able to find a single valid uberblock, return failure. 
1264 */ 1265 if (ub->ub_txg == 0) { 1266 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1267 VDEV_AUX_CORRUPT_DATA); 1268 error = ENXIO; 1269 goto out; 1270 } 1271 1272 /* 1273 * If the pool is newer than the code, we can't open it. 1274 */ 1275 if (ub->ub_version > SPA_VERSION) { 1276 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1277 VDEV_AUX_VERSION_NEWER); 1278 error = ENOTSUP; 1279 goto out; 1280 } 1281 1282 /* 1283 * If the vdev guid sum doesn't match the uberblock, we have an 1284 * incomplete configuration. 1285 */ 1286 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1287 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1288 VDEV_AUX_BAD_GUID_SUM); 1289 error = ENXIO; 1290 goto out; 1291 } 1292 1293 /* 1294 * Initialize internal SPA structures. 1295 */ 1296 spa->spa_state = POOL_STATE_ACTIVE; 1297 spa->spa_ubsync = spa->spa_uberblock; 1298 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1299 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1300 if (error) { 1301 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1302 VDEV_AUX_CORRUPT_DATA); 1303 goto out; 1304 } 1305 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1306 1307 if (zap_lookup(spa->spa_meta_objset, 1308 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1309 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1310 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1311 VDEV_AUX_CORRUPT_DATA); 1312 error = EIO; 1313 goto out; 1314 } 1315 1316 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) { 1317 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1318 VDEV_AUX_CORRUPT_DATA); 1319 error = EIO; 1320 goto out; 1321 } 1322 1323 if (!mosconfig) { 1324 uint64_t hostid; 1325 1326 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 1327 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1328 char *hostname; 1329 unsigned long myhostid = 0; 1330 1331 VERIFY(nvlist_lookup_string(nvconfig, 1332 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1333 1334 #ifdef _KERNEL 1335 myhostid = zone_get_hostid(NULL); 1336 #else /* _KERNEL */ 1337 /* 1338 * We're emulating the system's hostid in userland, so 1339 * we can't use zone_get_hostid(). 1340 */ 1341 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1342 #endif /* _KERNEL */ 1343 if (hostid != 0 && myhostid != 0 && 1344 hostid != myhostid) { 1345 cmn_err(CE_WARN, "pool '%s' could not be " 1346 "loaded as it was last accessed by " 1347 "another system (host: %s hostid: 0x%lx). " 1348 "See: http://www.sun.com/msg/ZFS-8000-EY", 1349 spa_name(spa), hostname, 1350 (unsigned long)hostid); 1351 error = EBADF; 1352 goto out; 1353 } 1354 } 1355 1356 spa_config_set(spa, nvconfig); 1357 spa_unload(spa); 1358 spa_deactivate(spa); 1359 spa_activate(spa, orig_mode); 1360 1361 return (spa_load(spa, nvconfig, state, B_TRUE)); 1362 } 1363 1364 if (zap_lookup(spa->spa_meta_objset, 1365 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1366 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1367 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1368 VDEV_AUX_CORRUPT_DATA); 1369 error = EIO; 1370 goto out; 1371 } 1372 1373 /* 1374 * Load the bit that tells us to use the new accounting function 1375 * (raid-z deflation). If we have an older pool, this will not 1376 * be present. 
1377 */ 1378 error = zap_lookup(spa->spa_meta_objset, 1379 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1380 sizeof (uint64_t), 1, &spa->spa_deflate); 1381 if (error != 0 && error != ENOENT) { 1382 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1383 VDEV_AUX_CORRUPT_DATA); 1384 error = EIO; 1385 goto out; 1386 } 1387 1388 /* 1389 * Load the persistent error log. If we have an older pool, this will 1390 * not be present. 1391 */ 1392 error = zap_lookup(spa->spa_meta_objset, 1393 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1394 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1395 if (error != 0 && error != ENOENT) { 1396 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1397 VDEV_AUX_CORRUPT_DATA); 1398 error = EIO; 1399 goto out; 1400 } 1401 1402 error = zap_lookup(spa->spa_meta_objset, 1403 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1404 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1405 if (error != 0 && error != ENOENT) { 1406 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1407 VDEV_AUX_CORRUPT_DATA); 1408 error = EIO; 1409 goto out; 1410 } 1411 1412 /* 1413 * Load the history object. If we have an older pool, this 1414 * will not be present. 1415 */ 1416 error = zap_lookup(spa->spa_meta_objset, 1417 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1418 sizeof (uint64_t), 1, &spa->spa_history); 1419 if (error != 0 && error != ENOENT) { 1420 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1421 VDEV_AUX_CORRUPT_DATA); 1422 error = EIO; 1423 goto out; 1424 } 1425 1426 /* 1427 * Load any hot spares for this pool. 1428 */ 1429 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1430 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1431 if (error != 0 && error != ENOENT) { 1432 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1433 VDEV_AUX_CORRUPT_DATA); 1434 error = EIO; 1435 goto out; 1436 } 1437 if (error == 0) { 1438 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1439 if (load_nvlist(spa, spa->spa_spares.sav_object, 1440 &spa->spa_spares.sav_config) != 0) { 1441 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1442 VDEV_AUX_CORRUPT_DATA); 1443 error = EIO; 1444 goto out; 1445 } 1446 1447 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1448 spa_load_spares(spa); 1449 spa_config_exit(spa, SCL_ALL, FTAG); 1450 } 1451 1452 /* 1453 * Load any level 2 ARC devices for this pool. 
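	 * As with the spares above, the device list is kept as a packed
	 * nvlist referenced from the MOS directory (DMU_POOL_L2CACHE) and is
	 * parsed by spa_load_l2cache().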
1454 */ 1455 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1456 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1457 &spa->spa_l2cache.sav_object); 1458 if (error != 0 && error != ENOENT) { 1459 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1460 VDEV_AUX_CORRUPT_DATA); 1461 error = EIO; 1462 goto out; 1463 } 1464 if (error == 0) { 1465 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1466 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1467 &spa->spa_l2cache.sav_config) != 0) { 1468 vdev_set_state(rvd, B_TRUE, 1469 VDEV_STATE_CANT_OPEN, 1470 VDEV_AUX_CORRUPT_DATA); 1471 error = EIO; 1472 goto out; 1473 } 1474 1475 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1476 spa_load_l2cache(spa); 1477 spa_config_exit(spa, SCL_ALL, FTAG); 1478 } 1479 1480 VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, 1481 &nvroot) == 0); 1482 spa_load_log_state(spa, nvroot); 1483 nvlist_free(nvconfig); 1484 1485 if (spa_check_logs(spa)) { 1486 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1487 VDEV_AUX_BAD_LOG); 1488 error = ENXIO; 1489 ereport = FM_EREPORT_ZFS_LOG_REPLAY; 1490 goto out; 1491 } 1492 1493 1494 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1495 1496 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1497 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1498 1499 if (error && error != ENOENT) { 1500 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1501 VDEV_AUX_CORRUPT_DATA); 1502 error = EIO; 1503 goto out; 1504 } 1505 1506 if (error == 0) { 1507 (void) zap_lookup(spa->spa_meta_objset, 1508 spa->spa_pool_props_object, 1509 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1510 sizeof (uint64_t), 1, &spa->spa_bootfs); 1511 (void) zap_lookup(spa->spa_meta_objset, 1512 spa->spa_pool_props_object, 1513 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1514 sizeof (uint64_t), 1, &autoreplace); 1515 spa->spa_autoreplace = (autoreplace != 0); 1516 (void) zap_lookup(spa->spa_meta_objset, 1517 spa->spa_pool_props_object, 1518 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1519 sizeof (uint64_t), 1, &spa->spa_delegation); 1520 (void) zap_lookup(spa->spa_meta_objset, 1521 spa->spa_pool_props_object, 1522 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1523 sizeof (uint64_t), 1, &spa->spa_failmode); 1524 (void) zap_lookup(spa->spa_meta_objset, 1525 spa->spa_pool_props_object, 1526 zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1527 sizeof (uint64_t), 1, &spa->spa_autoexpand); 1528 } 1529 1530 /* 1531 * If the 'autoreplace' property is set, then post a resource notifying 1532 * the ZFS DE that it should not issue any faults for unopenable 1533 * devices. We also iterate over the vdevs, and post a sysevent for any 1534 * unopenable vdevs so that the normal autoreplace handler can take 1535 * over. 1536 */ 1537 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 1538 spa_check_removed(spa->spa_root_vdev); 1539 /* 1540 * For the import case, this is done in spa_import(), because 1541 * at this point we're using the spare definitions from 1542 * the MOS config, not necessarily from the userland config. 1543 */ 1544 if (state != SPA_LOAD_IMPORT) { 1545 spa_aux_check_removed(&spa->spa_spares); 1546 spa_aux_check_removed(&spa->spa_l2cache); 1547 } 1548 } 1549 1550 /* 1551 * Load the vdev state for all toplevel vdevs. 1552 */ 1553 vdev_load(rvd); 1554 1555 /* 1556 * Propagate the leaf DTLs we just loaded all the way up the tree. 
 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if (spa_writeable(spa)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_log_state = SPA_LOG_GOOD;
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If spa_load_verbatim is true, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time as opening the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), then one of the vdevs indicates that the
			 * pool has been exported or destroyed.  If this is
			 * the case, the config cache is out of sync and we
			 * should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			spa->spa_last_open_failed = B_FALSE;
		}
	}

	spa_open_ref(spa, tag);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
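 * Spares that are currently in use by this or another pool are reported with
 * state VDEV_STATE_CANT_OPEN and aux VDEV_AUX_SPARED so that userland tools
 * can report them accordingly.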
1777 */ 1778 static void 1779 spa_add_spares(spa_t *spa, nvlist_t *config) 1780 { 1781 nvlist_t **spares; 1782 uint_t i, nspares; 1783 nvlist_t *nvroot; 1784 uint64_t guid; 1785 vdev_stat_t *vs; 1786 uint_t vsc; 1787 uint64_t pool; 1788 1789 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1790 1791 if (spa->spa_spares.sav_count == 0) 1792 return; 1793 1794 VERIFY(nvlist_lookup_nvlist(config, 1795 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1796 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1797 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1798 if (nspares != 0) { 1799 VERIFY(nvlist_add_nvlist_array(nvroot, 1800 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1801 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1802 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1803 1804 /* 1805 * Go through and find any spares which have since been 1806 * repurposed as an active spare. If this is the case, update 1807 * their status appropriately. 1808 */ 1809 for (i = 0; i < nspares; i++) { 1810 VERIFY(nvlist_lookup_uint64(spares[i], 1811 ZPOOL_CONFIG_GUID, &guid) == 0); 1812 if (spa_spare_exists(guid, &pool, NULL) && 1813 pool != 0ULL) { 1814 VERIFY(nvlist_lookup_uint64_array( 1815 spares[i], ZPOOL_CONFIG_STATS, 1816 (uint64_t **)&vs, &vsc) == 0); 1817 vs->vs_state = VDEV_STATE_CANT_OPEN; 1818 vs->vs_aux = VDEV_AUX_SPARED; 1819 } 1820 } 1821 } 1822 } 1823 1824 /* 1825 * Add l2cache device information to the nvlist, including vdev stats. 1826 */ 1827 static void 1828 spa_add_l2cache(spa_t *spa, nvlist_t *config) 1829 { 1830 nvlist_t **l2cache; 1831 uint_t i, j, nl2cache; 1832 nvlist_t *nvroot; 1833 uint64_t guid; 1834 vdev_t *vd; 1835 vdev_stat_t *vs; 1836 uint_t vsc; 1837 1838 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1839 1840 if (spa->spa_l2cache.sav_count == 0) 1841 return; 1842 1843 VERIFY(nvlist_lookup_nvlist(config, 1844 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1845 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1846 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1847 if (nl2cache != 0) { 1848 VERIFY(nvlist_add_nvlist_array(nvroot, 1849 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1850 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1851 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1852 1853 /* 1854 * Update level 2 cache device stats. 1855 */ 1856 1857 for (i = 0; i < nl2cache; i++) { 1858 VERIFY(nvlist_lookup_uint64(l2cache[i], 1859 ZPOOL_CONFIG_GUID, &guid) == 0); 1860 1861 vd = NULL; 1862 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1863 if (guid == 1864 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1865 vd = spa->spa_l2cache.sav_vdevs[j]; 1866 break; 1867 } 1868 } 1869 ASSERT(vd != NULL); 1870 1871 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1872 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1873 vdev_get_stats(vd, vs); 1874 } 1875 } 1876 } 1877 1878 int 1879 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1880 { 1881 int error; 1882 spa_t *spa; 1883 1884 *config = NULL; 1885 error = spa_open_common(name, &spa, FTAG, config); 1886 1887 if (spa != NULL) { 1888 /* 1889 * This still leaves a window of inconsistency where the spares 1890 * or l2cache devices could change and the config would be 1891 * self-inconsistent. 
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}

/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
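		 * (No existing config: allocate a fresh nvlist and store the
		 * devs array directly.)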
2079 */ 2080 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2081 KM_SLEEP) == 0); 2082 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2083 devs, ndevs) == 0); 2084 } 2085 } 2086 2087 /* 2088 * Stop and drop level 2 ARC devices 2089 */ 2090 void 2091 spa_l2cache_drop(spa_t *spa) 2092 { 2093 vdev_t *vd; 2094 int i; 2095 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2096 2097 for (i = 0; i < sav->sav_count; i++) { 2098 uint64_t pool; 2099 2100 vd = sav->sav_vdevs[i]; 2101 ASSERT(vd != NULL); 2102 2103 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2104 pool != 0ULL && l2arc_vdev_present(vd)) 2105 l2arc_remove_vdev(vd); 2106 if (vd->vdev_isl2cache) 2107 spa_l2cache_remove(vd); 2108 vdev_clear_stats(vd); 2109 (void) vdev_close(vd); 2110 } 2111 } 2112 2113 /* 2114 * Pool Creation 2115 */ 2116 int 2117 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2118 const char *history_str, nvlist_t *zplprops) 2119 { 2120 spa_t *spa; 2121 char *altroot = NULL; 2122 vdev_t *rvd; 2123 dsl_pool_t *dp; 2124 dmu_tx_t *tx; 2125 int error = 0; 2126 uint64_t txg = TXG_INITIAL; 2127 nvlist_t **spares, **l2cache; 2128 uint_t nspares, nl2cache; 2129 uint64_t version; 2130 2131 /* 2132 * If this pool already exists, return failure. 2133 */ 2134 mutex_enter(&spa_namespace_lock); 2135 if (spa_lookup(pool) != NULL) { 2136 mutex_exit(&spa_namespace_lock); 2137 return (EEXIST); 2138 } 2139 2140 /* 2141 * Allocate a new spa_t structure. 2142 */ 2143 (void) nvlist_lookup_string(props, 2144 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2145 spa = spa_add(pool, altroot); 2146 spa_activate(spa, spa_mode_global); 2147 2148 spa->spa_uberblock.ub_txg = txg - 1; 2149 2150 if (props && (error = spa_prop_validate(spa, props))) { 2151 spa_deactivate(spa); 2152 spa_remove(spa); 2153 mutex_exit(&spa_namespace_lock); 2154 return (error); 2155 } 2156 2157 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2158 &version) != 0) 2159 version = SPA_VERSION; 2160 ASSERT(version <= SPA_VERSION); 2161 spa->spa_uberblock.ub_version = version; 2162 spa->spa_ubsync = spa->spa_uberblock; 2163 2164 /* 2165 * Create "The Godfather" zio to hold all async IOs 2166 */ 2167 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2168 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2169 2170 /* 2171 * Create the root vdev. 2172 */ 2173 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2174 2175 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2176 2177 ASSERT(error != 0 || rvd != NULL); 2178 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2179 2180 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2181 error = EINVAL; 2182 2183 if (error == 0 && 2184 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2185 (error = spa_validate_aux(spa, nvroot, txg, 2186 VDEV_ALLOC_ADD)) == 0) { 2187 for (int c = 0; c < rvd->vdev_children; c++) { 2188 vdev_metaslab_set_size(rvd->vdev_child[c]); 2189 vdev_expand(rvd->vdev_child[c], txg); 2190 } 2191 } 2192 2193 spa_config_exit(spa, SCL_ALL, FTAG); 2194 2195 if (error != 0) { 2196 spa_unload(spa); 2197 spa_deactivate(spa); 2198 spa_remove(spa); 2199 mutex_exit(&spa_namespace_lock); 2200 return (error); 2201 } 2202 2203 /* 2204 * Get the list of spares, if specified. 
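	 * For reference, the spares entry in 'nvroot' is an array of
	 * leaf-vdev nvlists, normally assembled by userland.  A rough
	 * illustrative sketch of one such entry (the variable name and the
	 * device path below are made up):
	 *
	 *	nvlist_t *spare_nv;
	 *	VERIFY(nvlist_alloc(&spare_nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	 *	VERIFY(nvlist_add_string(spare_nv, ZPOOL_CONFIG_TYPE,
	 *	    VDEV_TYPE_DISK) == 0);
	 *	VERIFY(nvlist_add_string(spare_nv, ZPOOL_CONFIG_PATH,
	 *	    "/dev/dsk/c1t2d0s0") == 0);
	 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	 *	    &spare_nv, 1) == 0);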
2205 */ 2206 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2207 &spares, &nspares) == 0) { 2208 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2209 KM_SLEEP) == 0); 2210 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2211 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2212 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2213 spa_load_spares(spa); 2214 spa_config_exit(spa, SCL_ALL, FTAG); 2215 spa->spa_spares.sav_sync = B_TRUE; 2216 } 2217 2218 /* 2219 * Get the list of level 2 cache devices, if specified. 2220 */ 2221 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2222 &l2cache, &nl2cache) == 0) { 2223 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2224 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2225 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2226 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2227 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2228 spa_load_l2cache(spa); 2229 spa_config_exit(spa, SCL_ALL, FTAG); 2230 spa->spa_l2cache.sav_sync = B_TRUE; 2231 } 2232 2233 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2234 spa->spa_meta_objset = dp->dp_meta_objset; 2235 2236 tx = dmu_tx_create_assigned(dp, txg); 2237 2238 /* 2239 * Create the pool config object. 2240 */ 2241 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2242 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2243 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2244 2245 if (zap_add(spa->spa_meta_objset, 2246 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2247 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2248 cmn_err(CE_PANIC, "failed to add pool config"); 2249 } 2250 2251 /* Newly created pools with the right version are always deflated. */ 2252 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2253 spa->spa_deflate = TRUE; 2254 if (zap_add(spa->spa_meta_objset, 2255 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2256 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2257 cmn_err(CE_PANIC, "failed to add deflate"); 2258 } 2259 } 2260 2261 /* 2262 * Create the deferred-free bplist object. Turn off compression 2263 * because sync-to-convergence takes longer if the blocksize 2264 * keeps changing. 2265 */ 2266 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2267 1 << 14, tx); 2268 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2269 ZIO_COMPRESS_OFF, tx); 2270 2271 if (zap_add(spa->spa_meta_objset, 2272 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2273 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2274 cmn_err(CE_PANIC, "failed to add bplist"); 2275 } 2276 2277 /* 2278 * Create the pool's history object. 2279 */ 2280 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2281 spa_history_create_obj(spa, tx); 2282 2283 /* 2284 * Set pool properties. 2285 */ 2286 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2287 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2288 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2289 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2290 if (props != NULL) { 2291 spa_configfile_set(spa, props, B_FALSE); 2292 spa_sync_props(spa, props, CRED(), tx); 2293 } 2294 2295 dmu_tx_commit(tx); 2296 2297 spa->spa_sync_on = B_TRUE; 2298 txg_sync_start(spa->spa_dsl_pool); 2299 2300 /* 2301 * We explicitly wait for the first transaction to complete so that our 2302 * bean counters are appropriately updated. 
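	 * Since 'txg' is still TXG_INITIAL here, this blocks until the pool's
	 * very first transaction group is safely on disk; only then do we
	 * push the configuration to the cache and log the creation below.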
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
	spa_history_log_version(spa, LOG_POOL_CREATE);

	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

#ifdef _KERNEL
/*
 * Get the root pool information from the root disk, then import the root pool
 * at system boot time.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}

/*
 * Walk the vdev tree and see if we can find a device with "better"
 * configuration.  A configuration is "better" if the label on that
 * device has a more recent txg.
 */
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

	if (vd->vdev_ops->vdev_op_leaf) {
		nvlist_t *label;
		uint64_t label_txg;

		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}
}

/*
 * Import a root pool.
 *
 * For x86, devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For SPARC, devpath_list consists of the physpath name of the booting
 * device, no matter whether the root pool is a single-device or a mirrored
 * pool, e.g.
2408 * "/pci@1f,0/ide@d/disk@0,0:a" 2409 */ 2410 int 2411 spa_import_rootpool(char *devpath, char *devid) 2412 { 2413 spa_t *spa; 2414 vdev_t *rvd, *bvd, *avd = NULL; 2415 nvlist_t *config, *nvtop; 2416 uint64_t guid, txg; 2417 char *pname; 2418 int error; 2419 2420 /* 2421 * Read the label from the boot device and generate a configuration. 2422 */ 2423 if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) { 2424 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 2425 devpath); 2426 return (EIO); 2427 } 2428 2429 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 2430 &pname) == 0); 2431 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2432 2433 mutex_enter(&spa_namespace_lock); 2434 if ((spa = spa_lookup(pname)) != NULL) { 2435 /* 2436 * Remove the existing root pool from the namespace so that we 2437 * can replace it with the correct config we just read in. 2438 */ 2439 spa_remove(spa); 2440 } 2441 2442 spa = spa_add(pname, NULL); 2443 spa->spa_is_root = B_TRUE; 2444 spa->spa_load_verbatim = B_TRUE; 2445 2446 /* 2447 * Build up a vdev tree based on the boot device's label config. 2448 */ 2449 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2450 &nvtop) == 0); 2451 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2452 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 2453 VDEV_ALLOC_ROOTPOOL); 2454 spa_config_exit(spa, SCL_ALL, FTAG); 2455 if (error) { 2456 mutex_exit(&spa_namespace_lock); 2457 nvlist_free(config); 2458 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 2459 pname); 2460 return (error); 2461 } 2462 2463 /* 2464 * Get the boot vdev. 2465 */ 2466 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2467 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 2468 (u_longlong_t)guid); 2469 error = ENOENT; 2470 goto out; 2471 } 2472 2473 /* 2474 * Determine if there is a better boot device. 2475 */ 2476 avd = bvd; 2477 spa_alt_rootvdev(rvd, &avd, &txg); 2478 if (avd != bvd) { 2479 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 2480 "try booting from '%s'", avd->vdev_path); 2481 error = EINVAL; 2482 goto out; 2483 } 2484 2485 /* 2486 * If the boot device is part of a spare vdev then ensure that 2487 * we're booting off the active spare. 2488 */ 2489 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2490 !bvd->vdev_isspare) { 2491 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 2492 "try booting from '%s'", 2493 bvd->vdev_parent->vdev_child[1]->vdev_path); 2494 error = EINVAL; 2495 goto out; 2496 } 2497 2498 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2499 error = 0; 2500 spa_history_log_version(spa, LOG_POOL_IMPORT); 2501 out: 2502 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2503 vdev_free(rvd); 2504 spa_config_exit(spa, SCL_ALL, FTAG); 2505 mutex_exit(&spa_namespace_lock); 2506 2507 nvlist_free(config); 2508 return (error); 2509 } 2510 2511 #endif 2512 2513 /* 2514 * Take a pool and insert it into the namespace as if it had been loaded at 2515 * boot. 
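 * Unlike spa_import() below, no attempt is made to open or validate the
 * vdevs: the caller-supplied config is trusted as-is (spa_load_verbatim),
 * recorded as the pool's configuration, and pushed out to the config cache.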
2516 */ 2517 int 2518 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2519 { 2520 spa_t *spa; 2521 char *altroot = NULL; 2522 2523 mutex_enter(&spa_namespace_lock); 2524 if (spa_lookup(pool) != NULL) { 2525 mutex_exit(&spa_namespace_lock); 2526 return (EEXIST); 2527 } 2528 2529 (void) nvlist_lookup_string(props, 2530 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2531 spa = spa_add(pool, altroot); 2532 2533 spa->spa_load_verbatim = B_TRUE; 2534 2535 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2536 2537 if (props != NULL) 2538 spa_configfile_set(spa, props, B_FALSE); 2539 2540 spa_config_sync(spa, B_FALSE, B_TRUE); 2541 2542 mutex_exit(&spa_namespace_lock); 2543 spa_history_log_version(spa, LOG_POOL_IMPORT); 2544 2545 return (0); 2546 } 2547 2548 /* 2549 * Import a non-root pool into the system. 2550 */ 2551 int 2552 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2553 { 2554 spa_t *spa; 2555 char *altroot = NULL; 2556 int error; 2557 nvlist_t *nvroot; 2558 nvlist_t **spares, **l2cache; 2559 uint_t nspares, nl2cache; 2560 2561 /* 2562 * If a pool with this name exists, return failure. 2563 */ 2564 mutex_enter(&spa_namespace_lock); 2565 if ((spa = spa_lookup(pool)) != NULL) { 2566 mutex_exit(&spa_namespace_lock); 2567 return (EEXIST); 2568 } 2569 2570 /* 2571 * Create and initialize the spa structure. 2572 */ 2573 (void) nvlist_lookup_string(props, 2574 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2575 spa = spa_add(pool, altroot); 2576 spa_activate(spa, spa_mode_global); 2577 2578 /* 2579 * Don't start async tasks until we know everything is healthy. 2580 */ 2581 spa_async_suspend(spa); 2582 2583 /* 2584 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2585 * because the user-supplied config is actually the one to trust when 2586 * doing an import. 2587 */ 2588 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 2589 2590 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2591 /* 2592 * Toss any existing sparelist, as it doesn't have any validity 2593 * anymore, and conflicts with spa_has_spare(). 2594 */ 2595 if (spa->spa_spares.sav_config) { 2596 nvlist_free(spa->spa_spares.sav_config); 2597 spa->spa_spares.sav_config = NULL; 2598 spa_load_spares(spa); 2599 } 2600 if (spa->spa_l2cache.sav_config) { 2601 nvlist_free(spa->spa_l2cache.sav_config); 2602 spa->spa_l2cache.sav_config = NULL; 2603 spa_load_l2cache(spa); 2604 } 2605 2606 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2607 &nvroot) == 0); 2608 if (error == 0) 2609 error = spa_validate_aux(spa, nvroot, -1ULL, 2610 VDEV_ALLOC_SPARE); 2611 if (error == 0) 2612 error = spa_validate_aux(spa, nvroot, -1ULL, 2613 VDEV_ALLOC_L2CACHE); 2614 spa_config_exit(spa, SCL_ALL, FTAG); 2615 2616 if (props != NULL) 2617 spa_configfile_set(spa, props, B_FALSE); 2618 2619 if (error != 0 || (props && spa_writeable(spa) && 2620 (error = spa_prop_set(spa, props)))) { 2621 spa_unload(spa); 2622 spa_deactivate(spa); 2623 spa_remove(spa); 2624 mutex_exit(&spa_namespace_lock); 2625 return (error); 2626 } 2627 2628 spa_async_resume(spa); 2629 2630 /* 2631 * Override any spares and level 2 cache devices as specified by 2632 * the user, as these may have correct device names/devids, etc. 
2633 */ 2634 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2635 &spares, &nspares) == 0) { 2636 if (spa->spa_spares.sav_config) 2637 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2638 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2639 else 2640 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2641 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2642 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2643 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2644 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2645 spa_load_spares(spa); 2646 spa_config_exit(spa, SCL_ALL, FTAG); 2647 spa->spa_spares.sav_sync = B_TRUE; 2648 } 2649 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2650 &l2cache, &nl2cache) == 0) { 2651 if (spa->spa_l2cache.sav_config) 2652 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2653 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2654 else 2655 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2656 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2657 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2658 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2659 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2660 spa_load_l2cache(spa); 2661 spa_config_exit(spa, SCL_ALL, FTAG); 2662 spa->spa_l2cache.sav_sync = B_TRUE; 2663 } 2664 2665 /* 2666 * Check for any removed devices. 2667 */ 2668 if (spa->spa_autoreplace) { 2669 spa_aux_check_removed(&spa->spa_spares); 2670 spa_aux_check_removed(&spa->spa_l2cache); 2671 } 2672 2673 if (spa_writeable(spa)) { 2674 /* 2675 * Update the config cache to include the newly-imported pool. 2676 */ 2677 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2678 } 2679 2680 /* 2681 * It's possible that the pool was expanded while it was exported. 2682 * We kick off an async task to handle this for us. 2683 */ 2684 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 2685 2686 mutex_exit(&spa_namespace_lock); 2687 spa_history_log_version(spa, LOG_POOL_IMPORT); 2688 2689 return (0); 2690 } 2691 2692 2693 /* 2694 * This (illegal) pool name is used when temporarily importing a spa_t in order 2695 * to get the vdev stats associated with the imported devices. 2696 */ 2697 #define TRYIMPORT_NAME "$import" 2698 2699 nvlist_t * 2700 spa_tryimport(nvlist_t *tryconfig) 2701 { 2702 nvlist_t *config = NULL; 2703 char *poolname; 2704 spa_t *spa; 2705 uint64_t state; 2706 int error; 2707 2708 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2709 return (NULL); 2710 2711 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2712 return (NULL); 2713 2714 /* 2715 * Create and initialize the spa structure. 2716 */ 2717 mutex_enter(&spa_namespace_lock); 2718 spa = spa_add(TRYIMPORT_NAME, NULL); 2719 spa_activate(spa, FREAD); 2720 2721 /* 2722 * Pass off the heavy lifting to spa_load(). 2723 * Pass TRUE for mosconfig because the user-supplied config 2724 * is actually the one to trust when doing an import. 2725 */ 2726 error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2727 2728 /* 2729 * If 'tryconfig' was at least parsable, return the current config. 
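	 * A sketch of the intended use (the variable names here are
	 * illustrative only): a caller that has assembled 'tryconfig' from
	 * on-disk labels can do
	 *
	 *	nvlist_t *probe = spa_tryimport(label_config);
	 *	if (probe != NULL) {
	 *		... inspect pool name, state, spares, ...
	 *		nvlist_free(probe);
	 *	}
	 *
	 * without the pool ever becoming writeable (note the FREAD
	 * activation above).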
2730 */ 2731 if (spa->spa_root_vdev != NULL) { 2732 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2733 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2734 poolname) == 0); 2735 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2736 state) == 0); 2737 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2738 spa->spa_uberblock.ub_timestamp) == 0); 2739 2740 /* 2741 * If the bootfs property exists on this pool then we 2742 * copy it out so that external consumers can tell which 2743 * pools are bootable. 2744 */ 2745 if ((!error || error == EEXIST) && spa->spa_bootfs) { 2746 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2747 2748 /* 2749 * We have to play games with the name since the 2750 * pool was opened as TRYIMPORT_NAME. 2751 */ 2752 if (dsl_dsobj_to_dsname(spa_name(spa), 2753 spa->spa_bootfs, tmpname) == 0) { 2754 char *cp; 2755 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2756 2757 cp = strchr(tmpname, '/'); 2758 if (cp == NULL) { 2759 (void) strlcpy(dsname, tmpname, 2760 MAXPATHLEN); 2761 } else { 2762 (void) snprintf(dsname, MAXPATHLEN, 2763 "%s/%s", poolname, ++cp); 2764 } 2765 VERIFY(nvlist_add_string(config, 2766 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2767 kmem_free(dsname, MAXPATHLEN); 2768 } 2769 kmem_free(tmpname, MAXPATHLEN); 2770 } 2771 2772 /* 2773 * Add the list of hot spares and level 2 cache devices. 2774 */ 2775 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2776 spa_add_spares(spa, config); 2777 spa_add_l2cache(spa, config); 2778 spa_config_exit(spa, SCL_CONFIG, FTAG); 2779 } 2780 2781 spa_unload(spa); 2782 spa_deactivate(spa); 2783 spa_remove(spa); 2784 mutex_exit(&spa_namespace_lock); 2785 2786 return (config); 2787 } 2788 2789 /* 2790 * Pool export/destroy 2791 * 2792 * The act of destroying or exporting a pool is very simple. We make sure there 2793 * is no more pending I/O and any references to the pool are gone. Then, we 2794 * update the pool state and sync all the labels to disk, removing the 2795 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2796 * we don't sync the labels or remove the configuration cache. 2797 */ 2798 static int 2799 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2800 boolean_t force, boolean_t hardforce) 2801 { 2802 spa_t *spa; 2803 2804 if (oldconfig) 2805 *oldconfig = NULL; 2806 2807 if (!(spa_mode_global & FWRITE)) 2808 return (EROFS); 2809 2810 mutex_enter(&spa_namespace_lock); 2811 if ((spa = spa_lookup(pool)) == NULL) { 2812 mutex_exit(&spa_namespace_lock); 2813 return (ENOENT); 2814 } 2815 2816 /* 2817 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2818 * reacquire the namespace lock, and see if we can export. 2819 */ 2820 spa_open_ref(spa, FTAG); 2821 mutex_exit(&spa_namespace_lock); 2822 spa_async_suspend(spa); 2823 mutex_enter(&spa_namespace_lock); 2824 spa_close(spa, FTAG); 2825 2826 /* 2827 * The pool will be in core if it's openable, 2828 * in which case we can modify its state. 2829 */ 2830 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2831 /* 2832 * Objsets may be open only because they're dirty, so we 2833 * have to force it to sync before checking spa_refcnt. 2834 */ 2835 txg_wait_synced(spa->spa_dsl_pool, 0); 2836 2837 /* 2838 * A pool cannot be exported or destroyed if there are active 2839 * references. If we are resetting a pool, allow references by 2840 * fault injection handlers. 
2841 */ 2842 if (!spa_refcount_zero(spa) || 2843 (spa->spa_inject_ref != 0 && 2844 new_state != POOL_STATE_UNINITIALIZED)) { 2845 spa_async_resume(spa); 2846 mutex_exit(&spa_namespace_lock); 2847 return (EBUSY); 2848 } 2849 2850 /* 2851 * A pool cannot be exported if it has an active shared spare. 2852 * This is to prevent other pools stealing the active spare 2853 * from an exported pool. At user's own will, such pool can 2854 * be forcedly exported. 2855 */ 2856 if (!force && new_state == POOL_STATE_EXPORTED && 2857 spa_has_active_shared_spare(spa)) { 2858 spa_async_resume(spa); 2859 mutex_exit(&spa_namespace_lock); 2860 return (EXDEV); 2861 } 2862 2863 /* 2864 * We want this to be reflected on every label, 2865 * so mark them all dirty. spa_unload() will do the 2866 * final sync that pushes these changes out. 2867 */ 2868 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 2869 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2870 spa->spa_state = new_state; 2871 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2872 vdev_config_dirty(spa->spa_root_vdev); 2873 spa_config_exit(spa, SCL_ALL, FTAG); 2874 } 2875 } 2876 2877 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2878 2879 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2880 spa_unload(spa); 2881 spa_deactivate(spa); 2882 } 2883 2884 if (oldconfig && spa->spa_config) 2885 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2886 2887 if (new_state != POOL_STATE_UNINITIALIZED) { 2888 if (!hardforce) 2889 spa_config_sync(spa, B_TRUE, B_TRUE); 2890 spa_remove(spa); 2891 } 2892 mutex_exit(&spa_namespace_lock); 2893 2894 return (0); 2895 } 2896 2897 /* 2898 * Destroy a storage pool. 2899 */ 2900 int 2901 spa_destroy(char *pool) 2902 { 2903 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 2904 B_FALSE, B_FALSE)); 2905 } 2906 2907 /* 2908 * Export a storage pool. 2909 */ 2910 int 2911 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 2912 boolean_t hardforce) 2913 { 2914 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 2915 force, hardforce)); 2916 } 2917 2918 /* 2919 * Similar to spa_export(), this unloads the spa_t without actually removing it 2920 * from the namespace in any way. 2921 */ 2922 int 2923 spa_reset(char *pool) 2924 { 2925 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2926 B_FALSE, B_FALSE)); 2927 } 2928 2929 /* 2930 * ========================================================================== 2931 * Device manipulation 2932 * ========================================================================== 2933 */ 2934 2935 /* 2936 * Add a device to a storage pool. 
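 * The expected shape of 'nvroot' mirrors what spa_generate_rootconf() builds
 * above: a root nvlist (ZPOOL_CONFIG_TYPE of VDEV_TYPE_ROOT) whose
 * ZPOOL_CONFIG_CHILDREN array holds the new top-level vdevs, plus optional
 * ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE arrays.  An illustrative
 * sketch (the 'child' nvlist here is hypothetical and would describe a
 * leaf vdev):
 *
 *	nvlist_t *nvroot, *child;
 *	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	... build 'child' as a leaf vdev nvlist ...
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 *	    &child, 1) == 0);
 *	error = spa_vdev_add(spa, nvroot);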
2937 */ 2938 int 2939 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2940 { 2941 uint64_t txg, id; 2942 int error; 2943 vdev_t *rvd = spa->spa_root_vdev; 2944 vdev_t *vd, *tvd; 2945 nvlist_t **spares, **l2cache; 2946 uint_t nspares, nl2cache; 2947 2948 txg = spa_vdev_enter(spa); 2949 2950 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2951 VDEV_ALLOC_ADD)) != 0) 2952 return (spa_vdev_exit(spa, NULL, txg, error)); 2953 2954 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2955 2956 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2957 &nspares) != 0) 2958 nspares = 0; 2959 2960 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2961 &nl2cache) != 0) 2962 nl2cache = 0; 2963 2964 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2965 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2966 2967 if (vd->vdev_children != 0 && 2968 (error = vdev_create(vd, txg, B_FALSE)) != 0) 2969 return (spa_vdev_exit(spa, vd, txg, error)); 2970 2971 /* 2972 * We must validate the spares and l2cache devices after checking the 2973 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2974 */ 2975 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2976 return (spa_vdev_exit(spa, vd, txg, error)); 2977 2978 /* 2979 * Transfer each new top-level vdev from vd to rvd. 2980 */ 2981 for (int c = 0; c < vd->vdev_children; c++) { 2982 2983 /* 2984 * Set the vdev id to the first hole, if one exists. 2985 */ 2986 for (id = 0; id < rvd->vdev_children; id++) { 2987 if (rvd->vdev_child[id]->vdev_ishole) { 2988 vdev_free(rvd->vdev_child[id]); 2989 break; 2990 } 2991 } 2992 tvd = vd->vdev_child[c]; 2993 vdev_remove_child(vd, tvd); 2994 tvd->vdev_id = id; 2995 vdev_add_child(rvd, tvd); 2996 vdev_config_dirty(tvd); 2997 } 2998 2999 if (nspares != 0) { 3000 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3001 ZPOOL_CONFIG_SPARES); 3002 spa_load_spares(spa); 3003 spa->spa_spares.sav_sync = B_TRUE; 3004 } 3005 3006 if (nl2cache != 0) { 3007 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3008 ZPOOL_CONFIG_L2CACHE); 3009 spa_load_l2cache(spa); 3010 spa->spa_l2cache.sav_sync = B_TRUE; 3011 } 3012 3013 /* 3014 * We have to be careful when adding new vdevs to an existing pool. 3015 * If other threads start allocating from these vdevs before we 3016 * sync the config cache, and we lose power, then upon reboot we may 3017 * fail to open the pool because there are DVAs that the config cache 3018 * can't translate. Therefore, we first add the vdevs without 3019 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3020 * and then let spa_config_update() initialize the new metaslabs. 3021 * 3022 * spa_load() checks for added-but-not-initialized vdevs, so that 3023 * if we lose power at any point in this sequence, the remaining 3024 * steps will be completed the next time we load the pool. 3025 */ 3026 (void) spa_vdev_exit(spa, vd, txg, 0); 3027 3028 mutex_enter(&spa_namespace_lock); 3029 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3030 mutex_exit(&spa_namespace_lock); 3031 3032 return (0); 3033 } 3034 3035 /* 3036 * Attach a device to a mirror. The arguments are the path to any device 3037 * in the mirror, and the nvroot for the new device. If the path specifies 3038 * a device that is not mirrored, we automatically insert the mirror vdev. 
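 *
 * As a rough sketch of the intent: attaching disk B to a plain disk A turns
 *	A  ==>  mirror(A, B)
 * while attaching to an existing two-way mirror simply widens it, e.g.
 *	mirror(A, B)  ==>  mirror(A, B, C).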
3039 * 3040 * If 'replacing' is specified, the new device is intended to replace the 3041 * existing device; in this case the two devices are made into their own 3042 * mirror using the 'replacing' vdev, which is functionally identical to 3043 * the mirror vdev (it actually reuses all the same ops) but has a few 3044 * extra rules: you can't attach to it after it's been created, and upon 3045 * completion of resilvering, the first disk (the one being replaced) 3046 * is automatically detached. 3047 */ 3048 int 3049 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3050 { 3051 uint64_t txg, open_txg; 3052 vdev_t *rvd = spa->spa_root_vdev; 3053 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3054 vdev_ops_t *pvops; 3055 char *oldvdpath, *newvdpath; 3056 int newvd_isspare; 3057 int error; 3058 3059 txg = spa_vdev_enter(spa); 3060 3061 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3062 3063 if (oldvd == NULL) 3064 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3065 3066 if (!oldvd->vdev_ops->vdev_op_leaf) 3067 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3068 3069 pvd = oldvd->vdev_parent; 3070 3071 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3072 VDEV_ALLOC_ADD)) != 0) 3073 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3074 3075 if (newrootvd->vdev_children != 1) 3076 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3077 3078 newvd = newrootvd->vdev_child[0]; 3079 3080 if (!newvd->vdev_ops->vdev_op_leaf) 3081 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3082 3083 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3084 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3085 3086 /* 3087 * Spares can't replace logs 3088 */ 3089 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3090 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3091 3092 if (!replacing) { 3093 /* 3094 * For attach, the only allowable parent is a mirror or the root 3095 * vdev. 3096 */ 3097 if (pvd->vdev_ops != &vdev_mirror_ops && 3098 pvd->vdev_ops != &vdev_root_ops) 3099 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3100 3101 pvops = &vdev_mirror_ops; 3102 } else { 3103 /* 3104 * Active hot spares can only be replaced by inactive hot 3105 * spares. 3106 */ 3107 if (pvd->vdev_ops == &vdev_spare_ops && 3108 pvd->vdev_child[1] == oldvd && 3109 !spa_has_spare(spa, newvd->vdev_guid)) 3110 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3111 3112 /* 3113 * If the source is a hot spare, and the parent isn't already a 3114 * spare, then we want to create a new hot spare. Otherwise, we 3115 * want to create a replacing vdev. The user is not allowed to 3116 * attach to a spared vdev child unless the 'isspare' state is 3117 * the same (spare replaces spare, non-spare replaces 3118 * non-spare). 3119 */ 3120 if (pvd->vdev_ops == &vdev_replacing_ops) 3121 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3122 else if (pvd->vdev_ops == &vdev_spare_ops && 3123 newvd->vdev_isspare != oldvd->vdev_isspare) 3124 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3125 else if (pvd->vdev_ops != &vdev_spare_ops && 3126 newvd->vdev_isspare) 3127 pvops = &vdev_spare_ops; 3128 else 3129 pvops = &vdev_replacing_ops; 3130 } 3131 3132 /* 3133 * Make sure the new device is big enough. 3134 */ 3135 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3136 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3137 3138 /* 3139 * The new device cannot have a higher alignment requirement 3140 * than the top-level vdev. 
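	 * (Alignment here is the vdev's 'ashift', the log2 of its minimum
	 * allocation size: for example, a 4K-sector device reporting an
	 * ashift of 12 cannot be attached beneath a top-level vdev that was
	 * created with 512-byte sectors, i.e. an ashift of 9.)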
3141 */ 3142 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3143 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3144 3145 /* 3146 * If this is an in-place replacement, update oldvd's path and devid 3147 * to make it distinguishable from newvd, and unopenable from now on. 3148 */ 3149 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3150 spa_strfree(oldvd->vdev_path); 3151 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3152 KM_SLEEP); 3153 (void) sprintf(oldvd->vdev_path, "%s/%s", 3154 newvd->vdev_path, "old"); 3155 if (oldvd->vdev_devid != NULL) { 3156 spa_strfree(oldvd->vdev_devid); 3157 oldvd->vdev_devid = NULL; 3158 } 3159 } 3160 3161 /* 3162 * If the parent is not a mirror, or if we're replacing, insert the new 3163 * mirror/replacing/spare vdev above oldvd. 3164 */ 3165 if (pvd->vdev_ops != pvops) 3166 pvd = vdev_add_parent(oldvd, pvops); 3167 3168 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3169 ASSERT(pvd->vdev_ops == pvops); 3170 ASSERT(oldvd->vdev_parent == pvd); 3171 3172 /* 3173 * Extract the new device from its root and add it to pvd. 3174 */ 3175 vdev_remove_child(newrootvd, newvd); 3176 newvd->vdev_id = pvd->vdev_children; 3177 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3178 vdev_add_child(pvd, newvd); 3179 3180 tvd = newvd->vdev_top; 3181 ASSERT(pvd->vdev_top == tvd); 3182 ASSERT(tvd->vdev_parent == rvd); 3183 3184 vdev_config_dirty(tvd); 3185 3186 /* 3187 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3188 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3189 */ 3190 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3191 3192 vdev_dtl_dirty(newvd, DTL_MISSING, 3193 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3194 3195 if (newvd->vdev_isspare) { 3196 spa_spare_activate(newvd); 3197 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3198 } 3199 3200 oldvdpath = spa_strdup(oldvd->vdev_path); 3201 newvdpath = spa_strdup(newvd->vdev_path); 3202 newvd_isspare = newvd->vdev_isspare; 3203 3204 /* 3205 * Mark newvd's DTL dirty in this txg. 3206 */ 3207 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3208 3209 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3210 3211 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3212 CRED(), "%s vdev=%s %s vdev=%s", 3213 replacing && newvd_isspare ? "spare in" : 3214 replacing ? "replace" : "attach", newvdpath, 3215 replacing ? "for" : "to", oldvdpath); 3216 3217 spa_strfree(oldvdpath); 3218 spa_strfree(newvdpath); 3219 3220 /* 3221 * Kick off a resilver to update newvd. 3222 */ 3223 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3224 3225 return (0); 3226 } 3227 3228 /* 3229 * Detach a device from a mirror or replacing vdev. 3230 * If 'replace_done' is specified, only detach if the parent 3231 * is a replacing vdev. 3232 */ 3233 int 3234 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3235 { 3236 uint64_t txg; 3237 int error; 3238 vdev_t *rvd = spa->spa_root_vdev; 3239 vdev_t *vd, *pvd, *cvd, *tvd; 3240 boolean_t unspare = B_FALSE; 3241 uint64_t unspare_guid; 3242 size_t len; 3243 3244 txg = spa_vdev_enter(spa); 3245 3246 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3247 3248 if (vd == NULL) 3249 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3250 3251 if (!vd->vdev_ops->vdev_op_leaf) 3252 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3253 3254 pvd = vd->vdev_parent; 3255 3256 /* 3257 * If the parent/child relationship is not as expected, don't do it. 3258 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3259 * vdev that's replacing B with C. 
The user's intent in replacing 3260 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3261 * the replace by detaching C, the expected behavior is to end up 3262 * M(A,B). But suppose that right after deciding to detach C, 3263 * the replacement of B completes. We would have M(A,C), and then 3264 * ask to detach C, which would leave us with just A -- not what 3265 * the user wanted. To prevent this, we make sure that the 3266 * parent/child relationship hasn't changed -- in this example, 3267 * that C's parent is still the replacing vdev R. 3268 */ 3269 if (pvd->vdev_guid != pguid && pguid != 0) 3270 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3271 3272 /* 3273 * If replace_done is specified, only remove this device if it's 3274 * the first child of a replacing vdev. For the 'spare' vdev, either 3275 * disk can be removed. 3276 */ 3277 if (replace_done) { 3278 if (pvd->vdev_ops == &vdev_replacing_ops) { 3279 if (vd->vdev_id != 0) 3280 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3281 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3282 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3283 } 3284 } 3285 3286 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3287 spa_version(spa) >= SPA_VERSION_SPARES); 3288 3289 /* 3290 * Only mirror, replacing, and spare vdevs support detach. 3291 */ 3292 if (pvd->vdev_ops != &vdev_replacing_ops && 3293 pvd->vdev_ops != &vdev_mirror_ops && 3294 pvd->vdev_ops != &vdev_spare_ops) 3295 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3296 3297 /* 3298 * If this device has the only valid copy of some data, 3299 * we cannot safely detach it. 3300 */ 3301 if (vdev_dtl_required(vd)) 3302 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3303 3304 ASSERT(pvd->vdev_children >= 2); 3305 3306 /* 3307 * If we are detaching the second disk from a replacing vdev, then 3308 * check to see if we changed the original vdev's path to have "/old" 3309 * at the end in spa_vdev_attach(). If so, undo that change now. 3310 */ 3311 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3312 pvd->vdev_child[0]->vdev_path != NULL && 3313 pvd->vdev_child[1]->vdev_path != NULL) { 3314 ASSERT(pvd->vdev_child[1] == vd); 3315 cvd = pvd->vdev_child[0]; 3316 len = strlen(vd->vdev_path); 3317 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3318 strcmp(cvd->vdev_path + len, "/old") == 0) { 3319 spa_strfree(cvd->vdev_path); 3320 cvd->vdev_path = spa_strdup(vd->vdev_path); 3321 } 3322 } 3323 3324 /* 3325 * If we are detaching the original disk from a spare, then it implies 3326 * that the spare should become a real disk, and be removed from the 3327 * active spare list for the pool. 3328 */ 3329 if (pvd->vdev_ops == &vdev_spare_ops && 3330 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3331 unspare = B_TRUE; 3332 3333 /* 3334 * Erase the disk labels so the disk can be used for other things. 3335 * This must be done after all other error cases are handled, 3336 * but before we disembowel vd (so we can still do I/O to it). 3337 * But if we can't do it, don't treat the error as fatal -- 3338 * it may be that the unwritability of the disk is the reason 3339 * it's being detached! 3340 */ 3341 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3342 3343 /* 3344 * Remove vd from its parent and compact the parent's children. 3345 */ 3346 vdev_remove_child(pvd, vd); 3347 vdev_compact_children(pvd); 3348 3349 /* 3350 * Remember one of the remaining children so we can get tvd below. 
3351 */ 3352 cvd = pvd->vdev_child[0]; 3353 3354 /* 3355 * If we need to remove the remaining child from the list of hot spares, 3356 * do it now, marking the vdev as no longer a spare in the process. 3357 * We must do this before vdev_remove_parent(), because that can 3358 * change the GUID if it creates a new toplevel GUID. For a similar 3359 * reason, we must remove the spare now, in the same txg as the detach; 3360 * otherwise someone could attach a new sibling, change the GUID, and 3361 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3362 */ 3363 if (unspare) { 3364 ASSERT(cvd->vdev_isspare); 3365 spa_spare_remove(cvd); 3366 unspare_guid = cvd->vdev_guid; 3367 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3368 } 3369 3370 /* 3371 * If the parent mirror/replacing vdev only has one child, 3372 * the parent is no longer needed. Remove it from the tree. 3373 */ 3374 if (pvd->vdev_children == 1) 3375 vdev_remove_parent(cvd); 3376 3377 /* 3378 * We don't set tvd until now because the parent we just removed 3379 * may have been the previous top-level vdev. 3380 */ 3381 tvd = cvd->vdev_top; 3382 ASSERT(tvd->vdev_parent == rvd); 3383 3384 /* 3385 * Reevaluate the parent vdev state. 3386 */ 3387 vdev_propagate_state(cvd); 3388 3389 /* 3390 * If the 'autoexpand' property is set on the pool then automatically 3391 * try to expand the size of the pool. For example if the device we 3392 * just detached was smaller than the others, it may be possible to 3393 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3394 * first so that we can obtain the updated sizes of the leaf vdevs. 3395 */ 3396 if (spa->spa_autoexpand) { 3397 vdev_reopen(tvd); 3398 vdev_expand(tvd, txg); 3399 } 3400 3401 vdev_config_dirty(tvd); 3402 3403 /* 3404 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3405 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3406 * But first make sure we're not on any *other* txg's DTL list, to 3407 * prevent vd from being accessed after it's freed. 3408 */ 3409 for (int t = 0; t < TXG_SIZE; t++) 3410 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3411 vd->vdev_detached = B_TRUE; 3412 vdev_dirty(tvd, VDD_DTL, vd, txg); 3413 3414 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3415 3416 error = spa_vdev_exit(spa, vd, txg, 0); 3417 3418 /* 3419 * If this was the removal of the original device in a hot spare vdev, 3420 * then we want to go through and remove the device from the hot spare 3421 * list of every other pool. 
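	 * Note the locking dance below: spa_vdev_remove() takes the namespace
	 * lock itself (via spa_vdev_enter()) when its caller does not already
	 * hold it, so we grab a reference on each pool, drop the lock around
	 * the call, and then reacquire it to continue the walk.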
3422 */ 3423 if (unspare) { 3424 spa_t *myspa = spa; 3425 spa = NULL; 3426 mutex_enter(&spa_namespace_lock); 3427 while ((spa = spa_next(spa)) != NULL) { 3428 if (spa->spa_state != POOL_STATE_ACTIVE) 3429 continue; 3430 if (spa == myspa) 3431 continue; 3432 spa_open_ref(spa, FTAG); 3433 mutex_exit(&spa_namespace_lock); 3434 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3435 mutex_enter(&spa_namespace_lock); 3436 spa_close(spa, FTAG); 3437 } 3438 mutex_exit(&spa_namespace_lock); 3439 } 3440 3441 return (error); 3442 } 3443 3444 static nvlist_t * 3445 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3446 { 3447 for (int i = 0; i < count; i++) { 3448 uint64_t guid; 3449 3450 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3451 &guid) == 0); 3452 3453 if (guid == target_guid) 3454 return (nvpp[i]); 3455 } 3456 3457 return (NULL); 3458 } 3459 3460 static void 3461 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3462 nvlist_t *dev_to_remove) 3463 { 3464 nvlist_t **newdev = NULL; 3465 3466 if (count > 1) 3467 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3468 3469 for (int i = 0, j = 0; i < count; i++) { 3470 if (dev[i] == dev_to_remove) 3471 continue; 3472 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3473 } 3474 3475 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3476 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3477 3478 for (int i = 0; i < count - 1; i++) 3479 nvlist_free(newdev[i]); 3480 3481 if (count > 1) 3482 kmem_free(newdev, (count - 1) * sizeof (void *)); 3483 } 3484 3485 /* 3486 * Removing a device from the vdev namespace requires several steps 3487 * and can take a significant amount of time. As a result we use 3488 * the spa_vdev_config_[enter/exit] functions which allow us to 3489 * grab and release the spa_config_lock while still holding the namespace 3490 * lock. During each step the configuration is synced out. 3491 */ 3492 3493 /* 3494 * Initial phase of device removal - stop future allocations from this device. 3495 */ 3496 void 3497 spa_vdev_remove_start(spa_t *spa, vdev_t *vd) 3498 { 3499 metaslab_group_t *mg = vd->vdev_mg; 3500 3501 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3502 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3503 3504 /* 3505 * Remove our vdev from the allocatable vdevs 3506 */ 3507 if (mg) 3508 metaslab_class_remove(mg->mg_class, mg); 3509 } 3510 3511 /* 3512 * Evacuate the device. 3513 */ 3514 int 3515 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 3516 { 3517 uint64_t txg; 3518 int error; 3519 3520 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3521 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3522 3523 /* 3524 * Evacuate the device. We don't hold the config lock as writer 3525 * since we need to do I/O but we do keep the 3526 * spa_namespace_lock held. Once this completes the device 3527 * should no longer have any blocks allocated on it. 3528 */ 3529 if (vd->vdev_islog) { 3530 /* 3531 * Evacuate the device. 3532 */ 3533 if (error = dmu_objset_find(spa_name(spa), 3534 zil_vdev_offline, NULL, DS_FIND_CHILDREN)) { 3535 uint64_t txg; 3536 3537 txg = spa_vdev_config_enter(spa); 3538 metaslab_class_add(spa->spa_log_class, 3539 vd->vdev_mg); 3540 return (spa_vdev_exit(spa, NULL, txg, error)); 3541 } 3542 txg_wait_synced(spa_get_dsl(spa), 0); 3543 } 3544 3545 /* 3546 * Remove any remaining MOS metadata associated with the device. 
	 */
	txg = spa_vdev_config_enter(spa);
	vd->vdev_removing = B_TRUE;
	vdev_dirty(vd, 0, NULL, txg);
	vdev_config_dirty(vd);
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	return (0);
}

/*
 * Complete the removal by cleaning up the namespace.
 */
void
spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	uint64_t id = vd->vdev_id;
	boolean_t last_vdev = (id == (rvd->vdev_children - 1));

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	vdev_free(vd);

	/*
	 * It's possible that another thread is trying to do a spa_vdev_add()
	 * at the same time we're trying to remove it.  As a result the
	 * added vdev may not have initialized its metaslabs yet.
	 */
	if (mg != NULL)
		metaslab_group_destroy(mg);

	if (last_vdev) {
		vdev_compact_children(rvd);
	} else {
		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
		vdev_add_child(rvd, vd);
	}
	vdev_config_dirty(rvd);

	/*
	 * Reassess the health of our root vdev.
	 */
	vdev_reopen(rvd);
}

/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares, slogs, and level 2 ARC devices.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = EBUSY;
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		/*
		 * Cache devices can always be removed.
		 */
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		ASSERT(!locked);

		/*
		 * XXX - Once we have bp-rewrite this should
		 * become the common case.
		 */

		/*
		 * 1. Stop allocations
		 * 2. Evacuate the device (i.e. kill off stubby and
		 *    metadata) and wait for it to complete (i.e. sync).
		 * 3. Clean up the vdev namespace.
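		 *
		 * These steps correspond to spa_vdev_remove_start(),
		 * spa_vdev_remove_evacuate() and spa_vdev_remove_done()
		 * above; the configuration is synced between steps via
		 * spa_vdev_config_exit()/spa_vdev_config_enter().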
3655 */ 3656 spa_vdev_remove_start(spa, vd); 3657 3658 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 3659 if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0) 3660 return (error); 3661 txg = spa_vdev_config_enter(spa); 3662 3663 spa_vdev_remove_done(spa, vd); 3664 3665 } else if (vd != NULL) { 3666 /* 3667 * Normal vdevs cannot be removed (yet). 3668 */ 3669 error = ENOTSUP; 3670 } else { 3671 /* 3672 * There is no vdev of any kind with the specified guid. 3673 */ 3674 error = ENOENT; 3675 } 3676 3677 if (!locked) 3678 return (spa_vdev_exit(spa, NULL, txg, error)); 3679 3680 return (error); 3681 } 3682 3683 /* 3684 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3685 * current spared, so we can detach it. 3686 */ 3687 static vdev_t * 3688 spa_vdev_resilver_done_hunt(vdev_t *vd) 3689 { 3690 vdev_t *newvd, *oldvd; 3691 3692 for (int c = 0; c < vd->vdev_children; c++) { 3693 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3694 if (oldvd != NULL) 3695 return (oldvd); 3696 } 3697 3698 /* 3699 * Check for a completed replacement. 3700 */ 3701 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3702 oldvd = vd->vdev_child[0]; 3703 newvd = vd->vdev_child[1]; 3704 3705 if (vdev_dtl_empty(newvd, DTL_MISSING) && 3706 !vdev_dtl_required(oldvd)) 3707 return (oldvd); 3708 } 3709 3710 /* 3711 * Check for a completed resilver with the 'unspare' flag set. 3712 */ 3713 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3714 newvd = vd->vdev_child[0]; 3715 oldvd = vd->vdev_child[1]; 3716 3717 if (newvd->vdev_unspare && 3718 vdev_dtl_empty(newvd, DTL_MISSING) && 3719 !vdev_dtl_required(oldvd)) { 3720 newvd->vdev_unspare = 0; 3721 return (oldvd); 3722 } 3723 } 3724 3725 return (NULL); 3726 } 3727 3728 static void 3729 spa_vdev_resilver_done(spa_t *spa) 3730 { 3731 vdev_t *vd, *pvd, *ppvd; 3732 uint64_t guid, sguid, pguid, ppguid; 3733 3734 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3735 3736 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3737 pvd = vd->vdev_parent; 3738 ppvd = pvd->vdev_parent; 3739 guid = vd->vdev_guid; 3740 pguid = pvd->vdev_guid; 3741 ppguid = ppvd->vdev_guid; 3742 sguid = 0; 3743 /* 3744 * If we have just finished replacing a hot spared device, then 3745 * we need to detach the parent's first child (the original hot 3746 * spare) as well. 3747 */ 3748 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 3749 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3750 ASSERT(ppvd->vdev_children == 2); 3751 sguid = ppvd->vdev_child[1]->vdev_guid; 3752 } 3753 spa_config_exit(spa, SCL_ALL, FTAG); 3754 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 3755 return; 3756 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 3757 return; 3758 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3759 } 3760 3761 spa_config_exit(spa, SCL_ALL, FTAG); 3762 } 3763 3764 /* 3765 * Update the stored path or FRU for this vdev. Dirty the vdev configuration, 3766 * relying on spa_vdev_enter/exit() to synchronize the labels and cache. 
3767 */ 3768 int 3769 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 3770 boolean_t ispath) 3771 { 3772 vdev_t *vd; 3773 uint64_t txg; 3774 3775 txg = spa_vdev_enter(spa); 3776 3777 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3778 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3779 3780 if (!vd->vdev_ops->vdev_op_leaf) 3781 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3782 3783 if (ispath) { 3784 spa_strfree(vd->vdev_path); 3785 vd->vdev_path = spa_strdup(value); 3786 } else { 3787 if (vd->vdev_fru != NULL) 3788 spa_strfree(vd->vdev_fru); 3789 vd->vdev_fru = spa_strdup(value); 3790 } 3791 3792 vdev_config_dirty(vd->vdev_top); 3793 3794 return (spa_vdev_exit(spa, NULL, txg, 0)); 3795 } 3796 3797 int 3798 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3799 { 3800 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 3801 } 3802 3803 int 3804 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 3805 { 3806 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 3807 } 3808 3809 /* 3810 * ========================================================================== 3811 * SPA Scrubbing 3812 * ========================================================================== 3813 */ 3814 3815 int 3816 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3817 { 3818 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3819 3820 if ((uint_t)type >= POOL_SCRUB_TYPES) 3821 return (ENOTSUP); 3822 3823 /* 3824 * If a resilver was requested, but there is no DTL on a 3825 * writeable leaf device, we have nothing to do. 3826 */ 3827 if (type == POOL_SCRUB_RESILVER && 3828 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3829 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3830 return (0); 3831 } 3832 3833 if (type == POOL_SCRUB_EVERYTHING && 3834 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3835 spa->spa_dsl_pool->dp_scrub_isresilver) 3836 return (EBUSY); 3837 3838 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3839 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3840 } else if (type == POOL_SCRUB_NONE) { 3841 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3842 } else { 3843 return (EINVAL); 3844 } 3845 } 3846 3847 /* 3848 * ========================================================================== 3849 * SPA async task processing 3850 * ========================================================================== 3851 */ 3852 3853 static void 3854 spa_async_remove(spa_t *spa, vdev_t *vd) 3855 { 3856 if (vd->vdev_remove_wanted) { 3857 vd->vdev_remove_wanted = 0; 3858 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3859 3860 /* 3861 * We want to clear the stats, but we don't want to do a full 3862 * vdev_clear() as that will cause us to throw away 3863 * degraded/faulted state as well as attempt to reopen the 3864 * device, all of which is a waste. 
3865 */ 3866 vd->vdev_stat.vs_read_errors = 0; 3867 vd->vdev_stat.vs_write_errors = 0; 3868 vd->vdev_stat.vs_checksum_errors = 0; 3869 3870 vdev_state_dirty(vd->vdev_top); 3871 } 3872 3873 for (int c = 0; c < vd->vdev_children; c++) 3874 spa_async_remove(spa, vd->vdev_child[c]); 3875 } 3876 3877 static void 3878 spa_async_probe(spa_t *spa, vdev_t *vd) 3879 { 3880 if (vd->vdev_probe_wanted) { 3881 vd->vdev_probe_wanted = 0; 3882 vdev_reopen(vd); /* vdev_open() does the actual probe */ 3883 } 3884 3885 for (int c = 0; c < vd->vdev_children; c++) 3886 spa_async_probe(spa, vd->vdev_child[c]); 3887 } 3888 3889 static void 3890 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 3891 { 3892 sysevent_id_t eid; 3893 nvlist_t *attr; 3894 char *physpath; 3895 3896 if (!spa->spa_autoexpand) 3897 return; 3898 3899 for (int c = 0; c < vd->vdev_children; c++) { 3900 vdev_t *cvd = vd->vdev_child[c]; 3901 spa_async_autoexpand(spa, cvd); 3902 } 3903 3904 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 3905 return; 3906 3907 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 3908 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 3909 3910 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3911 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 3912 3913 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 3914 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 3915 3916 nvlist_free(attr); 3917 kmem_free(physpath, MAXPATHLEN); 3918 } 3919 3920 static void 3921 spa_async_thread(spa_t *spa) 3922 { 3923 int tasks; 3924 3925 ASSERT(spa->spa_sync_on); 3926 3927 mutex_enter(&spa->spa_async_lock); 3928 tasks = spa->spa_async_tasks; 3929 spa->spa_async_tasks = 0; 3930 mutex_exit(&spa->spa_async_lock); 3931 3932 /* 3933 * See if the config needs to be updated. 3934 */ 3935 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3936 uint64_t oldsz, space_update; 3937 3938 mutex_enter(&spa_namespace_lock); 3939 oldsz = spa_get_space(spa); 3940 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3941 space_update = spa_get_space(spa) - oldsz; 3942 mutex_exit(&spa_namespace_lock); 3943 3944 /* 3945 * If the pool grew as a result of the config update, 3946 * then log an internal history event. 3947 */ 3948 if (space_update) { 3949 spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 3950 spa, NULL, CRED(), 3951 "pool '%s' size: %llu(+%llu)", 3952 spa_name(spa), spa_get_space(spa), 3953 space_update); 3954 } 3955 } 3956 3957 /* 3958 * See if any devices need to be marked REMOVED. 3959 */ 3960 if (tasks & SPA_ASYNC_REMOVE) { 3961 spa_vdev_state_enter(spa, SCL_NONE); 3962 spa_async_remove(spa, spa->spa_root_vdev); 3963 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3964 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3965 for (int i = 0; i < spa->spa_spares.sav_count; i++) 3966 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3967 (void) spa_vdev_state_exit(spa, NULL, 0); 3968 } 3969 3970 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 3971 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3972 spa_async_autoexpand(spa, spa->spa_root_vdev); 3973 spa_config_exit(spa, SCL_CONFIG, FTAG); 3974 } 3975 3976 /* 3977 * See if any devices need to be probed. 3978 */ 3979 if (tasks & SPA_ASYNC_PROBE) { 3980 spa_vdev_state_enter(spa, SCL_NONE); 3981 spa_async_probe(spa, spa->spa_root_vdev); 3982 (void) spa_vdev_state_exit(spa, NULL, 0); 3983 } 3984 3985 /* 3986 * If any devices are done replacing, detach them. 
3987 */ 3988 if (tasks & SPA_ASYNC_RESILVER_DONE) 3989 spa_vdev_resilver_done(spa); 3990 3991 /* 3992 * Kick off a resilver. 3993 */ 3994 if (tasks & SPA_ASYNC_RESILVER) 3995 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3996 3997 /* 3998 * Let the world know that we're done. 3999 */ 4000 mutex_enter(&spa->spa_async_lock); 4001 spa->spa_async_thread = NULL; 4002 cv_broadcast(&spa->spa_async_cv); 4003 mutex_exit(&spa->spa_async_lock); 4004 thread_exit(); 4005 } 4006 4007 void 4008 spa_async_suspend(spa_t *spa) 4009 { 4010 mutex_enter(&spa->spa_async_lock); 4011 spa->spa_async_suspended++; 4012 while (spa->spa_async_thread != NULL) 4013 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 4014 mutex_exit(&spa->spa_async_lock); 4015 } 4016 4017 void 4018 spa_async_resume(spa_t *spa) 4019 { 4020 mutex_enter(&spa->spa_async_lock); 4021 ASSERT(spa->spa_async_suspended != 0); 4022 spa->spa_async_suspended--; 4023 mutex_exit(&spa->spa_async_lock); 4024 } 4025 4026 static void 4027 spa_async_dispatch(spa_t *spa) 4028 { 4029 mutex_enter(&spa->spa_async_lock); 4030 if (spa->spa_async_tasks && !spa->spa_async_suspended && 4031 spa->spa_async_thread == NULL && 4032 rootdir != NULL && !vn_is_readonly(rootdir)) 4033 spa->spa_async_thread = thread_create(NULL, 0, 4034 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 4035 mutex_exit(&spa->spa_async_lock); 4036 } 4037 4038 void 4039 spa_async_request(spa_t *spa, int task) 4040 { 4041 mutex_enter(&spa->spa_async_lock); 4042 spa->spa_async_tasks |= task; 4043 mutex_exit(&spa->spa_async_lock); 4044 } 4045 4046 /* 4047 * ========================================================================== 4048 * SPA syncing routines 4049 * ========================================================================== 4050 */ 4051 4052 static void 4053 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 4054 { 4055 bplist_t *bpl = &spa->spa_sync_bplist; 4056 dmu_tx_t *tx; 4057 blkptr_t blk; 4058 uint64_t itor = 0; 4059 zio_t *zio; 4060 int error; 4061 uint8_t c = 1; 4062 4063 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4064 4065 while (bplist_iterate(bpl, &itor, &blk) == 0) { 4066 ASSERT(blk.blk_birth < txg); 4067 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 4068 ZIO_FLAG_MUSTSUCCEED)); 4069 } 4070 4071 error = zio_wait(zio); 4072 ASSERT3U(error, ==, 0); 4073 4074 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 4075 bplist_vacate(bpl, tx); 4076 4077 /* 4078 * Pre-dirty the first block so we sync to convergence faster. 4079 * (Usually only the first block is needed.) 4080 */ 4081 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 4082 dmu_tx_commit(tx); 4083 } 4084 4085 static void 4086 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 4087 { 4088 char *packed = NULL; 4089 size_t bufsize; 4090 size_t nvsize = 0; 4091 dmu_buf_t *db; 4092 4093 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 4094 4095 /* 4096 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 4097 * information. This avoids the dbuf_will_dirty() path and 4098 * saves us a pre-read to get data we don't actually care about. 
4099 */ 4100 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 4101 packed = kmem_alloc(bufsize, KM_SLEEP); 4102 4103 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 4104 KM_SLEEP) == 0); 4105 bzero(packed + nvsize, bufsize - nvsize); 4106 4107 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 4108 4109 kmem_free(packed, bufsize); 4110 4111 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4112 dmu_buf_will_dirty(db, tx); 4113 *(uint64_t *)db->db_data = nvsize; 4114 dmu_buf_rele(db, FTAG); 4115 } 4116 4117 static void 4118 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 4119 const char *config, const char *entry) 4120 { 4121 nvlist_t *nvroot; 4122 nvlist_t **list; 4123 int i; 4124 4125 if (!sav->sav_sync) 4126 return; 4127 4128 /* 4129 * Update the MOS nvlist describing the list of available devices. 4130 * spa_validate_aux() will have already made sure this nvlist is 4131 * valid and the vdevs are labeled appropriately. 4132 */ 4133 if (sav->sav_object == 0) { 4134 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 4135 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 4136 sizeof (uint64_t), tx); 4137 VERIFY(zap_update(spa->spa_meta_objset, 4138 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 4139 &sav->sav_object, tx) == 0); 4140 } 4141 4142 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4143 if (sav->sav_count == 0) { 4144 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 4145 } else { 4146 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 4147 for (i = 0; i < sav->sav_count; i++) 4148 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 4149 B_FALSE, B_FALSE, B_TRUE); 4150 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 4151 sav->sav_count) == 0); 4152 for (i = 0; i < sav->sav_count; i++) 4153 nvlist_free(list[i]); 4154 kmem_free(list, sav->sav_count * sizeof (void *)); 4155 } 4156 4157 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 4158 nvlist_free(nvroot); 4159 4160 sav->sav_sync = B_FALSE; 4161 } 4162 4163 static void 4164 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 4165 { 4166 nvlist_t *config; 4167 4168 if (list_is_empty(&spa->spa_config_dirty_list)) 4169 return; 4170 4171 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4172 4173 config = spa_config_generate(spa, spa->spa_root_vdev, 4174 dmu_tx_get_txg(tx), B_FALSE); 4175 4176 spa_config_exit(spa, SCL_STATE, FTAG); 4177 4178 if (spa->spa_config_syncing) 4179 nvlist_free(spa->spa_config_syncing); 4180 spa->spa_config_syncing = config; 4181 4182 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 4183 } 4184 4185 /* 4186 * Set zpool properties. 4187 */ 4188 static void 4189 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 4190 { 4191 spa_t *spa = arg1; 4192 objset_t *mos = spa->spa_meta_objset; 4193 nvlist_t *nvp = arg2; 4194 nvpair_t *elem; 4195 uint64_t intval; 4196 char *strval; 4197 zpool_prop_t prop; 4198 const char *propname; 4199 zprop_type_t proptype; 4200 4201 mutex_enter(&spa->spa_props_lock); 4202 4203 elem = NULL; 4204 while ((elem = nvlist_next_nvpair(nvp, elem))) { 4205 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 4206 case ZPOOL_PROP_VERSION: 4207 /* 4208 * Only set version for non-zpool-creation cases 4209 * (set/import). spa_create() needs special care 4210 * for version setting. 
4211 */ 4212 if (tx->tx_txg != TXG_INITIAL) { 4213 VERIFY(nvpair_value_uint64(elem, 4214 &intval) == 0); 4215 ASSERT(intval <= SPA_VERSION); 4216 ASSERT(intval >= spa_version(spa)); 4217 spa->spa_uberblock.ub_version = intval; 4218 vdev_config_dirty(spa->spa_root_vdev); 4219 } 4220 break; 4221 4222 case ZPOOL_PROP_ALTROOT: 4223 /* 4224 * 'altroot' is a non-persistent property. It should 4225 * have been set temporarily at creation or import time. 4226 */ 4227 ASSERT(spa->spa_root != NULL); 4228 break; 4229 4230 case ZPOOL_PROP_CACHEFILE: 4231 /* 4232 * 'cachefile' is also a non-persistent property. 4233 */ 4234 break; 4235 default: 4236 /* 4237 * Set pool property values in the poolprops mos object. 4238 */ 4239 if (spa->spa_pool_props_object == 0) { 4240 objset_t *mos = spa->spa_meta_objset; 4241 4242 VERIFY((spa->spa_pool_props_object = 4243 zap_create(mos, DMU_OT_POOL_PROPS, 4244 DMU_OT_NONE, 0, tx)) > 0); 4245 4246 VERIFY(zap_update(mos, 4247 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 4248 8, 1, &spa->spa_pool_props_object, tx) 4249 == 0); 4250 } 4251 4252 /* normalize the property name */ 4253 propname = zpool_prop_to_name(prop); 4254 proptype = zpool_prop_get_type(prop); 4255 4256 if (nvpair_type(elem) == DATA_TYPE_STRING) { 4257 ASSERT(proptype == PROP_TYPE_STRING); 4258 VERIFY(nvpair_value_string(elem, &strval) == 0); 4259 VERIFY(zap_update(mos, 4260 spa->spa_pool_props_object, propname, 4261 1, strlen(strval) + 1, strval, tx) == 0); 4262 4263 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 4264 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 4265 4266 if (proptype == PROP_TYPE_INDEX) { 4267 const char *unused; 4268 VERIFY(zpool_prop_index_to_string( 4269 prop, intval, &unused) == 0); 4270 } 4271 VERIFY(zap_update(mos, 4272 spa->spa_pool_props_object, propname, 4273 8, 1, &intval, tx) == 0); 4274 } else { 4275 ASSERT(0); /* not allowed */ 4276 } 4277 4278 switch (prop) { 4279 case ZPOOL_PROP_DELEGATION: 4280 spa->spa_delegation = intval; 4281 break; 4282 case ZPOOL_PROP_BOOTFS: 4283 spa->spa_bootfs = intval; 4284 break; 4285 case ZPOOL_PROP_FAILUREMODE: 4286 spa->spa_failmode = intval; 4287 break; 4288 case ZPOOL_PROP_AUTOEXPAND: 4289 spa->spa_autoexpand = intval; 4290 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4291 break; 4292 default: 4293 break; 4294 } 4295 } 4296 4297 /* log internal history if this is not a zpool create */ 4298 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 4299 tx->tx_txg != TXG_INITIAL) { 4300 spa_history_internal_log(LOG_POOL_PROPSET, 4301 spa, tx, cr, "%s %lld %s", 4302 nvpair_name(elem), intval, spa_name(spa)); 4303 } 4304 } 4305 4306 mutex_exit(&spa->spa_props_lock); 4307 } 4308 4309 /* 4310 * Sync the specified transaction group. New blocks may be dirtied as 4311 * part of the process, so we iterate until it converges. 4312 */ 4313 void 4314 spa_sync(spa_t *spa, uint64_t txg) 4315 { 4316 dsl_pool_t *dp = spa->spa_dsl_pool; 4317 objset_t *mos = spa->spa_meta_objset; 4318 bplist_t *bpl = &spa->spa_sync_bplist; 4319 vdev_t *rvd = spa->spa_root_vdev; 4320 vdev_t *vd; 4321 dmu_tx_t *tx; 4322 int dirty_vdevs; 4323 int error; 4324 4325 /* 4326 * Lock out configuration changes. 4327 */ 4328 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4329 4330 spa->spa_syncing_txg = txg; 4331 spa->spa_sync_pass = 0; 4332 4333 /* 4334 * If there are any pending vdev state changes, convert them 4335 * into config changes that go out with this transaction group.
4336 */ 4337 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4338 while (list_head(&spa->spa_state_dirty_list) != NULL) { 4339 /* 4340 * We need the write lock here because, for aux vdevs, 4341 * calling vdev_config_dirty() modifies sav_config. 4342 * This is ugly and will become unnecessary when we 4343 * eliminate the aux vdev wart by integrating all vdevs 4344 * into the root vdev tree. 4345 */ 4346 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4347 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 4348 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 4349 vdev_state_clean(vd); 4350 vdev_config_dirty(vd); 4351 } 4352 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4353 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 4354 } 4355 spa_config_exit(spa, SCL_STATE, FTAG); 4356 4357 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 4358 4359 tx = dmu_tx_create_assigned(dp, txg); 4360 4361 /* 4362 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 4363 * set spa_deflate if we have no raid-z vdevs. 4364 */ 4365 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 4366 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 4367 int i; 4368 4369 for (i = 0; i < rvd->vdev_children; i++) { 4370 vd = rvd->vdev_child[i]; 4371 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 4372 break; 4373 } 4374 if (i == rvd->vdev_children) { 4375 spa->spa_deflate = TRUE; 4376 VERIFY(0 == zap_add(spa->spa_meta_objset, 4377 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4378 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 4379 } 4380 } 4381 4382 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 4383 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 4384 dsl_pool_create_origin(dp, tx); 4385 4386 /* Keeping the origin open increases spa_minref */ 4387 spa->spa_minref += 3; 4388 } 4389 4390 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 4391 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 4392 dsl_pool_upgrade_clones(dp, tx); 4393 } 4394 4395 /* 4396 * If anything has changed in this txg, push the deferred frees 4397 * from the previous txg. If not, leave them alone so that we 4398 * don't generate work on an otherwise idle system. 4399 */ 4400 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 4401 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 4402 !txg_list_empty(&dp->dp_sync_tasks, txg)) 4403 spa_sync_deferred_frees(spa, txg); 4404 4405 /* 4406 * Iterate to convergence. 4407 */ 4408 do { 4409 spa->spa_sync_pass++; 4410 4411 spa_sync_config_object(spa, tx); 4412 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 4413 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4414 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4415 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4416 spa_errlog_sync(spa, txg); 4417 dsl_pool_sync(dp, txg); 4418 4419 dirty_vdevs = 0; 4420 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 4421 vdev_sync(vd, txg); 4422 dirty_vdevs++; 4423 } 4424 4425 bplist_sync(bpl, tx); 4426 } while (dirty_vdevs); 4427 4428 bplist_close(bpl); 4429 4430 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 4431 4432 /* 4433 * Rewrite the vdev configuration (which includes the uberblock) 4434 * to commit the transaction group. 4435 * 4436 * If there are no dirty vdevs, we sync the uberblock to a few 4437 * random top-level vdevs that are known to be visible in the 4438 * config cache (see spa_vdev_add() for a complete description). 4439 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 
4440 */ 4441 for (;;) { 4442 /* 4443 * We hold SCL_STATE to prevent vdev open/close/etc. 4444 * while we're attempting to write the vdev labels. 4445 */ 4446 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4447 4448 if (list_is_empty(&spa->spa_config_dirty_list)) { 4449 vdev_t *svd[SPA_DVAS_PER_BP]; 4450 int svdcount = 0; 4451 int children = rvd->vdev_children; 4452 int c0 = spa_get_random(children); 4453 4454 for (int c = 0; c < children; c++) { 4455 vd = rvd->vdev_child[(c0 + c) % children]; 4456 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 4457 continue; 4458 svd[svdcount++] = vd; 4459 if (svdcount == SPA_DVAS_PER_BP) 4460 break; 4461 } 4462 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 4463 if (error != 0) 4464 error = vdev_config_sync(svd, svdcount, txg, 4465 B_TRUE); 4466 } else { 4467 error = vdev_config_sync(rvd->vdev_child, 4468 rvd->vdev_children, txg, B_FALSE); 4469 if (error != 0) 4470 error = vdev_config_sync(rvd->vdev_child, 4471 rvd->vdev_children, txg, B_TRUE); 4472 } 4473 4474 spa_config_exit(spa, SCL_STATE, FTAG); 4475 4476 if (error == 0) 4477 break; 4478 zio_suspend(spa, NULL); 4479 zio_resume_wait(spa); 4480 } 4481 dmu_tx_commit(tx); 4482 4483 /* 4484 * Clear the dirty config list. 4485 */ 4486 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 4487 vdev_config_clean(vd); 4488 4489 /* 4490 * Now that the new config has synced transactionally, 4491 * let it become visible to the config cache. 4492 */ 4493 if (spa->spa_config_syncing != NULL) { 4494 spa_config_set(spa, spa->spa_config_syncing); 4495 spa->spa_config_txg = txg; 4496 spa->spa_config_syncing = NULL; 4497 } 4498 4499 spa->spa_ubsync = spa->spa_uberblock; 4500 4501 /* 4502 * Clean up the ZIL records for the synced txg. 4503 */ 4504 dsl_pool_zil_clean(dp); 4505 4506 /* 4507 * Update usable space statistics. 4508 */ 4509 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 4510 vdev_sync_done(vd, txg); 4511 4512 /* 4513 * It had better be the case that we didn't dirty anything 4514 * since vdev_config_sync(). 4515 */ 4516 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 4517 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 4518 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 4519 ASSERT(bpl->bpl_queue == NULL); 4520 4521 spa_config_exit(spa, SCL_CONFIG, FTAG); 4522 4523 /* 4524 * If any async tasks have been requested, kick them off. 4525 */ 4526 spa_async_dispatch(spa); 4527 } 4528 4529 /* 4530 * Sync all pools. We don't want to hold the namespace lock across these 4531 * operations, so we take a reference on the spa_t and drop the lock during the 4532 * sync. 4533 */ 4534 void 4535 spa_sync_allpools(void) 4536 { 4537 spa_t *spa = NULL; 4538 mutex_enter(&spa_namespace_lock); 4539 while ((spa = spa_next(spa)) != NULL) { 4540 if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) 4541 continue; 4542 spa_open_ref(spa, FTAG); 4543 mutex_exit(&spa_namespace_lock); 4544 txg_wait_synced(spa_get_dsl(spa), 0); 4545 mutex_enter(&spa_namespace_lock); 4546 spa_close(spa, FTAG); 4547 } 4548 mutex_exit(&spa_namespace_lock); 4549 } 4550 4551 /* 4552 * ========================================================================== 4553 * Miscellaneous routines 4554 * ========================================================================== 4555 */ 4556 4557 /* 4558 * Remove all pools in the system. 4559 */ 4560 void 4561 spa_evict_all(void) 4562 { 4563 spa_t *spa; 4564 4565 /* 4566 * Remove all cached state. 
All pools should be closed now, 4567 * so every spa in the AVL tree should be unreferenced. 4568 */ 4569 mutex_enter(&spa_namespace_lock); 4570 while ((spa = spa_next(NULL)) != NULL) { 4571 /* 4572 * Stop async tasks. The async thread may need to detach 4573 * a device that's been replaced, which requires grabbing 4574 * spa_namespace_lock, so we must drop it here. 4575 */ 4576 spa_open_ref(spa, FTAG); 4577 mutex_exit(&spa_namespace_lock); 4578 spa_async_suspend(spa); 4579 mutex_enter(&spa_namespace_lock); 4580 spa_close(spa, FTAG); 4581 4582 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4583 spa_unload(spa); 4584 spa_deactivate(spa); 4585 } 4586 spa_remove(spa); 4587 } 4588 mutex_exit(&spa_namespace_lock); 4589 } 4590 4591 vdev_t * 4592 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 4593 { 4594 vdev_t *vd; 4595 int i; 4596 4597 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 4598 return (vd); 4599 4600 if (aux) { 4601 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 4602 vd = spa->spa_l2cache.sav_vdevs[i]; 4603 if (vd->vdev_guid == guid) 4604 return (vd); 4605 } 4606 4607 for (i = 0; i < spa->spa_spares.sav_count; i++) { 4608 vd = spa->spa_spares.sav_vdevs[i]; 4609 if (vd->vdev_guid == guid) 4610 return (vd); 4611 } 4612 } 4613 4614 return (NULL); 4615 } 4616 4617 void 4618 spa_upgrade(spa_t *spa, uint64_t version) 4619 { 4620 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4621 4622 /* 4623 * This should only be called for a non-faulted pool; since a pool with 4624 * an on-disk version newer than SPA_VERSION would be unopenable, the 4625 * current version should never exceed SPA_VERSION. 4626 */ 4627 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 4628 ASSERT(version >= spa->spa_uberblock.ub_version); 4629 4630 spa->spa_uberblock.ub_version = version; 4631 vdev_config_dirty(spa->spa_root_vdev); 4632 4633 spa_config_exit(spa, SCL_ALL, FTAG); 4634 4635 txg_wait_synced(spa_get_dsl(spa), 0); 4636 } 4637 4638 boolean_t 4639 spa_has_spare(spa_t *spa, uint64_t guid) 4640 { 4641 int i; 4642 uint64_t spareguid; 4643 spa_aux_vdev_t *sav = &spa->spa_spares; 4644 4645 for (i = 0; i < sav->sav_count; i++) 4646 if (sav->sav_vdevs[i]->vdev_guid == guid) 4647 return (B_TRUE); 4648 4649 for (i = 0; i < sav->sav_npending; i++) { 4650 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 4651 &spareguid) == 0 && spareguid == guid) 4652 return (B_TRUE); 4653 } 4654 4655 return (B_FALSE); 4656 } 4657 4658 /* 4659 * Check if a pool has an active shared spare device. 4660 * Note: the reference count of an active spare is 2, once as a spare and once as a replacing vdev. 4661 */ 4662 static boolean_t 4663 spa_has_active_shared_spare(spa_t *spa) 4664 { 4665 int i, refcnt; 4666 uint64_t pool; 4667 spa_aux_vdev_t *sav = &spa->spa_spares; 4668 4669 for (i = 0; i < sav->sav_count; i++) { 4670 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 4671 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 4672 refcnt > 2) 4673 return (B_TRUE); 4674 } 4675 4676 return (B_FALSE); 4677 } 4678 4679 /* 4680 * Post a sysevent corresponding to the given event. The 'name' must be one of 4681 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 4682 * filled in from the spa and (optionally) the vdev. This doesn't do anything 4683 * in the userland libzpool, as we don't want consumers to misinterpret ztest 4684 * or zdb as real changes.
4685 */ 4686 void 4687 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 4688 { 4689 #ifdef _KERNEL 4690 sysevent_t *ev; 4691 sysevent_attr_list_t *attr = NULL; 4692 sysevent_value_t value; 4693 sysevent_id_t eid; 4694 4695 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 4696 SE_SLEEP); 4697 4698 value.value_type = SE_DATA_TYPE_STRING; 4699 value.value.sv_string = spa_name(spa); 4700 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 4701 goto done; 4702 4703 value.value_type = SE_DATA_TYPE_UINT64; 4704 value.value.sv_uint64 = spa_guid(spa); 4705 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 4706 goto done; 4707 4708 if (vd) { 4709 value.value_type = SE_DATA_TYPE_UINT64; 4710 value.value.sv_uint64 = vd->vdev_guid; 4711 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 4712 SE_SLEEP) != 0) 4713 goto done; 4714 4715 if (vd->vdev_path) { 4716 value.value_type = SE_DATA_TYPE_STRING; 4717 value.value.sv_string = vd->vdev_path; 4718 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 4719 &value, SE_SLEEP) != 0) 4720 goto done; 4721 } 4722 } 4723 4724 if (sysevent_attach_attributes(ev, attr) != 0) 4725 goto done; 4726 attr = NULL; 4727 4728 (void) log_sysevent(ev, SE_SLEEP, &eid); 4729 4730 done: 4731 if (attr) 4732 sysevent_free_attr(attr); 4733 sysevent_free(ev); 4734 #endif 4735 } 4736
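/*
 * Example usage (illustrative sketch only): callers pass one of the EC_ZFS
 * event names.  Assuming the ESC_ZFS_VDEV_REMOVE definition from
 * sys/sysevent/eventdefs.h, a caller that has just removed a leaf vdev
 * would post the corresponding event with
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * A pool-wide event is posted by passing a NULL vdev, in which case only the
 * pool name and guid attributes are attached.  In the userland libzpool
 * build the body above is compiled out, so ztest and zdb never generate
 * real sysevents.
 */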