/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#ifdef	_KERNEL
#include <sys/zone.h>
#include <sys/bootprops.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_tune,			/* fill from zio_taskq_tune_* */
	zti_nmodes
};

#define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }

#define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)

typedef struct zio_taskq_info {
	const char *zti_name;
	struct {
		enum zti_modes zti_mode;
		uint_t zti_value;
	} zti_nthreads[ZIO_TASKQ_TYPES];
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "intr"
};

const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
	/*			ISSUE			INTR		*/
	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
};

enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
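/*
 * Note: spa_activate() walks zio_taskqs[] to create one taskq per
 * (zio type, taskq type) pair, named "<zti_name>_<issue|intr>".
 * zti_mode_fixed creates exactly zti_value threads,
 * zti_mode_online_percent sizes the taskq as a percentage of online
 * CPUs (TASKQ_THREADS_CPU_PCT), and zti_mode_tune defers to the
 * zio_taskq_tune_mode/zio_taskq_tune_value tunables above, falling
 * back to online-percent if the tunable itself is left at
 * zti_mode_tune.
 */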
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t used;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		size = spa_get_space(spa);
		used = spa_get_alloc(spa);
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
		    size - used, src);

		cap = (size == 0) ? 0 : (used * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
224 */ 225 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 226 (err = zap_cursor_retrieve(&zc, &za)) == 0; 227 zap_cursor_advance(&zc)) { 228 uint64_t intval = 0; 229 char *strval = NULL; 230 zprop_source_t src = ZPROP_SRC_DEFAULT; 231 zpool_prop_t prop; 232 233 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 234 continue; 235 236 switch (za.za_integer_length) { 237 case 8: 238 /* integer property */ 239 if (za.za_first_integer != 240 zpool_prop_default_numeric(prop)) 241 src = ZPROP_SRC_LOCAL; 242 243 if (prop == ZPOOL_PROP_BOOTFS) { 244 dsl_pool_t *dp; 245 dsl_dataset_t *ds = NULL; 246 247 dp = spa_get_dsl(spa); 248 rw_enter(&dp->dp_config_rwlock, RW_READER); 249 if (err = dsl_dataset_hold_obj(dp, 250 za.za_first_integer, FTAG, &ds)) { 251 rw_exit(&dp->dp_config_rwlock); 252 break; 253 } 254 255 strval = kmem_alloc( 256 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 257 KM_SLEEP); 258 dsl_dataset_name(ds, strval); 259 dsl_dataset_rele(ds, FTAG); 260 rw_exit(&dp->dp_config_rwlock); 261 } else { 262 strval = NULL; 263 intval = za.za_first_integer; 264 } 265 266 spa_prop_add_list(*nvp, prop, strval, intval, src); 267 268 if (strval != NULL) 269 kmem_free(strval, 270 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 271 272 break; 273 274 case 1: 275 /* string property */ 276 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 277 err = zap_lookup(mos, spa->spa_pool_props_object, 278 za.za_name, 1, za.za_num_integers, strval); 279 if (err) { 280 kmem_free(strval, za.za_num_integers); 281 break; 282 } 283 spa_prop_add_list(*nvp, prop, strval, 0, src); 284 kmem_free(strval, za.za_num_integers); 285 break; 286 287 default: 288 break; 289 } 290 } 291 zap_cursor_fini(&zc); 292 mutex_exit(&spa->spa_props_lock); 293 out: 294 if (err && err != ENOENT) { 295 nvlist_free(*nvp); 296 *nvp = NULL; 297 return (err); 298 } 299 300 return (0); 301 } 302 303 /* 304 * Validate the given pool properties nvlist and modify the list 305 * for the property values to be set. 306 */ 307 static int 308 spa_prop_validate(spa_t *spa, nvlist_t *props) 309 { 310 nvpair_t *elem; 311 int error = 0, reset_bootfs = 0; 312 uint64_t objnum; 313 314 elem = NULL; 315 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 316 zpool_prop_t prop; 317 char *propname, *strval; 318 uint64_t intval; 319 objset_t *os; 320 char *slash; 321 322 propname = nvpair_name(elem); 323 324 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 325 return (EINVAL); 326 327 switch (prop) { 328 case ZPOOL_PROP_VERSION: 329 error = nvpair_value_uint64(elem, &intval); 330 if (!error && 331 (intval < spa_version(spa) || intval > SPA_VERSION)) 332 error = EINVAL; 333 break; 334 335 case ZPOOL_PROP_DELEGATION: 336 case ZPOOL_PROP_AUTOREPLACE: 337 case ZPOOL_PROP_LISTSNAPS: 338 case ZPOOL_PROP_AUTOEXPAND: 339 error = nvpair_value_uint64(elem, &intval); 340 if (!error && intval > 1) 341 error = EINVAL; 342 break; 343 344 case ZPOOL_PROP_BOOTFS: 345 /* 346 * If the pool version is less than SPA_VERSION_BOOTFS, 347 * or the pool is still being created (version == 0), 348 * the bootfs property cannot be set. 
349 */ 350 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 351 error = ENOTSUP; 352 break; 353 } 354 355 /* 356 * Make sure the vdev config is bootable 357 */ 358 if (!vdev_is_bootable(spa->spa_root_vdev)) { 359 error = ENOTSUP; 360 break; 361 } 362 363 reset_bootfs = 1; 364 365 error = nvpair_value_string(elem, &strval); 366 367 if (!error) { 368 uint64_t compress; 369 370 if (strval == NULL || strval[0] == '\0') { 371 objnum = zpool_prop_default_numeric( 372 ZPOOL_PROP_BOOTFS); 373 break; 374 } 375 376 if (error = dmu_objset_hold(strval, FTAG, &os)) 377 break; 378 379 /* Must be ZPL and not gzip compressed. */ 380 381 if (dmu_objset_type(os) != DMU_OST_ZFS) { 382 error = ENOTSUP; 383 } else if ((error = dsl_prop_get_integer(strval, 384 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 385 &compress, NULL)) == 0 && 386 !BOOTFS_COMPRESS_VALID(compress)) { 387 error = ENOTSUP; 388 } else { 389 objnum = dmu_objset_id(os); 390 } 391 dmu_objset_rele(os, FTAG); 392 } 393 break; 394 395 case ZPOOL_PROP_FAILUREMODE: 396 error = nvpair_value_uint64(elem, &intval); 397 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 398 intval > ZIO_FAILURE_MODE_PANIC)) 399 error = EINVAL; 400 401 /* 402 * This is a special case which only occurs when 403 * the pool has completely failed. This allows 404 * the user to change the in-core failmode property 405 * without syncing it out to disk (I/Os might 406 * currently be blocked). We do this by returning 407 * EIO to the caller (spa_prop_set) to trick it 408 * into thinking we encountered a property validation 409 * error. 410 */ 411 if (!error && spa_suspended(spa)) { 412 spa->spa_failmode = intval; 413 error = EIO; 414 } 415 break; 416 417 case ZPOOL_PROP_CACHEFILE: 418 if ((error = nvpair_value_string(elem, &strval)) != 0) 419 break; 420 421 if (strval[0] == '\0') 422 break; 423 424 if (strcmp(strval, "none") == 0) 425 break; 426 427 if (strval[0] != '/') { 428 error = EINVAL; 429 break; 430 } 431 432 slash = strrchr(strval, '/'); 433 ASSERT(slash != NULL); 434 435 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 436 strcmp(slash, "/..") == 0) 437 error = EINVAL; 438 break; 439 } 440 441 if (error) 442 break; 443 } 444 445 if (!error && reset_bootfs) { 446 error = nvlist_remove(props, 447 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 448 449 if (!error) { 450 error = nvlist_add_uint64(props, 451 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 452 } 453 } 454 455 return (error); 456 } 457 458 void 459 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 460 { 461 char *cachefile; 462 spa_config_dirent_t *dp; 463 464 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 465 &cachefile) != 0) 466 return; 467 468 dp = kmem_alloc(sizeof (spa_config_dirent_t), 469 KM_SLEEP); 470 471 if (cachefile[0] == '\0') 472 dp->scd_path = spa_strdup(spa_config_path); 473 else if (strcmp(cachefile, "none") == 0) 474 dp->scd_path = NULL; 475 else 476 dp->scd_path = spa_strdup(cachefile); 477 478 list_insert_head(&spa->spa_config_list, dp); 479 if (need_sync) 480 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 481 } 482 483 int 484 spa_prop_set(spa_t *spa, nvlist_t *nvp) 485 { 486 int error; 487 nvpair_t *elem; 488 boolean_t need_sync = B_FALSE; 489 zpool_prop_t prop; 490 491 if ((error = spa_prop_validate(spa, nvp)) != 0) 492 return (error); 493 494 elem = NULL; 495 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 496 if ((prop = zpool_name_to_prop( 497 nvpair_name(elem))) == ZPROP_INVAL) 498 return (EINVAL); 499 500 if (prop == 
ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 501 continue; 502 503 need_sync = B_TRUE; 504 break; 505 } 506 507 if (need_sync) 508 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 509 spa, nvp, 3)); 510 else 511 return (0); 512 } 513 514 /* 515 * If the bootfs property value is dsobj, clear it. 516 */ 517 void 518 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 519 { 520 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 521 VERIFY(zap_remove(spa->spa_meta_objset, 522 spa->spa_pool_props_object, 523 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 524 spa->spa_bootfs = 0; 525 } 526 } 527 528 /* 529 * ========================================================================== 530 * SPA state manipulation (open/create/destroy/import/export) 531 * ========================================================================== 532 */ 533 534 static int 535 spa_error_entry_compare(const void *a, const void *b) 536 { 537 spa_error_entry_t *sa = (spa_error_entry_t *)a; 538 spa_error_entry_t *sb = (spa_error_entry_t *)b; 539 int ret; 540 541 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 542 sizeof (zbookmark_t)); 543 544 if (ret < 0) 545 return (-1); 546 else if (ret > 0) 547 return (1); 548 else 549 return (0); 550 } 551 552 /* 553 * Utility function which retrieves copies of the current logs and 554 * re-initializes them in the process. 555 */ 556 void 557 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 558 { 559 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 560 561 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 562 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 563 564 avl_create(&spa->spa_errlist_scrub, 565 spa_error_entry_compare, sizeof (spa_error_entry_t), 566 offsetof(spa_error_entry_t, se_avl)); 567 avl_create(&spa->spa_errlist_last, 568 spa_error_entry_compare, sizeof (spa_error_entry_t), 569 offsetof(spa_error_entry_t, se_avl)); 570 } 571 572 /* 573 * Activate an uninitialized pool. 
574 */ 575 static void 576 spa_activate(spa_t *spa, int mode) 577 { 578 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 579 580 spa->spa_state = POOL_STATE_ACTIVE; 581 spa->spa_mode = mode; 582 583 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 584 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 585 586 for (int t = 0; t < ZIO_TYPES; t++) { 587 const zio_taskq_info_t *ztip = &zio_taskqs[t]; 588 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 589 enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; 590 uint_t value = ztip->zti_nthreads[q].zti_value; 591 char name[32]; 592 593 (void) snprintf(name, sizeof (name), 594 "%s_%s", ztip->zti_name, zio_taskq_types[q]); 595 596 if (mode == zti_mode_tune) { 597 mode = zio_taskq_tune_mode; 598 value = zio_taskq_tune_value; 599 if (mode == zti_mode_tune) 600 mode = zti_mode_online_percent; 601 } 602 603 switch (mode) { 604 case zti_mode_fixed: 605 ASSERT3U(value, >=, 1); 606 value = MAX(value, 1); 607 608 spa->spa_zio_taskq[t][q] = taskq_create(name, 609 value, maxclsyspri, 50, INT_MAX, 610 TASKQ_PREPOPULATE); 611 break; 612 613 case zti_mode_online_percent: 614 spa->spa_zio_taskq[t][q] = taskq_create(name, 615 value, maxclsyspri, 50, INT_MAX, 616 TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); 617 break; 618 619 case zti_mode_tune: 620 default: 621 panic("unrecognized mode for " 622 "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " 623 "in spa_activate()", 624 t, q, mode, value); 625 break; 626 } 627 } 628 } 629 630 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 631 offsetof(vdev_t, vdev_config_dirty_node)); 632 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 633 offsetof(vdev_t, vdev_state_dirty_node)); 634 635 txg_list_create(&spa->spa_vdev_txg_list, 636 offsetof(struct vdev, vdev_txg_node)); 637 638 avl_create(&spa->spa_errlist_scrub, 639 spa_error_entry_compare, sizeof (spa_error_entry_t), 640 offsetof(spa_error_entry_t, se_avl)); 641 avl_create(&spa->spa_errlist_last, 642 spa_error_entry_compare, sizeof (spa_error_entry_t), 643 offsetof(spa_error_entry_t, se_avl)); 644 } 645 646 /* 647 * Opposite of spa_activate(). 648 */ 649 static void 650 spa_deactivate(spa_t *spa) 651 { 652 ASSERT(spa->spa_sync_on == B_FALSE); 653 ASSERT(spa->spa_dsl_pool == NULL); 654 ASSERT(spa->spa_root_vdev == NULL); 655 ASSERT(spa->spa_async_zio_root == NULL); 656 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 657 658 txg_list_destroy(&spa->spa_vdev_txg_list); 659 660 list_destroy(&spa->spa_config_dirty_list); 661 list_destroy(&spa->spa_state_dirty_list); 662 663 for (int t = 0; t < ZIO_TYPES; t++) { 664 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 665 taskq_destroy(spa->spa_zio_taskq[t][q]); 666 spa->spa_zio_taskq[t][q] = NULL; 667 } 668 } 669 670 metaslab_class_destroy(spa->spa_normal_class); 671 spa->spa_normal_class = NULL; 672 673 metaslab_class_destroy(spa->spa_log_class); 674 spa->spa_log_class = NULL; 675 676 /* 677 * If this was part of an import or the open otherwise failed, we may 678 * still have errors left in the queues. Empty them just in case. 679 */ 680 spa_errlog_drain(spa); 681 682 avl_destroy(&spa->spa_errlist_scrub); 683 avl_destroy(&spa->spa_errlist_last); 684 685 spa->spa_state = POOL_STATE_UNINITIALIZED; 686 } 687 688 /* 689 * Verify a pool configuration, and construct the vdev tree appropriately. This 690 * will create all the necessary vdevs in the appropriate layout, with each vdev 691 * in the CLOSED state. This will prep the pool before open/creation/import. 
692 * All vdev validation is done by the vdev_alloc() routine. 693 */ 694 static int 695 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 696 uint_t id, int atype) 697 { 698 nvlist_t **child; 699 uint_t children; 700 int error; 701 702 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 703 return (error); 704 705 if ((*vdp)->vdev_ops->vdev_op_leaf) 706 return (0); 707 708 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 709 &child, &children); 710 711 if (error == ENOENT) 712 return (0); 713 714 if (error) { 715 vdev_free(*vdp); 716 *vdp = NULL; 717 return (EINVAL); 718 } 719 720 for (int c = 0; c < children; c++) { 721 vdev_t *vd; 722 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 723 atype)) != 0) { 724 vdev_free(*vdp); 725 *vdp = NULL; 726 return (error); 727 } 728 } 729 730 ASSERT(*vdp != NULL); 731 732 return (0); 733 } 734 735 /* 736 * Opposite of spa_load(). 737 */ 738 static void 739 spa_unload(spa_t *spa) 740 { 741 int i; 742 743 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 744 745 /* 746 * Stop async tasks. 747 */ 748 spa_async_suspend(spa); 749 750 /* 751 * Stop syncing. 752 */ 753 if (spa->spa_sync_on) { 754 txg_sync_stop(spa->spa_dsl_pool); 755 spa->spa_sync_on = B_FALSE; 756 } 757 758 /* 759 * Wait for any outstanding async I/O to complete. 760 */ 761 if (spa->spa_async_zio_root != NULL) { 762 (void) zio_wait(spa->spa_async_zio_root); 763 spa->spa_async_zio_root = NULL; 764 } 765 766 /* 767 * Close the dsl pool. 768 */ 769 if (spa->spa_dsl_pool) { 770 dsl_pool_close(spa->spa_dsl_pool); 771 spa->spa_dsl_pool = NULL; 772 } 773 774 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 775 776 /* 777 * Drop and purge level 2 cache 778 */ 779 spa_l2cache_drop(spa); 780 781 /* 782 * Close all vdevs. 783 */ 784 if (spa->spa_root_vdev) 785 vdev_free(spa->spa_root_vdev); 786 ASSERT(spa->spa_root_vdev == NULL); 787 788 for (i = 0; i < spa->spa_spares.sav_count; i++) 789 vdev_free(spa->spa_spares.sav_vdevs[i]); 790 if (spa->spa_spares.sav_vdevs) { 791 kmem_free(spa->spa_spares.sav_vdevs, 792 spa->spa_spares.sav_count * sizeof (void *)); 793 spa->spa_spares.sav_vdevs = NULL; 794 } 795 if (spa->spa_spares.sav_config) { 796 nvlist_free(spa->spa_spares.sav_config); 797 spa->spa_spares.sav_config = NULL; 798 } 799 spa->spa_spares.sav_count = 0; 800 801 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 802 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 803 if (spa->spa_l2cache.sav_vdevs) { 804 kmem_free(spa->spa_l2cache.sav_vdevs, 805 spa->spa_l2cache.sav_count * sizeof (void *)); 806 spa->spa_l2cache.sav_vdevs = NULL; 807 } 808 if (spa->spa_l2cache.sav_config) { 809 nvlist_free(spa->spa_l2cache.sav_config); 810 spa->spa_l2cache.sav_config = NULL; 811 } 812 spa->spa_l2cache.sav_count = 0; 813 814 spa->spa_async_suspended = 0; 815 816 spa_config_exit(spa, SCL_ALL, FTAG); 817 } 818 819 /* 820 * Load (or re-load) the current list of vdevs describing the active spares for 821 * this pool. When this is called, we have some form of basic information in 822 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 823 * then re-generate a more complete list including status information. 824 */ 825 static void 826 spa_load_spares(spa_t *spa) 827 { 828 nvlist_t **spares; 829 uint_t nspares; 830 int i; 831 vdev_t *vd, *tvd; 832 833 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 834 835 /* 836 * First, close and free any existing spare vdevs. 
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.
We parse this into vdevs, try to open them, and 938 * then re-generate a more complete list including status information. 939 * Devices which are already active have their details maintained, and are 940 * not re-opened. 941 */ 942 static void 943 spa_load_l2cache(spa_t *spa) 944 { 945 nvlist_t **l2cache; 946 uint_t nl2cache; 947 int i, j, oldnvdevs; 948 uint64_t guid; 949 vdev_t *vd, **oldvdevs, **newvdevs; 950 spa_aux_vdev_t *sav = &spa->spa_l2cache; 951 952 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 953 954 if (sav->sav_config != NULL) { 955 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 956 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 957 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 958 } else { 959 nl2cache = 0; 960 } 961 962 oldvdevs = sav->sav_vdevs; 963 oldnvdevs = sav->sav_count; 964 sav->sav_vdevs = NULL; 965 sav->sav_count = 0; 966 967 /* 968 * Process new nvlist of vdevs. 969 */ 970 for (i = 0; i < nl2cache; i++) { 971 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 972 &guid) == 0); 973 974 newvdevs[i] = NULL; 975 for (j = 0; j < oldnvdevs; j++) { 976 vd = oldvdevs[j]; 977 if (vd != NULL && guid == vd->vdev_guid) { 978 /* 979 * Retain previous vdev for add/remove ops. 980 */ 981 newvdevs[i] = vd; 982 oldvdevs[j] = NULL; 983 break; 984 } 985 } 986 987 if (newvdevs[i] == NULL) { 988 /* 989 * Create new vdev 990 */ 991 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 992 VDEV_ALLOC_L2CACHE) == 0); 993 ASSERT(vd != NULL); 994 newvdevs[i] = vd; 995 996 /* 997 * Commit this vdev as an l2cache device, 998 * even if it fails to open. 999 */ 1000 spa_l2cache_add(vd); 1001 1002 vd->vdev_top = vd; 1003 vd->vdev_aux = sav; 1004 1005 spa_l2cache_activate(vd); 1006 1007 if (vdev_open(vd) != 0) 1008 continue; 1009 1010 (void) vdev_validate_aux(vd); 1011 1012 if (!vdev_is_dead(vd)) 1013 l2arc_add_vdev(spa, vd); 1014 } 1015 } 1016 1017 /* 1018 * Purge vdevs that were dropped 1019 */ 1020 for (i = 0; i < oldnvdevs; i++) { 1021 uint64_t pool; 1022 1023 vd = oldvdevs[i]; 1024 if (vd != NULL) { 1025 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1026 pool != 0ULL && l2arc_vdev_present(vd)) 1027 l2arc_remove_vdev(vd); 1028 (void) vdev_close(vd); 1029 spa_l2cache_remove(vd); 1030 } 1031 } 1032 1033 if (oldvdevs) 1034 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1035 1036 if (sav->sav_config == NULL) 1037 goto out; 1038 1039 sav->sav_vdevs = newvdevs; 1040 sav->sav_count = (int)nl2cache; 1041 1042 /* 1043 * Recompute the stashed list of l2cache devices, with status 1044 * information this time. 
1045 */ 1046 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1047 DATA_TYPE_NVLIST_ARRAY) == 0); 1048 1049 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1050 for (i = 0; i < sav->sav_count; i++) 1051 l2cache[i] = vdev_config_generate(spa, 1052 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 1053 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1054 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1055 out: 1056 for (i = 0; i < sav->sav_count; i++) 1057 nvlist_free(l2cache[i]); 1058 if (sav->sav_count) 1059 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1060 } 1061 1062 static int 1063 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1064 { 1065 dmu_buf_t *db; 1066 char *packed = NULL; 1067 size_t nvsize = 0; 1068 int error; 1069 *value = NULL; 1070 1071 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1072 nvsize = *(uint64_t *)db->db_data; 1073 dmu_buf_rele(db, FTAG); 1074 1075 packed = kmem_alloc(nvsize, KM_SLEEP); 1076 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1077 DMU_READ_PREFETCH); 1078 if (error == 0) 1079 error = nvlist_unpack(packed, nvsize, value, 0); 1080 kmem_free(packed, nvsize); 1081 1082 return (error); 1083 } 1084 1085 /* 1086 * Checks to see if the given vdev could not be opened, in which case we post a 1087 * sysevent to notify the autoreplace code that the device has been removed. 1088 */ 1089 static void 1090 spa_check_removed(vdev_t *vd) 1091 { 1092 for (int c = 0; c < vd->vdev_children; c++) 1093 spa_check_removed(vd->vdev_child[c]); 1094 1095 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1096 zfs_post_autoreplace(vd->vdev_spa, vd); 1097 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1098 } 1099 } 1100 1101 /* 1102 * Load the slog device state from the config object since it's possible 1103 * that the label does not contain the most up-to-date information. 1104 */ 1105 void 1106 spa_load_log_state(spa_t *spa, nvlist_t *nv) 1107 { 1108 vdev_t *ovd, *rvd = spa->spa_root_vdev; 1109 1110 /* 1111 * Load the original root vdev tree from the passed config. 1112 */ 1113 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1114 VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1115 1116 for (int c = 0; c < rvd->vdev_children; c++) { 1117 vdev_t *cvd = rvd->vdev_child[c]; 1118 if (cvd->vdev_islog) 1119 vdev_load_log_state(cvd, ovd->vdev_child[c]); 1120 } 1121 vdev_free(ovd); 1122 spa_config_exit(spa, SCL_ALL, FTAG); 1123 } 1124 1125 /* 1126 * Check for missing log devices 1127 */ 1128 int 1129 spa_check_logs(spa_t *spa) 1130 { 1131 switch (spa->spa_log_state) { 1132 case SPA_LOG_MISSING: 1133 /* need to recheck in case slog has been restored */ 1134 case SPA_LOG_UNKNOWN: 1135 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1136 DS_FIND_CHILDREN)) { 1137 spa->spa_log_state = SPA_LOG_MISSING; 1138 return (1); 1139 } 1140 break; 1141 } 1142 return (0); 1143 } 1144 1145 static void 1146 spa_aux_check_removed(spa_aux_vdev_t *sav) 1147 { 1148 int i; 1149 1150 for (i = 0; i < sav->sav_count; i++) 1151 spa_check_removed(sav->sav_vdevs[i]); 1152 } 1153 1154 /* 1155 * Load an existing storage pool, using the pool's builtin spa_config as a 1156 * source of configuration information. 
1157 */ 1158 static int 1159 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 1160 { 1161 int error = 0; 1162 nvlist_t *nvconfig, *nvroot = NULL; 1163 vdev_t *rvd; 1164 uberblock_t *ub = &spa->spa_uberblock; 1165 uint64_t config_cache_txg = spa->spa_config_txg; 1166 uint64_t pool_guid; 1167 uint64_t version; 1168 uint64_t autoreplace = 0; 1169 int orig_mode = spa->spa_mode; 1170 char *ereport = FM_EREPORT_ZFS_POOL; 1171 1172 /* 1173 * If this is an untrusted config, access the pool in read-only mode. 1174 * This prevents things like resilvering recently removed devices. 1175 */ 1176 if (!mosconfig) 1177 spa->spa_mode = FREAD; 1178 1179 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1180 1181 spa->spa_load_state = state; 1182 1183 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 1184 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 1185 error = EINVAL; 1186 goto out; 1187 } 1188 1189 /* 1190 * Versioning wasn't explicitly added to the label until later, so if 1191 * it's not present treat it as the initial version. 1192 */ 1193 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 1194 version = SPA_VERSION_INITIAL; 1195 1196 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1197 &spa->spa_config_txg); 1198 1199 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1200 spa_guid_exists(pool_guid, 0)) { 1201 error = EEXIST; 1202 goto out; 1203 } 1204 1205 spa->spa_load_guid = pool_guid; 1206 1207 /* 1208 * Create "The Godfather" zio to hold all async IOs 1209 */ 1210 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1211 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1212 1213 /* 1214 * Parse the configuration into a vdev tree. We explicitly set the 1215 * value that will be returned by spa_version() since parsing the 1216 * configuration requires knowing the version number. 1217 */ 1218 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1219 spa->spa_ubsync.ub_version = version; 1220 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1221 spa_config_exit(spa, SCL_ALL, FTAG); 1222 1223 if (error != 0) 1224 goto out; 1225 1226 ASSERT(spa->spa_root_vdev == rvd); 1227 ASSERT(spa_guid(spa) == pool_guid); 1228 1229 /* 1230 * Try to open all vdevs, loading each label in the process. 1231 */ 1232 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1233 error = vdev_open(rvd); 1234 spa_config_exit(spa, SCL_ALL, FTAG); 1235 if (error != 0) 1236 goto out; 1237 1238 /* 1239 * We need to validate the vdev labels against the configuration that 1240 * we have in hand, which is dependent on the setting of mosconfig. If 1241 * mosconfig is true then we're validating the vdev labels based on 1242 * that config. Otherwise, we're validating against the cached config 1243 * (zpool.cache) that was read when we loaded the zfs module, and then 1244 * later we will recursively call spa_load() and validate against 1245 * the vdev config. 1246 */ 1247 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1248 error = vdev_validate(rvd); 1249 spa_config_exit(spa, SCL_ALL, FTAG); 1250 if (error != 0) 1251 goto out; 1252 1253 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1254 error = ENXIO; 1255 goto out; 1256 } 1257 1258 /* 1259 * Find the best uberblock. 1260 */ 1261 vdev_uberblock_load(NULL, rvd, ub); 1262 1263 /* 1264 * If we weren't able to find a single valid uberblock, return failure. 
1265 */ 1266 if (ub->ub_txg == 0) { 1267 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1268 VDEV_AUX_CORRUPT_DATA); 1269 error = ENXIO; 1270 goto out; 1271 } 1272 1273 /* 1274 * If the pool is newer than the code, we can't open it. 1275 */ 1276 if (ub->ub_version > SPA_VERSION) { 1277 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1278 VDEV_AUX_VERSION_NEWER); 1279 error = ENOTSUP; 1280 goto out; 1281 } 1282 1283 /* 1284 * If the vdev guid sum doesn't match the uberblock, we have an 1285 * incomplete configuration. 1286 */ 1287 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1288 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1289 VDEV_AUX_BAD_GUID_SUM); 1290 error = ENXIO; 1291 goto out; 1292 } 1293 1294 /* 1295 * Initialize internal SPA structures. 1296 */ 1297 spa->spa_state = POOL_STATE_ACTIVE; 1298 spa->spa_ubsync = spa->spa_uberblock; 1299 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1300 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1301 if (error) { 1302 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1303 VDEV_AUX_CORRUPT_DATA); 1304 goto out; 1305 } 1306 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1307 1308 if (zap_lookup(spa->spa_meta_objset, 1309 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1310 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1311 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1312 VDEV_AUX_CORRUPT_DATA); 1313 error = EIO; 1314 goto out; 1315 } 1316 1317 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) { 1318 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1319 VDEV_AUX_CORRUPT_DATA); 1320 error = EIO; 1321 goto out; 1322 } 1323 1324 if (!mosconfig) { 1325 uint64_t hostid; 1326 1327 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 1328 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1329 char *hostname; 1330 unsigned long myhostid = 0; 1331 1332 VERIFY(nvlist_lookup_string(nvconfig, 1333 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1334 1335 #ifdef _KERNEL 1336 myhostid = zone_get_hostid(NULL); 1337 #else /* _KERNEL */ 1338 /* 1339 * We're emulating the system's hostid in userland, so 1340 * we can't use zone_get_hostid(). 1341 */ 1342 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1343 #endif /* _KERNEL */ 1344 if (hostid != 0 && myhostid != 0 && 1345 hostid != myhostid) { 1346 cmn_err(CE_WARN, "pool '%s' could not be " 1347 "loaded as it was last accessed by " 1348 "another system (host: %s hostid: 0x%lx). " 1349 "See: http://www.sun.com/msg/ZFS-8000-EY", 1350 spa_name(spa), hostname, 1351 (unsigned long)hostid); 1352 error = EBADF; 1353 goto out; 1354 } 1355 } 1356 1357 spa_config_set(spa, nvconfig); 1358 spa_unload(spa); 1359 spa_deactivate(spa); 1360 spa_activate(spa, orig_mode); 1361 1362 return (spa_load(spa, nvconfig, state, B_TRUE)); 1363 } 1364 1365 if (zap_lookup(spa->spa_meta_objset, 1366 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1367 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1368 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1369 VDEV_AUX_CORRUPT_DATA); 1370 error = EIO; 1371 goto out; 1372 } 1373 1374 /* 1375 * Load the bit that tells us to use the new accounting function 1376 * (raid-z deflation). If we have an older pool, this will not 1377 * be present. 
1378 */ 1379 error = zap_lookup(spa->spa_meta_objset, 1380 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1381 sizeof (uint64_t), 1, &spa->spa_deflate); 1382 if (error != 0 && error != ENOENT) { 1383 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1384 VDEV_AUX_CORRUPT_DATA); 1385 error = EIO; 1386 goto out; 1387 } 1388 1389 /* 1390 * Load the persistent error log. If we have an older pool, this will 1391 * not be present. 1392 */ 1393 error = zap_lookup(spa->spa_meta_objset, 1394 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1395 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1396 if (error != 0 && error != ENOENT) { 1397 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1398 VDEV_AUX_CORRUPT_DATA); 1399 error = EIO; 1400 goto out; 1401 } 1402 1403 error = zap_lookup(spa->spa_meta_objset, 1404 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1405 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1406 if (error != 0 && error != ENOENT) { 1407 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1408 VDEV_AUX_CORRUPT_DATA); 1409 error = EIO; 1410 goto out; 1411 } 1412 1413 /* 1414 * Load the history object. If we have an older pool, this 1415 * will not be present. 1416 */ 1417 error = zap_lookup(spa->spa_meta_objset, 1418 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1419 sizeof (uint64_t), 1, &spa->spa_history); 1420 if (error != 0 && error != ENOENT) { 1421 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1422 VDEV_AUX_CORRUPT_DATA); 1423 error = EIO; 1424 goto out; 1425 } 1426 1427 /* 1428 * Load any hot spares for this pool. 1429 */ 1430 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1431 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1432 if (error != 0 && error != ENOENT) { 1433 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1434 VDEV_AUX_CORRUPT_DATA); 1435 error = EIO; 1436 goto out; 1437 } 1438 if (error == 0) { 1439 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1440 if (load_nvlist(spa, spa->spa_spares.sav_object, 1441 &spa->spa_spares.sav_config) != 0) { 1442 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1443 VDEV_AUX_CORRUPT_DATA); 1444 error = EIO; 1445 goto out; 1446 } 1447 1448 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1449 spa_load_spares(spa); 1450 spa_config_exit(spa, SCL_ALL, FTAG); 1451 } 1452 1453 /* 1454 * Load any level 2 ARC devices for this pool. 
1455 */ 1456 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1457 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1458 &spa->spa_l2cache.sav_object); 1459 if (error != 0 && error != ENOENT) { 1460 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1461 VDEV_AUX_CORRUPT_DATA); 1462 error = EIO; 1463 goto out; 1464 } 1465 if (error == 0) { 1466 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1467 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1468 &spa->spa_l2cache.sav_config) != 0) { 1469 vdev_set_state(rvd, B_TRUE, 1470 VDEV_STATE_CANT_OPEN, 1471 VDEV_AUX_CORRUPT_DATA); 1472 error = EIO; 1473 goto out; 1474 } 1475 1476 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1477 spa_load_l2cache(spa); 1478 spa_config_exit(spa, SCL_ALL, FTAG); 1479 } 1480 1481 VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, 1482 &nvroot) == 0); 1483 spa_load_log_state(spa, nvroot); 1484 nvlist_free(nvconfig); 1485 1486 if (spa_check_logs(spa)) { 1487 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1488 VDEV_AUX_BAD_LOG); 1489 error = ENXIO; 1490 ereport = FM_EREPORT_ZFS_LOG_REPLAY; 1491 goto out; 1492 } 1493 1494 1495 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1496 1497 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1498 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1499 1500 if (error && error != ENOENT) { 1501 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1502 VDEV_AUX_CORRUPT_DATA); 1503 error = EIO; 1504 goto out; 1505 } 1506 1507 if (error == 0) { 1508 (void) zap_lookup(spa->spa_meta_objset, 1509 spa->spa_pool_props_object, 1510 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1511 sizeof (uint64_t), 1, &spa->spa_bootfs); 1512 (void) zap_lookup(spa->spa_meta_objset, 1513 spa->spa_pool_props_object, 1514 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1515 sizeof (uint64_t), 1, &autoreplace); 1516 spa->spa_autoreplace = (autoreplace != 0); 1517 (void) zap_lookup(spa->spa_meta_objset, 1518 spa->spa_pool_props_object, 1519 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1520 sizeof (uint64_t), 1, &spa->spa_delegation); 1521 (void) zap_lookup(spa->spa_meta_objset, 1522 spa->spa_pool_props_object, 1523 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1524 sizeof (uint64_t), 1, &spa->spa_failmode); 1525 (void) zap_lookup(spa->spa_meta_objset, 1526 spa->spa_pool_props_object, 1527 zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1528 sizeof (uint64_t), 1, &spa->spa_autoexpand); 1529 } 1530 1531 /* 1532 * If the 'autoreplace' property is set, then post a resource notifying 1533 * the ZFS DE that it should not issue any faults for unopenable 1534 * devices. We also iterate over the vdevs, and post a sysevent for any 1535 * unopenable vdevs so that the normal autoreplace handler can take 1536 * over. 1537 */ 1538 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 1539 spa_check_removed(spa->spa_root_vdev); 1540 /* 1541 * For the import case, this is done in spa_import(), because 1542 * at this point we're using the spare definitions from 1543 * the MOS config, not necessarily from the userland config. 1544 */ 1545 if (state != SPA_LOAD_IMPORT) { 1546 spa_aux_check_removed(&spa->spa_spares); 1547 spa_aux_check_removed(&spa->spa_l2cache); 1548 } 1549 } 1550 1551 /* 1552 * Load the vdev state for all toplevel vdevs. 1553 */ 1554 vdev_load(rvd); 1555 1556 /* 1557 * Propagate the leaf DTLs we just loaded all the way up the tree. 
1558 */ 1559 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1560 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1561 spa_config_exit(spa, SCL_ALL, FTAG); 1562 1563 /* 1564 * Check the state of the root vdev. If it can't be opened, it 1565 * indicates one or more toplevel vdevs are faulted. 1566 */ 1567 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1568 error = ENXIO; 1569 goto out; 1570 } 1571 1572 if (spa_writeable(spa)) { 1573 dmu_tx_t *tx; 1574 int need_update = B_FALSE; 1575 1576 ASSERT(state != SPA_LOAD_TRYIMPORT); 1577 1578 /* 1579 * Claim log blocks that haven't been committed yet. 1580 * This must all happen in a single txg. 1581 */ 1582 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1583 spa_first_txg(spa)); 1584 (void) dmu_objset_find(spa_name(spa), 1585 zil_claim, tx, DS_FIND_CHILDREN); 1586 dmu_tx_commit(tx); 1587 1588 spa->spa_log_state = SPA_LOG_GOOD; 1589 spa->spa_sync_on = B_TRUE; 1590 txg_sync_start(spa->spa_dsl_pool); 1591 1592 /* 1593 * Wait for all claims to sync. 1594 */ 1595 txg_wait_synced(spa->spa_dsl_pool, 0); 1596 1597 /* 1598 * If the config cache is stale, or we have uninitialized 1599 * metaslabs (see spa_vdev_add()), then update the config. 1600 * 1601 * If spa_load_verbatim is true, trust the current 1602 * in-core spa_config and update the disk labels. 1603 */ 1604 if (config_cache_txg != spa->spa_config_txg || 1605 state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) 1606 need_update = B_TRUE; 1607 1608 for (int c = 0; c < rvd->vdev_children; c++) 1609 if (rvd->vdev_child[c]->vdev_ms_array == 0) 1610 need_update = B_TRUE; 1611 1612 /* 1613 * Update the config cache asychronously in case we're the 1614 * root pool, in which case the config cache isn't writable yet. 1615 */ 1616 if (need_update) 1617 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1618 1619 /* 1620 * Check all DTLs to see if anything needs resilvering. 1621 */ 1622 if (vdev_resilver_needed(rvd, NULL, NULL)) 1623 spa_async_request(spa, SPA_ASYNC_RESILVER); 1624 1625 /* 1626 * Delete any inconsistent datasets. 1627 */ 1628 (void) dmu_objset_find(spa_name(spa), 1629 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 1630 1631 /* 1632 * Clean up any stale temporary dataset userrefs. 1633 */ 1634 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 1635 } 1636 1637 error = 0; 1638 out: 1639 spa->spa_minref = refcount_count(&spa->spa_refcount); 1640 if (error && error != EBADF) 1641 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1642 spa->spa_load_state = SPA_LOAD_NONE; 1643 spa->spa_ena = 0; 1644 1645 return (error); 1646 } 1647 1648 /* 1649 * Pool Open/Import 1650 * 1651 * The import case is identical to an open except that the configuration is sent 1652 * down from userland, instead of grabbed from the configuration cache. For the 1653 * case of an open, the pool configuration will exist in the 1654 * POOL_STATE_UNINITIALIZED state. 1655 * 1656 * The stats information (gen/count/ustats) is used to gather vdev statistics at 1657 * the same time open the pool, without having to keep around the spa_t in some 1658 * ambiguous state. 1659 */ 1660 static int 1661 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1662 { 1663 spa_t *spa; 1664 int error; 1665 int locked = B_FALSE; 1666 1667 *spapp = NULL; 1668 1669 /* 1670 * As disgusting as this is, we need to support recursive calls to this 1671 * function because dsl_dir_open() is called during spa_load(), and ends 1672 * up calling spa_open() again. 
The real fix is to figure out how to 1673 * avoid dsl_dir_open() calling this in the first place. 1674 */ 1675 if (mutex_owner(&spa_namespace_lock) != curthread) { 1676 mutex_enter(&spa_namespace_lock); 1677 locked = B_TRUE; 1678 } 1679 1680 if ((spa = spa_lookup(pool)) == NULL) { 1681 if (locked) 1682 mutex_exit(&spa_namespace_lock); 1683 return (ENOENT); 1684 } 1685 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1686 1687 spa_activate(spa, spa_mode_global); 1688 1689 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1690 1691 if (error == EBADF) { 1692 /* 1693 * If vdev_validate() returns failure (indicated by 1694 * EBADF), it indicates that one of the vdevs indicates 1695 * that the pool has been exported or destroyed. If 1696 * this is the case, the config cache is out of sync and 1697 * we should remove the pool from the namespace. 1698 */ 1699 spa_unload(spa); 1700 spa_deactivate(spa); 1701 spa_config_sync(spa, B_TRUE, B_TRUE); 1702 spa_remove(spa); 1703 if (locked) 1704 mutex_exit(&spa_namespace_lock); 1705 return (ENOENT); 1706 } 1707 1708 if (error) { 1709 /* 1710 * We can't open the pool, but we still have useful 1711 * information: the state of each vdev after the 1712 * attempted vdev_open(). Return this to the user. 1713 */ 1714 if (config != NULL && spa->spa_root_vdev != NULL) 1715 *config = spa_config_generate(spa, NULL, -1ULL, 1716 B_TRUE); 1717 spa_unload(spa); 1718 spa_deactivate(spa); 1719 spa->spa_last_open_failed = B_TRUE; 1720 if (locked) 1721 mutex_exit(&spa_namespace_lock); 1722 *spapp = NULL; 1723 return (error); 1724 } else { 1725 spa->spa_last_open_failed = B_FALSE; 1726 } 1727 } 1728 1729 spa_open_ref(spa, tag); 1730 1731 if (locked) 1732 mutex_exit(&spa_namespace_lock); 1733 1734 *spapp = spa; 1735 1736 if (config != NULL) 1737 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1738 1739 return (0); 1740 } 1741 1742 int 1743 spa_open(const char *name, spa_t **spapp, void *tag) 1744 { 1745 return (spa_open_common(name, spapp, tag, NULL)); 1746 } 1747 1748 /* 1749 * Lookup the given spa_t, incrementing the inject count in the process, 1750 * preventing it from being exported or destroyed. 1751 */ 1752 spa_t * 1753 spa_inject_addref(char *name) 1754 { 1755 spa_t *spa; 1756 1757 mutex_enter(&spa_namespace_lock); 1758 if ((spa = spa_lookup(name)) == NULL) { 1759 mutex_exit(&spa_namespace_lock); 1760 return (NULL); 1761 } 1762 spa->spa_inject_ref++; 1763 mutex_exit(&spa_namespace_lock); 1764 1765 return (spa); 1766 } 1767 1768 void 1769 spa_inject_delref(spa_t *spa) 1770 { 1771 mutex_enter(&spa_namespace_lock); 1772 spa->spa_inject_ref--; 1773 mutex_exit(&spa_namespace_lock); 1774 } 1775 1776 /* 1777 * Add spares device information to the nvlist. 
1778 */ 1779 static void 1780 spa_add_spares(spa_t *spa, nvlist_t *config) 1781 { 1782 nvlist_t **spares; 1783 uint_t i, nspares; 1784 nvlist_t *nvroot; 1785 uint64_t guid; 1786 vdev_stat_t *vs; 1787 uint_t vsc; 1788 uint64_t pool; 1789 1790 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1791 1792 if (spa->spa_spares.sav_count == 0) 1793 return; 1794 1795 VERIFY(nvlist_lookup_nvlist(config, 1796 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1797 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1798 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1799 if (nspares != 0) { 1800 VERIFY(nvlist_add_nvlist_array(nvroot, 1801 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1802 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1803 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1804 1805 /* 1806 * Go through and find any spares which have since been 1807 * repurposed as an active spare. If this is the case, update 1808 * their status appropriately. 1809 */ 1810 for (i = 0; i < nspares; i++) { 1811 VERIFY(nvlist_lookup_uint64(spares[i], 1812 ZPOOL_CONFIG_GUID, &guid) == 0); 1813 if (spa_spare_exists(guid, &pool, NULL) && 1814 pool != 0ULL) { 1815 VERIFY(nvlist_lookup_uint64_array( 1816 spares[i], ZPOOL_CONFIG_STATS, 1817 (uint64_t **)&vs, &vsc) == 0); 1818 vs->vs_state = VDEV_STATE_CANT_OPEN; 1819 vs->vs_aux = VDEV_AUX_SPARED; 1820 } 1821 } 1822 } 1823 } 1824 1825 /* 1826 * Add l2cache device information to the nvlist, including vdev stats. 1827 */ 1828 static void 1829 spa_add_l2cache(spa_t *spa, nvlist_t *config) 1830 { 1831 nvlist_t **l2cache; 1832 uint_t i, j, nl2cache; 1833 nvlist_t *nvroot; 1834 uint64_t guid; 1835 vdev_t *vd; 1836 vdev_stat_t *vs; 1837 uint_t vsc; 1838 1839 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1840 1841 if (spa->spa_l2cache.sav_count == 0) 1842 return; 1843 1844 VERIFY(nvlist_lookup_nvlist(config, 1845 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1846 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1847 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1848 if (nl2cache != 0) { 1849 VERIFY(nvlist_add_nvlist_array(nvroot, 1850 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1851 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1852 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1853 1854 /* 1855 * Update level 2 cache device stats. 1856 */ 1857 1858 for (i = 0; i < nl2cache; i++) { 1859 VERIFY(nvlist_lookup_uint64(l2cache[i], 1860 ZPOOL_CONFIG_GUID, &guid) == 0); 1861 1862 vd = NULL; 1863 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1864 if (guid == 1865 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1866 vd = spa->spa_l2cache.sav_vdevs[j]; 1867 break; 1868 } 1869 } 1870 ASSERT(vd != NULL); 1871 1872 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1873 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1874 vdev_get_stats(vd, vs); 1875 } 1876 } 1877 } 1878 1879 int 1880 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1881 { 1882 int error; 1883 spa_t *spa; 1884 1885 *config = NULL; 1886 error = spa_open_common(name, &spa, FTAG, config); 1887 1888 if (spa != NULL) { 1889 /* 1890 * This still leaves a window of inconsistency where the spares 1891 * or l2cache devices could change and the config would be 1892 * self-inconsistent. 
1893 */ 1894 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 1895 1896 if (*config != NULL) { 1897 VERIFY(nvlist_add_uint64(*config, 1898 ZPOOL_CONFIG_ERRCOUNT, 1899 spa_get_errlog_size(spa)) == 0); 1900 1901 if (spa_suspended(spa)) 1902 VERIFY(nvlist_add_uint64(*config, 1903 ZPOOL_CONFIG_SUSPENDED, 1904 spa->spa_failmode) == 0); 1905 1906 spa_add_spares(spa, *config); 1907 spa_add_l2cache(spa, *config); 1908 } 1909 } 1910 1911 /* 1912 * We want to get the alternate root even for faulted pools, so we cheat 1913 * and call spa_lookup() directly. 1914 */ 1915 if (altroot) { 1916 if (spa == NULL) { 1917 mutex_enter(&spa_namespace_lock); 1918 spa = spa_lookup(name); 1919 if (spa) 1920 spa_altroot(spa, altroot, buflen); 1921 else 1922 altroot[0] = '\0'; 1923 spa = NULL; 1924 mutex_exit(&spa_namespace_lock); 1925 } else { 1926 spa_altroot(spa, altroot, buflen); 1927 } 1928 } 1929 1930 if (spa != NULL) { 1931 spa_config_exit(spa, SCL_CONFIG, FTAG); 1932 spa_close(spa, FTAG); 1933 } 1934 1935 return (error); 1936 } 1937 1938 /* 1939 * Validate that the auxiliary device array is well formed. We must have an 1940 * array of nvlists, each which describes a valid leaf vdev. If this is an 1941 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 1942 * specified, as long as they are well-formed. 1943 */ 1944 static int 1945 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 1946 spa_aux_vdev_t *sav, const char *config, uint64_t version, 1947 vdev_labeltype_t label) 1948 { 1949 nvlist_t **dev; 1950 uint_t i, ndev; 1951 vdev_t *vd; 1952 int error; 1953 1954 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1955 1956 /* 1957 * It's acceptable to have no devs specified. 1958 */ 1959 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 1960 return (0); 1961 1962 if (ndev == 0) 1963 return (EINVAL); 1964 1965 /* 1966 * Make sure the pool is formatted with a version that supports this 1967 * device type. 1968 */ 1969 if (spa_version(spa) < version) 1970 return (ENOTSUP); 1971 1972 /* 1973 * Set the pending device list so we correctly handle device in-use 1974 * checking. 1975 */ 1976 sav->sav_pending = dev; 1977 sav->sav_npending = ndev; 1978 1979 for (i = 0; i < ndev; i++) { 1980 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 1981 mode)) != 0) 1982 goto out; 1983 1984 if (!vd->vdev_ops->vdev_op_leaf) { 1985 vdev_free(vd); 1986 error = EINVAL; 1987 goto out; 1988 } 1989 1990 /* 1991 * The L2ARC currently only supports disk devices in 1992 * kernel context. For user-level testing, we allow it. 
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
2080 */ 2081 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2082 KM_SLEEP) == 0); 2083 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2084 devs, ndevs) == 0); 2085 } 2086 } 2087 2088 /* 2089 * Stop and drop level 2 ARC devices 2090 */ 2091 void 2092 spa_l2cache_drop(spa_t *spa) 2093 { 2094 vdev_t *vd; 2095 int i; 2096 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2097 2098 for (i = 0; i < sav->sav_count; i++) { 2099 uint64_t pool; 2100 2101 vd = sav->sav_vdevs[i]; 2102 ASSERT(vd != NULL); 2103 2104 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2105 pool != 0ULL && l2arc_vdev_present(vd)) 2106 l2arc_remove_vdev(vd); 2107 if (vd->vdev_isl2cache) 2108 spa_l2cache_remove(vd); 2109 vdev_clear_stats(vd); 2110 (void) vdev_close(vd); 2111 } 2112 } 2113 2114 /* 2115 * Pool Creation 2116 */ 2117 int 2118 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2119 const char *history_str, nvlist_t *zplprops) 2120 { 2121 spa_t *spa; 2122 char *altroot = NULL; 2123 vdev_t *rvd; 2124 dsl_pool_t *dp; 2125 dmu_tx_t *tx; 2126 int error = 0; 2127 uint64_t txg = TXG_INITIAL; 2128 nvlist_t **spares, **l2cache; 2129 uint_t nspares, nl2cache; 2130 uint64_t version; 2131 2132 /* 2133 * If this pool already exists, return failure. 2134 */ 2135 mutex_enter(&spa_namespace_lock); 2136 if (spa_lookup(pool) != NULL) { 2137 mutex_exit(&spa_namespace_lock); 2138 return (EEXIST); 2139 } 2140 2141 /* 2142 * Allocate a new spa_t structure. 2143 */ 2144 (void) nvlist_lookup_string(props, 2145 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2146 spa = spa_add(pool, altroot); 2147 spa_activate(spa, spa_mode_global); 2148 2149 spa->spa_uberblock.ub_txg = txg - 1; 2150 2151 if (props && (error = spa_prop_validate(spa, props))) { 2152 spa_deactivate(spa); 2153 spa_remove(spa); 2154 mutex_exit(&spa_namespace_lock); 2155 return (error); 2156 } 2157 2158 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2159 &version) != 0) 2160 version = SPA_VERSION; 2161 ASSERT(version <= SPA_VERSION); 2162 spa->spa_uberblock.ub_version = version; 2163 spa->spa_ubsync = spa->spa_uberblock; 2164 2165 /* 2166 * Create "The Godfather" zio to hold all async IOs 2167 */ 2168 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2169 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2170 2171 /* 2172 * Create the root vdev. 2173 */ 2174 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2175 2176 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2177 2178 ASSERT(error != 0 || rvd != NULL); 2179 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2180 2181 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2182 error = EINVAL; 2183 2184 if (error == 0 && 2185 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2186 (error = spa_validate_aux(spa, nvroot, txg, 2187 VDEV_ALLOC_ADD)) == 0) { 2188 for (int c = 0; c < rvd->vdev_children; c++) { 2189 vdev_metaslab_set_size(rvd->vdev_child[c]); 2190 vdev_expand(rvd->vdev_child[c], txg); 2191 } 2192 } 2193 2194 spa_config_exit(spa, SCL_ALL, FTAG); 2195 2196 if (error != 0) { 2197 spa_unload(spa); 2198 spa_deactivate(spa); 2199 spa_remove(spa); 2200 mutex_exit(&spa_namespace_lock); 2201 return (error); 2202 } 2203 2204 /* 2205 * Get the list of spares, if specified. 
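 *
 * As a rough sketch (illustrative only, not taken from this file), the
 * caller-supplied nvroot carries these as an array of leaf-vdev nvlists
 * under ZPOOL_CONFIG_SPARES, built along these lines; ZPOOL_CONFIG_PATH
 * and the device path shown are assumptions made for illustration:
 *
 *	nvlist_t *spare;
 *	VERIFY(nvlist_alloc(&spare, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c2t0d0s0") == 0);
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 *	    &spare, 1) == 0);
 *	nvlist_free(spare);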
2206 */ 2207 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2208 &spares, &nspares) == 0) { 2209 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2210 KM_SLEEP) == 0); 2211 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2212 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2213 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2214 spa_load_spares(spa); 2215 spa_config_exit(spa, SCL_ALL, FTAG); 2216 spa->spa_spares.sav_sync = B_TRUE; 2217 } 2218 2219 /* 2220 * Get the list of level 2 cache devices, if specified. 2221 */ 2222 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2223 &l2cache, &nl2cache) == 0) { 2224 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2225 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2226 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2227 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2228 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2229 spa_load_l2cache(spa); 2230 spa_config_exit(spa, SCL_ALL, FTAG); 2231 spa->spa_l2cache.sav_sync = B_TRUE; 2232 } 2233 2234 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2235 spa->spa_meta_objset = dp->dp_meta_objset; 2236 2237 tx = dmu_tx_create_assigned(dp, txg); 2238 2239 /* 2240 * Create the pool config object. 2241 */ 2242 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2243 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2244 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2245 2246 if (zap_add(spa->spa_meta_objset, 2247 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2248 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2249 cmn_err(CE_PANIC, "failed to add pool config"); 2250 } 2251 2252 /* Newly created pools with the right version are always deflated. */ 2253 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2254 spa->spa_deflate = TRUE; 2255 if (zap_add(spa->spa_meta_objset, 2256 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2257 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2258 cmn_err(CE_PANIC, "failed to add deflate"); 2259 } 2260 } 2261 2262 /* 2263 * Create the deferred-free bplist object. Turn off compression 2264 * because sync-to-convergence takes longer if the blocksize 2265 * keeps changing. 2266 */ 2267 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2268 1 << 14, tx); 2269 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2270 ZIO_COMPRESS_OFF, tx); 2271 2272 if (zap_add(spa->spa_meta_objset, 2273 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2274 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2275 cmn_err(CE_PANIC, "failed to add bplist"); 2276 } 2277 2278 /* 2279 * Create the pool's history object. 2280 */ 2281 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2282 spa_history_create_obj(spa, tx); 2283 2284 /* 2285 * Set pool properties. 2286 */ 2287 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2288 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2289 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2290 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2291 if (props != NULL) { 2292 spa_configfile_set(spa, props, B_FALSE); 2293 spa_sync_props(spa, props, CRED(), tx); 2294 } 2295 2296 dmu_tx_commit(tx); 2297 2298 spa->spa_sync_on = B_TRUE; 2299 txg_sync_start(spa->spa_dsl_pool); 2300 2301 /* 2302 * We explicitly wait for the first transaction to complete so that our 2303 * bean counters are appropriately updated. 
2304 */ 2305 txg_wait_synced(spa->spa_dsl_pool, txg); 2306 2307 spa_config_sync(spa, B_FALSE, B_TRUE); 2308 2309 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2310 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2311 spa_history_log_version(spa, LOG_POOL_CREATE); 2312 2313 spa->spa_minref = refcount_count(&spa->spa_refcount); 2314 2315 mutex_exit(&spa_namespace_lock); 2316 2317 return (0); 2318 } 2319 2320 #ifdef _KERNEL 2321 /* 2322 * Get the root pool information from the root disk, then import the root pool 2323 * at system boot time. 2324 */ 2325 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2326 2327 static nvlist_t * 2328 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 2329 { 2330 nvlist_t *config; 2331 nvlist_t *nvtop, *nvroot; 2332 uint64_t pgid; 2333 2334 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 2335 return (NULL); 2336 2337 /* 2338 * Add this top-level vdev to the child array. 2339 */ 2340 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2341 &nvtop) == 0); 2342 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 2343 &pgid) == 0); 2344 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 2345 2346 /* 2347 * Put this pool's top-level vdevs into a root vdev. 2348 */ 2349 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2350 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 2351 VDEV_TYPE_ROOT) == 0); 2352 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2353 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2354 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2355 &nvtop, 1) == 0); 2356 2357 /* 2358 * Replace the existing vdev_tree with the new root vdev in 2359 * this pool's configuration (remove the old, add the new). 2360 */ 2361 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2362 nvlist_free(nvroot); 2363 return (config); 2364 } 2365 2366 /* 2367 * Walk the vdev tree and see if we can find a device with "better" 2368 * configuration. A configuration is "better" if the label on that 2369 * device has a more recent txg. 2370 */ 2371 static void 2372 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 2373 { 2374 for (int c = 0; c < vd->vdev_children; c++) 2375 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 2376 2377 if (vd->vdev_ops->vdev_op_leaf) { 2378 nvlist_t *label; 2379 uint64_t label_txg; 2380 2381 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 2382 &label) != 0) 2383 return; 2384 2385 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 2386 &label_txg) == 0); 2387 2388 /* 2389 * Do we have a better boot device? 2390 */ 2391 if (label_txg > *txg) { 2392 *txg = label_txg; 2393 *avd = vd; 2394 } 2395 nvlist_free(label); 2396 } 2397 } 2398 2399 /* 2400 * Import a root pool. 2401 * 2402 * For x86, devpath_list will consist of the devid and/or physpath name of 2403 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2404 * The GRUB "findroot" command will return the vdev we should boot. 2405 * 2406 * For SPARC, devpath_list consists of the physpath name of the booting device, 2407 * whether the root pool is a single-device pool or a mirrored pool. 2408 * e.g.
2409 * "/pci@1f,0/ide@d/disk@0,0:a" 2410 */ 2411 int 2412 spa_import_rootpool(char *devpath, char *devid) 2413 { 2414 spa_t *spa; 2415 vdev_t *rvd, *bvd, *avd = NULL; 2416 nvlist_t *config, *nvtop; 2417 uint64_t guid, txg; 2418 char *pname; 2419 int error; 2420 2421 /* 2422 * Read the label from the boot device and generate a configuration. 2423 */ 2424 config = spa_generate_rootconf(devpath, devid, &guid); 2425 #if defined(_OBP) && defined(_KERNEL) 2426 if (config == NULL) { 2427 if (strstr(devpath, "/iscsi/ssd") != NULL) { 2428 /* iscsi boot */ 2429 get_iscsi_bootpath_phy(devpath); 2430 config = spa_generate_rootconf(devpath, devid, &guid); 2431 } 2432 } 2433 #endif 2434 if (config == NULL) { 2435 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 2436 devpath); 2437 return (EIO); 2438 } 2439 2440 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 2441 &pname) == 0); 2442 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2443 2444 mutex_enter(&spa_namespace_lock); 2445 if ((spa = spa_lookup(pname)) != NULL) { 2446 /* 2447 * Remove the existing root pool from the namespace so that we 2448 * can replace it with the correct config we just read in. 2449 */ 2450 spa_remove(spa); 2451 } 2452 2453 spa = spa_add(pname, NULL); 2454 spa->spa_is_root = B_TRUE; 2455 spa->spa_load_verbatim = B_TRUE; 2456 2457 /* 2458 * Build up a vdev tree based on the boot device's label config. 2459 */ 2460 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2461 &nvtop) == 0); 2462 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2463 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 2464 VDEV_ALLOC_ROOTPOOL); 2465 spa_config_exit(spa, SCL_ALL, FTAG); 2466 if (error) { 2467 mutex_exit(&spa_namespace_lock); 2468 nvlist_free(config); 2469 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 2470 pname); 2471 return (error); 2472 } 2473 2474 /* 2475 * Get the boot vdev. 2476 */ 2477 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2478 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 2479 (u_longlong_t)guid); 2480 error = ENOENT; 2481 goto out; 2482 } 2483 2484 /* 2485 * Determine if there is a better boot device. 2486 */ 2487 avd = bvd; 2488 spa_alt_rootvdev(rvd, &avd, &txg); 2489 if (avd != bvd) { 2490 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 2491 "try booting from '%s'", avd->vdev_path); 2492 error = EINVAL; 2493 goto out; 2494 } 2495 2496 /* 2497 * If the boot device is part of a spare vdev then ensure that 2498 * we're booting off the active spare. 2499 */ 2500 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2501 !bvd->vdev_isspare) { 2502 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 2503 "try booting from '%s'", 2504 bvd->vdev_parent->vdev_child[1]->vdev_path); 2505 error = EINVAL; 2506 goto out; 2507 } 2508 2509 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2510 error = 0; 2511 spa_history_log_version(spa, LOG_POOL_IMPORT); 2512 out: 2513 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2514 vdev_free(rvd); 2515 spa_config_exit(spa, SCL_ALL, FTAG); 2516 mutex_exit(&spa_namespace_lock); 2517 2518 nvlist_free(config); 2519 return (error); 2520 } 2521 2522 #endif 2523 2524 /* 2525 * Take a pool and insert it into the namespace as if it had been loaded at 2526 * boot. 
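 *
 * A minimal usage sketch, assuming the caller already holds a trusted
 * config nvlist (the pool name "tank" is illustrative only):
 *
 *	error = spa_import_verbatim("tank", config, NULL);
 *	if (error == EEXIST)
 *		cmn_err(CE_NOTE, "pool 'tank' is already in the namespace");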
2527 */ 2528 int 2529 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2530 { 2531 spa_t *spa; 2532 char *altroot = NULL; 2533 2534 mutex_enter(&spa_namespace_lock); 2535 if (spa_lookup(pool) != NULL) { 2536 mutex_exit(&spa_namespace_lock); 2537 return (EEXIST); 2538 } 2539 2540 (void) nvlist_lookup_string(props, 2541 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2542 spa = spa_add(pool, altroot); 2543 2544 spa->spa_load_verbatim = B_TRUE; 2545 2546 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2547 2548 if (props != NULL) 2549 spa_configfile_set(spa, props, B_FALSE); 2550 2551 spa_config_sync(spa, B_FALSE, B_TRUE); 2552 2553 mutex_exit(&spa_namespace_lock); 2554 spa_history_log_version(spa, LOG_POOL_IMPORT); 2555 2556 return (0); 2557 } 2558 2559 /* 2560 * Import a non-root pool into the system. 2561 */ 2562 int 2563 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2564 { 2565 spa_t *spa; 2566 char *altroot = NULL; 2567 int error; 2568 nvlist_t *nvroot; 2569 nvlist_t **spares, **l2cache; 2570 uint_t nspares, nl2cache; 2571 2572 /* 2573 * If a pool with this name exists, return failure. 2574 */ 2575 mutex_enter(&spa_namespace_lock); 2576 if ((spa = spa_lookup(pool)) != NULL) { 2577 mutex_exit(&spa_namespace_lock); 2578 return (EEXIST); 2579 } 2580 2581 /* 2582 * Create and initialize the spa structure. 2583 */ 2584 (void) nvlist_lookup_string(props, 2585 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2586 spa = spa_add(pool, altroot); 2587 spa_activate(spa, spa_mode_global); 2588 2589 /* 2590 * Don't start async tasks until we know everything is healthy. 2591 */ 2592 spa_async_suspend(spa); 2593 2594 /* 2595 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2596 * because the user-supplied config is actually the one to trust when 2597 * doing an import. 2598 */ 2599 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 2600 2601 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2602 /* 2603 * Toss any existing sparelist, as it doesn't have any validity 2604 * anymore, and conflicts with spa_has_spare(). 2605 */ 2606 if (spa->spa_spares.sav_config) { 2607 nvlist_free(spa->spa_spares.sav_config); 2608 spa->spa_spares.sav_config = NULL; 2609 spa_load_spares(spa); 2610 } 2611 if (spa->spa_l2cache.sav_config) { 2612 nvlist_free(spa->spa_l2cache.sav_config); 2613 spa->spa_l2cache.sav_config = NULL; 2614 spa_load_l2cache(spa); 2615 } 2616 2617 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2618 &nvroot) == 0); 2619 if (error == 0) 2620 error = spa_validate_aux(spa, nvroot, -1ULL, 2621 VDEV_ALLOC_SPARE); 2622 if (error == 0) 2623 error = spa_validate_aux(spa, nvroot, -1ULL, 2624 VDEV_ALLOC_L2CACHE); 2625 spa_config_exit(spa, SCL_ALL, FTAG); 2626 2627 if (props != NULL) 2628 spa_configfile_set(spa, props, B_FALSE); 2629 2630 if (error != 0 || (props && spa_writeable(spa) && 2631 (error = spa_prop_set(spa, props)))) { 2632 spa_unload(spa); 2633 spa_deactivate(spa); 2634 spa_remove(spa); 2635 mutex_exit(&spa_namespace_lock); 2636 return (error); 2637 } 2638 2639 spa_async_resume(spa); 2640 2641 /* 2642 * Override any spares and level 2 cache devices as specified by 2643 * the user, as these may have correct device names/devids, etc. 
2644 */ 2645 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2646 &spares, &nspares) == 0) { 2647 if (spa->spa_spares.sav_config) 2648 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2649 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2650 else 2651 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2652 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2653 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2654 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2655 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2656 spa_load_spares(spa); 2657 spa_config_exit(spa, SCL_ALL, FTAG); 2658 spa->spa_spares.sav_sync = B_TRUE; 2659 } 2660 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2661 &l2cache, &nl2cache) == 0) { 2662 if (spa->spa_l2cache.sav_config) 2663 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2664 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2665 else 2666 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2667 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2668 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2669 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2670 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2671 spa_load_l2cache(spa); 2672 spa_config_exit(spa, SCL_ALL, FTAG); 2673 spa->spa_l2cache.sav_sync = B_TRUE; 2674 } 2675 2676 /* 2677 * Check for any removed devices. 2678 */ 2679 if (spa->spa_autoreplace) { 2680 spa_aux_check_removed(&spa->spa_spares); 2681 spa_aux_check_removed(&spa->spa_l2cache); 2682 } 2683 2684 if (spa_writeable(spa)) { 2685 /* 2686 * Update the config cache to include the newly-imported pool. 2687 */ 2688 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2689 } 2690 2691 /* 2692 * It's possible that the pool was expanded while it was exported. 2693 * We kick off an async task to handle this for us. 2694 */ 2695 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 2696 2697 mutex_exit(&spa_namespace_lock); 2698 spa_history_log_version(spa, LOG_POOL_IMPORT); 2699 2700 return (0); 2701 } 2702 2703 2704 /* 2705 * This (illegal) pool name is used when temporarily importing a spa_t in order 2706 * to get the vdev stats associated with the imported devices. 2707 */ 2708 #define TRYIMPORT_NAME "$import" 2709 2710 nvlist_t * 2711 spa_tryimport(nvlist_t *tryconfig) 2712 { 2713 nvlist_t *config = NULL; 2714 char *poolname; 2715 spa_t *spa; 2716 uint64_t state; 2717 int error; 2718 2719 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2720 return (NULL); 2721 2722 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2723 return (NULL); 2724 2725 /* 2726 * Create and initialize the spa structure. 2727 */ 2728 mutex_enter(&spa_namespace_lock); 2729 spa = spa_add(TRYIMPORT_NAME, NULL); 2730 spa_activate(spa, FREAD); 2731 2732 /* 2733 * Pass off the heavy lifting to spa_load(). 2734 * Pass TRUE for mosconfig because the user-supplied config 2735 * is actually the one to trust when doing an import. 2736 */ 2737 error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2738 2739 /* 2740 * If 'tryconfig' was at least parsable, return the current config. 
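 *
 * A caller-side sketch (declarations omitted; illustrative only). The
 * returned nvlist is owned by the caller and must be freed:
 *
 *	config = spa_tryimport(tryconfig);
 *	if (config != NULL) {
 *		VERIFY(nvlist_lookup_uint64(config,
 *		    ZPOOL_CONFIG_POOL_STATE, &state) == 0);
 *		nvlist_free(config);
 *	}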
2741 */ 2742 if (spa->spa_root_vdev != NULL) { 2743 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2744 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2745 poolname) == 0); 2746 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2747 state) == 0); 2748 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2749 spa->spa_uberblock.ub_timestamp) == 0); 2750 2751 /* 2752 * If the bootfs property exists on this pool then we 2753 * copy it out so that external consumers can tell which 2754 * pools are bootable. 2755 */ 2756 if ((!error || error == EEXIST) && spa->spa_bootfs) { 2757 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2758 2759 /* 2760 * We have to play games with the name since the 2761 * pool was opened as TRYIMPORT_NAME. 2762 */ 2763 if (dsl_dsobj_to_dsname(spa_name(spa), 2764 spa->spa_bootfs, tmpname) == 0) { 2765 char *cp; 2766 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2767 2768 cp = strchr(tmpname, '/'); 2769 if (cp == NULL) { 2770 (void) strlcpy(dsname, tmpname, 2771 MAXPATHLEN); 2772 } else { 2773 (void) snprintf(dsname, MAXPATHLEN, 2774 "%s/%s", poolname, ++cp); 2775 } 2776 VERIFY(nvlist_add_string(config, 2777 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2778 kmem_free(dsname, MAXPATHLEN); 2779 } 2780 kmem_free(tmpname, MAXPATHLEN); 2781 } 2782 2783 /* 2784 * Add the list of hot spares and level 2 cache devices. 2785 */ 2786 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2787 spa_add_spares(spa, config); 2788 spa_add_l2cache(spa, config); 2789 spa_config_exit(spa, SCL_CONFIG, FTAG); 2790 } 2791 2792 spa_unload(spa); 2793 spa_deactivate(spa); 2794 spa_remove(spa); 2795 mutex_exit(&spa_namespace_lock); 2796 2797 return (config); 2798 } 2799 2800 /* 2801 * Pool export/destroy 2802 * 2803 * The act of destroying or exporting a pool is very simple. We make sure there 2804 * is no more pending I/O and any references to the pool are gone. Then, we 2805 * update the pool state and sync all the labels to disk, removing the 2806 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2807 * we don't sync the labels or remove the configuration cache. 2808 */ 2809 static int 2810 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2811 boolean_t force, boolean_t hardforce) 2812 { 2813 spa_t *spa; 2814 2815 if (oldconfig) 2816 *oldconfig = NULL; 2817 2818 if (!(spa_mode_global & FWRITE)) 2819 return (EROFS); 2820 2821 mutex_enter(&spa_namespace_lock); 2822 if ((spa = spa_lookup(pool)) == NULL) { 2823 mutex_exit(&spa_namespace_lock); 2824 return (ENOENT); 2825 } 2826 2827 /* 2828 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2829 * reacquire the namespace lock, and see if we can export. 2830 */ 2831 spa_open_ref(spa, FTAG); 2832 mutex_exit(&spa_namespace_lock); 2833 spa_async_suspend(spa); 2834 mutex_enter(&spa_namespace_lock); 2835 spa_close(spa, FTAG); 2836 2837 /* 2838 * The pool will be in core if it's openable, 2839 * in which case we can modify its state. 2840 */ 2841 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2842 /* 2843 * Objsets may be open only because they're dirty, so we 2844 * have to force it to sync before checking spa_refcnt. 2845 */ 2846 txg_wait_synced(spa->spa_dsl_pool, 0); 2847 2848 /* 2849 * A pool cannot be exported or destroyed if there are active 2850 * references. If we are resetting a pool, allow references by 2851 * fault injection handlers. 
2852 */ 2853 if (!spa_refcount_zero(spa) || 2854 (spa->spa_inject_ref != 0 && 2855 new_state != POOL_STATE_UNINITIALIZED)) { 2856 spa_async_resume(spa); 2857 mutex_exit(&spa_namespace_lock); 2858 return (EBUSY); 2859 } 2860 2861 /* 2862 * A pool cannot be exported if it has an active shared spare. 2863 * This is to prevent other pools from stealing the active spare 2864 * from an exported pool. At the user's discretion, such a pool 2865 * can still be forcibly exported. 2866 */ 2867 if (!force && new_state == POOL_STATE_EXPORTED && 2868 spa_has_active_shared_spare(spa)) { 2869 spa_async_resume(spa); 2870 mutex_exit(&spa_namespace_lock); 2871 return (EXDEV); 2872 } 2873 2874 /* 2875 * We want this to be reflected on every label, 2876 * so mark them all dirty. spa_unload() will do the 2877 * final sync that pushes these changes out. 2878 */ 2879 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 2880 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2881 spa->spa_state = new_state; 2882 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2883 vdev_config_dirty(spa->spa_root_vdev); 2884 spa_config_exit(spa, SCL_ALL, FTAG); 2885 } 2886 } 2887 2888 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2889 2890 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2891 spa_unload(spa); 2892 spa_deactivate(spa); 2893 } 2894 2895 if (oldconfig && spa->spa_config) 2896 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2897 2898 if (new_state != POOL_STATE_UNINITIALIZED) { 2899 if (!hardforce) 2900 spa_config_sync(spa, B_TRUE, B_TRUE); 2901 spa_remove(spa); 2902 } 2903 mutex_exit(&spa_namespace_lock); 2904 2905 return (0); 2906 } 2907 2908 /* 2909 * Destroy a storage pool. 2910 */ 2911 int 2912 spa_destroy(char *pool) 2913 { 2914 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 2915 B_FALSE, B_FALSE)); 2916 } 2917 2918 /* 2919 * Export a storage pool. 2920 */ 2921 int 2922 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 2923 boolean_t hardforce) 2924 { 2925 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 2926 force, hardforce)); 2927 } 2928 2929 /* 2930 * Similar to spa_export(), this unloads the spa_t without actually removing it 2931 * from the namespace in any way. 2932 */ 2933 int 2934 spa_reset(char *pool) 2935 { 2936 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2937 B_FALSE, B_FALSE)); 2938 } 2939 2940 /* 2941 * ========================================================================== 2942 * Device manipulation 2943 * ========================================================================== 2944 */ 2945 2946 /* 2947 * Add a device to a storage pool.
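 *
 * A rough sketch of a caller adding a single new top-level disk vdev
 * (ZPOOL_CONFIG_PATH and the device path are assumptions made for
 * illustration; in practice the nvroot handed in carries more pairs
 * than shown here):
 *
 *	nvlist_t *nvroot, *child;
 *	VERIFY(nvlist_alloc(&child, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(child, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(child, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c3t0d0s0") == 0);
 *	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 *	    &child, 1) == 0);
 *	error = spa_vdev_add(spa, nvroot);
 *	nvlist_free(child);
 *	nvlist_free(nvroot);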
2948 */ 2949 int 2950 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2951 { 2952 uint64_t txg, id; 2953 int error; 2954 vdev_t *rvd = spa->spa_root_vdev; 2955 vdev_t *vd, *tvd; 2956 nvlist_t **spares, **l2cache; 2957 uint_t nspares, nl2cache; 2958 2959 txg = spa_vdev_enter(spa); 2960 2961 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2962 VDEV_ALLOC_ADD)) != 0) 2963 return (spa_vdev_exit(spa, NULL, txg, error)); 2964 2965 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2966 2967 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2968 &nspares) != 0) 2969 nspares = 0; 2970 2971 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2972 &nl2cache) != 0) 2973 nl2cache = 0; 2974 2975 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2976 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2977 2978 if (vd->vdev_children != 0 && 2979 (error = vdev_create(vd, txg, B_FALSE)) != 0) 2980 return (spa_vdev_exit(spa, vd, txg, error)); 2981 2982 /* 2983 * We must validate the spares and l2cache devices after checking the 2984 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2985 */ 2986 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2987 return (spa_vdev_exit(spa, vd, txg, error)); 2988 2989 /* 2990 * Transfer each new top-level vdev from vd to rvd. 2991 */ 2992 for (int c = 0; c < vd->vdev_children; c++) { 2993 2994 /* 2995 * Set the vdev id to the first hole, if one exists. 2996 */ 2997 for (id = 0; id < rvd->vdev_children; id++) { 2998 if (rvd->vdev_child[id]->vdev_ishole) { 2999 vdev_free(rvd->vdev_child[id]); 3000 break; 3001 } 3002 } 3003 tvd = vd->vdev_child[c]; 3004 vdev_remove_child(vd, tvd); 3005 tvd->vdev_id = id; 3006 vdev_add_child(rvd, tvd); 3007 vdev_config_dirty(tvd); 3008 } 3009 3010 if (nspares != 0) { 3011 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3012 ZPOOL_CONFIG_SPARES); 3013 spa_load_spares(spa); 3014 spa->spa_spares.sav_sync = B_TRUE; 3015 } 3016 3017 if (nl2cache != 0) { 3018 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3019 ZPOOL_CONFIG_L2CACHE); 3020 spa_load_l2cache(spa); 3021 spa->spa_l2cache.sav_sync = B_TRUE; 3022 } 3023 3024 /* 3025 * We have to be careful when adding new vdevs to an existing pool. 3026 * If other threads start allocating from these vdevs before we 3027 * sync the config cache, and we lose power, then upon reboot we may 3028 * fail to open the pool because there are DVAs that the config cache 3029 * can't translate. Therefore, we first add the vdevs without 3030 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3031 * and then let spa_config_update() initialize the new metaslabs. 3032 * 3033 * spa_load() checks for added-but-not-initialized vdevs, so that 3034 * if we lose power at any point in this sequence, the remaining 3035 * steps will be completed the next time we load the pool. 3036 */ 3037 (void) spa_vdev_exit(spa, vd, txg, 0); 3038 3039 mutex_enter(&spa_namespace_lock); 3040 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3041 mutex_exit(&spa_namespace_lock); 3042 3043 return (0); 3044 } 3045 3046 /* 3047 * Attach a device to a mirror. The arguments are the path to any device 3048 * in the mirror, and the nvroot for the new device. If the path specifies 3049 * a device that is not mirrored, we automatically insert the mirror vdev. 
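 *
 * (Illustrative sketch only: internally the existing device is named by
 * its guid, so a caller might do
 *
 *	error = spa_vdev_attach(spa, oldvd_guid, nvroot, B_FALSE);
 *
 * where nvroot holds the single new leaf, built much like the examples
 * above, and oldvd_guid is assumed to have been resolved from the
 * user-supplied path.)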
3050 * 3051 * If 'replacing' is specified, the new device is intended to replace the 3052 * existing device; in this case the two devices are made into their own 3053 * mirror using the 'replacing' vdev, which is functionally identical to 3054 * the mirror vdev (it actually reuses all the same ops) but has a few 3055 * extra rules: you can't attach to it after it's been created, and upon 3056 * completion of resilvering, the first disk (the one being replaced) 3057 * is automatically detached. 3058 */ 3059 int 3060 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3061 { 3062 uint64_t txg, open_txg; 3063 vdev_t *rvd = spa->spa_root_vdev; 3064 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3065 vdev_ops_t *pvops; 3066 char *oldvdpath, *newvdpath; 3067 int newvd_isspare; 3068 int error; 3069 3070 txg = spa_vdev_enter(spa); 3071 3072 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3073 3074 if (oldvd == NULL) 3075 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3076 3077 if (!oldvd->vdev_ops->vdev_op_leaf) 3078 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3079 3080 pvd = oldvd->vdev_parent; 3081 3082 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3083 VDEV_ALLOC_ADD)) != 0) 3084 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3085 3086 if (newrootvd->vdev_children != 1) 3087 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3088 3089 newvd = newrootvd->vdev_child[0]; 3090 3091 if (!newvd->vdev_ops->vdev_op_leaf) 3092 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3093 3094 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3095 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3096 3097 /* 3098 * Spares can't replace logs 3099 */ 3100 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3101 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3102 3103 if (!replacing) { 3104 /* 3105 * For attach, the only allowable parent is a mirror or the root 3106 * vdev. 3107 */ 3108 if (pvd->vdev_ops != &vdev_mirror_ops && 3109 pvd->vdev_ops != &vdev_root_ops) 3110 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3111 3112 pvops = &vdev_mirror_ops; 3113 } else { 3114 /* 3115 * Active hot spares can only be replaced by inactive hot 3116 * spares. 3117 */ 3118 if (pvd->vdev_ops == &vdev_spare_ops && 3119 pvd->vdev_child[1] == oldvd && 3120 !spa_has_spare(spa, newvd->vdev_guid)) 3121 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3122 3123 /* 3124 * If the source is a hot spare, and the parent isn't already a 3125 * spare, then we want to create a new hot spare. Otherwise, we 3126 * want to create a replacing vdev. The user is not allowed to 3127 * attach to a spared vdev child unless the 'isspare' state is 3128 * the same (spare replaces spare, non-spare replaces 3129 * non-spare). 3130 */ 3131 if (pvd->vdev_ops == &vdev_replacing_ops) 3132 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3133 else if (pvd->vdev_ops == &vdev_spare_ops && 3134 newvd->vdev_isspare != oldvd->vdev_isspare) 3135 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3136 else if (pvd->vdev_ops != &vdev_spare_ops && 3137 newvd->vdev_isspare) 3138 pvops = &vdev_spare_ops; 3139 else 3140 pvops = &vdev_replacing_ops; 3141 } 3142 3143 /* 3144 * Make sure the new device is big enough. 3145 */ 3146 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3147 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3148 3149 /* 3150 * The new device cannot have a higher alignment requirement 3151 * than the top-level vdev. 
3152 */ 3153 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3154 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3155 3156 /* 3157 * If this is an in-place replacement, update oldvd's path and devid 3158 * to make it distinguishable from newvd, and unopenable from now on. 3159 */ 3160 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3161 spa_strfree(oldvd->vdev_path); 3162 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3163 KM_SLEEP); 3164 (void) sprintf(oldvd->vdev_path, "%s/%s", 3165 newvd->vdev_path, "old"); 3166 if (oldvd->vdev_devid != NULL) { 3167 spa_strfree(oldvd->vdev_devid); 3168 oldvd->vdev_devid = NULL; 3169 } 3170 } 3171 3172 /* 3173 * If the parent is not a mirror, or if we're replacing, insert the new 3174 * mirror/replacing/spare vdev above oldvd. 3175 */ 3176 if (pvd->vdev_ops != pvops) 3177 pvd = vdev_add_parent(oldvd, pvops); 3178 3179 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3180 ASSERT(pvd->vdev_ops == pvops); 3181 ASSERT(oldvd->vdev_parent == pvd); 3182 3183 /* 3184 * Extract the new device from its root and add it to pvd. 3185 */ 3186 vdev_remove_child(newrootvd, newvd); 3187 newvd->vdev_id = pvd->vdev_children; 3188 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3189 vdev_add_child(pvd, newvd); 3190 3191 tvd = newvd->vdev_top; 3192 ASSERT(pvd->vdev_top == tvd); 3193 ASSERT(tvd->vdev_parent == rvd); 3194 3195 vdev_config_dirty(tvd); 3196 3197 /* 3198 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3199 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3200 */ 3201 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3202 3203 vdev_dtl_dirty(newvd, DTL_MISSING, 3204 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3205 3206 if (newvd->vdev_isspare) { 3207 spa_spare_activate(newvd); 3208 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3209 } 3210 3211 oldvdpath = spa_strdup(oldvd->vdev_path); 3212 newvdpath = spa_strdup(newvd->vdev_path); 3213 newvd_isspare = newvd->vdev_isspare; 3214 3215 /* 3216 * Mark newvd's DTL dirty in this txg. 3217 */ 3218 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3219 3220 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3221 3222 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3223 CRED(), "%s vdev=%s %s vdev=%s", 3224 replacing && newvd_isspare ? "spare in" : 3225 replacing ? "replace" : "attach", newvdpath, 3226 replacing ? "for" : "to", oldvdpath); 3227 3228 spa_strfree(oldvdpath); 3229 spa_strfree(newvdpath); 3230 3231 /* 3232 * Kick off a resilver to update newvd. 3233 */ 3234 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3235 3236 return (0); 3237 } 3238 3239 /* 3240 * Detach a device from a mirror or replacing vdev. 3241 * If 'replace_done' is specified, only detach if the parent 3242 * is a replacing vdev. 3243 */ 3244 int 3245 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3246 { 3247 uint64_t txg; 3248 int error; 3249 vdev_t *rvd = spa->spa_root_vdev; 3250 vdev_t *vd, *pvd, *cvd, *tvd; 3251 boolean_t unspare = B_FALSE; 3252 uint64_t unspare_guid; 3253 size_t len; 3254 3255 txg = spa_vdev_enter(spa); 3256 3257 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3258 3259 if (vd == NULL) 3260 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3261 3262 if (!vd->vdev_ops->vdev_op_leaf) 3263 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3264 3265 pvd = vd->vdev_parent; 3266 3267 /* 3268 * If the parent/child relationship is not as expected, don't do it. 3269 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3270 * vdev that's replacing B with C. 
The user's intent in replacing 3271 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3272 * the replace by detaching C, the expected behavior is to end up 3273 * M(A,B). But suppose that right after deciding to detach C, 3274 * the replacement of B completes. We would have M(A,C), and then 3275 * ask to detach C, which would leave us with just A -- not what 3276 * the user wanted. To prevent this, we make sure that the 3277 * parent/child relationship hasn't changed -- in this example, 3278 * that C's parent is still the replacing vdev R. 3279 */ 3280 if (pvd->vdev_guid != pguid && pguid != 0) 3281 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3282 3283 /* 3284 * If replace_done is specified, only remove this device if it's 3285 * the first child of a replacing vdev. For the 'spare' vdev, either 3286 * disk can be removed. 3287 */ 3288 if (replace_done) { 3289 if (pvd->vdev_ops == &vdev_replacing_ops) { 3290 if (vd->vdev_id != 0) 3291 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3292 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3293 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3294 } 3295 } 3296 3297 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3298 spa_version(spa) >= SPA_VERSION_SPARES); 3299 3300 /* 3301 * Only mirror, replacing, and spare vdevs support detach. 3302 */ 3303 if (pvd->vdev_ops != &vdev_replacing_ops && 3304 pvd->vdev_ops != &vdev_mirror_ops && 3305 pvd->vdev_ops != &vdev_spare_ops) 3306 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3307 3308 /* 3309 * If this device has the only valid copy of some data, 3310 * we cannot safely detach it. 3311 */ 3312 if (vdev_dtl_required(vd)) 3313 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3314 3315 ASSERT(pvd->vdev_children >= 2); 3316 3317 /* 3318 * If we are detaching the second disk from a replacing vdev, then 3319 * check to see if we changed the original vdev's path to have "/old" 3320 * at the end in spa_vdev_attach(). If so, undo that change now. 3321 */ 3322 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3323 pvd->vdev_child[0]->vdev_path != NULL && 3324 pvd->vdev_child[1]->vdev_path != NULL) { 3325 ASSERT(pvd->vdev_child[1] == vd); 3326 cvd = pvd->vdev_child[0]; 3327 len = strlen(vd->vdev_path); 3328 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3329 strcmp(cvd->vdev_path + len, "/old") == 0) { 3330 spa_strfree(cvd->vdev_path); 3331 cvd->vdev_path = spa_strdup(vd->vdev_path); 3332 } 3333 } 3334 3335 /* 3336 * If we are detaching the original disk from a spare, then it implies 3337 * that the spare should become a real disk, and be removed from the 3338 * active spare list for the pool. 3339 */ 3340 if (pvd->vdev_ops == &vdev_spare_ops && 3341 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3342 unspare = B_TRUE; 3343 3344 /* 3345 * Erase the disk labels so the disk can be used for other things. 3346 * This must be done after all other error cases are handled, 3347 * but before we disembowel vd (so we can still do I/O to it). 3348 * But if we can't do it, don't treat the error as fatal -- 3349 * it may be that the unwritability of the disk is the reason 3350 * it's being detached! 3351 */ 3352 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3353 3354 /* 3355 * Remove vd from its parent and compact the parent's children. 3356 */ 3357 vdev_remove_child(pvd, vd); 3358 vdev_compact_children(pvd); 3359 3360 /* 3361 * Remember one of the remaining children so we can get tvd below. 
3362 */ 3363 cvd = pvd->vdev_child[0]; 3364 3365 /* 3366 * If we need to remove the remaining child from the list of hot spares, 3367 * do it now, marking the vdev as no longer a spare in the process. 3368 * We must do this before vdev_remove_parent(), because that can 3369 * change the GUID if it creates a new toplevel GUID. For a similar 3370 * reason, we must remove the spare now, in the same txg as the detach; 3371 * otherwise someone could attach a new sibling, change the GUID, and 3372 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3373 */ 3374 if (unspare) { 3375 ASSERT(cvd->vdev_isspare); 3376 spa_spare_remove(cvd); 3377 unspare_guid = cvd->vdev_guid; 3378 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3379 } 3380 3381 /* 3382 * If the parent mirror/replacing vdev only has one child, 3383 * the parent is no longer needed. Remove it from the tree. 3384 */ 3385 if (pvd->vdev_children == 1) 3386 vdev_remove_parent(cvd); 3387 3388 /* 3389 * We don't set tvd until now because the parent we just removed 3390 * may have been the previous top-level vdev. 3391 */ 3392 tvd = cvd->vdev_top; 3393 ASSERT(tvd->vdev_parent == rvd); 3394 3395 /* 3396 * Reevaluate the parent vdev state. 3397 */ 3398 vdev_propagate_state(cvd); 3399 3400 /* 3401 * If the 'autoexpand' property is set on the pool then automatically 3402 * try to expand the size of the pool. For example if the device we 3403 * just detached was smaller than the others, it may be possible to 3404 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3405 * first so that we can obtain the updated sizes of the leaf vdevs. 3406 */ 3407 if (spa->spa_autoexpand) { 3408 vdev_reopen(tvd); 3409 vdev_expand(tvd, txg); 3410 } 3411 3412 vdev_config_dirty(tvd); 3413 3414 /* 3415 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3416 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3417 * But first make sure we're not on any *other* txg's DTL list, to 3418 * prevent vd from being accessed after it's freed. 3419 */ 3420 for (int t = 0; t < TXG_SIZE; t++) 3421 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3422 vd->vdev_detached = B_TRUE; 3423 vdev_dirty(tvd, VDD_DTL, vd, txg); 3424 3425 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3426 3427 error = spa_vdev_exit(spa, vd, txg, 0); 3428 3429 /* 3430 * If this was the removal of the original device in a hot spare vdev, 3431 * then we want to go through and remove the device from the hot spare 3432 * list of every other pool. 
3433 */ 3434 if (unspare) { 3435 spa_t *myspa = spa; 3436 spa = NULL; 3437 mutex_enter(&spa_namespace_lock); 3438 while ((spa = spa_next(spa)) != NULL) { 3439 if (spa->spa_state != POOL_STATE_ACTIVE) 3440 continue; 3441 if (spa == myspa) 3442 continue; 3443 spa_open_ref(spa, FTAG); 3444 mutex_exit(&spa_namespace_lock); 3445 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3446 mutex_enter(&spa_namespace_lock); 3447 spa_close(spa, FTAG); 3448 } 3449 mutex_exit(&spa_namespace_lock); 3450 } 3451 3452 return (error); 3453 } 3454 3455 static nvlist_t * 3456 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3457 { 3458 for (int i = 0; i < count; i++) { 3459 uint64_t guid; 3460 3461 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3462 &guid) == 0); 3463 3464 if (guid == target_guid) 3465 return (nvpp[i]); 3466 } 3467 3468 return (NULL); 3469 } 3470 3471 static void 3472 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3473 nvlist_t *dev_to_remove) 3474 { 3475 nvlist_t **newdev = NULL; 3476 3477 if (count > 1) 3478 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3479 3480 for (int i = 0, j = 0; i < count; i++) { 3481 if (dev[i] == dev_to_remove) 3482 continue; 3483 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3484 } 3485 3486 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3487 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3488 3489 for (int i = 0; i < count - 1; i++) 3490 nvlist_free(newdev[i]); 3491 3492 if (count > 1) 3493 kmem_free(newdev, (count - 1) * sizeof (void *)); 3494 } 3495 3496 /* 3497 * Removing a device from the vdev namespace requires several steps 3498 * and can take a significant amount of time. As a result we use 3499 * the spa_vdev_config_[enter/exit] functions which allow us to 3500 * grab and release the spa_config_lock while still holding the namespace 3501 * lock. During each step the configuration is synced out. 3502 */ 3503 3504 /* 3505 * Initial phase of device removal - stop future allocations from this device. 3506 */ 3507 void 3508 spa_vdev_remove_start(spa_t *spa, vdev_t *vd) 3509 { 3510 metaslab_group_t *mg = vd->vdev_mg; 3511 3512 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3513 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3514 3515 /* 3516 * Remove our vdev from the allocatable vdevs 3517 */ 3518 if (mg) 3519 metaslab_class_remove(mg->mg_class, mg); 3520 } 3521 3522 /* 3523 * Evacuate the device. 3524 */ 3525 int 3526 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 3527 { 3528 uint64_t txg; 3529 int error; 3530 3531 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3532 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3533 3534 /* 3535 * Evacuate the device. We don't hold the config lock as writer 3536 * since we need to do I/O but we do keep the 3537 * spa_namespace_lock held. Once this completes the device 3538 * should no longer have any blocks allocated on it. 3539 */ 3540 if (vd->vdev_islog) { 3541 /* 3542 * Evacuate the device. 3543 */ 3544 if (error = dmu_objset_find(spa_name(spa), 3545 zil_vdev_offline, NULL, DS_FIND_CHILDREN)) { 3546 uint64_t txg; 3547 3548 txg = spa_vdev_config_enter(spa); 3549 metaslab_class_add(spa->spa_log_class, 3550 vd->vdev_mg); 3551 return (spa_vdev_exit(spa, NULL, txg, error)); 3552 } 3553 txg_wait_synced(spa_get_dsl(spa), 0); 3554 } 3555 3556 /* 3557 * Remove any remaining MOS metadata associated with the device. 
3558 */ 3559 txg = spa_vdev_config_enter(spa); 3560 vd->vdev_removing = B_TRUE; 3561 vdev_dirty(vd, 0, NULL, txg); 3562 vdev_config_dirty(vd); 3563 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 3564 3565 return (0); 3566 } 3567 3568 /* 3569 * Complete the removal by cleaning up the namespace. 3570 */ 3571 void 3572 spa_vdev_remove_done(spa_t *spa, vdev_t *vd) 3573 { 3574 vdev_t *rvd = spa->spa_root_vdev; 3575 metaslab_group_t *mg = vd->vdev_mg; 3576 uint64_t id = vd->vdev_id; 3577 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 3578 3579 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3580 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3581 3582 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3583 vdev_free(vd); 3584 3585 /* 3586 * It's possible that another thread is trying to do a spa_vdev_add() 3587 * at the same time we're trying to remove it. As a result the 3588 * added vdev may not have initialized its metaslabs yet. 3589 */ 3590 if (mg != NULL) 3591 metaslab_group_destroy(mg); 3592 3593 if (last_vdev) { 3594 vdev_compact_children(rvd); 3595 } else { 3596 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 3597 vdev_add_child(rvd, vd); 3598 } 3599 vdev_config_dirty(rvd); 3600 3601 /* 3602 * Reassess the health of our root vdev. 3603 */ 3604 vdev_reopen(rvd); 3605 } 3606 3607 /* 3608 * Remove a device from the pool. Currently, this supports removing only hot 3609 * spares, slogs, and level 2 ARC devices. 3610 */ 3611 int 3612 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3613 { 3614 vdev_t *vd; 3615 nvlist_t **spares, **l2cache, *nv; 3616 uint64_t txg = 0; 3617 uint_t nspares, nl2cache; 3618 int error = 0; 3619 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3620 3621 if (!locked) 3622 txg = spa_vdev_enter(spa); 3623 3624 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3625 3626 if (spa->spa_spares.sav_vdevs != NULL && 3627 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3628 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3629 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3630 /* 3631 * Only remove the hot spare if it's not currently in use 3632 * in this pool. 3633 */ 3634 if (vd == NULL || unspare) { 3635 spa_vdev_remove_aux(spa->spa_spares.sav_config, 3636 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3637 spa_load_spares(spa); 3638 spa->spa_spares.sav_sync = B_TRUE; 3639 } else { 3640 error = EBUSY; 3641 } 3642 } else if (spa->spa_l2cache.sav_vdevs != NULL && 3643 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3644 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3645 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3646 /* 3647 * Cache devices can always be removed. 3648 */ 3649 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3650 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3651 spa_load_l2cache(spa); 3652 spa->spa_l2cache.sav_sync = B_TRUE; 3653 } else if (vd != NULL && vd->vdev_islog) { 3654 ASSERT(!locked); 3655 3656 /* 3657 * XXX - Once we have bp-rewrite this should 3658 * become the common case. 3659 */ 3660 3661 /* 3662 * 1. Stop allocations 3663 * 2. Evacuate the device (i.e. kill off stubby and 3664 * metadata) and wait for it to complete (i.e. sync). 3665 * 3. Cleanup the vdev namespace.
3666 */ 3667 spa_vdev_remove_start(spa, vd); 3668 3669 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 3670 if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0) 3671 return (error); 3672 txg = spa_vdev_config_enter(spa); 3673 3674 spa_vdev_remove_done(spa, vd); 3675 3676 } else if (vd != NULL) { 3677 /* 3678 * Normal vdevs cannot be removed (yet). 3679 */ 3680 error = ENOTSUP; 3681 } else { 3682 /* 3683 * There is no vdev of any kind with the specified guid. 3684 */ 3685 error = ENOENT; 3686 } 3687 3688 if (!locked) 3689 return (spa_vdev_exit(spa, NULL, txg, error)); 3690 3691 return (error); 3692 } 3693 3694 /* 3695 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3696 * current spared, so we can detach it. 3697 */ 3698 static vdev_t * 3699 spa_vdev_resilver_done_hunt(vdev_t *vd) 3700 { 3701 vdev_t *newvd, *oldvd; 3702 3703 for (int c = 0; c < vd->vdev_children; c++) { 3704 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3705 if (oldvd != NULL) 3706 return (oldvd); 3707 } 3708 3709 /* 3710 * Check for a completed replacement. 3711 */ 3712 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3713 oldvd = vd->vdev_child[0]; 3714 newvd = vd->vdev_child[1]; 3715 3716 if (vdev_dtl_empty(newvd, DTL_MISSING) && 3717 !vdev_dtl_required(oldvd)) 3718 return (oldvd); 3719 } 3720 3721 /* 3722 * Check for a completed resilver with the 'unspare' flag set. 3723 */ 3724 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3725 newvd = vd->vdev_child[0]; 3726 oldvd = vd->vdev_child[1]; 3727 3728 if (newvd->vdev_unspare && 3729 vdev_dtl_empty(newvd, DTL_MISSING) && 3730 !vdev_dtl_required(oldvd)) { 3731 newvd->vdev_unspare = 0; 3732 return (oldvd); 3733 } 3734 } 3735 3736 return (NULL); 3737 } 3738 3739 static void 3740 spa_vdev_resilver_done(spa_t *spa) 3741 { 3742 vdev_t *vd, *pvd, *ppvd; 3743 uint64_t guid, sguid, pguid, ppguid; 3744 3745 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3746 3747 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3748 pvd = vd->vdev_parent; 3749 ppvd = pvd->vdev_parent; 3750 guid = vd->vdev_guid; 3751 pguid = pvd->vdev_guid; 3752 ppguid = ppvd->vdev_guid; 3753 sguid = 0; 3754 /* 3755 * If we have just finished replacing a hot spared device, then 3756 * we need to detach the parent's first child (the original hot 3757 * spare) as well. 3758 */ 3759 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 3760 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3761 ASSERT(ppvd->vdev_children == 2); 3762 sguid = ppvd->vdev_child[1]->vdev_guid; 3763 } 3764 spa_config_exit(spa, SCL_ALL, FTAG); 3765 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 3766 return; 3767 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 3768 return; 3769 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3770 } 3771 3772 spa_config_exit(spa, SCL_ALL, FTAG); 3773 } 3774 3775 /* 3776 * Update the stored path or FRU for this vdev. Dirty the vdev configuration, 3777 * relying on spa_vdev_enter/exit() to synchronize the labels and cache. 
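 *
 * For example (the path shown is illustrative), a caller that notices a
 * device has moved could record the new location with
 *
 *	error = spa_vdev_setpath(spa, guid, "/dev/dsk/c4t0d0s0");
 *
 * and a new FRU string can be recorded the same way via spa_vdev_setfru().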
3778 */ 3779 int 3780 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 3781 boolean_t ispath) 3782 { 3783 vdev_t *vd; 3784 uint64_t txg; 3785 3786 txg = spa_vdev_enter(spa); 3787 3788 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3789 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3790 3791 if (!vd->vdev_ops->vdev_op_leaf) 3792 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3793 3794 if (ispath) { 3795 spa_strfree(vd->vdev_path); 3796 vd->vdev_path = spa_strdup(value); 3797 } else { 3798 if (vd->vdev_fru != NULL) 3799 spa_strfree(vd->vdev_fru); 3800 vd->vdev_fru = spa_strdup(value); 3801 } 3802 3803 vdev_config_dirty(vd->vdev_top); 3804 3805 return (spa_vdev_exit(spa, NULL, txg, 0)); 3806 } 3807 3808 int 3809 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3810 { 3811 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 3812 } 3813 3814 int 3815 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 3816 { 3817 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 3818 } 3819 3820 /* 3821 * ========================================================================== 3822 * SPA Scrubbing 3823 * ========================================================================== 3824 */ 3825 3826 int 3827 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3828 { 3829 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3830 3831 if ((uint_t)type >= POOL_SCRUB_TYPES) 3832 return (ENOTSUP); 3833 3834 /* 3835 * If a resilver was requested, but there is no DTL on a 3836 * writeable leaf device, we have nothing to do. 3837 */ 3838 if (type == POOL_SCRUB_RESILVER && 3839 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3840 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3841 return (0); 3842 } 3843 3844 if (type == POOL_SCRUB_EVERYTHING && 3845 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3846 spa->spa_dsl_pool->dp_scrub_isresilver) 3847 return (EBUSY); 3848 3849 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3850 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3851 } else if (type == POOL_SCRUB_NONE) { 3852 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3853 } else { 3854 return (EINVAL); 3855 } 3856 } 3857 3858 /* 3859 * ========================================================================== 3860 * SPA async task processing 3861 * ========================================================================== 3862 */ 3863 3864 static void 3865 spa_async_remove(spa_t *spa, vdev_t *vd) 3866 { 3867 if (vd->vdev_remove_wanted) { 3868 vd->vdev_remove_wanted = 0; 3869 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3870 3871 /* 3872 * We want to clear the stats, but we don't want to do a full 3873 * vdev_clear() as that will cause us to throw away 3874 * degraded/faulted state as well as attempt to reopen the 3875 * device, all of which is a waste. 
3876 */ 3877 vd->vdev_stat.vs_read_errors = 0; 3878 vd->vdev_stat.vs_write_errors = 0; 3879 vd->vdev_stat.vs_checksum_errors = 0; 3880 3881 vdev_state_dirty(vd->vdev_top); 3882 } 3883 3884 for (int c = 0; c < vd->vdev_children; c++) 3885 spa_async_remove(spa, vd->vdev_child[c]); 3886 } 3887 3888 static void 3889 spa_async_probe(spa_t *spa, vdev_t *vd) 3890 { 3891 if (vd->vdev_probe_wanted) { 3892 vd->vdev_probe_wanted = 0; 3893 vdev_reopen(vd); /* vdev_open() does the actual probe */ 3894 } 3895 3896 for (int c = 0; c < vd->vdev_children; c++) 3897 spa_async_probe(spa, vd->vdev_child[c]); 3898 } 3899 3900 static void 3901 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 3902 { 3903 sysevent_id_t eid; 3904 nvlist_t *attr; 3905 char *physpath; 3906 3907 if (!spa->spa_autoexpand) 3908 return; 3909 3910 for (int c = 0; c < vd->vdev_children; c++) { 3911 vdev_t *cvd = vd->vdev_child[c]; 3912 spa_async_autoexpand(spa, cvd); 3913 } 3914 3915 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 3916 return; 3917 3918 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 3919 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 3920 3921 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3922 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 3923 3924 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 3925 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 3926 3927 nvlist_free(attr); 3928 kmem_free(physpath, MAXPATHLEN); 3929 } 3930 3931 static void 3932 spa_async_thread(spa_t *spa) 3933 { 3934 int tasks; 3935 3936 ASSERT(spa->spa_sync_on); 3937 3938 mutex_enter(&spa->spa_async_lock); 3939 tasks = spa->spa_async_tasks; 3940 spa->spa_async_tasks = 0; 3941 mutex_exit(&spa->spa_async_lock); 3942 3943 /* 3944 * See if the config needs to be updated. 3945 */ 3946 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3947 uint64_t oldsz, space_update; 3948 3949 mutex_enter(&spa_namespace_lock); 3950 oldsz = spa_get_space(spa); 3951 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3952 space_update = spa_get_space(spa) - oldsz; 3953 mutex_exit(&spa_namespace_lock); 3954 3955 /* 3956 * If the pool grew as a result of the config update, 3957 * then log an internal history event. 3958 */ 3959 if (space_update) { 3960 spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 3961 spa, NULL, CRED(), 3962 "pool '%s' size: %llu(+%llu)", 3963 spa_name(spa), spa_get_space(spa), 3964 space_update); 3965 } 3966 } 3967 3968 /* 3969 * See if any devices need to be marked REMOVED. 3970 */ 3971 if (tasks & SPA_ASYNC_REMOVE) { 3972 spa_vdev_state_enter(spa, SCL_NONE); 3973 spa_async_remove(spa, spa->spa_root_vdev); 3974 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3975 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3976 for (int i = 0; i < spa->spa_spares.sav_count; i++) 3977 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3978 (void) spa_vdev_state_exit(spa, NULL, 0); 3979 } 3980 3981 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 3982 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3983 spa_async_autoexpand(spa, spa->spa_root_vdev); 3984 spa_config_exit(spa, SCL_CONFIG, FTAG); 3985 } 3986 3987 /* 3988 * See if any devices need to be probed. 3989 */ 3990 if (tasks & SPA_ASYNC_PROBE) { 3991 spa_vdev_state_enter(spa, SCL_NONE); 3992 spa_async_probe(spa, spa->spa_root_vdev); 3993 (void) spa_vdev_state_exit(spa, NULL, 0); 3994 } 3995 3996 /* 3997 * If any devices are done replacing, detach them. 
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while (bplist_iterate(bpl, &itor, &blk) == 0) {
		ASSERT(blk.blk_birth < txg);
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED));
	}

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}
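
/*
 * Added note (illustrative, not part of the original source):
 * spa_sync_nvlist() below rounds the packed nvlist up to whole
 * SPA_CONFIG_BLOCKSIZE blocks before writing.  Assuming the 16K (1 << 14)
 * block size used for packed-nvlist objects elsewhere in this file, a
 * 9,000-byte packed config becomes a single 16K write:
 *
 *	bufsize = P2ROUNDUP(9000, 1 << 14);	(assumed result: 16384)
 *
 * so the dbuf never has to be read back just to fill in the tail of a
 * partially written block.
 */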

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dbuf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = kmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	bzero(packed + nvsize, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	kmem_free(packed, bufsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_config_dirty_list))
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	spa_config_exit(spa, SCL_STATE, FTAG);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}
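
/*
 * Added note (illustrative, not part of the original source): the config
 * generated above is staged in spa_config_syncing rather than being
 * published immediately.  spa_sync() only hands it to spa_config_set()
 * after the uberblock for this txg has been committed, so the in-core
 * config cache never gets ahead of what is actually on disk.
 */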

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	mutex_enter(&spa->spa_props_lock);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import).  spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property.  It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'cachefile' is also a non-persistent property.
			 */
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				objset_t *mos = spa->spa_meta_objset;

				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa_name(spa));
		}
	}

	mutex_exit(&spa->spa_props_lock);
}
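
/*
 * Illustrative sketch (added note, not part of the original source): a
 * numeric property such as autoexpand ends up as a single uint64 ZAP entry
 * keyed by its canonical name, roughly:
 *
 *	uint64_t intval = 1;
 *	VERIFY(zap_update(mos, spa->spa_pool_props_object,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
 *	    8, 1, &intval, tx) == 0);
 *
 * while string properties are stored as 1-byte-wide entries of length
 * strlen(strval) + 1, exactly as spa_sync_props() does above.
 */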

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;
	int error;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
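
/*
 * Illustrative usage sketch (added note, not part of the original source):
 * a caller that needs every active pool flushed to stable storage can
 * simply call
 *
 *	spa_sync_allpools();
 *
 * which, as shown above, takes a reference on each active, non-suspended
 * pool and waits for its open transaction group with txg_wait_synced().
 */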

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, as a spare and as a
 * replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}
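
/*
 * Added note (illustrative interpretation, not part of the original
 * source): a spare actively replacing a device in this pool accounts for
 * the baseline count of 2 mentioned above (once as a configured spare,
 * once as a replacement child), so a count greater than 2 for a spare
 * whose active pool is this one suggests another pool is holding it as
 * well, which is the shared-spare case this check looks for.
 */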

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
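
/*
 * Illustrative usage sketch (added note, not part of the original source):
 * a vdev-related state change would typically be announced as
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * where the event name is one of the ESC_ZFS_* definitions from
 * sys/sysevent/eventdefs.h (the specific name here is only an example).
 * Passing a NULL vdev posts a pool-wide event carrying just the pool name
 * and GUID attributes.
 */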