1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * This file contains all the routines used when modifying on-disk SPA state. 29 * This includes opening, importing, destroying, exporting a pool, and syncing a 30 * pool. 31 */ 32 33 #include <sys/zfs_context.h> 34 #include <sys/fm/fs/zfs.h> 35 #include <sys/spa_impl.h> 36 #include <sys/zio.h> 37 #include <sys/zio_checksum.h> 38 #include <sys/zio_compress.h> 39 #include <sys/dmu.h> 40 #include <sys/dmu_tx.h> 41 #include <sys/zap.h> 42 #include <sys/zil.h> 43 #include <sys/vdev_impl.h> 44 #include <sys/metaslab.h> 45 #include <sys/metaslab_impl.h> 46 #include <sys/uberblock_impl.h> 47 #include <sys/txg.h> 48 #include <sys/avl.h> 49 #include <sys/dmu_traverse.h> 50 #include <sys/dmu_objset.h> 51 #include <sys/unique.h> 52 #include <sys/dsl_pool.h> 53 #include <sys/dsl_dataset.h> 54 #include <sys/dsl_dir.h> 55 #include <sys/dsl_prop.h> 56 #include <sys/dsl_synctask.h> 57 #include <sys/fs/zfs.h> 58 #include <sys/arc.h> 59 #include <sys/callb.h> 60 #include <sys/systeminfo.h> 61 #include <sys/sunddi.h> 62 #include <sys/spa_boot.h> 63 #include <sys/zfs_ioctl.h> 64 65 #ifdef _KERNEL 66 #include <sys/zone.h> 67 #endif /* _KERNEL */ 68 69 #include "zfs_prop.h" 70 #include "zfs_comutil.h" 71 72 enum zti_modes { 73 zti_mode_fixed, /* value is # of threads (min 1) */ 74 zti_mode_online_percent, /* value is % of online CPUs */ 75 zti_mode_tune, /* fill from zio_taskq_tune_* */ 76 zti_nmodes 77 }; 78 79 #define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) } 80 #define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) } 81 #define ZTI_THREAD_TUNE { zti_mode_tune, 0 } 82 83 #define ZTI_THREAD_ONE ZTI_THREAD_FIX(1) 84 85 typedef struct zio_taskq_info { 86 const char *zti_name; 87 struct { 88 enum zti_modes zti_mode; 89 uint_t zti_value; 90 } zti_nthreads[ZIO_TASKQ_TYPES]; 91 } zio_taskq_info_t; 92 93 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 94 "issue", "intr" 95 }; 96 97 const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = { 98 /* ISSUE INTR */ 99 { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, 100 { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } }, 101 { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } }, 102 { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, 103 { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, 104 { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, 105 }; 106 107 enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; 108 uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */ 109 110 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t 
*tx); 111 static boolean_t spa_has_active_shared_spare(spa_t *spa); 112 113 /* 114 * ========================================================================== 115 * SPA properties routines 116 * ========================================================================== 117 */ 118 119 /* 120 * Add a (source=src, propname=propval) list to an nvlist. 121 */ 122 static void 123 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 124 uint64_t intval, zprop_source_t src) 125 { 126 const char *propname = zpool_prop_to_name(prop); 127 nvlist_t *propval; 128 129 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 130 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 131 132 if (strval != NULL) 133 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 134 else 135 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 136 137 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 138 nvlist_free(propval); 139 } 140 141 /* 142 * Get property values from the spa configuration. 143 */ 144 static void 145 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 146 { 147 uint64_t size; 148 uint64_t used; 149 uint64_t cap, version; 150 zprop_source_t src = ZPROP_SRC_NONE; 151 spa_config_dirent_t *dp; 152 153 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 154 155 if (spa->spa_root_vdev != NULL) { 156 size = spa_get_space(spa); 157 used = spa_get_alloc(spa); 158 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 159 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 160 spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); 161 spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, 162 size - used, src); 163 164 cap = (size == 0) ? 0 : (used * 100 / size); 165 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 166 167 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 168 spa->spa_root_vdev->vdev_state, src); 169 170 version = spa_version(spa); 171 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 172 src = ZPROP_SRC_DEFAULT; 173 else 174 src = ZPROP_SRC_LOCAL; 175 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 176 } 177 178 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 179 180 if (spa->spa_root != NULL) 181 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 182 0, ZPROP_SRC_LOCAL); 183 184 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 185 if (dp->scd_path == NULL) { 186 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 187 "none", 0, ZPROP_SRC_LOCAL); 188 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 189 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 190 dp->scd_path, 0, ZPROP_SRC_LOCAL); 191 } 192 } 193 } 194 195 /* 196 * Get zpool property values. 197 */ 198 int 199 spa_prop_get(spa_t *spa, nvlist_t **nvp) 200 { 201 zap_cursor_t zc; 202 zap_attribute_t za; 203 objset_t *mos = spa->spa_meta_objset; 204 int err; 205 206 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 207 208 mutex_enter(&spa->spa_props_lock); 209 210 /* 211 * Get properties from the spa config. 212 */ 213 spa_prop_get_config(spa, nvp); 214 215 /* If no pool property object, no more prop to get. */ 216 if (spa->spa_pool_props_object == 0) { 217 mutex_exit(&spa->spa_props_lock); 218 return (0); 219 } 220 221 /* 222 * Get properties from the MOS pool property object. 
223 */ 224 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 225 (err = zap_cursor_retrieve(&zc, &za)) == 0; 226 zap_cursor_advance(&zc)) { 227 uint64_t intval = 0; 228 char *strval = NULL; 229 zprop_source_t src = ZPROP_SRC_DEFAULT; 230 zpool_prop_t prop; 231 232 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 233 continue; 234 235 switch (za.za_integer_length) { 236 case 8: 237 /* integer property */ 238 if (za.za_first_integer != 239 zpool_prop_default_numeric(prop)) 240 src = ZPROP_SRC_LOCAL; 241 242 if (prop == ZPOOL_PROP_BOOTFS) { 243 dsl_pool_t *dp; 244 dsl_dataset_t *ds = NULL; 245 246 dp = spa_get_dsl(spa); 247 rw_enter(&dp->dp_config_rwlock, RW_READER); 248 if (err = dsl_dataset_hold_obj(dp, 249 za.za_first_integer, FTAG, &ds)) { 250 rw_exit(&dp->dp_config_rwlock); 251 break; 252 } 253 254 strval = kmem_alloc( 255 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 256 KM_SLEEP); 257 dsl_dataset_name(ds, strval); 258 dsl_dataset_rele(ds, FTAG); 259 rw_exit(&dp->dp_config_rwlock); 260 } else { 261 strval = NULL; 262 intval = za.za_first_integer; 263 } 264 265 spa_prop_add_list(*nvp, prop, strval, intval, src); 266 267 if (strval != NULL) 268 kmem_free(strval, 269 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 270 271 break; 272 273 case 1: 274 /* string property */ 275 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 276 err = zap_lookup(mos, spa->spa_pool_props_object, 277 za.za_name, 1, za.za_num_integers, strval); 278 if (err) { 279 kmem_free(strval, za.za_num_integers); 280 break; 281 } 282 spa_prop_add_list(*nvp, prop, strval, 0, src); 283 kmem_free(strval, za.za_num_integers); 284 break; 285 286 default: 287 break; 288 } 289 } 290 zap_cursor_fini(&zc); 291 mutex_exit(&spa->spa_props_lock); 292 out: 293 if (err && err != ENOENT) { 294 nvlist_free(*nvp); 295 *nvp = NULL; 296 return (err); 297 } 298 299 return (0); 300 } 301 302 /* 303 * Validate the given pool properties nvlist and modify the list 304 * for the property values to be set. 305 */ 306 static int 307 spa_prop_validate(spa_t *spa, nvlist_t *props) 308 { 309 nvpair_t *elem; 310 int error = 0, reset_bootfs = 0; 311 uint64_t objnum; 312 313 elem = NULL; 314 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 315 zpool_prop_t prop; 316 char *propname, *strval; 317 uint64_t intval; 318 objset_t *os; 319 char *slash; 320 321 propname = nvpair_name(elem); 322 323 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 324 return (EINVAL); 325 326 switch (prop) { 327 case ZPOOL_PROP_VERSION: 328 error = nvpair_value_uint64(elem, &intval); 329 if (!error && 330 (intval < spa_version(spa) || intval > SPA_VERSION)) 331 error = EINVAL; 332 break; 333 334 case ZPOOL_PROP_DELEGATION: 335 case ZPOOL_PROP_AUTOREPLACE: 336 case ZPOOL_PROP_LISTSNAPS: 337 case ZPOOL_PROP_AUTOEXPAND: 338 error = nvpair_value_uint64(elem, &intval); 339 if (!error && intval > 1) 340 error = EINVAL; 341 break; 342 343 case ZPOOL_PROP_BOOTFS: 344 /* 345 * If the pool version is less than SPA_VERSION_BOOTFS, 346 * or the pool is still being created (version == 0), 347 * the bootfs property cannot be set. 
348 */ 349 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 350 error = ENOTSUP; 351 break; 352 } 353 354 /* 355 * Make sure the vdev config is bootable 356 */ 357 if (!vdev_is_bootable(spa->spa_root_vdev)) { 358 error = ENOTSUP; 359 break; 360 } 361 362 reset_bootfs = 1; 363 364 error = nvpair_value_string(elem, &strval); 365 366 if (!error) { 367 uint64_t compress; 368 369 if (strval == NULL || strval[0] == '\0') { 370 objnum = zpool_prop_default_numeric( 371 ZPOOL_PROP_BOOTFS); 372 break; 373 } 374 375 if (error = dmu_objset_hold(strval, FTAG, &os)) 376 break; 377 378 /* Must be ZPL and not gzip compressed. */ 379 380 if (dmu_objset_type(os) != DMU_OST_ZFS) { 381 error = ENOTSUP; 382 } else if ((error = dsl_prop_get_integer(strval, 383 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 384 &compress, NULL)) == 0 && 385 !BOOTFS_COMPRESS_VALID(compress)) { 386 error = ENOTSUP; 387 } else { 388 objnum = dmu_objset_id(os); 389 } 390 dmu_objset_rele(os, FTAG); 391 } 392 break; 393 394 case ZPOOL_PROP_FAILUREMODE: 395 error = nvpair_value_uint64(elem, &intval); 396 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 397 intval > ZIO_FAILURE_MODE_PANIC)) 398 error = EINVAL; 399 400 /* 401 * This is a special case which only occurs when 402 * the pool has completely failed. This allows 403 * the user to change the in-core failmode property 404 * without syncing it out to disk (I/Os might 405 * currently be blocked). We do this by returning 406 * EIO to the caller (spa_prop_set) to trick it 407 * into thinking we encountered a property validation 408 * error. 409 */ 410 if (!error && spa_suspended(spa)) { 411 spa->spa_failmode = intval; 412 error = EIO; 413 } 414 break; 415 416 case ZPOOL_PROP_CACHEFILE: 417 if ((error = nvpair_value_string(elem, &strval)) != 0) 418 break; 419 420 if (strval[0] == '\0') 421 break; 422 423 if (strcmp(strval, "none") == 0) 424 break; 425 426 if (strval[0] != '/') { 427 error = EINVAL; 428 break; 429 } 430 431 slash = strrchr(strval, '/'); 432 ASSERT(slash != NULL); 433 434 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 435 strcmp(slash, "/..") == 0) 436 error = EINVAL; 437 break; 438 } 439 440 if (error) 441 break; 442 } 443 444 if (!error && reset_bootfs) { 445 error = nvlist_remove(props, 446 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 447 448 if (!error) { 449 error = nvlist_add_uint64(props, 450 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 451 } 452 } 453 454 return (error); 455 } 456 457 void 458 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 459 { 460 char *cachefile; 461 spa_config_dirent_t *dp; 462 463 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 464 &cachefile) != 0) 465 return; 466 467 dp = kmem_alloc(sizeof (spa_config_dirent_t), 468 KM_SLEEP); 469 470 if (cachefile[0] == '\0') 471 dp->scd_path = spa_strdup(spa_config_path); 472 else if (strcmp(cachefile, "none") == 0) 473 dp->scd_path = NULL; 474 else 475 dp->scd_path = spa_strdup(cachefile); 476 477 list_insert_head(&spa->spa_config_list, dp); 478 if (need_sync) 479 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 480 } 481 482 int 483 spa_prop_set(spa_t *spa, nvlist_t *nvp) 484 { 485 int error; 486 nvpair_t *elem; 487 boolean_t need_sync = B_FALSE; 488 zpool_prop_t prop; 489 490 if ((error = spa_prop_validate(spa, nvp)) != 0) 491 return (error); 492 493 elem = NULL; 494 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 495 if ((prop = zpool_name_to_prop( 496 nvpair_name(elem))) == ZPROP_INVAL) 497 return (EINVAL); 498 499 if (prop == 
ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 500 continue; 501 502 need_sync = B_TRUE; 503 break; 504 } 505 506 if (need_sync) 507 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 508 spa, nvp, 3)); 509 else 510 return (0); 511 } 512 513 /* 514 * If the bootfs property value is dsobj, clear it. 515 */ 516 void 517 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 518 { 519 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 520 VERIFY(zap_remove(spa->spa_meta_objset, 521 spa->spa_pool_props_object, 522 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 523 spa->spa_bootfs = 0; 524 } 525 } 526 527 /* 528 * ========================================================================== 529 * SPA state manipulation (open/create/destroy/import/export) 530 * ========================================================================== 531 */ 532 533 static int 534 spa_error_entry_compare(const void *a, const void *b) 535 { 536 spa_error_entry_t *sa = (spa_error_entry_t *)a; 537 spa_error_entry_t *sb = (spa_error_entry_t *)b; 538 int ret; 539 540 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 541 sizeof (zbookmark_t)); 542 543 if (ret < 0) 544 return (-1); 545 else if (ret > 0) 546 return (1); 547 else 548 return (0); 549 } 550 551 /* 552 * Utility function which retrieves copies of the current logs and 553 * re-initializes them in the process. 554 */ 555 void 556 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 557 { 558 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 559 560 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 561 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 562 563 avl_create(&spa->spa_errlist_scrub, 564 spa_error_entry_compare, sizeof (spa_error_entry_t), 565 offsetof(spa_error_entry_t, se_avl)); 566 avl_create(&spa->spa_errlist_last, 567 spa_error_entry_compare, sizeof (spa_error_entry_t), 568 offsetof(spa_error_entry_t, se_avl)); 569 } 570 571 /* 572 * Activate an uninitialized pool. 
573 */ 574 static void 575 spa_activate(spa_t *spa, int mode) 576 { 577 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 578 579 spa->spa_state = POOL_STATE_ACTIVE; 580 spa->spa_mode = mode; 581 582 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 583 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 584 585 for (int t = 0; t < ZIO_TYPES; t++) { 586 const zio_taskq_info_t *ztip = &zio_taskqs[t]; 587 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 588 enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; 589 uint_t value = ztip->zti_nthreads[q].zti_value; 590 char name[32]; 591 592 (void) snprintf(name, sizeof (name), 593 "%s_%s", ztip->zti_name, zio_taskq_types[q]); 594 595 if (mode == zti_mode_tune) { 596 mode = zio_taskq_tune_mode; 597 value = zio_taskq_tune_value; 598 if (mode == zti_mode_tune) 599 mode = zti_mode_online_percent; 600 } 601 602 switch (mode) { 603 case zti_mode_fixed: 604 ASSERT3U(value, >=, 1); 605 value = MAX(value, 1); 606 607 spa->spa_zio_taskq[t][q] = taskq_create(name, 608 value, maxclsyspri, 50, INT_MAX, 609 TASKQ_PREPOPULATE); 610 break; 611 612 case zti_mode_online_percent: 613 spa->spa_zio_taskq[t][q] = taskq_create(name, 614 value, maxclsyspri, 50, INT_MAX, 615 TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); 616 break; 617 618 case zti_mode_tune: 619 default: 620 panic("unrecognized mode for " 621 "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " 622 "in spa_activate()", 623 t, q, mode, value); 624 break; 625 } 626 } 627 } 628 629 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 630 offsetof(vdev_t, vdev_config_dirty_node)); 631 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 632 offsetof(vdev_t, vdev_state_dirty_node)); 633 634 txg_list_create(&spa->spa_vdev_txg_list, 635 offsetof(struct vdev, vdev_txg_node)); 636 637 avl_create(&spa->spa_errlist_scrub, 638 spa_error_entry_compare, sizeof (spa_error_entry_t), 639 offsetof(spa_error_entry_t, se_avl)); 640 avl_create(&spa->spa_errlist_last, 641 spa_error_entry_compare, sizeof (spa_error_entry_t), 642 offsetof(spa_error_entry_t, se_avl)); 643 } 644 645 /* 646 * Opposite of spa_activate(). 647 */ 648 static void 649 spa_deactivate(spa_t *spa) 650 { 651 ASSERT(spa->spa_sync_on == B_FALSE); 652 ASSERT(spa->spa_dsl_pool == NULL); 653 ASSERT(spa->spa_root_vdev == NULL); 654 ASSERT(spa->spa_async_zio_root == NULL); 655 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 656 657 txg_list_destroy(&spa->spa_vdev_txg_list); 658 659 list_destroy(&spa->spa_config_dirty_list); 660 list_destroy(&spa->spa_state_dirty_list); 661 662 for (int t = 0; t < ZIO_TYPES; t++) { 663 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 664 taskq_destroy(spa->spa_zio_taskq[t][q]); 665 spa->spa_zio_taskq[t][q] = NULL; 666 } 667 } 668 669 metaslab_class_destroy(spa->spa_normal_class); 670 spa->spa_normal_class = NULL; 671 672 metaslab_class_destroy(spa->spa_log_class); 673 spa->spa_log_class = NULL; 674 675 /* 676 * If this was part of an import or the open otherwise failed, we may 677 * still have errors left in the queues. Empty them just in case. 678 */ 679 spa_errlog_drain(spa); 680 681 avl_destroy(&spa->spa_errlist_scrub); 682 avl_destroy(&spa->spa_errlist_last); 683 684 spa->spa_state = POOL_STATE_UNINITIALIZED; 685 } 686 687 /* 688 * Verify a pool configuration, and construct the vdev tree appropriately. This 689 * will create all the necessary vdevs in the appropriate layout, with each vdev 690 * in the CLOSED state. This will prep the pool before open/creation/import. 
691 * All vdev validation is done by the vdev_alloc() routine. 692 */ 693 static int 694 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 695 uint_t id, int atype) 696 { 697 nvlist_t **child; 698 uint_t children; 699 int error; 700 701 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 702 return (error); 703 704 if ((*vdp)->vdev_ops->vdev_op_leaf) 705 return (0); 706 707 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 708 &child, &children); 709 710 if (error == ENOENT) 711 return (0); 712 713 if (error) { 714 vdev_free(*vdp); 715 *vdp = NULL; 716 return (EINVAL); 717 } 718 719 for (int c = 0; c < children; c++) { 720 vdev_t *vd; 721 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 722 atype)) != 0) { 723 vdev_free(*vdp); 724 *vdp = NULL; 725 return (error); 726 } 727 } 728 729 ASSERT(*vdp != NULL); 730 731 return (0); 732 } 733 734 /* 735 * Opposite of spa_load(). 736 */ 737 static void 738 spa_unload(spa_t *spa) 739 { 740 int i; 741 742 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 743 744 /* 745 * Stop async tasks. 746 */ 747 spa_async_suspend(spa); 748 749 /* 750 * Stop syncing. 751 */ 752 if (spa->spa_sync_on) { 753 txg_sync_stop(spa->spa_dsl_pool); 754 spa->spa_sync_on = B_FALSE; 755 } 756 757 /* 758 * Wait for any outstanding async I/O to complete. 759 */ 760 if (spa->spa_async_zio_root != NULL) { 761 (void) zio_wait(spa->spa_async_zio_root); 762 spa->spa_async_zio_root = NULL; 763 } 764 765 /* 766 * Close the dsl pool. 767 */ 768 if (spa->spa_dsl_pool) { 769 dsl_pool_close(spa->spa_dsl_pool); 770 spa->spa_dsl_pool = NULL; 771 } 772 773 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 774 775 /* 776 * Drop and purge level 2 cache 777 */ 778 spa_l2cache_drop(spa); 779 780 /* 781 * Close all vdevs. 782 */ 783 if (spa->spa_root_vdev) 784 vdev_free(spa->spa_root_vdev); 785 ASSERT(spa->spa_root_vdev == NULL); 786 787 for (i = 0; i < spa->spa_spares.sav_count; i++) 788 vdev_free(spa->spa_spares.sav_vdevs[i]); 789 if (spa->spa_spares.sav_vdevs) { 790 kmem_free(spa->spa_spares.sav_vdevs, 791 spa->spa_spares.sav_count * sizeof (void *)); 792 spa->spa_spares.sav_vdevs = NULL; 793 } 794 if (spa->spa_spares.sav_config) { 795 nvlist_free(spa->spa_spares.sav_config); 796 spa->spa_spares.sav_config = NULL; 797 } 798 spa->spa_spares.sav_count = 0; 799 800 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 801 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 802 if (spa->spa_l2cache.sav_vdevs) { 803 kmem_free(spa->spa_l2cache.sav_vdevs, 804 spa->spa_l2cache.sav_count * sizeof (void *)); 805 spa->spa_l2cache.sav_vdevs = NULL; 806 } 807 if (spa->spa_l2cache.sav_config) { 808 nvlist_free(spa->spa_l2cache.sav_config); 809 spa->spa_l2cache.sav_config = NULL; 810 } 811 spa->spa_l2cache.sav_count = 0; 812 813 spa->spa_async_suspended = 0; 814 815 spa_config_exit(spa, SCL_ALL, FTAG); 816 } 817 818 /* 819 * Load (or re-load) the current list of vdevs describing the active spares for 820 * this pool. When this is called, we have some form of basic information in 821 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 822 * then re-generate a more complete list including status information. 823 */ 824 static void 825 spa_load_spares(spa_t *spa) 826 { 827 nvlist_t **spares; 828 uint_t nspares; 829 int i; 830 vdev_t *vd, *tvd; 831 832 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 833 834 /* 835 * First, close and free any existing spare vdevs. 
836 */ 837 for (i = 0; i < spa->spa_spares.sav_count; i++) { 838 vd = spa->spa_spares.sav_vdevs[i]; 839 840 /* Undo the call to spa_activate() below */ 841 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 842 B_FALSE)) != NULL && tvd->vdev_isspare) 843 spa_spare_remove(tvd); 844 vdev_close(vd); 845 vdev_free(vd); 846 } 847 848 if (spa->spa_spares.sav_vdevs) 849 kmem_free(spa->spa_spares.sav_vdevs, 850 spa->spa_spares.sav_count * sizeof (void *)); 851 852 if (spa->spa_spares.sav_config == NULL) 853 nspares = 0; 854 else 855 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 856 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 857 858 spa->spa_spares.sav_count = (int)nspares; 859 spa->spa_spares.sav_vdevs = NULL; 860 861 if (nspares == 0) 862 return; 863 864 /* 865 * Construct the array of vdevs, opening them to get status in the 866 * process. For each spare, there is potentially two different vdev_t 867 * structures associated with it: one in the list of spares (used only 868 * for basic validation purposes) and one in the active vdev 869 * configuration (if it's spared in). During this phase we open and 870 * validate each vdev on the spare list. If the vdev also exists in the 871 * active configuration, then we also mark this vdev as an active spare. 872 */ 873 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 874 KM_SLEEP); 875 for (i = 0; i < spa->spa_spares.sav_count; i++) { 876 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 877 VDEV_ALLOC_SPARE) == 0); 878 ASSERT(vd != NULL); 879 880 spa->spa_spares.sav_vdevs[i] = vd; 881 882 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 883 B_FALSE)) != NULL) { 884 if (!tvd->vdev_isspare) 885 spa_spare_add(tvd); 886 887 /* 888 * We only mark the spare active if we were successfully 889 * able to load the vdev. Otherwise, importing a pool 890 * with a bad active spare would result in strange 891 * behavior, because multiple pool would think the spare 892 * is actively in use. 893 * 894 * There is a vulnerability here to an equally bizarre 895 * circumstance, where a dead active spare is later 896 * brought back to life (onlined or otherwise). Given 897 * the rarity of this scenario, and the extra complexity 898 * it adds, we ignore the possibility. 899 */ 900 if (!vdev_is_dead(tvd)) 901 spa_spare_activate(tvd); 902 } 903 904 vd->vdev_top = vd; 905 vd->vdev_aux = &spa->spa_spares; 906 907 if (vdev_open(vd) != 0) 908 continue; 909 910 if (vdev_validate_aux(vd) == 0) 911 spa_spare_add(vd); 912 } 913 914 /* 915 * Recompute the stashed list of spares, with status information 916 * this time. 917 */ 918 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 919 DATA_TYPE_NVLIST_ARRAY) == 0); 920 921 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 922 KM_SLEEP); 923 for (i = 0; i < spa->spa_spares.sav_count; i++) 924 spares[i] = vdev_config_generate(spa, 925 spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 926 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 927 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 928 for (i = 0; i < spa->spa_spares.sav_count; i++) 929 nvlist_free(spares[i]); 930 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 931 } 932 933 /* 934 * Load (or re-load) the current list of vdevs describing the active l2cache for 935 * this pool. When this is called, we have some form of basic information in 936 * 'spa_l2cache.sav_config'. 
We parse this into vdevs, try to open them, and 937 * then re-generate a more complete list including status information. 938 * Devices which are already active have their details maintained, and are 939 * not re-opened. 940 */ 941 static void 942 spa_load_l2cache(spa_t *spa) 943 { 944 nvlist_t **l2cache; 945 uint_t nl2cache; 946 int i, j, oldnvdevs; 947 uint64_t guid; 948 vdev_t *vd, **oldvdevs, **newvdevs; 949 spa_aux_vdev_t *sav = &spa->spa_l2cache; 950 951 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 952 953 if (sav->sav_config != NULL) { 954 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 955 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 956 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 957 } else { 958 nl2cache = 0; 959 } 960 961 oldvdevs = sav->sav_vdevs; 962 oldnvdevs = sav->sav_count; 963 sav->sav_vdevs = NULL; 964 sav->sav_count = 0; 965 966 /* 967 * Process new nvlist of vdevs. 968 */ 969 for (i = 0; i < nl2cache; i++) { 970 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 971 &guid) == 0); 972 973 newvdevs[i] = NULL; 974 for (j = 0; j < oldnvdevs; j++) { 975 vd = oldvdevs[j]; 976 if (vd != NULL && guid == vd->vdev_guid) { 977 /* 978 * Retain previous vdev for add/remove ops. 979 */ 980 newvdevs[i] = vd; 981 oldvdevs[j] = NULL; 982 break; 983 } 984 } 985 986 if (newvdevs[i] == NULL) { 987 /* 988 * Create new vdev 989 */ 990 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 991 VDEV_ALLOC_L2CACHE) == 0); 992 ASSERT(vd != NULL); 993 newvdevs[i] = vd; 994 995 /* 996 * Commit this vdev as an l2cache device, 997 * even if it fails to open. 998 */ 999 spa_l2cache_add(vd); 1000 1001 vd->vdev_top = vd; 1002 vd->vdev_aux = sav; 1003 1004 spa_l2cache_activate(vd); 1005 1006 if (vdev_open(vd) != 0) 1007 continue; 1008 1009 (void) vdev_validate_aux(vd); 1010 1011 if (!vdev_is_dead(vd)) 1012 l2arc_add_vdev(spa, vd); 1013 } 1014 } 1015 1016 /* 1017 * Purge vdevs that were dropped 1018 */ 1019 for (i = 0; i < oldnvdevs; i++) { 1020 uint64_t pool; 1021 1022 vd = oldvdevs[i]; 1023 if (vd != NULL) { 1024 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1025 pool != 0ULL && l2arc_vdev_present(vd)) 1026 l2arc_remove_vdev(vd); 1027 (void) vdev_close(vd); 1028 spa_l2cache_remove(vd); 1029 } 1030 } 1031 1032 if (oldvdevs) 1033 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1034 1035 if (sav->sav_config == NULL) 1036 goto out; 1037 1038 sav->sav_vdevs = newvdevs; 1039 sav->sav_count = (int)nl2cache; 1040 1041 /* 1042 * Recompute the stashed list of l2cache devices, with status 1043 * information this time. 
1044 */ 1045 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1046 DATA_TYPE_NVLIST_ARRAY) == 0); 1047 1048 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1049 for (i = 0; i < sav->sav_count; i++) 1050 l2cache[i] = vdev_config_generate(spa, 1051 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 1052 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1053 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1054 out: 1055 for (i = 0; i < sav->sav_count; i++) 1056 nvlist_free(l2cache[i]); 1057 if (sav->sav_count) 1058 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1059 } 1060 1061 static int 1062 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1063 { 1064 dmu_buf_t *db; 1065 char *packed = NULL; 1066 size_t nvsize = 0; 1067 int error; 1068 *value = NULL; 1069 1070 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1071 nvsize = *(uint64_t *)db->db_data; 1072 dmu_buf_rele(db, FTAG); 1073 1074 packed = kmem_alloc(nvsize, KM_SLEEP); 1075 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1076 DMU_READ_PREFETCH); 1077 if (error == 0) 1078 error = nvlist_unpack(packed, nvsize, value, 0); 1079 kmem_free(packed, nvsize); 1080 1081 return (error); 1082 } 1083 1084 /* 1085 * Checks to see if the given vdev could not be opened, in which case we post a 1086 * sysevent to notify the autoreplace code that the device has been removed. 1087 */ 1088 static void 1089 spa_check_removed(vdev_t *vd) 1090 { 1091 for (int c = 0; c < vd->vdev_children; c++) 1092 spa_check_removed(vd->vdev_child[c]); 1093 1094 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1095 zfs_post_autoreplace(vd->vdev_spa, vd); 1096 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1097 } 1098 } 1099 1100 /* 1101 * Load the slog device state from the config object since it's possible 1102 * that the label does not contain the most up-to-date information. 1103 */ 1104 void 1105 spa_load_log_state(spa_t *spa, nvlist_t *nv) 1106 { 1107 vdev_t *ovd, *rvd = spa->spa_root_vdev; 1108 1109 /* 1110 * Load the original root vdev tree from the passed config. 1111 */ 1112 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1113 VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1114 1115 for (int c = 0; c < rvd->vdev_children; c++) { 1116 vdev_t *cvd = rvd->vdev_child[c]; 1117 if (cvd->vdev_islog) 1118 vdev_load_log_state(cvd, ovd->vdev_child[c]); 1119 } 1120 vdev_free(ovd); 1121 spa_config_exit(spa, SCL_ALL, FTAG); 1122 } 1123 1124 /* 1125 * Check for missing log devices 1126 */ 1127 int 1128 spa_check_logs(spa_t *spa) 1129 { 1130 switch (spa->spa_log_state) { 1131 case SPA_LOG_MISSING: 1132 /* need to recheck in case slog has been restored */ 1133 case SPA_LOG_UNKNOWN: 1134 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1135 DS_FIND_CHILDREN)) { 1136 spa->spa_log_state = SPA_LOG_MISSING; 1137 return (1); 1138 } 1139 break; 1140 } 1141 return (0); 1142 } 1143 1144 /* 1145 * Load an existing storage pool, using the pool's builtin spa_config as a 1146 * source of configuration information. 
1147 */ 1148 static int 1149 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 1150 { 1151 int error = 0; 1152 nvlist_t *nvconfig, *nvroot = NULL; 1153 vdev_t *rvd; 1154 uberblock_t *ub = &spa->spa_uberblock; 1155 uint64_t config_cache_txg = spa->spa_config_txg; 1156 uint64_t pool_guid; 1157 uint64_t version; 1158 uint64_t autoreplace = 0; 1159 int orig_mode = spa->spa_mode; 1160 char *ereport = FM_EREPORT_ZFS_POOL; 1161 1162 /* 1163 * If this is an untrusted config, access the pool in read-only mode. 1164 * This prevents things like resilvering recently removed devices. 1165 */ 1166 if (!mosconfig) 1167 spa->spa_mode = FREAD; 1168 1169 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1170 1171 spa->spa_load_state = state; 1172 1173 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 1174 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 1175 error = EINVAL; 1176 goto out; 1177 } 1178 1179 /* 1180 * Versioning wasn't explicitly added to the label until later, so if 1181 * it's not present treat it as the initial version. 1182 */ 1183 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 1184 version = SPA_VERSION_INITIAL; 1185 1186 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1187 &spa->spa_config_txg); 1188 1189 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1190 spa_guid_exists(pool_guid, 0)) { 1191 error = EEXIST; 1192 goto out; 1193 } 1194 1195 spa->spa_load_guid = pool_guid; 1196 1197 /* 1198 * Create "The Godfather" zio to hold all async IOs 1199 */ 1200 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1201 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1202 1203 /* 1204 * Parse the configuration into a vdev tree. We explicitly set the 1205 * value that will be returned by spa_version() since parsing the 1206 * configuration requires knowing the version number. 1207 */ 1208 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1209 spa->spa_ubsync.ub_version = version; 1210 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1211 spa_config_exit(spa, SCL_ALL, FTAG); 1212 1213 if (error != 0) 1214 goto out; 1215 1216 ASSERT(spa->spa_root_vdev == rvd); 1217 ASSERT(spa_guid(spa) == pool_guid); 1218 1219 /* 1220 * Try to open all vdevs, loading each label in the process. 1221 */ 1222 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1223 error = vdev_open(rvd); 1224 spa_config_exit(spa, SCL_ALL, FTAG); 1225 if (error != 0) 1226 goto out; 1227 1228 /* 1229 * We need to validate the vdev labels against the configuration that 1230 * we have in hand, which is dependent on the setting of mosconfig. If 1231 * mosconfig is true then we're validating the vdev labels based on 1232 * that config. Otherwise, we're validating against the cached config 1233 * (zpool.cache) that was read when we loaded the zfs module, and then 1234 * later we will recursively call spa_load() and validate against 1235 * the vdev config. 1236 */ 1237 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1238 error = vdev_validate(rvd); 1239 spa_config_exit(spa, SCL_ALL, FTAG); 1240 if (error != 0) 1241 goto out; 1242 1243 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1244 error = ENXIO; 1245 goto out; 1246 } 1247 1248 /* 1249 * Find the best uberblock. 1250 */ 1251 vdev_uberblock_load(NULL, rvd, ub); 1252 1253 /* 1254 * If we weren't able to find a single valid uberblock, return failure. 
1255 */ 1256 if (ub->ub_txg == 0) { 1257 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1258 VDEV_AUX_CORRUPT_DATA); 1259 error = ENXIO; 1260 goto out; 1261 } 1262 1263 /* 1264 * If the pool is newer than the code, we can't open it. 1265 */ 1266 if (ub->ub_version > SPA_VERSION) { 1267 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1268 VDEV_AUX_VERSION_NEWER); 1269 error = ENOTSUP; 1270 goto out; 1271 } 1272 1273 /* 1274 * If the vdev guid sum doesn't match the uberblock, we have an 1275 * incomplete configuration. 1276 */ 1277 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1278 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1279 VDEV_AUX_BAD_GUID_SUM); 1280 error = ENXIO; 1281 goto out; 1282 } 1283 1284 /* 1285 * Initialize internal SPA structures. 1286 */ 1287 spa->spa_state = POOL_STATE_ACTIVE; 1288 spa->spa_ubsync = spa->spa_uberblock; 1289 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1290 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1291 if (error) { 1292 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1293 VDEV_AUX_CORRUPT_DATA); 1294 goto out; 1295 } 1296 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1297 1298 if (zap_lookup(spa->spa_meta_objset, 1299 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1300 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1301 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1302 VDEV_AUX_CORRUPT_DATA); 1303 error = EIO; 1304 goto out; 1305 } 1306 1307 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) { 1308 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1309 VDEV_AUX_CORRUPT_DATA); 1310 error = EIO; 1311 goto out; 1312 } 1313 1314 if (!mosconfig) { 1315 uint64_t hostid; 1316 1317 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 1318 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1319 char *hostname; 1320 unsigned long myhostid = 0; 1321 1322 VERIFY(nvlist_lookup_string(nvconfig, 1323 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1324 1325 #ifdef _KERNEL 1326 myhostid = zone_get_hostid(NULL); 1327 #else /* _KERNEL */ 1328 /* 1329 * We're emulating the system's hostid in userland, so 1330 * we can't use zone_get_hostid(). 1331 */ 1332 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1333 #endif /* _KERNEL */ 1334 if (hostid != 0 && myhostid != 0 && 1335 hostid != myhostid) { 1336 cmn_err(CE_WARN, "pool '%s' could not be " 1337 "loaded as it was last accessed by " 1338 "another system (host: %s hostid: 0x%lx). " 1339 "See: http://www.sun.com/msg/ZFS-8000-EY", 1340 spa_name(spa), hostname, 1341 (unsigned long)hostid); 1342 error = EBADF; 1343 goto out; 1344 } 1345 } 1346 1347 spa_config_set(spa, nvconfig); 1348 spa_unload(spa); 1349 spa_deactivate(spa); 1350 spa_activate(spa, orig_mode); 1351 1352 return (spa_load(spa, nvconfig, state, B_TRUE)); 1353 } 1354 1355 if (zap_lookup(spa->spa_meta_objset, 1356 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1357 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1358 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1359 VDEV_AUX_CORRUPT_DATA); 1360 error = EIO; 1361 goto out; 1362 } 1363 1364 /* 1365 * Load the bit that tells us to use the new accounting function 1366 * (raid-z deflation). If we have an older pool, this will not 1367 * be present. 
1368 */ 1369 error = zap_lookup(spa->spa_meta_objset, 1370 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1371 sizeof (uint64_t), 1, &spa->spa_deflate); 1372 if (error != 0 && error != ENOENT) { 1373 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1374 VDEV_AUX_CORRUPT_DATA); 1375 error = EIO; 1376 goto out; 1377 } 1378 1379 /* 1380 * Load the persistent error log. If we have an older pool, this will 1381 * not be present. 1382 */ 1383 error = zap_lookup(spa->spa_meta_objset, 1384 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1385 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1386 if (error != 0 && error != ENOENT) { 1387 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1388 VDEV_AUX_CORRUPT_DATA); 1389 error = EIO; 1390 goto out; 1391 } 1392 1393 error = zap_lookup(spa->spa_meta_objset, 1394 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1395 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1396 if (error != 0 && error != ENOENT) { 1397 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1398 VDEV_AUX_CORRUPT_DATA); 1399 error = EIO; 1400 goto out; 1401 } 1402 1403 /* 1404 * Load the history object. If we have an older pool, this 1405 * will not be present. 1406 */ 1407 error = zap_lookup(spa->spa_meta_objset, 1408 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1409 sizeof (uint64_t), 1, &spa->spa_history); 1410 if (error != 0 && error != ENOENT) { 1411 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1412 VDEV_AUX_CORRUPT_DATA); 1413 error = EIO; 1414 goto out; 1415 } 1416 1417 /* 1418 * Load any hot spares for this pool. 1419 */ 1420 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1421 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1422 if (error != 0 && error != ENOENT) { 1423 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1424 VDEV_AUX_CORRUPT_DATA); 1425 error = EIO; 1426 goto out; 1427 } 1428 if (error == 0) { 1429 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1430 if (load_nvlist(spa, spa->spa_spares.sav_object, 1431 &spa->spa_spares.sav_config) != 0) { 1432 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1433 VDEV_AUX_CORRUPT_DATA); 1434 error = EIO; 1435 goto out; 1436 } 1437 1438 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1439 spa_load_spares(spa); 1440 spa_config_exit(spa, SCL_ALL, FTAG); 1441 } 1442 1443 /* 1444 * Load any level 2 ARC devices for this pool. 
1445 */ 1446 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1447 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1448 &spa->spa_l2cache.sav_object); 1449 if (error != 0 && error != ENOENT) { 1450 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1451 VDEV_AUX_CORRUPT_DATA); 1452 error = EIO; 1453 goto out; 1454 } 1455 if (error == 0) { 1456 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1457 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1458 &spa->spa_l2cache.sav_config) != 0) { 1459 vdev_set_state(rvd, B_TRUE, 1460 VDEV_STATE_CANT_OPEN, 1461 VDEV_AUX_CORRUPT_DATA); 1462 error = EIO; 1463 goto out; 1464 } 1465 1466 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1467 spa_load_l2cache(spa); 1468 spa_config_exit(spa, SCL_ALL, FTAG); 1469 } 1470 1471 VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, 1472 &nvroot) == 0); 1473 spa_load_log_state(spa, nvroot); 1474 nvlist_free(nvconfig); 1475 1476 if (spa_check_logs(spa)) { 1477 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1478 VDEV_AUX_BAD_LOG); 1479 error = ENXIO; 1480 ereport = FM_EREPORT_ZFS_LOG_REPLAY; 1481 goto out; 1482 } 1483 1484 1485 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1486 1487 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1488 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1489 1490 if (error && error != ENOENT) { 1491 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1492 VDEV_AUX_CORRUPT_DATA); 1493 error = EIO; 1494 goto out; 1495 } 1496 1497 if (error == 0) { 1498 (void) zap_lookup(spa->spa_meta_objset, 1499 spa->spa_pool_props_object, 1500 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1501 sizeof (uint64_t), 1, &spa->spa_bootfs); 1502 (void) zap_lookup(spa->spa_meta_objset, 1503 spa->spa_pool_props_object, 1504 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1505 sizeof (uint64_t), 1, &autoreplace); 1506 (void) zap_lookup(spa->spa_meta_objset, 1507 spa->spa_pool_props_object, 1508 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1509 sizeof (uint64_t), 1, &spa->spa_delegation); 1510 (void) zap_lookup(spa->spa_meta_objset, 1511 spa->spa_pool_props_object, 1512 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1513 sizeof (uint64_t), 1, &spa->spa_failmode); 1514 (void) zap_lookup(spa->spa_meta_objset, 1515 spa->spa_pool_props_object, 1516 zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1517 sizeof (uint64_t), 1, &spa->spa_autoexpand); 1518 } 1519 1520 /* 1521 * If the 'autoreplace' property is set, then post a resource notifying 1522 * the ZFS DE that it should not issue any faults for unopenable 1523 * devices. We also iterate over the vdevs, and post a sysevent for any 1524 * unopenable vdevs so that the normal autoreplace handler can take 1525 * over. 1526 */ 1527 if (autoreplace && state != SPA_LOAD_TRYIMPORT) 1528 spa_check_removed(spa->spa_root_vdev); 1529 1530 /* 1531 * Load the vdev state for all toplevel vdevs. 1532 */ 1533 vdev_load(rvd); 1534 1535 /* 1536 * Propagate the leaf DTLs we just loaded all the way up the tree. 1537 */ 1538 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1539 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1540 spa_config_exit(spa, SCL_ALL, FTAG); 1541 1542 /* 1543 * Check the state of the root vdev. If it can't be opened, it 1544 * indicates one or more toplevel vdevs are faulted. 
1545 */ 1546 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1547 error = ENXIO; 1548 goto out; 1549 } 1550 1551 if (spa_writeable(spa)) { 1552 dmu_tx_t *tx; 1553 int need_update = B_FALSE; 1554 1555 ASSERT(state != SPA_LOAD_TRYIMPORT); 1556 1557 /* 1558 * Claim log blocks that haven't been committed yet. 1559 * This must all happen in a single txg. 1560 */ 1561 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1562 spa_first_txg(spa)); 1563 (void) dmu_objset_find(spa_name(spa), 1564 zil_claim, tx, DS_FIND_CHILDREN); 1565 dmu_tx_commit(tx); 1566 1567 spa->spa_log_state = SPA_LOG_GOOD; 1568 spa->spa_sync_on = B_TRUE; 1569 txg_sync_start(spa->spa_dsl_pool); 1570 1571 /* 1572 * Wait for all claims to sync. 1573 */ 1574 txg_wait_synced(spa->spa_dsl_pool, 0); 1575 1576 /* 1577 * If the config cache is stale, or we have uninitialized 1578 * metaslabs (see spa_vdev_add()), then update the config. 1579 * 1580 * If spa_load_verbatim is true, trust the current 1581 * in-core spa_config and update the disk labels. 1582 */ 1583 if (config_cache_txg != spa->spa_config_txg || 1584 state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) 1585 need_update = B_TRUE; 1586 1587 for (int c = 0; c < rvd->vdev_children; c++) 1588 if (rvd->vdev_child[c]->vdev_ms_array == 0) 1589 need_update = B_TRUE; 1590 1591 /* 1592 * Update the config cache asychronously in case we're the 1593 * root pool, in which case the config cache isn't writable yet. 1594 */ 1595 if (need_update) 1596 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1597 1598 /* 1599 * Check all DTLs to see if anything needs resilvering. 1600 */ 1601 if (vdev_resilver_needed(rvd, NULL, NULL)) 1602 spa_async_request(spa, SPA_ASYNC_RESILVER); 1603 1604 /* 1605 * Delete any inconsistent datasets. 1606 */ 1607 (void) dmu_objset_find(spa_name(spa), 1608 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 1609 1610 /* 1611 * Clean up any stale temporary dataset userrefs. 1612 */ 1613 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 1614 } 1615 1616 error = 0; 1617 out: 1618 spa->spa_minref = refcount_count(&spa->spa_refcount); 1619 if (error && error != EBADF) 1620 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1621 spa->spa_load_state = SPA_LOAD_NONE; 1622 spa->spa_ena = 0; 1623 1624 return (error); 1625 } 1626 1627 /* 1628 * Pool Open/Import 1629 * 1630 * The import case is identical to an open except that the configuration is sent 1631 * down from userland, instead of grabbed from the configuration cache. For the 1632 * case of an open, the pool configuration will exist in the 1633 * POOL_STATE_UNINITIALIZED state. 1634 * 1635 * The stats information (gen/count/ustats) is used to gather vdev statistics at 1636 * the same time open the pool, without having to keep around the spa_t in some 1637 * ambiguous state. 1638 */ 1639 static int 1640 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1641 { 1642 spa_t *spa; 1643 int error; 1644 int locked = B_FALSE; 1645 1646 *spapp = NULL; 1647 1648 /* 1649 * As disgusting as this is, we need to support recursive calls to this 1650 * function because dsl_dir_open() is called during spa_load(), and ends 1651 * up calling spa_open() again. The real fix is to figure out how to 1652 * avoid dsl_dir_open() calling this in the first place. 
1653 */ 1654 if (mutex_owner(&spa_namespace_lock) != curthread) { 1655 mutex_enter(&spa_namespace_lock); 1656 locked = B_TRUE; 1657 } 1658 1659 if ((spa = spa_lookup(pool)) == NULL) { 1660 if (locked) 1661 mutex_exit(&spa_namespace_lock); 1662 return (ENOENT); 1663 } 1664 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1665 1666 spa_activate(spa, spa_mode_global); 1667 1668 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1669 1670 if (error == EBADF) { 1671 /* 1672 * If vdev_validate() returns failure (indicated by 1673 * EBADF), it indicates that one of the vdevs indicates 1674 * that the pool has been exported or destroyed. If 1675 * this is the case, the config cache is out of sync and 1676 * we should remove the pool from the namespace. 1677 */ 1678 spa_unload(spa); 1679 spa_deactivate(spa); 1680 spa_config_sync(spa, B_TRUE, B_TRUE); 1681 spa_remove(spa); 1682 if (locked) 1683 mutex_exit(&spa_namespace_lock); 1684 return (ENOENT); 1685 } 1686 1687 if (error) { 1688 /* 1689 * We can't open the pool, but we still have useful 1690 * information: the state of each vdev after the 1691 * attempted vdev_open(). Return this to the user. 1692 */ 1693 if (config != NULL && spa->spa_root_vdev != NULL) 1694 *config = spa_config_generate(spa, NULL, -1ULL, 1695 B_TRUE); 1696 spa_unload(spa); 1697 spa_deactivate(spa); 1698 spa->spa_last_open_failed = B_TRUE; 1699 if (locked) 1700 mutex_exit(&spa_namespace_lock); 1701 *spapp = NULL; 1702 return (error); 1703 } else { 1704 spa->spa_last_open_failed = B_FALSE; 1705 } 1706 } 1707 1708 spa_open_ref(spa, tag); 1709 1710 if (locked) 1711 mutex_exit(&spa_namespace_lock); 1712 1713 *spapp = spa; 1714 1715 if (config != NULL) 1716 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1717 1718 return (0); 1719 } 1720 1721 int 1722 spa_open(const char *name, spa_t **spapp, void *tag) 1723 { 1724 return (spa_open_common(name, spapp, tag, NULL)); 1725 } 1726 1727 /* 1728 * Lookup the given spa_t, incrementing the inject count in the process, 1729 * preventing it from being exported or destroyed. 1730 */ 1731 spa_t * 1732 spa_inject_addref(char *name) 1733 { 1734 spa_t *spa; 1735 1736 mutex_enter(&spa_namespace_lock); 1737 if ((spa = spa_lookup(name)) == NULL) { 1738 mutex_exit(&spa_namespace_lock); 1739 return (NULL); 1740 } 1741 spa->spa_inject_ref++; 1742 mutex_exit(&spa_namespace_lock); 1743 1744 return (spa); 1745 } 1746 1747 void 1748 spa_inject_delref(spa_t *spa) 1749 { 1750 mutex_enter(&spa_namespace_lock); 1751 spa->spa_inject_ref--; 1752 mutex_exit(&spa_namespace_lock); 1753 } 1754 1755 /* 1756 * Add spares device information to the nvlist. 1757 */ 1758 static void 1759 spa_add_spares(spa_t *spa, nvlist_t *config) 1760 { 1761 nvlist_t **spares; 1762 uint_t i, nspares; 1763 nvlist_t *nvroot; 1764 uint64_t guid; 1765 vdev_stat_t *vs; 1766 uint_t vsc; 1767 uint64_t pool; 1768 1769 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1770 1771 if (spa->spa_spares.sav_count == 0) 1772 return; 1773 1774 VERIFY(nvlist_lookup_nvlist(config, 1775 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1776 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1777 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1778 if (nspares != 0) { 1779 VERIFY(nvlist_add_nvlist_array(nvroot, 1780 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1781 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1782 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1783 1784 /* 1785 * Go through and find any spares which have since been 1786 * repurposed as an active spare. 
If this is the case, update 1787 * their status appropriately. 1788 */ 1789 for (i = 0; i < nspares; i++) { 1790 VERIFY(nvlist_lookup_uint64(spares[i], 1791 ZPOOL_CONFIG_GUID, &guid) == 0); 1792 if (spa_spare_exists(guid, &pool, NULL) && 1793 pool != 0ULL) { 1794 VERIFY(nvlist_lookup_uint64_array( 1795 spares[i], ZPOOL_CONFIG_STATS, 1796 (uint64_t **)&vs, &vsc) == 0); 1797 vs->vs_state = VDEV_STATE_CANT_OPEN; 1798 vs->vs_aux = VDEV_AUX_SPARED; 1799 } 1800 } 1801 } 1802 } 1803 1804 /* 1805 * Add l2cache device information to the nvlist, including vdev stats. 1806 */ 1807 static void 1808 spa_add_l2cache(spa_t *spa, nvlist_t *config) 1809 { 1810 nvlist_t **l2cache; 1811 uint_t i, j, nl2cache; 1812 nvlist_t *nvroot; 1813 uint64_t guid; 1814 vdev_t *vd; 1815 vdev_stat_t *vs; 1816 uint_t vsc; 1817 1818 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1819 1820 if (spa->spa_l2cache.sav_count == 0) 1821 return; 1822 1823 VERIFY(nvlist_lookup_nvlist(config, 1824 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1825 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1826 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1827 if (nl2cache != 0) { 1828 VERIFY(nvlist_add_nvlist_array(nvroot, 1829 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1830 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1831 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1832 1833 /* 1834 * Update level 2 cache device stats. 1835 */ 1836 1837 for (i = 0; i < nl2cache; i++) { 1838 VERIFY(nvlist_lookup_uint64(l2cache[i], 1839 ZPOOL_CONFIG_GUID, &guid) == 0); 1840 1841 vd = NULL; 1842 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1843 if (guid == 1844 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1845 vd = spa->spa_l2cache.sav_vdevs[j]; 1846 break; 1847 } 1848 } 1849 ASSERT(vd != NULL); 1850 1851 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1852 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1853 vdev_get_stats(vd, vs); 1854 } 1855 } 1856 } 1857 1858 int 1859 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1860 { 1861 int error; 1862 spa_t *spa; 1863 1864 *config = NULL; 1865 error = spa_open_common(name, &spa, FTAG, config); 1866 1867 if (spa != NULL) { 1868 /* 1869 * This still leaves a window of inconsistency where the spares 1870 * or l2cache devices could change and the config would be 1871 * self-inconsistent. 1872 */ 1873 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 1874 1875 if (*config != NULL) { 1876 VERIFY(nvlist_add_uint64(*config, 1877 ZPOOL_CONFIG_ERRCOUNT, 1878 spa_get_errlog_size(spa)) == 0); 1879 1880 if (spa_suspended(spa)) 1881 VERIFY(nvlist_add_uint64(*config, 1882 ZPOOL_CONFIG_SUSPENDED, 1883 spa->spa_failmode) == 0); 1884 1885 spa_add_spares(spa, *config); 1886 spa_add_l2cache(spa, *config); 1887 } 1888 } 1889 1890 /* 1891 * We want to get the alternate root even for faulted pools, so we cheat 1892 * and call spa_lookup() directly. 1893 */ 1894 if (altroot) { 1895 if (spa == NULL) { 1896 mutex_enter(&spa_namespace_lock); 1897 spa = spa_lookup(name); 1898 if (spa) 1899 spa_altroot(spa, altroot, buflen); 1900 else 1901 altroot[0] = '\0'; 1902 spa = NULL; 1903 mutex_exit(&spa_namespace_lock); 1904 } else { 1905 spa_altroot(spa, altroot, buflen); 1906 } 1907 } 1908 1909 if (spa != NULL) { 1910 spa_config_exit(spa, SCL_CONFIG, FTAG); 1911 spa_close(spa, FTAG); 1912 } 1913 1914 return (error); 1915 } 1916 1917 /* 1918 * Validate that the auxiliary device array is well formed. We must have an 1919 * array of nvlists, each which describes a valid leaf vdev. 
If this is an 1920 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 1921 * specified, as long as they are well-formed. 1922 */ 1923 static int 1924 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 1925 spa_aux_vdev_t *sav, const char *config, uint64_t version, 1926 vdev_labeltype_t label) 1927 { 1928 nvlist_t **dev; 1929 uint_t i, ndev; 1930 vdev_t *vd; 1931 int error; 1932 1933 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1934 1935 /* 1936 * It's acceptable to have no devs specified. 1937 */ 1938 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 1939 return (0); 1940 1941 if (ndev == 0) 1942 return (EINVAL); 1943 1944 /* 1945 * Make sure the pool is formatted with a version that supports this 1946 * device type. 1947 */ 1948 if (spa_version(spa) < version) 1949 return (ENOTSUP); 1950 1951 /* 1952 * Set the pending device list so we correctly handle device in-use 1953 * checking. 1954 */ 1955 sav->sav_pending = dev; 1956 sav->sav_npending = ndev; 1957 1958 for (i = 0; i < ndev; i++) { 1959 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 1960 mode)) != 0) 1961 goto out; 1962 1963 if (!vd->vdev_ops->vdev_op_leaf) { 1964 vdev_free(vd); 1965 error = EINVAL; 1966 goto out; 1967 } 1968 1969 /* 1970 * The L2ARC currently only supports disk devices in 1971 * kernel context. For user-level testing, we allow it. 1972 */ 1973 #ifdef _KERNEL 1974 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 1975 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 1976 error = ENOTBLK; 1977 goto out; 1978 } 1979 #endif 1980 vd->vdev_top = vd; 1981 1982 if ((error = vdev_open(vd)) == 0 && 1983 (error = vdev_label_init(vd, crtxg, label)) == 0) { 1984 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 1985 vd->vdev_guid) == 0); 1986 } 1987 1988 vdev_free(vd); 1989 1990 if (error && 1991 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 1992 goto out; 1993 else 1994 error = 0; 1995 } 1996 1997 out: 1998 sav->sav_pending = NULL; 1999 sav->sav_npending = 0; 2000 return (error); 2001 } 2002 2003 static int 2004 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2005 { 2006 int error; 2007 2008 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2009 2010 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2011 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2012 VDEV_LABEL_SPARE)) != 0) { 2013 return (error); 2014 } 2015 2016 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2017 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2018 VDEV_LABEL_L2CACHE)); 2019 } 2020 2021 static void 2022 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2023 const char *config) 2024 { 2025 int i; 2026 2027 if (sav->sav_config != NULL) { 2028 nvlist_t **olddevs; 2029 uint_t oldndevs; 2030 nvlist_t **newdevs; 2031 2032 /* 2033 * Generate new dev list by concatentating with the 2034 * current dev list. 
2035 */ 2036 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2037 &olddevs, &oldndevs) == 0); 2038 2039 newdevs = kmem_alloc(sizeof (void *) * 2040 (ndevs + oldndevs), KM_SLEEP); 2041 for (i = 0; i < oldndevs; i++) 2042 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2043 KM_SLEEP) == 0); 2044 for (i = 0; i < ndevs; i++) 2045 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2046 KM_SLEEP) == 0); 2047 2048 VERIFY(nvlist_remove(sav->sav_config, config, 2049 DATA_TYPE_NVLIST_ARRAY) == 0); 2050 2051 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2052 config, newdevs, ndevs + oldndevs) == 0); 2053 for (i = 0; i < oldndevs + ndevs; i++) 2054 nvlist_free(newdevs[i]); 2055 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2056 } else { 2057 /* 2058 * Generate a new dev list. 2059 */ 2060 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2061 KM_SLEEP) == 0); 2062 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2063 devs, ndevs) == 0); 2064 } 2065 } 2066 2067 /* 2068 * Stop and drop level 2 ARC devices 2069 */ 2070 void 2071 spa_l2cache_drop(spa_t *spa) 2072 { 2073 vdev_t *vd; 2074 int i; 2075 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2076 2077 for (i = 0; i < sav->sav_count; i++) { 2078 uint64_t pool; 2079 2080 vd = sav->sav_vdevs[i]; 2081 ASSERT(vd != NULL); 2082 2083 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2084 pool != 0ULL && l2arc_vdev_present(vd)) 2085 l2arc_remove_vdev(vd); 2086 if (vd->vdev_isl2cache) 2087 spa_l2cache_remove(vd); 2088 vdev_clear_stats(vd); 2089 (void) vdev_close(vd); 2090 } 2091 } 2092 2093 /* 2094 * Pool Creation 2095 */ 2096 int 2097 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2098 const char *history_str, nvlist_t *zplprops) 2099 { 2100 spa_t *spa; 2101 char *altroot = NULL; 2102 vdev_t *rvd; 2103 dsl_pool_t *dp; 2104 dmu_tx_t *tx; 2105 int error = 0; 2106 uint64_t txg = TXG_INITIAL; 2107 nvlist_t **spares, **l2cache; 2108 uint_t nspares, nl2cache; 2109 uint64_t version; 2110 2111 /* 2112 * If this pool already exists, return failure. 2113 */ 2114 mutex_enter(&spa_namespace_lock); 2115 if (spa_lookup(pool) != NULL) { 2116 mutex_exit(&spa_namespace_lock); 2117 return (EEXIST); 2118 } 2119 2120 /* 2121 * Allocate a new spa_t structure. 2122 */ 2123 (void) nvlist_lookup_string(props, 2124 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2125 spa = spa_add(pool, altroot); 2126 spa_activate(spa, spa_mode_global); 2127 2128 spa->spa_uberblock.ub_txg = txg - 1; 2129 2130 if (props && (error = spa_prop_validate(spa, props))) { 2131 spa_deactivate(spa); 2132 spa_remove(spa); 2133 mutex_exit(&spa_namespace_lock); 2134 return (error); 2135 } 2136 2137 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2138 &version) != 0) 2139 version = SPA_VERSION; 2140 ASSERT(version <= SPA_VERSION); 2141 spa->spa_uberblock.ub_version = version; 2142 spa->spa_ubsync = spa->spa_uberblock; 2143 2144 /* 2145 * Create "The Godfather" zio to hold all async IOs 2146 */ 2147 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2148 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2149 2150 /* 2151 * Create the root vdev. 
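 *
 * For illustration only (a hypothetical caller-side sketch, not part of
 * this function): the nvroot handed to us is an nvlist tree whose root
 * is a "root" vdev and whose ZPOOL_CONFIG_CHILDREN array holds the
 * top-level vdevs.  A minimal single-disk pool might be described as:
 *
 *	nvlist_t *disk, *root;
 *
 *	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c0t0d0s0") == 0);
 *	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 *	    &disk, 1) == 0);
 *
 * (The device path above is illustrative.)  spa_config_parse() below
 * turns such a tree into the in-core vdev tree rooted at
 * spa->spa_root_vdev.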
2152 */ 2153 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2154 2155 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2156 2157 ASSERT(error != 0 || rvd != NULL); 2158 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2159 2160 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2161 error = EINVAL; 2162 2163 if (error == 0 && 2164 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2165 (error = spa_validate_aux(spa, nvroot, txg, 2166 VDEV_ALLOC_ADD)) == 0) { 2167 for (int c = 0; c < rvd->vdev_children; c++) { 2168 vdev_metaslab_set_size(rvd->vdev_child[c]); 2169 vdev_expand(rvd->vdev_child[c], txg); 2170 } 2171 } 2172 2173 spa_config_exit(spa, SCL_ALL, FTAG); 2174 2175 if (error != 0) { 2176 spa_unload(spa); 2177 spa_deactivate(spa); 2178 spa_remove(spa); 2179 mutex_exit(&spa_namespace_lock); 2180 return (error); 2181 } 2182 2183 /* 2184 * Get the list of spares, if specified. 2185 */ 2186 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2187 &spares, &nspares) == 0) { 2188 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2189 KM_SLEEP) == 0); 2190 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2191 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2192 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2193 spa_load_spares(spa); 2194 spa_config_exit(spa, SCL_ALL, FTAG); 2195 spa->spa_spares.sav_sync = B_TRUE; 2196 } 2197 2198 /* 2199 * Get the list of level 2 cache devices, if specified. 2200 */ 2201 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2202 &l2cache, &nl2cache) == 0) { 2203 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2204 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2205 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2206 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2207 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2208 spa_load_l2cache(spa); 2209 spa_config_exit(spa, SCL_ALL, FTAG); 2210 spa->spa_l2cache.sav_sync = B_TRUE; 2211 } 2212 2213 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2214 spa->spa_meta_objset = dp->dp_meta_objset; 2215 2216 tx = dmu_tx_create_assigned(dp, txg); 2217 2218 /* 2219 * Create the pool config object. 2220 */ 2221 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2222 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2223 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2224 2225 if (zap_add(spa->spa_meta_objset, 2226 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2227 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2228 cmn_err(CE_PANIC, "failed to add pool config"); 2229 } 2230 2231 /* Newly created pools with the right version are always deflated. */ 2232 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2233 spa->spa_deflate = TRUE; 2234 if (zap_add(spa->spa_meta_objset, 2235 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2236 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2237 cmn_err(CE_PANIC, "failed to add deflate"); 2238 } 2239 } 2240 2241 /* 2242 * Create the deferred-free bplist object. Turn off compression 2243 * because sync-to-convergence takes longer if the blocksize 2244 * keeps changing. 
2245 */ 2246 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2247 1 << 14, tx); 2248 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2249 ZIO_COMPRESS_OFF, tx); 2250 2251 if (zap_add(spa->spa_meta_objset, 2252 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2253 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2254 cmn_err(CE_PANIC, "failed to add bplist"); 2255 } 2256 2257 /* 2258 * Create the pool's history object. 2259 */ 2260 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2261 spa_history_create_obj(spa, tx); 2262 2263 /* 2264 * Set pool properties. 2265 */ 2266 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2267 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2268 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2269 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2270 if (props != NULL) { 2271 spa_configfile_set(spa, props, B_FALSE); 2272 spa_sync_props(spa, props, CRED(), tx); 2273 } 2274 2275 dmu_tx_commit(tx); 2276 2277 spa->spa_sync_on = B_TRUE; 2278 txg_sync_start(spa->spa_dsl_pool); 2279 2280 /* 2281 * We explicitly wait for the first transaction to complete so that our 2282 * bean counters are appropriately updated. 2283 */ 2284 txg_wait_synced(spa->spa_dsl_pool, txg); 2285 2286 spa_config_sync(spa, B_FALSE, B_TRUE); 2287 2288 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2289 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2290 spa_history_log_version(spa, LOG_POOL_CREATE); 2291 2292 spa->spa_minref = refcount_count(&spa->spa_refcount); 2293 2294 mutex_exit(&spa_namespace_lock); 2295 2296 return (0); 2297 } 2298 2299 #ifdef _KERNEL 2300 /* 2301 * Get the root pool information from the root disk, then import the root pool 2302 * during the system boot up time. 2303 */ 2304 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2305 2306 static nvlist_t * 2307 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 2308 { 2309 nvlist_t *config; 2310 nvlist_t *nvtop, *nvroot; 2311 uint64_t pgid; 2312 2313 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 2314 return (NULL); 2315 2316 /* 2317 * Add this top-level vdev to the child array. 2318 */ 2319 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2320 &nvtop) == 0); 2321 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 2322 &pgid) == 0); 2323 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 2324 2325 /* 2326 * Put this pool's top-level vdevs into a root vdev. 2327 */ 2328 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2329 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 2330 VDEV_TYPE_ROOT) == 0); 2331 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2332 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2333 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2334 &nvtop, 1) == 0); 2335 2336 /* 2337 * Replace the existing vdev_tree with the new root vdev in 2338 * this pool's configuration (remove the old, add the new). 2339 */ 2340 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2341 nvlist_free(nvroot); 2342 return (config); 2343 } 2344 2345 /* 2346 * Walk the vdev tree and see if we can find a device with "better" 2347 * configuration. A configuration is "better" if the label on that 2348 * device has a more recent txg. 
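 *
 * A small worked example (values are illustrative): if the leaves of a
 * mirrored root pool carry labels with txgs 95, 103 and 99, the walk
 * settles on the leaf whose label shows txg 103.  *txg acts as the
 * running maximum, so the caller seeds it with the txg of the device it
 * already has in hand, roughly:
 *
 *	avd = bvd;				(current best guess)
 *	spa_alt_rootvdev(rvd, &avd, &txg);	(txg from the boot label)
 *
 * which is how spa_import_rootpool() below uses it.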
2349 */ 2350 static void 2351 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 2352 { 2353 for (int c = 0; c < vd->vdev_children; c++) 2354 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 2355 2356 if (vd->vdev_ops->vdev_op_leaf) { 2357 nvlist_t *label; 2358 uint64_t label_txg; 2359 2360 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 2361 &label) != 0) 2362 return; 2363 2364 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 2365 &label_txg) == 0); 2366 2367 /* 2368 * Do we have a better boot device? 2369 */ 2370 if (label_txg > *txg) { 2371 *txg = label_txg; 2372 *avd = vd; 2373 } 2374 nvlist_free(label); 2375 } 2376 } 2377 2378 /* 2379 * Import a root pool. 2380 * 2381 * For x86. devpath_list will consist of devid and/or physpath name of 2382 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2383 * The GRUB "findroot" command will return the vdev we should boot. 2384 * 2385 * For Sparc, devpath_list consists the physpath name of the booting device 2386 * no matter the rootpool is a single device pool or a mirrored pool. 2387 * e.g. 2388 * "/pci@1f,0/ide@d/disk@0,0:a" 2389 */ 2390 int 2391 spa_import_rootpool(char *devpath, char *devid) 2392 { 2393 spa_t *spa; 2394 vdev_t *rvd, *bvd, *avd = NULL; 2395 nvlist_t *config, *nvtop; 2396 uint64_t guid, txg; 2397 char *pname; 2398 int error; 2399 2400 /* 2401 * Read the label from the boot device and generate a configuration. 2402 */ 2403 if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) { 2404 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 2405 devpath); 2406 return (EIO); 2407 } 2408 2409 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 2410 &pname) == 0); 2411 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2412 2413 mutex_enter(&spa_namespace_lock); 2414 if ((spa = spa_lookup(pname)) != NULL) { 2415 /* 2416 * Remove the existing root pool from the namespace so that we 2417 * can replace it with the correct config we just read in. 2418 */ 2419 spa_remove(spa); 2420 } 2421 2422 spa = spa_add(pname, NULL); 2423 spa->spa_is_root = B_TRUE; 2424 spa->spa_load_verbatim = B_TRUE; 2425 2426 /* 2427 * Build up a vdev tree based on the boot device's label config. 2428 */ 2429 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2430 &nvtop) == 0); 2431 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2432 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 2433 VDEV_ALLOC_ROOTPOOL); 2434 spa_config_exit(spa, SCL_ALL, FTAG); 2435 if (error) { 2436 mutex_exit(&spa_namespace_lock); 2437 nvlist_free(config); 2438 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 2439 pname); 2440 return (error); 2441 } 2442 2443 /* 2444 * Get the boot vdev. 2445 */ 2446 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2447 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 2448 (u_longlong_t)guid); 2449 error = ENOENT; 2450 goto out; 2451 } 2452 2453 /* 2454 * Determine if there is a better boot device. 2455 */ 2456 avd = bvd; 2457 spa_alt_rootvdev(rvd, &avd, &txg); 2458 if (avd != bvd) { 2459 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 2460 "try booting from '%s'", avd->vdev_path); 2461 error = EINVAL; 2462 goto out; 2463 } 2464 2465 /* 2466 * If the boot device is part of a spare vdev then ensure that 2467 * we're booting off the active spare. 2468 */ 2469 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2470 !bvd->vdev_isspare) { 2471 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 2472 "try booting from '%s'", 2473 bvd->vdev_parent->vdev_child[1]->vdev_path); 2474 error = EINVAL; 2475 goto out; 2476 } 2477 2478 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2479 error = 0; 2480 spa_history_log_version(spa, LOG_POOL_IMPORT); 2481 out: 2482 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2483 vdev_free(rvd); 2484 spa_config_exit(spa, SCL_ALL, FTAG); 2485 mutex_exit(&spa_namespace_lock); 2486 2487 nvlist_free(config); 2488 return (error); 2489 } 2490 2491 #endif 2492 2493 /* 2494 * Take a pool and insert it into the namespace as if it had been loaded at 2495 * boot. 2496 */ 2497 int 2498 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2499 { 2500 spa_t *spa; 2501 char *altroot = NULL; 2502 2503 mutex_enter(&spa_namespace_lock); 2504 if (spa_lookup(pool) != NULL) { 2505 mutex_exit(&spa_namespace_lock); 2506 return (EEXIST); 2507 } 2508 2509 (void) nvlist_lookup_string(props, 2510 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2511 spa = spa_add(pool, altroot); 2512 2513 spa->spa_load_verbatim = B_TRUE; 2514 2515 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2516 2517 if (props != NULL) 2518 spa_configfile_set(spa, props, B_FALSE); 2519 2520 spa_config_sync(spa, B_FALSE, B_TRUE); 2521 2522 mutex_exit(&spa_namespace_lock); 2523 spa_history_log_version(spa, LOG_POOL_IMPORT); 2524 2525 return (0); 2526 } 2527 2528 /* 2529 * Import a non-root pool into the system. 2530 */ 2531 int 2532 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2533 { 2534 spa_t *spa; 2535 char *altroot = NULL; 2536 int error; 2537 nvlist_t *nvroot; 2538 nvlist_t **spares, **l2cache; 2539 uint_t nspares, nl2cache; 2540 2541 /* 2542 * If a pool with this name exists, return failure. 2543 */ 2544 mutex_enter(&spa_namespace_lock); 2545 if ((spa = spa_lookup(pool)) != NULL) { 2546 mutex_exit(&spa_namespace_lock); 2547 return (EEXIST); 2548 } 2549 2550 /* 2551 * Create and initialize the spa structure. 2552 */ 2553 (void) nvlist_lookup_string(props, 2554 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2555 spa = spa_add(pool, altroot); 2556 spa_activate(spa, spa_mode_global); 2557 2558 /* 2559 * Don't start async tasks until we know everything is healthy. 2560 */ 2561 spa_async_suspend(spa); 2562 2563 /* 2564 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2565 * because the user-supplied config is actually the one to trust when 2566 * doing an import. 2567 */ 2568 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 2569 2570 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2571 /* 2572 * Toss any existing sparelist, as it doesn't have any validity 2573 * anymore, and conflicts with spa_has_spare(). 
2574 */ 2575 if (spa->spa_spares.sav_config) { 2576 nvlist_free(spa->spa_spares.sav_config); 2577 spa->spa_spares.sav_config = NULL; 2578 spa_load_spares(spa); 2579 } 2580 if (spa->spa_l2cache.sav_config) { 2581 nvlist_free(spa->spa_l2cache.sav_config); 2582 spa->spa_l2cache.sav_config = NULL; 2583 spa_load_l2cache(spa); 2584 } 2585 2586 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2587 &nvroot) == 0); 2588 if (error == 0) 2589 error = spa_validate_aux(spa, nvroot, -1ULL, 2590 VDEV_ALLOC_SPARE); 2591 if (error == 0) 2592 error = spa_validate_aux(spa, nvroot, -1ULL, 2593 VDEV_ALLOC_L2CACHE); 2594 spa_config_exit(spa, SCL_ALL, FTAG); 2595 2596 if (props != NULL) 2597 spa_configfile_set(spa, props, B_FALSE); 2598 2599 if (error != 0 || (props && spa_writeable(spa) && 2600 (error = spa_prop_set(spa, props)))) { 2601 spa_unload(spa); 2602 spa_deactivate(spa); 2603 spa_remove(spa); 2604 mutex_exit(&spa_namespace_lock); 2605 return (error); 2606 } 2607 2608 spa_async_resume(spa); 2609 2610 /* 2611 * Override any spares and level 2 cache devices as specified by 2612 * the user, as these may have correct device names/devids, etc. 2613 */ 2614 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2615 &spares, &nspares) == 0) { 2616 if (spa->spa_spares.sav_config) 2617 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2618 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2619 else 2620 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2621 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2622 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2623 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2624 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2625 spa_load_spares(spa); 2626 spa_config_exit(spa, SCL_ALL, FTAG); 2627 spa->spa_spares.sav_sync = B_TRUE; 2628 } 2629 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2630 &l2cache, &nl2cache) == 0) { 2631 if (spa->spa_l2cache.sav_config) 2632 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2633 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2634 else 2635 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2636 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2637 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2638 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2639 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2640 spa_load_l2cache(spa); 2641 spa_config_exit(spa, SCL_ALL, FTAG); 2642 spa->spa_l2cache.sav_sync = B_TRUE; 2643 } 2644 2645 if (spa_writeable(spa)) { 2646 /* 2647 * Update the config cache to include the newly-imported pool. 2648 */ 2649 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2650 } 2651 2652 /* 2653 * It's possible that the pool was expanded while it was exported. 2654 * We kick off an async task to handle this for us. 2655 */ 2656 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 2657 2658 mutex_exit(&spa_namespace_lock); 2659 spa_history_log_version(spa, LOG_POOL_IMPORT); 2660 2661 return (0); 2662 } 2663 2664 2665 /* 2666 * This (illegal) pool name is used when temporarily importing a spa_t in order 2667 * to get the vdev stats associated with the imported devices. 
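 *
 * Hypothetical usage sketch (for illustration only; error handling
 * omitted): spa_tryimport() never leaves a pool imported, it just
 * returns a config describing what an import would produce, and the
 * caller owns the returned nvlist:
 *
 *	nvlist_t *newconfig;
 *
 *	if ((newconfig = spa_tryimport(tryconfig)) != NULL) {
 *		... inspect pool name, state, vdev stats, bootfs ...
 *		nvlist_free(newconfig);
 *	}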
2668 */ 2669 #define TRYIMPORT_NAME "$import" 2670 2671 nvlist_t * 2672 spa_tryimport(nvlist_t *tryconfig) 2673 { 2674 nvlist_t *config = NULL; 2675 char *poolname; 2676 spa_t *spa; 2677 uint64_t state; 2678 int error; 2679 2680 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2681 return (NULL); 2682 2683 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2684 return (NULL); 2685 2686 /* 2687 * Create and initialize the spa structure. 2688 */ 2689 mutex_enter(&spa_namespace_lock); 2690 spa = spa_add(TRYIMPORT_NAME, NULL); 2691 spa_activate(spa, FREAD); 2692 2693 /* 2694 * Pass off the heavy lifting to spa_load(). 2695 * Pass TRUE for mosconfig because the user-supplied config 2696 * is actually the one to trust when doing an import. 2697 */ 2698 error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2699 2700 /* 2701 * If 'tryconfig' was at least parsable, return the current config. 2702 */ 2703 if (spa->spa_root_vdev != NULL) { 2704 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2705 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2706 poolname) == 0); 2707 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2708 state) == 0); 2709 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2710 spa->spa_uberblock.ub_timestamp) == 0); 2711 2712 /* 2713 * If the bootfs property exists on this pool then we 2714 * copy it out so that external consumers can tell which 2715 * pools are bootable. 2716 */ 2717 if ((!error || error == EEXIST) && spa->spa_bootfs) { 2718 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2719 2720 /* 2721 * We have to play games with the name since the 2722 * pool was opened as TRYIMPORT_NAME. 2723 */ 2724 if (dsl_dsobj_to_dsname(spa_name(spa), 2725 spa->spa_bootfs, tmpname) == 0) { 2726 char *cp; 2727 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2728 2729 cp = strchr(tmpname, '/'); 2730 if (cp == NULL) { 2731 (void) strlcpy(dsname, tmpname, 2732 MAXPATHLEN); 2733 } else { 2734 (void) snprintf(dsname, MAXPATHLEN, 2735 "%s/%s", poolname, ++cp); 2736 } 2737 VERIFY(nvlist_add_string(config, 2738 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2739 kmem_free(dsname, MAXPATHLEN); 2740 } 2741 kmem_free(tmpname, MAXPATHLEN); 2742 } 2743 2744 /* 2745 * Add the list of hot spares and level 2 cache devices. 2746 */ 2747 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2748 spa_add_spares(spa, config); 2749 spa_add_l2cache(spa, config); 2750 spa_config_exit(spa, SCL_CONFIG, FTAG); 2751 } 2752 2753 spa_unload(spa); 2754 spa_deactivate(spa); 2755 spa_remove(spa); 2756 mutex_exit(&spa_namespace_lock); 2757 2758 return (config); 2759 } 2760 2761 /* 2762 * Pool export/destroy 2763 * 2764 * The act of destroying or exporting a pool is very simple. We make sure there 2765 * is no more pending I/O and any references to the pool are gone. Then, we 2766 * update the pool state and sync all the labels to disk, removing the 2767 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2768 * we don't sync the labels or remove the configuration cache. 
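 *
 * Both operations funnel into spa_export_common() below; a rough sketch
 * of the public entry points (arguments are illustrative only):
 *
 *	error = spa_destroy(poolname);
 *	error = spa_export(poolname, &oldconfig, B_FALSE, B_FALSE);
 *
 * Callers should expect EBUSY if active references to the pool remain
 * and EXDEV if an export would steal an active shared spare; passing
 * force == B_TRUE overrides the shared-spare check.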
2769 */ 2770 static int 2771 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2772 boolean_t force, boolean_t hardforce) 2773 { 2774 spa_t *spa; 2775 2776 if (oldconfig) 2777 *oldconfig = NULL; 2778 2779 if (!(spa_mode_global & FWRITE)) 2780 return (EROFS); 2781 2782 mutex_enter(&spa_namespace_lock); 2783 if ((spa = spa_lookup(pool)) == NULL) { 2784 mutex_exit(&spa_namespace_lock); 2785 return (ENOENT); 2786 } 2787 2788 /* 2789 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2790 * reacquire the namespace lock, and see if we can export. 2791 */ 2792 spa_open_ref(spa, FTAG); 2793 mutex_exit(&spa_namespace_lock); 2794 spa_async_suspend(spa); 2795 mutex_enter(&spa_namespace_lock); 2796 spa_close(spa, FTAG); 2797 2798 /* 2799 * The pool will be in core if it's openable, 2800 * in which case we can modify its state. 2801 */ 2802 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2803 /* 2804 * Objsets may be open only because they're dirty, so we 2805 * have to force it to sync before checking spa_refcnt. 2806 */ 2807 txg_wait_synced(spa->spa_dsl_pool, 0); 2808 2809 /* 2810 * A pool cannot be exported or destroyed if there are active 2811 * references. If we are resetting a pool, allow references by 2812 * fault injection handlers. 2813 */ 2814 if (!spa_refcount_zero(spa) || 2815 (spa->spa_inject_ref != 0 && 2816 new_state != POOL_STATE_UNINITIALIZED)) { 2817 spa_async_resume(spa); 2818 mutex_exit(&spa_namespace_lock); 2819 return (EBUSY); 2820 } 2821 2822 /* 2823 * A pool cannot be exported if it has an active shared spare. 2824 * This is to prevent other pools stealing the active spare 2825 * from an exported pool. At user's own will, such pool can 2826 * be forcedly exported. 2827 */ 2828 if (!force && new_state == POOL_STATE_EXPORTED && 2829 spa_has_active_shared_spare(spa)) { 2830 spa_async_resume(spa); 2831 mutex_exit(&spa_namespace_lock); 2832 return (EXDEV); 2833 } 2834 2835 /* 2836 * We want this to be reflected on every label, 2837 * so mark them all dirty. spa_unload() will do the 2838 * final sync that pushes these changes out. 2839 */ 2840 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 2841 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2842 spa->spa_state = new_state; 2843 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2844 vdev_config_dirty(spa->spa_root_vdev); 2845 spa_config_exit(spa, SCL_ALL, FTAG); 2846 } 2847 } 2848 2849 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2850 2851 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2852 spa_unload(spa); 2853 spa_deactivate(spa); 2854 } 2855 2856 if (oldconfig && spa->spa_config) 2857 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2858 2859 if (new_state != POOL_STATE_UNINITIALIZED) { 2860 if (!hardforce) 2861 spa_config_sync(spa, B_TRUE, B_TRUE); 2862 spa_remove(spa); 2863 } 2864 mutex_exit(&spa_namespace_lock); 2865 2866 return (0); 2867 } 2868 2869 /* 2870 * Destroy a storage pool. 2871 */ 2872 int 2873 spa_destroy(char *pool) 2874 { 2875 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 2876 B_FALSE, B_FALSE)); 2877 } 2878 2879 /* 2880 * Export a storage pool. 2881 */ 2882 int 2883 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 2884 boolean_t hardforce) 2885 { 2886 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 2887 force, hardforce)); 2888 } 2889 2890 /* 2891 * Similar to spa_export(), this unloads the spa_t without actually removing it 2892 * from the namespace in any way. 
2893 */ 2894 int 2895 spa_reset(char *pool) 2896 { 2897 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2898 B_FALSE, B_FALSE)); 2899 } 2900 2901 /* 2902 * ========================================================================== 2903 * Device manipulation 2904 * ========================================================================== 2905 */ 2906 2907 /* 2908 * Add a device to a storage pool. 2909 */ 2910 int 2911 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2912 { 2913 uint64_t txg, id; 2914 int error; 2915 vdev_t *rvd = spa->spa_root_vdev; 2916 vdev_t *vd, *tvd; 2917 nvlist_t **spares, **l2cache; 2918 uint_t nspares, nl2cache; 2919 2920 txg = spa_vdev_enter(spa); 2921 2922 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2923 VDEV_ALLOC_ADD)) != 0) 2924 return (spa_vdev_exit(spa, NULL, txg, error)); 2925 2926 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2927 2928 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2929 &nspares) != 0) 2930 nspares = 0; 2931 2932 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2933 &nl2cache) != 0) 2934 nl2cache = 0; 2935 2936 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2937 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2938 2939 if (vd->vdev_children != 0 && 2940 (error = vdev_create(vd, txg, B_FALSE)) != 0) 2941 return (spa_vdev_exit(spa, vd, txg, error)); 2942 2943 /* 2944 * We must validate the spares and l2cache devices after checking the 2945 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2946 */ 2947 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2948 return (spa_vdev_exit(spa, vd, txg, error)); 2949 2950 /* 2951 * Transfer each new top-level vdev from vd to rvd. 2952 */ 2953 for (int c = 0; c < vd->vdev_children; c++) { 2954 2955 /* 2956 * Set the vdev id to the first hole, if one exists. 2957 */ 2958 for (id = 0; id < rvd->vdev_children; id++) { 2959 if (rvd->vdev_child[id]->vdev_ishole) { 2960 vdev_free(rvd->vdev_child[id]); 2961 break; 2962 } 2963 } 2964 tvd = vd->vdev_child[c]; 2965 vdev_remove_child(vd, tvd); 2966 tvd->vdev_id = id; 2967 vdev_add_child(rvd, tvd); 2968 vdev_config_dirty(tvd); 2969 } 2970 2971 if (nspares != 0) { 2972 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2973 ZPOOL_CONFIG_SPARES); 2974 spa_load_spares(spa); 2975 spa->spa_spares.sav_sync = B_TRUE; 2976 } 2977 2978 if (nl2cache != 0) { 2979 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2980 ZPOOL_CONFIG_L2CACHE); 2981 spa_load_l2cache(spa); 2982 spa->spa_l2cache.sav_sync = B_TRUE; 2983 } 2984 2985 /* 2986 * We have to be careful when adding new vdevs to an existing pool. 2987 * If other threads start allocating from these vdevs before we 2988 * sync the config cache, and we lose power, then upon reboot we may 2989 * fail to open the pool because there are DVAs that the config cache 2990 * can't translate. Therefore, we first add the vdevs without 2991 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2992 * and then let spa_config_update() initialize the new metaslabs. 2993 * 2994 * spa_load() checks for added-but-not-initialized vdevs, so that 2995 * if we lose power at any point in this sequence, the remaining 2996 * steps will be completed the next time we load the pool. 
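	 *
	 * Concretely (illustrative scenario): if a block were allocated
	 * with a DVA on the new top-level vdev, say vdev id 3, while the
	 * cached config still described only vdevs 0 through 2, an open
	 * driven from that cache could not translate the DVA.  Syncing
	 * the config first and initializing metaslabs second closes that
	 * window.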
2997 */ 2998 (void) spa_vdev_exit(spa, vd, txg, 0); 2999 3000 mutex_enter(&spa_namespace_lock); 3001 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3002 mutex_exit(&spa_namespace_lock); 3003 3004 return (0); 3005 } 3006 3007 /* 3008 * Attach a device to a mirror. The arguments are the path to any device 3009 * in the mirror, and the nvroot for the new device. If the path specifies 3010 * a device that is not mirrored, we automatically insert the mirror vdev. 3011 * 3012 * If 'replacing' is specified, the new device is intended to replace the 3013 * existing device; in this case the two devices are made into their own 3014 * mirror using the 'replacing' vdev, which is functionally identical to 3015 * the mirror vdev (it actually reuses all the same ops) but has a few 3016 * extra rules: you can't attach to it after it's been created, and upon 3017 * completion of resilvering, the first disk (the one being replaced) 3018 * is automatically detached. 3019 */ 3020 int 3021 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3022 { 3023 uint64_t txg, open_txg; 3024 vdev_t *rvd = spa->spa_root_vdev; 3025 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3026 vdev_ops_t *pvops; 3027 char *oldvdpath, *newvdpath; 3028 int newvd_isspare; 3029 int error; 3030 3031 txg = spa_vdev_enter(spa); 3032 3033 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3034 3035 if (oldvd == NULL) 3036 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3037 3038 if (!oldvd->vdev_ops->vdev_op_leaf) 3039 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3040 3041 pvd = oldvd->vdev_parent; 3042 3043 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3044 VDEV_ALLOC_ADD)) != 0) 3045 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3046 3047 if (newrootvd->vdev_children != 1) 3048 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3049 3050 newvd = newrootvd->vdev_child[0]; 3051 3052 if (!newvd->vdev_ops->vdev_op_leaf) 3053 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3054 3055 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3056 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3057 3058 /* 3059 * Spares can't replace logs 3060 */ 3061 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3062 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3063 3064 if (!replacing) { 3065 /* 3066 * For attach, the only allowable parent is a mirror or the root 3067 * vdev. 3068 */ 3069 if (pvd->vdev_ops != &vdev_mirror_ops && 3070 pvd->vdev_ops != &vdev_root_ops) 3071 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3072 3073 pvops = &vdev_mirror_ops; 3074 } else { 3075 /* 3076 * Active hot spares can only be replaced by inactive hot 3077 * spares. 3078 */ 3079 if (pvd->vdev_ops == &vdev_spare_ops && 3080 pvd->vdev_child[1] == oldvd && 3081 !spa_has_spare(spa, newvd->vdev_guid)) 3082 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3083 3084 /* 3085 * If the source is a hot spare, and the parent isn't already a 3086 * spare, then we want to create a new hot spare. Otherwise, we 3087 * want to create a replacing vdev. The user is not allowed to 3088 * attach to a spared vdev child unless the 'isspare' state is 3089 * the same (spare replaces spare, non-spare replaces 3090 * non-spare). 
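		 *
		 * As a quick reference, the cases handled just below
		 * reduce to the following (summarized from the code, not
		 * an independent rule):
		 *
		 *	parent vdev	new device		outcome
		 *	-----------	----------		-------
		 *	replacing	(any)			ENOTSUP
		 *	spare		isspare mismatch	ENOTSUP
		 *	not a spare	hot spare		insert spare vdev
		 *	all other cases	(any)			insert replacing vdev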
3091 */ 3092 if (pvd->vdev_ops == &vdev_replacing_ops) 3093 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3094 else if (pvd->vdev_ops == &vdev_spare_ops && 3095 newvd->vdev_isspare != oldvd->vdev_isspare) 3096 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3097 else if (pvd->vdev_ops != &vdev_spare_ops && 3098 newvd->vdev_isspare) 3099 pvops = &vdev_spare_ops; 3100 else 3101 pvops = &vdev_replacing_ops; 3102 } 3103 3104 /* 3105 * Make sure the new device is big enough. 3106 */ 3107 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3108 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3109 3110 /* 3111 * The new device cannot have a higher alignment requirement 3112 * than the top-level vdev. 3113 */ 3114 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3115 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3116 3117 /* 3118 * If this is an in-place replacement, update oldvd's path and devid 3119 * to make it distinguishable from newvd, and unopenable from now on. 3120 */ 3121 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3122 spa_strfree(oldvd->vdev_path); 3123 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3124 KM_SLEEP); 3125 (void) sprintf(oldvd->vdev_path, "%s/%s", 3126 newvd->vdev_path, "old"); 3127 if (oldvd->vdev_devid != NULL) { 3128 spa_strfree(oldvd->vdev_devid); 3129 oldvd->vdev_devid = NULL; 3130 } 3131 } 3132 3133 /* 3134 * If the parent is not a mirror, or if we're replacing, insert the new 3135 * mirror/replacing/spare vdev above oldvd. 3136 */ 3137 if (pvd->vdev_ops != pvops) 3138 pvd = vdev_add_parent(oldvd, pvops); 3139 3140 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3141 ASSERT(pvd->vdev_ops == pvops); 3142 ASSERT(oldvd->vdev_parent == pvd); 3143 3144 /* 3145 * Extract the new device from its root and add it to pvd. 3146 */ 3147 vdev_remove_child(newrootvd, newvd); 3148 newvd->vdev_id = pvd->vdev_children; 3149 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3150 vdev_add_child(pvd, newvd); 3151 3152 tvd = newvd->vdev_top; 3153 ASSERT(pvd->vdev_top == tvd); 3154 ASSERT(tvd->vdev_parent == rvd); 3155 3156 vdev_config_dirty(tvd); 3157 3158 /* 3159 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3160 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3161 */ 3162 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3163 3164 vdev_dtl_dirty(newvd, DTL_MISSING, 3165 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3166 3167 if (newvd->vdev_isspare) { 3168 spa_spare_activate(newvd); 3169 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3170 } 3171 3172 oldvdpath = spa_strdup(oldvd->vdev_path); 3173 newvdpath = spa_strdup(newvd->vdev_path); 3174 newvd_isspare = newvd->vdev_isspare; 3175 3176 /* 3177 * Mark newvd's DTL dirty in this txg. 3178 */ 3179 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3180 3181 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3182 3183 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3184 CRED(), "%s vdev=%s %s vdev=%s", 3185 replacing && newvd_isspare ? "spare in" : 3186 replacing ? "replace" : "attach", newvdpath, 3187 replacing ? "for" : "to", oldvdpath); 3188 3189 spa_strfree(oldvdpath); 3190 spa_strfree(newvdpath); 3191 3192 /* 3193 * Kick off a resilver to update newvd. 3194 */ 3195 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3196 3197 return (0); 3198 } 3199 3200 /* 3201 * Detach a device from a mirror or replacing vdev. 3202 * If 'replace_done' is specified, only detach if the parent 3203 * is a replacing vdev. 
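 *
 * Hypothetical usage sketch (guids are illustrative):
 *
 *	error = spa_vdev_detach(spa, child_guid, parent_guid, B_FALSE);
 *
 * Passing pguid == 0 skips the parent-identity check described below;
 * spa_vdev_resilver_done() passes replace_done == B_TRUE so that only
 * finished replacements and spares are torn down.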
3204 */ 3205 int 3206 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3207 { 3208 uint64_t txg; 3209 int error; 3210 vdev_t *rvd = spa->spa_root_vdev; 3211 vdev_t *vd, *pvd, *cvd, *tvd; 3212 boolean_t unspare = B_FALSE; 3213 uint64_t unspare_guid; 3214 size_t len; 3215 3216 txg = spa_vdev_enter(spa); 3217 3218 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3219 3220 if (vd == NULL) 3221 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3222 3223 if (!vd->vdev_ops->vdev_op_leaf) 3224 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3225 3226 pvd = vd->vdev_parent; 3227 3228 /* 3229 * If the parent/child relationship is not as expected, don't do it. 3230 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3231 * vdev that's replacing B with C. The user's intent in replacing 3232 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3233 * the replace by detaching C, the expected behavior is to end up 3234 * M(A,B). But suppose that right after deciding to detach C, 3235 * the replacement of B completes. We would have M(A,C), and then 3236 * ask to detach C, which would leave us with just A -- not what 3237 * the user wanted. To prevent this, we make sure that the 3238 * parent/child relationship hasn't changed -- in this example, 3239 * that C's parent is still the replacing vdev R. 3240 */ 3241 if (pvd->vdev_guid != pguid && pguid != 0) 3242 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3243 3244 /* 3245 * If replace_done is specified, only remove this device if it's 3246 * the first child of a replacing vdev. For the 'spare' vdev, either 3247 * disk can be removed. 3248 */ 3249 if (replace_done) { 3250 if (pvd->vdev_ops == &vdev_replacing_ops) { 3251 if (vd->vdev_id != 0) 3252 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3253 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3254 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3255 } 3256 } 3257 3258 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3259 spa_version(spa) >= SPA_VERSION_SPARES); 3260 3261 /* 3262 * Only mirror, replacing, and spare vdevs support detach. 3263 */ 3264 if (pvd->vdev_ops != &vdev_replacing_ops && 3265 pvd->vdev_ops != &vdev_mirror_ops && 3266 pvd->vdev_ops != &vdev_spare_ops) 3267 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3268 3269 /* 3270 * If this device has the only valid copy of some data, 3271 * we cannot safely detach it. 3272 */ 3273 if (vdev_dtl_required(vd)) 3274 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3275 3276 ASSERT(pvd->vdev_children >= 2); 3277 3278 /* 3279 * If we are detaching the second disk from a replacing vdev, then 3280 * check to see if we changed the original vdev's path to have "/old" 3281 * at the end in spa_vdev_attach(). If so, undo that change now. 3282 */ 3283 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3284 pvd->vdev_child[0]->vdev_path != NULL && 3285 pvd->vdev_child[1]->vdev_path != NULL) { 3286 ASSERT(pvd->vdev_child[1] == vd); 3287 cvd = pvd->vdev_child[0]; 3288 len = strlen(vd->vdev_path); 3289 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3290 strcmp(cvd->vdev_path + len, "/old") == 0) { 3291 spa_strfree(cvd->vdev_path); 3292 cvd->vdev_path = spa_strdup(vd->vdev_path); 3293 } 3294 } 3295 3296 /* 3297 * If we are detaching the original disk from a spare, then it implies 3298 * that the spare should become a real disk, and be removed from the 3299 * active spare list for the pool. 
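	 *
	 * In the notation of the M(A,R(B,C)) example above: detaching A
	 * from a spare vdev S(A,B), where B is the hot spare that took
	 * over for A, should leave B behind as an ordinary disk, so B
	 * must also come off the pool's active spare list; that is what
	 * the 'unspare' path below arranges.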
3300 */ 3301 if (pvd->vdev_ops == &vdev_spare_ops && 3302 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3303 unspare = B_TRUE; 3304 3305 /* 3306 * Erase the disk labels so the disk can be used for other things. 3307 * This must be done after all other error cases are handled, 3308 * but before we disembowel vd (so we can still do I/O to it). 3309 * But if we can't do it, don't treat the error as fatal -- 3310 * it may be that the unwritability of the disk is the reason 3311 * it's being detached! 3312 */ 3313 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3314 3315 /* 3316 * Remove vd from its parent and compact the parent's children. 3317 */ 3318 vdev_remove_child(pvd, vd); 3319 vdev_compact_children(pvd); 3320 3321 /* 3322 * Remember one of the remaining children so we can get tvd below. 3323 */ 3324 cvd = pvd->vdev_child[0]; 3325 3326 /* 3327 * If we need to remove the remaining child from the list of hot spares, 3328 * do it now, marking the vdev as no longer a spare in the process. 3329 * We must do this before vdev_remove_parent(), because that can 3330 * change the GUID if it creates a new toplevel GUID. For a similar 3331 * reason, we must remove the spare now, in the same txg as the detach; 3332 * otherwise someone could attach a new sibling, change the GUID, and 3333 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3334 */ 3335 if (unspare) { 3336 ASSERT(cvd->vdev_isspare); 3337 spa_spare_remove(cvd); 3338 unspare_guid = cvd->vdev_guid; 3339 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3340 } 3341 3342 /* 3343 * If the parent mirror/replacing vdev only has one child, 3344 * the parent is no longer needed. Remove it from the tree. 3345 */ 3346 if (pvd->vdev_children == 1) 3347 vdev_remove_parent(cvd); 3348 3349 /* 3350 * We don't set tvd until now because the parent we just removed 3351 * may have been the previous top-level vdev. 3352 */ 3353 tvd = cvd->vdev_top; 3354 ASSERT(tvd->vdev_parent == rvd); 3355 3356 /* 3357 * Reevaluate the parent vdev state. 3358 */ 3359 vdev_propagate_state(cvd); 3360 3361 /* 3362 * If the 'autoexpand' property is set on the pool then automatically 3363 * try to expand the size of the pool. For example if the device we 3364 * just detached was smaller than the others, it may be possible to 3365 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3366 * first so that we can obtain the updated sizes of the leaf vdevs. 3367 */ 3368 if (spa->spa_autoexpand) { 3369 vdev_reopen(tvd); 3370 vdev_expand(tvd, txg); 3371 } 3372 3373 vdev_config_dirty(tvd); 3374 3375 /* 3376 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3377 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3378 * But first make sure we're not on any *other* txg's DTL list, to 3379 * prevent vd from being accessed after it's freed. 3380 */ 3381 for (int t = 0; t < TXG_SIZE; t++) 3382 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3383 vd->vdev_detached = B_TRUE; 3384 vdev_dirty(tvd, VDD_DTL, vd, txg); 3385 3386 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3387 3388 error = spa_vdev_exit(spa, vd, txg, 0); 3389 3390 /* 3391 * If this was the removal of the original device in a hot spare vdev, 3392 * then we want to go through and remove the device from the hot spare 3393 * list of every other pool. 
3394 */ 3395 if (unspare) { 3396 spa_t *myspa = spa; 3397 spa = NULL; 3398 mutex_enter(&spa_namespace_lock); 3399 while ((spa = spa_next(spa)) != NULL) { 3400 if (spa->spa_state != POOL_STATE_ACTIVE) 3401 continue; 3402 if (spa == myspa) 3403 continue; 3404 spa_open_ref(spa, FTAG); 3405 mutex_exit(&spa_namespace_lock); 3406 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3407 mutex_enter(&spa_namespace_lock); 3408 spa_close(spa, FTAG); 3409 } 3410 mutex_exit(&spa_namespace_lock); 3411 } 3412 3413 return (error); 3414 } 3415 3416 static nvlist_t * 3417 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3418 { 3419 for (int i = 0; i < count; i++) { 3420 uint64_t guid; 3421 3422 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3423 &guid) == 0); 3424 3425 if (guid == target_guid) 3426 return (nvpp[i]); 3427 } 3428 3429 return (NULL); 3430 } 3431 3432 static void 3433 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3434 nvlist_t *dev_to_remove) 3435 { 3436 nvlist_t **newdev = NULL; 3437 3438 if (count > 1) 3439 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3440 3441 for (int i = 0, j = 0; i < count; i++) { 3442 if (dev[i] == dev_to_remove) 3443 continue; 3444 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3445 } 3446 3447 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3448 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3449 3450 for (int i = 0; i < count - 1; i++) 3451 nvlist_free(newdev[i]); 3452 3453 if (count > 1) 3454 kmem_free(newdev, (count - 1) * sizeof (void *)); 3455 } 3456 3457 /* 3458 * Removing a device from the vdev namespace requires several steps 3459 * and can take a significant amount of time. As a result we use 3460 * the spa_vdev_config_[enter/exit] functions which allow us to 3461 * grab and release the spa_config_lock while still holding the namespace 3462 * lock. During each step the configuration is synced out. 3463 */ 3464 3465 /* 3466 * Initial phase of device removal - stop future allocations from this device. 3467 */ 3468 void 3469 spa_vdev_remove_start(spa_t *spa, vdev_t *vd) 3470 { 3471 metaslab_group_t *mg = vd->vdev_mg; 3472 3473 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3474 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3475 3476 /* 3477 * Remove our vdev from the allocatable vdevs 3478 */ 3479 if (mg) 3480 metaslab_class_remove(mg->mg_class, mg); 3481 } 3482 3483 /* 3484 * Evacuate the device. 3485 */ 3486 int 3487 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 3488 { 3489 uint64_t txg; 3490 int error; 3491 3492 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3493 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3494 3495 /* 3496 * Evacuate the device. We don't hold the config lock as writer 3497 * since we need to do I/O but we do keep the 3498 * spa_namespace_lock held. Once this completes the device 3499 * should no longer have any blocks allocated on it. 3500 */ 3501 if (vd->vdev_islog) { 3502 /* 3503 * Evacuate the device. 3504 */ 3505 if (error = dmu_objset_find(spa_name(spa), 3506 zil_vdev_offline, NULL, DS_FIND_CHILDREN)) { 3507 uint64_t txg; 3508 3509 txg = spa_vdev_config_enter(spa); 3510 metaslab_class_add(spa->spa_log_class, 3511 vd->vdev_mg); 3512 return (spa_vdev_exit(spa, NULL, txg, error)); 3513 } 3514 txg_wait_synced(spa_get_dsl(spa), 0); 3515 } 3516 3517 /* 3518 * Remove any remaining MOS metadata associated with the device. 
3519 	 */
3520 	txg = spa_vdev_config_enter(spa);
3521 	vd->vdev_removing = B_TRUE;
3522 	vdev_dirty(vd, 0, NULL, txg);
3523 	vdev_config_dirty(vd);
3524 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
3525 
3526 	return (0);
3527 }
3528 
3529 /*
3530  * Complete the removal by cleaning up the namespace.
3531  */
3532 void
3533 spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
3534 {
3535 	vdev_t *rvd = spa->spa_root_vdev;
3536 	metaslab_group_t *mg = vd->vdev_mg;
3537 	uint64_t id = vd->vdev_id;
3538 	boolean_t last_vdev = (id == (rvd->vdev_children - 1));
3539 
3540 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3541 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3542 
3543 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
3544 	vdev_free(vd);
3545 
3546 	/*
3547 	 * It's possible that another thread is trying to do a spa_vdev_add()
3548 	 * at the same time we're trying to remove it. As a result the
3549 	 * added vdev may not have initialized its metaslabs yet.
3550 	 */
3551 	if (mg != NULL)
3552 		metaslab_group_destroy(mg);
3553 
3554 	if (last_vdev) {
3555 		vdev_compact_children(rvd);
3556 	} else {
3557 		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
3558 		vdev_add_child(rvd, vd);
3559 	}
3560 	vdev_config_dirty(rvd);
3561 
3562 	/*
3563 	 * Reassess the health of our root vdev.
3564 	 */
3565 	vdev_reopen(rvd);
3566 }
3567 
3568 /*
3569  * Remove a device from the pool. Currently, this supports removing only hot
3570  * spares, slogs, and level 2 ARC devices.
3571  */
3572 int
3573 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
3574 {
3575 	vdev_t *vd;
3576 	nvlist_t **spares, **l2cache, *nv;
3577 	uint64_t txg = 0;
3578 	uint_t nspares, nl2cache;
3579 	int error = 0;
3580 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
3581 
3582 	if (!locked)
3583 		txg = spa_vdev_enter(spa);
3584 
3585 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3586 
3587 	if (spa->spa_spares.sav_vdevs != NULL &&
3588 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3589 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
3590 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
3591 		/*
3592 		 * Only remove the hot spare if it's not currently in use
3593 		 * in this pool.
3594 		 */
3595 		if (vd == NULL || unspare) {
3596 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
3597 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
3598 			spa_load_spares(spa);
3599 			spa->spa_spares.sav_sync = B_TRUE;
3600 		} else {
3601 			error = EBUSY;
3602 		}
3603 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
3604 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3605 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
3606 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
3607 		/*
3608 		 * Cache devices can always be removed.
3609 		 */
3610 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
3611 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
3612 		spa_load_l2cache(spa);
3613 		spa->spa_l2cache.sav_sync = B_TRUE;
3614 	} else if (vd != NULL && vd->vdev_islog) {
3615 		ASSERT(!locked);
3616 
3617 		/*
3618 		 * XXX - Once we have bp-rewrite this should
3619 		 * become the common case.
3620 		 */
3621 
3622 		/*
3623 		 * 1. Stop allocations
3624 		 * 2. Evacuate the device (i.e. kill off stubby and
3625 		 *    metadata) and wait for it to complete (i.e. sync).
3626 		 * 3. Cleanup the vdev namespace.
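		 *
		 * Informally, those steps map onto the helpers defined
		 * above (summary only; the authoritative sequence is the
		 * code below):
		 *
		 *	1. spa_vdev_remove_start()	stop new allocations
		 *					by pulling the metaslab
		 *					group out of its class
		 *	2. spa_vdev_remove_evacuate()	push remaining log
		 *					blocks off the device
		 *					and wait for the sync
		 *	3. spa_vdev_remove_done()	free the vdev and patch
		 *					the hole left in the root
		 *					vdev's child array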
3627 */ 3628 spa_vdev_remove_start(spa, vd); 3629 3630 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 3631 if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0) 3632 return (error); 3633 txg = spa_vdev_config_enter(spa); 3634 3635 spa_vdev_remove_done(spa, vd); 3636 3637 } else if (vd != NULL) { 3638 /* 3639 * Normal vdevs cannot be removed (yet). 3640 */ 3641 error = ENOTSUP; 3642 } else { 3643 /* 3644 * There is no vdev of any kind with the specified guid. 3645 */ 3646 error = ENOENT; 3647 } 3648 3649 if (!locked) 3650 return (spa_vdev_exit(spa, NULL, txg, error)); 3651 3652 return (error); 3653 } 3654 3655 /* 3656 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3657 * current spared, so we can detach it. 3658 */ 3659 static vdev_t * 3660 spa_vdev_resilver_done_hunt(vdev_t *vd) 3661 { 3662 vdev_t *newvd, *oldvd; 3663 3664 for (int c = 0; c < vd->vdev_children; c++) { 3665 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3666 if (oldvd != NULL) 3667 return (oldvd); 3668 } 3669 3670 /* 3671 * Check for a completed replacement. 3672 */ 3673 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3674 oldvd = vd->vdev_child[0]; 3675 newvd = vd->vdev_child[1]; 3676 3677 if (vdev_dtl_empty(newvd, DTL_MISSING) && 3678 !vdev_dtl_required(oldvd)) 3679 return (oldvd); 3680 } 3681 3682 /* 3683 * Check for a completed resilver with the 'unspare' flag set. 3684 */ 3685 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3686 newvd = vd->vdev_child[0]; 3687 oldvd = vd->vdev_child[1]; 3688 3689 if (newvd->vdev_unspare && 3690 vdev_dtl_empty(newvd, DTL_MISSING) && 3691 !vdev_dtl_required(oldvd)) { 3692 newvd->vdev_unspare = 0; 3693 return (oldvd); 3694 } 3695 } 3696 3697 return (NULL); 3698 } 3699 3700 static void 3701 spa_vdev_resilver_done(spa_t *spa) 3702 { 3703 vdev_t *vd, *pvd, *ppvd; 3704 uint64_t guid, sguid, pguid, ppguid; 3705 3706 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3707 3708 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3709 pvd = vd->vdev_parent; 3710 ppvd = pvd->vdev_parent; 3711 guid = vd->vdev_guid; 3712 pguid = pvd->vdev_guid; 3713 ppguid = ppvd->vdev_guid; 3714 sguid = 0; 3715 /* 3716 * If we have just finished replacing a hot spared device, then 3717 * we need to detach the parent's first child (the original hot 3718 * spare) as well. 3719 */ 3720 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 3721 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3722 ASSERT(ppvd->vdev_children == 2); 3723 sguid = ppvd->vdev_child[1]->vdev_guid; 3724 } 3725 spa_config_exit(spa, SCL_ALL, FTAG); 3726 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 3727 return; 3728 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 3729 return; 3730 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3731 } 3732 3733 spa_config_exit(spa, SCL_ALL, FTAG); 3734 } 3735 3736 /* 3737 * Update the stored path or FRU for this vdev. Dirty the vdev configuration, 3738 * relying on spa_vdev_enter/exit() to synchronize the labels and cache. 
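 *
 * Callers use the two thin wrappers defined just below, for example
 * (arguments are illustrative only):
 *
 *	error = spa_vdev_setpath(spa, guid, "/dev/dsk/c1t2d0s0");
 *	error = spa_vdev_setfru(spa, guid, newfru);
 *
 * Both return ENOENT if no vdev with that guid exists and ENOTSUP if
 * the guid names something other than a leaf vdev.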
3739 */ 3740 int 3741 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 3742 boolean_t ispath) 3743 { 3744 vdev_t *vd; 3745 uint64_t txg; 3746 3747 txg = spa_vdev_enter(spa); 3748 3749 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3750 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3751 3752 if (!vd->vdev_ops->vdev_op_leaf) 3753 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3754 3755 if (ispath) { 3756 spa_strfree(vd->vdev_path); 3757 vd->vdev_path = spa_strdup(value); 3758 } else { 3759 if (vd->vdev_fru != NULL) 3760 spa_strfree(vd->vdev_fru); 3761 vd->vdev_fru = spa_strdup(value); 3762 } 3763 3764 vdev_config_dirty(vd->vdev_top); 3765 3766 return (spa_vdev_exit(spa, NULL, txg, 0)); 3767 } 3768 3769 int 3770 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3771 { 3772 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 3773 } 3774 3775 int 3776 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 3777 { 3778 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 3779 } 3780 3781 /* 3782 * ========================================================================== 3783 * SPA Scrubbing 3784 * ========================================================================== 3785 */ 3786 3787 int 3788 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3789 { 3790 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3791 3792 if ((uint_t)type >= POOL_SCRUB_TYPES) 3793 return (ENOTSUP); 3794 3795 /* 3796 * If a resilver was requested, but there is no DTL on a 3797 * writeable leaf device, we have nothing to do. 3798 */ 3799 if (type == POOL_SCRUB_RESILVER && 3800 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3801 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3802 return (0); 3803 } 3804 3805 if (type == POOL_SCRUB_EVERYTHING && 3806 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3807 spa->spa_dsl_pool->dp_scrub_isresilver) 3808 return (EBUSY); 3809 3810 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3811 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3812 } else if (type == POOL_SCRUB_NONE) { 3813 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3814 } else { 3815 return (EINVAL); 3816 } 3817 } 3818 3819 /* 3820 * ========================================================================== 3821 * SPA async task processing 3822 * ========================================================================== 3823 */ 3824 3825 static void 3826 spa_async_remove(spa_t *spa, vdev_t *vd) 3827 { 3828 if (vd->vdev_remove_wanted) { 3829 vd->vdev_remove_wanted = 0; 3830 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3831 3832 /* 3833 * We want to clear the stats, but we don't want to do a full 3834 * vdev_clear() as that will cause us to throw away 3835 * degraded/faulted state as well as attempt to reopen the 3836 * device, all of which is a waste. 
3837 */ 3838 vd->vdev_stat.vs_read_errors = 0; 3839 vd->vdev_stat.vs_write_errors = 0; 3840 vd->vdev_stat.vs_checksum_errors = 0; 3841 3842 vdev_state_dirty(vd->vdev_top); 3843 } 3844 3845 for (int c = 0; c < vd->vdev_children; c++) 3846 spa_async_remove(spa, vd->vdev_child[c]); 3847 } 3848 3849 static void 3850 spa_async_probe(spa_t *spa, vdev_t *vd) 3851 { 3852 if (vd->vdev_probe_wanted) { 3853 vd->vdev_probe_wanted = 0; 3854 vdev_reopen(vd); /* vdev_open() does the actual probe */ 3855 } 3856 3857 for (int c = 0; c < vd->vdev_children; c++) 3858 spa_async_probe(spa, vd->vdev_child[c]); 3859 } 3860 3861 static void 3862 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 3863 { 3864 sysevent_id_t eid; 3865 nvlist_t *attr; 3866 char *physpath; 3867 3868 if (!spa->spa_autoexpand) 3869 return; 3870 3871 for (int c = 0; c < vd->vdev_children; c++) { 3872 vdev_t *cvd = vd->vdev_child[c]; 3873 spa_async_autoexpand(spa, cvd); 3874 } 3875 3876 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 3877 return; 3878 3879 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 3880 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 3881 3882 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3883 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 3884 3885 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 3886 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 3887 3888 nvlist_free(attr); 3889 kmem_free(physpath, MAXPATHLEN); 3890 } 3891 3892 static void 3893 spa_async_thread(spa_t *spa) 3894 { 3895 int tasks; 3896 3897 ASSERT(spa->spa_sync_on); 3898 3899 mutex_enter(&spa->spa_async_lock); 3900 tasks = spa->spa_async_tasks; 3901 spa->spa_async_tasks = 0; 3902 mutex_exit(&spa->spa_async_lock); 3903 3904 /* 3905 * See if the config needs to be updated. 3906 */ 3907 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3908 uint64_t oldsz, space_update; 3909 3910 mutex_enter(&spa_namespace_lock); 3911 oldsz = spa_get_space(spa); 3912 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3913 space_update = spa_get_space(spa) - oldsz; 3914 mutex_exit(&spa_namespace_lock); 3915 3916 /* 3917 * If the pool grew as a result of the config update, 3918 * then log an internal history event. 3919 */ 3920 if (space_update) { 3921 spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 3922 spa, NULL, CRED(), 3923 "pool '%s' size: %llu(+%llu)", 3924 spa_name(spa), spa_get_space(spa), 3925 space_update); 3926 } 3927 } 3928 3929 /* 3930 * See if any devices need to be marked REMOVED. 3931 */ 3932 if (tasks & SPA_ASYNC_REMOVE) { 3933 spa_vdev_state_enter(spa); 3934 spa_async_remove(spa, spa->spa_root_vdev); 3935 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3936 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3937 for (int i = 0; i < spa->spa_spares.sav_count; i++) 3938 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3939 (void) spa_vdev_state_exit(spa, NULL, 0); 3940 } 3941 3942 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 3943 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3944 spa_async_autoexpand(spa, spa->spa_root_vdev); 3945 spa_config_exit(spa, SCL_CONFIG, FTAG); 3946 } 3947 3948 /* 3949 * See if any devices need to be probed. 3950 */ 3951 if (tasks & SPA_ASYNC_PROBE) { 3952 spa_vdev_state_enter(spa); 3953 spa_async_probe(spa, spa->spa_root_vdev); 3954 (void) spa_vdev_state_exit(spa, NULL, 0); 3955 } 3956 3957 /* 3958 * If any devices are done replacing, detach them. 
3959 */ 3960 if (tasks & SPA_ASYNC_RESILVER_DONE) 3961 spa_vdev_resilver_done(spa); 3962 3963 /* 3964 * Kick off a resilver. 3965 */ 3966 if (tasks & SPA_ASYNC_RESILVER) 3967 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3968 3969 /* 3970 * Let the world know that we're done. 3971 */ 3972 mutex_enter(&spa->spa_async_lock); 3973 spa->spa_async_thread = NULL; 3974 cv_broadcast(&spa->spa_async_cv); 3975 mutex_exit(&spa->spa_async_lock); 3976 thread_exit(); 3977 } 3978 3979 void 3980 spa_async_suspend(spa_t *spa) 3981 { 3982 mutex_enter(&spa->spa_async_lock); 3983 spa->spa_async_suspended++; 3984 while (spa->spa_async_thread != NULL) 3985 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3986 mutex_exit(&spa->spa_async_lock); 3987 } 3988 3989 void 3990 spa_async_resume(spa_t *spa) 3991 { 3992 mutex_enter(&spa->spa_async_lock); 3993 ASSERT(spa->spa_async_suspended != 0); 3994 spa->spa_async_suspended--; 3995 mutex_exit(&spa->spa_async_lock); 3996 } 3997 3998 static void 3999 spa_async_dispatch(spa_t *spa) 4000 { 4001 mutex_enter(&spa->spa_async_lock); 4002 if (spa->spa_async_tasks && !spa->spa_async_suspended && 4003 spa->spa_async_thread == NULL && 4004 rootdir != NULL && !vn_is_readonly(rootdir)) 4005 spa->spa_async_thread = thread_create(NULL, 0, 4006 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 4007 mutex_exit(&spa->spa_async_lock); 4008 } 4009 4010 void 4011 spa_async_request(spa_t *spa, int task) 4012 { 4013 mutex_enter(&spa->spa_async_lock); 4014 spa->spa_async_tasks |= task; 4015 mutex_exit(&spa->spa_async_lock); 4016 } 4017 4018 /* 4019 * ========================================================================== 4020 * SPA syncing routines 4021 * ========================================================================== 4022 */ 4023 4024 static void 4025 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 4026 { 4027 bplist_t *bpl = &spa->spa_sync_bplist; 4028 dmu_tx_t *tx; 4029 blkptr_t blk; 4030 uint64_t itor = 0; 4031 zio_t *zio; 4032 int error; 4033 uint8_t c = 1; 4034 4035 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 4036 4037 while (bplist_iterate(bpl, &itor, &blk) == 0) { 4038 ASSERT(blk.blk_birth < txg); 4039 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 4040 ZIO_FLAG_MUSTSUCCEED)); 4041 } 4042 4043 error = zio_wait(zio); 4044 ASSERT3U(error, ==, 0); 4045 4046 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 4047 bplist_vacate(bpl, tx); 4048 4049 /* 4050 * Pre-dirty the first block so we sync to convergence faster. 4051 * (Usually only the first block is needed.) 4052 */ 4053 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 4054 dmu_tx_commit(tx); 4055 } 4056 4057 static void 4058 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 4059 { 4060 char *packed = NULL; 4061 size_t bufsize; 4062 size_t nvsize = 0; 4063 dmu_buf_t *db; 4064 4065 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 4066 4067 /* 4068 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 4069 * information. This avoids the dbuf_will_dirty() path and 4070 * saves us a pre-read to get data we don't actually care about. 
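 *
 * As an illustrative sketch (not in the original source), assuming
 * SPA_CONFIG_BLOCKSIZE is 16K, a packed nvlist of, say, 5000 bytes is
 * rounded up and written as a single zero-padded block:
 *
 *	bufsize = P2ROUNDUP(5000, SPA_CONFIG_BLOCKSIZE);	(= 16384)
 *	packed = kmem_alloc(bufsize, KM_SLEEP);
 *	(pack the nvlist into 'packed' and bzero() the tail)
 *	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
 *
 * Because every write covers whole blocks, the DMU never has to read
 * the old block contents just to merge in a partial update.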
4071 */ 4072 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 4073 packed = kmem_alloc(bufsize, KM_SLEEP); 4074 4075 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 4076 KM_SLEEP) == 0); 4077 bzero(packed + nvsize, bufsize - nvsize); 4078 4079 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 4080 4081 kmem_free(packed, bufsize); 4082 4083 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4084 dmu_buf_will_dirty(db, tx); 4085 *(uint64_t *)db->db_data = nvsize; 4086 dmu_buf_rele(db, FTAG); 4087 } 4088 4089 static void 4090 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 4091 const char *config, const char *entry) 4092 { 4093 nvlist_t *nvroot; 4094 nvlist_t **list; 4095 int i; 4096 4097 if (!sav->sav_sync) 4098 return; 4099 4100 /* 4101 * Update the MOS nvlist describing the list of available devices. 4102 * spa_validate_aux() will have already made sure this nvlist is 4103 * valid and the vdevs are labeled appropriately. 4104 */ 4105 if (sav->sav_object == 0) { 4106 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 4107 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 4108 sizeof (uint64_t), tx); 4109 VERIFY(zap_update(spa->spa_meta_objset, 4110 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 4111 &sav->sav_object, tx) == 0); 4112 } 4113 4114 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4115 if (sav->sav_count == 0) { 4116 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 4117 } else { 4118 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 4119 for (i = 0; i < sav->sav_count; i++) 4120 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 4121 B_FALSE, B_FALSE, B_TRUE); 4122 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 4123 sav->sav_count) == 0); 4124 for (i = 0; i < sav->sav_count; i++) 4125 nvlist_free(list[i]); 4126 kmem_free(list, sav->sav_count * sizeof (void *)); 4127 } 4128 4129 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 4130 nvlist_free(nvroot); 4131 4132 sav->sav_sync = B_FALSE; 4133 } 4134 4135 static void 4136 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 4137 { 4138 nvlist_t *config; 4139 4140 if (list_is_empty(&spa->spa_config_dirty_list)) 4141 return; 4142 4143 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4144 4145 config = spa_config_generate(spa, spa->spa_root_vdev, 4146 dmu_tx_get_txg(tx), B_FALSE); 4147 4148 spa_config_exit(spa, SCL_STATE, FTAG); 4149 4150 if (spa->spa_config_syncing) 4151 nvlist_free(spa->spa_config_syncing); 4152 spa->spa_config_syncing = config; 4153 4154 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 4155 } 4156 4157 /* 4158 * Set zpool properties. 4159 */ 4160 static void 4161 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 4162 { 4163 spa_t *spa = arg1; 4164 objset_t *mos = spa->spa_meta_objset; 4165 nvlist_t *nvp = arg2; 4166 nvpair_t *elem; 4167 uint64_t intval; 4168 char *strval; 4169 zpool_prop_t prop; 4170 const char *propname; 4171 zprop_type_t proptype; 4172 4173 mutex_enter(&spa->spa_props_lock); 4174 4175 elem = NULL; 4176 while ((elem = nvlist_next_nvpair(nvp, elem))) { 4177 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 4178 case ZPOOL_PROP_VERSION: 4179 /* 4180 * Only set version for non-zpool-creation cases 4181 * (set/import). spa_create() needs special care 4182 * for version setting. 
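 *
 * For instance, a 'zpool set version=N <pool>' request (an assumed
 * consumer of this path) arrives here with tx_txg != TXG_INITIAL,
 * bumps ub_version, and dirties the root vdev config so that the new
 * version is written out with the labels in this txg.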
4183 */ 4184 if (tx->tx_txg != TXG_INITIAL) { 4185 VERIFY(nvpair_value_uint64(elem, 4186 &intval) == 0); 4187 ASSERT(intval <= SPA_VERSION); 4188 ASSERT(intval >= spa_version(spa)); 4189 spa->spa_uberblock.ub_version = intval; 4190 vdev_config_dirty(spa->spa_root_vdev); 4191 } 4192 break; 4193 4194 case ZPOOL_PROP_ALTROOT: 4195 /* 4196 * 'altroot' is a non-persistent property. It should 4197 * have been set temporarily at creation or import time. 4198 */ 4199 ASSERT(spa->spa_root != NULL); 4200 break; 4201 4202 case ZPOOL_PROP_CACHEFILE: 4203 /* 4204 * 'cachefile' is also a non-persistent property. 4205 */ 4206 break; 4207 default: 4208 /* 4209 * Set pool property values in the poolprops mos object. 4210 */ 4211 if (spa->spa_pool_props_object == 0) { 4212 objset_t *mos = spa->spa_meta_objset; 4213 4214 VERIFY((spa->spa_pool_props_object = 4215 zap_create(mos, DMU_OT_POOL_PROPS, 4216 DMU_OT_NONE, 0, tx)) > 0); 4217 4218 VERIFY(zap_update(mos, 4219 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 4220 8, 1, &spa->spa_pool_props_object, tx) 4221 == 0); 4222 } 4223 4224 /* normalize the property name */ 4225 propname = zpool_prop_to_name(prop); 4226 proptype = zpool_prop_get_type(prop); 4227 4228 if (nvpair_type(elem) == DATA_TYPE_STRING) { 4229 ASSERT(proptype == PROP_TYPE_STRING); 4230 VERIFY(nvpair_value_string(elem, &strval) == 0); 4231 VERIFY(zap_update(mos, 4232 spa->spa_pool_props_object, propname, 4233 1, strlen(strval) + 1, strval, tx) == 0); 4234 4235 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 4236 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 4237 4238 if (proptype == PROP_TYPE_INDEX) { 4239 const char *unused; 4240 VERIFY(zpool_prop_index_to_string( 4241 prop, intval, &unused) == 0); 4242 } 4243 VERIFY(zap_update(mos, 4244 spa->spa_pool_props_object, propname, 4245 8, 1, &intval, tx) == 0); 4246 } else { 4247 ASSERT(0); /* not allowed */ 4248 } 4249 4250 switch (prop) { 4251 case ZPOOL_PROP_DELEGATION: 4252 spa->spa_delegation = intval; 4253 break; 4254 case ZPOOL_PROP_BOOTFS: 4255 spa->spa_bootfs = intval; 4256 break; 4257 case ZPOOL_PROP_FAILUREMODE: 4258 spa->spa_failmode = intval; 4259 break; 4260 case ZPOOL_PROP_AUTOEXPAND: 4261 spa->spa_autoexpand = intval; 4262 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4263 break; 4264 default: 4265 break; 4266 } 4267 } 4268 4269 /* log internal history if this is not a zpool create */ 4270 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 4271 tx->tx_txg != TXG_INITIAL) { 4272 spa_history_internal_log(LOG_POOL_PROPSET, 4273 spa, tx, cr, "%s %lld %s", 4274 nvpair_name(elem), intval, spa_name(spa)); 4275 } 4276 } 4277 4278 mutex_exit(&spa->spa_props_lock); 4279 } 4280 4281 /* 4282 * Sync the specified transaction group. New blocks may be dirtied as 4283 * part of the process, so we iterate until it converges. 4284 */ 4285 void 4286 spa_sync(spa_t *spa, uint64_t txg) 4287 { 4288 dsl_pool_t *dp = spa->spa_dsl_pool; 4289 objset_t *mos = spa->spa_meta_objset; 4290 bplist_t *bpl = &spa->spa_sync_bplist; 4291 vdev_t *rvd = spa->spa_root_vdev; 4292 vdev_t *vd; 4293 dmu_tx_t *tx; 4294 int dirty_vdevs; 4295 int error; 4296 4297 /* 4298 * Lock out configuration changes. 4299 */ 4300 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4301 4302 spa->spa_syncing_txg = txg; 4303 spa->spa_sync_pass = 0; 4304 4305 /* 4306 * If there are any pending vdev state changes, convert them 4307 * into config changes that go out with this transaction group.
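 *
 * For instance, a device that spa_async_remove() marked REMOVED was
 * placed on spa_state_dirty_list by vdev_state_dirty(); below it is
 * cleaned and re-dirtied with vdev_config_dirty() so the new state is
 * recorded in the configuration written for this txg.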
4308 */ 4309 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4310 while (list_head(&spa->spa_state_dirty_list) != NULL) { 4311 /* 4312 * We need the write lock here because, for aux vdevs, 4313 * calling vdev_config_dirty() modifies sav_config. 4314 * This is ugly and will become unnecessary when we 4315 * eliminate the aux vdev wart by integrating all vdevs 4316 * into the root vdev tree. 4317 */ 4318 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4319 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 4320 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 4321 vdev_state_clean(vd); 4322 vdev_config_dirty(vd); 4323 } 4324 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4325 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 4326 } 4327 spa_config_exit(spa, SCL_STATE, FTAG); 4328 4329 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 4330 4331 tx = dmu_tx_create_assigned(dp, txg); 4332 4333 /* 4334 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 4335 * set spa_deflate if we have no raid-z vdevs. 4336 */ 4337 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 4338 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 4339 int i; 4340 4341 for (i = 0; i < rvd->vdev_children; i++) { 4342 vd = rvd->vdev_child[i]; 4343 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 4344 break; 4345 } 4346 if (i == rvd->vdev_children) { 4347 spa->spa_deflate = TRUE; 4348 VERIFY(0 == zap_add(spa->spa_meta_objset, 4349 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4350 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 4351 } 4352 } 4353 4354 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 4355 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 4356 dsl_pool_create_origin(dp, tx); 4357 4358 /* Keeping the origin open increases spa_minref */ 4359 spa->spa_minref += 3; 4360 } 4361 4362 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 4363 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 4364 dsl_pool_upgrade_clones(dp, tx); 4365 } 4366 4367 /* 4368 * If anything has changed in this txg, push the deferred frees 4369 * from the previous txg. If not, leave them alone so that we 4370 * don't generate work on an otherwise idle system. 4371 */ 4372 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 4373 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 4374 !txg_list_empty(&dp->dp_sync_tasks, txg)) 4375 spa_sync_deferred_frees(spa, txg); 4376 4377 /* 4378 * Iterate to convergence. 4379 */ 4380 do { 4381 spa->spa_sync_pass++; 4382 4383 spa_sync_config_object(spa, tx); 4384 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 4385 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4386 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4387 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4388 spa_errlog_sync(spa, txg); 4389 dsl_pool_sync(dp, txg); 4390 4391 dirty_vdevs = 0; 4392 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 4393 vdev_sync(vd, txg); 4394 dirty_vdevs++; 4395 } 4396 4397 bplist_sync(bpl, tx); 4398 } while (dirty_vdevs); 4399 4400 bplist_close(bpl); 4401 4402 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 4403 4404 /* 4405 * Rewrite the vdev configuration (which includes the uberblock) 4406 * to commit the transaction group. 4407 * 4408 * If there are no dirty vdevs, we sync the uberblock to a few 4409 * random top-level vdevs that are known to be visible in the 4410 * config cache (see spa_vdev_add() for a complete description). 4411 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 
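 *
 * For example (illustrative only), assuming SPA_DVAS_PER_BP is 3, a
 * clean-config sync starts at a random child of the root vdev and
 * walks the children in order, skipping log devices and vdevs with no
 * metaslab array, until at most three candidates have been collected
 * for vdev_config_sync().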
4412 */ 4413 for (;;) { 4414 /* 4415 * We hold SCL_STATE to prevent vdev open/close/etc. 4416 * while we're attempting to write the vdev labels. 4417 */ 4418 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4419 4420 if (list_is_empty(&spa->spa_config_dirty_list)) { 4421 vdev_t *svd[SPA_DVAS_PER_BP]; 4422 int svdcount = 0; 4423 int children = rvd->vdev_children; 4424 int c0 = spa_get_random(children); 4425 4426 for (int c = 0; c < children; c++) { 4427 vd = rvd->vdev_child[(c0 + c) % children]; 4428 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 4429 continue; 4430 svd[svdcount++] = vd; 4431 if (svdcount == SPA_DVAS_PER_BP) 4432 break; 4433 } 4434 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 4435 if (error != 0) 4436 error = vdev_config_sync(svd, svdcount, txg, 4437 B_TRUE); 4438 } else { 4439 error = vdev_config_sync(rvd->vdev_child, 4440 rvd->vdev_children, txg, B_FALSE); 4441 if (error != 0) 4442 error = vdev_config_sync(rvd->vdev_child, 4443 rvd->vdev_children, txg, B_TRUE); 4444 } 4445 4446 spa_config_exit(spa, SCL_STATE, FTAG); 4447 4448 if (error == 0) 4449 break; 4450 zio_suspend(spa, NULL); 4451 zio_resume_wait(spa); 4452 } 4453 dmu_tx_commit(tx); 4454 4455 /* 4456 * Clear the dirty config list. 4457 */ 4458 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 4459 vdev_config_clean(vd); 4460 4461 /* 4462 * Now that the new config has synced transactionally, 4463 * let it become visible to the config cache. 4464 */ 4465 if (spa->spa_config_syncing != NULL) { 4466 spa_config_set(spa, spa->spa_config_syncing); 4467 spa->spa_config_txg = txg; 4468 spa->spa_config_syncing = NULL; 4469 } 4470 4471 spa->spa_ubsync = spa->spa_uberblock; 4472 4473 /* 4474 * Clean up the ZIL records for the synced txg. 4475 */ 4476 dsl_pool_zil_clean(dp); 4477 4478 /* 4479 * Update usable space statistics. 4480 */ 4481 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 4482 vdev_sync_done(vd, txg); 4483 4484 /* 4485 * It had better be the case that we didn't dirty anything 4486 * since vdev_config_sync(). 4487 */ 4488 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 4489 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 4490 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 4491 ASSERT(bpl->bpl_queue == NULL); 4492 4493 spa_config_exit(spa, SCL_CONFIG, FTAG); 4494 4495 /* 4496 * If any async tasks have been requested, kick them off. 4497 */ 4498 spa_async_dispatch(spa); 4499 } 4500 4501 /* 4502 * Sync all pools. We don't want to hold the namespace lock across these 4503 * operations, so we take a reference on the spa_t and drop the lock during the 4504 * sync. 4505 */ 4506 void 4507 spa_sync_allpools(void) 4508 { 4509 spa_t *spa = NULL; 4510 mutex_enter(&spa_namespace_lock); 4511 while ((spa = spa_next(spa)) != NULL) { 4512 if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) 4513 continue; 4514 spa_open_ref(spa, FTAG); 4515 mutex_exit(&spa_namespace_lock); 4516 txg_wait_synced(spa_get_dsl(spa), 0); 4517 mutex_enter(&spa_namespace_lock); 4518 spa_close(spa, FTAG); 4519 } 4520 mutex_exit(&spa_namespace_lock); 4521 } 4522 4523 /* 4524 * ========================================================================== 4525 * Miscellaneous routines 4526 * ========================================================================== 4527 */ 4528 4529 /* 4530 * Remove all pools in the system. 4531 */ 4532 void 4533 spa_evict_all(void) 4534 { 4535 spa_t *spa; 4536 4537 /* 4538 * Remove all cached state. 
All pools should be closed now, 4539 * so every spa in the AVL tree should be unreferenced. 4540 */ 4541 mutex_enter(&spa_namespace_lock); 4542 while ((spa = spa_next(NULL)) != NULL) { 4543 /* 4544 * Stop async tasks. The async thread may need to detach 4545 * a device that's been replaced, which requires grabbing 4546 * spa_namespace_lock, so we must drop it here. 4547 */ 4548 spa_open_ref(spa, FTAG); 4549 mutex_exit(&spa_namespace_lock); 4550 spa_async_suspend(spa); 4551 mutex_enter(&spa_namespace_lock); 4552 spa_close(spa, FTAG); 4553 4554 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4555 spa_unload(spa); 4556 spa_deactivate(spa); 4557 } 4558 spa_remove(spa); 4559 } 4560 mutex_exit(&spa_namespace_lock); 4561 } 4562 4563 vdev_t * 4564 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 4565 { 4566 vdev_t *vd; 4567 int i; 4568 4569 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 4570 return (vd); 4571 4572 if (aux) { 4573 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 4574 vd = spa->spa_l2cache.sav_vdevs[i]; 4575 if (vd->vdev_guid == guid) 4576 return (vd); 4577 } 4578 4579 for (i = 0; i < spa->spa_spares.sav_count; i++) { 4580 vd = spa->spa_spares.sav_vdevs[i]; 4581 if (vd->vdev_guid == guid) 4582 return (vd); 4583 } 4584 } 4585 4586 return (NULL); 4587 } 4588 4589 void 4590 spa_upgrade(spa_t *spa, uint64_t version) 4591 { 4592 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4593 4594 /* 4595 * This should only be called for a non-faulted pool, and since a 4596 * future version would result in an unopenable pool, this shouldn't be 4597 * possible. 4598 */ 4599 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 4600 ASSERT(version >= spa->spa_uberblock.ub_version); 4601 4602 spa->spa_uberblock.ub_version = version; 4603 vdev_config_dirty(spa->spa_root_vdev); 4604 4605 spa_config_exit(spa, SCL_ALL, FTAG); 4606 4607 txg_wait_synced(spa_get_dsl(spa), 0); 4608 } 4609 4610 boolean_t 4611 spa_has_spare(spa_t *spa, uint64_t guid) 4612 { 4613 int i; 4614 uint64_t spareguid; 4615 spa_aux_vdev_t *sav = &spa->spa_spares; 4616 4617 for (i = 0; i < sav->sav_count; i++) 4618 if (sav->sav_vdevs[i]->vdev_guid == guid) 4619 return (B_TRUE); 4620 4621 for (i = 0; i < sav->sav_npending; i++) { 4622 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 4623 &spareguid) == 0 && spareguid == guid) 4624 return (B_TRUE); 4625 } 4626 4627 return (B_FALSE); 4628 } 4629 4630 /* 4631 * Check if a pool has an active shared spare device. 4632 * Note: an active spare's reference count is 2 (as a spare and as a replacement). 4633 */ 4634 static boolean_t 4635 spa_has_active_shared_spare(spa_t *spa) 4636 { 4637 int i, refcnt; 4638 uint64_t pool; 4639 spa_aux_vdev_t *sav = &spa->spa_spares; 4640 4641 for (i = 0; i < sav->sav_count; i++) { 4642 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 4643 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 4644 refcnt > 2) 4645 return (B_TRUE); 4646 } 4647 4648 return (B_FALSE); 4649 } 4650 4651 /* 4652 * Post a sysevent corresponding to the given event. The 'name' must be one of 4653 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 4654 * filled in from the spa and (optionally) the vdev. This doesn't do anything 4655 * in the userland libzpool, as we don't want consumers to misinterpret ztest 4656 * or zdb as real changes.
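 *
 * A hypothetical call site, assuming an event class such as
 * ESC_ZFS_VDEV_REMOVE from sys/sysevent/eventdefs.h, would look like:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * where 'vd' may be NULL when the event describes the pool as a whole
 * rather than a particular vdev.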
4657 */ 4658 void 4659 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 4660 { 4661 #ifdef _KERNEL 4662 sysevent_t *ev; 4663 sysevent_attr_list_t *attr = NULL; 4664 sysevent_value_t value; 4665 sysevent_id_t eid; 4666 4667 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 4668 SE_SLEEP); 4669 4670 value.value_type = SE_DATA_TYPE_STRING; 4671 value.value.sv_string = spa_name(spa); 4672 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 4673 goto done; 4674 4675 value.value_type = SE_DATA_TYPE_UINT64; 4676 value.value.sv_uint64 = spa_guid(spa); 4677 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 4678 goto done; 4679 4680 if (vd) { 4681 value.value_type = SE_DATA_TYPE_UINT64; 4682 value.value.sv_uint64 = vd->vdev_guid; 4683 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 4684 SE_SLEEP) != 0) 4685 goto done; 4686 4687 if (vd->vdev_path) { 4688 value.value_type = SE_DATA_TYPE_STRING; 4689 value.value.sv_string = vd->vdev_path; 4690 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 4691 &value, SE_SLEEP) != 0) 4692 goto done; 4693 } 4694 } 4695 4696 if (sysevent_attach_attributes(ev, attr) != 0) 4697 goto done; 4698 attr = NULL; 4699 4700 (void) log_sysevent(ev, SE_SLEEP, &eid); 4701 4702 done: 4703 if (attr) 4704 sysevent_free_attr(attr); 4705 sysevent_free(ev); 4706 #endif 4707 } 4708
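/*
 * Illustrative sketch only (not part of the original file): a caller in
 * the ioctl layer is assumed to drive spa_scrub() above roughly as
 * follows, resolving the pool by name and returning any error to
 * userland.  The names 'zc', 'zc_name' and 'zc_cookie' are hypothetical
 * here.
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 *		return (error);
 *	error = spa_scrub(spa, zc->zc_cookie);
 *	spa_close(spa, FTAG);
 *	return (error);
 */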