/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>

#ifdef _KERNEL
#include <sys/zone.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	INTR */
	{ 1,	1 },	/* ZIO_TYPE_NULL */
	{ 1,	8 },	/* ZIO_TYPE_READ */
	{ 8,	1 },	/* ZIO_TYPE_WRITE */
	{ 1,	1 },	/* ZIO_TYPE_FREE */
	{ 1,	1 },	/* ZIO_TYPE_CLAIM */
	{ 1,	1 },	/* ZIO_TYPE_IOCTL */
};

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
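 * These values are derived from in-core pool state; properties stored in
 * the MOS pool-properties object are gathered separately by spa_prop_get().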
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t used;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		size = spa_get_space(spa);
		used = spa_get_alloc(spa);
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
		    size - used, src);

		cap = (size == 0) ? 0 : (used * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If there is no pool property object, there are no more props to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
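	 * Attributes stored as 8-byte integers are numeric properties (bootfs
	 * is translated back to its dataset name); attributes stored as
	 * 1-byte arrays are string properties.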
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * so that it contains the property values to be set.
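 * A successful validation may rewrite entries in place; e.g. the bootfs
 * string value is replaced with the object number of the dataset it names.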
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_USER | DS_MODE_READONLY, &os))
					break;

				/* We don't support gzip bootable datasets */
				if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_close(os);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
371 */ 372 if (!error && spa_suspended(spa)) { 373 spa->spa_failmode = intval; 374 error = EIO; 375 } 376 break; 377 378 case ZPOOL_PROP_CACHEFILE: 379 if ((error = nvpair_value_string(elem, &strval)) != 0) 380 break; 381 382 if (strval[0] == '\0') 383 break; 384 385 if (strcmp(strval, "none") == 0) 386 break; 387 388 if (strval[0] != '/') { 389 error = EINVAL; 390 break; 391 } 392 393 slash = strrchr(strval, '/'); 394 ASSERT(slash != NULL); 395 396 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 397 strcmp(slash, "/..") == 0) 398 error = EINVAL; 399 break; 400 } 401 402 if (error) 403 break; 404 } 405 406 if (!error && reset_bootfs) { 407 error = nvlist_remove(props, 408 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 409 410 if (!error) { 411 error = nvlist_add_uint64(props, 412 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 413 } 414 } 415 416 return (error); 417 } 418 419 void 420 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 421 { 422 char *cachefile; 423 spa_config_dirent_t *dp; 424 425 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 426 &cachefile) != 0) 427 return; 428 429 dp = kmem_alloc(sizeof (spa_config_dirent_t), 430 KM_SLEEP); 431 432 if (cachefile[0] == '\0') 433 dp->scd_path = spa_strdup(spa_config_path); 434 else if (strcmp(cachefile, "none") == 0) 435 dp->scd_path = NULL; 436 else 437 dp->scd_path = spa_strdup(cachefile); 438 439 list_insert_head(&spa->spa_config_list, dp); 440 if (need_sync) 441 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 442 } 443 444 int 445 spa_prop_set(spa_t *spa, nvlist_t *nvp) 446 { 447 int error; 448 nvpair_t *elem; 449 boolean_t need_sync = B_FALSE; 450 zpool_prop_t prop; 451 452 if ((error = spa_prop_validate(spa, nvp)) != 0) 453 return (error); 454 455 elem = NULL; 456 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 457 if ((prop = zpool_name_to_prop( 458 nvpair_name(elem))) == ZPROP_INVAL) 459 return (EINVAL); 460 461 if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 462 continue; 463 464 need_sync = B_TRUE; 465 break; 466 } 467 468 if (need_sync) 469 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 470 spa, nvp, 3)); 471 else 472 return (0); 473 } 474 475 /* 476 * If the bootfs property value is dsobj, clear it. 477 */ 478 void 479 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 480 { 481 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 482 VERIFY(zap_remove(spa->spa_meta_objset, 483 spa->spa_pool_props_object, 484 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 485 spa->spa_bootfs = 0; 486 } 487 } 488 489 /* 490 * ========================================================================== 491 * SPA state manipulation (open/create/destroy/import/export) 492 * ========================================================================== 493 */ 494 495 static int 496 spa_error_entry_compare(const void *a, const void *b) 497 { 498 spa_error_entry_t *sa = (spa_error_entry_t *)a; 499 spa_error_entry_t *sb = (spa_error_entry_t *)b; 500 int ret; 501 502 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 503 sizeof (zbookmark_t)); 504 505 if (ret < 0) 506 return (-1); 507 else if (ret > 0) 508 return (1); 509 else 510 return (0); 511 } 512 513 /* 514 * Utility function which retrieves copies of the current logs and 515 * re-initializes them in the process. 
516 */ 517 void 518 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 519 { 520 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 521 522 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 523 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 524 525 avl_create(&spa->spa_errlist_scrub, 526 spa_error_entry_compare, sizeof (spa_error_entry_t), 527 offsetof(spa_error_entry_t, se_avl)); 528 avl_create(&spa->spa_errlist_last, 529 spa_error_entry_compare, sizeof (spa_error_entry_t), 530 offsetof(spa_error_entry_t, se_avl)); 531 } 532 533 /* 534 * Activate an uninitialized pool. 535 */ 536 static void 537 spa_activate(spa_t *spa, int mode) 538 { 539 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 540 541 spa->spa_state = POOL_STATE_ACTIVE; 542 spa->spa_mode = mode; 543 544 spa->spa_normal_class = metaslab_class_create(); 545 spa->spa_log_class = metaslab_class_create(); 546 547 for (int t = 0; t < ZIO_TYPES; t++) { 548 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 549 spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", 550 zio_taskq_threads[t][q], maxclsyspri, 50, 551 INT_MAX, TASKQ_PREPOPULATE); 552 } 553 } 554 555 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 556 offsetof(vdev_t, vdev_config_dirty_node)); 557 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 558 offsetof(vdev_t, vdev_state_dirty_node)); 559 560 txg_list_create(&spa->spa_vdev_txg_list, 561 offsetof(struct vdev, vdev_txg_node)); 562 563 avl_create(&spa->spa_errlist_scrub, 564 spa_error_entry_compare, sizeof (spa_error_entry_t), 565 offsetof(spa_error_entry_t, se_avl)); 566 avl_create(&spa->spa_errlist_last, 567 spa_error_entry_compare, sizeof (spa_error_entry_t), 568 offsetof(spa_error_entry_t, se_avl)); 569 } 570 571 /* 572 * Opposite of spa_activate(). 573 */ 574 static void 575 spa_deactivate(spa_t *spa) 576 { 577 ASSERT(spa->spa_sync_on == B_FALSE); 578 ASSERT(spa->spa_dsl_pool == NULL); 579 ASSERT(spa->spa_root_vdev == NULL); 580 581 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 582 583 txg_list_destroy(&spa->spa_vdev_txg_list); 584 585 list_destroy(&spa->spa_config_dirty_list); 586 list_destroy(&spa->spa_state_dirty_list); 587 588 for (int t = 0; t < ZIO_TYPES; t++) { 589 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 590 taskq_destroy(spa->spa_zio_taskq[t][q]); 591 spa->spa_zio_taskq[t][q] = NULL; 592 } 593 } 594 595 metaslab_class_destroy(spa->spa_normal_class); 596 spa->spa_normal_class = NULL; 597 598 metaslab_class_destroy(spa->spa_log_class); 599 spa->spa_log_class = NULL; 600 601 /* 602 * If this was part of an import or the open otherwise failed, we may 603 * still have errors left in the queues. Empty them just in case. 604 */ 605 spa_errlog_drain(spa); 606 607 avl_destroy(&spa->spa_errlist_scrub); 608 avl_destroy(&spa->spa_errlist_last); 609 610 spa->spa_state = POOL_STATE_UNINITIALIZED; 611 } 612 613 /* 614 * Verify a pool configuration, and construct the vdev tree appropriately. This 615 * will create all the necessary vdevs in the appropriate layout, with each vdev 616 * in the CLOSED state. This will prep the pool before open/creation/import. 617 * All vdev validation is done by the vdev_alloc() routine. 
618 */ 619 static int 620 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 621 uint_t id, int atype) 622 { 623 nvlist_t **child; 624 uint_t c, children; 625 int error; 626 627 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 628 return (error); 629 630 if ((*vdp)->vdev_ops->vdev_op_leaf) 631 return (0); 632 633 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 634 &child, &children); 635 636 if (error == ENOENT) 637 return (0); 638 639 if (error) { 640 vdev_free(*vdp); 641 *vdp = NULL; 642 return (EINVAL); 643 } 644 645 for (c = 0; c < children; c++) { 646 vdev_t *vd; 647 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 648 atype)) != 0) { 649 vdev_free(*vdp); 650 *vdp = NULL; 651 return (error); 652 } 653 } 654 655 ASSERT(*vdp != NULL); 656 657 return (0); 658 } 659 660 /* 661 * Opposite of spa_load(). 662 */ 663 static void 664 spa_unload(spa_t *spa) 665 { 666 int i; 667 668 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 669 670 /* 671 * Stop async tasks. 672 */ 673 spa_async_suspend(spa); 674 675 /* 676 * Stop syncing. 677 */ 678 if (spa->spa_sync_on) { 679 txg_sync_stop(spa->spa_dsl_pool); 680 spa->spa_sync_on = B_FALSE; 681 } 682 683 /* 684 * Wait for any outstanding async I/O to complete. 685 */ 686 mutex_enter(&spa->spa_async_root_lock); 687 while (spa->spa_async_root_count != 0) 688 cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); 689 mutex_exit(&spa->spa_async_root_lock); 690 691 /* 692 * Close the dsl pool. 693 */ 694 if (spa->spa_dsl_pool) { 695 dsl_pool_close(spa->spa_dsl_pool); 696 spa->spa_dsl_pool = NULL; 697 } 698 699 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 700 701 /* 702 * Drop and purge level 2 cache 703 */ 704 spa_l2cache_drop(spa); 705 706 /* 707 * Close all vdevs. 708 */ 709 if (spa->spa_root_vdev) 710 vdev_free(spa->spa_root_vdev); 711 ASSERT(spa->spa_root_vdev == NULL); 712 713 for (i = 0; i < spa->spa_spares.sav_count; i++) 714 vdev_free(spa->spa_spares.sav_vdevs[i]); 715 if (spa->spa_spares.sav_vdevs) { 716 kmem_free(spa->spa_spares.sav_vdevs, 717 spa->spa_spares.sav_count * sizeof (void *)); 718 spa->spa_spares.sav_vdevs = NULL; 719 } 720 if (spa->spa_spares.sav_config) { 721 nvlist_free(spa->spa_spares.sav_config); 722 spa->spa_spares.sav_config = NULL; 723 } 724 spa->spa_spares.sav_count = 0; 725 726 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 727 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 728 if (spa->spa_l2cache.sav_vdevs) { 729 kmem_free(spa->spa_l2cache.sav_vdevs, 730 spa->spa_l2cache.sav_count * sizeof (void *)); 731 spa->spa_l2cache.sav_vdevs = NULL; 732 } 733 if (spa->spa_l2cache.sav_config) { 734 nvlist_free(spa->spa_l2cache.sav_config); 735 spa->spa_l2cache.sav_config = NULL; 736 } 737 spa->spa_l2cache.sav_count = 0; 738 739 spa->spa_async_suspended = 0; 740 741 spa_config_exit(spa, SCL_ALL, FTAG); 742 } 743 744 /* 745 * Load (or re-load) the current list of vdevs describing the active spares for 746 * this pool. When this is called, we have some form of basic information in 747 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 748 * then re-generate a more complete list including status information. 749 */ 750 static void 751 spa_load_spares(spa_t *spa) 752 { 753 nvlist_t **spares; 754 uint_t nspares; 755 int i; 756 vdev_t *vd, *tvd; 757 758 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 759 760 /* 761 * First, close and free any existing spare vdevs. 
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
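 * Cache devices that fail to open are still committed to the configuration
 * (see spa_l2cache_add() below) so they are not silently dropped.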
863 * Devices which are already active have their details maintained, and are 864 * not re-opened. 865 */ 866 static void 867 spa_load_l2cache(spa_t *spa) 868 { 869 nvlist_t **l2cache; 870 uint_t nl2cache; 871 int i, j, oldnvdevs; 872 uint64_t guid, size; 873 vdev_t *vd, **oldvdevs, **newvdevs; 874 spa_aux_vdev_t *sav = &spa->spa_l2cache; 875 876 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 877 878 if (sav->sav_config != NULL) { 879 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 880 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 881 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 882 } else { 883 nl2cache = 0; 884 } 885 886 oldvdevs = sav->sav_vdevs; 887 oldnvdevs = sav->sav_count; 888 sav->sav_vdevs = NULL; 889 sav->sav_count = 0; 890 891 /* 892 * Process new nvlist of vdevs. 893 */ 894 for (i = 0; i < nl2cache; i++) { 895 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 896 &guid) == 0); 897 898 newvdevs[i] = NULL; 899 for (j = 0; j < oldnvdevs; j++) { 900 vd = oldvdevs[j]; 901 if (vd != NULL && guid == vd->vdev_guid) { 902 /* 903 * Retain previous vdev for add/remove ops. 904 */ 905 newvdevs[i] = vd; 906 oldvdevs[j] = NULL; 907 break; 908 } 909 } 910 911 if (newvdevs[i] == NULL) { 912 /* 913 * Create new vdev 914 */ 915 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 916 VDEV_ALLOC_L2CACHE) == 0); 917 ASSERT(vd != NULL); 918 newvdevs[i] = vd; 919 920 /* 921 * Commit this vdev as an l2cache device, 922 * even if it fails to open. 923 */ 924 spa_l2cache_add(vd); 925 926 vd->vdev_top = vd; 927 vd->vdev_aux = sav; 928 929 spa_l2cache_activate(vd); 930 931 if (vdev_open(vd) != 0) 932 continue; 933 934 (void) vdev_validate_aux(vd); 935 936 if (!vdev_is_dead(vd)) { 937 size = vdev_get_rsize(vd); 938 l2arc_add_vdev(spa, vd, 939 VDEV_LABEL_START_SIZE, 940 size - VDEV_LABEL_START_SIZE); 941 } 942 } 943 } 944 945 /* 946 * Purge vdevs that were dropped 947 */ 948 for (i = 0; i < oldnvdevs; i++) { 949 uint64_t pool; 950 951 vd = oldvdevs[i]; 952 if (vd != NULL) { 953 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 954 pool != 0ULL && l2arc_vdev_present(vd)) 955 l2arc_remove_vdev(vd); 956 (void) vdev_close(vd); 957 spa_l2cache_remove(vd); 958 } 959 } 960 961 if (oldvdevs) 962 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 963 964 if (sav->sav_config == NULL) 965 goto out; 966 967 sav->sav_vdevs = newvdevs; 968 sav->sav_count = (int)nl2cache; 969 970 /* 971 * Recompute the stashed list of l2cache devices, with status 972 * information this time. 
973 */ 974 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 975 DATA_TYPE_NVLIST_ARRAY) == 0); 976 977 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 978 for (i = 0; i < sav->sav_count; i++) 979 l2cache[i] = vdev_config_generate(spa, 980 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 981 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 982 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 983 out: 984 for (i = 0; i < sav->sav_count; i++) 985 nvlist_free(l2cache[i]); 986 if (sav->sav_count) 987 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 988 } 989 990 static int 991 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 992 { 993 dmu_buf_t *db; 994 char *packed = NULL; 995 size_t nvsize = 0; 996 int error; 997 *value = NULL; 998 999 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1000 nvsize = *(uint64_t *)db->db_data; 1001 dmu_buf_rele(db, FTAG); 1002 1003 packed = kmem_alloc(nvsize, KM_SLEEP); 1004 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 1005 if (error == 0) 1006 error = nvlist_unpack(packed, nvsize, value, 0); 1007 kmem_free(packed, nvsize); 1008 1009 return (error); 1010 } 1011 1012 /* 1013 * Checks to see if the given vdev could not be opened, in which case we post a 1014 * sysevent to notify the autoreplace code that the device has been removed. 1015 */ 1016 static void 1017 spa_check_removed(vdev_t *vd) 1018 { 1019 int c; 1020 1021 for (c = 0; c < vd->vdev_children; c++) 1022 spa_check_removed(vd->vdev_child[c]); 1023 1024 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1025 zfs_post_autoreplace(vd->vdev_spa, vd); 1026 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1027 } 1028 } 1029 1030 /* 1031 * Check for missing log devices 1032 */ 1033 int 1034 spa_check_logs(spa_t *spa) 1035 { 1036 switch (spa->spa_log_state) { 1037 case SPA_LOG_MISSING: 1038 /* need to recheck in case slog has been restored */ 1039 case SPA_LOG_UNKNOWN: 1040 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1041 DS_FIND_CHILDREN)) { 1042 spa->spa_log_state = SPA_LOG_MISSING; 1043 return (1); 1044 } 1045 break; 1046 1047 case SPA_LOG_CLEAR: 1048 (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, 1049 DS_FIND_CHILDREN); 1050 break; 1051 } 1052 spa->spa_log_state = SPA_LOG_GOOD; 1053 return (0); 1054 } 1055 1056 /* 1057 * Load an existing storage pool, using the pool's builtin spa_config as a 1058 * source of configuration information. 1059 */ 1060 static int 1061 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 1062 { 1063 int error = 0; 1064 nvlist_t *nvroot = NULL; 1065 vdev_t *rvd; 1066 uberblock_t *ub = &spa->spa_uberblock; 1067 uint64_t config_cache_txg = spa->spa_config_txg; 1068 uint64_t pool_guid; 1069 uint64_t version; 1070 uint64_t autoreplace = 0; 1071 int orig_mode = spa->spa_mode; 1072 char *ereport = FM_EREPORT_ZFS_POOL; 1073 1074 /* 1075 * If this is an untrusted config, access the pool in read-only mode. 1076 * This prevents things like resilvering recently removed devices. 
1077 */ 1078 if (!mosconfig) 1079 spa->spa_mode = FREAD; 1080 1081 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1082 1083 spa->spa_load_state = state; 1084 1085 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 1086 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 1087 error = EINVAL; 1088 goto out; 1089 } 1090 1091 /* 1092 * Versioning wasn't explicitly added to the label until later, so if 1093 * it's not present treat it as the initial version. 1094 */ 1095 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 1096 version = SPA_VERSION_INITIAL; 1097 1098 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1099 &spa->spa_config_txg); 1100 1101 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1102 spa_guid_exists(pool_guid, 0)) { 1103 error = EEXIST; 1104 goto out; 1105 } 1106 1107 spa->spa_load_guid = pool_guid; 1108 1109 /* 1110 * Parse the configuration into a vdev tree. We explicitly set the 1111 * value that will be returned by spa_version() since parsing the 1112 * configuration requires knowing the version number. 1113 */ 1114 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1115 spa->spa_ubsync.ub_version = version; 1116 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1117 spa_config_exit(spa, SCL_ALL, FTAG); 1118 1119 if (error != 0) 1120 goto out; 1121 1122 ASSERT(spa->spa_root_vdev == rvd); 1123 ASSERT(spa_guid(spa) == pool_guid); 1124 1125 /* 1126 * Try to open all vdevs, loading each label in the process. 1127 */ 1128 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1129 error = vdev_open(rvd); 1130 spa_config_exit(spa, SCL_ALL, FTAG); 1131 if (error != 0) 1132 goto out; 1133 1134 /* 1135 * Validate the labels for all leaf vdevs. We need to grab the config 1136 * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. 1137 */ 1138 if (mosconfig) { 1139 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1140 error = vdev_validate(rvd); 1141 spa_config_exit(spa, SCL_ALL, FTAG); 1142 if (error != 0) 1143 goto out; 1144 } 1145 1146 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1147 error = ENXIO; 1148 goto out; 1149 } 1150 1151 /* 1152 * Find the best uberblock. 1153 */ 1154 vdev_uberblock_load(NULL, rvd, ub); 1155 1156 /* 1157 * If we weren't able to find a single valid uberblock, return failure. 1158 */ 1159 if (ub->ub_txg == 0) { 1160 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1161 VDEV_AUX_CORRUPT_DATA); 1162 error = ENXIO; 1163 goto out; 1164 } 1165 1166 /* 1167 * If the pool is newer than the code, we can't open it. 1168 */ 1169 if (ub->ub_version > SPA_VERSION) { 1170 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1171 VDEV_AUX_VERSION_NEWER); 1172 error = ENOTSUP; 1173 goto out; 1174 } 1175 1176 /* 1177 * If the vdev guid sum doesn't match the uberblock, we have an 1178 * incomplete configuration. 1179 */ 1180 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1181 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1182 VDEV_AUX_BAD_GUID_SUM); 1183 error = ENXIO; 1184 goto out; 1185 } 1186 1187 /* 1188 * Initialize internal SPA structures. 
1189 */ 1190 spa->spa_state = POOL_STATE_ACTIVE; 1191 spa->spa_ubsync = spa->spa_uberblock; 1192 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1193 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1194 if (error) { 1195 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1196 VDEV_AUX_CORRUPT_DATA); 1197 goto out; 1198 } 1199 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1200 1201 if (zap_lookup(spa->spa_meta_objset, 1202 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1203 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1204 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1205 VDEV_AUX_CORRUPT_DATA); 1206 error = EIO; 1207 goto out; 1208 } 1209 1210 if (!mosconfig) { 1211 nvlist_t *newconfig; 1212 uint64_t hostid; 1213 1214 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 1215 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1216 VDEV_AUX_CORRUPT_DATA); 1217 error = EIO; 1218 goto out; 1219 } 1220 1221 if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, 1222 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1223 char *hostname; 1224 unsigned long myhostid = 0; 1225 1226 VERIFY(nvlist_lookup_string(newconfig, 1227 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1228 1229 #ifdef _KERNEL 1230 myhostid = zone_get_hostid(NULL); 1231 #else /* _KERNEL */ 1232 /* 1233 * We're emulating the system's hostid in userland, so 1234 * we can't use zone_get_hostid(). 1235 */ 1236 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1237 #endif /* _KERNEL */ 1238 if (hostid != 0 && myhostid != 0 && 1239 hostid != myhostid) { 1240 cmn_err(CE_WARN, "pool '%s' could not be " 1241 "loaded as it was last accessed by " 1242 "another system (host: %s hostid: 0x%lx). " 1243 "See: http://www.sun.com/msg/ZFS-8000-EY", 1244 spa_name(spa), hostname, 1245 (unsigned long)hostid); 1246 error = EBADF; 1247 goto out; 1248 } 1249 } 1250 1251 spa_config_set(spa, newconfig); 1252 spa_unload(spa); 1253 spa_deactivate(spa); 1254 spa_activate(spa, orig_mode); 1255 1256 return (spa_load(spa, newconfig, state, B_TRUE)); 1257 } 1258 1259 if (zap_lookup(spa->spa_meta_objset, 1260 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1261 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1262 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1263 VDEV_AUX_CORRUPT_DATA); 1264 error = EIO; 1265 goto out; 1266 } 1267 1268 /* 1269 * Load the bit that tells us to use the new accounting function 1270 * (raid-z deflation). If we have an older pool, this will not 1271 * be present. 1272 */ 1273 error = zap_lookup(spa->spa_meta_objset, 1274 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1275 sizeof (uint64_t), 1, &spa->spa_deflate); 1276 if (error != 0 && error != ENOENT) { 1277 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1278 VDEV_AUX_CORRUPT_DATA); 1279 error = EIO; 1280 goto out; 1281 } 1282 1283 /* 1284 * Load the persistent error log. If we have an older pool, this will 1285 * not be present. 
1286 */ 1287 error = zap_lookup(spa->spa_meta_objset, 1288 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1289 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1290 if (error != 0 && error != ENOENT) { 1291 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1292 VDEV_AUX_CORRUPT_DATA); 1293 error = EIO; 1294 goto out; 1295 } 1296 1297 error = zap_lookup(spa->spa_meta_objset, 1298 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1299 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1300 if (error != 0 && error != ENOENT) { 1301 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1302 VDEV_AUX_CORRUPT_DATA); 1303 error = EIO; 1304 goto out; 1305 } 1306 1307 /* 1308 * Load the history object. If we have an older pool, this 1309 * will not be present. 1310 */ 1311 error = zap_lookup(spa->spa_meta_objset, 1312 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1313 sizeof (uint64_t), 1, &spa->spa_history); 1314 if (error != 0 && error != ENOENT) { 1315 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1316 VDEV_AUX_CORRUPT_DATA); 1317 error = EIO; 1318 goto out; 1319 } 1320 1321 /* 1322 * Load any hot spares for this pool. 1323 */ 1324 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1325 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1326 if (error != 0 && error != ENOENT) { 1327 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1328 VDEV_AUX_CORRUPT_DATA); 1329 error = EIO; 1330 goto out; 1331 } 1332 if (error == 0) { 1333 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1334 if (load_nvlist(spa, spa->spa_spares.sav_object, 1335 &spa->spa_spares.sav_config) != 0) { 1336 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1337 VDEV_AUX_CORRUPT_DATA); 1338 error = EIO; 1339 goto out; 1340 } 1341 1342 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1343 spa_load_spares(spa); 1344 spa_config_exit(spa, SCL_ALL, FTAG); 1345 } 1346 1347 /* 1348 * Load any level 2 ARC devices for this pool. 
1349 */ 1350 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1351 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1352 &spa->spa_l2cache.sav_object); 1353 if (error != 0 && error != ENOENT) { 1354 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1355 VDEV_AUX_CORRUPT_DATA); 1356 error = EIO; 1357 goto out; 1358 } 1359 if (error == 0) { 1360 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1361 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1362 &spa->spa_l2cache.sav_config) != 0) { 1363 vdev_set_state(rvd, B_TRUE, 1364 VDEV_STATE_CANT_OPEN, 1365 VDEV_AUX_CORRUPT_DATA); 1366 error = EIO; 1367 goto out; 1368 } 1369 1370 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1371 spa_load_l2cache(spa); 1372 spa_config_exit(spa, SCL_ALL, FTAG); 1373 } 1374 1375 if (spa_check_logs(spa)) { 1376 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1377 VDEV_AUX_BAD_LOG); 1378 error = ENXIO; 1379 ereport = FM_EREPORT_ZFS_LOG_REPLAY; 1380 goto out; 1381 } 1382 1383 1384 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1385 1386 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1387 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1388 1389 if (error && error != ENOENT) { 1390 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1391 VDEV_AUX_CORRUPT_DATA); 1392 error = EIO; 1393 goto out; 1394 } 1395 1396 if (error == 0) { 1397 (void) zap_lookup(spa->spa_meta_objset, 1398 spa->spa_pool_props_object, 1399 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1400 sizeof (uint64_t), 1, &spa->spa_bootfs); 1401 (void) zap_lookup(spa->spa_meta_objset, 1402 spa->spa_pool_props_object, 1403 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1404 sizeof (uint64_t), 1, &autoreplace); 1405 (void) zap_lookup(spa->spa_meta_objset, 1406 spa->spa_pool_props_object, 1407 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1408 sizeof (uint64_t), 1, &spa->spa_delegation); 1409 (void) zap_lookup(spa->spa_meta_objset, 1410 spa->spa_pool_props_object, 1411 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1412 sizeof (uint64_t), 1, &spa->spa_failmode); 1413 } 1414 1415 /* 1416 * If the 'autoreplace' property is set, then post a resource notifying 1417 * the ZFS DE that it should not issue any faults for unopenable 1418 * devices. We also iterate over the vdevs, and post a sysevent for any 1419 * unopenable vdevs so that the normal autoreplace handler can take 1420 * over. 1421 */ 1422 if (autoreplace && state != SPA_LOAD_TRYIMPORT) 1423 spa_check_removed(spa->spa_root_vdev); 1424 1425 /* 1426 * Load the vdev state for all toplevel vdevs. 1427 */ 1428 vdev_load(rvd); 1429 1430 /* 1431 * Propagate the leaf DTLs we just loaded all the way up the tree. 1432 */ 1433 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1434 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1435 spa_config_exit(spa, SCL_ALL, FTAG); 1436 1437 /* 1438 * Check the state of the root vdev. If it can't be opened, it 1439 * indicates one or more toplevel vdevs are faulted. 1440 */ 1441 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1442 error = ENXIO; 1443 goto out; 1444 } 1445 1446 if (spa_writeable(spa)) { 1447 dmu_tx_t *tx; 1448 int need_update = B_FALSE; 1449 1450 ASSERT(state != SPA_LOAD_TRYIMPORT); 1451 1452 /* 1453 * Claim log blocks that haven't been committed yet. 1454 * This must all happen in a single txg. 
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), one of the vdevs indicates that the pool
			 * has been exported or destroyed. If this is the
			 * case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
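			 * (The config cache is rewritten below via
			 * spa_config_sync() so it no longer lists this pool.)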
1557 */ 1558 spa_unload(spa); 1559 spa_deactivate(spa); 1560 spa_config_sync(spa, B_TRUE, B_TRUE); 1561 spa_remove(spa); 1562 if (locked) 1563 mutex_exit(&spa_namespace_lock); 1564 return (ENOENT); 1565 } 1566 1567 if (error) { 1568 /* 1569 * We can't open the pool, but we still have useful 1570 * information: the state of each vdev after the 1571 * attempted vdev_open(). Return this to the user. 1572 */ 1573 if (config != NULL && spa->spa_root_vdev != NULL) 1574 *config = spa_config_generate(spa, NULL, -1ULL, 1575 B_TRUE); 1576 spa_unload(spa); 1577 spa_deactivate(spa); 1578 spa->spa_last_open_failed = B_TRUE; 1579 if (locked) 1580 mutex_exit(&spa_namespace_lock); 1581 *spapp = NULL; 1582 return (error); 1583 } else { 1584 spa->spa_last_open_failed = B_FALSE; 1585 } 1586 } 1587 1588 spa_open_ref(spa, tag); 1589 1590 if (locked) 1591 mutex_exit(&spa_namespace_lock); 1592 1593 *spapp = spa; 1594 1595 if (config != NULL) 1596 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1597 1598 return (0); 1599 } 1600 1601 int 1602 spa_open(const char *name, spa_t **spapp, void *tag) 1603 { 1604 return (spa_open_common(name, spapp, tag, NULL)); 1605 } 1606 1607 /* 1608 * Lookup the given spa_t, incrementing the inject count in the process, 1609 * preventing it from being exported or destroyed. 1610 */ 1611 spa_t * 1612 spa_inject_addref(char *name) 1613 { 1614 spa_t *spa; 1615 1616 mutex_enter(&spa_namespace_lock); 1617 if ((spa = spa_lookup(name)) == NULL) { 1618 mutex_exit(&spa_namespace_lock); 1619 return (NULL); 1620 } 1621 spa->spa_inject_ref++; 1622 mutex_exit(&spa_namespace_lock); 1623 1624 return (spa); 1625 } 1626 1627 void 1628 spa_inject_delref(spa_t *spa) 1629 { 1630 mutex_enter(&spa_namespace_lock); 1631 spa->spa_inject_ref--; 1632 mutex_exit(&spa_namespace_lock); 1633 } 1634 1635 /* 1636 * Add spares device information to the nvlist. 1637 */ 1638 static void 1639 spa_add_spares(spa_t *spa, nvlist_t *config) 1640 { 1641 nvlist_t **spares; 1642 uint_t i, nspares; 1643 nvlist_t *nvroot; 1644 uint64_t guid; 1645 vdev_stat_t *vs; 1646 uint_t vsc; 1647 uint64_t pool; 1648 1649 if (spa->spa_spares.sav_count == 0) 1650 return; 1651 1652 VERIFY(nvlist_lookup_nvlist(config, 1653 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1654 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1655 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1656 if (nspares != 0) { 1657 VERIFY(nvlist_add_nvlist_array(nvroot, 1658 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1659 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1660 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1661 1662 /* 1663 * Go through and find any spares which have since been 1664 * repurposed as an active spare. If this is the case, update 1665 * their status appropriately. 1666 */ 1667 for (i = 0; i < nspares; i++) { 1668 VERIFY(nvlist_lookup_uint64(spares[i], 1669 ZPOOL_CONFIG_GUID, &guid) == 0); 1670 if (spa_spare_exists(guid, &pool, NULL) && 1671 pool != 0ULL) { 1672 VERIFY(nvlist_lookup_uint64_array( 1673 spares[i], ZPOOL_CONFIG_STATS, 1674 (uint64_t **)&vs, &vsc) == 0); 1675 vs->vs_state = VDEV_STATE_CANT_OPEN; 1676 vs->vs_aux = VDEV_AUX_SPARED; 1677 } 1678 } 1679 } 1680 } 1681 1682 /* 1683 * Add l2cache device information to the nvlist, including vdev stats. 
1684 */ 1685 static void 1686 spa_add_l2cache(spa_t *spa, nvlist_t *config) 1687 { 1688 nvlist_t **l2cache; 1689 uint_t i, j, nl2cache; 1690 nvlist_t *nvroot; 1691 uint64_t guid; 1692 vdev_t *vd; 1693 vdev_stat_t *vs; 1694 uint_t vsc; 1695 1696 if (spa->spa_l2cache.sav_count == 0) 1697 return; 1698 1699 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 1700 1701 VERIFY(nvlist_lookup_nvlist(config, 1702 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1703 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1704 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1705 if (nl2cache != 0) { 1706 VERIFY(nvlist_add_nvlist_array(nvroot, 1707 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1708 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1709 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1710 1711 /* 1712 * Update level 2 cache device stats. 1713 */ 1714 1715 for (i = 0; i < nl2cache; i++) { 1716 VERIFY(nvlist_lookup_uint64(l2cache[i], 1717 ZPOOL_CONFIG_GUID, &guid) == 0); 1718 1719 vd = NULL; 1720 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1721 if (guid == 1722 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1723 vd = spa->spa_l2cache.sav_vdevs[j]; 1724 break; 1725 } 1726 } 1727 ASSERT(vd != NULL); 1728 1729 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1730 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1731 vdev_get_stats(vd, vs); 1732 } 1733 } 1734 1735 spa_config_exit(spa, SCL_CONFIG, FTAG); 1736 } 1737 1738 int 1739 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1740 { 1741 int error; 1742 spa_t *spa; 1743 1744 *config = NULL; 1745 error = spa_open_common(name, &spa, FTAG, config); 1746 1747 if (spa && *config != NULL) { 1748 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1749 spa_get_errlog_size(spa)) == 0); 1750 1751 if (spa_suspended(spa)) 1752 VERIFY(nvlist_add_uint64(*config, 1753 ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); 1754 1755 spa_add_spares(spa, *config); 1756 spa_add_l2cache(spa, *config); 1757 } 1758 1759 /* 1760 * We want to get the alternate root even for faulted pools, so we cheat 1761 * and call spa_lookup() directly. 1762 */ 1763 if (altroot) { 1764 if (spa == NULL) { 1765 mutex_enter(&spa_namespace_lock); 1766 spa = spa_lookup(name); 1767 if (spa) 1768 spa_altroot(spa, altroot, buflen); 1769 else 1770 altroot[0] = '\0'; 1771 spa = NULL; 1772 mutex_exit(&spa_namespace_lock); 1773 } else { 1774 spa_altroot(spa, altroot, buflen); 1775 } 1776 } 1777 1778 if (spa != NULL) 1779 spa_close(spa, FTAG); 1780 1781 return (error); 1782 } 1783 1784 /* 1785 * Validate that the auxiliary device array is well formed. We must have an 1786 * array of nvlists, each which describes a valid leaf vdev. If this is an 1787 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 1788 * specified, as long as they are well-formed. 1789 */ 1790 static int 1791 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 1792 spa_aux_vdev_t *sav, const char *config, uint64_t version, 1793 vdev_labeltype_t label) 1794 { 1795 nvlist_t **dev; 1796 uint_t i, ndev; 1797 vdev_t *vd; 1798 int error; 1799 1800 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1801 1802 /* 1803 * It's acceptable to have no devs specified. 1804 */ 1805 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 1806 return (0); 1807 1808 if (ndev == 0) 1809 return (EINVAL); 1810 1811 /* 1812 * Make sure the pool is formatted with a version that supports this 1813 * device type. 
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context. For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
1926 */ 1927 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 1928 KM_SLEEP) == 0); 1929 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 1930 devs, ndevs) == 0); 1931 } 1932 } 1933 1934 /* 1935 * Stop and drop level 2 ARC devices 1936 */ 1937 void 1938 spa_l2cache_drop(spa_t *spa) 1939 { 1940 vdev_t *vd; 1941 int i; 1942 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1943 1944 for (i = 0; i < sav->sav_count; i++) { 1945 uint64_t pool; 1946 1947 vd = sav->sav_vdevs[i]; 1948 ASSERT(vd != NULL); 1949 1950 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1951 pool != 0ULL && l2arc_vdev_present(vd)) 1952 l2arc_remove_vdev(vd); 1953 if (vd->vdev_isl2cache) 1954 spa_l2cache_remove(vd); 1955 vdev_clear_stats(vd); 1956 (void) vdev_close(vd); 1957 } 1958 } 1959 1960 /* 1961 * Pool Creation 1962 */ 1963 int 1964 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 1965 const char *history_str, nvlist_t *zplprops) 1966 { 1967 spa_t *spa; 1968 char *altroot = NULL; 1969 vdev_t *rvd; 1970 dsl_pool_t *dp; 1971 dmu_tx_t *tx; 1972 int c, error = 0; 1973 uint64_t txg = TXG_INITIAL; 1974 nvlist_t **spares, **l2cache; 1975 uint_t nspares, nl2cache; 1976 uint64_t version; 1977 1978 /* 1979 * If this pool already exists, return failure. 1980 */ 1981 mutex_enter(&spa_namespace_lock); 1982 if (spa_lookup(pool) != NULL) { 1983 mutex_exit(&spa_namespace_lock); 1984 return (EEXIST); 1985 } 1986 1987 /* 1988 * Allocate a new spa_t structure. 1989 */ 1990 (void) nvlist_lookup_string(props, 1991 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 1992 spa = spa_add(pool, altroot); 1993 spa_activate(spa, spa_mode_global); 1994 1995 spa->spa_uberblock.ub_txg = txg - 1; 1996 1997 if (props && (error = spa_prop_validate(spa, props))) { 1998 spa_unload(spa); 1999 spa_deactivate(spa); 2000 spa_remove(spa); 2001 mutex_exit(&spa_namespace_lock); 2002 return (error); 2003 } 2004 2005 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2006 &version) != 0) 2007 version = SPA_VERSION; 2008 ASSERT(version <= SPA_VERSION); 2009 spa->spa_uberblock.ub_version = version; 2010 spa->spa_ubsync = spa->spa_uberblock; 2011 2012 /* 2013 * Create the root vdev. 2014 */ 2015 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2016 2017 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2018 2019 ASSERT(error != 0 || rvd != NULL); 2020 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2021 2022 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2023 error = EINVAL; 2024 2025 if (error == 0 && 2026 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2027 (error = spa_validate_aux(spa, nvroot, txg, 2028 VDEV_ALLOC_ADD)) == 0) { 2029 for (c = 0; c < rvd->vdev_children; c++) 2030 vdev_init(rvd->vdev_child[c], txg); 2031 vdev_config_dirty(rvd); 2032 } 2033 2034 spa_config_exit(spa, SCL_ALL, FTAG); 2035 2036 if (error != 0) { 2037 spa_unload(spa); 2038 spa_deactivate(spa); 2039 spa_remove(spa); 2040 mutex_exit(&spa_namespace_lock); 2041 return (error); 2042 } 2043 2044 /* 2045 * Get the list of spares, if specified. 
2046 */ 2047 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2048 &spares, &nspares) == 0) { 2049 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2050 KM_SLEEP) == 0); 2051 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2052 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2053 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2054 spa_load_spares(spa); 2055 spa_config_exit(spa, SCL_ALL, FTAG); 2056 spa->spa_spares.sav_sync = B_TRUE; 2057 } 2058 2059 /* 2060 * Get the list of level 2 cache devices, if specified. 2061 */ 2062 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2063 &l2cache, &nl2cache) == 0) { 2064 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2065 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2066 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2067 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2068 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2069 spa_load_l2cache(spa); 2070 spa_config_exit(spa, SCL_ALL, FTAG); 2071 spa->spa_l2cache.sav_sync = B_TRUE; 2072 } 2073 2074 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2075 spa->spa_meta_objset = dp->dp_meta_objset; 2076 2077 tx = dmu_tx_create_assigned(dp, txg); 2078 2079 /* 2080 * Create the pool config object. 2081 */ 2082 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2083 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2084 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2085 2086 if (zap_add(spa->spa_meta_objset, 2087 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2088 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2089 cmn_err(CE_PANIC, "failed to add pool config"); 2090 } 2091 2092 /* Newly created pools with the right version are always deflated. */ 2093 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2094 spa->spa_deflate = TRUE; 2095 if (zap_add(spa->spa_meta_objset, 2096 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2097 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2098 cmn_err(CE_PANIC, "failed to add deflate"); 2099 } 2100 } 2101 2102 /* 2103 * Create the deferred-free bplist object. Turn off compression 2104 * because sync-to-convergence takes longer if the blocksize 2105 * keeps changing. 2106 */ 2107 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2108 1 << 14, tx); 2109 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2110 ZIO_COMPRESS_OFF, tx); 2111 2112 if (zap_add(spa->spa_meta_objset, 2113 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2114 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2115 cmn_err(CE_PANIC, "failed to add bplist"); 2116 } 2117 2118 /* 2119 * Create the pool's history object. 2120 */ 2121 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2122 spa_history_create_obj(spa, tx); 2123 2124 /* 2125 * Set pool properties. 2126 */ 2127 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2128 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2129 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2130 if (props != NULL) { 2131 spa_configfile_set(spa, props, B_FALSE); 2132 spa_sync_props(spa, props, CRED(), tx); 2133 } 2134 2135 dmu_tx_commit(tx); 2136 2137 spa->spa_sync_on = B_TRUE; 2138 txg_sync_start(spa->spa_dsl_pool); 2139 2140 /* 2141 * We explicitly wait for the first transaction to complete so that our 2142 * bean counters are appropriately updated. 
2143 */ 2144 txg_wait_synced(spa->spa_dsl_pool, txg); 2145 2146 spa_config_sync(spa, B_FALSE, B_TRUE); 2147 2148 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2149 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2150 2151 spa->spa_minref = refcount_count(&spa->spa_refcount); 2152 2153 mutex_exit(&spa_namespace_lock); 2154 2155 return (0); 2156 } 2157 2158 /* 2159 * Import the given pool into the system. We set up the necessary spa_t and 2160 * then call spa_load() to do the dirty work. 2161 */ 2162 static int 2163 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, 2164 boolean_t isroot, boolean_t allowfaulted) 2165 { 2166 spa_t *spa; 2167 char *altroot = NULL; 2168 int error, loaderr; 2169 nvlist_t *nvroot; 2170 nvlist_t **spares, **l2cache; 2171 uint_t nspares, nl2cache; 2172 2173 /* 2174 * If a pool with this name exists, return failure. 2175 */ 2176 mutex_enter(&spa_namespace_lock); 2177 if ((spa = spa_lookup(pool)) != NULL) { 2178 if (isroot) { 2179 /* 2180 * Remove the existing root pool from the 2181 * namespace so that we can replace it with 2182 * the correct config we just read in. 2183 */ 2184 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 2185 spa_remove(spa); 2186 } else { 2187 mutex_exit(&spa_namespace_lock); 2188 return (EEXIST); 2189 } 2190 } 2191 2192 /* 2193 * Create and initialize the spa structure. 2194 */ 2195 (void) nvlist_lookup_string(props, 2196 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2197 spa = spa_add(pool, altroot); 2198 spa_activate(spa, spa_mode_global); 2199 2200 if (allowfaulted) 2201 spa->spa_import_faulted = B_TRUE; 2202 spa->spa_is_root = isroot; 2203 2204 /* 2205 * Pass off the heavy lifting to spa_load(). 2206 * Pass TRUE for mosconfig (unless this is a root pool) because 2207 * the user-supplied config is actually the one to trust when 2208 * doing an import. 2209 */ 2210 loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); 2211 2212 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2213 /* 2214 * Toss any existing sparelist, as it doesn't have any validity anymore, 2215 * and conflicts with spa_has_spare(). 2216 */ 2217 if (!isroot && spa->spa_spares.sav_config) { 2218 nvlist_free(spa->spa_spares.sav_config); 2219 spa->spa_spares.sav_config = NULL; 2220 spa_load_spares(spa); 2221 } 2222 if (!isroot && spa->spa_l2cache.sav_config) { 2223 nvlist_free(spa->spa_l2cache.sav_config); 2224 spa->spa_l2cache.sav_config = NULL; 2225 spa_load_l2cache(spa); 2226 } 2227 2228 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2229 &nvroot) == 0); 2230 if (error == 0) 2231 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); 2232 if (error == 0) 2233 error = spa_validate_aux(spa, nvroot, -1ULL, 2234 VDEV_ALLOC_L2CACHE); 2235 spa_config_exit(spa, SCL_ALL, FTAG); 2236 2237 if (props != NULL) 2238 spa_configfile_set(spa, props, B_FALSE); 2239 2240 if (error != 0 || (props && spa_writeable(spa) && 2241 (error = spa_prop_set(spa, props)))) { 2242 if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { 2243 /* 2244 * If we failed to load the pool, but 'allowfaulted' is 2245 * set, then manually set the config as if the config 2246 * passed in was specified in the cache file. 
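 *
 * (Editorial note, not in the original comment: the effect of this branch
 * is that the spa_t stays in the namespace, unloaded but not removed, and
 * its configuration is written to the cache file below, so the pool stays
 * visible in a faulted state instead of being torn down as in the else
 * branch.)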
2247 */ 2248 error = 0; 2249 spa->spa_import_faulted = B_FALSE; 2250 if (spa->spa_config == NULL) 2251 spa->spa_config = spa_config_generate(spa, 2252 NULL, -1ULL, B_TRUE); 2253 spa_unload(spa); 2254 spa_deactivate(spa); 2255 spa_config_sync(spa, B_FALSE, B_TRUE); 2256 } else { 2257 spa_unload(spa); 2258 spa_deactivate(spa); 2259 spa_remove(spa); 2260 } 2261 mutex_exit(&spa_namespace_lock); 2262 return (error); 2263 } 2264 2265 /* 2266 * Override any spares and level 2 cache devices as specified by 2267 * the user, as these may have correct device names/devids, etc. 2268 */ 2269 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2270 &spares, &nspares) == 0) { 2271 if (spa->spa_spares.sav_config) 2272 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2273 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2274 else 2275 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2276 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2277 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2278 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2279 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2280 spa_load_spares(spa); 2281 spa_config_exit(spa, SCL_ALL, FTAG); 2282 spa->spa_spares.sav_sync = B_TRUE; 2283 } 2284 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2285 &l2cache, &nl2cache) == 0) { 2286 if (spa->spa_l2cache.sav_config) 2287 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2288 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2289 else 2290 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2291 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2292 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2293 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2294 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2295 spa_load_l2cache(spa); 2296 spa_config_exit(spa, SCL_ALL, FTAG); 2297 spa->spa_l2cache.sav_sync = B_TRUE; 2298 } 2299 2300 if (spa_writeable(spa)) { 2301 /* 2302 * Update the config cache to include the newly-imported pool. 2303 */ 2304 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); 2305 } 2306 2307 spa->spa_import_faulted = B_FALSE; 2308 mutex_exit(&spa_namespace_lock); 2309 2310 return (0); 2311 } 2312 2313 #ifdef _KERNEL 2314 /* 2315 * Build a "root" vdev for a top level vdev read in from a rootpool 2316 * device label. 2317 */ 2318 static void 2319 spa_build_rootpool_config(nvlist_t *config) 2320 { 2321 nvlist_t *nvtop, *nvroot; 2322 uint64_t pgid; 2323 2324 /* 2325 * Add this top-level vdev to the child array. 2326 */ 2327 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2328 == 0); 2329 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2330 == 0); 2331 2332 /* 2333 * Put this pool's top-level vdevs into a root vdev. 2334 */ 2335 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2336 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2337 == 0); 2338 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2339 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2340 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2341 &nvtop, 1) == 0); 2342 2343 /* 2344 * Replace the existing vdev_tree with the new root vdev in 2345 * this pool's configuration (remove the old, add the new). 2346 */ 2347 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2348 nvlist_free(nvroot); 2349 } 2350 2351 /* 2352 * Get the root pool information from the root disk, then import the root pool 2353 * during the system boot up time. 
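 *
 * Editorial summary of the boot-time flow implemented below (not part of
 * the original comment):
 *
 *	spa_import_rootpool()
 *	    spa_get_rootconf()          - pick the label with the highest txg
 *	      spa_check_rootconf()      - read one device's label and txg
 *	        vdev_disk_read_rootlabel()
 *	    spa_build_rootpool_config() - wrap the vdev tree in a "root" vdev
 *	    spa_import_common()         - import the result as the root pool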
2354 */ 2355 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2356 2357 int 2358 spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2359 uint64_t *besttxg) 2360 { 2361 nvlist_t *config; 2362 uint64_t txg; 2363 int error; 2364 2365 if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) 2366 return (error); 2367 2368 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2369 2370 if (bestconf != NULL) 2371 *bestconf = config; 2372 else 2373 nvlist_free(config); 2374 *besttxg = txg; 2375 return (0); 2376 } 2377 2378 boolean_t 2379 spa_rootdev_validate(nvlist_t *nv) 2380 { 2381 uint64_t ival; 2382 2383 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2384 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2385 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2386 return (B_FALSE); 2387 2388 return (B_TRUE); 2389 } 2390 2391 2392 /* 2393 * Given the boot device's physical path or devid, check if the device 2394 * is in a valid state. If so, return the configuration from the vdev 2395 * label. 2396 */ 2397 int 2398 spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) 2399 { 2400 nvlist_t *conf = NULL; 2401 uint64_t txg = 0; 2402 nvlist_t *nvtop, **child; 2403 char *type; 2404 char *bootpath = NULL; 2405 uint_t children, c; 2406 char *tmp; 2407 int error; 2408 2409 if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) 2410 *tmp = '\0'; 2411 if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { 2412 cmn_err(CE_NOTE, "error reading device label"); 2413 return (error); 2414 } 2415 if (txg == 0) { 2416 cmn_err(CE_NOTE, "this device is detached"); 2417 nvlist_free(conf); 2418 return (EINVAL); 2419 } 2420 2421 VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, 2422 &nvtop) == 0); 2423 VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); 2424 2425 if (strcmp(type, VDEV_TYPE_DISK) == 0) { 2426 if (spa_rootdev_validate(nvtop)) { 2427 goto out; 2428 } else { 2429 nvlist_free(conf); 2430 return (EINVAL); 2431 } 2432 } 2433 2434 ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); 2435 2436 VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, 2437 &child, &children) == 0); 2438 2439 /* 2440 * Go thru vdevs in the mirror to see if the given device 2441 * has the most recent txg. Only the device with the most 2442 * recent txg has valid information and should be booted. 2443 */ 2444 for (c = 0; c < children; c++) { 2445 char *cdevid, *cpath; 2446 uint64_t tmptxg; 2447 2448 cpath = NULL; 2449 cdevid = NULL; 2450 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, 2451 &cpath) != 0 && nvlist_lookup_string(child[c], 2452 ZPOOL_CONFIG_DEVID, &cdevid) != 0) 2453 return (EINVAL); 2454 if ((spa_check_rootconf(cpath, cdevid, NULL, 2455 &tmptxg) == 0) && (tmptxg > txg)) { 2456 txg = tmptxg; 2457 VERIFY(nvlist_lookup_string(child[c], 2458 ZPOOL_CONFIG_PATH, &bootpath) == 0); 2459 } 2460 } 2461 2462 /* Does the best device match the one we've booted from? */ 2463 if (bootpath) { 2464 cmn_err(CE_NOTE, "try booting from '%s'", bootpath); 2465 return (EINVAL); 2466 } 2467 out: 2468 *bestconf = conf; 2469 return (0); 2470 } 2471 2472 /* 2473 * Import a root pool. 2474 * 2475 * For x86. devpath_list will consist of devid and/or physpath name of 2476 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2477 * The GRUB "findroot" command will return the vdev we should boot. 
2478 *
2479 * For SPARC, devpath_list consists of the physpath name of the booting device,
2480 * whether the root pool is a single-device pool or a mirrored pool.
2481 * e.g.
2482 * "/pci@1f,0/ide@d/disk@0,0:a"
2483 */
2484 int
2485 spa_import_rootpool(char *devpath, char *devid)
2486 {
2487 nvlist_t *conf = NULL;
2488 char *pname;
2489 int error;
2490
2491 /*
2492 * Get the vdev pathname and configuration from the most
2493 * recently updated vdev (highest txg).
2494 */
2495 if (error = spa_get_rootconf(devpath, devid, &conf))
2496 goto msg_out;
2497
2498 /*
2499 * Add type "root" vdev to the config.
2500 */
2501 spa_build_rootpool_config(conf);
2502
2503 VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
2504
2505 /*
2506 * We specify 'allowfaulted' for this to be treated like spa_open()
2507 * instead of spa_import(). This prevents us from marking vdevs as
2508 * persistently unavailable, and generates FMA ereports as if it were a
2509 * pool open, not import.
2510 */
2511 error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE);
2512 ASSERT(error != EEXIST);
2513
2514 nvlist_free(conf);
2515 return (error);
2516
2517 msg_out:
2518 cmn_err(CE_NOTE, "\n"
2519 " *************************************************** \n"
2520 " * This device is not bootable! * \n"
2521 " * It is either offlined or detached or faulted. * \n"
2522 " * Please try to boot from a different device. * \n"
2523 " *************************************************** ");
2524
2525 return (error);
2526 }
2527 #endif
2528
2529 /*
2530 * Import a non-root pool into the system.
2531 */
2532 int
2533 spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2534 {
2535 return (spa_import_common(pool, config, props, B_FALSE, B_FALSE));
2536 }
2537
2538 int
2539 spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props)
2540 {
2541 return (spa_import_common(pool, config, props, B_FALSE, B_TRUE));
2542 }
2543
2544
2545 /*
2546 * This (illegal) pool name is used when temporarily importing a spa_t in order
2547 * to get the vdev stats associated with the imported devices.
2548 */
2549 #define TRYIMPORT_NAME "$import"
2550
2551 nvlist_t *
2552 spa_tryimport(nvlist_t *tryconfig)
2553 {
2554 nvlist_t *config = NULL;
2555 char *poolname;
2556 spa_t *spa;
2557 uint64_t state;
2558 int error;
2559
2560 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2561 return (NULL);
2562
2563 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2564 return (NULL);
2565
2566 /*
2567 * Create and initialize the spa structure.
2568 */
2569 mutex_enter(&spa_namespace_lock);
2570 spa = spa_add(TRYIMPORT_NAME, NULL);
2571 spa_activate(spa, FREAD);
2572
2573 /*
2574 * Pass off the heavy lifting to spa_load().
2575 * Pass TRUE for mosconfig because the user-supplied config
2576 * is actually the one to trust when doing an import.
2577 */
2578 error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2579
2580 /*
2581 * If 'tryconfig' was at least parsable, return the current config.
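 *
 * (Illustrative note, editorial: a consumer of the returned nvlist would
 * typically read back the fields added below before doing a real import,
 * e.g.
 *
 *	char *name;
 *	uint64_t pstate;
 *	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 *	    &name) == 0);
 *	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 *	    &pstate) == 0);
 *
 * 'name' and 'pstate' are hypothetical local variables.)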
2582 */ 2583 if (spa->spa_root_vdev != NULL) { 2584 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2585 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2586 poolname) == 0); 2587 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2588 state) == 0); 2589 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2590 spa->spa_uberblock.ub_timestamp) == 0); 2591 2592 /* 2593 * If the bootfs property exists on this pool then we 2594 * copy it out so that external consumers can tell which 2595 * pools are bootable. 2596 */ 2597 if ((!error || error == EEXIST) && spa->spa_bootfs) { 2598 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2599 2600 /* 2601 * We have to play games with the name since the 2602 * pool was opened as TRYIMPORT_NAME. 2603 */ 2604 if (dsl_dsobj_to_dsname(spa_name(spa), 2605 spa->spa_bootfs, tmpname) == 0) { 2606 char *cp; 2607 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2608 2609 cp = strchr(tmpname, '/'); 2610 if (cp == NULL) { 2611 (void) strlcpy(dsname, tmpname, 2612 MAXPATHLEN); 2613 } else { 2614 (void) snprintf(dsname, MAXPATHLEN, 2615 "%s/%s", poolname, ++cp); 2616 } 2617 VERIFY(nvlist_add_string(config, 2618 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2619 kmem_free(dsname, MAXPATHLEN); 2620 } 2621 kmem_free(tmpname, MAXPATHLEN); 2622 } 2623 2624 /* 2625 * Add the list of hot spares and level 2 cache devices. 2626 */ 2627 spa_add_spares(spa, config); 2628 spa_add_l2cache(spa, config); 2629 } 2630 2631 spa_unload(spa); 2632 spa_deactivate(spa); 2633 spa_remove(spa); 2634 mutex_exit(&spa_namespace_lock); 2635 2636 return (config); 2637 } 2638 2639 /* 2640 * Pool export/destroy 2641 * 2642 * The act of destroying or exporting a pool is very simple. We make sure there 2643 * is no more pending I/O and any references to the pool are gone. Then, we 2644 * update the pool state and sync all the labels to disk, removing the 2645 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2646 * we don't sync the labels or remove the configuration cache. 2647 */ 2648 static int 2649 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2650 boolean_t force, boolean_t hardforce) 2651 { 2652 spa_t *spa; 2653 2654 if (oldconfig) 2655 *oldconfig = NULL; 2656 2657 if (!(spa_mode_global & FWRITE)) 2658 return (EROFS); 2659 2660 mutex_enter(&spa_namespace_lock); 2661 if ((spa = spa_lookup(pool)) == NULL) { 2662 mutex_exit(&spa_namespace_lock); 2663 return (ENOENT); 2664 } 2665 2666 /* 2667 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2668 * reacquire the namespace lock, and see if we can export. 2669 */ 2670 spa_open_ref(spa, FTAG); 2671 mutex_exit(&spa_namespace_lock); 2672 spa_async_suspend(spa); 2673 mutex_enter(&spa_namespace_lock); 2674 spa_close(spa, FTAG); 2675 2676 /* 2677 * The pool will be in core if it's openable, 2678 * in which case we can modify its state. 2679 */ 2680 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2681 /* 2682 * Objsets may be open only because they're dirty, so we 2683 * have to force it to sync before checking spa_refcnt. 2684 */ 2685 txg_wait_synced(spa->spa_dsl_pool, 0); 2686 2687 /* 2688 * A pool cannot be exported or destroyed if there are active 2689 * references. If we are resetting a pool, allow references by 2690 * fault injection handlers. 
2691 */
2692 if (!spa_refcount_zero(spa) ||
2693 (spa->spa_inject_ref != 0 &&
2694 new_state != POOL_STATE_UNINITIALIZED)) {
2695 spa_async_resume(spa);
2696 mutex_exit(&spa_namespace_lock);
2697 return (EBUSY);
2698 }
2699
2700 /*
2701 * A pool cannot be exported if it has an active shared spare.
2702 * This is to prevent other pools from stealing the active spare
2703 * from an exported pool. The user can still force the export,
2704 * however, if desired.
2705 */
2706 if (!force && new_state == POOL_STATE_EXPORTED &&
2707 spa_has_active_shared_spare(spa)) {
2708 spa_async_resume(spa);
2709 mutex_exit(&spa_namespace_lock);
2710 return (EXDEV);
2711 }
2712
2713 /*
2714 * We want this to be reflected on every label,
2715 * so mark them all dirty. spa_unload() will do the
2716 * final sync that pushes these changes out.
2717 */
2718 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
2719 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2720 spa->spa_state = new_state;
2721 spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2722 vdev_config_dirty(spa->spa_root_vdev);
2723 spa_config_exit(spa, SCL_ALL, FTAG);
2724 }
2725 }
2726
2727 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2728
2729 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2730 spa_unload(spa);
2731 spa_deactivate(spa);
2732 }
2733
2734 if (oldconfig && spa->spa_config)
2735 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2736
2737 if (new_state != POOL_STATE_UNINITIALIZED) {
2738 if (!hardforce)
2739 spa_config_sync(spa, B_TRUE, B_TRUE);
2740 spa_remove(spa);
2741 }
2742 mutex_exit(&spa_namespace_lock);
2743
2744 return (0);
2745 }
2746
2747 /*
2748 * Destroy a storage pool.
2749 */
2750 int
2751 spa_destroy(char *pool)
2752 {
2753 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
2754 B_FALSE, B_FALSE));
2755 }
2756
2757 /*
2758 * Export a storage pool.
2759 */
2760 int
2761 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
2762 boolean_t hardforce)
2763 {
2764 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
2765 force, hardforce));
2766 }
2767
2768 /*
2769 * Similar to spa_export(), this unloads the spa_t without actually removing it
2770 * from the namespace in any way.
2771 */
2772 int
2773 spa_reset(char *pool)
2774 {
2775 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
2776 B_FALSE, B_FALSE));
2777 }
2778
2779 /*
2780 * ==========================================================================
2781 * Device manipulation
2782 * ==========================================================================
2783 */
2784
2785 /*
2786 * Add a device to a storage pool.
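 *
 * Illustrative sketch (editorial, not part of the original comment):
 * nvroot is a "root"-type nvlist whose ZPOOL_CONFIG_CHILDREN array holds
 * the new top-level vdevs, e.g. for a single added disk (the device path
 * is hypothetical):
 *
 *	nvlist_t *disk, *nvroot;
 *	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c3t0d0s0") == 0);
 *	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 *	    &disk, 1) == 0);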
2787 */ 2788 int 2789 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2790 { 2791 uint64_t txg; 2792 int error; 2793 vdev_t *rvd = spa->spa_root_vdev; 2794 vdev_t *vd, *tvd; 2795 nvlist_t **spares, **l2cache; 2796 uint_t nspares, nl2cache; 2797 2798 txg = spa_vdev_enter(spa); 2799 2800 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2801 VDEV_ALLOC_ADD)) != 0) 2802 return (spa_vdev_exit(spa, NULL, txg, error)); 2803 2804 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2805 2806 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2807 &nspares) != 0) 2808 nspares = 0; 2809 2810 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2811 &nl2cache) != 0) 2812 nl2cache = 0; 2813 2814 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2815 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2816 2817 if (vd->vdev_children != 0 && 2818 (error = vdev_create(vd, txg, B_FALSE)) != 0) 2819 return (spa_vdev_exit(spa, vd, txg, error)); 2820 2821 /* 2822 * We must validate the spares and l2cache devices after checking the 2823 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2824 */ 2825 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2826 return (spa_vdev_exit(spa, vd, txg, error)); 2827 2828 /* 2829 * Transfer each new top-level vdev from vd to rvd. 2830 */ 2831 for (int c = 0; c < vd->vdev_children; c++) { 2832 tvd = vd->vdev_child[c]; 2833 vdev_remove_child(vd, tvd); 2834 tvd->vdev_id = rvd->vdev_children; 2835 vdev_add_child(rvd, tvd); 2836 vdev_config_dirty(tvd); 2837 } 2838 2839 if (nspares != 0) { 2840 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2841 ZPOOL_CONFIG_SPARES); 2842 spa_load_spares(spa); 2843 spa->spa_spares.sav_sync = B_TRUE; 2844 } 2845 2846 if (nl2cache != 0) { 2847 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2848 ZPOOL_CONFIG_L2CACHE); 2849 spa_load_l2cache(spa); 2850 spa->spa_l2cache.sav_sync = B_TRUE; 2851 } 2852 2853 /* 2854 * We have to be careful when adding new vdevs to an existing pool. 2855 * If other threads start allocating from these vdevs before we 2856 * sync the config cache, and we lose power, then upon reboot we may 2857 * fail to open the pool because there are DVAs that the config cache 2858 * can't translate. Therefore, we first add the vdevs without 2859 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2860 * and then let spa_config_update() initialize the new metaslabs. 2861 * 2862 * spa_load() checks for added-but-not-initialized vdevs, so that 2863 * if we lose power at any point in this sequence, the remaining 2864 * steps will be completed the next time we load the pool. 2865 */ 2866 (void) spa_vdev_exit(spa, vd, txg, 0); 2867 2868 mutex_enter(&spa_namespace_lock); 2869 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2870 mutex_exit(&spa_namespace_lock); 2871 2872 return (0); 2873 } 2874 2875 /* 2876 * Attach a device to a mirror. The arguments are the path to any device 2877 * in the mirror, and the nvroot for the new device. If the path specifies 2878 * a device that is not mirrored, we automatically insert the mirror vdev. 
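 *
 * Illustrative example (editorial, not part of the original comment):
 * attaching a new device B to an unmirrored top-level disk A turns
 *
 *	root
 *	  disk A
 *
 * into
 *
 *	root
 *	  mirror
 *	    disk A
 *	    disk B
 *
 * i.e. the mirror vdev is inserted automatically above A.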
2879 * 2880 * If 'replacing' is specified, the new device is intended to replace the 2881 * existing device; in this case the two devices are made into their own 2882 * mirror using the 'replacing' vdev, which is functionally identical to 2883 * the mirror vdev (it actually reuses all the same ops) but has a few 2884 * extra rules: you can't attach to it after it's been created, and upon 2885 * completion of resilvering, the first disk (the one being replaced) 2886 * is automatically detached. 2887 */ 2888 int 2889 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2890 { 2891 uint64_t txg, open_txg; 2892 vdev_t *rvd = spa->spa_root_vdev; 2893 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2894 vdev_ops_t *pvops; 2895 dmu_tx_t *tx; 2896 char *oldvdpath, *newvdpath; 2897 int newvd_isspare; 2898 int error; 2899 2900 txg = spa_vdev_enter(spa); 2901 2902 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2903 2904 if (oldvd == NULL) 2905 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2906 2907 if (!oldvd->vdev_ops->vdev_op_leaf) 2908 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2909 2910 pvd = oldvd->vdev_parent; 2911 2912 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2913 VDEV_ALLOC_ADD)) != 0) 2914 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2915 2916 if (newrootvd->vdev_children != 1) 2917 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2918 2919 newvd = newrootvd->vdev_child[0]; 2920 2921 if (!newvd->vdev_ops->vdev_op_leaf) 2922 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2923 2924 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2925 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2926 2927 /* 2928 * Spares can't replace logs 2929 */ 2930 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 2931 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2932 2933 if (!replacing) { 2934 /* 2935 * For attach, the only allowable parent is a mirror or the root 2936 * vdev. 2937 */ 2938 if (pvd->vdev_ops != &vdev_mirror_ops && 2939 pvd->vdev_ops != &vdev_root_ops) 2940 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2941 2942 pvops = &vdev_mirror_ops; 2943 } else { 2944 /* 2945 * Active hot spares can only be replaced by inactive hot 2946 * spares. 2947 */ 2948 if (pvd->vdev_ops == &vdev_spare_ops && 2949 pvd->vdev_child[1] == oldvd && 2950 !spa_has_spare(spa, newvd->vdev_guid)) 2951 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2952 2953 /* 2954 * If the source is a hot spare, and the parent isn't already a 2955 * spare, then we want to create a new hot spare. Otherwise, we 2956 * want to create a replacing vdev. The user is not allowed to 2957 * attach to a spared vdev child unless the 'isspare' state is 2958 * the same (spare replaces spare, non-spare replaces 2959 * non-spare). 2960 */ 2961 if (pvd->vdev_ops == &vdev_replacing_ops) 2962 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2963 else if (pvd->vdev_ops == &vdev_spare_ops && 2964 newvd->vdev_isspare != oldvd->vdev_isspare) 2965 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2966 else if (pvd->vdev_ops != &vdev_spare_ops && 2967 newvd->vdev_isspare) 2968 pvops = &vdev_spare_ops; 2969 else 2970 pvops = &vdev_replacing_ops; 2971 } 2972 2973 /* 2974 * Compare the new device size with the replaceable/attachable 2975 * device size. 
2976 */ 2977 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2978 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2979 2980 /* 2981 * The new device cannot have a higher alignment requirement 2982 * than the top-level vdev. 2983 */ 2984 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2985 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2986 2987 /* 2988 * If this is an in-place replacement, update oldvd's path and devid 2989 * to make it distinguishable from newvd, and unopenable from now on. 2990 */ 2991 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2992 spa_strfree(oldvd->vdev_path); 2993 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2994 KM_SLEEP); 2995 (void) sprintf(oldvd->vdev_path, "%s/%s", 2996 newvd->vdev_path, "old"); 2997 if (oldvd->vdev_devid != NULL) { 2998 spa_strfree(oldvd->vdev_devid); 2999 oldvd->vdev_devid = NULL; 3000 } 3001 } 3002 3003 /* 3004 * If the parent is not a mirror, or if we're replacing, insert the new 3005 * mirror/replacing/spare vdev above oldvd. 3006 */ 3007 if (pvd->vdev_ops != pvops) 3008 pvd = vdev_add_parent(oldvd, pvops); 3009 3010 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3011 ASSERT(pvd->vdev_ops == pvops); 3012 ASSERT(oldvd->vdev_parent == pvd); 3013 3014 /* 3015 * Extract the new device from its root and add it to pvd. 3016 */ 3017 vdev_remove_child(newrootvd, newvd); 3018 newvd->vdev_id = pvd->vdev_children; 3019 vdev_add_child(pvd, newvd); 3020 3021 /* 3022 * If newvd is smaller than oldvd, but larger than its rsize, 3023 * the addition of newvd may have decreased our parent's asize. 3024 */ 3025 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 3026 3027 tvd = newvd->vdev_top; 3028 ASSERT(pvd->vdev_top == tvd); 3029 ASSERT(tvd->vdev_parent == rvd); 3030 3031 vdev_config_dirty(tvd); 3032 3033 /* 3034 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3035 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3036 */ 3037 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3038 3039 vdev_dtl_dirty(newvd, DTL_MISSING, 3040 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3041 3042 if (newvd->vdev_isspare) 3043 spa_spare_activate(newvd); 3044 oldvdpath = spa_strdup(oldvd->vdev_path); 3045 newvdpath = spa_strdup(newvd->vdev_path); 3046 newvd_isspare = newvd->vdev_isspare; 3047 3048 /* 3049 * Mark newvd's DTL dirty in this txg. 3050 */ 3051 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3052 3053 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3054 3055 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3056 if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 3057 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 3058 CRED(), "%s vdev=%s %s vdev=%s", 3059 replacing && newvd_isspare ? "spare in" : 3060 replacing ? "replace" : "attach", newvdpath, 3061 replacing ? "for" : "to", oldvdpath); 3062 dmu_tx_commit(tx); 3063 } else { 3064 dmu_tx_abort(tx); 3065 } 3066 3067 spa_strfree(oldvdpath); 3068 spa_strfree(newvdpath); 3069 3070 /* 3071 * Kick off a resilver to update newvd. 3072 */ 3073 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3074 3075 return (0); 3076 } 3077 3078 /* 3079 * Detach a device from a mirror or replacing vdev. 3080 * If 'replace_done' is specified, only detach if the parent 3081 * is a replacing vdev. 
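 *
 * (Editorial note: the 'replace_done' form is the one used by
 * spa_vdev_resilver_done() further down, e.g.
 *
 *	(void) spa_vdev_detach(spa, guid, pguid, B_TRUE);
 *
 * to peel off the replaced disk, or the original disk under a hot spare,
 * once its data has been fully resilvered onto the new device.)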
3082 */ 3083 int 3084 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3085 { 3086 uint64_t txg; 3087 int error; 3088 vdev_t *rvd = spa->spa_root_vdev; 3089 vdev_t *vd, *pvd, *cvd, *tvd; 3090 boolean_t unspare = B_FALSE; 3091 uint64_t unspare_guid; 3092 size_t len; 3093 3094 txg = spa_vdev_enter(spa); 3095 3096 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3097 3098 if (vd == NULL) 3099 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3100 3101 if (!vd->vdev_ops->vdev_op_leaf) 3102 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3103 3104 pvd = vd->vdev_parent; 3105 3106 /* 3107 * If the parent/child relationship is not as expected, don't do it. 3108 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3109 * vdev that's replacing B with C. The user's intent in replacing 3110 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3111 * the replace by detaching C, the expected behavior is to end up 3112 * M(A,B). But suppose that right after deciding to detach C, 3113 * the replacement of B completes. We would have M(A,C), and then 3114 * ask to detach C, which would leave us with just A -- not what 3115 * the user wanted. To prevent this, we make sure that the 3116 * parent/child relationship hasn't changed -- in this example, 3117 * that C's parent is still the replacing vdev R. 3118 */ 3119 if (pvd->vdev_guid != pguid && pguid != 0) 3120 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3121 3122 /* 3123 * If replace_done is specified, only remove this device if it's 3124 * the first child of a replacing vdev. For the 'spare' vdev, either 3125 * disk can be removed. 3126 */ 3127 if (replace_done) { 3128 if (pvd->vdev_ops == &vdev_replacing_ops) { 3129 if (vd->vdev_id != 0) 3130 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3131 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3132 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3133 } 3134 } 3135 3136 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3137 spa_version(spa) >= SPA_VERSION_SPARES); 3138 3139 /* 3140 * Only mirror, replacing, and spare vdevs support detach. 3141 */ 3142 if (pvd->vdev_ops != &vdev_replacing_ops && 3143 pvd->vdev_ops != &vdev_mirror_ops && 3144 pvd->vdev_ops != &vdev_spare_ops) 3145 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3146 3147 /* 3148 * If this device has the only valid copy of some data, 3149 * we cannot safely detach it. 3150 */ 3151 if (vdev_dtl_required(vd)) 3152 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3153 3154 ASSERT(pvd->vdev_children >= 2); 3155 3156 /* 3157 * If we are detaching the second disk from a replacing vdev, then 3158 * check to see if we changed the original vdev's path to have "/old" 3159 * at the end in spa_vdev_attach(). If so, undo that change now. 3160 */ 3161 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3162 pvd->vdev_child[0]->vdev_path != NULL && 3163 pvd->vdev_child[1]->vdev_path != NULL) { 3164 ASSERT(pvd->vdev_child[1] == vd); 3165 cvd = pvd->vdev_child[0]; 3166 len = strlen(vd->vdev_path); 3167 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3168 strcmp(cvd->vdev_path + len, "/old") == 0) { 3169 spa_strfree(cvd->vdev_path); 3170 cvd->vdev_path = spa_strdup(vd->vdev_path); 3171 } 3172 } 3173 3174 /* 3175 * If we are detaching the original disk from a spare, then it implies 3176 * that the spare should become a real disk, and be removed from the 3177 * active spare list for the pool. 
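 *
 * For example (editorial sketch): in spare(A, S), detaching the original
 * disk A leaves the hot spare S in place as an ordinary disk, and S is
 * then dropped from this pool's list of available spares (and, further
 * below, from every other pool's list as well).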
3178 */ 3179 if (pvd->vdev_ops == &vdev_spare_ops && 3180 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3181 unspare = B_TRUE; 3182 3183 /* 3184 * Erase the disk labels so the disk can be used for other things. 3185 * This must be done after all other error cases are handled, 3186 * but before we disembowel vd (so we can still do I/O to it). 3187 * But if we can't do it, don't treat the error as fatal -- 3188 * it may be that the unwritability of the disk is the reason 3189 * it's being detached! 3190 */ 3191 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3192 3193 /* 3194 * Remove vd from its parent and compact the parent's children. 3195 */ 3196 vdev_remove_child(pvd, vd); 3197 vdev_compact_children(pvd); 3198 3199 /* 3200 * Remember one of the remaining children so we can get tvd below. 3201 */ 3202 cvd = pvd->vdev_child[0]; 3203 3204 /* 3205 * If we need to remove the remaining child from the list of hot spares, 3206 * do it now, marking the vdev as no longer a spare in the process. 3207 * We must do this before vdev_remove_parent(), because that can 3208 * change the GUID if it creates a new toplevel GUID. For a similar 3209 * reason, we must remove the spare now, in the same txg as the detach; 3210 * otherwise someone could attach a new sibling, change the GUID, and 3211 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3212 */ 3213 if (unspare) { 3214 ASSERT(cvd->vdev_isspare); 3215 spa_spare_remove(cvd); 3216 unspare_guid = cvd->vdev_guid; 3217 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3218 } 3219 3220 /* 3221 * If the parent mirror/replacing vdev only has one child, 3222 * the parent is no longer needed. Remove it from the tree. 3223 */ 3224 if (pvd->vdev_children == 1) 3225 vdev_remove_parent(cvd); 3226 3227 /* 3228 * We don't set tvd until now because the parent we just removed 3229 * may have been the previous top-level vdev. 3230 */ 3231 tvd = cvd->vdev_top; 3232 ASSERT(tvd->vdev_parent == rvd); 3233 3234 /* 3235 * Reevaluate the parent vdev state. 3236 */ 3237 vdev_propagate_state(cvd); 3238 3239 /* 3240 * If the device we just detached was smaller than the others, it may be 3241 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3242 * can't fail because the existing metaslabs are already in core, so 3243 * there's nothing to read from disk. 3244 */ 3245 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3246 3247 vdev_config_dirty(tvd); 3248 3249 /* 3250 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3251 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3252 * But first make sure we're not on any *other* txg's DTL list, to 3253 * prevent vd from being accessed after it's freed. 3254 */ 3255 for (int t = 0; t < TXG_SIZE; t++) 3256 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3257 vd->vdev_detached = B_TRUE; 3258 vdev_dirty(tvd, VDD_DTL, vd, txg); 3259 3260 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3261 3262 error = spa_vdev_exit(spa, vd, txg, 0); 3263 3264 /* 3265 * If this was the removal of the original device in a hot spare vdev, 3266 * then we want to go through and remove the device from the hot spare 3267 * list of every other pool. 
3268 */ 3269 if (unspare) { 3270 spa_t *myspa = spa; 3271 spa = NULL; 3272 mutex_enter(&spa_namespace_lock); 3273 while ((spa = spa_next(spa)) != NULL) { 3274 if (spa->spa_state != POOL_STATE_ACTIVE) 3275 continue; 3276 if (spa == myspa) 3277 continue; 3278 spa_open_ref(spa, FTAG); 3279 mutex_exit(&spa_namespace_lock); 3280 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3281 mutex_enter(&spa_namespace_lock); 3282 spa_close(spa, FTAG); 3283 } 3284 mutex_exit(&spa_namespace_lock); 3285 } 3286 3287 return (error); 3288 } 3289 3290 static nvlist_t * 3291 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3292 { 3293 for (int i = 0; i < count; i++) { 3294 uint64_t guid; 3295 3296 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3297 &guid) == 0); 3298 3299 if (guid == target_guid) 3300 return (nvpp[i]); 3301 } 3302 3303 return (NULL); 3304 } 3305 3306 static void 3307 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3308 nvlist_t *dev_to_remove) 3309 { 3310 nvlist_t **newdev = NULL; 3311 3312 if (count > 1) 3313 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3314 3315 for (int i = 0, j = 0; i < count; i++) { 3316 if (dev[i] == dev_to_remove) 3317 continue; 3318 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3319 } 3320 3321 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3322 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3323 3324 for (int i = 0; i < count - 1; i++) 3325 nvlist_free(newdev[i]); 3326 3327 if (count > 1) 3328 kmem_free(newdev, (count - 1) * sizeof (void *)); 3329 } 3330 3331 /* 3332 * Remove a device from the pool. Currently, this supports removing only hot 3333 * spares and level 2 ARC devices. 3334 */ 3335 int 3336 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3337 { 3338 vdev_t *vd; 3339 nvlist_t **spares, **l2cache, *nv; 3340 uint_t nspares, nl2cache; 3341 uint64_t txg = 0; 3342 int error = 0; 3343 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3344 3345 if (!locked) 3346 txg = spa_vdev_enter(spa); 3347 3348 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3349 3350 if (spa->spa_spares.sav_vdevs != NULL && 3351 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3352 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3353 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3354 /* 3355 * Only remove the hot spare if it's not currently in use 3356 * in this pool. 3357 */ 3358 if (vd == NULL || unspare) { 3359 spa_vdev_remove_aux(spa->spa_spares.sav_config, 3360 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3361 spa_load_spares(spa); 3362 spa->spa_spares.sav_sync = B_TRUE; 3363 } else { 3364 error = EBUSY; 3365 } 3366 } else if (spa->spa_l2cache.sav_vdevs != NULL && 3367 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3368 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3369 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3370 /* 3371 * Cache devices can always be removed. 3372 */ 3373 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3374 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3375 spa_load_l2cache(spa); 3376 spa->spa_l2cache.sav_sync = B_TRUE; 3377 } else if (vd != NULL) { 3378 /* 3379 * Normal vdevs cannot be removed (yet). 3380 */ 3381 error = ENOTSUP; 3382 } else { 3383 /* 3384 * There is no vdev of any kind with the specified guid. 
3385 */
3386 error = ENOENT;
3387 }
3388
3389 if (!locked)
3390 return (spa_vdev_exit(spa, NULL, txg, error));
3391
3392 return (error);
3393 }
3394
3395 /*
3396 * Find any device that's done replacing, or a vdev marked 'unspare' that's
3397 * currently spared, so we can detach it.
3398 */
3399 static vdev_t *
3400 spa_vdev_resilver_done_hunt(vdev_t *vd)
3401 {
3402 vdev_t *newvd, *oldvd;
3403 int c;
3404
3405 for (c = 0; c < vd->vdev_children; c++) {
3406 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3407 if (oldvd != NULL)
3408 return (oldvd);
3409 }
3410
3411 /*
3412 * Check for a completed replacement.
3413 */
3414 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3415 oldvd = vd->vdev_child[0];
3416 newvd = vd->vdev_child[1];
3417
3418 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
3419 !vdev_dtl_required(oldvd))
3420 return (oldvd);
3421 }
3422
3423 /*
3424 * Check for a completed resilver with the 'unspare' flag set.
3425 */
3426 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3427 newvd = vd->vdev_child[0];
3428 oldvd = vd->vdev_child[1];
3429
3430 if (newvd->vdev_unspare &&
3431 vdev_dtl_empty(newvd, DTL_MISSING) &&
3432 !vdev_dtl_required(oldvd)) {
3433 newvd->vdev_unspare = 0;
3434 return (oldvd);
3435 }
3436 }
3437
3438 return (NULL);
3439 }
3440
3441 static void
3442 spa_vdev_resilver_done(spa_t *spa)
3443 {
3444 vdev_t *vd, *pvd, *ppvd;
3445 uint64_t guid, sguid, pguid, ppguid;
3446
3447 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3448
3449 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3450 pvd = vd->vdev_parent;
3451 ppvd = pvd->vdev_parent;
3452 guid = vd->vdev_guid;
3453 pguid = pvd->vdev_guid;
3454 ppguid = ppvd->vdev_guid;
3455 sguid = 0;
3456 /*
3457 * If we have just finished replacing a hot spared device, then
3458 * we need to detach the parent's first child (the original hot
3459 * spare) as well.
3460 */
3461 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
3462 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3463 ASSERT(ppvd->vdev_children == 2);
3464 sguid = ppvd->vdev_child[1]->vdev_guid;
3465 }
3466 spa_config_exit(spa, SCL_ALL, FTAG);
3467 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
3468 return;
3469 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
3470 return;
3471 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3472 }
3473
3474 spa_config_exit(spa, SCL_ALL, FTAG);
3475 }
3476
3477 /*
3478 * Update the stored path for this vdev. Dirty the vdev configuration, relying
3479 * on spa_vdev_enter/exit() to synchronize the labels and cache.
3480 */
3481 int
3482 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3483 {
3484 vdev_t *vd;
3485 uint64_t txg;
3486
3487 txg = spa_vdev_enter(spa);
3488
3489 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) {
3490 /*
3491 * Determine if this is a reference to a hot spare device. If
3492 * it is, update the path manually as there is no associated
3493 * vdev_t that can be synced to disk.
3494 */ 3495 nvlist_t **spares; 3496 uint_t i, nspares; 3497 3498 if (spa->spa_spares.sav_config != NULL) { 3499 VERIFY(nvlist_lookup_nvlist_array( 3500 spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 3501 &spares, &nspares) == 0); 3502 for (i = 0; i < nspares; i++) { 3503 uint64_t theguid; 3504 VERIFY(nvlist_lookup_uint64(spares[i], 3505 ZPOOL_CONFIG_GUID, &theguid) == 0); 3506 if (theguid == guid) { 3507 VERIFY(nvlist_add_string(spares[i], 3508 ZPOOL_CONFIG_PATH, newpath) == 0); 3509 spa_load_spares(spa); 3510 spa->spa_spares.sav_sync = B_TRUE; 3511 return (spa_vdev_exit(spa, NULL, txg, 3512 0)); 3513 } 3514 } 3515 } 3516 3517 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3518 } 3519 3520 if (!vd->vdev_ops->vdev_op_leaf) 3521 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3522 3523 spa_strfree(vd->vdev_path); 3524 vd->vdev_path = spa_strdup(newpath); 3525 3526 vdev_config_dirty(vd->vdev_top); 3527 3528 return (spa_vdev_exit(spa, NULL, txg, 0)); 3529 } 3530 3531 /* 3532 * ========================================================================== 3533 * SPA Scrubbing 3534 * ========================================================================== 3535 */ 3536 3537 int 3538 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3539 { 3540 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3541 3542 if ((uint_t)type >= POOL_SCRUB_TYPES) 3543 return (ENOTSUP); 3544 3545 /* 3546 * If a resilver was requested, but there is no DTL on a 3547 * writeable leaf device, we have nothing to do. 3548 */ 3549 if (type == POOL_SCRUB_RESILVER && 3550 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3551 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3552 return (0); 3553 } 3554 3555 if (type == POOL_SCRUB_EVERYTHING && 3556 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3557 spa->spa_dsl_pool->dp_scrub_isresilver) 3558 return (EBUSY); 3559 3560 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3561 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3562 } else if (type == POOL_SCRUB_NONE) { 3563 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3564 } else { 3565 return (EINVAL); 3566 } 3567 } 3568 3569 /* 3570 * ========================================================================== 3571 * SPA async task processing 3572 * ========================================================================== 3573 */ 3574 3575 static void 3576 spa_async_remove(spa_t *spa, vdev_t *vd) 3577 { 3578 if (vd->vdev_remove_wanted) { 3579 vd->vdev_remove_wanted = 0; 3580 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3581 vdev_clear(spa, vd); 3582 vdev_state_dirty(vd->vdev_top); 3583 } 3584 3585 for (int c = 0; c < vd->vdev_children; c++) 3586 spa_async_remove(spa, vd->vdev_child[c]); 3587 } 3588 3589 static void 3590 spa_async_probe(spa_t *spa, vdev_t *vd) 3591 { 3592 if (vd->vdev_probe_wanted) { 3593 vd->vdev_probe_wanted = 0; 3594 vdev_reopen(vd); /* vdev_open() does the actual probe */ 3595 } 3596 3597 for (int c = 0; c < vd->vdev_children; c++) 3598 spa_async_probe(spa, vd->vdev_child[c]); 3599 } 3600 3601 static void 3602 spa_async_thread(spa_t *spa) 3603 { 3604 int tasks; 3605 3606 ASSERT(spa->spa_sync_on); 3607 3608 mutex_enter(&spa->spa_async_lock); 3609 tasks = spa->spa_async_tasks; 3610 spa->spa_async_tasks = 0; 3611 mutex_exit(&spa->spa_async_lock); 3612 3613 /* 3614 * See if the config needs to be updated. 
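 *
 * (Editorial note, not in the original comment: the tasks handled in this
 * function are the SPA_ASYNC_* bits that producers OR into
 * spa_async_tasks via spa_async_request(), e.g.
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER);
 *
 * and that spa_async_dispatch(), called at the end of spa_sync(), hands
 * to this thread.)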
3615 */ 3616 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3617 mutex_enter(&spa_namespace_lock); 3618 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3619 mutex_exit(&spa_namespace_lock); 3620 } 3621 3622 /* 3623 * See if any devices need to be marked REMOVED. 3624 */ 3625 if (tasks & SPA_ASYNC_REMOVE) { 3626 spa_vdev_state_enter(spa); 3627 spa_async_remove(spa, spa->spa_root_vdev); 3628 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3629 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3630 for (int i = 0; i < spa->spa_spares.sav_count; i++) 3631 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3632 (void) spa_vdev_state_exit(spa, NULL, 0); 3633 } 3634 3635 /* 3636 * See if any devices need to be probed. 3637 */ 3638 if (tasks & SPA_ASYNC_PROBE) { 3639 spa_vdev_state_enter(spa); 3640 spa_async_probe(spa, spa->spa_root_vdev); 3641 (void) spa_vdev_state_exit(spa, NULL, 0); 3642 } 3643 3644 /* 3645 * If any devices are done replacing, detach them. 3646 */ 3647 if (tasks & SPA_ASYNC_RESILVER_DONE) 3648 spa_vdev_resilver_done(spa); 3649 3650 /* 3651 * Kick off a resilver. 3652 */ 3653 if (tasks & SPA_ASYNC_RESILVER) 3654 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3655 3656 /* 3657 * Let the world know that we're done. 3658 */ 3659 mutex_enter(&spa->spa_async_lock); 3660 spa->spa_async_thread = NULL; 3661 cv_broadcast(&spa->spa_async_cv); 3662 mutex_exit(&spa->spa_async_lock); 3663 thread_exit(); 3664 } 3665 3666 void 3667 spa_async_suspend(spa_t *spa) 3668 { 3669 mutex_enter(&spa->spa_async_lock); 3670 spa->spa_async_suspended++; 3671 while (spa->spa_async_thread != NULL) 3672 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3673 mutex_exit(&spa->spa_async_lock); 3674 } 3675 3676 void 3677 spa_async_resume(spa_t *spa) 3678 { 3679 mutex_enter(&spa->spa_async_lock); 3680 ASSERT(spa->spa_async_suspended != 0); 3681 spa->spa_async_suspended--; 3682 mutex_exit(&spa->spa_async_lock); 3683 } 3684 3685 static void 3686 spa_async_dispatch(spa_t *spa) 3687 { 3688 mutex_enter(&spa->spa_async_lock); 3689 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3690 spa->spa_async_thread == NULL && 3691 rootdir != NULL && !vn_is_readonly(rootdir)) 3692 spa->spa_async_thread = thread_create(NULL, 0, 3693 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3694 mutex_exit(&spa->spa_async_lock); 3695 } 3696 3697 void 3698 spa_async_request(spa_t *spa, int task) 3699 { 3700 mutex_enter(&spa->spa_async_lock); 3701 spa->spa_async_tasks |= task; 3702 mutex_exit(&spa->spa_async_lock); 3703 } 3704 3705 /* 3706 * ========================================================================== 3707 * SPA syncing routines 3708 * ========================================================================== 3709 */ 3710 3711 static void 3712 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3713 { 3714 bplist_t *bpl = &spa->spa_sync_bplist; 3715 dmu_tx_t *tx; 3716 blkptr_t blk; 3717 uint64_t itor = 0; 3718 zio_t *zio; 3719 int error; 3720 uint8_t c = 1; 3721 3722 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 3723 3724 while (bplist_iterate(bpl, &itor, &blk) == 0) { 3725 ASSERT(blk.blk_birth < txg); 3726 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 3727 ZIO_FLAG_MUSTSUCCEED)); 3728 } 3729 3730 error = zio_wait(zio); 3731 ASSERT3U(error, ==, 0); 3732 3733 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3734 bplist_vacate(bpl, tx); 3735 3736 /* 3737 * Pre-dirty the first block so we sync to convergence faster. 3738 * (Usually only the first block is needed.) 
3739 */ 3740 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3741 dmu_tx_commit(tx); 3742 } 3743 3744 static void 3745 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3746 { 3747 char *packed = NULL; 3748 size_t bufsize; 3749 size_t nvsize = 0; 3750 dmu_buf_t *db; 3751 3752 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3753 3754 /* 3755 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3756 * information. This avoids the dbuf_will_dirty() path and 3757 * saves us a pre-read to get data we don't actually care about. 3758 */ 3759 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3760 packed = kmem_alloc(bufsize, KM_SLEEP); 3761 3762 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3763 KM_SLEEP) == 0); 3764 bzero(packed + nvsize, bufsize - nvsize); 3765 3766 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3767 3768 kmem_free(packed, bufsize); 3769 3770 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3771 dmu_buf_will_dirty(db, tx); 3772 *(uint64_t *)db->db_data = nvsize; 3773 dmu_buf_rele(db, FTAG); 3774 } 3775 3776 static void 3777 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3778 const char *config, const char *entry) 3779 { 3780 nvlist_t *nvroot; 3781 nvlist_t **list; 3782 int i; 3783 3784 if (!sav->sav_sync) 3785 return; 3786 3787 /* 3788 * Update the MOS nvlist describing the list of available devices. 3789 * spa_validate_aux() will have already made sure this nvlist is 3790 * valid and the vdevs are labeled appropriately. 3791 */ 3792 if (sav->sav_object == 0) { 3793 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3794 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3795 sizeof (uint64_t), tx); 3796 VERIFY(zap_update(spa->spa_meta_objset, 3797 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3798 &sav->sav_object, tx) == 0); 3799 } 3800 3801 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3802 if (sav->sav_count == 0) { 3803 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3804 } else { 3805 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3806 for (i = 0; i < sav->sav_count; i++) 3807 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3808 B_FALSE, B_FALSE, B_TRUE); 3809 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3810 sav->sav_count) == 0); 3811 for (i = 0; i < sav->sav_count; i++) 3812 nvlist_free(list[i]); 3813 kmem_free(list, sav->sav_count * sizeof (void *)); 3814 } 3815 3816 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3817 nvlist_free(nvroot); 3818 3819 sav->sav_sync = B_FALSE; 3820 } 3821 3822 static void 3823 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3824 { 3825 nvlist_t *config; 3826 3827 if (list_is_empty(&spa->spa_config_dirty_list)) 3828 return; 3829 3830 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 3831 3832 config = spa_config_generate(spa, spa->spa_root_vdev, 3833 dmu_tx_get_txg(tx), B_FALSE); 3834 3835 spa_config_exit(spa, SCL_STATE, FTAG); 3836 3837 if (spa->spa_config_syncing) 3838 nvlist_free(spa->spa_config_syncing); 3839 spa->spa_config_syncing = config; 3840 3841 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3842 } 3843 3844 /* 3845 * Set zpool properties. 
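 *
 * (Illustrative note, editorial: arg2 is the nvlist of pending property
 * settings, keyed by property name, as assembled by a setter such as
 * spa_prop_set(), e.g. roughly
 *
 *	nvlist_t *nvp;
 *	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(nvp,
 *	    zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1) == 0);
 *
 * which this sync task then persists in the MOS pool-properties object.)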
3846 */
3847 static void
3848 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3849 {
3850 spa_t *spa = arg1;
3851 objset_t *mos = spa->spa_meta_objset;
3852 nvlist_t *nvp = arg2;
3853 nvpair_t *elem;
3854 uint64_t intval;
3855 char *strval;
3856 zpool_prop_t prop;
3857 const char *propname;
3858 zprop_type_t proptype;
3859
3860 mutex_enter(&spa->spa_props_lock);
3861
3862 elem = NULL;
3863 while ((elem = nvlist_next_nvpair(nvp, elem))) {
3864 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3865 case ZPOOL_PROP_VERSION:
3866 /*
3867 * Only set version for non-zpool-creation cases
3868 * (set/import). spa_create() needs special care
3869 * for version setting.
3870 */
3871 if (tx->tx_txg != TXG_INITIAL) {
3872 VERIFY(nvpair_value_uint64(elem,
3873 &intval) == 0);
3874 ASSERT(intval <= SPA_VERSION);
3875 ASSERT(intval >= spa_version(spa));
3876 spa->spa_uberblock.ub_version = intval;
3877 vdev_config_dirty(spa->spa_root_vdev);
3878 }
3879 break;
3880
3881 case ZPOOL_PROP_ALTROOT:
3882 /*
3883 * 'altroot' is a non-persistent property. It should
3884 * have been set temporarily at creation or import time.
3885 */
3886 ASSERT(spa->spa_root != NULL);
3887 break;
3888
3889 case ZPOOL_PROP_CACHEFILE:
3890 /*
3891 * 'cachefile' is also a non-persistent property.
3892 */
3893 break;
3894 default:
3895 /*
3896 * Set pool property values in the poolprops mos object.
3897 */
3898 if (spa->spa_pool_props_object == 0) {
3899 objset_t *mos = spa->spa_meta_objset;
3900
3901 VERIFY((spa->spa_pool_props_object =
3902 zap_create(mos, DMU_OT_POOL_PROPS,
3903 DMU_OT_NONE, 0, tx)) > 0);
3904
3905 VERIFY(zap_update(mos,
3906 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
3907 8, 1, &spa->spa_pool_props_object, tx)
3908 == 0);
3909 }
3910
3911 /* normalize the property name */
3912 propname = zpool_prop_to_name(prop);
3913 proptype = zpool_prop_get_type(prop);
3914
3915 if (nvpair_type(elem) == DATA_TYPE_STRING) {
3916 ASSERT(proptype == PROP_TYPE_STRING);
3917 VERIFY(nvpair_value_string(elem, &strval) == 0);
3918 VERIFY(zap_update(mos,
3919 spa->spa_pool_props_object, propname,
3920 1, strlen(strval) + 1, strval, tx) == 0);
3921
3922 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
3923 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
3924
3925 if (proptype == PROP_TYPE_INDEX) {
3926 const char *unused;
3927 VERIFY(zpool_prop_index_to_string(
3928 prop, intval, &unused) == 0);
3929 }
3930 VERIFY(zap_update(mos,
3931 spa->spa_pool_props_object, propname,
3932 8, 1, &intval, tx) == 0);
3933 } else {
3934 ASSERT(0); /* not allowed */
3935 }
3936
3937 switch (prop) {
3938 case ZPOOL_PROP_DELEGATION:
3939 spa->spa_delegation = intval;
3940 break;
3941 case ZPOOL_PROP_BOOTFS:
3942 spa->spa_bootfs = intval;
3943 break;
3944 case ZPOOL_PROP_FAILUREMODE:
3945 spa->spa_failmode = intval;
3946 break;
3947 default:
3948 break;
3949 }
3950 }
3951
3952 /* log internal history if this is not a zpool create */
3953 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
3954 tx->tx_txg != TXG_INITIAL) {
3955 spa_history_internal_log(LOG_POOL_PROPSET,
3956 spa, tx, cr, "%s %lld %s",
3957 nvpair_name(elem), intval, spa_name(spa));
3958 }
3959 }
3960
3961 mutex_exit(&spa->spa_props_lock);
3962 }
3963
3964 /*
3965 * Sync the specified transaction group. New blocks may be dirtied as
3966 * part of the process, so we iterate until it converges.
3967 */ 3968 void 3969 spa_sync(spa_t *spa, uint64_t txg) 3970 { 3971 dsl_pool_t *dp = spa->spa_dsl_pool; 3972 objset_t *mos = spa->spa_meta_objset; 3973 bplist_t *bpl = &spa->spa_sync_bplist; 3974 vdev_t *rvd = spa->spa_root_vdev; 3975 vdev_t *vd; 3976 dmu_tx_t *tx; 3977 int dirty_vdevs; 3978 int error; 3979 3980 /* 3981 * Lock out configuration changes. 3982 */ 3983 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3984 3985 spa->spa_syncing_txg = txg; 3986 spa->spa_sync_pass = 0; 3987 3988 /* 3989 * If there are any pending vdev state changes, convert them 3990 * into config changes that go out with this transaction group. 3991 */ 3992 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 3993 while (list_head(&spa->spa_state_dirty_list) != NULL) { 3994 /* 3995 * We need the write lock here because, for aux vdevs, 3996 * calling vdev_config_dirty() modifies sav_config. 3997 * This is ugly and will become unnecessary when we 3998 * eliminate the aux vdev wart by integrating all vdevs 3999 * into the root vdev tree. 4000 */ 4001 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4002 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 4003 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 4004 vdev_state_clean(vd); 4005 vdev_config_dirty(vd); 4006 } 4007 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4008 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 4009 } 4010 spa_config_exit(spa, SCL_STATE, FTAG); 4011 4012 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 4013 4014 tx = dmu_tx_create_assigned(dp, txg); 4015 4016 /* 4017 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 4018 * set spa_deflate if we have no raid-z vdevs. 4019 */ 4020 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 4021 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 4022 int i; 4023 4024 for (i = 0; i < rvd->vdev_children; i++) { 4025 vd = rvd->vdev_child[i]; 4026 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 4027 break; 4028 } 4029 if (i == rvd->vdev_children) { 4030 spa->spa_deflate = TRUE; 4031 VERIFY(0 == zap_add(spa->spa_meta_objset, 4032 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4033 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 4034 } 4035 } 4036 4037 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 4038 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 4039 dsl_pool_create_origin(dp, tx); 4040 4041 /* Keeping the origin open increases spa_minref */ 4042 spa->spa_minref += 3; 4043 } 4044 4045 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 4046 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 4047 dsl_pool_upgrade_clones(dp, tx); 4048 } 4049 4050 /* 4051 * If anything has changed in this txg, push the deferred frees 4052 * from the previous txg. If not, leave them alone so that we 4053 * don't generate work on an otherwise idle system. 4054 */ 4055 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 4056 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 4057 !txg_list_empty(&dp->dp_sync_tasks, txg)) 4058 spa_sync_deferred_frees(spa, txg); 4059 4060 /* 4061 * Iterate to convergence. 
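 *
 * (Editorial note: convergence is expected because each pass only has to
 * write out whatever the previous pass dirtied while syncing, so the
 * amount of dirty state shrinks from pass to pass until the loop below
 * finds no dirty vdevs and exits.)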
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);
			int c;

			for (c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools. We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
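 *
 * The spa_open_ref()/spa_close() pair below keeps the spa_t from being
 * removed while spa_namespace_lock is dropped around txg_wait_synced().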
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state. All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks. The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (l2cache) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2: once as a spare and
 * once as a replacing device.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event. The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
 * filled in from the spa and (optionally) the vdev. This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
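
/*
 * Usage sketch for spa_event_notify() (illustrative only; the callsite below
 * is hypothetical, and ESC_ZFS_VDEV_REMOVE is one of the ESC_ZFS_* event
 * definitions from sys/sysevent/eventdefs.h):
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * This posts an EC_ZFS sysevent whose attributes carry the pool name and
 * guid, plus the vdev guid and path when 'vd' is non-NULL, as assembled
 * above.
 */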