/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

int zio_taskq_threads = 8;

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
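
/*
 * Illustrative example (not from the original source): adding
 * ZPOOL_PROP_CAPACITY with intval == 42 and src == ZPROP_SRC_NONE produces
 * a nested entry in 'nvl' of the form:
 *
 *	capacity = {
 *		source = ZPROP_SRC_NONE
 *		value = 42
 *	}
 */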

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size = spa_get_space(spa);
	uint64_t used = spa_get_alloc(spa);
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	/*
	 * readonly properties
	 */
	spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name, 0, src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src);

	cap = (size == 0) ? 0 : (used * 100 / size);
	spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
	    spa->spa_root_vdev->vdev_state, src);

	/*
	 * settable properties that are not stored in the pool property object.
	 */
	version = spa_version(spa);
	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	mutex_enter(&spa->spa_props_lock);
	/* If no pool property object, no more props to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_USER | DS_MODE_READONLY, &os))
					break;

				/* We don't support gzip bootable datasets */
				if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_close(os);
			}
			break;
		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

/*
 * Validate the requested properties and, if they pass, set them on the pool
 * via a sync task.
 */
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * AVL comparator that orders error-log entries by their bookmark.
 */
static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));
	list_create(&spa->spa_zio_list, sizeof (zio_t),
	    offsetof(zio_t, zio_link_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);
	list_destroy(&spa->spa_zio_list);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
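
/*
 * Illustrative example (not from the original source): a config nvlist
 * describing a two-way mirror parses, via the recursion above over
 * ZPOOL_CONFIG_CHILDREN, into a three-level vdev tree:
 *
 *	root
 *	  mirror
 *	    disk	(leaf)
 *	    disk	(leaf)
 */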

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid, size;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd)) {
				size = vdev_get_rsize(vd);
				l2arc_add_vdev(spa, vd,
				    VDEV_LABEL_START_SIZE,
				    size - VDEV_LABEL_START_SIZE);
			}
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_mode & FWRITE &&
			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL &&
			    l2arc_vdev_present(vd)) {
				l2arc_remove_vdev(vd);
			}
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
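
/*
 * Read a packed nvlist from the given MOS object. The object's bonus buffer
 * holds the packed size; the object data holds the packed bytes, which are
 * unpacked into '*value'.
 */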
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;

	case SPA_LOG_CLEAR:
		(void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
		    DS_FIND_CHILDREN);
		break;
	}
	spa->spa_log_state = SPA_LOG_GOOD;
	return (0);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;
	uint64_t autoreplace = 0;
	char *ereport = FM_EREPORT_ZFS_POOL;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs. We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if (hostid != 0 && myhostid != 0 &&
			    (unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_l2cache(spa);
		spa_config_exit(spa, FTAG);
	}

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices. We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			spa->spa_last_open_failed = B_FALSE;
		}
	}

	spa_open_ref(spa, tag);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare. If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	if (spa->spa_l2cache.sav_count == 0)
		return;

	spa_config_enter(spa, RW_READER, FTAG);

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Get the pool's configuration (including error count, spare, and l2cache
 * information) and its alternate root, if any.
 */
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
		spa_add_l2cache(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the auxiliary device array is well formed. We must have an
 * array of nvlists, each of which describes a valid leaf vdev. If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices.
		 */
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

/*
 * Validate both the spare and l2cache device arrays in the given vdev tree.
 */
static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

/*
 * Merge the given array of auxiliary device nvlists into 'sav' under the
 * given config name, concatenating with any existing list.
 */
static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_mode & FWRITE &&
		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
		    l2arc_vdev_present(vd)) {
			l2arc_remove_vdev(vd);
		}
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_l2cache(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bplist object. Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	if (props)
		spa_sync_props(spa, props, CRED(), tx);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);
2047 */ 2048 txg_wait_synced(spa->spa_dsl_pool, txg); 2049 2050 spa_config_sync(spa, B_FALSE, B_TRUE); 2051 2052 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2053 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2054 2055 mutex_exit(&spa_namespace_lock); 2056 2057 spa->spa_minref = refcount_count(&spa->spa_refcount); 2058 2059 return (0); 2060 } 2061 2062 /* 2063 * Import the given pool into the system. We set up the necessary spa_t and 2064 * then call spa_load() to do the dirty work. 2065 */ 2066 static int 2067 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, 2068 boolean_t isroot, boolean_t allowfaulted) 2069 { 2070 spa_t *spa; 2071 char *altroot = NULL; 2072 int error, loaderr; 2073 nvlist_t *nvroot; 2074 nvlist_t **spares, **l2cache; 2075 uint_t nspares, nl2cache; 2076 2077 /* 2078 * If a pool with this name exists, return failure. 2079 */ 2080 mutex_enter(&spa_namespace_lock); 2081 if (spa_lookup(pool) != NULL) { 2082 mutex_exit(&spa_namespace_lock); 2083 return (EEXIST); 2084 } 2085 2086 /* 2087 * Create and initialize the spa structure. 2088 */ 2089 (void) nvlist_lookup_string(props, 2090 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2091 spa = spa_add(pool, altroot); 2092 spa_activate(spa); 2093 2094 if (allowfaulted) 2095 spa->spa_import_faulted = B_TRUE; 2096 spa->spa_is_root = isroot; 2097 2098 /* 2099 * Pass off the heavy lifting to spa_load(). 2100 * Pass TRUE for mosconfig (unless this is a root pool) because 2101 * the user-supplied config is actually the one to trust when 2102 * doing an import. 2103 */ 2104 loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); 2105 2106 spa_config_enter(spa, RW_WRITER, FTAG); 2107 /* 2108 * Toss any existing sparelist, as it doesn't have any validity anymore, 2109 * and conflicts with spa_has_spare(). 2110 */ 2111 if (!isroot && spa->spa_spares.sav_config) { 2112 nvlist_free(spa->spa_spares.sav_config); 2113 spa->spa_spares.sav_config = NULL; 2114 spa_load_spares(spa); 2115 } 2116 if (!isroot && spa->spa_l2cache.sav_config) { 2117 nvlist_free(spa->spa_l2cache.sav_config); 2118 spa->spa_l2cache.sav_config = NULL; 2119 spa_load_l2cache(spa); 2120 } 2121 2122 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2123 &nvroot) == 0); 2124 if (error == 0) 2125 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); 2126 if (error == 0) 2127 error = spa_validate_aux(spa, nvroot, -1ULL, 2128 VDEV_ALLOC_L2CACHE); 2129 spa_config_exit(spa, FTAG); 2130 2131 if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 2132 if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { 2133 /* 2134 * If we failed to load the pool, but 'allowfaulted' is 2135 * set, then manually set the config as if the config 2136 * passed in was specified in the cache file. 2137 */ 2138 error = 0; 2139 spa->spa_import_faulted = B_FALSE; 2140 if (spa->spa_config == NULL) { 2141 spa_config_enter(spa, RW_READER, FTAG); 2142 spa->spa_config = spa_config_generate(spa, 2143 NULL, -1ULL, B_TRUE); 2144 spa_config_exit(spa, FTAG); 2145 } 2146 spa_unload(spa); 2147 spa_deactivate(spa); 2148 spa_config_sync(spa, B_FALSE, B_TRUE); 2149 } else { 2150 spa_unload(spa); 2151 spa_deactivate(spa); 2152 spa_remove(spa); 2153 } 2154 mutex_exit(&spa_namespace_lock); 2155 return (error); 2156 } 2157 2158 /* 2159 * Override any spares and level 2 cache devices as specified by 2160 * the user, as these may have correct device names/devids, etc. 
2161 */ 2162 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2163 &spares, &nspares) == 0) { 2164 if (spa->spa_spares.sav_config) 2165 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2166 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2167 else 2168 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2169 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2170 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2171 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2172 spa_config_enter(spa, RW_WRITER, FTAG); 2173 spa_load_spares(spa); 2174 spa_config_exit(spa, FTAG); 2175 spa->spa_spares.sav_sync = B_TRUE; 2176 } 2177 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2178 &l2cache, &nl2cache) == 0) { 2179 if (spa->spa_l2cache.sav_config) 2180 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2181 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2182 else 2183 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2184 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2185 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2186 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2187 spa_config_enter(spa, RW_WRITER, FTAG); 2188 spa_load_l2cache(spa); 2189 spa_config_exit(spa, FTAG); 2190 spa->spa_l2cache.sav_sync = B_TRUE; 2191 } 2192 2193 if (spa_mode & FWRITE) { 2194 /* 2195 * Update the config cache to include the newly-imported pool. 2196 */ 2197 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); 2198 } 2199 2200 spa->spa_import_faulted = B_FALSE; 2201 mutex_exit(&spa_namespace_lock); 2202 2203 return (0); 2204 } 2205 2206 #ifdef _KERNEL 2207 /* 2208 * Build a "root" vdev for a top level vdev read in from a rootpool 2209 * device label. 2210 */ 2211 static void 2212 spa_build_rootpool_config(nvlist_t *config) 2213 { 2214 nvlist_t *nvtop, *nvroot; 2215 uint64_t pgid; 2216 2217 /* 2218 * Add this top-level vdev to the child array. 2219 */ 2220 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2221 == 0); 2222 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2223 == 0); 2224 2225 /* 2226 * Put this pool's top-level vdevs into a root vdev. 2227 */ 2228 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2229 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2230 == 0); 2231 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2232 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2233 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2234 &nvtop, 1) == 0); 2235 2236 /* 2237 * Replace the existing vdev_tree with the new root vdev in 2238 * this pool's configuration (remove the old, add the new). 2239 */ 2240 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2241 nvlist_free(nvroot); 2242 } 2243 2244 /* 2245 * Get the root pool information from the root disk, then import the root pool 2246 * during the system boot up time. 
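 *
 * Roughly, the boot-time flow implemented by the routines below is
 * (sketch only):
 *
 *	spa_import_rootpool(devpath, devid)
 *	    spa_get_rootconf()			read the label and, for a
 *						mirror, pick the child with
 *						the most recent txg
 *	    spa_build_rootpool_config()		wrap that vdev tree in a
 *						"root" vdev
 *	    spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE)
 *						import it as the root pool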
2247 */ 2248 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2249 2250 int 2251 spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2252 uint64_t *besttxg) 2253 { 2254 nvlist_t *config; 2255 uint64_t txg; 2256 int error; 2257 2258 if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) 2259 return (error); 2260 2261 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2262 2263 if (bestconf != NULL) 2264 *bestconf = config; 2265 else 2266 nvlist_free(config); 2267 *besttxg = txg; 2268 return (0); 2269 } 2270 2271 boolean_t 2272 spa_rootdev_validate(nvlist_t *nv) 2273 { 2274 uint64_t ival; 2275 2276 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2277 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2278 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2279 return (B_FALSE); 2280 2281 return (B_TRUE); 2282 } 2283 2284 2285 /* 2286 * Given the boot device's physical path or devid, check if the device 2287 * is in a valid state. If so, return the configuration from the vdev 2288 * label. 2289 */ 2290 int 2291 spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) 2292 { 2293 nvlist_t *conf = NULL; 2294 uint64_t txg = 0; 2295 nvlist_t *nvtop, **child; 2296 char *type; 2297 char *bootpath = NULL; 2298 uint_t children, c; 2299 char *tmp; 2300 int error; 2301 2302 if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) 2303 *tmp = '\0'; 2304 if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { 2305 cmn_err(CE_NOTE, "error reading device label"); 2306 return (error); 2307 } 2308 if (txg == 0) { 2309 cmn_err(CE_NOTE, "this device is detached"); 2310 nvlist_free(conf); 2311 return (EINVAL); 2312 } 2313 2314 VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, 2315 &nvtop) == 0); 2316 VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); 2317 2318 if (strcmp(type, VDEV_TYPE_DISK) == 0) { 2319 if (spa_rootdev_validate(nvtop)) { 2320 goto out; 2321 } else { 2322 nvlist_free(conf); 2323 return (EINVAL); 2324 } 2325 } 2326 2327 ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); 2328 2329 VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, 2330 &child, &children) == 0); 2331 2332 /* 2333 * Go thru vdevs in the mirror to see if the given device 2334 * has the most recent txg. Only the device with the most 2335 * recent txg has valid information and should be booted. 2336 */ 2337 for (c = 0; c < children; c++) { 2338 char *cdevid, *cpath; 2339 uint64_t tmptxg; 2340 2341 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, 2342 &cpath) != 0) 2343 return (EINVAL); 2344 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, 2345 &cdevid) != 0) 2346 return (EINVAL); 2347 if ((spa_check_rootconf(cpath, cdevid, NULL, 2348 &tmptxg) == 0) && (tmptxg > txg)) { 2349 txg = tmptxg; 2350 VERIFY(nvlist_lookup_string(child[c], 2351 ZPOOL_CONFIG_PATH, &bootpath) == 0); 2352 } 2353 } 2354 2355 /* Does the best device match the one we've booted from? */ 2356 if (bootpath) { 2357 cmn_err(CE_NOTE, "try booting from '%s'", bootpath); 2358 return (EINVAL); 2359 } 2360 out: 2361 *bestconf = conf; 2362 return (0); 2363 } 2364 2365 /* 2366 * Import a root pool. 2367 * 2368 * For x86. devpath_list will consist of devid and/or physpath name of 2369 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2370 * The GRUB "findroot" command will return the vdev we should boot. 
2371 * 2372 * For Sparc, devpath_list consists the physpath name of the booting device 2373 * no matter the rootpool is a single device pool or a mirrored pool. 2374 * e.g. 2375 * "/pci@1f,0/ide@d/disk@0,0:a" 2376 */ 2377 int 2378 spa_import_rootpool(char *devpath, char *devid) 2379 { 2380 nvlist_t *conf = NULL; 2381 char *pname; 2382 int error; 2383 2384 /* 2385 * Get the vdev pathname and configuation from the most 2386 * recently updated vdev (highest txg). 2387 */ 2388 if (error = spa_get_rootconf(devpath, devid, &conf)) 2389 goto msg_out; 2390 2391 /* 2392 * Add type "root" vdev to the config. 2393 */ 2394 spa_build_rootpool_config(conf); 2395 2396 VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); 2397 2398 /* 2399 * We specify 'allowfaulted' for this to be treated like spa_open() 2400 * instead of spa_import(). This prevents us from marking vdevs as 2401 * persistently unavailable, and generates FMA ereports as if it were a 2402 * pool open, not import. 2403 */ 2404 error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); 2405 if (error == EEXIST) 2406 error = 0; 2407 2408 nvlist_free(conf); 2409 return (error); 2410 2411 msg_out: 2412 cmn_err(CE_NOTE, "\n" 2413 " *************************************************** \n" 2414 " * This device is not bootable! * \n" 2415 " * It is either offlined or detached or faulted. * \n" 2416 " * Please try to boot from a different device. * \n" 2417 " *************************************************** "); 2418 2419 return (error); 2420 } 2421 #endif 2422 2423 /* 2424 * Import a non-root pool into the system. 2425 */ 2426 int 2427 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2428 { 2429 return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); 2430 } 2431 2432 int 2433 spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) 2434 { 2435 return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); 2436 } 2437 2438 2439 /* 2440 * This (illegal) pool name is used when temporarily importing a spa_t in order 2441 * to get the vdev stats associated with the imported devices. 2442 */ 2443 #define TRYIMPORT_NAME "$import" 2444 2445 nvlist_t * 2446 spa_tryimport(nvlist_t *tryconfig) 2447 { 2448 nvlist_t *config = NULL; 2449 char *poolname; 2450 spa_t *spa; 2451 uint64_t state; 2452 2453 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2454 return (NULL); 2455 2456 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2457 return (NULL); 2458 2459 /* 2460 * Create and initialize the spa structure. 2461 */ 2462 mutex_enter(&spa_namespace_lock); 2463 spa = spa_add(TRYIMPORT_NAME, NULL); 2464 spa_activate(spa); 2465 2466 /* 2467 * Pass off the heavy lifting to spa_load(). 2468 * Pass TRUE for mosconfig because the user-supplied config 2469 * is actually the one to trust when doing an import. 2470 */ 2471 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2472 2473 /* 2474 * If 'tryconfig' was at least parsable, return the current config. 
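 *
 * Besides the vdev tree itself, we copy back the pool name and state
 * from 'tryconfig', the last uberblock timestamp, the bootfs dataset
 * (if any), and the spare/l2cache lists.  A caller would typically do
 * something like this sketch (error handling omitted; 'label_config'
 * is a hypothetical nvlist assembled from device labels):
 *
 *	nvlist_t *config = spa_tryimport(label_config);
 *	if (config != NULL) {
 *		... display the would-be pool to the user ...
 *		nvlist_free(config);
 *	}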
2475 */ 2476 if (spa->spa_root_vdev != NULL) { 2477 spa_config_enter(spa, RW_READER, FTAG); 2478 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2479 spa_config_exit(spa, FTAG); 2480 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2481 poolname) == 0); 2482 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2483 state) == 0); 2484 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2485 spa->spa_uberblock.ub_timestamp) == 0); 2486 2487 /* 2488 * If the bootfs property exists on this pool then we 2489 * copy it out so that external consumers can tell which 2490 * pools are bootable. 2491 */ 2492 if (spa->spa_bootfs) { 2493 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2494 2495 /* 2496 * We have to play games with the name since the 2497 * pool was opened as TRYIMPORT_NAME. 2498 */ 2499 if (dsl_dsobj_to_dsname(spa->spa_name, 2500 spa->spa_bootfs, tmpname) == 0) { 2501 char *cp; 2502 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2503 2504 cp = strchr(tmpname, '/'); 2505 if (cp == NULL) { 2506 (void) strlcpy(dsname, tmpname, 2507 MAXPATHLEN); 2508 } else { 2509 (void) snprintf(dsname, MAXPATHLEN, 2510 "%s/%s", poolname, ++cp); 2511 } 2512 VERIFY(nvlist_add_string(config, 2513 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2514 kmem_free(dsname, MAXPATHLEN); 2515 } 2516 kmem_free(tmpname, MAXPATHLEN); 2517 } 2518 2519 /* 2520 * Add the list of hot spares and level 2 cache devices. 2521 */ 2522 spa_add_spares(spa, config); 2523 spa_add_l2cache(spa, config); 2524 } 2525 2526 spa_unload(spa); 2527 spa_deactivate(spa); 2528 spa_remove(spa); 2529 mutex_exit(&spa_namespace_lock); 2530 2531 return (config); 2532 } 2533 2534 /* 2535 * Pool export/destroy 2536 * 2537 * The act of destroying or exporting a pool is very simple. We make sure there 2538 * is no more pending I/O and any references to the pool are gone. Then, we 2539 * update the pool state and sync all the labels to disk, removing the 2540 * configuration from the cache afterwards. 2541 */ 2542 static int 2543 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2544 boolean_t force) 2545 { 2546 spa_t *spa; 2547 2548 if (oldconfig) 2549 *oldconfig = NULL; 2550 2551 if (!(spa_mode & FWRITE)) 2552 return (EROFS); 2553 2554 mutex_enter(&spa_namespace_lock); 2555 if ((spa = spa_lookup(pool)) == NULL) { 2556 mutex_exit(&spa_namespace_lock); 2557 return (ENOENT); 2558 } 2559 2560 /* 2561 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2562 * reacquire the namespace lock, and see if we can export. 2563 */ 2564 spa_open_ref(spa, FTAG); 2565 mutex_exit(&spa_namespace_lock); 2566 spa_async_suspend(spa); 2567 mutex_enter(&spa_namespace_lock); 2568 spa_close(spa, FTAG); 2569 2570 /* 2571 * The pool will be in core if it's openable, 2572 * in which case we can modify its state. 2573 */ 2574 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2575 /* 2576 * Objsets may be open only because they're dirty, so we 2577 * have to force it to sync before checking spa_refcnt. 2578 */ 2579 txg_wait_synced(spa->spa_dsl_pool, 0); 2580 2581 /* 2582 * A pool cannot be exported or destroyed if there are active 2583 * references. If we are resetting a pool, allow references by 2584 * fault injection handlers. 
2585 */ 2586 if (!spa_refcount_zero(spa) || 2587 (spa->spa_inject_ref != 0 && 2588 new_state != POOL_STATE_UNINITIALIZED)) { 2589 spa_async_resume(spa); 2590 mutex_exit(&spa_namespace_lock); 2591 return (EBUSY); 2592 } 2593 2594 /* 2595 * A pool cannot be exported if it has an active shared spare. 2596 * This is to prevent other pools from stealing the active spare 2597 * from an exported pool. The user may still force the export of 2598 * such a pool. 2599 */ 2600 if (!force && new_state == POOL_STATE_EXPORTED && 2601 spa_has_active_shared_spare(spa)) { 2602 spa_async_resume(spa); 2603 mutex_exit(&spa_namespace_lock); 2604 return (EXDEV); 2605 } 2606 2607 /* 2608 * We want this to be reflected on every label, 2609 * so mark them all dirty. spa_unload() will do the 2610 * final sync that pushes these changes out. 2611 */ 2612 if (new_state != POOL_STATE_UNINITIALIZED) { 2613 spa_config_enter(spa, RW_WRITER, FTAG); 2614 spa->spa_state = new_state; 2615 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2616 vdev_config_dirty(spa->spa_root_vdev); 2617 spa_config_exit(spa, FTAG); 2618 } 2619 } 2620 2621 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2622 2623 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2624 spa_unload(spa); 2625 spa_deactivate(spa); 2626 } 2627 2628 if (oldconfig && spa->spa_config) 2629 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2630 2631 if (new_state != POOL_STATE_UNINITIALIZED) { 2632 spa_config_sync(spa, B_TRUE, B_TRUE); 2633 spa_remove(spa); 2634 } 2635 mutex_exit(&spa_namespace_lock); 2636 2637 return (0); 2638 } 2639 2640 /* 2641 * Destroy a storage pool. 2642 */ 2643 int 2644 spa_destroy(char *pool) 2645 { 2646 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); 2647 } 2648 2649 /* 2650 * Export a storage pool. 2651 */ 2652 int 2653 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) 2654 { 2655 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); 2656 } 2657 2658 /* 2659 * Similar to spa_export(), this unloads the spa_t without actually removing it 2660 * from the namespace in any way. 2661 */ 2662 int 2663 spa_reset(char *pool) 2664 { 2665 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2666 B_FALSE)); 2667 } 2668 2669 /* 2670 * ========================================================================== 2671 * Device manipulation 2672 * ========================================================================== 2673 */ 2674 2675 /* 2676 * Add a device to a storage pool.
2677 */ 2678 int 2679 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2680 { 2681 uint64_t txg; 2682 int c, error; 2683 vdev_t *rvd = spa->spa_root_vdev; 2684 vdev_t *vd, *tvd; 2685 nvlist_t **spares, **l2cache; 2686 uint_t nspares, nl2cache; 2687 2688 txg = spa_vdev_enter(spa); 2689 2690 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2691 VDEV_ALLOC_ADD)) != 0) 2692 return (spa_vdev_exit(spa, NULL, txg, error)); 2693 2694 spa->spa_pending_vdev = vd; 2695 2696 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2697 &nspares) != 0) 2698 nspares = 0; 2699 2700 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2701 &nl2cache) != 0) 2702 nl2cache = 0; 2703 2704 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) { 2705 spa->spa_pending_vdev = NULL; 2706 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2707 } 2708 2709 if (vd->vdev_children != 0) { 2710 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 2711 spa->spa_pending_vdev = NULL; 2712 return (spa_vdev_exit(spa, vd, txg, error)); 2713 } 2714 } 2715 2716 /* 2717 * We must validate the spares and l2cache devices after checking the 2718 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2719 */ 2720 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) { 2721 spa->spa_pending_vdev = NULL; 2722 return (spa_vdev_exit(spa, vd, txg, error)); 2723 } 2724 2725 spa->spa_pending_vdev = NULL; 2726 2727 /* 2728 * Transfer each new top-level vdev from vd to rvd. 2729 */ 2730 for (c = 0; c < vd->vdev_children; c++) { 2731 tvd = vd->vdev_child[c]; 2732 vdev_remove_child(vd, tvd); 2733 tvd->vdev_id = rvd->vdev_children; 2734 vdev_add_child(rvd, tvd); 2735 vdev_config_dirty(tvd); 2736 } 2737 2738 if (nspares != 0) { 2739 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2740 ZPOOL_CONFIG_SPARES); 2741 spa_load_spares(spa); 2742 spa->spa_spares.sav_sync = B_TRUE; 2743 } 2744 2745 if (nl2cache != 0) { 2746 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2747 ZPOOL_CONFIG_L2CACHE); 2748 spa_load_l2cache(spa); 2749 spa->spa_l2cache.sav_sync = B_TRUE; 2750 } 2751 2752 /* 2753 * We have to be careful when adding new vdevs to an existing pool. 2754 * If other threads start allocating from these vdevs before we 2755 * sync the config cache, and we lose power, then upon reboot we may 2756 * fail to open the pool because there are DVAs that the config cache 2757 * can't translate. Therefore, we first add the vdevs without 2758 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2759 * and then let spa_config_update() initialize the new metaslabs. 2760 * 2761 * spa_load() checks for added-but-not-initialized vdevs, so that 2762 * if we lose power at any point in this sequence, the remaining 2763 * steps will be completed the next time we load the pool. 2764 */ 2765 (void) spa_vdev_exit(spa, vd, txg, 0); 2766 2767 mutex_enter(&spa_namespace_lock); 2768 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2769 mutex_exit(&spa_namespace_lock); 2770 2771 return (0); 2772 } 2773 2774 /* 2775 * Attach a device to a mirror. The arguments are the path to any device 2776 * in the mirror, and the nvroot for the new device. If the path specifies 2777 * a device that is not mirrored, we automatically insert the mirror vdev. 
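 *
 * For example (sketch), attaching disk B to an unmirrored top-level
 * disk A transforms the tree
 *
 *	root				root
 *	  disk A		into	  mirror
 *					    disk A
 *					    disk B  (being resilvered)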
2778 * 2779 * If 'replacing' is specified, the new device is intended to replace the 2780 * existing device; in this case the two devices are made into their own 2781 * mirror using the 'replacing' vdev, which is functionally identical to 2782 * the mirror vdev (it actually reuses all the same ops) but has a few 2783 * extra rules: you can't attach to it after it's been created, and upon 2784 * completion of resilvering, the first disk (the one being replaced) 2785 * is automatically detached. 2786 */ 2787 int 2788 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2789 { 2790 uint64_t txg, open_txg; 2791 vdev_t *rvd = spa->spa_root_vdev; 2792 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2793 vdev_ops_t *pvops; 2794 dmu_tx_t *tx; 2795 char *oldvdpath, *newvdpath; 2796 int newvd_isspare; 2797 int error; 2798 2799 txg = spa_vdev_enter(spa); 2800 2801 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2802 2803 if (oldvd == NULL) 2804 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2805 2806 if (!oldvd->vdev_ops->vdev_op_leaf) 2807 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2808 2809 pvd = oldvd->vdev_parent; 2810 2811 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2812 VDEV_ALLOC_ADD)) != 0) 2813 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2814 2815 if (newrootvd->vdev_children != 1) 2816 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2817 2818 newvd = newrootvd->vdev_child[0]; 2819 2820 if (!newvd->vdev_ops->vdev_op_leaf) 2821 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2822 2823 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2824 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2825 2826 /* 2827 * Spares can't replace logs 2828 */ 2829 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 2830 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2831 2832 if (!replacing) { 2833 /* 2834 * For attach, the only allowable parent is a mirror or the root 2835 * vdev. 2836 */ 2837 if (pvd->vdev_ops != &vdev_mirror_ops && 2838 pvd->vdev_ops != &vdev_root_ops) 2839 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2840 2841 pvops = &vdev_mirror_ops; 2842 } else { 2843 /* 2844 * Active hot spares can only be replaced by inactive hot 2845 * spares. 2846 */ 2847 if (pvd->vdev_ops == &vdev_spare_ops && 2848 pvd->vdev_child[1] == oldvd && 2849 !spa_has_spare(spa, newvd->vdev_guid)) 2850 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2851 2852 /* 2853 * If the source is a hot spare, and the parent isn't already a 2854 * spare, then we want to create a new hot spare. Otherwise, we 2855 * want to create a replacing vdev. The user is not allowed to 2856 * attach to a spared vdev child unless the 'isspare' state is 2857 * the same (spare replaces spare, non-spare replaces 2858 * non-spare). 2859 */ 2860 if (pvd->vdev_ops == &vdev_replacing_ops) 2861 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2862 else if (pvd->vdev_ops == &vdev_spare_ops && 2863 newvd->vdev_isspare != oldvd->vdev_isspare) 2864 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2865 else if (pvd->vdev_ops != &vdev_spare_ops && 2866 newvd->vdev_isspare) 2867 pvops = &vdev_spare_ops; 2868 else 2869 pvops = &vdev_replacing_ops; 2870 } 2871 2872 /* 2873 * Compare the new device size with the replaceable/attachable 2874 * device size. 
2875 */ 2876 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2877 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2878 2879 /* 2880 * The new device cannot have a higher alignment requirement 2881 * than the top-level vdev. 2882 */ 2883 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2884 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2885 2886 /* 2887 * If this is an in-place replacement, update oldvd's path and devid 2888 * to make it distinguishable from newvd, and unopenable from now on. 2889 */ 2890 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2891 spa_strfree(oldvd->vdev_path); 2892 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2893 KM_SLEEP); 2894 (void) sprintf(oldvd->vdev_path, "%s/%s", 2895 newvd->vdev_path, "old"); 2896 if (oldvd->vdev_devid != NULL) { 2897 spa_strfree(oldvd->vdev_devid); 2898 oldvd->vdev_devid = NULL; 2899 } 2900 } 2901 2902 /* 2903 * If the parent is not a mirror, or if we're replacing, insert the new 2904 * mirror/replacing/spare vdev above oldvd. 2905 */ 2906 if (pvd->vdev_ops != pvops) 2907 pvd = vdev_add_parent(oldvd, pvops); 2908 2909 ASSERT(pvd->vdev_top->vdev_parent == rvd); 2910 ASSERT(pvd->vdev_ops == pvops); 2911 ASSERT(oldvd->vdev_parent == pvd); 2912 2913 /* 2914 * Extract the new device from its root and add it to pvd. 2915 */ 2916 vdev_remove_child(newrootvd, newvd); 2917 newvd->vdev_id = pvd->vdev_children; 2918 vdev_add_child(pvd, newvd); 2919 2920 /* 2921 * If newvd is smaller than oldvd, but larger than its rsize, 2922 * the addition of newvd may have decreased our parent's asize. 2923 */ 2924 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 2925 2926 tvd = newvd->vdev_top; 2927 ASSERT(pvd->vdev_top == tvd); 2928 ASSERT(tvd->vdev_parent == rvd); 2929 2930 vdev_config_dirty(tvd); 2931 2932 /* 2933 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2934 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2935 */ 2936 open_txg = txg + TXG_CONCURRENT_STATES - 1; 2937 2938 mutex_enter(&newvd->vdev_dtl_lock); 2939 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2940 open_txg - TXG_INITIAL + 1); 2941 mutex_exit(&newvd->vdev_dtl_lock); 2942 2943 if (newvd->vdev_isspare) 2944 spa_spare_activate(newvd); 2945 oldvdpath = spa_strdup(vdev_description(oldvd)); 2946 newvdpath = spa_strdup(vdev_description(newvd)); 2947 newvd_isspare = newvd->vdev_isspare; 2948 2949 /* 2950 * Mark newvd's DTL dirty in this txg. 2951 */ 2952 vdev_dirty(tvd, VDD_DTL, newvd, txg); 2953 2954 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2955 2956 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2957 if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 2958 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 2959 CRED(), "%s vdev=%s %s vdev=%s", 2960 replacing && newvd_isspare ? "spare in" : 2961 replacing ? "replace" : "attach", newvdpath, 2962 replacing ? "for" : "to", oldvdpath); 2963 dmu_tx_commit(tx); 2964 } else { 2965 dmu_tx_abort(tx); 2966 } 2967 2968 spa_strfree(oldvdpath); 2969 spa_strfree(newvdpath); 2970 2971 /* 2972 * Kick off a resilver to update newvd. 2973 */ 2974 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 2975 2976 return (0); 2977 } 2978 2979 /* 2980 * Detach a device from a mirror or replacing vdev. 2981 * If 'replace_done' is specified, only detach if the parent 2982 * is a replacing vdev. 
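 *
 * For example (sketch), spa_vdev_resilver_done() below uses
 *
 *	spa_vdev_detach(spa, guid, B_TRUE);
 *
 * so that a 'replacing' vdev whose resilver has completed collapses
 * back to a single disk, while an explicit user-requested detach would
 * pass replace_done as B_FALSE.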
2983 */ 2984 int 2985 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2986 { 2987 uint64_t txg; 2988 int c, t, error; 2989 vdev_t *rvd = spa->spa_root_vdev; 2990 vdev_t *vd, *pvd, *cvd, *tvd; 2991 boolean_t unspare = B_FALSE; 2992 uint64_t unspare_guid; 2993 size_t len; 2994 2995 txg = spa_vdev_enter(spa); 2996 2997 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 2998 2999 if (vd == NULL) 3000 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3001 3002 if (!vd->vdev_ops->vdev_op_leaf) 3003 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3004 3005 pvd = vd->vdev_parent; 3006 3007 /* 3008 * If replace_done is specified, only remove this device if it's 3009 * the first child of a replacing vdev. For the 'spare' vdev, either 3010 * disk can be removed. 3011 */ 3012 if (replace_done) { 3013 if (pvd->vdev_ops == &vdev_replacing_ops) { 3014 if (vd->vdev_id != 0) 3015 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3016 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3017 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3018 } 3019 } 3020 3021 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3022 spa_version(spa) >= SPA_VERSION_SPARES); 3023 3024 /* 3025 * Only mirror, replacing, and spare vdevs support detach. 3026 */ 3027 if (pvd->vdev_ops != &vdev_replacing_ops && 3028 pvd->vdev_ops != &vdev_mirror_ops && 3029 pvd->vdev_ops != &vdev_spare_ops) 3030 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3031 3032 /* 3033 * If there's only one replica, you can't detach it. 3034 */ 3035 if (pvd->vdev_children <= 1) 3036 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3037 3038 /* 3039 * If all siblings have non-empty DTLs, this device may have the only 3040 * valid copy of the data, which means we cannot safely detach it. 3041 * 3042 * XXX -- as in the vdev_offline() case, we really want a more 3043 * precise DTL check. 3044 */ 3045 for (c = 0; c < pvd->vdev_children; c++) { 3046 uint64_t dirty; 3047 3048 cvd = pvd->vdev_child[c]; 3049 if (cvd == vd) 3050 continue; 3051 if (vdev_is_dead(cvd)) 3052 continue; 3053 mutex_enter(&cvd->vdev_dtl_lock); 3054 dirty = cvd->vdev_dtl_map.sm_space | 3055 cvd->vdev_dtl_scrub.sm_space; 3056 mutex_exit(&cvd->vdev_dtl_lock); 3057 if (!dirty) 3058 break; 3059 } 3060 3061 /* 3062 * If we are a replacing or spare vdev, then we can always detach the 3063 * latter child, as that is how one cancels the operation. 3064 */ 3065 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 3066 c == pvd->vdev_children) 3067 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3068 3069 /* 3070 * If we are detaching the second disk from a replacing vdev, then 3071 * check to see if we changed the original vdev's path to have "/old" 3072 * at the end in spa_vdev_attach(). If so, undo that change now. 3073 */ 3074 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3075 pvd->vdev_child[0]->vdev_path != NULL && 3076 pvd->vdev_child[1]->vdev_path != NULL) { 3077 ASSERT(pvd->vdev_child[1] == vd); 3078 cvd = pvd->vdev_child[0]; 3079 len = strlen(vd->vdev_path); 3080 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3081 strcmp(cvd->vdev_path + len, "/old") == 0) { 3082 spa_strfree(cvd->vdev_path); 3083 cvd->vdev_path = spa_strdup(vd->vdev_path); 3084 } 3085 } 3086 3087 /* 3088 * If we are detaching the original disk from a spare, then it implies 3089 * that the spare should become a real disk, and be removed from the 3090 * active spare list for the pool. 
3091 */ 3092 if (pvd->vdev_ops == &vdev_spare_ops && 3093 vd->vdev_id == 0) 3094 unspare = B_TRUE; 3095 3096 /* 3097 * Erase the disk labels so the disk can be used for other things. 3098 * This must be done after all other error cases are handled, 3099 * but before we disembowel vd (so we can still do I/O to it). 3100 * But if we can't do it, don't treat the error as fatal -- 3101 * it may be that the unwritability of the disk is the reason 3102 * it's being detached! 3103 */ 3104 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3105 3106 /* 3107 * Remove vd from its parent and compact the parent's children. 3108 */ 3109 vdev_remove_child(pvd, vd); 3110 vdev_compact_children(pvd); 3111 3112 /* 3113 * Remember one of the remaining children so we can get tvd below. 3114 */ 3115 cvd = pvd->vdev_child[0]; 3116 3117 /* 3118 * If we need to remove the remaining child from the list of hot spares, 3119 * do it now, marking the vdev as no longer a spare in the process. We 3120 * must do this before vdev_remove_parent(), because that can change the 3121 * GUID if it creates a new toplevel GUID. 3122 */ 3123 if (unspare) { 3124 ASSERT(cvd->vdev_isspare); 3125 spa_spare_remove(cvd); 3126 unspare_guid = cvd->vdev_guid; 3127 } 3128 3129 /* 3130 * If the parent mirror/replacing vdev only has one child, 3131 * the parent is no longer needed. Remove it from the tree. 3132 */ 3133 if (pvd->vdev_children == 1) 3134 vdev_remove_parent(cvd); 3135 3136 /* 3137 * We don't set tvd until now because the parent we just removed 3138 * may have been the previous top-level vdev. 3139 */ 3140 tvd = cvd->vdev_top; 3141 ASSERT(tvd->vdev_parent == rvd); 3142 3143 /* 3144 * Reevaluate the parent vdev state. 3145 */ 3146 vdev_propagate_state(cvd); 3147 3148 /* 3149 * If the device we just detached was smaller than the others, it may be 3150 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3151 * can't fail because the existing metaslabs are already in core, so 3152 * there's nothing to read from disk. 3153 */ 3154 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3155 3156 vdev_config_dirty(tvd); 3157 3158 /* 3159 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3160 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3161 * But first make sure we're not on any *other* txg's DTL list, to 3162 * prevent vd from being accessed after it's freed. 3163 */ 3164 for (t = 0; t < TXG_SIZE; t++) 3165 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3166 vd->vdev_detached = B_TRUE; 3167 vdev_dirty(tvd, VDD_DTL, vd, txg); 3168 3169 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3170 3171 error = spa_vdev_exit(spa, vd, txg, 0); 3172 3173 /* 3174 * If this was the removal of the original device in a hot spare vdev, 3175 * then we want to go through and remove the device from the hot spare 3176 * list of every other pool. 3177 */ 3178 if (unspare) { 3179 spa = NULL; 3180 mutex_enter(&spa_namespace_lock); 3181 while ((spa = spa_next(spa)) != NULL) { 3182 if (spa->spa_state != POOL_STATE_ACTIVE) 3183 continue; 3184 3185 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3186 } 3187 mutex_exit(&spa_namespace_lock); 3188 } 3189 3190 return (error); 3191 } 3192 3193 /* 3194 * Remove a spares vdev from the nvlist config. 
3195 */ 3196 static int 3197 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare, 3198 nvlist_t **spares, int nspares, vdev_t *vd) 3199 { 3200 nvlist_t *nv, **newspares; 3201 int i, j; 3202 3203 nv = NULL; 3204 for (i = 0; i < nspares; i++) { 3205 uint64_t theguid; 3206 3207 VERIFY(nvlist_lookup_uint64(spares[i], 3208 ZPOOL_CONFIG_GUID, &theguid) == 0); 3209 if (theguid == guid) { 3210 nv = spares[i]; 3211 break; 3212 } 3213 } 3214 3215 /* 3216 * Only remove the hot spare if it's not currently in use in this pool. 3217 */ 3218 if (nv == NULL && vd == NULL) 3219 return (ENOENT); 3220 3221 if (nv == NULL && vd != NULL) 3222 return (ENOTSUP); 3223 3224 if (!unspare && nv != NULL && vd != NULL) 3225 return (EBUSY); 3226 3227 if (nspares == 1) { 3228 newspares = NULL; 3229 } else { 3230 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 3231 KM_SLEEP); 3232 for (i = 0, j = 0; i < nspares; i++) { 3233 if (spares[i] != nv) 3234 VERIFY(nvlist_dup(spares[i], 3235 &newspares[j++], KM_SLEEP) == 0); 3236 } 3237 } 3238 3239 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES, 3240 DATA_TYPE_NVLIST_ARRAY) == 0); 3241 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3242 ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0); 3243 for (i = 0; i < nspares - 1; i++) 3244 nvlist_free(newspares[i]); 3245 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 3246 3247 return (0); 3248 } 3249 3250 /* 3251 * Remove an l2cache vdev from the nvlist config. 3252 */ 3253 static int 3254 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache, 3255 int nl2cache, vdev_t *vd) 3256 { 3257 nvlist_t *nv, **newl2cache; 3258 int i, j; 3259 3260 nv = NULL; 3261 for (i = 0; i < nl2cache; i++) { 3262 uint64_t theguid; 3263 3264 VERIFY(nvlist_lookup_uint64(l2cache[i], 3265 ZPOOL_CONFIG_GUID, &theguid) == 0); 3266 if (theguid == guid) { 3267 nv = l2cache[i]; 3268 break; 3269 } 3270 } 3271 3272 if (vd == NULL) { 3273 for (i = 0; i < nl2cache; i++) { 3274 if (sav->sav_vdevs[i]->vdev_guid == guid) { 3275 vd = sav->sav_vdevs[i]; 3276 break; 3277 } 3278 } 3279 } 3280 3281 if (nv == NULL && vd == NULL) 3282 return (ENOENT); 3283 3284 if (nv == NULL && vd != NULL) 3285 return (ENOTSUP); 3286 3287 if (nl2cache == 1) { 3288 newl2cache = NULL; 3289 } else { 3290 newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *), 3291 KM_SLEEP); 3292 for (i = 0, j = 0; i < nl2cache; i++) { 3293 if (l2cache[i] != nv) 3294 VERIFY(nvlist_dup(l2cache[i], 3295 &newl2cache[j++], KM_SLEEP) == 0); 3296 } 3297 } 3298 3299 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 3300 DATA_TYPE_NVLIST_ARRAY) == 0); 3301 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3302 ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0); 3303 for (i = 0; i < nl2cache - 1; i++) 3304 nvlist_free(newl2cache[i]); 3305 kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *)); 3306 3307 return (0); 3308 } 3309 3310 /* 3311 * Remove a device from the pool. Currently, this supports removing only hot 3312 * spares and level 2 ARC devices. 
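 *
 * For example, when spa_vdev_detach() above strips an 'unspare' disk
 * out of every other active pool, it ends up calling
 *
 *	(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 *
 * once per pool, which removes the entry from that pool's spare list.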
3313 */ 3314 int 3315 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3316 { 3317 vdev_t *vd; 3318 nvlist_t **spares, **l2cache; 3319 uint_t nspares, nl2cache; 3320 int error = 0; 3321 3322 spa_config_enter(spa, RW_WRITER, FTAG); 3323 3324 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3325 3326 if (spa->spa_spares.sav_vdevs != NULL && 3327 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3328 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { 3329 if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare, 3330 spares, nspares, vd)) != 0) 3331 goto cache; 3332 spa_load_spares(spa); 3333 spa->spa_spares.sav_sync = B_TRUE; 3334 goto out; 3335 } 3336 3337 cache: 3338 if (spa->spa_l2cache.sav_vdevs != NULL && 3339 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3340 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { 3341 if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid, 3342 l2cache, nl2cache, vd)) != 0) 3343 goto out; 3344 spa_load_l2cache(spa); 3345 spa->spa_l2cache.sav_sync = B_TRUE; 3346 } 3347 3348 out: 3349 spa_config_exit(spa, FTAG); 3350 return (error); 3351 } 3352 3353 /* 3354 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3355 * current spared, so we can detach it. 3356 */ 3357 static vdev_t * 3358 spa_vdev_resilver_done_hunt(vdev_t *vd) 3359 { 3360 vdev_t *newvd, *oldvd; 3361 int c; 3362 3363 for (c = 0; c < vd->vdev_children; c++) { 3364 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3365 if (oldvd != NULL) 3366 return (oldvd); 3367 } 3368 3369 /* 3370 * Check for a completed replacement. 3371 */ 3372 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3373 oldvd = vd->vdev_child[0]; 3374 newvd = vd->vdev_child[1]; 3375 3376 mutex_enter(&newvd->vdev_dtl_lock); 3377 if (newvd->vdev_dtl_map.sm_space == 0 && 3378 newvd->vdev_dtl_scrub.sm_space == 0) { 3379 mutex_exit(&newvd->vdev_dtl_lock); 3380 return (oldvd); 3381 } 3382 mutex_exit(&newvd->vdev_dtl_lock); 3383 } 3384 3385 /* 3386 * Check for a completed resilver with the 'unspare' flag set. 3387 */ 3388 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3389 newvd = vd->vdev_child[0]; 3390 oldvd = vd->vdev_child[1]; 3391 3392 mutex_enter(&newvd->vdev_dtl_lock); 3393 if (newvd->vdev_unspare && 3394 newvd->vdev_dtl_map.sm_space == 0 && 3395 newvd->vdev_dtl_scrub.sm_space == 0) { 3396 newvd->vdev_unspare = 0; 3397 mutex_exit(&newvd->vdev_dtl_lock); 3398 return (oldvd); 3399 } 3400 mutex_exit(&newvd->vdev_dtl_lock); 3401 } 3402 3403 return (NULL); 3404 } 3405 3406 static void 3407 spa_vdev_resilver_done(spa_t *spa) 3408 { 3409 vdev_t *vd; 3410 vdev_t *pvd; 3411 uint64_t guid; 3412 uint64_t pguid = 0; 3413 3414 spa_config_enter(spa, RW_READER, FTAG); 3415 3416 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3417 guid = vd->vdev_guid; 3418 /* 3419 * If we have just finished replacing a hot spared device, then 3420 * we need to detach the parent's first child (the original hot 3421 * spare) as well. 
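 *
 * Sketch of the tree in that case: the failed disk had been spared and
 * then replaced, so once the new disk finishes resilvering we detach
 * both the failed disk and the borrowed spare:
 *
 *	spare				new disk
 *	  replacing		-->	(the hot spare goes back to
 *	    failed disk			 the available spare list)
 *	    new disk
 *	  hot spare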
3422 */ 3423 pvd = vd->vdev_parent; 3424 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3425 pvd->vdev_id == 0) { 3426 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3427 ASSERT(pvd->vdev_parent->vdev_children == 2); 3428 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 3429 } 3430 spa_config_exit(spa, FTAG); 3431 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 3432 return; 3433 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 3434 return; 3435 spa_config_enter(spa, RW_READER, FTAG); 3436 } 3437 3438 spa_config_exit(spa, FTAG); 3439 } 3440 3441 /* 3442 * Update the stored path for this vdev. Dirty the vdev configuration, relying 3443 * on spa_vdev_enter/exit() to synchronize the labels and cache. 3444 */ 3445 int 3446 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3447 { 3448 vdev_t *vd; 3449 uint64_t txg; 3450 3451 txg = spa_vdev_enter(spa); 3452 3453 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { 3454 /* 3455 * Determine if this is a reference to a hot spare device. If 3456 * it is, update the path manually as there is no associated 3457 * vdev_t that can be synced to disk. 3458 */ 3459 nvlist_t **spares; 3460 uint_t i, nspares; 3461 3462 if (spa->spa_spares.sav_config != NULL) { 3463 VERIFY(nvlist_lookup_nvlist_array( 3464 spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 3465 &spares, &nspares) == 0); 3466 for (i = 0; i < nspares; i++) { 3467 uint64_t theguid; 3468 VERIFY(nvlist_lookup_uint64(spares[i], 3469 ZPOOL_CONFIG_GUID, &theguid) == 0); 3470 if (theguid == guid) { 3471 VERIFY(nvlist_add_string(spares[i], 3472 ZPOOL_CONFIG_PATH, newpath) == 0); 3473 spa_load_spares(spa); 3474 spa->spa_spares.sav_sync = B_TRUE; 3475 return (spa_vdev_exit(spa, NULL, txg, 3476 0)); 3477 } 3478 } 3479 } 3480 3481 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3482 } 3483 3484 if (!vd->vdev_ops->vdev_op_leaf) 3485 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3486 3487 spa_strfree(vd->vdev_path); 3488 vd->vdev_path = spa_strdup(newpath); 3489 3490 vdev_config_dirty(vd->vdev_top); 3491 3492 return (spa_vdev_exit(spa, NULL, txg, 0)); 3493 } 3494 3495 /* 3496 * ========================================================================== 3497 * SPA Scrubbing 3498 * ========================================================================== 3499 */ 3500 3501 int 3502 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3503 { 3504 ASSERT(!spa_config_held(spa, RW_WRITER)); 3505 3506 if ((uint_t)type >= POOL_SCRUB_TYPES) 3507 return (ENOTSUP); 3508 3509 /* 3510 * If a resilver was requested, but there is no DTL on a 3511 * writeable leaf device, we have nothing to do. 
3512 */ 3513 if (type == POOL_SCRUB_RESILVER && 3514 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3515 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3516 return (0); 3517 } 3518 3519 if (type == POOL_SCRUB_EVERYTHING && 3520 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3521 spa->spa_dsl_pool->dp_scrub_isresilver) 3522 return (EBUSY); 3523 3524 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3525 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3526 } else if (type == POOL_SCRUB_NONE) { 3527 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3528 } else { 3529 return (EINVAL); 3530 } 3531 } 3532 3533 /* 3534 * ========================================================================== 3535 * SPA async task processing 3536 * ========================================================================== 3537 */ 3538 3539 static void 3540 spa_async_remove(spa_t *spa, vdev_t *vd) 3541 { 3542 int c; 3543 3544 if (vd->vdev_remove_wanted) { 3545 vd->vdev_remove_wanted = 0; 3546 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3547 vdev_clear(spa, vd, B_TRUE); 3548 vdev_config_dirty(vd->vdev_top); 3549 } 3550 3551 for (c = 0; c < vd->vdev_children; c++) 3552 spa_async_remove(spa, vd->vdev_child[c]); 3553 } 3554 3555 static void 3556 spa_async_thread(spa_t *spa) 3557 { 3558 int tasks, i; 3559 uint64_t txg; 3560 3561 ASSERT(spa->spa_sync_on); 3562 3563 mutex_enter(&spa->spa_async_lock); 3564 tasks = spa->spa_async_tasks; 3565 spa->spa_async_tasks = 0; 3566 mutex_exit(&spa->spa_async_lock); 3567 3568 /* 3569 * See if the config needs to be updated. 3570 */ 3571 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3572 mutex_enter(&spa_namespace_lock); 3573 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3574 mutex_exit(&spa_namespace_lock); 3575 } 3576 3577 /* 3578 * See if any devices need to be marked REMOVED. 3579 * 3580 * XXX - We avoid doing this when we are in 3581 * I/O failure state since spa_vdev_enter() grabs 3582 * the namespace lock and would not be able to obtain 3583 * the writer config lock. 3584 */ 3585 if (tasks & SPA_ASYNC_REMOVE && 3586 spa_state(spa) != POOL_STATE_IO_FAILURE) { 3587 txg = spa_vdev_enter(spa); 3588 spa_async_remove(spa, spa->spa_root_vdev); 3589 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 3590 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3591 for (i = 0; i < spa->spa_spares.sav_count; i++) 3592 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3593 (void) spa_vdev_exit(spa, NULL, txg, 0); 3594 } 3595 3596 /* 3597 * If any devices are done replacing, detach them. 3598 */ 3599 if (tasks & SPA_ASYNC_RESILVER_DONE) 3600 spa_vdev_resilver_done(spa); 3601 3602 /* 3603 * Kick off a resilver. 3604 */ 3605 if (tasks & SPA_ASYNC_RESILVER) 3606 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3607 3608 /* 3609 * Let the world know that we're done. 
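 *
 * (The request side of this machinery is just spa_async_request(),
 * e.g. spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE) from
 * spa_sync_props(), plus spa_async_dispatch(), which spa_sync() calls
 * at the end of every txg; spa_async_suspend() below waits on
 * spa_async_cv for exactly this broadcast.)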
3610 */ 3611 mutex_enter(&spa->spa_async_lock); 3612 spa->spa_async_thread = NULL; 3613 cv_broadcast(&spa->spa_async_cv); 3614 mutex_exit(&spa->spa_async_lock); 3615 thread_exit(); 3616 } 3617 3618 void 3619 spa_async_suspend(spa_t *spa) 3620 { 3621 mutex_enter(&spa->spa_async_lock); 3622 spa->spa_async_suspended++; 3623 while (spa->spa_async_thread != NULL) 3624 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3625 mutex_exit(&spa->spa_async_lock); 3626 } 3627 3628 void 3629 spa_async_resume(spa_t *spa) 3630 { 3631 mutex_enter(&spa->spa_async_lock); 3632 ASSERT(spa->spa_async_suspended != 0); 3633 spa->spa_async_suspended--; 3634 mutex_exit(&spa->spa_async_lock); 3635 } 3636 3637 static void 3638 spa_async_dispatch(spa_t *spa) 3639 { 3640 mutex_enter(&spa->spa_async_lock); 3641 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3642 spa->spa_async_thread == NULL && 3643 rootdir != NULL && !vn_is_readonly(rootdir)) 3644 spa->spa_async_thread = thread_create(NULL, 0, 3645 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3646 mutex_exit(&spa->spa_async_lock); 3647 } 3648 3649 void 3650 spa_async_request(spa_t *spa, int task) 3651 { 3652 mutex_enter(&spa->spa_async_lock); 3653 spa->spa_async_tasks |= task; 3654 mutex_exit(&spa->spa_async_lock); 3655 } 3656 3657 /* 3658 * ========================================================================== 3659 * SPA syncing routines 3660 * ========================================================================== 3661 */ 3662 3663 static void 3664 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3665 { 3666 bplist_t *bpl = &spa->spa_sync_bplist; 3667 dmu_tx_t *tx; 3668 blkptr_t blk; 3669 uint64_t itor = 0; 3670 zio_t *zio; 3671 int error; 3672 uint8_t c = 1; 3673 3674 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3675 3676 while (bplist_iterate(bpl, &itor, &blk) == 0) 3677 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3678 3679 error = zio_wait(zio); 3680 ASSERT3U(error, ==, 0); 3681 3682 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3683 bplist_vacate(bpl, tx); 3684 3685 /* 3686 * Pre-dirty the first block so we sync to convergence faster. 3687 * (Usually only the first block is needed.) 3688 */ 3689 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3690 dmu_tx_commit(tx); 3691 } 3692 3693 static void 3694 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3695 { 3696 char *packed = NULL; 3697 size_t bufsize; 3698 size_t nvsize = 0; 3699 dmu_buf_t *db; 3700 3701 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3702 3703 /* 3704 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3705 * information. This avoids the dbuf_will_dirty() path and 3706 * saves us a pre-read to get data we don't actually care about. 
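 *
 * Worked example (assuming SPA_CONFIG_BLOCKSIZE is 16K): a packed
 * nvlist of nvsize = 6000 bytes gives
 *
 *	bufsize = P2ROUNDUP(6000, SPA_CONFIG_BLOCKSIZE) = 16384
 *
 * so we write one full block, zero-filled from offset 6000 to the end,
 * and record the real nvsize in the object's bonus buffer below.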
3707 */ 3708 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3709 packed = kmem_alloc(bufsize, KM_SLEEP); 3710 3711 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3712 KM_SLEEP) == 0); 3713 bzero(packed + nvsize, bufsize - nvsize); 3714 3715 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3716 3717 kmem_free(packed, bufsize); 3718 3719 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3720 dmu_buf_will_dirty(db, tx); 3721 *(uint64_t *)db->db_data = nvsize; 3722 dmu_buf_rele(db, FTAG); 3723 } 3724 3725 static void 3726 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3727 const char *config, const char *entry) 3728 { 3729 nvlist_t *nvroot; 3730 nvlist_t **list; 3731 int i; 3732 3733 if (!sav->sav_sync) 3734 return; 3735 3736 /* 3737 * Update the MOS nvlist describing the list of available devices. 3738 * spa_validate_aux() will have already made sure this nvlist is 3739 * valid and the vdevs are labeled appropriately. 3740 */ 3741 if (sav->sav_object == 0) { 3742 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3743 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3744 sizeof (uint64_t), tx); 3745 VERIFY(zap_update(spa->spa_meta_objset, 3746 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3747 &sav->sav_object, tx) == 0); 3748 } 3749 3750 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3751 if (sav->sav_count == 0) { 3752 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3753 } else { 3754 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3755 for (i = 0; i < sav->sav_count; i++) 3756 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3757 B_FALSE, B_FALSE, B_TRUE); 3758 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3759 sav->sav_count) == 0); 3760 for (i = 0; i < sav->sav_count; i++) 3761 nvlist_free(list[i]); 3762 kmem_free(list, sav->sav_count * sizeof (void *)); 3763 } 3764 3765 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3766 nvlist_free(nvroot); 3767 3768 sav->sav_sync = B_FALSE; 3769 } 3770 3771 static void 3772 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3773 { 3774 nvlist_t *config; 3775 3776 if (list_is_empty(&spa->spa_dirty_list)) 3777 return; 3778 3779 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 3780 3781 if (spa->spa_config_syncing) 3782 nvlist_free(spa->spa_config_syncing); 3783 spa->spa_config_syncing = config; 3784 3785 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3786 } 3787 3788 /* 3789 * Set zpool properties. 3790 */ 3791 static void 3792 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3793 { 3794 spa_t *spa = arg1; 3795 objset_t *mos = spa->spa_meta_objset; 3796 nvlist_t *nvp = arg2; 3797 nvpair_t *elem; 3798 uint64_t intval; 3799 char *strval; 3800 zpool_prop_t prop; 3801 const char *propname; 3802 zprop_type_t proptype; 3803 spa_config_dirent_t *dp; 3804 3805 elem = NULL; 3806 while ((elem = nvlist_next_nvpair(nvp, elem))) { 3807 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 3808 case ZPOOL_PROP_VERSION: 3809 /* 3810 * Only set version for non-zpool-creation cases 3811 * (set/import). spa_create() needs special care 3812 * for version setting. 
3813 */ 3814 if (tx->tx_txg != TXG_INITIAL) { 3815 VERIFY(nvpair_value_uint64(elem, 3816 &intval) == 0); 3817 ASSERT(intval <= SPA_VERSION); 3818 ASSERT(intval >= spa_version(spa)); 3819 spa->spa_uberblock.ub_version = intval; 3820 vdev_config_dirty(spa->spa_root_vdev); 3821 } 3822 break; 3823 3824 case ZPOOL_PROP_ALTROOT: 3825 /* 3826 * 'altroot' is a non-persistent property. It should 3827 * have been set temporarily at creation or import time. 3828 */ 3829 ASSERT(spa->spa_root != NULL); 3830 break; 3831 3832 case ZPOOL_PROP_CACHEFILE: 3833 /* 3834 * 'cachefile' is a non-persistent property, but note 3835 * an async request that the config cache needs to be 3836 * udpated. 3837 */ 3838 VERIFY(nvpair_value_string(elem, &strval) == 0); 3839 3840 dp = kmem_alloc(sizeof (spa_config_dirent_t), 3841 KM_SLEEP); 3842 3843 if (strval[0] == '\0') 3844 dp->scd_path = spa_strdup(spa_config_path); 3845 else if (strcmp(strval, "none") == 0) 3846 dp->scd_path = NULL; 3847 else 3848 dp->scd_path = spa_strdup(strval); 3849 3850 list_insert_head(&spa->spa_config_list, dp); 3851 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3852 break; 3853 default: 3854 /* 3855 * Set pool property values in the poolprops mos object. 3856 */ 3857 mutex_enter(&spa->spa_props_lock); 3858 if (spa->spa_pool_props_object == 0) { 3859 objset_t *mos = spa->spa_meta_objset; 3860 3861 VERIFY((spa->spa_pool_props_object = 3862 zap_create(mos, DMU_OT_POOL_PROPS, 3863 DMU_OT_NONE, 0, tx)) > 0); 3864 3865 VERIFY(zap_update(mos, 3866 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 3867 8, 1, &spa->spa_pool_props_object, tx) 3868 == 0); 3869 } 3870 mutex_exit(&spa->spa_props_lock); 3871 3872 /* normalize the property name */ 3873 propname = zpool_prop_to_name(prop); 3874 proptype = zpool_prop_get_type(prop); 3875 3876 if (nvpair_type(elem) == DATA_TYPE_STRING) { 3877 ASSERT(proptype == PROP_TYPE_STRING); 3878 VERIFY(nvpair_value_string(elem, &strval) == 0); 3879 VERIFY(zap_update(mos, 3880 spa->spa_pool_props_object, propname, 3881 1, strlen(strval) + 1, strval, tx) == 0); 3882 3883 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 3884 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 3885 3886 if (proptype == PROP_TYPE_INDEX) { 3887 const char *unused; 3888 VERIFY(zpool_prop_index_to_string( 3889 prop, intval, &unused) == 0); 3890 } 3891 VERIFY(zap_update(mos, 3892 spa->spa_pool_props_object, propname, 3893 8, 1, &intval, tx) == 0); 3894 } else { 3895 ASSERT(0); /* not allowed */ 3896 } 3897 3898 switch (prop) { 3899 case ZPOOL_PROP_DELEGATION: 3900 spa->spa_delegation = intval; 3901 break; 3902 case ZPOOL_PROP_BOOTFS: 3903 spa->spa_bootfs = intval; 3904 break; 3905 case ZPOOL_PROP_FAILUREMODE: 3906 spa->spa_failmode = intval; 3907 break; 3908 default: 3909 break; 3910 } 3911 } 3912 3913 /* log internal history if this is not a zpool create */ 3914 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 3915 tx->tx_txg != TXG_INITIAL) { 3916 spa_history_internal_log(LOG_POOL_PROPSET, 3917 spa, tx, cr, "%s %lld %s", 3918 nvpair_name(elem), intval, spa->spa_name); 3919 } 3920 } 3921 } 3922 3923 /* 3924 * Sync the specified transaction group. New blocks may be dirtied as 3925 * part of the process, so we iterate until it converges. 
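 *
 * spa_sync() is driven once per txg by the pool's sync thread (started
 * with txg_sync_start(), e.g. in spa_create() above).  A sketch of how
 * the convergence loop below typically behaves:
 *
 *	pass 1: sync the dirty datasets, config, and vdev metadata
 *	pass 2: sync whatever pass 1 itself dirtied (MOS, space maps, ...)
 *	pass n: stop once a pass leaves no dirty vdevs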
3926 */ 3927 void 3928 spa_sync(spa_t *spa, uint64_t txg) 3929 { 3930 dsl_pool_t *dp = spa->spa_dsl_pool; 3931 objset_t *mos = spa->spa_meta_objset; 3932 bplist_t *bpl = &spa->spa_sync_bplist; 3933 vdev_t *rvd = spa->spa_root_vdev; 3934 vdev_t *vd; 3935 dmu_tx_t *tx; 3936 int dirty_vdevs; 3937 3938 /* 3939 * Lock out configuration changes. 3940 */ 3941 spa_config_enter(spa, RW_READER, FTAG); 3942 3943 spa->spa_syncing_txg = txg; 3944 spa->spa_sync_pass = 0; 3945 3946 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3947 3948 tx = dmu_tx_create_assigned(dp, txg); 3949 3950 /* 3951 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 3952 * set spa_deflate if we have no raid-z vdevs. 3953 */ 3954 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 3955 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 3956 int i; 3957 3958 for (i = 0; i < rvd->vdev_children; i++) { 3959 vd = rvd->vdev_child[i]; 3960 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 3961 break; 3962 } 3963 if (i == rvd->vdev_children) { 3964 spa->spa_deflate = TRUE; 3965 VERIFY(0 == zap_add(spa->spa_meta_objset, 3966 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3967 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 3968 } 3969 } 3970 3971 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 3972 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 3973 dsl_pool_create_origin(dp, tx); 3974 3975 /* Keeping the origin open increases spa_minref */ 3976 spa->spa_minref += 3; 3977 } 3978 3979 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 3980 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 3981 dsl_pool_upgrade_clones(dp, tx); 3982 } 3983 3984 /* 3985 * If anything has changed in this txg, push the deferred frees 3986 * from the previous txg. If not, leave them alone so that we 3987 * don't generate work on an otherwise idle system. 3988 */ 3989 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 3990 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 3991 !txg_list_empty(&dp->dp_sync_tasks, txg)) 3992 spa_sync_deferred_frees(spa, txg); 3993 3994 /* 3995 * Iterate to convergence. 3996 */ 3997 do { 3998 spa->spa_sync_pass++; 3999 4000 spa_sync_config_object(spa, tx); 4001 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 4002 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4003 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4004 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4005 spa_errlog_sync(spa, txg); 4006 dsl_pool_sync(dp, txg); 4007 4008 dirty_vdevs = 0; 4009 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 4010 vdev_sync(vd, txg); 4011 dirty_vdevs++; 4012 } 4013 4014 bplist_sync(bpl, tx); 4015 } while (dirty_vdevs); 4016 4017 bplist_close(bpl); 4018 4019 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 4020 4021 /* 4022 * Rewrite the vdev configuration (which includes the uberblock) 4023 * to commit the transaction group. 4024 * 4025 * If there are no dirty vdevs, we sync the uberblock to a few 4026 * random top-level vdevs that are known to be visible in the 4027 * config cache (see spa_vdev_add() for details). If there *are* 4028 * dirty vdevs -- or if the sync to our random subset fails -- 4029 * then sync the uberblock to all vdevs. 
	 */
	if (list_is_empty(&spa->spa_dirty_list)) {
		vdev_t *svd[SPA_DVAS_PER_BP];
		int svdcount = 0;
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
				continue;
			svd[svdcount++] = vd;
			if (svdcount == SPA_DVAS_PER_BP)
				break;
		}
		vdev_config_sync(svd, svdcount, txg);
	} else {
		vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_traverse_wanted = B_TRUE;
	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = B_FALSE;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during
 * the sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Look up a vdev in this pool by guid.  If 'l2cache' is set, also search
 * the pool's L2ARC (cache) devices.  Returns NULL if no match is found.
 */
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (l2cache) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

/*
 * Upgrade the on-disk SPA version to the given value, dirty the vdev
 * config, and wait for the change to sync out.
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't
	 * be possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

/*
 * Returns B_TRUE if 'guid' is one of this pool's configured or pending
 * hot spares.
 */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2: one as a spare and
 * one as a replacing device.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one
 * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
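 *
 * A typical call looks like the following (hypothetical example; the real
 * call sites pass one of the ESC_ZFS_* names from eventdefs.h, such as
 * ESC_ZFS_VDEV_REMOVE):
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);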
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t		*ev;
	sysevent_attr_list_t	*attr = NULL;
	sysevent_value_t	value;
	sysevent_id_t		eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}