/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and
 * syncing a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

int zio_taskq_threads = 8;

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size = spa_get_space(spa);
	uint64_t used = spa_get_alloc(spa);
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	/*
	 * readonly properties
	 */
	spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name, 0, src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src);

	cap = (size == 0) ? 0 : (used * 100 / size);
	spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
	spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
	    spa->spa_root_vdev->vdev_state, src);

	/*
	 * settable properties that are not stored in the pool property object.
	 */
	version = spa_version(spa);
	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	mutex_enter(&spa->spa_props_lock);
	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
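 * (The bootfs value, for instance, is checked against the pool's vdev
 * layout and the dataset's compression setting, and is replaced by the
 * dataset's object number before spa_sync_props() ever sees it.)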
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_USER | DS_MODE_READONLY, &os))
					break;

				/* We don't support gzip bootable datasets */
				if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_close(os);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
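 * This allocates the in-core state needed before the pool can be loaded or
 * created: the metaslab classes, the per-I/O-type issue/intr taskqs, the
 * dirty vdev and zio lists, and the error-list AVL trees.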
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));
	list_create(&spa->spa_zio_list, sizeof (zio_t),
	    offsetof(zio_t, zio_link_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);
	list_destroy(&spa->spa_zio_list);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state. This will prep the pool before
 * open/creation/import. All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
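 * This stops the async tasks and the txg sync thread, drains any outstanding
 * prefetch I/O, drops the L2ARC devices, closes the DSL pool, and frees the
 * root vdev tree along with the spare and l2cache vdev state.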
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares
 * for this pool. When this is called, we have some form of basic information
 * in 'spa_spares.sav_config'. We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool. When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid, size;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd)) {
				size = vdev_get_rsize(vd);
				l2arc_add_vdev(spa, vd,
				    VDEV_LABEL_START_SIZE,
				    size - VDEV_LABEL_START_SIZE);
			}
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_mode & FWRITE &&
			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL &&
			    l2arc_vdev_present(vd)) {
				l2arc_remove_vdev(vd);
			}
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;

	case SPA_LOG_CLEAR:
		(void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
		    DS_FIND_CHILDREN);
		break;
	}
	spa->spa_log_state = SPA_LOG_GOOD;
	return (0);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;
	uint64_t autoreplace = 0;
	char *ereport = FM_EREPORT_ZFS_POOL;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs. We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
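	 * vdev_uberblock_load() examines every uberblock in every vdev label
	 * and keeps the newest one (highest transaction group) that passes
	 * its checksum.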
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if (hostid != 0 && myhostid != 0 &&
			    (unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_l2cache(spa);
		spa_config_exit(spa, FTAG);
	}

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}


	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices. We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			spa->spa_last_open_failed = B_FALSE;
		}
	}

	spa_open_ref(spa, tag);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare. If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	if (spa->spa_l2cache.sav_count == 0)
		return;

	spa_config_enter(spa, RW_READER, FTAG);

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */

		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}

	spa_config_exit(spa, FTAG);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
		spa_add_l2cache(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the auxiliary device array is well formed. We must have an
 * array of nvlists, each which describes a valid leaf vdev. If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices.
		 */
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_mode & FWRITE &&
		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
		    l2arc_vdev_present(vd)) {
			l2arc_remove_vdev(vd);
		}
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_l2cache(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bplist object. Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	if (props)
		spa_sync_props(spa, props, CRED(), tx);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
2047 */ 2048 txg_wait_synced(spa->spa_dsl_pool, txg); 2049 2050 spa_config_sync(spa, B_FALSE, B_TRUE); 2051 2052 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2053 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2054 2055 mutex_exit(&spa_namespace_lock); 2056 2057 spa->spa_minref = refcount_count(&spa->spa_refcount); 2058 2059 return (0); 2060 } 2061 2062 /* 2063 * Import the given pool into the system. We set up the necessary spa_t and 2064 * then call spa_load() to do the dirty work. 2065 */ 2066 static int 2067 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, 2068 boolean_t isroot, boolean_t allowfaulted) 2069 { 2070 spa_t *spa; 2071 char *altroot = NULL; 2072 int error, loaderr; 2073 nvlist_t *nvroot; 2074 nvlist_t **spares, **l2cache; 2075 uint_t nspares, nl2cache; 2076 2077 /* 2078 * If a pool with this name exists, return failure. 2079 */ 2080 mutex_enter(&spa_namespace_lock); 2081 if (spa_lookup(pool) != NULL) { 2082 mutex_exit(&spa_namespace_lock); 2083 return (EEXIST); 2084 } 2085 2086 /* 2087 * Create and initialize the spa structure. 2088 */ 2089 (void) nvlist_lookup_string(props, 2090 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2091 spa = spa_add(pool, altroot); 2092 spa_activate(spa); 2093 2094 if (allowfaulted) 2095 spa->spa_import_faulted = B_TRUE; 2096 spa->spa_is_root = isroot; 2097 2098 /* 2099 * Pass off the heavy lifting to spa_load(). 2100 * Pass TRUE for mosconfig (unless this is a root pool) because 2101 * the user-supplied config is actually the one to trust when 2102 * doing an import. 2103 */ 2104 loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); 2105 2106 spa_config_enter(spa, RW_WRITER, FTAG); 2107 /* 2108 * Toss any existing sparelist, as it doesn't have any validity anymore, 2109 * and conflicts with spa_has_spare(). 2110 */ 2111 if (!isroot && spa->spa_spares.sav_config) { 2112 nvlist_free(spa->spa_spares.sav_config); 2113 spa->spa_spares.sav_config = NULL; 2114 spa_load_spares(spa); 2115 } 2116 if (!isroot && spa->spa_l2cache.sav_config) { 2117 nvlist_free(spa->spa_l2cache.sav_config); 2118 spa->spa_l2cache.sav_config = NULL; 2119 spa_load_l2cache(spa); 2120 } 2121 2122 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2123 &nvroot) == 0); 2124 if (error == 0) 2125 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); 2126 if (error == 0) 2127 error = spa_validate_aux(spa, nvroot, -1ULL, 2128 VDEV_ALLOC_L2CACHE); 2129 spa_config_exit(spa, FTAG); 2130 2131 if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 2132 if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { 2133 /* 2134 * If we failed to load the pool, but 'allowfaulted' is 2135 * set, then manually set the config as if the config 2136 * passed in was specified in the cache file. 2137 */ 2138 error = 0; 2139 spa->spa_import_faulted = B_FALSE; 2140 if (spa->spa_config == NULL) { 2141 spa_config_enter(spa, RW_READER, FTAG); 2142 spa->spa_config = spa_config_generate(spa, 2143 NULL, -1ULL, B_TRUE); 2144 spa_config_exit(spa, FTAG); 2145 } 2146 spa_unload(spa); 2147 spa_deactivate(spa); 2148 spa_config_sync(spa, B_FALSE, B_TRUE); 2149 } else { 2150 spa_unload(spa); 2151 spa_deactivate(spa); 2152 spa_remove(spa); 2153 } 2154 mutex_exit(&spa_namespace_lock); 2155 return (error); 2156 } 2157 2158 /* 2159 * Override any spares and level 2 cache devices as specified by 2160 * the user, as these may have correct device names/devids, etc. 
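 *
 * (The lists recovered from the on-disk labels may still carry the
 * paths and devids the pool had on the host it was last exported
 * from; the nvlists supplied by the importer describe where those
 * devices live now, so they take precedence.)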
2161 */ 2162 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2163 &spares, &nspares) == 0) { 2164 if (spa->spa_spares.sav_config) 2165 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2166 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2167 else 2168 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2169 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2170 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2171 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2172 spa_config_enter(spa, RW_WRITER, FTAG); 2173 spa_load_spares(spa); 2174 spa_config_exit(spa, FTAG); 2175 spa->spa_spares.sav_sync = B_TRUE; 2176 } 2177 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2178 &l2cache, &nl2cache) == 0) { 2179 if (spa->spa_l2cache.sav_config) 2180 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2181 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2182 else 2183 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2184 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2185 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2186 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2187 spa_config_enter(spa, RW_WRITER, FTAG); 2188 spa_load_l2cache(spa); 2189 spa_config_exit(spa, FTAG); 2190 spa->spa_l2cache.sav_sync = B_TRUE; 2191 } 2192 2193 if (spa_mode & FWRITE) { 2194 /* 2195 * Update the config cache to include the newly-imported pool. 2196 */ 2197 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); 2198 } 2199 2200 spa->spa_import_faulted = B_FALSE; 2201 mutex_exit(&spa_namespace_lock); 2202 2203 return (0); 2204 } 2205 2206 #ifdef _KERNEL 2207 /* 2208 * Build a "root" vdev for a top level vdev read in from a rootpool 2209 * device label. 2210 */ 2211 static void 2212 spa_build_rootpool_config(nvlist_t *config) 2213 { 2214 nvlist_t *nvtop, *nvroot; 2215 uint64_t pgid; 2216 2217 /* 2218 * Add this top-level vdev to the child array. 2219 */ 2220 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2221 == 0); 2222 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2223 == 0); 2224 2225 /* 2226 * Put this pool's top-level vdevs into a root vdev. 2227 */ 2228 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2229 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2230 == 0); 2231 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2232 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2233 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2234 &nvtop, 1) == 0); 2235 2236 /* 2237 * Replace the existing vdev_tree with the new root vdev in 2238 * this pool's configuration (remove the old, add the new). 2239 */ 2240 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2241 nvlist_free(nvroot); 2242 } 2243 2244 /* 2245 * Get the root pool information from the root disk, then import the root pool 2246 * during the system boot up time. 
2247 */ 2248 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2249 2250 int 2251 spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2252 uint64_t *besttxg) 2253 { 2254 nvlist_t *config; 2255 uint64_t txg; 2256 int error; 2257 2258 if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) 2259 return (error); 2260 2261 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2262 2263 if (bestconf != NULL) 2264 *bestconf = config; 2265 else 2266 nvlist_free(config); 2267 *besttxg = txg; 2268 return (0); 2269 } 2270 2271 boolean_t 2272 spa_rootdev_validate(nvlist_t *nv) 2273 { 2274 uint64_t ival; 2275 2276 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2277 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2278 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2279 return (B_FALSE); 2280 2281 return (B_TRUE); 2282 } 2283 2284 2285 /* 2286 * Given the boot device's physical path or devid, check if the device 2287 * is in a valid state. If so, return the configuration from the vdev 2288 * label. 2289 */ 2290 int 2291 spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) 2292 { 2293 nvlist_t *conf = NULL; 2294 uint64_t txg = 0; 2295 nvlist_t *nvtop, **child; 2296 char *type; 2297 char *bootpath = NULL; 2298 uint_t children, c; 2299 char *tmp; 2300 int error; 2301 2302 if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) 2303 *tmp = '\0'; 2304 if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { 2305 cmn_err(CE_NOTE, "error reading device label"); 2306 return (error); 2307 } 2308 if (txg == 0) { 2309 cmn_err(CE_NOTE, "this device is detached"); 2310 nvlist_free(conf); 2311 return (EINVAL); 2312 } 2313 2314 VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, 2315 &nvtop) == 0); 2316 VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); 2317 2318 if (strcmp(type, VDEV_TYPE_DISK) == 0) { 2319 if (spa_rootdev_validate(nvtop)) { 2320 goto out; 2321 } else { 2322 nvlist_free(conf); 2323 return (EINVAL); 2324 } 2325 } 2326 2327 ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); 2328 2329 VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, 2330 &child, &children) == 0); 2331 2332 /* 2333 * Go thru vdevs in the mirror to see if the given device 2334 * has the most recent txg. Only the device with the most 2335 * recent txg has valid information and should be booted. 2336 */ 2337 for (c = 0; c < children; c++) { 2338 char *cdevid, *cpath; 2339 uint64_t tmptxg; 2340 2341 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, 2342 &cpath) != 0) 2343 return (EINVAL); 2344 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, 2345 &cdevid) != 0) 2346 return (EINVAL); 2347 if (error = spa_check_rootconf(cpath, cdevid, NULL, &tmptxg)) 2348 return (error); 2349 if (tmptxg > txg) { 2350 txg = tmptxg; 2351 VERIFY(nvlist_lookup_string(child[c], 2352 ZPOOL_CONFIG_PATH, &bootpath) == 0); 2353 } 2354 } 2355 2356 /* Does the best device match the one we've booted from? */ 2357 if (bootpath) { 2358 cmn_err(CE_NOTE, "try booting from '%s'", bootpath); 2359 return (EINVAL); 2360 } 2361 out: 2362 *bestconf = conf; 2363 return (0); 2364 } 2365 2366 /* 2367 * Import a root pool. 2368 * 2369 * For x86. devpath_list will consist of devid and/or physpath name of 2370 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2371 * The GRUB "findroot" command will return the vdev we should boot. 
2372  *
2373  * For Sparc, devpath_list consists of the physpath name of the booting device
2374  * regardless of whether the root pool is a single-device pool or a mirrored pool.
2375  * e.g.
2376  *	"/pci@1f,0/ide@d/disk@0,0:a"
2377  */
2378 int
2379 spa_import_rootpool(char *devpath, char *devid)
2380 {
2381 	nvlist_t *conf = NULL;
2382 	char *pname;
2383 	int error;
2384 
2385 	/*
2386 	 * Get the vdev pathname and configuration from the most
2387 	 * recently updated vdev (highest txg).
2388 	 */
2389 	if (error = spa_get_rootconf(devpath, devid, &conf))
2390 		goto msg_out;
2391 
2392 	/*
2393 	 * Add type "root" vdev to the config.
2394 	 */
2395 	spa_build_rootpool_config(conf);
2396 
2397 	VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
2398 
2399 	/*
2400 	 * We specify 'allowfaulted' for this to be treated like spa_open()
2401 	 * instead of spa_import(). This prevents us from marking vdevs as
2402 	 * persistently unavailable, and generates FMA ereports as if it were a
2403 	 * pool open, not import.
2404 	 */
2405 	error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE);
2406 	if (error == EEXIST)
2407 		error = 0;
2408 
2409 	nvlist_free(conf);
2410 	return (error);
2411 
2412 msg_out:
2413 	cmn_err(CE_NOTE, "\n"
2414 	    " *************************************************** \n"
2415 	    " * This device is not bootable! * \n"
2416 	    " * It is either offlined or detached or faulted. * \n"
2417 	    " * Please try to boot from a different device. * \n"
2418 	    " *************************************************** ");
2419 
2420 	return (error);
2421 }
2422 #endif
2423 
2424 /*
2425  * Import a non-root pool into the system.
2426  */
2427 int
2428 spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2429 {
2430 	return (spa_import_common(pool, config, props, B_FALSE, B_FALSE));
2431 }
2432 
2433 int
2434 spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props)
2435 {
2436 	return (spa_import_common(pool, config, props, B_FALSE, B_TRUE));
2437 }
2438 
2439 
2440 /*
2441  * This (illegal) pool name is used when temporarily importing a spa_t in order
2442  * to get the vdev stats associated with the imported devices.
2443  */
2444 #define	TRYIMPORT_NAME	"$import"
2445 
2446 nvlist_t *
2447 spa_tryimport(nvlist_t *tryconfig)
2448 {
2449 	nvlist_t *config = NULL;
2450 	char *poolname;
2451 	spa_t *spa;
2452 	uint64_t state;
2453 
2454 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2455 		return (NULL);
2456 
2457 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2458 		return (NULL);
2459 
2460 	/*
2461 	 * Create and initialize the spa structure.
2462 	 */
2463 	mutex_enter(&spa_namespace_lock);
2464 	spa = spa_add(TRYIMPORT_NAME, NULL);
2465 	spa_activate(spa);
2466 
2467 	/*
2468 	 * Pass off the heavy lifting to spa_load().
2469 	 * Pass TRUE for mosconfig because the user-supplied config
2470 	 * is actually the one to trust when doing an import.
2471 	 */
2472 	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2473 
2474 	/*
2475 	 * If 'tryconfig' was at least parsable, return the current config.
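 *
 * (The usual consumer -- an assumption about usage, not enforced
 * here -- is "zpool import" scanning for importable pools: it wants
 * the pool's current config, status and aux devices without actually
 * importing anything.)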
2476 */ 2477 if (spa->spa_root_vdev != NULL) { 2478 spa_config_enter(spa, RW_READER, FTAG); 2479 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2480 spa_config_exit(spa, FTAG); 2481 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2482 poolname) == 0); 2483 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2484 state) == 0); 2485 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2486 spa->spa_uberblock.ub_timestamp) == 0); 2487 2488 /* 2489 * If the bootfs property exists on this pool then we 2490 * copy it out so that external consumers can tell which 2491 * pools are bootable. 2492 */ 2493 if (spa->spa_bootfs) { 2494 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2495 2496 /* 2497 * We have to play games with the name since the 2498 * pool was opened as TRYIMPORT_NAME. 2499 */ 2500 if (dsl_dsobj_to_dsname(spa->spa_name, 2501 spa->spa_bootfs, tmpname) == 0) { 2502 char *cp; 2503 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2504 2505 cp = strchr(tmpname, '/'); 2506 if (cp == NULL) { 2507 (void) strlcpy(dsname, tmpname, 2508 MAXPATHLEN); 2509 } else { 2510 (void) snprintf(dsname, MAXPATHLEN, 2511 "%s/%s", poolname, ++cp); 2512 } 2513 VERIFY(nvlist_add_string(config, 2514 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2515 kmem_free(dsname, MAXPATHLEN); 2516 } 2517 kmem_free(tmpname, MAXPATHLEN); 2518 } 2519 2520 /* 2521 * Add the list of hot spares and level 2 cache devices. 2522 */ 2523 spa_add_spares(spa, config); 2524 spa_add_l2cache(spa, config); 2525 } 2526 2527 spa_unload(spa); 2528 spa_deactivate(spa); 2529 spa_remove(spa); 2530 mutex_exit(&spa_namespace_lock); 2531 2532 return (config); 2533 } 2534 2535 /* 2536 * Pool export/destroy 2537 * 2538 * The act of destroying or exporting a pool is very simple. We make sure there 2539 * is no more pending I/O and any references to the pool are gone. Then, we 2540 * update the pool state and sync all the labels to disk, removing the 2541 * configuration from the cache afterwards. 2542 */ 2543 static int 2544 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2545 boolean_t force) 2546 { 2547 spa_t *spa; 2548 2549 if (oldconfig) 2550 *oldconfig = NULL; 2551 2552 if (!(spa_mode & FWRITE)) 2553 return (EROFS); 2554 2555 mutex_enter(&spa_namespace_lock); 2556 if ((spa = spa_lookup(pool)) == NULL) { 2557 mutex_exit(&spa_namespace_lock); 2558 return (ENOENT); 2559 } 2560 2561 /* 2562 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2563 * reacquire the namespace lock, and see if we can export. 2564 */ 2565 spa_open_ref(spa, FTAG); 2566 mutex_exit(&spa_namespace_lock); 2567 spa_async_suspend(spa); 2568 mutex_enter(&spa_namespace_lock); 2569 spa_close(spa, FTAG); 2570 2571 /* 2572 * The pool will be in core if it's openable, 2573 * in which case we can modify its state. 2574 */ 2575 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2576 /* 2577 * Objsets may be open only because they're dirty, so we 2578 * have to force it to sync before checking spa_refcnt. 2579 */ 2580 txg_wait_synced(spa->spa_dsl_pool, 0); 2581 2582 /* 2583 * A pool cannot be exported or destroyed if there are active 2584 * references. If we are resetting a pool, allow references by 2585 * fault injection handlers. 
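 *
 * (spa_inject_ref counts handles held by fault-injection consumers
 * such as zinject.  Resetting to POOL_STATE_UNINITIALIZED is the
 * spa_reset() path those consumers drive, so their references only
 * block a real export or destroy, never a reset.)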
2586 */ 2587 if (!spa_refcount_zero(spa) || 2588 (spa->spa_inject_ref != 0 && 2589 new_state != POOL_STATE_UNINITIALIZED)) { 2590 spa_async_resume(spa); 2591 mutex_exit(&spa_namespace_lock); 2592 return (EBUSY); 2593 } 2594 2595 /* 2596 * A pool cannot be exported if it has an active shared spare. 2597 * This is to prevent other pools stealing the active spare 2598 * from an exported pool. At user's own will, such pool can 2599 * be forcedly exported. 2600 */ 2601 if (!force && new_state == POOL_STATE_EXPORTED && 2602 spa_has_active_shared_spare(spa)) { 2603 spa_async_resume(spa); 2604 mutex_exit(&spa_namespace_lock); 2605 return (EXDEV); 2606 } 2607 2608 /* 2609 * We want this to be reflected on every label, 2610 * so mark them all dirty. spa_unload() will do the 2611 * final sync that pushes these changes out. 2612 */ 2613 if (new_state != POOL_STATE_UNINITIALIZED) { 2614 spa_config_enter(spa, RW_WRITER, FTAG); 2615 spa->spa_state = new_state; 2616 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2617 vdev_config_dirty(spa->spa_root_vdev); 2618 spa_config_exit(spa, FTAG); 2619 } 2620 } 2621 2622 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2623 2624 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2625 spa_unload(spa); 2626 spa_deactivate(spa); 2627 } 2628 2629 if (oldconfig && spa->spa_config) 2630 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2631 2632 if (new_state != POOL_STATE_UNINITIALIZED) { 2633 spa_config_sync(spa, B_TRUE, B_TRUE); 2634 spa_remove(spa); 2635 } 2636 mutex_exit(&spa_namespace_lock); 2637 2638 return (0); 2639 } 2640 2641 /* 2642 * Destroy a storage pool. 2643 */ 2644 int 2645 spa_destroy(char *pool) 2646 { 2647 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); 2648 } 2649 2650 /* 2651 * Export a storage pool. 2652 */ 2653 int 2654 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) 2655 { 2656 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); 2657 } 2658 2659 /* 2660 * Similar to spa_export(), this unloads the spa_t without actually removing it 2661 * from the namespace in any way. 2662 */ 2663 int 2664 spa_reset(char *pool) 2665 { 2666 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2667 B_FALSE)); 2668 } 2669 2670 /* 2671 * ========================================================================== 2672 * Device manipulation 2673 * ========================================================================== 2674 */ 2675 2676 /* 2677 * Add a device to a storage pool. 
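 *
 * The incoming nvroot may describe any mix of new top-level vdevs,
 * hot spares (ZPOOL_CONFIG_SPARES) and level 2 cache devices
 * (ZPOOL_CONFIG_L2CACHE); if all three are absent we return EINVAL
 * below.  A minimal l2cache-only add could be built like this
 * (illustrative sketch only; the device path is hypothetical):
 *
 *	nvlist_t *cache, *nvroot;
 *	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_alloc(&cache, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(cache, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(cache, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c2t0d0s0") == 0);
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 *	    &cache, 1) == 0);
 *	error = spa_vdev_add(spa, nvroot);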
2678 */ 2679 int 2680 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2681 { 2682 uint64_t txg; 2683 int c, error; 2684 vdev_t *rvd = spa->spa_root_vdev; 2685 vdev_t *vd, *tvd; 2686 nvlist_t **spares, **l2cache; 2687 uint_t nspares, nl2cache; 2688 2689 txg = spa_vdev_enter(spa); 2690 2691 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2692 VDEV_ALLOC_ADD)) != 0) 2693 return (spa_vdev_exit(spa, NULL, txg, error)); 2694 2695 spa->spa_pending_vdev = vd; 2696 2697 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2698 &nspares) != 0) 2699 nspares = 0; 2700 2701 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2702 &nl2cache) != 0) 2703 nl2cache = 0; 2704 2705 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) { 2706 spa->spa_pending_vdev = NULL; 2707 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2708 } 2709 2710 if (vd->vdev_children != 0) { 2711 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 2712 spa->spa_pending_vdev = NULL; 2713 return (spa_vdev_exit(spa, vd, txg, error)); 2714 } 2715 } 2716 2717 /* 2718 * We must validate the spares and l2cache devices after checking the 2719 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2720 */ 2721 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) { 2722 spa->spa_pending_vdev = NULL; 2723 return (spa_vdev_exit(spa, vd, txg, error)); 2724 } 2725 2726 spa->spa_pending_vdev = NULL; 2727 2728 /* 2729 * Transfer each new top-level vdev from vd to rvd. 2730 */ 2731 for (c = 0; c < vd->vdev_children; c++) { 2732 tvd = vd->vdev_child[c]; 2733 vdev_remove_child(vd, tvd); 2734 tvd->vdev_id = rvd->vdev_children; 2735 vdev_add_child(rvd, tvd); 2736 vdev_config_dirty(tvd); 2737 } 2738 2739 if (nspares != 0) { 2740 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2741 ZPOOL_CONFIG_SPARES); 2742 spa_load_spares(spa); 2743 spa->spa_spares.sav_sync = B_TRUE; 2744 } 2745 2746 if (nl2cache != 0) { 2747 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2748 ZPOOL_CONFIG_L2CACHE); 2749 spa_load_l2cache(spa); 2750 spa->spa_l2cache.sav_sync = B_TRUE; 2751 } 2752 2753 /* 2754 * We have to be careful when adding new vdevs to an existing pool. 2755 * If other threads start allocating from these vdevs before we 2756 * sync the config cache, and we lose power, then upon reboot we may 2757 * fail to open the pool because there are DVAs that the config cache 2758 * can't translate. Therefore, we first add the vdevs without 2759 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2760 * and then let spa_config_update() initialize the new metaslabs. 2761 * 2762 * spa_load() checks for added-but-not-initialized vdevs, so that 2763 * if we lose power at any point in this sequence, the remaining 2764 * steps will be completed the next time we load the pool. 2765 */ 2766 (void) spa_vdev_exit(spa, vd, txg, 0); 2767 2768 mutex_enter(&spa_namespace_lock); 2769 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2770 mutex_exit(&spa_namespace_lock); 2771 2772 return (0); 2773 } 2774 2775 /* 2776 * Attach a device to a mirror. The arguments are the path to any device 2777 * in the mirror, and the nvroot for the new device. If the path specifies 2778 * a device that is not mirrored, we automatically insert the mirror vdev. 
2779 * 2780 * If 'replacing' is specified, the new device is intended to replace the 2781 * existing device; in this case the two devices are made into their own 2782 * mirror using the 'replacing' vdev, which is functionally identical to 2783 * the mirror vdev (it actually reuses all the same ops) but has a few 2784 * extra rules: you can't attach to it after it's been created, and upon 2785 * completion of resilvering, the first disk (the one being replaced) 2786 * is automatically detached. 2787 */ 2788 int 2789 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2790 { 2791 uint64_t txg, open_txg; 2792 vdev_t *rvd = spa->spa_root_vdev; 2793 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2794 vdev_ops_t *pvops; 2795 dmu_tx_t *tx; 2796 char *oldvdpath, *newvdpath; 2797 int newvd_isspare; 2798 int error; 2799 2800 txg = spa_vdev_enter(spa); 2801 2802 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2803 2804 if (oldvd == NULL) 2805 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2806 2807 if (!oldvd->vdev_ops->vdev_op_leaf) 2808 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2809 2810 pvd = oldvd->vdev_parent; 2811 2812 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2813 VDEV_ALLOC_ADD)) != 0) 2814 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2815 2816 if (newrootvd->vdev_children != 1) 2817 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2818 2819 newvd = newrootvd->vdev_child[0]; 2820 2821 if (!newvd->vdev_ops->vdev_op_leaf) 2822 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2823 2824 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2825 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2826 2827 /* 2828 * Spares can't replace logs 2829 */ 2830 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 2831 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2832 2833 if (!replacing) { 2834 /* 2835 * For attach, the only allowable parent is a mirror or the root 2836 * vdev. 2837 */ 2838 if (pvd->vdev_ops != &vdev_mirror_ops && 2839 pvd->vdev_ops != &vdev_root_ops) 2840 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2841 2842 pvops = &vdev_mirror_ops; 2843 } else { 2844 /* 2845 * Active hot spares can only be replaced by inactive hot 2846 * spares. 2847 */ 2848 if (pvd->vdev_ops == &vdev_spare_ops && 2849 pvd->vdev_child[1] == oldvd && 2850 !spa_has_spare(spa, newvd->vdev_guid)) 2851 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2852 2853 /* 2854 * If the source is a hot spare, and the parent isn't already a 2855 * spare, then we want to create a new hot spare. Otherwise, we 2856 * want to create a replacing vdev. The user is not allowed to 2857 * attach to a spared vdev child unless the 'isspare' state is 2858 * the same (spare replaces spare, non-spare replaces 2859 * non-spare). 2860 */ 2861 if (pvd->vdev_ops == &vdev_replacing_ops) 2862 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2863 else if (pvd->vdev_ops == &vdev_spare_ops && 2864 newvd->vdev_isspare != oldvd->vdev_isspare) 2865 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2866 else if (pvd->vdev_ops != &vdev_spare_ops && 2867 newvd->vdev_isspare) 2868 pvops = &vdev_spare_ops; 2869 else 2870 pvops = &vdev_replacing_ops; 2871 } 2872 2873 /* 2874 * Compare the new device size with the replaceable/attachable 2875 * device size. 
2876 */ 2877 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2878 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2879 2880 /* 2881 * The new device cannot have a higher alignment requirement 2882 * than the top-level vdev. 2883 */ 2884 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2885 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2886 2887 /* 2888 * If this is an in-place replacement, update oldvd's path and devid 2889 * to make it distinguishable from newvd, and unopenable from now on. 2890 */ 2891 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2892 spa_strfree(oldvd->vdev_path); 2893 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2894 KM_SLEEP); 2895 (void) sprintf(oldvd->vdev_path, "%s/%s", 2896 newvd->vdev_path, "old"); 2897 if (oldvd->vdev_devid != NULL) { 2898 spa_strfree(oldvd->vdev_devid); 2899 oldvd->vdev_devid = NULL; 2900 } 2901 } 2902 2903 /* 2904 * If the parent is not a mirror, or if we're replacing, insert the new 2905 * mirror/replacing/spare vdev above oldvd. 2906 */ 2907 if (pvd->vdev_ops != pvops) 2908 pvd = vdev_add_parent(oldvd, pvops); 2909 2910 ASSERT(pvd->vdev_top->vdev_parent == rvd); 2911 ASSERT(pvd->vdev_ops == pvops); 2912 ASSERT(oldvd->vdev_parent == pvd); 2913 2914 /* 2915 * Extract the new device from its root and add it to pvd. 2916 */ 2917 vdev_remove_child(newrootvd, newvd); 2918 newvd->vdev_id = pvd->vdev_children; 2919 vdev_add_child(pvd, newvd); 2920 2921 /* 2922 * If newvd is smaller than oldvd, but larger than its rsize, 2923 * the addition of newvd may have decreased our parent's asize. 2924 */ 2925 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 2926 2927 tvd = newvd->vdev_top; 2928 ASSERT(pvd->vdev_top == tvd); 2929 ASSERT(tvd->vdev_parent == rvd); 2930 2931 vdev_config_dirty(tvd); 2932 2933 /* 2934 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2935 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2936 */ 2937 open_txg = txg + TXG_CONCURRENT_STATES - 1; 2938 2939 mutex_enter(&newvd->vdev_dtl_lock); 2940 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2941 open_txg - TXG_INITIAL + 1); 2942 mutex_exit(&newvd->vdev_dtl_lock); 2943 2944 if (newvd->vdev_isspare) 2945 spa_spare_activate(newvd); 2946 oldvdpath = spa_strdup(vdev_description(oldvd)); 2947 newvdpath = spa_strdup(vdev_description(newvd)); 2948 newvd_isspare = newvd->vdev_isspare; 2949 2950 /* 2951 * Mark newvd's DTL dirty in this txg. 2952 */ 2953 vdev_dirty(tvd, VDD_DTL, newvd, txg); 2954 2955 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2956 2957 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2958 if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 2959 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 2960 CRED(), "%s vdev=%s %s vdev=%s", 2961 replacing && newvd_isspare ? "spare in" : 2962 replacing ? "replace" : "attach", newvdpath, 2963 replacing ? "for" : "to", oldvdpath); 2964 dmu_tx_commit(tx); 2965 } else { 2966 dmu_tx_abort(tx); 2967 } 2968 2969 spa_strfree(oldvdpath); 2970 spa_strfree(newvdpath); 2971 2972 /* 2973 * Kick off a resilver to update newvd. 2974 */ 2975 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 2976 2977 return (0); 2978 } 2979 2980 /* 2981 * Detach a device from a mirror or replacing vdev. 2982 * If 'replace_done' is specified, only detach if the parent 2983 * is a replacing vdev. 
2984 */ 2985 int 2986 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2987 { 2988 uint64_t txg; 2989 int c, t, error; 2990 vdev_t *rvd = spa->spa_root_vdev; 2991 vdev_t *vd, *pvd, *cvd, *tvd; 2992 boolean_t unspare = B_FALSE; 2993 uint64_t unspare_guid; 2994 size_t len; 2995 2996 txg = spa_vdev_enter(spa); 2997 2998 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 2999 3000 if (vd == NULL) 3001 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3002 3003 if (!vd->vdev_ops->vdev_op_leaf) 3004 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3005 3006 pvd = vd->vdev_parent; 3007 3008 /* 3009 * If replace_done is specified, only remove this device if it's 3010 * the first child of a replacing vdev. For the 'spare' vdev, either 3011 * disk can be removed. 3012 */ 3013 if (replace_done) { 3014 if (pvd->vdev_ops == &vdev_replacing_ops) { 3015 if (vd->vdev_id != 0) 3016 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3017 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3018 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3019 } 3020 } 3021 3022 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3023 spa_version(spa) >= SPA_VERSION_SPARES); 3024 3025 /* 3026 * Only mirror, replacing, and spare vdevs support detach. 3027 */ 3028 if (pvd->vdev_ops != &vdev_replacing_ops && 3029 pvd->vdev_ops != &vdev_mirror_ops && 3030 pvd->vdev_ops != &vdev_spare_ops) 3031 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3032 3033 /* 3034 * If there's only one replica, you can't detach it. 3035 */ 3036 if (pvd->vdev_children <= 1) 3037 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3038 3039 /* 3040 * If all siblings have non-empty DTLs, this device may have the only 3041 * valid copy of the data, which means we cannot safely detach it. 3042 * 3043 * XXX -- as in the vdev_offline() case, we really want a more 3044 * precise DTL check. 3045 */ 3046 for (c = 0; c < pvd->vdev_children; c++) { 3047 uint64_t dirty; 3048 3049 cvd = pvd->vdev_child[c]; 3050 if (cvd == vd) 3051 continue; 3052 if (vdev_is_dead(cvd)) 3053 continue; 3054 mutex_enter(&cvd->vdev_dtl_lock); 3055 dirty = cvd->vdev_dtl_map.sm_space | 3056 cvd->vdev_dtl_scrub.sm_space; 3057 mutex_exit(&cvd->vdev_dtl_lock); 3058 if (!dirty) 3059 break; 3060 } 3061 3062 /* 3063 * If we are a replacing or spare vdev, then we can always detach the 3064 * latter child, as that is how one cancels the operation. 3065 */ 3066 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 3067 c == pvd->vdev_children) 3068 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3069 3070 /* 3071 * If we are detaching the second disk from a replacing vdev, then 3072 * check to see if we changed the original vdev's path to have "/old" 3073 * at the end in spa_vdev_attach(). If so, undo that change now. 3074 */ 3075 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3076 pvd->vdev_child[0]->vdev_path != NULL && 3077 pvd->vdev_child[1]->vdev_path != NULL) { 3078 ASSERT(pvd->vdev_child[1] == vd); 3079 cvd = pvd->vdev_child[0]; 3080 len = strlen(vd->vdev_path); 3081 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3082 strcmp(cvd->vdev_path + len, "/old") == 0) { 3083 spa_strfree(cvd->vdev_path); 3084 cvd->vdev_path = spa_strdup(vd->vdev_path); 3085 } 3086 } 3087 3088 /* 3089 * If we are detaching the original disk from a spare, then it implies 3090 * that the spare should become a real disk, and be removed from the 3091 * active spare list for the pool. 
3092 */ 3093 if (pvd->vdev_ops == &vdev_spare_ops && 3094 vd->vdev_id == 0) 3095 unspare = B_TRUE; 3096 3097 /* 3098 * Erase the disk labels so the disk can be used for other things. 3099 * This must be done after all other error cases are handled, 3100 * but before we disembowel vd (so we can still do I/O to it). 3101 * But if we can't do it, don't treat the error as fatal -- 3102 * it may be that the unwritability of the disk is the reason 3103 * it's being detached! 3104 */ 3105 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3106 3107 /* 3108 * Remove vd from its parent and compact the parent's children. 3109 */ 3110 vdev_remove_child(pvd, vd); 3111 vdev_compact_children(pvd); 3112 3113 /* 3114 * Remember one of the remaining children so we can get tvd below. 3115 */ 3116 cvd = pvd->vdev_child[0]; 3117 3118 /* 3119 * If we need to remove the remaining child from the list of hot spares, 3120 * do it now, marking the vdev as no longer a spare in the process. We 3121 * must do this before vdev_remove_parent(), because that can change the 3122 * GUID if it creates a new toplevel GUID. 3123 */ 3124 if (unspare) { 3125 ASSERT(cvd->vdev_isspare); 3126 spa_spare_remove(cvd); 3127 unspare_guid = cvd->vdev_guid; 3128 } 3129 3130 /* 3131 * If the parent mirror/replacing vdev only has one child, 3132 * the parent is no longer needed. Remove it from the tree. 3133 */ 3134 if (pvd->vdev_children == 1) 3135 vdev_remove_parent(cvd); 3136 3137 /* 3138 * We don't set tvd until now because the parent we just removed 3139 * may have been the previous top-level vdev. 3140 */ 3141 tvd = cvd->vdev_top; 3142 ASSERT(tvd->vdev_parent == rvd); 3143 3144 /* 3145 * Reevaluate the parent vdev state. 3146 */ 3147 vdev_propagate_state(cvd); 3148 3149 /* 3150 * If the device we just detached was smaller than the others, it may be 3151 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3152 * can't fail because the existing metaslabs are already in core, so 3153 * there's nothing to read from disk. 3154 */ 3155 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3156 3157 vdev_config_dirty(tvd); 3158 3159 /* 3160 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3161 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3162 * But first make sure we're not on any *other* txg's DTL list, to 3163 * prevent vd from being accessed after it's freed. 3164 */ 3165 for (t = 0; t < TXG_SIZE; t++) 3166 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3167 vd->vdev_detached = B_TRUE; 3168 vdev_dirty(tvd, VDD_DTL, vd, txg); 3169 3170 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3171 3172 error = spa_vdev_exit(spa, vd, txg, 0); 3173 3174 /* 3175 * If this was the removal of the original device in a hot spare vdev, 3176 * then we want to go through and remove the device from the hot spare 3177 * list of every other pool. 3178 */ 3179 if (unspare) { 3180 spa = NULL; 3181 mutex_enter(&spa_namespace_lock); 3182 while ((spa = spa_next(spa)) != NULL) { 3183 if (spa->spa_state != POOL_STATE_ACTIVE) 3184 continue; 3185 3186 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3187 } 3188 mutex_exit(&spa_namespace_lock); 3189 } 3190 3191 return (error); 3192 } 3193 3194 /* 3195 * Remove a spares vdev from the nvlist config. 
3196 */ 3197 static int 3198 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare, 3199 nvlist_t **spares, int nspares, vdev_t *vd) 3200 { 3201 nvlist_t *nv, **newspares; 3202 int i, j; 3203 3204 nv = NULL; 3205 for (i = 0; i < nspares; i++) { 3206 uint64_t theguid; 3207 3208 VERIFY(nvlist_lookup_uint64(spares[i], 3209 ZPOOL_CONFIG_GUID, &theguid) == 0); 3210 if (theguid == guid) { 3211 nv = spares[i]; 3212 break; 3213 } 3214 } 3215 3216 /* 3217 * Only remove the hot spare if it's not currently in use in this pool. 3218 */ 3219 if (nv == NULL && vd == NULL) 3220 return (ENOENT); 3221 3222 if (nv == NULL && vd != NULL) 3223 return (ENOTSUP); 3224 3225 if (!unspare && nv != NULL && vd != NULL) 3226 return (EBUSY); 3227 3228 if (nspares == 1) { 3229 newspares = NULL; 3230 } else { 3231 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 3232 KM_SLEEP); 3233 for (i = 0, j = 0; i < nspares; i++) { 3234 if (spares[i] != nv) 3235 VERIFY(nvlist_dup(spares[i], 3236 &newspares[j++], KM_SLEEP) == 0); 3237 } 3238 } 3239 3240 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES, 3241 DATA_TYPE_NVLIST_ARRAY) == 0); 3242 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3243 ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0); 3244 for (i = 0; i < nspares - 1; i++) 3245 nvlist_free(newspares[i]); 3246 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 3247 3248 return (0); 3249 } 3250 3251 /* 3252 * Remove an l2cache vdev from the nvlist config. 3253 */ 3254 static int 3255 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache, 3256 int nl2cache, vdev_t *vd) 3257 { 3258 nvlist_t *nv, **newl2cache; 3259 int i, j; 3260 3261 nv = NULL; 3262 for (i = 0; i < nl2cache; i++) { 3263 uint64_t theguid; 3264 3265 VERIFY(nvlist_lookup_uint64(l2cache[i], 3266 ZPOOL_CONFIG_GUID, &theguid) == 0); 3267 if (theguid == guid) { 3268 nv = l2cache[i]; 3269 break; 3270 } 3271 } 3272 3273 if (vd == NULL) { 3274 for (i = 0; i < nl2cache; i++) { 3275 if (sav->sav_vdevs[i]->vdev_guid == guid) { 3276 vd = sav->sav_vdevs[i]; 3277 break; 3278 } 3279 } 3280 } 3281 3282 if (nv == NULL && vd == NULL) 3283 return (ENOENT); 3284 3285 if (nv == NULL && vd != NULL) 3286 return (ENOTSUP); 3287 3288 if (nl2cache == 1) { 3289 newl2cache = NULL; 3290 } else { 3291 newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *), 3292 KM_SLEEP); 3293 for (i = 0, j = 0; i < nl2cache; i++) { 3294 if (l2cache[i] != nv) 3295 VERIFY(nvlist_dup(l2cache[i], 3296 &newl2cache[j++], KM_SLEEP) == 0); 3297 } 3298 } 3299 3300 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 3301 DATA_TYPE_NVLIST_ARRAY) == 0); 3302 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3303 ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0); 3304 for (i = 0; i < nl2cache - 1; i++) 3305 nvlist_free(newl2cache[i]); 3306 kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *)); 3307 3308 return (0); 3309 } 3310 3311 /* 3312 * Remove a device from the pool. Currently, this supports removing only hot 3313 * spares and level 2 ARC devices. 
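 *
 * The device is identified by guid.  Passing 'unspare' allows a hot
 * spare that is currently in use to be removed anyway; otherwise
 * spa_remove_spares() returns EBUSY for an active spare.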
3314 */ 3315 int 3316 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3317 { 3318 vdev_t *vd; 3319 nvlist_t **spares, **l2cache; 3320 uint_t nspares, nl2cache; 3321 int error = 0; 3322 3323 spa_config_enter(spa, RW_WRITER, FTAG); 3324 3325 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3326 3327 if (spa->spa_spares.sav_vdevs != NULL && 3328 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3329 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { 3330 if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare, 3331 spares, nspares, vd)) != 0) 3332 goto cache; 3333 spa_load_spares(spa); 3334 spa->spa_spares.sav_sync = B_TRUE; 3335 goto out; 3336 } 3337 3338 cache: 3339 if (spa->spa_l2cache.sav_vdevs != NULL && 3340 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3341 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { 3342 if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid, 3343 l2cache, nl2cache, vd)) != 0) 3344 goto out; 3345 spa_load_l2cache(spa); 3346 spa->spa_l2cache.sav_sync = B_TRUE; 3347 } 3348 3349 out: 3350 spa_config_exit(spa, FTAG); 3351 return (error); 3352 } 3353 3354 /* 3355 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3356 * current spared, so we can detach it. 3357 */ 3358 static vdev_t * 3359 spa_vdev_resilver_done_hunt(vdev_t *vd) 3360 { 3361 vdev_t *newvd, *oldvd; 3362 int c; 3363 3364 for (c = 0; c < vd->vdev_children; c++) { 3365 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3366 if (oldvd != NULL) 3367 return (oldvd); 3368 } 3369 3370 /* 3371 * Check for a completed replacement. 3372 */ 3373 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3374 oldvd = vd->vdev_child[0]; 3375 newvd = vd->vdev_child[1]; 3376 3377 mutex_enter(&newvd->vdev_dtl_lock); 3378 if (newvd->vdev_dtl_map.sm_space == 0 && 3379 newvd->vdev_dtl_scrub.sm_space == 0) { 3380 mutex_exit(&newvd->vdev_dtl_lock); 3381 return (oldvd); 3382 } 3383 mutex_exit(&newvd->vdev_dtl_lock); 3384 } 3385 3386 /* 3387 * Check for a completed resilver with the 'unspare' flag set. 3388 */ 3389 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3390 newvd = vd->vdev_child[0]; 3391 oldvd = vd->vdev_child[1]; 3392 3393 mutex_enter(&newvd->vdev_dtl_lock); 3394 if (newvd->vdev_unspare && 3395 newvd->vdev_dtl_map.sm_space == 0 && 3396 newvd->vdev_dtl_scrub.sm_space == 0) { 3397 newvd->vdev_unspare = 0; 3398 mutex_exit(&newvd->vdev_dtl_lock); 3399 return (oldvd); 3400 } 3401 mutex_exit(&newvd->vdev_dtl_lock); 3402 } 3403 3404 return (NULL); 3405 } 3406 3407 static void 3408 spa_vdev_resilver_done(spa_t *spa) 3409 { 3410 vdev_t *vd; 3411 vdev_t *pvd; 3412 uint64_t guid; 3413 uint64_t pguid = 0; 3414 3415 spa_config_enter(spa, RW_READER, FTAG); 3416 3417 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3418 guid = vd->vdev_guid; 3419 /* 3420 * If we have just finished replacing a hot spared device, then 3421 * we need to detach the parent's first child (the original hot 3422 * spare) as well. 
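 *
 * In that case the subtree looks like this (cf. the ASSERTs below):
 *
 *	            spare
 *	           /     \
 *	    replacing     hot spare disk   (child 1 -> pguid)
 *	     /     \
 *	  oldvd    newvd                   (oldvd == vd)
 *
 * Detaching oldvd collapses the replacing vdev; detaching pguid then
 * collapses the spare vdev, leaving newvd holding the slot outright.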
3423 */ 3424 pvd = vd->vdev_parent; 3425 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3426 pvd->vdev_id == 0) { 3427 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3428 ASSERT(pvd->vdev_parent->vdev_children == 2); 3429 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 3430 } 3431 spa_config_exit(spa, FTAG); 3432 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 3433 return; 3434 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 3435 return; 3436 spa_config_enter(spa, RW_READER, FTAG); 3437 } 3438 3439 spa_config_exit(spa, FTAG); 3440 } 3441 3442 /* 3443 * Update the stored path for this vdev. Dirty the vdev configuration, relying 3444 * on spa_vdev_enter/exit() to synchronize the labels and cache. 3445 */ 3446 int 3447 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3448 { 3449 vdev_t *vd; 3450 uint64_t txg; 3451 3452 txg = spa_vdev_enter(spa); 3453 3454 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { 3455 /* 3456 * Determine if this is a reference to a hot spare device. If 3457 * it is, update the path manually as there is no associated 3458 * vdev_t that can be synced to disk. 3459 */ 3460 nvlist_t **spares; 3461 uint_t i, nspares; 3462 3463 if (spa->spa_spares.sav_config != NULL) { 3464 VERIFY(nvlist_lookup_nvlist_array( 3465 spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 3466 &spares, &nspares) == 0); 3467 for (i = 0; i < nspares; i++) { 3468 uint64_t theguid; 3469 VERIFY(nvlist_lookup_uint64(spares[i], 3470 ZPOOL_CONFIG_GUID, &theguid) == 0); 3471 if (theguid == guid) { 3472 VERIFY(nvlist_add_string(spares[i], 3473 ZPOOL_CONFIG_PATH, newpath) == 0); 3474 spa_load_spares(spa); 3475 spa->spa_spares.sav_sync = B_TRUE; 3476 return (spa_vdev_exit(spa, NULL, txg, 3477 0)); 3478 } 3479 } 3480 } 3481 3482 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3483 } 3484 3485 if (!vd->vdev_ops->vdev_op_leaf) 3486 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3487 3488 spa_strfree(vd->vdev_path); 3489 vd->vdev_path = spa_strdup(newpath); 3490 3491 vdev_config_dirty(vd->vdev_top); 3492 3493 return (spa_vdev_exit(spa, NULL, txg, 0)); 3494 } 3495 3496 /* 3497 * ========================================================================== 3498 * SPA Scrubbing 3499 * ========================================================================== 3500 */ 3501 3502 int 3503 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3504 { 3505 ASSERT(!spa_config_held(spa, RW_WRITER)); 3506 3507 if ((uint_t)type >= POOL_SCRUB_TYPES) 3508 return (ENOTSUP); 3509 3510 /* 3511 * If a resilver was requested, but there is no DTL on a 3512 * writeable leaf device, we have nothing to do. 
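 *
 * (A DTL -- dirty time log -- is the per-vdev map of txgs for which
 * that vdev is known to be missing data; vdev_resilver_needed()
 * reports whether any writeable leaf still has one to repair.)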
3513 */ 3514 if (type == POOL_SCRUB_RESILVER && 3515 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3516 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3517 return (0); 3518 } 3519 3520 if (type == POOL_SCRUB_EVERYTHING && 3521 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3522 spa->spa_dsl_pool->dp_scrub_isresilver) 3523 return (EBUSY); 3524 3525 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3526 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3527 } else if (type == POOL_SCRUB_NONE) { 3528 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3529 } else { 3530 return (EINVAL); 3531 } 3532 } 3533 3534 /* 3535 * ========================================================================== 3536 * SPA async task processing 3537 * ========================================================================== 3538 */ 3539 3540 static void 3541 spa_async_remove(spa_t *spa, vdev_t *vd) 3542 { 3543 int c; 3544 3545 if (vd->vdev_remove_wanted) { 3546 vd->vdev_remove_wanted = 0; 3547 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3548 vdev_clear(spa, vd, B_TRUE); 3549 vdev_config_dirty(vd->vdev_top); 3550 } 3551 3552 for (c = 0; c < vd->vdev_children; c++) 3553 spa_async_remove(spa, vd->vdev_child[c]); 3554 } 3555 3556 static void 3557 spa_async_thread(spa_t *spa) 3558 { 3559 int tasks, i; 3560 uint64_t txg; 3561 3562 ASSERT(spa->spa_sync_on); 3563 3564 mutex_enter(&spa->spa_async_lock); 3565 tasks = spa->spa_async_tasks; 3566 spa->spa_async_tasks = 0; 3567 mutex_exit(&spa->spa_async_lock); 3568 3569 /* 3570 * See if the config needs to be updated. 3571 */ 3572 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3573 mutex_enter(&spa_namespace_lock); 3574 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3575 mutex_exit(&spa_namespace_lock); 3576 } 3577 3578 /* 3579 * See if any devices need to be marked REMOVED. 3580 * 3581 * XXX - We avoid doing this when we are in 3582 * I/O failure state since spa_vdev_enter() grabs 3583 * the namespace lock and would not be able to obtain 3584 * the writer config lock. 3585 */ 3586 if (tasks & SPA_ASYNC_REMOVE && 3587 spa_state(spa) != POOL_STATE_IO_FAILURE) { 3588 txg = spa_vdev_enter(spa); 3589 spa_async_remove(spa, spa->spa_root_vdev); 3590 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 3591 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3592 for (i = 0; i < spa->spa_spares.sav_count; i++) 3593 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3594 (void) spa_vdev_exit(spa, NULL, txg, 0); 3595 } 3596 3597 /* 3598 * If any devices are done replacing, detach them. 3599 */ 3600 if (tasks & SPA_ASYNC_RESILVER_DONE) 3601 spa_vdev_resilver_done(spa); 3602 3603 /* 3604 * Kick off a resilver. 3605 */ 3606 if (tasks & SPA_ASYNC_RESILVER) 3607 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3608 3609 /* 3610 * Let the world know that we're done. 
3611 */ 3612 mutex_enter(&spa->spa_async_lock); 3613 spa->spa_async_thread = NULL; 3614 cv_broadcast(&spa->spa_async_cv); 3615 mutex_exit(&spa->spa_async_lock); 3616 thread_exit(); 3617 } 3618 3619 void 3620 spa_async_suspend(spa_t *spa) 3621 { 3622 mutex_enter(&spa->spa_async_lock); 3623 spa->spa_async_suspended++; 3624 while (spa->spa_async_thread != NULL) 3625 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3626 mutex_exit(&spa->spa_async_lock); 3627 } 3628 3629 void 3630 spa_async_resume(spa_t *spa) 3631 { 3632 mutex_enter(&spa->spa_async_lock); 3633 ASSERT(spa->spa_async_suspended != 0); 3634 spa->spa_async_suspended--; 3635 mutex_exit(&spa->spa_async_lock); 3636 } 3637 3638 static void 3639 spa_async_dispatch(spa_t *spa) 3640 { 3641 mutex_enter(&spa->spa_async_lock); 3642 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3643 spa->spa_async_thread == NULL && 3644 rootdir != NULL && !vn_is_readonly(rootdir)) 3645 spa->spa_async_thread = thread_create(NULL, 0, 3646 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3647 mutex_exit(&spa->spa_async_lock); 3648 } 3649 3650 void 3651 spa_async_request(spa_t *spa, int task) 3652 { 3653 mutex_enter(&spa->spa_async_lock); 3654 spa->spa_async_tasks |= task; 3655 mutex_exit(&spa->spa_async_lock); 3656 } 3657 3658 /* 3659 * ========================================================================== 3660 * SPA syncing routines 3661 * ========================================================================== 3662 */ 3663 3664 static void 3665 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3666 { 3667 bplist_t *bpl = &spa->spa_sync_bplist; 3668 dmu_tx_t *tx; 3669 blkptr_t blk; 3670 uint64_t itor = 0; 3671 zio_t *zio; 3672 int error; 3673 uint8_t c = 1; 3674 3675 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3676 3677 while (bplist_iterate(bpl, &itor, &blk) == 0) 3678 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3679 3680 error = zio_wait(zio); 3681 ASSERT3U(error, ==, 0); 3682 3683 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3684 bplist_vacate(bpl, tx); 3685 3686 /* 3687 * Pre-dirty the first block so we sync to convergence faster. 3688 * (Usually only the first block is needed.) 3689 */ 3690 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3691 dmu_tx_commit(tx); 3692 } 3693 3694 static void 3695 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3696 { 3697 char *packed = NULL; 3698 size_t bufsize; 3699 size_t nvsize = 0; 3700 dmu_buf_t *db; 3701 3702 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3703 3704 /* 3705 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3706 * information. This avoids the dbuf_will_dirty() path and 3707 * saves us a pre-read to get data we don't actually care about. 
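 *
 * Worked example (assuming the usual 16K SPA_CONFIG_BLOCKSIZE): a
 * 20,000-byte packed nvlist is placed in a 32,768-byte buffer, the
 * tail is zeroed, and the whole buffer goes out in one dmu_write();
 * the bonus buffer then records the true nvsize (20,000) so readers
 * know how much of the object to unpack.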
3708 */ 3709 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3710 packed = kmem_alloc(bufsize, KM_SLEEP); 3711 3712 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3713 KM_SLEEP) == 0); 3714 bzero(packed + nvsize, bufsize - nvsize); 3715 3716 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3717 3718 kmem_free(packed, bufsize); 3719 3720 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3721 dmu_buf_will_dirty(db, tx); 3722 *(uint64_t *)db->db_data = nvsize; 3723 dmu_buf_rele(db, FTAG); 3724 } 3725 3726 static void 3727 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3728 const char *config, const char *entry) 3729 { 3730 nvlist_t *nvroot; 3731 nvlist_t **list; 3732 int i; 3733 3734 if (!sav->sav_sync) 3735 return; 3736 3737 /* 3738 * Update the MOS nvlist describing the list of available devices. 3739 * spa_validate_aux() will have already made sure this nvlist is 3740 * valid and the vdevs are labeled appropriately. 3741 */ 3742 if (sav->sav_object == 0) { 3743 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3744 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3745 sizeof (uint64_t), tx); 3746 VERIFY(zap_update(spa->spa_meta_objset, 3747 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3748 &sav->sav_object, tx) == 0); 3749 } 3750 3751 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3752 if (sav->sav_count == 0) { 3753 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3754 } else { 3755 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3756 for (i = 0; i < sav->sav_count; i++) 3757 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3758 B_FALSE, B_FALSE, B_TRUE); 3759 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3760 sav->sav_count) == 0); 3761 for (i = 0; i < sav->sav_count; i++) 3762 nvlist_free(list[i]); 3763 kmem_free(list, sav->sav_count * sizeof (void *)); 3764 } 3765 3766 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3767 nvlist_free(nvroot); 3768 3769 sav->sav_sync = B_FALSE; 3770 } 3771 3772 static void 3773 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3774 { 3775 nvlist_t *config; 3776 3777 if (list_is_empty(&spa->spa_dirty_list)) 3778 return; 3779 3780 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 3781 3782 if (spa->spa_config_syncing) 3783 nvlist_free(spa->spa_config_syncing); 3784 spa->spa_config_syncing = config; 3785 3786 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3787 } 3788 3789 /* 3790 * Set zpool properties. 3791 */ 3792 static void 3793 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3794 { 3795 spa_t *spa = arg1; 3796 objset_t *mos = spa->spa_meta_objset; 3797 nvlist_t *nvp = arg2; 3798 nvpair_t *elem; 3799 uint64_t intval; 3800 char *strval; 3801 zpool_prop_t prop; 3802 const char *propname; 3803 zprop_type_t proptype; 3804 spa_config_dirent_t *dp; 3805 3806 elem = NULL; 3807 while ((elem = nvlist_next_nvpair(nvp, elem))) { 3808 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 3809 case ZPOOL_PROP_VERSION: 3810 /* 3811 * Only set version for non-zpool-creation cases 3812 * (set/import). spa_create() needs special care 3813 * for version setting. 
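 *
 * (At creation time spa_create() has already written the requested
 * version directly into the uberblock, which is why the TXG_INITIAL
 * case is skipped here.)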
3814 */ 3815 if (tx->tx_txg != TXG_INITIAL) { 3816 VERIFY(nvpair_value_uint64(elem, 3817 &intval) == 0); 3818 ASSERT(intval <= SPA_VERSION); 3819 ASSERT(intval >= spa_version(spa)); 3820 spa->spa_uberblock.ub_version = intval; 3821 vdev_config_dirty(spa->spa_root_vdev); 3822 } 3823 break; 3824 3825 case ZPOOL_PROP_ALTROOT: 3826 /* 3827 * 'altroot' is a non-persistent property. It should 3828 * have been set temporarily at creation or import time. 3829 */ 3830 ASSERT(spa->spa_root != NULL); 3831 break; 3832 3833 case ZPOOL_PROP_CACHEFILE: 3834 /* 3835 * 'cachefile' is a non-persistent property, but note 3836 * an async request that the config cache needs to be 3837 * udpated. 3838 */ 3839 VERIFY(nvpair_value_string(elem, &strval) == 0); 3840 3841 dp = kmem_alloc(sizeof (spa_config_dirent_t), 3842 KM_SLEEP); 3843 3844 if (strval[0] == '\0') 3845 dp->scd_path = spa_strdup(spa_config_path); 3846 else if (strcmp(strval, "none") == 0) 3847 dp->scd_path = NULL; 3848 else 3849 dp->scd_path = spa_strdup(strval); 3850 3851 list_insert_head(&spa->spa_config_list, dp); 3852 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3853 break; 3854 default: 3855 /* 3856 * Set pool property values in the poolprops mos object. 3857 */ 3858 mutex_enter(&spa->spa_props_lock); 3859 if (spa->spa_pool_props_object == 0) { 3860 objset_t *mos = spa->spa_meta_objset; 3861 3862 VERIFY((spa->spa_pool_props_object = 3863 zap_create(mos, DMU_OT_POOL_PROPS, 3864 DMU_OT_NONE, 0, tx)) > 0); 3865 3866 VERIFY(zap_update(mos, 3867 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 3868 8, 1, &spa->spa_pool_props_object, tx) 3869 == 0); 3870 } 3871 mutex_exit(&spa->spa_props_lock); 3872 3873 /* normalize the property name */ 3874 propname = zpool_prop_to_name(prop); 3875 proptype = zpool_prop_get_type(prop); 3876 3877 if (nvpair_type(elem) == DATA_TYPE_STRING) { 3878 ASSERT(proptype == PROP_TYPE_STRING); 3879 VERIFY(nvpair_value_string(elem, &strval) == 0); 3880 VERIFY(zap_update(mos, 3881 spa->spa_pool_props_object, propname, 3882 1, strlen(strval) + 1, strval, tx) == 0); 3883 3884 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 3885 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 3886 3887 if (proptype == PROP_TYPE_INDEX) { 3888 const char *unused; 3889 VERIFY(zpool_prop_index_to_string( 3890 prop, intval, &unused) == 0); 3891 } 3892 VERIFY(zap_update(mos, 3893 spa->spa_pool_props_object, propname, 3894 8, 1, &intval, tx) == 0); 3895 } else { 3896 ASSERT(0); /* not allowed */ 3897 } 3898 3899 switch (prop) { 3900 case ZPOOL_PROP_DELEGATION: 3901 spa->spa_delegation = intval; 3902 break; 3903 case ZPOOL_PROP_BOOTFS: 3904 spa->spa_bootfs = intval; 3905 break; 3906 case ZPOOL_PROP_FAILUREMODE: 3907 spa->spa_failmode = intval; 3908 break; 3909 default: 3910 break; 3911 } 3912 } 3913 3914 /* log internal history if this is not a zpool create */ 3915 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 3916 tx->tx_txg != TXG_INITIAL) { 3917 spa_history_internal_log(LOG_POOL_PROPSET, 3918 spa, tx, cr, "%s %lld %s", 3919 nvpair_name(elem), intval, spa->spa_name); 3920 } 3921 } 3922 } 3923 3924 /* 3925 * Sync the specified transaction group. New blocks may be dirtied as 3926 * part of the process, so we iterate until it converges. 
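 *
 * Each pass of the do/while loop below re-syncs the config object,
 * the aux device lists, the error log and the DSL pool; anything that
 * gets dirtied along the way queues more vdevs on spa_vdev_txg_list,
 * so we repeat until a pass ends with no dirty vdevs (spa_sync_pass
 * counts the passes).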
3927 */ 3928 void 3929 spa_sync(spa_t *spa, uint64_t txg) 3930 { 3931 dsl_pool_t *dp = spa->spa_dsl_pool; 3932 objset_t *mos = spa->spa_meta_objset; 3933 bplist_t *bpl = &spa->spa_sync_bplist; 3934 vdev_t *rvd = spa->spa_root_vdev; 3935 vdev_t *vd; 3936 dmu_tx_t *tx; 3937 int dirty_vdevs; 3938 3939 /* 3940 * Lock out configuration changes. 3941 */ 3942 spa_config_enter(spa, RW_READER, FTAG); 3943 3944 spa->spa_syncing_txg = txg; 3945 spa->spa_sync_pass = 0; 3946 3947 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3948 3949 tx = dmu_tx_create_assigned(dp, txg); 3950 3951 /* 3952 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 3953 * set spa_deflate if we have no raid-z vdevs. 3954 */ 3955 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 3956 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 3957 int i; 3958 3959 for (i = 0; i < rvd->vdev_children; i++) { 3960 vd = rvd->vdev_child[i]; 3961 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 3962 break; 3963 } 3964 if (i == rvd->vdev_children) { 3965 spa->spa_deflate = TRUE; 3966 VERIFY(0 == zap_add(spa->spa_meta_objset, 3967 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3968 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 3969 } 3970 } 3971 3972 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 3973 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 3974 dsl_pool_create_origin(dp, tx); 3975 3976 /* Keeping the origin open increases spa_minref */ 3977 spa->spa_minref += 3; 3978 } 3979 3980 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 3981 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 3982 dsl_pool_upgrade_clones(dp, tx); 3983 } 3984 3985 /* 3986 * If anything has changed in this txg, push the deferred frees 3987 * from the previous txg. If not, leave them alone so that we 3988 * don't generate work on an otherwise idle system. 3989 */ 3990 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 3991 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 3992 !txg_list_empty(&dp->dp_sync_tasks, txg)) 3993 spa_sync_deferred_frees(spa, txg); 3994 3995 /* 3996 * Iterate to convergence. 3997 */ 3998 do { 3999 spa->spa_sync_pass++; 4000 4001 spa_sync_config_object(spa, tx); 4002 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 4003 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4004 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4005 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4006 spa_errlog_sync(spa, txg); 4007 dsl_pool_sync(dp, txg); 4008 4009 dirty_vdevs = 0; 4010 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 4011 vdev_sync(vd, txg); 4012 dirty_vdevs++; 4013 } 4014 4015 bplist_sync(bpl, tx); 4016 } while (dirty_vdevs); 4017 4018 bplist_close(bpl); 4019 4020 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 4021 4022 /* 4023 * Rewrite the vdev configuration (which includes the uberblock) 4024 * to commit the transaction group. 4025 * 4026 * If there are no dirty vdevs, we sync the uberblock to a few 4027 * random top-level vdevs that are known to be visible in the 4028 * config cache (see spa_vdev_add() for details). If there *are* 4029 * dirty vdevs -- or if the sync to our random subset fails -- 4030 * then sync the uberblock to all vdevs. 

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for details).  If there *are*
	 * dirty vdevs -- or if the sync to our random subset fails --
	 * then sync the uberblock to all vdevs.
	 */
	if (list_is_empty(&spa->spa_dirty_list)) {
		vdev_t *svd[SPA_DVAS_PER_BP];
		int svdcount = 0;
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
				continue;
			svd[svdcount++] = vd;
			if (svdcount == SPA_DVAS_PER_BP)
				break;
		}
		vdev_config_sync(svd, svdcount, txg);
	} else {
		vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_traverse_wanted = B_TRUE;
	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = B_FALSE;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
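
/*
 * Illustrative note: nothing outside the txg machinery calls spa_sync()
 * directly.  A thread that needs its changes on stable storage waits for the
 * sync thread instead, e.g.:
 *
 *	txg_wait_synced(spa_get_dsl(spa), 0);	wait for all pending txgs
 *	txg_wait_synced(spa_get_dsl(spa), txg);	wait for a specific txg
 *
 * which is exactly what spa_sync_allpools() above does for each active pool.
 */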

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (l2cache) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * pool with a future on-disk version would be unopenable, the
	 * current version should never exceed SPA_VERSION here.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}
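
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * "zpool upgrade" path is expected to validate the requested version before
 * handing it to spa_upgrade(), along these lines:
 *
 *	if ((error = spa_open(poolname, &spa, FTAG)) == 0) {
 *		if (newversion >= spa_version(spa) &&
 *		    newversion <= SPA_VERSION)
 *			spa_upgrade(spa, newversion);
 *		spa_close(spa, FTAG);
 *	}
 */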

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2: once as a spare and
 * once as the active replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
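
/*
 * Illustrative note: a typical (hypothetical) call passes one of the EC_ZFS
 * subclass names from sys/sysevent/eventdefs.h, with a NULL vdev when the
 * event applies to the pool as a whole:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 */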