/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>

#ifdef	_KERNEL
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/*	ISSUE	INTR				*/
	{	1,	1	},	/* ZIO_TYPE_NULL	*/
	{	1,	8	},	/* ZIO_TYPE_READ	*/
	{	8,	1	},	/* ZIO_TYPE_WRITE	*/
	{	1,	1	},	/* ZIO_TYPE_FREE	*/
	{	1,	1	},	/* ZIO_TYPE_CLAIM	*/
	{	1,	1	},	/* ZIO_TYPE_IOCTL	*/
};

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t used;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		size = spa_get_space(spa);
		used = spa_get_alloc(spa);
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
		    size - used, src);

		cap = (size == 0) ? 0 : (used * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_USER | DS_MODE_READONLY, &os))
					break;

				/* We don't support gzip bootable datasets */
				if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_close(os);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

/*
 * Record a new 'cachefile' setting for this pool in its config dirent list,
 * and (optionally) request an asynchronous config cache update.
 */
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

/*
 * Validate the given pool properties and, if any of them require on-disk
 * changes, sync them out via a DSL sync task.
 */
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa->spa_zio_taskq[t][q] = taskq_create("spa_zio",
			    zio_taskq_threads[t][q], maxclsyspri, 50,
			    INT_MAX, TASKQ_PREPOPULATE);
		}
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	mutex_enter(&spa->spa_async_root_lock);
	while (spa->spa_async_root_count != 0)
		cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock);
	mutex_exit(&spa->spa_async_root_lock);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid, size;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd)) {
				size = vdev_get_rsize(vd);
				l2arc_add_vdev(spa, vd,
				    VDEV_LABEL_START_SIZE,
				    size - VDEV_LABEL_START_SIZE);
			}
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

/*
 * Read a packed nvlist from the given MOS object and unpack it into 'value'.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;

	case SPA_LOG_CLEAR:
		(void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
		    DS_FIND_CHILDREN);
		break;
	}
	spa->spa_log_state = SPA_LOG_GOOD;
	return (0);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	uint64_t autoreplace = 0;
	int orig_mode = spa->spa_mode;
	char *ereport = FM_EREPORT_ZFS_POOL;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs. We need to grab the config
	 * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
	 */
	if (mosconfig) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = vdev_validate(rvd);
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (error != 0)
			goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

#ifdef	_KERNEL
			myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
			/*
			 * We're emulating the system's hostid in userland, so
			 * we can't use zone_get_hostid().
			 */
			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */
			if (hostid != 0 && myhostid != 0 &&
			    hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa_name(spa), hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa, orig_mode);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices. We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if (spa_writeable(spa)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			spa->spa_last_open_failed = B_FALSE;
		}
	}

	spa_open_ref(spa, tag);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare. If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	if (spa->spa_l2cache.sav_count == 0)
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		if (spa_suspended(spa))
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0);

		spa_add_spares(spa, *config);
		spa_add_l2cache(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the auxiliary device array is well formed. We must have an
 * array of nvlists, each of which describes a valid leaf vdev. If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context. For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

/*
 * Validate both the spare and l2cache auxiliary device lists.
 */
static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

/*
 * Set the auxiliary device list named by 'config', merging the given devices
 * with any list that already exists in 'sav'.
 */
static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa, spa_mode_global);

	spa->spa_uberblock.ub_txg = txg - 1;

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bplist object. Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(spa, props, CRED(), tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
2143 */ 2144 txg_wait_synced(spa->spa_dsl_pool, txg); 2145 2146 spa_config_sync(spa, B_FALSE, B_TRUE); 2147 2148 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2149 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2150 2151 spa->spa_minref = refcount_count(&spa->spa_refcount); 2152 2153 mutex_exit(&spa_namespace_lock); 2154 2155 return (0); 2156 } 2157 2158 /* 2159 * Import the given pool into the system. We set up the necessary spa_t and 2160 * then call spa_load() to do the dirty work. 2161 */ 2162 static int 2163 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, 2164 boolean_t isroot, boolean_t allowfaulted) 2165 { 2166 spa_t *spa; 2167 char *altroot = NULL; 2168 int error, loaderr; 2169 nvlist_t *nvroot; 2170 nvlist_t **spares, **l2cache; 2171 uint_t nspares, nl2cache; 2172 2173 /* 2174 * If a pool with this name exists, return failure. 2175 */ 2176 mutex_enter(&spa_namespace_lock); 2177 if ((spa = spa_lookup(pool)) != NULL) { 2178 if (isroot) { 2179 /* 2180 * Remove the existing root pool from the 2181 * namespace so that we can replace it with 2182 * the correct config we just read in. 2183 */ 2184 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 2185 spa_remove(spa); 2186 } else { 2187 mutex_exit(&spa_namespace_lock); 2188 return (EEXIST); 2189 } 2190 } 2191 2192 /* 2193 * Create and initialize the spa structure. 2194 */ 2195 (void) nvlist_lookup_string(props, 2196 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2197 spa = spa_add(pool, altroot); 2198 spa_activate(spa, spa_mode_global); 2199 2200 if (allowfaulted) 2201 spa->spa_import_faulted = B_TRUE; 2202 spa->spa_is_root = isroot; 2203 2204 /* 2205 * Pass off the heavy lifting to spa_load(). 2206 * Pass TRUE for mosconfig (unless this is a root pool) because 2207 * the user-supplied config is actually the one to trust when 2208 * doing an import. 2209 */ 2210 loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); 2211 2212 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2213 /* 2214 * Toss any existing sparelist, as it doesn't have any validity anymore, 2215 * and conflicts with spa_has_spare(). 2216 */ 2217 if (!isroot && spa->spa_spares.sav_config) { 2218 nvlist_free(spa->spa_spares.sav_config); 2219 spa->spa_spares.sav_config = NULL; 2220 spa_load_spares(spa); 2221 } 2222 if (!isroot && spa->spa_l2cache.sav_config) { 2223 nvlist_free(spa->spa_l2cache.sav_config); 2224 spa->spa_l2cache.sav_config = NULL; 2225 spa_load_l2cache(spa); 2226 } 2227 2228 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2229 &nvroot) == 0); 2230 if (error == 0) 2231 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); 2232 if (error == 0) 2233 error = spa_validate_aux(spa, nvroot, -1ULL, 2234 VDEV_ALLOC_L2CACHE); 2235 spa_config_exit(spa, SCL_ALL, FTAG); 2236 2237 if (props != NULL) 2238 spa_configfile_set(spa, props, B_FALSE); 2239 2240 if (error != 0 || (props && spa_writeable(spa) && 2241 (error = spa_prop_set(spa, props)))) { 2242 if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { 2243 /* 2244 * If we failed to load the pool, but 'allowfaulted' is 2245 * set, then manually set the config as if the config 2246 * passed in was specified in the cache file. 
2247 */ 2248 error = 0; 2249 spa->spa_import_faulted = B_FALSE; 2250 if (spa->spa_config == NULL) 2251 spa->spa_config = spa_config_generate(spa, 2252 NULL, -1ULL, B_TRUE); 2253 spa_unload(spa); 2254 spa_deactivate(spa); 2255 spa_config_sync(spa, B_FALSE, B_TRUE); 2256 } else { 2257 spa_unload(spa); 2258 spa_deactivate(spa); 2259 spa_remove(spa); 2260 } 2261 mutex_exit(&spa_namespace_lock); 2262 return (error); 2263 } 2264 2265 /* 2266 * Override any spares and level 2 cache devices as specified by 2267 * the user, as these may have correct device names/devids, etc. 2268 */ 2269 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2270 &spares, &nspares) == 0) { 2271 if (spa->spa_spares.sav_config) 2272 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2273 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2274 else 2275 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2276 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2277 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2278 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2279 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2280 spa_load_spares(spa); 2281 spa_config_exit(spa, SCL_ALL, FTAG); 2282 spa->spa_spares.sav_sync = B_TRUE; 2283 } 2284 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2285 &l2cache, &nl2cache) == 0) { 2286 if (spa->spa_l2cache.sav_config) 2287 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2288 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2289 else 2290 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2291 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2292 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2293 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2294 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2295 spa_load_l2cache(spa); 2296 spa_config_exit(spa, SCL_ALL, FTAG); 2297 spa->spa_l2cache.sav_sync = B_TRUE; 2298 } 2299 2300 if (spa_writeable(spa)) { 2301 /* 2302 * Update the config cache to include the newly-imported pool. 2303 */ 2304 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); 2305 } 2306 2307 spa->spa_import_faulted = B_FALSE; 2308 mutex_exit(&spa_namespace_lock); 2309 2310 return (0); 2311 } 2312 2313 #ifdef _KERNEL 2314 /* 2315 * Build a "root" vdev for a top level vdev read in from a rootpool 2316 * device label. 2317 */ 2318 static void 2319 spa_build_rootpool_config(nvlist_t *config) 2320 { 2321 nvlist_t *nvtop, *nvroot; 2322 uint64_t pgid; 2323 2324 /* 2325 * Add this top-level vdev to the child array. 2326 */ 2327 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2328 == 0); 2329 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2330 == 0); 2331 2332 /* 2333 * Put this pool's top-level vdevs into a root vdev. 2334 */ 2335 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2336 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2337 == 0); 2338 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2339 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2340 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2341 &nvtop, 1) == 0); 2342 2343 /* 2344 * Replace the existing vdev_tree with the new root vdev in 2345 * this pool's configuration (remove the old, add the new). 2346 */ 2347 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2348 nvlist_free(nvroot); 2349 } 2350 2351 /* 2352 * Get the root pool information from the root disk, then import the root pool 2353 * during the system boot up time. 
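 *
 * Informally, the intended boot-time flow through the routines below is
 * (a sketch of the call order, not a verbatim trace of the caller):
 *
 *	vdev_disk_read_rootlabel()   - read a label off the boot device
 *	spa_check_rootconf()         - pull out the config and its txg
 *	spa_get_rootconf()           - for a mirrored root, pick the side
 *	                               with the most recent txg
 *	spa_build_rootpool_config()  - wrap the top-level vdev in a
 *	                               "root" vdev
 *	spa_import_common()          - perform the actual import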
2354 */ 2355 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2356 2357 int 2358 spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2359 uint64_t *besttxg) 2360 { 2361 nvlist_t *config; 2362 uint64_t txg; 2363 int error; 2364 2365 if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) 2366 return (error); 2367 2368 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2369 2370 if (bestconf != NULL) 2371 *bestconf = config; 2372 else 2373 nvlist_free(config); 2374 *besttxg = txg; 2375 return (0); 2376 } 2377 2378 boolean_t 2379 spa_rootdev_validate(nvlist_t *nv) 2380 { 2381 uint64_t ival; 2382 2383 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2384 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2385 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2386 return (B_FALSE); 2387 2388 return (B_TRUE); 2389 } 2390 2391 2392 /* 2393 * Given the boot device's physical path or devid, check if the device 2394 * is in a valid state. If so, return the configuration from the vdev 2395 * label. 2396 */ 2397 int 2398 spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) 2399 { 2400 nvlist_t *conf = NULL; 2401 uint64_t txg = 0; 2402 nvlist_t *nvtop, **child; 2403 char *type; 2404 char *bootpath = NULL; 2405 uint_t children, c; 2406 char *tmp; 2407 int error; 2408 2409 if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) 2410 *tmp = '\0'; 2411 if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { 2412 cmn_err(CE_NOTE, "error reading device label"); 2413 return (error); 2414 } 2415 if (txg == 0) { 2416 cmn_err(CE_NOTE, "this device is detached"); 2417 nvlist_free(conf); 2418 return (EINVAL); 2419 } 2420 2421 VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, 2422 &nvtop) == 0); 2423 VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); 2424 2425 if (strcmp(type, VDEV_TYPE_DISK) == 0) { 2426 if (spa_rootdev_validate(nvtop)) { 2427 goto out; 2428 } else { 2429 nvlist_free(conf); 2430 return (EINVAL); 2431 } 2432 } 2433 2434 ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); 2435 2436 VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, 2437 &child, &children) == 0); 2438 2439 /* 2440 * Go thru vdevs in the mirror to see if the given device 2441 * has the most recent txg. Only the device with the most 2442 * recent txg has valid information and should be booted. 2443 */ 2444 for (c = 0; c < children; c++) { 2445 char *cdevid, *cpath; 2446 uint64_t tmptxg; 2447 2448 cpath = NULL; 2449 cdevid = NULL; 2450 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, 2451 &cpath) != 0 && nvlist_lookup_string(child[c], 2452 ZPOOL_CONFIG_DEVID, &cdevid) != 0) 2453 return (EINVAL); 2454 if ((spa_check_rootconf(cpath, cdevid, NULL, 2455 &tmptxg) == 0) && (tmptxg > txg)) { 2456 txg = tmptxg; 2457 VERIFY(nvlist_lookup_string(child[c], 2458 ZPOOL_CONFIG_PATH, &bootpath) == 0); 2459 } 2460 } 2461 2462 /* Does the best device match the one we've booted from? */ 2463 if (bootpath) { 2464 cmn_err(CE_NOTE, "try booting from '%s'", bootpath); 2465 return (EINVAL); 2466 } 2467 out: 2468 *bestconf = conf; 2469 return (0); 2470 } 2471 2472 /* 2473 * Import a root pool. 2474 * 2475 * For x86. devpath_list will consist of devid and/or physpath name of 2476 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2477 * The GRUB "findroot" command will return the vdev we should boot. 
2478 * 2479 * For SPARC, devpath_list consists of the physpath name of the booting device, 2480 * no matter whether the rootpool is a single device pool or a mirrored pool. 2481 * e.g. 2482 * "/pci@1f,0/ide@d/disk@0,0:a" 2483 */ 2484 int 2485 spa_import_rootpool(char *devpath, char *devid) 2486 { 2487 nvlist_t *conf = NULL; 2488 char *pname; 2489 int error; 2490 2491 /* 2492 * Get the vdev pathname and configuration from the most 2493 * recently updated vdev (highest txg). 2494 */ 2495 if (error = spa_get_rootconf(devpath, devid, &conf)) 2496 goto msg_out; 2497 2498 /* 2499 * Add type "root" vdev to the config. 2500 */ 2501 spa_build_rootpool_config(conf); 2502 2503 VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); 2504 2505 /* 2506 * We specify 'allowfaulted' for this to be treated like spa_open() 2507 * instead of spa_import(). This prevents us from marking vdevs as 2508 * persistently unavailable, and generates FMA ereports as if it were a 2509 * pool open, not import. 2510 */ 2511 error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); 2512 ASSERT(error != EEXIST); 2513 2514 nvlist_free(conf); 2515 return (error); 2516 2517 msg_out: 2518 cmn_err(CE_NOTE, "\n" 2519 " *************************************************** \n" 2520 " * This device is not bootable! * \n" 2521 " * It is either offlined or detached or faulted. * \n" 2522 " * Please try to boot from a different device. * \n" 2523 " *************************************************** "); 2524 2525 return (error); 2526 } 2527 #endif 2528 2529 /* 2530 * Import a non-root pool into the system. 2531 */ 2532 int 2533 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2534 { 2535 return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); 2536 } 2537 2538 int 2539 spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) 2540 { 2541 return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); 2542 } 2543 2544 2545 /* 2546 * This (illegal) pool name is used when temporarily importing a spa_t in order 2547 * to get the vdev stats associated with the imported devices. 2548 */ 2549 #define TRYIMPORT_NAME "$import" 2550 2551 nvlist_t * 2552 spa_tryimport(nvlist_t *tryconfig) 2553 { 2554 nvlist_t *config = NULL; 2555 char *poolname; 2556 spa_t *spa; 2557 uint64_t state; 2558 2559 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2560 return (NULL); 2561 2562 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2563 return (NULL); 2564 2565 /* 2566 * Create and initialize the spa structure. 2567 */ 2568 mutex_enter(&spa_namespace_lock); 2569 spa = spa_add(TRYIMPORT_NAME, NULL); 2570 spa_activate(spa, FREAD); 2571 2572 /* 2573 * Pass off the heavy lifting to spa_load(). 2574 * Pass TRUE for mosconfig because the user-supplied config 2575 * is actually the one to trust when doing an import. 2576 */ 2577 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2578 2579 /* 2580 * If 'tryconfig' was at least parsable, return the current config.
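 *
 * (As an aside: a hypothetical in-kernel consumer of spa_tryimport()
 * would treat the return value roughly as follows -- 'label_config'
 * here is a stand-in for a merged config assembled from vdev labels:
 *
 *	nvlist_t *out = spa_tryimport(label_config);
 *	if (out != NULL) {
 *		... inspect ZPOOL_CONFIG_POOL_NAME, POOL_STATE, etc. ...
 *		nvlist_free(out);
 *	}
 *
 * In practice the result is handed back to userland, where "zpool
 * import" uses the name, state, and timestamp added below.)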
2581 */ 2582 if (spa->spa_root_vdev != NULL) { 2583 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2584 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2585 poolname) == 0); 2586 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2587 state) == 0); 2588 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2589 spa->spa_uberblock.ub_timestamp) == 0); 2590 2591 /* 2592 * If the bootfs property exists on this pool then we 2593 * copy it out so that external consumers can tell which 2594 * pools are bootable. 2595 */ 2596 if (spa->spa_bootfs) { 2597 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2598 2599 /* 2600 * We have to play games with the name since the 2601 * pool was opened as TRYIMPORT_NAME. 2602 */ 2603 if (dsl_dsobj_to_dsname(spa_name(spa), 2604 spa->spa_bootfs, tmpname) == 0) { 2605 char *cp; 2606 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2607 2608 cp = strchr(tmpname, '/'); 2609 if (cp == NULL) { 2610 (void) strlcpy(dsname, tmpname, 2611 MAXPATHLEN); 2612 } else { 2613 (void) snprintf(dsname, MAXPATHLEN, 2614 "%s/%s", poolname, ++cp); 2615 } 2616 VERIFY(nvlist_add_string(config, 2617 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2618 kmem_free(dsname, MAXPATHLEN); 2619 } 2620 kmem_free(tmpname, MAXPATHLEN); 2621 } 2622 2623 /* 2624 * Add the list of hot spares and level 2 cache devices. 2625 */ 2626 spa_add_spares(spa, config); 2627 spa_add_l2cache(spa, config); 2628 } 2629 2630 spa_unload(spa); 2631 spa_deactivate(spa); 2632 spa_remove(spa); 2633 mutex_exit(&spa_namespace_lock); 2634 2635 return (config); 2636 } 2637 2638 /* 2639 * Pool export/destroy 2640 * 2641 * The act of destroying or exporting a pool is very simple. We make sure there 2642 * is no more pending I/O and any references to the pool are gone. Then, we 2643 * update the pool state and sync all the labels to disk, removing the 2644 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2645 * we don't sync the labels or remove the configuration cache. 2646 */ 2647 static int 2648 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2649 boolean_t force, boolean_t hardforce) 2650 { 2651 spa_t *spa; 2652 2653 if (oldconfig) 2654 *oldconfig = NULL; 2655 2656 if (!(spa_mode_global & FWRITE)) 2657 return (EROFS); 2658 2659 mutex_enter(&spa_namespace_lock); 2660 if ((spa = spa_lookup(pool)) == NULL) { 2661 mutex_exit(&spa_namespace_lock); 2662 return (ENOENT); 2663 } 2664 2665 /* 2666 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2667 * reacquire the namespace lock, and see if we can export. 2668 */ 2669 spa_open_ref(spa, FTAG); 2670 mutex_exit(&spa_namespace_lock); 2671 spa_async_suspend(spa); 2672 mutex_enter(&spa_namespace_lock); 2673 spa_close(spa, FTAG); 2674 2675 /* 2676 * The pool will be in core if it's openable, 2677 * in which case we can modify its state. 2678 */ 2679 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2680 /* 2681 * Objsets may be open only because they're dirty, so we 2682 * have to force it to sync before checking spa_refcnt. 2683 */ 2684 txg_wait_synced(spa->spa_dsl_pool, 0); 2685 2686 /* 2687 * A pool cannot be exported or destroyed if there are active 2688 * references. If we are resetting a pool, allow references by 2689 * fault injection handlers. 
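 *
 * (Here "zero" is relative: the expectation is that spa_refcount_zero()
 * treats the spa_minref holds taken when the pool was created or loaded
 * as the baseline, so a pool carrying only those implicit references
 * can still be exported.  That reading of spa_refcount_zero() is an
 * assumption about its implementation, but it matches how spa_minref is
 * recorded elsewhere in this file.)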
2690 */ 2691 if (!spa_refcount_zero(spa) || 2692 (spa->spa_inject_ref != 0 && 2693 new_state != POOL_STATE_UNINITIALIZED)) { 2694 spa_async_resume(spa); 2695 mutex_exit(&spa_namespace_lock); 2696 return (EBUSY); 2697 } 2698 2699 /* 2700 * A pool cannot be exported if it has an active shared spare. 2701 * This is to prevent other pools stealing the active spare 2702 * from an exported pool. At user's own will, such pool can 2703 * be forcedly exported. 2704 */ 2705 if (!force && new_state == POOL_STATE_EXPORTED && 2706 spa_has_active_shared_spare(spa)) { 2707 spa_async_resume(spa); 2708 mutex_exit(&spa_namespace_lock); 2709 return (EXDEV); 2710 } 2711 2712 /* 2713 * We want this to be reflected on every label, 2714 * so mark them all dirty. spa_unload() will do the 2715 * final sync that pushes these changes out. 2716 */ 2717 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 2718 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2719 spa->spa_state = new_state; 2720 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2721 vdev_config_dirty(spa->spa_root_vdev); 2722 spa_config_exit(spa, SCL_ALL, FTAG); 2723 } 2724 } 2725 2726 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2727 2728 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2729 spa_unload(spa); 2730 spa_deactivate(spa); 2731 } 2732 2733 if (oldconfig && spa->spa_config) 2734 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2735 2736 if (new_state != POOL_STATE_UNINITIALIZED) { 2737 if (!hardforce) 2738 spa_config_sync(spa, B_TRUE, B_TRUE); 2739 spa_remove(spa); 2740 } 2741 mutex_exit(&spa_namespace_lock); 2742 2743 return (0); 2744 } 2745 2746 /* 2747 * Destroy a storage pool. 2748 */ 2749 int 2750 spa_destroy(char *pool) 2751 { 2752 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 2753 B_FALSE, B_FALSE)); 2754 } 2755 2756 /* 2757 * Export a storage pool. 2758 */ 2759 int 2760 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 2761 boolean_t hardforce) 2762 { 2763 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 2764 force, hardforce)); 2765 } 2766 2767 /* 2768 * Similar to spa_export(), this unloads the spa_t without actually removing it 2769 * from the namespace in any way. 2770 */ 2771 int 2772 spa_reset(char *pool) 2773 { 2774 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2775 B_FALSE, B_FALSE)); 2776 } 2777 2778 /* 2779 * ========================================================================== 2780 * Device manipulation 2781 * ========================================================================== 2782 */ 2783 2784 /* 2785 * Add a device to a storage pool. 
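 *
 * The nvroot argument has the same shape as the one given to
 * spa_create().  As a purely hypothetical sketch (made-up device paths,
 * attribute names abbreviated), adding a two-way mirror means passing a
 * "root" nvlist with a single mirror child:
 *
 *	root
 *	  +-- mirror
 *	        +-- disk  /dev/dsk/c2t0d0s0
 *	        +-- disk  /dev/dsk/c3t0d0s0
 *
 * spa_config_parse() below converts that tree into vdev_t's hanging off
 * 'vd'; hot spares and l2cache devices ride along in their own arrays.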
2786 */ 2787 int 2788 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2789 { 2790 uint64_t txg; 2791 int error; 2792 vdev_t *rvd = spa->spa_root_vdev; 2793 vdev_t *vd, *tvd; 2794 nvlist_t **spares, **l2cache; 2795 uint_t nspares, nl2cache; 2796 2797 txg = spa_vdev_enter(spa); 2798 2799 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2800 VDEV_ALLOC_ADD)) != 0) 2801 return (spa_vdev_exit(spa, NULL, txg, error)); 2802 2803 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2804 2805 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2806 &nspares) != 0) 2807 nspares = 0; 2808 2809 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2810 &nl2cache) != 0) 2811 nl2cache = 0; 2812 2813 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2814 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2815 2816 if (vd->vdev_children != 0 && 2817 (error = vdev_create(vd, txg, B_FALSE)) != 0) 2818 return (spa_vdev_exit(spa, vd, txg, error)); 2819 2820 /* 2821 * We must validate the spares and l2cache devices after checking the 2822 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2823 */ 2824 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2825 return (spa_vdev_exit(spa, vd, txg, error)); 2826 2827 /* 2828 * Transfer each new top-level vdev from vd to rvd. 2829 */ 2830 for (int c = 0; c < vd->vdev_children; c++) { 2831 tvd = vd->vdev_child[c]; 2832 vdev_remove_child(vd, tvd); 2833 tvd->vdev_id = rvd->vdev_children; 2834 vdev_add_child(rvd, tvd); 2835 vdev_config_dirty(tvd); 2836 } 2837 2838 if (nspares != 0) { 2839 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2840 ZPOOL_CONFIG_SPARES); 2841 spa_load_spares(spa); 2842 spa->spa_spares.sav_sync = B_TRUE; 2843 } 2844 2845 if (nl2cache != 0) { 2846 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2847 ZPOOL_CONFIG_L2CACHE); 2848 spa_load_l2cache(spa); 2849 spa->spa_l2cache.sav_sync = B_TRUE; 2850 } 2851 2852 /* 2853 * We have to be careful when adding new vdevs to an existing pool. 2854 * If other threads start allocating from these vdevs before we 2855 * sync the config cache, and we lose power, then upon reboot we may 2856 * fail to open the pool because there are DVAs that the config cache 2857 * can't translate. Therefore, we first add the vdevs without 2858 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2859 * and then let spa_config_update() initialize the new metaslabs. 2860 * 2861 * spa_load() checks for added-but-not-initialized vdevs, so that 2862 * if we lose power at any point in this sequence, the remaining 2863 * steps will be completed the next time we load the pool. 2864 */ 2865 (void) spa_vdev_exit(spa, vd, txg, 0); 2866 2867 mutex_enter(&spa_namespace_lock); 2868 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2869 mutex_exit(&spa_namespace_lock); 2870 2871 return (0); 2872 } 2873 2874 /* 2875 * Attach a device to a mirror. The arguments are the path to any device 2876 * in the mirror, and the nvroot for the new device. If the path specifies 2877 * a device that is not mirrored, we automatically insert the mirror vdev. 
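 *
 * For example (a sketch of the semantics, not a transcript): attaching
 * device B to a pool whose top-level vdev is the plain disk A turns
 * that vdev into mirror(A, B); attaching C to an existing mirror(A, B)
 * simply widens it to mirror(A, B, C).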
2878 * 2879 * If 'replacing' is specified, the new device is intended to replace the 2880 * existing device; in this case the two devices are made into their own 2881 * mirror using the 'replacing' vdev, which is functionally identical to 2882 * the mirror vdev (it actually reuses all the same ops) but has a few 2883 * extra rules: you can't attach to it after it's been created, and upon 2884 * completion of resilvering, the first disk (the one being replaced) 2885 * is automatically detached. 2886 */ 2887 int 2888 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2889 { 2890 uint64_t txg, open_txg; 2891 vdev_t *rvd = spa->spa_root_vdev; 2892 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2893 vdev_ops_t *pvops; 2894 dmu_tx_t *tx; 2895 char *oldvdpath, *newvdpath; 2896 int newvd_isspare; 2897 int error; 2898 2899 txg = spa_vdev_enter(spa); 2900 2901 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2902 2903 if (oldvd == NULL) 2904 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2905 2906 if (!oldvd->vdev_ops->vdev_op_leaf) 2907 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2908 2909 pvd = oldvd->vdev_parent; 2910 2911 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2912 VDEV_ALLOC_ADD)) != 0) 2913 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2914 2915 if (newrootvd->vdev_children != 1) 2916 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2917 2918 newvd = newrootvd->vdev_child[0]; 2919 2920 if (!newvd->vdev_ops->vdev_op_leaf) 2921 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2922 2923 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2924 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2925 2926 /* 2927 * Spares can't replace logs 2928 */ 2929 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 2930 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2931 2932 if (!replacing) { 2933 /* 2934 * For attach, the only allowable parent is a mirror or the root 2935 * vdev. 2936 */ 2937 if (pvd->vdev_ops != &vdev_mirror_ops && 2938 pvd->vdev_ops != &vdev_root_ops) 2939 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2940 2941 pvops = &vdev_mirror_ops; 2942 } else { 2943 /* 2944 * Active hot spares can only be replaced by inactive hot 2945 * spares. 2946 */ 2947 if (pvd->vdev_ops == &vdev_spare_ops && 2948 pvd->vdev_child[1] == oldvd && 2949 !spa_has_spare(spa, newvd->vdev_guid)) 2950 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2951 2952 /* 2953 * If the source is a hot spare, and the parent isn't already a 2954 * spare, then we want to create a new hot spare. Otherwise, we 2955 * want to create a replacing vdev. The user is not allowed to 2956 * attach to a spared vdev child unless the 'isspare' state is 2957 * the same (spare replaces spare, non-spare replaces 2958 * non-spare). 2959 */ 2960 if (pvd->vdev_ops == &vdev_replacing_ops) 2961 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2962 else if (pvd->vdev_ops == &vdev_spare_ops && 2963 newvd->vdev_isspare != oldvd->vdev_isspare) 2964 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2965 else if (pvd->vdev_ops != &vdev_spare_ops && 2966 newvd->vdev_isspare) 2967 pvops = &vdev_spare_ops; 2968 else 2969 pvops = &vdev_replacing_ops; 2970 } 2971 2972 /* 2973 * Compare the new device size with the replaceable/attachable 2974 * device size. 
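 *
 * Informally: a 72GB device may be attached in place of a 36GB device,
 * but not the reverse; the EOVERFLOW return below is how this path
 * reports a replacement device that is too small.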
2975 */ 2976 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2977 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2978 2979 /* 2980 * The new device cannot have a higher alignment requirement 2981 * than the top-level vdev. 2982 */ 2983 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2984 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2985 2986 /* 2987 * If this is an in-place replacement, update oldvd's path and devid 2988 * to make it distinguishable from newvd, and unopenable from now on. 2989 */ 2990 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2991 spa_strfree(oldvd->vdev_path); 2992 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2993 KM_SLEEP); 2994 (void) sprintf(oldvd->vdev_path, "%s/%s", 2995 newvd->vdev_path, "old"); 2996 if (oldvd->vdev_devid != NULL) { 2997 spa_strfree(oldvd->vdev_devid); 2998 oldvd->vdev_devid = NULL; 2999 } 3000 } 3001 3002 /* 3003 * If the parent is not a mirror, or if we're replacing, insert the new 3004 * mirror/replacing/spare vdev above oldvd. 3005 */ 3006 if (pvd->vdev_ops != pvops) 3007 pvd = vdev_add_parent(oldvd, pvops); 3008 3009 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3010 ASSERT(pvd->vdev_ops == pvops); 3011 ASSERT(oldvd->vdev_parent == pvd); 3012 3013 /* 3014 * Extract the new device from its root and add it to pvd. 3015 */ 3016 vdev_remove_child(newrootvd, newvd); 3017 newvd->vdev_id = pvd->vdev_children; 3018 vdev_add_child(pvd, newvd); 3019 3020 /* 3021 * If newvd is smaller than oldvd, but larger than its rsize, 3022 * the addition of newvd may have decreased our parent's asize. 3023 */ 3024 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 3025 3026 tvd = newvd->vdev_top; 3027 ASSERT(pvd->vdev_top == tvd); 3028 ASSERT(tvd->vdev_parent == rvd); 3029 3030 vdev_config_dirty(tvd); 3031 3032 /* 3033 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3034 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3035 */ 3036 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3037 3038 vdev_dtl_dirty(newvd, DTL_MISSING, 3039 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3040 3041 if (newvd->vdev_isspare) 3042 spa_spare_activate(newvd); 3043 oldvdpath = spa_strdup(oldvd->vdev_path); 3044 newvdpath = spa_strdup(newvd->vdev_path); 3045 newvd_isspare = newvd->vdev_isspare; 3046 3047 /* 3048 * Mark newvd's DTL dirty in this txg. 3049 */ 3050 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3051 3052 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3053 3054 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3055 if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 3056 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 3057 CRED(), "%s vdev=%s %s vdev=%s", 3058 replacing && newvd_isspare ? "spare in" : 3059 replacing ? "replace" : "attach", newvdpath, 3060 replacing ? "for" : "to", oldvdpath); 3061 dmu_tx_commit(tx); 3062 } else { 3063 dmu_tx_abort(tx); 3064 } 3065 3066 spa_strfree(oldvdpath); 3067 spa_strfree(newvdpath); 3068 3069 /* 3070 * Kick off a resilver to update newvd. 3071 */ 3072 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3073 3074 return (0); 3075 } 3076 3077 /* 3078 * Detach a device from a mirror or replacing vdev. 3079 * If 'replace_done' is specified, only detach if the parent 3080 * is a replacing vdev. 
3081 */ 3082 int 3083 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3084 { 3085 uint64_t txg; 3086 int error; 3087 vdev_t *rvd = spa->spa_root_vdev; 3088 vdev_t *vd, *pvd, *cvd, *tvd; 3089 boolean_t unspare = B_FALSE; 3090 uint64_t unspare_guid; 3091 size_t len; 3092 3093 txg = spa_vdev_enter(spa); 3094 3095 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3096 3097 if (vd == NULL) 3098 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3099 3100 if (!vd->vdev_ops->vdev_op_leaf) 3101 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3102 3103 pvd = vd->vdev_parent; 3104 3105 /* 3106 * If the parent/child relationship is not as expected, don't do it. 3107 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3108 * vdev that's replacing B with C. The user's intent in replacing 3109 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3110 * the replace by detaching C, the expected behavior is to end up 3111 * M(A,B). But suppose that right after deciding to detach C, 3112 * the replacement of B completes. We would have M(A,C), and then 3113 * ask to detach C, which would leave us with just A -- not what 3114 * the user wanted. To prevent this, we make sure that the 3115 * parent/child relationship hasn't changed -- in this example, 3116 * that C's parent is still the replacing vdev R. 3117 */ 3118 if (pvd->vdev_guid != pguid && pguid != 0) 3119 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3120 3121 /* 3122 * If replace_done is specified, only remove this device if it's 3123 * the first child of a replacing vdev. For the 'spare' vdev, either 3124 * disk can be removed. 3125 */ 3126 if (replace_done) { 3127 if (pvd->vdev_ops == &vdev_replacing_ops) { 3128 if (vd->vdev_id != 0) 3129 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3130 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3131 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3132 } 3133 } 3134 3135 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3136 spa_version(spa) >= SPA_VERSION_SPARES); 3137 3138 /* 3139 * Only mirror, replacing, and spare vdevs support detach. 3140 */ 3141 if (pvd->vdev_ops != &vdev_replacing_ops && 3142 pvd->vdev_ops != &vdev_mirror_ops && 3143 pvd->vdev_ops != &vdev_spare_ops) 3144 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3145 3146 /* 3147 * If this device has the only valid copy of some data, 3148 * we cannot safely detach it. 3149 */ 3150 if (vdev_dtl_required(vd)) 3151 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3152 3153 ASSERT(pvd->vdev_children >= 2); 3154 3155 /* 3156 * If we are detaching the second disk from a replacing vdev, then 3157 * check to see if we changed the original vdev's path to have "/old" 3158 * at the end in spa_vdev_attach(). If so, undo that change now. 3159 */ 3160 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3161 pvd->vdev_child[0]->vdev_path != NULL && 3162 pvd->vdev_child[1]->vdev_path != NULL) { 3163 ASSERT(pvd->vdev_child[1] == vd); 3164 cvd = pvd->vdev_child[0]; 3165 len = strlen(vd->vdev_path); 3166 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3167 strcmp(cvd->vdev_path + len, "/old") == 0) { 3168 spa_strfree(cvd->vdev_path); 3169 cvd->vdev_path = spa_strdup(vd->vdev_path); 3170 } 3171 } 3172 3173 /* 3174 * If we are detaching the original disk from a spare, then it implies 3175 * that the spare should become a real disk, and be removed from the 3176 * active spare list for the pool. 
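 *
 * For instance, if disk A faulted and spare S was brought in, the tree
 * looks like spare(A, S).  Detaching A (vd->vdev_id == 0) leaves S as
 * the permanent replacement, so S must also come off the pool's spare
 * list; the 'unspare' flag set below is what arranges that.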
3177 */ 3178 if (pvd->vdev_ops == &vdev_spare_ops && 3179 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3180 unspare = B_TRUE; 3181 3182 /* 3183 * Erase the disk labels so the disk can be used for other things. 3184 * This must be done after all other error cases are handled, 3185 * but before we disembowel vd (so we can still do I/O to it). 3186 * But if we can't do it, don't treat the error as fatal -- 3187 * it may be that the unwritability of the disk is the reason 3188 * it's being detached! 3189 */ 3190 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3191 3192 /* 3193 * Remove vd from its parent and compact the parent's children. 3194 */ 3195 vdev_remove_child(pvd, vd); 3196 vdev_compact_children(pvd); 3197 3198 /* 3199 * Remember one of the remaining children so we can get tvd below. 3200 */ 3201 cvd = pvd->vdev_child[0]; 3202 3203 /* 3204 * If we need to remove the remaining child from the list of hot spares, 3205 * do it now, marking the vdev as no longer a spare in the process. 3206 * We must do this before vdev_remove_parent(), because that can 3207 * change the GUID if it creates a new toplevel GUID. For a similar 3208 * reason, we must remove the spare now, in the same txg as the detach; 3209 * otherwise someone could attach a new sibling, change the GUID, and 3210 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3211 */ 3212 if (unspare) { 3213 ASSERT(cvd->vdev_isspare); 3214 spa_spare_remove(cvd); 3215 unspare_guid = cvd->vdev_guid; 3216 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3217 } 3218 3219 /* 3220 * If the parent mirror/replacing vdev only has one child, 3221 * the parent is no longer needed. Remove it from the tree. 3222 */ 3223 if (pvd->vdev_children == 1) 3224 vdev_remove_parent(cvd); 3225 3226 /* 3227 * We don't set tvd until now because the parent we just removed 3228 * may have been the previous top-level vdev. 3229 */ 3230 tvd = cvd->vdev_top; 3231 ASSERT(tvd->vdev_parent == rvd); 3232 3233 /* 3234 * Reevaluate the parent vdev state. 3235 */ 3236 vdev_propagate_state(cvd); 3237 3238 /* 3239 * If the device we just detached was smaller than the others, it may be 3240 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3241 * can't fail because the existing metaslabs are already in core, so 3242 * there's nothing to read from disk. 3243 */ 3244 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3245 3246 vdev_config_dirty(tvd); 3247 3248 /* 3249 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3250 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3251 * But first make sure we're not on any *other* txg's DTL list, to 3252 * prevent vd from being accessed after it's freed. 3253 */ 3254 for (int t = 0; t < TXG_SIZE; t++) 3255 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3256 vd->vdev_detached = B_TRUE; 3257 vdev_dirty(tvd, VDD_DTL, vd, txg); 3258 3259 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3260 3261 error = spa_vdev_exit(spa, vd, txg, 0); 3262 3263 /* 3264 * If this was the removal of the original device in a hot spare vdev, 3265 * then we want to go through and remove the device from the hot spare 3266 * list of every other pool. 
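 *
 * (A single physical spare may be listed in the spare configuration of
 * several pools at once, but only one pool can have it active.  That is
 * why the cleanup below walks every pool in the namespace with
 * spa_next() rather than touching just this one.)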
3267 */ 3268 if (unspare) { 3269 spa_t *myspa = spa; 3270 spa = NULL; 3271 mutex_enter(&spa_namespace_lock); 3272 while ((spa = spa_next(spa)) != NULL) { 3273 if (spa->spa_state != POOL_STATE_ACTIVE) 3274 continue; 3275 if (spa == myspa) 3276 continue; 3277 spa_open_ref(spa, FTAG); 3278 mutex_exit(&spa_namespace_lock); 3279 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3280 mutex_enter(&spa_namespace_lock); 3281 spa_close(spa, FTAG); 3282 } 3283 mutex_exit(&spa_namespace_lock); 3284 } 3285 3286 return (error); 3287 } 3288 3289 static nvlist_t * 3290 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3291 { 3292 for (int i = 0; i < count; i++) { 3293 uint64_t guid; 3294 3295 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3296 &guid) == 0); 3297 3298 if (guid == target_guid) 3299 return (nvpp[i]); 3300 } 3301 3302 return (NULL); 3303 } 3304 3305 static void 3306 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3307 nvlist_t *dev_to_remove) 3308 { 3309 nvlist_t **newdev = NULL; 3310 3311 if (count > 1) 3312 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3313 3314 for (int i = 0, j = 0; i < count; i++) { 3315 if (dev[i] == dev_to_remove) 3316 continue; 3317 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3318 } 3319 3320 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3321 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3322 3323 for (int i = 0; i < count - 1; i++) 3324 nvlist_free(newdev[i]); 3325 3326 if (count > 1) 3327 kmem_free(newdev, (count - 1) * sizeof (void *)); 3328 } 3329 3330 /* 3331 * Remove a device from the pool. Currently, this supports removing only hot 3332 * spares and level 2 ARC devices. 3333 */ 3334 int 3335 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3336 { 3337 vdev_t *vd; 3338 nvlist_t **spares, **l2cache, *nv; 3339 uint_t nspares, nl2cache; 3340 uint64_t txg = 0; 3341 int error = 0; 3342 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3343 3344 if (!locked) 3345 txg = spa_vdev_enter(spa); 3346 3347 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3348 3349 if (spa->spa_spares.sav_vdevs != NULL && 3350 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3351 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3352 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3353 /* 3354 * Only remove the hot spare if it's not currently in use 3355 * in this pool. 3356 */ 3357 if (vd == NULL || unspare) { 3358 spa_vdev_remove_aux(spa->spa_spares.sav_config, 3359 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3360 spa_load_spares(spa); 3361 spa->spa_spares.sav_sync = B_TRUE; 3362 } else { 3363 error = EBUSY; 3364 } 3365 } else if (spa->spa_l2cache.sav_vdevs != NULL && 3366 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3367 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3368 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3369 /* 3370 * Cache devices can always be removed. 3371 */ 3372 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3373 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3374 spa_load_l2cache(spa); 3375 spa->spa_l2cache.sav_sync = B_TRUE; 3376 } else if (vd != NULL) { 3377 /* 3378 * Normal vdevs cannot be removed (yet). 3379 */ 3380 error = ENOTSUP; 3381 } else { 3382 /* 3383 * There is no vdev of any kind with the specified guid. 
3384 */ 3385 error = ENOENT; 3386 } 3387 3388 if (!locked) 3389 return (spa_vdev_exit(spa, NULL, txg, error)); 3390 3391 return (error); 3392 } 3393 3394 /* 3395 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3396 * current spared, so we can detach it. 3397 */ 3398 static vdev_t * 3399 spa_vdev_resilver_done_hunt(vdev_t *vd) 3400 { 3401 vdev_t *newvd, *oldvd; 3402 int c; 3403 3404 for (c = 0; c < vd->vdev_children; c++) { 3405 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3406 if (oldvd != NULL) 3407 return (oldvd); 3408 } 3409 3410 /* 3411 * Check for a completed replacement. 3412 */ 3413 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3414 oldvd = vd->vdev_child[0]; 3415 newvd = vd->vdev_child[1]; 3416 3417 if (vdev_dtl_empty(newvd, DTL_MISSING) && 3418 !vdev_dtl_required(oldvd)) 3419 return (oldvd); 3420 } 3421 3422 /* 3423 * Check for a completed resilver with the 'unspare' flag set. 3424 */ 3425 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3426 newvd = vd->vdev_child[0]; 3427 oldvd = vd->vdev_child[1]; 3428 3429 if (newvd->vdev_unspare && 3430 vdev_dtl_empty(newvd, DTL_MISSING) && 3431 !vdev_dtl_required(oldvd)) { 3432 newvd->vdev_unspare = 0; 3433 return (oldvd); 3434 } 3435 } 3436 3437 return (NULL); 3438 } 3439 3440 static void 3441 spa_vdev_resilver_done(spa_t *spa) 3442 { 3443 vdev_t *vd, *pvd, *ppvd; 3444 uint64_t guid, sguid, pguid, ppguid; 3445 3446 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3447 3448 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3449 pvd = vd->vdev_parent; 3450 ppvd = pvd->vdev_parent; 3451 guid = vd->vdev_guid; 3452 pguid = pvd->vdev_guid; 3453 ppguid = ppvd->vdev_guid; 3454 sguid = 0; 3455 /* 3456 * If we have just finished replacing a hot spared device, then 3457 * we need to detach the parent's first child (the original hot 3458 * spare) as well. 3459 */ 3460 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 3461 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3462 ASSERT(ppvd->vdev_children == 2); 3463 sguid = ppvd->vdev_child[1]->vdev_guid; 3464 } 3465 spa_config_exit(spa, SCL_ALL, FTAG); 3466 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 3467 return; 3468 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 3469 return; 3470 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3471 } 3472 3473 spa_config_exit(spa, SCL_ALL, FTAG); 3474 } 3475 3476 /* 3477 * Update the stored path for this vdev. Dirty the vdev configuration, relying 3478 * on spa_vdev_enter/exit() to synchronize the labels and cache. 3479 */ 3480 int 3481 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3482 { 3483 vdev_t *vd; 3484 uint64_t txg; 3485 3486 txg = spa_vdev_enter(spa); 3487 3488 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { 3489 /* 3490 * Determine if this is a reference to a hot spare device. If 3491 * it is, update the path manually as there is no associated 3492 * vdev_t that can be synced to disk. 
3493 */ 3494 nvlist_t **spares; 3495 uint_t i, nspares; 3496 3497 if (spa->spa_spares.sav_config != NULL) { 3498 VERIFY(nvlist_lookup_nvlist_array( 3499 spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 3500 &spares, &nspares) == 0); 3501 for (i = 0; i < nspares; i++) { 3502 uint64_t theguid; 3503 VERIFY(nvlist_lookup_uint64(spares[i], 3504 ZPOOL_CONFIG_GUID, &theguid) == 0); 3505 if (theguid == guid) { 3506 VERIFY(nvlist_add_string(spares[i], 3507 ZPOOL_CONFIG_PATH, newpath) == 0); 3508 spa_load_spares(spa); 3509 spa->spa_spares.sav_sync = B_TRUE; 3510 return (spa_vdev_exit(spa, NULL, txg, 3511 0)); 3512 } 3513 } 3514 } 3515 3516 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3517 } 3518 3519 if (!vd->vdev_ops->vdev_op_leaf) 3520 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3521 3522 spa_strfree(vd->vdev_path); 3523 vd->vdev_path = spa_strdup(newpath); 3524 3525 vdev_config_dirty(vd->vdev_top); 3526 3527 return (spa_vdev_exit(spa, NULL, txg, 0)); 3528 } 3529 3530 /* 3531 * ========================================================================== 3532 * SPA Scrubbing 3533 * ========================================================================== 3534 */ 3535 3536 int 3537 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3538 { 3539 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3540 3541 if ((uint_t)type >= POOL_SCRUB_TYPES) 3542 return (ENOTSUP); 3543 3544 /* 3545 * If a resilver was requested, but there is no DTL on a 3546 * writeable leaf device, we have nothing to do. 3547 */ 3548 if (type == POOL_SCRUB_RESILVER && 3549 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3550 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3551 return (0); 3552 } 3553 3554 if (type == POOL_SCRUB_EVERYTHING && 3555 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3556 spa->spa_dsl_pool->dp_scrub_isresilver) 3557 return (EBUSY); 3558 3559 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3560 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3561 } else if (type == POOL_SCRUB_NONE) { 3562 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3563 } else { 3564 return (EINVAL); 3565 } 3566 } 3567 3568 /* 3569 * ========================================================================== 3570 * SPA async task processing 3571 * ========================================================================== 3572 */ 3573 3574 static void 3575 spa_async_remove(spa_t *spa, vdev_t *vd) 3576 { 3577 if (vd->vdev_remove_wanted) { 3578 vd->vdev_remove_wanted = 0; 3579 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3580 vdev_clear(spa, vd); 3581 vdev_state_dirty(vd->vdev_top); 3582 } 3583 3584 for (int c = 0; c < vd->vdev_children; c++) 3585 spa_async_remove(spa, vd->vdev_child[c]); 3586 } 3587 3588 static void 3589 spa_async_probe(spa_t *spa, vdev_t *vd) 3590 { 3591 if (vd->vdev_probe_wanted) { 3592 vd->vdev_probe_wanted = 0; 3593 vdev_reopen(vd); /* vdev_open() does the actual probe */ 3594 } 3595 3596 for (int c = 0; c < vd->vdev_children; c++) 3597 spa_async_probe(spa, vd->vdev_child[c]); 3598 } 3599 3600 static void 3601 spa_async_thread(spa_t *spa) 3602 { 3603 int tasks; 3604 3605 ASSERT(spa->spa_sync_on); 3606 3607 mutex_enter(&spa->spa_async_lock); 3608 tasks = spa->spa_async_tasks; 3609 spa->spa_async_tasks = 0; 3610 mutex_exit(&spa->spa_async_lock); 3611 3612 /* 3613 * See if the config needs to be updated. 
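 *
 * (Each SPA_ASYNC_* bit handled below was set by some other thread via
 * spa_async_request().  For example, code that notices a failed device
 * probe is expected to do roughly
 *
 *	spa_async_request(spa, SPA_ASYNC_PROBE);
 *
 * and then rely on spa_async_dispatch() -- invoked at the end of
 * spa_sync() -- to start this thread.)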
3614 */ 3615 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3616 mutex_enter(&spa_namespace_lock); 3617 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3618 mutex_exit(&spa_namespace_lock); 3619 } 3620 3621 /* 3622 * See if any devices need to be marked REMOVED. 3623 */ 3624 if (tasks & SPA_ASYNC_REMOVE) { 3625 spa_vdev_state_enter(spa); 3626 spa_async_remove(spa, spa->spa_root_vdev); 3627 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3628 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3629 for (int i = 0; i < spa->spa_spares.sav_count; i++) 3630 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3631 (void) spa_vdev_state_exit(spa, NULL, 0); 3632 } 3633 3634 /* 3635 * See if any devices need to be probed. 3636 */ 3637 if (tasks & SPA_ASYNC_PROBE) { 3638 spa_vdev_state_enter(spa); 3639 spa_async_probe(spa, spa->spa_root_vdev); 3640 (void) spa_vdev_state_exit(spa, NULL, 0); 3641 } 3642 3643 /* 3644 * If any devices are done replacing, detach them. 3645 */ 3646 if (tasks & SPA_ASYNC_RESILVER_DONE) 3647 spa_vdev_resilver_done(spa); 3648 3649 /* 3650 * Kick off a resilver. 3651 */ 3652 if (tasks & SPA_ASYNC_RESILVER) 3653 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3654 3655 /* 3656 * Let the world know that we're done. 3657 */ 3658 mutex_enter(&spa->spa_async_lock); 3659 spa->spa_async_thread = NULL; 3660 cv_broadcast(&spa->spa_async_cv); 3661 mutex_exit(&spa->spa_async_lock); 3662 thread_exit(); 3663 } 3664 3665 void 3666 spa_async_suspend(spa_t *spa) 3667 { 3668 mutex_enter(&spa->spa_async_lock); 3669 spa->spa_async_suspended++; 3670 while (spa->spa_async_thread != NULL) 3671 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3672 mutex_exit(&spa->spa_async_lock); 3673 } 3674 3675 void 3676 spa_async_resume(spa_t *spa) 3677 { 3678 mutex_enter(&spa->spa_async_lock); 3679 ASSERT(spa->spa_async_suspended != 0); 3680 spa->spa_async_suspended--; 3681 mutex_exit(&spa->spa_async_lock); 3682 } 3683 3684 static void 3685 spa_async_dispatch(spa_t *spa) 3686 { 3687 mutex_enter(&spa->spa_async_lock); 3688 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3689 spa->spa_async_thread == NULL && 3690 rootdir != NULL && !vn_is_readonly(rootdir)) 3691 spa->spa_async_thread = thread_create(NULL, 0, 3692 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3693 mutex_exit(&spa->spa_async_lock); 3694 } 3695 3696 void 3697 spa_async_request(spa_t *spa, int task) 3698 { 3699 mutex_enter(&spa->spa_async_lock); 3700 spa->spa_async_tasks |= task; 3701 mutex_exit(&spa->spa_async_lock); 3702 } 3703 3704 /* 3705 * ========================================================================== 3706 * SPA syncing routines 3707 * ========================================================================== 3708 */ 3709 3710 static void 3711 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3712 { 3713 bplist_t *bpl = &spa->spa_sync_bplist; 3714 dmu_tx_t *tx; 3715 blkptr_t blk; 3716 uint64_t itor = 0; 3717 zio_t *zio; 3718 int error; 3719 uint8_t c = 1; 3720 3721 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 3722 3723 while (bplist_iterate(bpl, &itor, &blk) == 0) { 3724 ASSERT(blk.blk_birth < txg); 3725 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 3726 ZIO_FLAG_MUSTSUCCEED)); 3727 } 3728 3729 error = zio_wait(zio); 3730 ASSERT3U(error, ==, 0); 3731 3732 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3733 bplist_vacate(bpl, tx); 3734 3735 /* 3736 * Pre-dirty the first block so we sync to convergence faster. 3737 * (Usually only the first block is needed.) 
3738 */ 3739 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3740 dmu_tx_commit(tx); 3741 } 3742 3743 static void 3744 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3745 { 3746 char *packed = NULL; 3747 size_t bufsize; 3748 size_t nvsize = 0; 3749 dmu_buf_t *db; 3750 3751 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3752 3753 /* 3754 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3755 * information. This avoids the dbuf_will_dirty() path and 3756 * saves us a pre-read to get data we don't actually care about. 3757 */ 3758 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3759 packed = kmem_alloc(bufsize, KM_SLEEP); 3760 3761 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3762 KM_SLEEP) == 0); 3763 bzero(packed + nvsize, bufsize - nvsize); 3764 3765 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3766 3767 kmem_free(packed, bufsize); 3768 3769 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3770 dmu_buf_will_dirty(db, tx); 3771 *(uint64_t *)db->db_data = nvsize; 3772 dmu_buf_rele(db, FTAG); 3773 } 3774 3775 static void 3776 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3777 const char *config, const char *entry) 3778 { 3779 nvlist_t *nvroot; 3780 nvlist_t **list; 3781 int i; 3782 3783 if (!sav->sav_sync) 3784 return; 3785 3786 /* 3787 * Update the MOS nvlist describing the list of available devices. 3788 * spa_validate_aux() will have already made sure this nvlist is 3789 * valid and the vdevs are labeled appropriately. 3790 */ 3791 if (sav->sav_object == 0) { 3792 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3793 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3794 sizeof (uint64_t), tx); 3795 VERIFY(zap_update(spa->spa_meta_objset, 3796 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3797 &sav->sav_object, tx) == 0); 3798 } 3799 3800 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3801 if (sav->sav_count == 0) { 3802 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3803 } else { 3804 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3805 for (i = 0; i < sav->sav_count; i++) 3806 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3807 B_FALSE, B_FALSE, B_TRUE); 3808 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3809 sav->sav_count) == 0); 3810 for (i = 0; i < sav->sav_count; i++) 3811 nvlist_free(list[i]); 3812 kmem_free(list, sav->sav_count * sizeof (void *)); 3813 } 3814 3815 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3816 nvlist_free(nvroot); 3817 3818 sav->sav_sync = B_FALSE; 3819 } 3820 3821 static void 3822 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3823 { 3824 nvlist_t *config; 3825 3826 if (list_is_empty(&spa->spa_config_dirty_list)) 3827 return; 3828 3829 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 3830 3831 config = spa_config_generate(spa, spa->spa_root_vdev, 3832 dmu_tx_get_txg(tx), B_FALSE); 3833 3834 spa_config_exit(spa, SCL_STATE, FTAG); 3835 3836 if (spa->spa_config_syncing) 3837 nvlist_free(spa->spa_config_syncing); 3838 spa->spa_config_syncing = config; 3839 3840 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3841 } 3842 3843 /* 3844 * Set zpool properties. 
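 *
 * This runs in syncing context as a sync task: arg1 is the spa_t and
 * arg2 is the nvlist of pending property settings.  A caller such as
 * spa_prop_set() is expected to dispatch it along these lines (a sketch
 * only -- the exact dsl_sync_task_do() argument list is from memory):
 *
 *	dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
 *	    spa, nvp, 3);
 *
 * spa_create(), by contrast, calls this function directly with the
 * pool-creation transaction.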
3845 */ 3846 static void 3847 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3848 { 3849 spa_t *spa = arg1; 3850 objset_t *mos = spa->spa_meta_objset; 3851 nvlist_t *nvp = arg2; 3852 nvpair_t *elem; 3853 uint64_t intval; 3854 char *strval; 3855 zpool_prop_t prop; 3856 const char *propname; 3857 zprop_type_t proptype; 3858 3859 mutex_enter(&spa->spa_props_lock); 3860 3861 elem = NULL; 3862 while ((elem = nvlist_next_nvpair(nvp, elem))) { 3863 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 3864 case ZPOOL_PROP_VERSION: 3865 /* 3866 * Only set version for non-zpool-creation cases 3867 * (set/import). spa_create() needs special care 3868 * for version setting. 3869 */ 3870 if (tx->tx_txg != TXG_INITIAL) { 3871 VERIFY(nvpair_value_uint64(elem, 3872 &intval) == 0); 3873 ASSERT(intval <= SPA_VERSION); 3874 ASSERT(intval >= spa_version(spa)); 3875 spa->spa_uberblock.ub_version = intval; 3876 vdev_config_dirty(spa->spa_root_vdev); 3877 } 3878 break; 3879 3880 case ZPOOL_PROP_ALTROOT: 3881 /* 3882 * 'altroot' is a non-persistent property. It should 3883 * have been set temporarily at creation or import time. 3884 */ 3885 ASSERT(spa->spa_root != NULL); 3886 break; 3887 3888 case ZPOOL_PROP_CACHEFILE: 3889 /* 3890 * 'cachefile' is also a non-persisitent property. 3891 */ 3892 break; 3893 default: 3894 /* 3895 * Set pool property values in the poolprops mos object. 3896 */ 3897 if (spa->spa_pool_props_object == 0) { 3898 objset_t *mos = spa->spa_meta_objset; 3899 3900 VERIFY((spa->spa_pool_props_object = 3901 zap_create(mos, DMU_OT_POOL_PROPS, 3902 DMU_OT_NONE, 0, tx)) > 0); 3903 3904 VERIFY(zap_update(mos, 3905 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 3906 8, 1, &spa->spa_pool_props_object, tx) 3907 == 0); 3908 } 3909 3910 /* normalize the property name */ 3911 propname = zpool_prop_to_name(prop); 3912 proptype = zpool_prop_get_type(prop); 3913 3914 if (nvpair_type(elem) == DATA_TYPE_STRING) { 3915 ASSERT(proptype == PROP_TYPE_STRING); 3916 VERIFY(nvpair_value_string(elem, &strval) == 0); 3917 VERIFY(zap_update(mos, 3918 spa->spa_pool_props_object, propname, 3919 1, strlen(strval) + 1, strval, tx) == 0); 3920 3921 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 3922 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 3923 3924 if (proptype == PROP_TYPE_INDEX) { 3925 const char *unused; 3926 VERIFY(zpool_prop_index_to_string( 3927 prop, intval, &unused) == 0); 3928 } 3929 VERIFY(zap_update(mos, 3930 spa->spa_pool_props_object, propname, 3931 8, 1, &intval, tx) == 0); 3932 } else { 3933 ASSERT(0); /* not allowed */ 3934 } 3935 3936 switch (prop) { 3937 case ZPOOL_PROP_DELEGATION: 3938 spa->spa_delegation = intval; 3939 break; 3940 case ZPOOL_PROP_BOOTFS: 3941 spa->spa_bootfs = intval; 3942 break; 3943 case ZPOOL_PROP_FAILUREMODE: 3944 spa->spa_failmode = intval; 3945 break; 3946 default: 3947 break; 3948 } 3949 } 3950 3951 /* log internal history if this is not a zpool create */ 3952 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 3953 tx->tx_txg != TXG_INITIAL) { 3954 spa_history_internal_log(LOG_POOL_PROPSET, 3955 spa, tx, cr, "%s %lld %s", 3956 nvpair_name(elem), intval, spa_name(spa)); 3957 } 3958 } 3959 3960 mutex_exit(&spa->spa_props_lock); 3961 } 3962 3963 /* 3964 * Sync the specified transaction group. New blocks may be dirtied as 3965 * part of the process, so we iterate until it converges. 
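 *
 * Informally, the convergence argument is that each pass writes out
 * only what the previous pass dirtied, and that set shrinks quickly:
 *
 *	pass 1:  user data plus most metadata
 *	pass 2:  metadata dirtied while writing pass 1 (MOS, space maps)
 *	pass 3+: typically a handful of MOS blocks, until nothing in
 *	         this txg remains dirty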
3966 */ 3967 void 3968 spa_sync(spa_t *spa, uint64_t txg) 3969 { 3970 dsl_pool_t *dp = spa->spa_dsl_pool; 3971 objset_t *mos = spa->spa_meta_objset; 3972 bplist_t *bpl = &spa->spa_sync_bplist; 3973 vdev_t *rvd = spa->spa_root_vdev; 3974 vdev_t *vd; 3975 dmu_tx_t *tx; 3976 int dirty_vdevs; 3977 int error; 3978 3979 /* 3980 * Lock out configuration changes. 3981 */ 3982 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3983 3984 spa->spa_syncing_txg = txg; 3985 spa->spa_sync_pass = 0; 3986 3987 /* 3988 * If there are any pending vdev state changes, convert them 3989 * into config changes that go out with this transaction group. 3990 */ 3991 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 3992 while (list_head(&spa->spa_state_dirty_list) != NULL) { 3993 /* 3994 * We need the write lock here because, for aux vdevs, 3995 * calling vdev_config_dirty() modifies sav_config. 3996 * This is ugly and will become unnecessary when we 3997 * eliminate the aux vdev wart by integrating all vdevs 3998 * into the root vdev tree. 3999 */ 4000 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4001 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 4002 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 4003 vdev_state_clean(vd); 4004 vdev_config_dirty(vd); 4005 } 4006 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4007 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 4008 } 4009 spa_config_exit(spa, SCL_STATE, FTAG); 4010 4011 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 4012 4013 tx = dmu_tx_create_assigned(dp, txg); 4014 4015 /* 4016 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 4017 * set spa_deflate if we have no raid-z vdevs. 4018 */ 4019 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 4020 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 4021 int i; 4022 4023 for (i = 0; i < rvd->vdev_children; i++) { 4024 vd = rvd->vdev_child[i]; 4025 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 4026 break; 4027 } 4028 if (i == rvd->vdev_children) { 4029 spa->spa_deflate = TRUE; 4030 VERIFY(0 == zap_add(spa->spa_meta_objset, 4031 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4032 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 4033 } 4034 } 4035 4036 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 4037 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 4038 dsl_pool_create_origin(dp, tx); 4039 4040 /* Keeping the origin open increases spa_minref */ 4041 spa->spa_minref += 3; 4042 } 4043 4044 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 4045 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 4046 dsl_pool_upgrade_clones(dp, tx); 4047 } 4048 4049 /* 4050 * If anything has changed in this txg, push the deferred frees 4051 * from the previous txg. If not, leave them alone so that we 4052 * don't generate work on an otherwise idle system. 4053 */ 4054 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 4055 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 4056 !txg_list_empty(&dp->dp_sync_tasks, txg)) 4057 spa_sync_deferred_frees(spa, txg); 4058 4059 /* 4060 * Iterate to convergence. 

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);
			int c;

			for (c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
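
/*
 * Editorial sketch, not part of the original file: a minimal example of how
 * a caller typically waits for the work described above to reach stable
 * storage -- take a reference on the pool, then block in txg_wait_synced()
 * until the dirty txgs have been pushed out by spa_sync().  The function
 * name is hypothetical; spa_sync_allpools() below applies the same pattern
 * to every active pool.  Kept under #if 0 so it does not affect the build.
 */
#if 0
static int
example_wait_for_sync(const char *poolname)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(poolname, &spa, FTAG)) != 0)
		return (error);

	/* txg 0 means "wait for everything that is currently dirty" */
	txg_wait_synced(spa_get_dsl(spa), 0);

	spa_close(spa, FTAG);
	return (0);
}
#endif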

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (l2cache) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * pool with a newer (unsupported) version cannot be opened, the
	 * current on-disk version should never exceed SPA_VERSION here.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}
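
/*
 * Editorial sketch, not part of the original file: how the guid-based
 * lookups above might be combined -- check whether a guid is known to the
 * pool either as a regular or l2cache vdev (via spa_lookup_by_guid()) or as
 * a configured spare (via spa_has_spare()).  Holding SCL_STATE as reader
 * while walking the vdev tree is an assumption about the caller's context;
 * the function name is hypothetical.  Kept under #if 0 so it does not
 * affect the build.
 */
#if 0
static boolean_t
example_guid_in_pool(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
	spa_config_exit(spa, SCL_STATE, FTAG);

	return (vd != NULL || spa_has_spare(spa, guid));
}
#endif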

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, one as a spare and
 * one as a replacing vdev.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
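
/*
 * Editorial sketch, not part of the original file: how a vdev state-change
 * path might post one of the sysevents described above.  The function name
 * is hypothetical, and ESC_ZFS_VDEV_REMOVE is assumed to be one of the
 * EC_ZFS event names from sys/sysevent/eventdefs.h.  Kept under #if 0 so it
 * does not affect the build.
 */
#if 0
static void
example_notify_vdev_removed(spa_t *spa, vdev_t *vd)
{
	/*
	 * The payload (pool name, pool guid, vdev guid and path) is
	 * filled in by spa_event_notify() itself; only the event name
	 * and the affected vdev need to be supplied.
	 */
	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
}
#endif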