1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/zio_compress.h> 41 #include <sys/dmu.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/uberblock_impl.h> 48 #include <sys/txg.h> 49 #include <sys/avl.h> 50 #include <sys/dmu_traverse.h> 51 #include <sys/dmu_objset.h> 52 #include <sys/unique.h> 53 #include <sys/dsl_pool.h> 54 #include <sys/dsl_dataset.h> 55 #include <sys/dsl_dir.h> 56 #include <sys/dsl_prop.h> 57 #include <sys/dsl_synctask.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/arc.h> 60 #include <sys/callb.h> 61 #include <sys/systeminfo.h> 62 #include <sys/sunddi.h> 63 #include <sys/spa_boot.h> 64 65 #include "zfs_prop.h" 66 #include "zfs_comutil.h" 67 68 int zio_taskq_threads = 8; 69 70 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); 71 72 /* 73 * ========================================================================== 74 * SPA properties routines 75 * ========================================================================== 76 */ 77 78 /* 79 * Add a (source=src, propname=propval) list to an nvlist. 80 */ 81 static void 82 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 83 uint64_t intval, zprop_source_t src) 84 { 85 const char *propname = zpool_prop_to_name(prop); 86 nvlist_t *propval; 87 88 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 89 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 90 91 if (strval != NULL) 92 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 93 else 94 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 95 96 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 97 nvlist_free(propval); 98 } 99 100 /* 101 * Get property values from the spa configuration. 
102 */ 103 static void 104 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 105 { 106 uint64_t size = spa_get_space(spa); 107 uint64_t used = spa_get_alloc(spa); 108 uint64_t cap, version; 109 zprop_source_t src = ZPROP_SRC_NONE; 110 spa_config_dirent_t *dp; 111 112 /* 113 * readonly properties 114 */ 115 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name, 0, src); 116 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 117 spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); 118 spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); 119 120 cap = (size == 0) ? 0 : (used * 100 / size); 121 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 122 123 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 124 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 125 spa->spa_root_vdev->vdev_state, src); 126 127 /* 128 * settable properties that are not stored in the pool property object. 129 */ 130 version = spa_version(spa); 131 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 132 src = ZPROP_SRC_DEFAULT; 133 else 134 src = ZPROP_SRC_LOCAL; 135 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 136 137 if (spa->spa_root != NULL) 138 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 139 0, ZPROP_SRC_LOCAL); 140 141 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 142 if (dp->scd_path == NULL) { 143 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 144 "none", 0, ZPROP_SRC_LOCAL); 145 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 146 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 147 dp->scd_path, 0, ZPROP_SRC_LOCAL); 148 } 149 } 150 } 151 152 /* 153 * Get zpool property values. 154 */ 155 int 156 spa_prop_get(spa_t *spa, nvlist_t **nvp) 157 { 158 zap_cursor_t zc; 159 zap_attribute_t za; 160 objset_t *mos = spa->spa_meta_objset; 161 int err; 162 163 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 164 165 /* 166 * Get properties from the spa config. 167 */ 168 spa_prop_get_config(spa, nvp); 169 170 mutex_enter(&spa->spa_props_lock); 171 /* If no pool property object, no more prop to get. */ 172 if (spa->spa_pool_props_object == 0) { 173 mutex_exit(&spa->spa_props_lock); 174 return (0); 175 } 176 177 /* 178 * Get properties from the MOS pool property object. 
179 */ 180 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 181 (err = zap_cursor_retrieve(&zc, &za)) == 0; 182 zap_cursor_advance(&zc)) { 183 uint64_t intval = 0; 184 char *strval = NULL; 185 zprop_source_t src = ZPROP_SRC_DEFAULT; 186 zpool_prop_t prop; 187 188 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 189 continue; 190 191 switch (za.za_integer_length) { 192 case 8: 193 /* integer property */ 194 if (za.za_first_integer != 195 zpool_prop_default_numeric(prop)) 196 src = ZPROP_SRC_LOCAL; 197 198 if (prop == ZPOOL_PROP_BOOTFS) { 199 dsl_pool_t *dp; 200 dsl_dataset_t *ds = NULL; 201 202 dp = spa_get_dsl(spa); 203 rw_enter(&dp->dp_config_rwlock, RW_READER); 204 if (err = dsl_dataset_open_obj(dp, 205 za.za_first_integer, NULL, DS_MODE_NONE, 206 FTAG, &ds)) { 207 rw_exit(&dp->dp_config_rwlock); 208 break; 209 } 210 211 strval = kmem_alloc( 212 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 213 KM_SLEEP); 214 dsl_dataset_name(ds, strval); 215 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 216 rw_exit(&dp->dp_config_rwlock); 217 } else { 218 strval = NULL; 219 intval = za.za_first_integer; 220 } 221 222 spa_prop_add_list(*nvp, prop, strval, intval, src); 223 224 if (strval != NULL) 225 kmem_free(strval, 226 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 227 228 break; 229 230 case 1: 231 /* string property */ 232 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 233 err = zap_lookup(mos, spa->spa_pool_props_object, 234 za.za_name, 1, za.za_num_integers, strval); 235 if (err) { 236 kmem_free(strval, za.za_num_integers); 237 break; 238 } 239 spa_prop_add_list(*nvp, prop, strval, 0, src); 240 kmem_free(strval, za.za_num_integers); 241 break; 242 243 default: 244 break; 245 } 246 } 247 zap_cursor_fini(&zc); 248 mutex_exit(&spa->spa_props_lock); 249 out: 250 if (err && err != ENOENT) { 251 nvlist_free(*nvp); 252 *nvp = NULL; 253 return (err); 254 } 255 256 return (0); 257 } 258 259 /* 260 * Validate the given pool properties nvlist and modify the list 261 * for the property values to be set. 262 */ 263 static int 264 spa_prop_validate(spa_t *spa, nvlist_t *props) 265 { 266 nvpair_t *elem; 267 int error = 0, reset_bootfs = 0; 268 uint64_t objnum; 269 270 elem = NULL; 271 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 272 zpool_prop_t prop; 273 char *propname, *strval; 274 uint64_t intval; 275 vdev_t *rvdev; 276 char *vdev_type; 277 objset_t *os; 278 char *slash; 279 280 propname = nvpair_name(elem); 281 282 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 283 return (EINVAL); 284 285 switch (prop) { 286 case ZPOOL_PROP_VERSION: 287 error = nvpair_value_uint64(elem, &intval); 288 if (!error && 289 (intval < spa_version(spa) || intval > SPA_VERSION)) 290 error = EINVAL; 291 break; 292 293 case ZPOOL_PROP_DELEGATION: 294 case ZPOOL_PROP_AUTOREPLACE: 295 error = nvpair_value_uint64(elem, &intval); 296 if (!error && intval > 1) 297 error = EINVAL; 298 break; 299 300 case ZPOOL_PROP_BOOTFS: 301 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 302 error = ENOTSUP; 303 break; 304 } 305 306 /* 307 * A bootable filesystem can not be on a RAIDZ pool 308 * nor a striped pool with more than 1 device. 
309 */ 310 rvdev = spa->spa_root_vdev; 311 vdev_type = 312 rvdev->vdev_child[0]->vdev_ops->vdev_op_type; 313 if (rvdev->vdev_children > 1 || 314 strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 315 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 316 error = ENOTSUP; 317 break; 318 } 319 320 reset_bootfs = 1; 321 322 error = nvpair_value_string(elem, &strval); 323 324 if (!error) { 325 if (strval == NULL || strval[0] == '\0') { 326 objnum = zpool_prop_default_numeric( 327 ZPOOL_PROP_BOOTFS); 328 break; 329 } 330 331 if (error = dmu_objset_open(strval, DMU_OST_ZFS, 332 DS_MODE_STANDARD | DS_MODE_READONLY, &os)) 333 break; 334 objnum = dmu_objset_id(os); 335 dmu_objset_close(os); 336 } 337 break; 338 case ZPOOL_PROP_FAILUREMODE: 339 error = nvpair_value_uint64(elem, &intval); 340 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 341 intval > ZIO_FAILURE_MODE_PANIC)) 342 error = EINVAL; 343 344 /* 345 * This is a special case which only occurs when 346 * the pool has completely failed. This allows 347 * the user to change the in-core failmode property 348 * without syncing it out to disk (I/Os might 349 * currently be blocked). We do this by returning 350 * EIO to the caller (spa_prop_set) to trick it 351 * into thinking we encountered a property validation 352 * error. 353 */ 354 if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) { 355 spa->spa_failmode = intval; 356 error = EIO; 357 } 358 break; 359 360 case ZPOOL_PROP_CACHEFILE: 361 if ((error = nvpair_value_string(elem, &strval)) != 0) 362 break; 363 364 if (strval[0] == '\0') 365 break; 366 367 if (strcmp(strval, "none") == 0) 368 break; 369 370 if (strval[0] != '/') { 371 error = EINVAL; 372 break; 373 } 374 375 slash = strrchr(strval, '/'); 376 ASSERT(slash != NULL); 377 378 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 379 strcmp(slash, "/..") == 0) 380 error = EINVAL; 381 break; 382 } 383 384 if (error) 385 break; 386 } 387 388 if (!error && reset_bootfs) { 389 error = nvlist_remove(props, 390 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 391 392 if (!error) { 393 error = nvlist_add_uint64(props, 394 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 395 } 396 } 397 398 return (error); 399 } 400 401 int 402 spa_prop_set(spa_t *spa, nvlist_t *nvp) 403 { 404 int error; 405 406 if ((error = spa_prop_validate(spa, nvp)) != 0) 407 return (error); 408 409 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 410 spa, nvp, 3)); 411 } 412 413 /* 414 * If the bootfs property value is dsobj, clear it. 
415 */ 416 void 417 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 418 { 419 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 420 VERIFY(zap_remove(spa->spa_meta_objset, 421 spa->spa_pool_props_object, 422 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 423 spa->spa_bootfs = 0; 424 } 425 } 426 427 /* 428 * ========================================================================== 429 * SPA state manipulation (open/create/destroy/import/export) 430 * ========================================================================== 431 */ 432 433 static int 434 spa_error_entry_compare(const void *a, const void *b) 435 { 436 spa_error_entry_t *sa = (spa_error_entry_t *)a; 437 spa_error_entry_t *sb = (spa_error_entry_t *)b; 438 int ret; 439 440 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 441 sizeof (zbookmark_t)); 442 443 if (ret < 0) 444 return (-1); 445 else if (ret > 0) 446 return (1); 447 else 448 return (0); 449 } 450 451 /* 452 * Utility function which retrieves copies of the current logs and 453 * re-initializes them in the process. 454 */ 455 void 456 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 457 { 458 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 459 460 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 461 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 462 463 avl_create(&spa->spa_errlist_scrub, 464 spa_error_entry_compare, sizeof (spa_error_entry_t), 465 offsetof(spa_error_entry_t, se_avl)); 466 avl_create(&spa->spa_errlist_last, 467 spa_error_entry_compare, sizeof (spa_error_entry_t), 468 offsetof(spa_error_entry_t, se_avl)); 469 } 470 471 /* 472 * Activate an uninitialized pool. 473 */ 474 static void 475 spa_activate(spa_t *spa) 476 { 477 int t; 478 479 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 480 481 spa->spa_state = POOL_STATE_ACTIVE; 482 483 spa->spa_normal_class = metaslab_class_create(); 484 spa->spa_log_class = metaslab_class_create(); 485 486 for (t = 0; t < ZIO_TYPES; t++) { 487 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 488 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 489 TASKQ_PREPOPULATE); 490 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 491 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 492 TASKQ_PREPOPULATE); 493 } 494 495 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 496 offsetof(vdev_t, vdev_dirty_node)); 497 list_create(&spa->spa_zio_list, sizeof (zio_t), 498 offsetof(zio_t, zio_link_node)); 499 500 txg_list_create(&spa->spa_vdev_txg_list, 501 offsetof(struct vdev, vdev_txg_node)); 502 503 avl_create(&spa->spa_errlist_scrub, 504 spa_error_entry_compare, sizeof (spa_error_entry_t), 505 offsetof(spa_error_entry_t, se_avl)); 506 avl_create(&spa->spa_errlist_last, 507 spa_error_entry_compare, sizeof (spa_error_entry_t), 508 offsetof(spa_error_entry_t, se_avl)); 509 } 510 511 /* 512 * Opposite of spa_activate(). 
513 */ 514 static void 515 spa_deactivate(spa_t *spa) 516 { 517 int t; 518 519 ASSERT(spa->spa_sync_on == B_FALSE); 520 ASSERT(spa->spa_dsl_pool == NULL); 521 ASSERT(spa->spa_root_vdev == NULL); 522 523 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 524 525 txg_list_destroy(&spa->spa_vdev_txg_list); 526 527 list_destroy(&spa->spa_dirty_list); 528 list_destroy(&spa->spa_zio_list); 529 530 for (t = 0; t < ZIO_TYPES; t++) { 531 taskq_destroy(spa->spa_zio_issue_taskq[t]); 532 taskq_destroy(spa->spa_zio_intr_taskq[t]); 533 spa->spa_zio_issue_taskq[t] = NULL; 534 spa->spa_zio_intr_taskq[t] = NULL; 535 } 536 537 metaslab_class_destroy(spa->spa_normal_class); 538 spa->spa_normal_class = NULL; 539 540 metaslab_class_destroy(spa->spa_log_class); 541 spa->spa_log_class = NULL; 542 543 /* 544 * If this was part of an import or the open otherwise failed, we may 545 * still have errors left in the queues. Empty them just in case. 546 */ 547 spa_errlog_drain(spa); 548 549 avl_destroy(&spa->spa_errlist_scrub); 550 avl_destroy(&spa->spa_errlist_last); 551 552 spa->spa_state = POOL_STATE_UNINITIALIZED; 553 } 554 555 /* 556 * Verify a pool configuration, and construct the vdev tree appropriately. This 557 * will create all the necessary vdevs in the appropriate layout, with each vdev 558 * in the CLOSED state. This will prep the pool before open/creation/import. 559 * All vdev validation is done by the vdev_alloc() routine. 560 */ 561 static int 562 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 563 uint_t id, int atype) 564 { 565 nvlist_t **child; 566 uint_t c, children; 567 int error; 568 569 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 570 return (error); 571 572 if ((*vdp)->vdev_ops->vdev_op_leaf) 573 return (0); 574 575 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 576 &child, &children) != 0) { 577 vdev_free(*vdp); 578 *vdp = NULL; 579 return (EINVAL); 580 } 581 582 for (c = 0; c < children; c++) { 583 vdev_t *vd; 584 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 585 atype)) != 0) { 586 vdev_free(*vdp); 587 *vdp = NULL; 588 return (error); 589 } 590 } 591 592 ASSERT(*vdp != NULL); 593 594 return (0); 595 } 596 597 /* 598 * Opposite of spa_load(). 599 */ 600 static void 601 spa_unload(spa_t *spa) 602 { 603 int i; 604 605 /* 606 * Stop async tasks. 607 */ 608 spa_async_suspend(spa); 609 610 /* 611 * Stop syncing. 612 */ 613 if (spa->spa_sync_on) { 614 txg_sync_stop(spa->spa_dsl_pool); 615 spa->spa_sync_on = B_FALSE; 616 } 617 618 /* 619 * Wait for any outstanding prefetch I/O to complete. 620 */ 621 spa_config_enter(spa, RW_WRITER, FTAG); 622 spa_config_exit(spa, FTAG); 623 624 /* 625 * Drop and purge level 2 cache 626 */ 627 spa_l2cache_drop(spa); 628 629 /* 630 * Close the dsl pool. 631 */ 632 if (spa->spa_dsl_pool) { 633 dsl_pool_close(spa->spa_dsl_pool); 634 spa->spa_dsl_pool = NULL; 635 } 636 637 /* 638 * Close all vdevs. 
639 */ 640 if (spa->spa_root_vdev) 641 vdev_free(spa->spa_root_vdev); 642 ASSERT(spa->spa_root_vdev == NULL); 643 644 for (i = 0; i < spa->spa_spares.sav_count; i++) 645 vdev_free(spa->spa_spares.sav_vdevs[i]); 646 if (spa->spa_spares.sav_vdevs) { 647 kmem_free(spa->spa_spares.sav_vdevs, 648 spa->spa_spares.sav_count * sizeof (void *)); 649 spa->spa_spares.sav_vdevs = NULL; 650 } 651 if (spa->spa_spares.sav_config) { 652 nvlist_free(spa->spa_spares.sav_config); 653 spa->spa_spares.sav_config = NULL; 654 } 655 656 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 657 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 658 if (spa->spa_l2cache.sav_vdevs) { 659 kmem_free(spa->spa_l2cache.sav_vdevs, 660 spa->spa_l2cache.sav_count * sizeof (void *)); 661 spa->spa_l2cache.sav_vdevs = NULL; 662 } 663 if (spa->spa_l2cache.sav_config) { 664 nvlist_free(spa->spa_l2cache.sav_config); 665 spa->spa_l2cache.sav_config = NULL; 666 } 667 668 spa->spa_async_suspended = 0; 669 } 670 671 /* 672 * Load (or re-load) the current list of vdevs describing the active spares for 673 * this pool. When this is called, we have some form of basic information in 674 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 675 * then re-generate a more complete list including status information. 676 */ 677 static void 678 spa_load_spares(spa_t *spa) 679 { 680 nvlist_t **spares; 681 uint_t nspares; 682 int i; 683 vdev_t *vd, *tvd; 684 685 /* 686 * First, close and free any existing spare vdevs. 687 */ 688 for (i = 0; i < spa->spa_spares.sav_count; i++) { 689 vd = spa->spa_spares.sav_vdevs[i]; 690 691 /* Undo the call to spa_activate() below */ 692 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 693 B_FALSE)) != NULL && tvd->vdev_isspare) 694 spa_spare_remove(tvd); 695 vdev_close(vd); 696 vdev_free(vd); 697 } 698 699 if (spa->spa_spares.sav_vdevs) 700 kmem_free(spa->spa_spares.sav_vdevs, 701 spa->spa_spares.sav_count * sizeof (void *)); 702 703 if (spa->spa_spares.sav_config == NULL) 704 nspares = 0; 705 else 706 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 707 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 708 709 spa->spa_spares.sav_count = (int)nspares; 710 spa->spa_spares.sav_vdevs = NULL; 711 712 if (nspares == 0) 713 return; 714 715 /* 716 * Construct the array of vdevs, opening them to get status in the 717 * process. For each spare, there is potentially two different vdev_t 718 * structures associated with it: one in the list of spares (used only 719 * for basic validation purposes) and one in the active vdev 720 * configuration (if it's spared in). During this phase we open and 721 * validate each vdev on the spare list. If the vdev also exists in the 722 * active configuration, then we also mark this vdev as an active spare. 723 */ 724 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 725 KM_SLEEP); 726 for (i = 0; i < spa->spa_spares.sav_count; i++) { 727 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 728 VDEV_ALLOC_SPARE) == 0); 729 ASSERT(vd != NULL); 730 731 spa->spa_spares.sav_vdevs[i] = vd; 732 733 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 734 B_FALSE)) != NULL) { 735 if (!tvd->vdev_isspare) 736 spa_spare_add(tvd); 737 738 /* 739 * We only mark the spare active if we were successfully 740 * able to load the vdev. Otherwise, importing a pool 741 * with a bad active spare would result in strange 742 * behavior, because multiple pool would think the spare 743 * is actively in use. 
744 * 745 * There is a vulnerability here to an equally bizarre 746 * circumstance, where a dead active spare is later 747 * brought back to life (onlined or otherwise). Given 748 * the rarity of this scenario, and the extra complexity 749 * it adds, we ignore the possibility. 750 */ 751 if (!vdev_is_dead(tvd)) 752 spa_spare_activate(tvd); 753 } 754 755 if (vdev_open(vd) != 0) 756 continue; 757 758 vd->vdev_top = vd; 759 if (vdev_validate_aux(vd) == 0) 760 spa_spare_add(vd); 761 } 762 763 /* 764 * Recompute the stashed list of spares, with status information 765 * this time. 766 */ 767 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 768 DATA_TYPE_NVLIST_ARRAY) == 0); 769 770 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 771 KM_SLEEP); 772 for (i = 0; i < spa->spa_spares.sav_count; i++) 773 spares[i] = vdev_config_generate(spa, 774 spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 775 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 776 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 777 for (i = 0; i < spa->spa_spares.sav_count; i++) 778 nvlist_free(spares[i]); 779 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 780 } 781 782 /* 783 * Load (or re-load) the current list of vdevs describing the active l2cache for 784 * this pool. When this is called, we have some form of basic information in 785 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 786 * then re-generate a more complete list including status information. 787 * Devices which are already active have their details maintained, and are 788 * not re-opened. 789 */ 790 static void 791 spa_load_l2cache(spa_t *spa) 792 { 793 nvlist_t **l2cache; 794 uint_t nl2cache; 795 int i, j, oldnvdevs; 796 uint64_t guid, size; 797 vdev_t *vd, **oldvdevs, **newvdevs; 798 spa_aux_vdev_t *sav = &spa->spa_l2cache; 799 800 if (sav->sav_config != NULL) { 801 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 802 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 803 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 804 } else { 805 nl2cache = 0; 806 } 807 808 oldvdevs = sav->sav_vdevs; 809 oldnvdevs = sav->sav_count; 810 sav->sav_vdevs = NULL; 811 sav->sav_count = 0; 812 813 /* 814 * Process new nvlist of vdevs. 815 */ 816 for (i = 0; i < nl2cache; i++) { 817 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 818 &guid) == 0); 819 820 newvdevs[i] = NULL; 821 for (j = 0; j < oldnvdevs; j++) { 822 vd = oldvdevs[j]; 823 if (vd != NULL && guid == vd->vdev_guid) { 824 /* 825 * Retain previous vdev for add/remove ops. 826 */ 827 newvdevs[i] = vd; 828 oldvdevs[j] = NULL; 829 break; 830 } 831 } 832 833 if (newvdevs[i] == NULL) { 834 /* 835 * Create new vdev 836 */ 837 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 838 VDEV_ALLOC_L2CACHE) == 0); 839 ASSERT(vd != NULL); 840 newvdevs[i] = vd; 841 842 /* 843 * Commit this vdev as an l2cache device, 844 * even if it fails to open. 
845 */ 846 spa_l2cache_add(vd); 847 848 vd->vdev_top = vd; 849 vd->vdev_aux = sav; 850 851 spa_l2cache_activate(vd); 852 853 if (vdev_open(vd) != 0) 854 continue; 855 856 (void) vdev_validate_aux(vd); 857 858 if (!vdev_is_dead(vd)) { 859 size = vdev_get_rsize(vd); 860 l2arc_add_vdev(spa, vd, 861 VDEV_LABEL_START_SIZE, 862 size - VDEV_LABEL_START_SIZE); 863 } 864 } 865 } 866 867 /* 868 * Purge vdevs that were dropped 869 */ 870 for (i = 0; i < oldnvdevs; i++) { 871 uint64_t pool; 872 873 vd = oldvdevs[i]; 874 if (vd != NULL) { 875 if (spa_mode & FWRITE && 876 spa_l2cache_exists(vd->vdev_guid, &pool) && 877 pool != 0ULL && 878 l2arc_vdev_present(vd)) { 879 l2arc_remove_vdev(vd); 880 } 881 (void) vdev_close(vd); 882 spa_l2cache_remove(vd); 883 } 884 } 885 886 if (oldvdevs) 887 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 888 889 if (sav->sav_config == NULL) 890 goto out; 891 892 sav->sav_vdevs = newvdevs; 893 sav->sav_count = (int)nl2cache; 894 895 /* 896 * Recompute the stashed list of l2cache devices, with status 897 * information this time. 898 */ 899 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 900 DATA_TYPE_NVLIST_ARRAY) == 0); 901 902 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 903 for (i = 0; i < sav->sav_count; i++) 904 l2cache[i] = vdev_config_generate(spa, 905 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 906 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 907 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 908 out: 909 for (i = 0; i < sav->sav_count; i++) 910 nvlist_free(l2cache[i]); 911 if (sav->sav_count) 912 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 913 } 914 915 static int 916 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 917 { 918 dmu_buf_t *db; 919 char *packed = NULL; 920 size_t nvsize = 0; 921 int error; 922 *value = NULL; 923 924 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 925 nvsize = *(uint64_t *)db->db_data; 926 dmu_buf_rele(db, FTAG); 927 928 packed = kmem_alloc(nvsize, KM_SLEEP); 929 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 930 if (error == 0) 931 error = nvlist_unpack(packed, nvsize, value, 0); 932 kmem_free(packed, nvsize); 933 934 return (error); 935 } 936 937 /* 938 * Checks to see if the given vdev could not be opened, in which case we post a 939 * sysevent to notify the autoreplace code that the device has been removed. 940 */ 941 static void 942 spa_check_removed(vdev_t *vd) 943 { 944 int c; 945 946 for (c = 0; c < vd->vdev_children; c++) 947 spa_check_removed(vd->vdev_child[c]); 948 949 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 950 zfs_post_autoreplace(vd->vdev_spa, vd); 951 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 952 } 953 } 954 955 /* 956 * Load an existing storage pool, using the pool's builtin spa_config as a 957 * source of configuration information. 
958 */ 959 static int 960 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 961 { 962 int error = 0; 963 nvlist_t *nvroot = NULL; 964 vdev_t *rvd; 965 uberblock_t *ub = &spa->spa_uberblock; 966 uint64_t config_cache_txg = spa->spa_config_txg; 967 uint64_t pool_guid; 968 uint64_t version; 969 zio_t *zio; 970 uint64_t autoreplace = 0; 971 972 spa->spa_load_state = state; 973 974 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 975 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 976 error = EINVAL; 977 goto out; 978 } 979 980 /* 981 * Versioning wasn't explicitly added to the label until later, so if 982 * it's not present treat it as the initial version. 983 */ 984 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 985 version = SPA_VERSION_INITIAL; 986 987 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 988 &spa->spa_config_txg); 989 990 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 991 spa_guid_exists(pool_guid, 0)) { 992 error = EEXIST; 993 goto out; 994 } 995 996 spa->spa_load_guid = pool_guid; 997 998 /* 999 * Parse the configuration into a vdev tree. We explicitly set the 1000 * value that will be returned by spa_version() since parsing the 1001 * configuration requires knowing the version number. 1002 */ 1003 spa_config_enter(spa, RW_WRITER, FTAG); 1004 spa->spa_ubsync.ub_version = version; 1005 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1006 spa_config_exit(spa, FTAG); 1007 1008 if (error != 0) 1009 goto out; 1010 1011 ASSERT(spa->spa_root_vdev == rvd); 1012 ASSERT(spa_guid(spa) == pool_guid); 1013 1014 /* 1015 * Try to open all vdevs, loading each label in the process. 1016 */ 1017 error = vdev_open(rvd); 1018 if (error != 0) 1019 goto out; 1020 1021 /* 1022 * Validate the labels for all leaf vdevs. We need to grab the config 1023 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 1024 * flag. 1025 */ 1026 spa_config_enter(spa, RW_READER, FTAG); 1027 error = vdev_validate(rvd); 1028 spa_config_exit(spa, FTAG); 1029 1030 if (error != 0) 1031 goto out; 1032 1033 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1034 error = ENXIO; 1035 goto out; 1036 } 1037 1038 /* 1039 * Find the best uberblock. 1040 */ 1041 bzero(ub, sizeof (uberblock_t)); 1042 1043 zio = zio_root(spa, NULL, NULL, 1044 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1045 vdev_uberblock_load(zio, rvd, ub); 1046 error = zio_wait(zio); 1047 1048 /* 1049 * If we weren't able to find a single valid uberblock, return failure. 1050 */ 1051 if (ub->ub_txg == 0) { 1052 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1053 VDEV_AUX_CORRUPT_DATA); 1054 error = ENXIO; 1055 goto out; 1056 } 1057 1058 /* 1059 * If the pool is newer than the code, we can't open it. 1060 */ 1061 if (ub->ub_version > SPA_VERSION) { 1062 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1063 VDEV_AUX_VERSION_NEWER); 1064 error = ENOTSUP; 1065 goto out; 1066 } 1067 1068 /* 1069 * If the vdev guid sum doesn't match the uberblock, we have an 1070 * incomplete configuration. 1071 */ 1072 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1073 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1074 VDEV_AUX_BAD_GUID_SUM); 1075 error = ENXIO; 1076 goto out; 1077 } 1078 1079 /* 1080 * Initialize internal SPA structures. 
1081 */ 1082 spa->spa_state = POOL_STATE_ACTIVE; 1083 spa->spa_ubsync = spa->spa_uberblock; 1084 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1085 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1086 if (error) { 1087 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1088 VDEV_AUX_CORRUPT_DATA); 1089 goto out; 1090 } 1091 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1092 1093 if (zap_lookup(spa->spa_meta_objset, 1094 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1095 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1096 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1097 VDEV_AUX_CORRUPT_DATA); 1098 error = EIO; 1099 goto out; 1100 } 1101 1102 if (!mosconfig) { 1103 nvlist_t *newconfig; 1104 uint64_t hostid; 1105 1106 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 1107 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1108 VDEV_AUX_CORRUPT_DATA); 1109 error = EIO; 1110 goto out; 1111 } 1112 1113 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 1114 &hostid) == 0) { 1115 char *hostname; 1116 unsigned long myhostid = 0; 1117 1118 VERIFY(nvlist_lookup_string(newconfig, 1119 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1120 1121 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1122 if (hostid != 0 && myhostid != 0 && 1123 (unsigned long)hostid != myhostid) { 1124 cmn_err(CE_WARN, "pool '%s' could not be " 1125 "loaded as it was last accessed by " 1126 "another system (host: %s hostid: 0x%lx). " 1127 "See: http://www.sun.com/msg/ZFS-8000-EY", 1128 spa->spa_name, hostname, 1129 (unsigned long)hostid); 1130 error = EBADF; 1131 goto out; 1132 } 1133 } 1134 1135 spa_config_set(spa, newconfig); 1136 spa_unload(spa); 1137 spa_deactivate(spa); 1138 spa_activate(spa); 1139 1140 return (spa_load(spa, newconfig, state, B_TRUE)); 1141 } 1142 1143 if (zap_lookup(spa->spa_meta_objset, 1144 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1145 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1146 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1147 VDEV_AUX_CORRUPT_DATA); 1148 error = EIO; 1149 goto out; 1150 } 1151 1152 /* 1153 * Load the bit that tells us to use the new accounting function 1154 * (raid-z deflation). If we have an older pool, this will not 1155 * be present. 1156 */ 1157 error = zap_lookup(spa->spa_meta_objset, 1158 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1159 sizeof (uint64_t), 1, &spa->spa_deflate); 1160 if (error != 0 && error != ENOENT) { 1161 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1162 VDEV_AUX_CORRUPT_DATA); 1163 error = EIO; 1164 goto out; 1165 } 1166 1167 /* 1168 * Load the persistent error log. If we have an older pool, this will 1169 * not be present. 1170 */ 1171 error = zap_lookup(spa->spa_meta_objset, 1172 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1173 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1174 if (error != 0 && error != ENOENT) { 1175 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1176 VDEV_AUX_CORRUPT_DATA); 1177 error = EIO; 1178 goto out; 1179 } 1180 1181 error = zap_lookup(spa->spa_meta_objset, 1182 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1183 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1184 if (error != 0 && error != ENOENT) { 1185 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1186 VDEV_AUX_CORRUPT_DATA); 1187 error = EIO; 1188 goto out; 1189 } 1190 1191 /* 1192 * Load the history object. If we have an older pool, this 1193 * will not be present. 
1194 */ 1195 error = zap_lookup(spa->spa_meta_objset, 1196 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1197 sizeof (uint64_t), 1, &spa->spa_history); 1198 if (error != 0 && error != ENOENT) { 1199 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1200 VDEV_AUX_CORRUPT_DATA); 1201 error = EIO; 1202 goto out; 1203 } 1204 1205 /* 1206 * Load any hot spares for this pool. 1207 */ 1208 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1209 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1210 if (error != 0 && error != ENOENT) { 1211 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1212 VDEV_AUX_CORRUPT_DATA); 1213 error = EIO; 1214 goto out; 1215 } 1216 if (error == 0) { 1217 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1218 if (load_nvlist(spa, spa->spa_spares.sav_object, 1219 &spa->spa_spares.sav_config) != 0) { 1220 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1221 VDEV_AUX_CORRUPT_DATA); 1222 error = EIO; 1223 goto out; 1224 } 1225 1226 spa_config_enter(spa, RW_WRITER, FTAG); 1227 spa_load_spares(spa); 1228 spa_config_exit(spa, FTAG); 1229 } 1230 1231 /* 1232 * Load any level 2 ARC devices for this pool. 1233 */ 1234 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1235 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1236 &spa->spa_l2cache.sav_object); 1237 if (error != 0 && error != ENOENT) { 1238 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1239 VDEV_AUX_CORRUPT_DATA); 1240 error = EIO; 1241 goto out; 1242 } 1243 if (error == 0) { 1244 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1245 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1246 &spa->spa_l2cache.sav_config) != 0) { 1247 vdev_set_state(rvd, B_TRUE, 1248 VDEV_STATE_CANT_OPEN, 1249 VDEV_AUX_CORRUPT_DATA); 1250 error = EIO; 1251 goto out; 1252 } 1253 1254 spa_config_enter(spa, RW_WRITER, FTAG); 1255 spa_load_l2cache(spa); 1256 spa_config_exit(spa, FTAG); 1257 } 1258 1259 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1260 1261 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1262 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1263 1264 if (error && error != ENOENT) { 1265 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1266 VDEV_AUX_CORRUPT_DATA); 1267 error = EIO; 1268 goto out; 1269 } 1270 1271 if (error == 0) { 1272 (void) zap_lookup(spa->spa_meta_objset, 1273 spa->spa_pool_props_object, 1274 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1275 sizeof (uint64_t), 1, &spa->spa_bootfs); 1276 (void) zap_lookup(spa->spa_meta_objset, 1277 spa->spa_pool_props_object, 1278 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1279 sizeof (uint64_t), 1, &autoreplace); 1280 (void) zap_lookup(spa->spa_meta_objset, 1281 spa->spa_pool_props_object, 1282 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1283 sizeof (uint64_t), 1, &spa->spa_delegation); 1284 (void) zap_lookup(spa->spa_meta_objset, 1285 spa->spa_pool_props_object, 1286 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1287 sizeof (uint64_t), 1, &spa->spa_failmode); 1288 } 1289 1290 /* 1291 * If the 'autoreplace' property is set, then post a resource notifying 1292 * the ZFS DE that it should not issue any faults for unopenable 1293 * devices. We also iterate over the vdevs, and post a sysevent for any 1294 * unopenable vdevs so that the normal autoreplace handler can take 1295 * over. 1296 */ 1297 if (autoreplace && state != SPA_LOAD_TRYIMPORT) 1298 spa_check_removed(spa->spa_root_vdev); 1299 1300 /* 1301 * Load the vdev state for all toplevel vdevs. 
1302 */ 1303 vdev_load(rvd); 1304 1305 /* 1306 * Propagate the leaf DTLs we just loaded all the way up the tree. 1307 */ 1308 spa_config_enter(spa, RW_WRITER, FTAG); 1309 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1310 spa_config_exit(spa, FTAG); 1311 1312 /* 1313 * Check the state of the root vdev. If it can't be opened, it 1314 * indicates one or more toplevel vdevs are faulted. 1315 */ 1316 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1317 error = ENXIO; 1318 goto out; 1319 } 1320 1321 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 1322 dmu_tx_t *tx; 1323 int need_update = B_FALSE; 1324 int c; 1325 1326 /* 1327 * Claim log blocks that haven't been committed yet. 1328 * This must all happen in a single txg. 1329 */ 1330 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1331 spa_first_txg(spa)); 1332 (void) dmu_objset_find(spa->spa_name, 1333 zil_claim, tx, DS_FIND_CHILDREN); 1334 dmu_tx_commit(tx); 1335 1336 spa->spa_sync_on = B_TRUE; 1337 txg_sync_start(spa->spa_dsl_pool); 1338 1339 /* 1340 * Wait for all claims to sync. 1341 */ 1342 txg_wait_synced(spa->spa_dsl_pool, 0); 1343 1344 /* 1345 * If the config cache is stale, or we have uninitialized 1346 * metaslabs (see spa_vdev_add()), then update the config. 1347 */ 1348 if (config_cache_txg != spa->spa_config_txg || 1349 state == SPA_LOAD_IMPORT) 1350 need_update = B_TRUE; 1351 1352 for (c = 0; c < rvd->vdev_children; c++) 1353 if (rvd->vdev_child[c]->vdev_ms_array == 0) 1354 need_update = B_TRUE; 1355 1356 /* 1357 * Update the config cache asychronously in case we're the 1358 * root pool, in which case the config cache isn't writable yet. 1359 */ 1360 if (need_update) 1361 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1362 } 1363 1364 error = 0; 1365 out: 1366 if (error && error != EBADF) 1367 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 1368 spa->spa_load_state = SPA_LOAD_NONE; 1369 spa->spa_ena = 0; 1370 1371 return (error); 1372 } 1373 1374 /* 1375 * Pool Open/Import 1376 * 1377 * The import case is identical to an open except that the configuration is sent 1378 * down from userland, instead of grabbed from the configuration cache. For the 1379 * case of an open, the pool configuration will exist in the 1380 * POOL_STATE_UNINITIALIZED state. 1381 * 1382 * The stats information (gen/count/ustats) is used to gather vdev statistics at 1383 * the same time open the pool, without having to keep around the spa_t in some 1384 * ambiguous state. 1385 */ 1386 static int 1387 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1388 { 1389 spa_t *spa; 1390 int error; 1391 int loaded = B_FALSE; 1392 int locked = B_FALSE; 1393 1394 *spapp = NULL; 1395 1396 /* 1397 * As disgusting as this is, we need to support recursive calls to this 1398 * function because dsl_dir_open() is called during spa_load(), and ends 1399 * up calling spa_open() again. The real fix is to figure out how to 1400 * avoid dsl_dir_open() calling this in the first place. 
1401 */ 1402 if (mutex_owner(&spa_namespace_lock) != curthread) { 1403 mutex_enter(&spa_namespace_lock); 1404 locked = B_TRUE; 1405 } 1406 1407 if ((spa = spa_lookup(pool)) == NULL) { 1408 if (locked) 1409 mutex_exit(&spa_namespace_lock); 1410 return (ENOENT); 1411 } 1412 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1413 1414 spa_activate(spa); 1415 1416 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1417 1418 if (error == EBADF) { 1419 /* 1420 * If vdev_validate() returns failure (indicated by 1421 * EBADF), it indicates that one of the vdevs indicates 1422 * that the pool has been exported or destroyed. If 1423 * this is the case, the config cache is out of sync and 1424 * we should remove the pool from the namespace. 1425 */ 1426 spa_unload(spa); 1427 spa_deactivate(spa); 1428 spa_config_sync(spa, B_TRUE, B_TRUE); 1429 spa_remove(spa); 1430 if (locked) 1431 mutex_exit(&spa_namespace_lock); 1432 return (ENOENT); 1433 } 1434 1435 if (error) { 1436 /* 1437 * We can't open the pool, but we still have useful 1438 * information: the state of each vdev after the 1439 * attempted vdev_open(). Return this to the user. 1440 */ 1441 if (config != NULL && spa->spa_root_vdev != NULL) { 1442 spa_config_enter(spa, RW_READER, FTAG); 1443 *config = spa_config_generate(spa, NULL, -1ULL, 1444 B_TRUE); 1445 spa_config_exit(spa, FTAG); 1446 } 1447 spa_unload(spa); 1448 spa_deactivate(spa); 1449 spa->spa_last_open_failed = B_TRUE; 1450 if (locked) 1451 mutex_exit(&spa_namespace_lock); 1452 *spapp = NULL; 1453 return (error); 1454 } else { 1455 spa->spa_last_open_failed = B_FALSE; 1456 } 1457 1458 loaded = B_TRUE; 1459 } 1460 1461 spa_open_ref(spa, tag); 1462 1463 /* 1464 * If we just loaded the pool, resilver anything that's out of date. 1465 */ 1466 if (loaded && (spa_mode & FWRITE)) 1467 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1468 1469 if (locked) 1470 mutex_exit(&spa_namespace_lock); 1471 1472 *spapp = spa; 1473 1474 if (config != NULL) { 1475 spa_config_enter(spa, RW_READER, FTAG); 1476 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1477 spa_config_exit(spa, FTAG); 1478 } 1479 1480 return (0); 1481 } 1482 1483 int 1484 spa_open(const char *name, spa_t **spapp, void *tag) 1485 { 1486 return (spa_open_common(name, spapp, tag, NULL)); 1487 } 1488 1489 /* 1490 * Lookup the given spa_t, incrementing the inject count in the process, 1491 * preventing it from being exported or destroyed. 1492 */ 1493 spa_t * 1494 spa_inject_addref(char *name) 1495 { 1496 spa_t *spa; 1497 1498 mutex_enter(&spa_namespace_lock); 1499 if ((spa = spa_lookup(name)) == NULL) { 1500 mutex_exit(&spa_namespace_lock); 1501 return (NULL); 1502 } 1503 spa->spa_inject_ref++; 1504 mutex_exit(&spa_namespace_lock); 1505 1506 return (spa); 1507 } 1508 1509 void 1510 spa_inject_delref(spa_t *spa) 1511 { 1512 mutex_enter(&spa_namespace_lock); 1513 spa->spa_inject_ref--; 1514 mutex_exit(&spa_namespace_lock); 1515 } 1516 1517 /* 1518 * Add spares device information to the nvlist. 
1519 */ 1520 static void 1521 spa_add_spares(spa_t *spa, nvlist_t *config) 1522 { 1523 nvlist_t **spares; 1524 uint_t i, nspares; 1525 nvlist_t *nvroot; 1526 uint64_t guid; 1527 vdev_stat_t *vs; 1528 uint_t vsc; 1529 uint64_t pool; 1530 1531 if (spa->spa_spares.sav_count == 0) 1532 return; 1533 1534 VERIFY(nvlist_lookup_nvlist(config, 1535 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1536 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1537 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1538 if (nspares != 0) { 1539 VERIFY(nvlist_add_nvlist_array(nvroot, 1540 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1541 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1542 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1543 1544 /* 1545 * Go through and find any spares which have since been 1546 * repurposed as an active spare. If this is the case, update 1547 * their status appropriately. 1548 */ 1549 for (i = 0; i < nspares; i++) { 1550 VERIFY(nvlist_lookup_uint64(spares[i], 1551 ZPOOL_CONFIG_GUID, &guid) == 0); 1552 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 1553 VERIFY(nvlist_lookup_uint64_array( 1554 spares[i], ZPOOL_CONFIG_STATS, 1555 (uint64_t **)&vs, &vsc) == 0); 1556 vs->vs_state = VDEV_STATE_CANT_OPEN; 1557 vs->vs_aux = VDEV_AUX_SPARED; 1558 } 1559 } 1560 } 1561 } 1562 1563 /* 1564 * Add l2cache device information to the nvlist, including vdev stats. 1565 */ 1566 static void 1567 spa_add_l2cache(spa_t *spa, nvlist_t *config) 1568 { 1569 nvlist_t **l2cache; 1570 uint_t i, j, nl2cache; 1571 nvlist_t *nvroot; 1572 uint64_t guid; 1573 vdev_t *vd; 1574 vdev_stat_t *vs; 1575 uint_t vsc; 1576 1577 if (spa->spa_l2cache.sav_count == 0) 1578 return; 1579 1580 spa_config_enter(spa, RW_READER, FTAG); 1581 1582 VERIFY(nvlist_lookup_nvlist(config, 1583 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1584 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1585 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1586 if (nl2cache != 0) { 1587 VERIFY(nvlist_add_nvlist_array(nvroot, 1588 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1589 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1590 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1591 1592 /* 1593 * Update level 2 cache device stats. 1594 */ 1595 1596 for (i = 0; i < nl2cache; i++) { 1597 VERIFY(nvlist_lookup_uint64(l2cache[i], 1598 ZPOOL_CONFIG_GUID, &guid) == 0); 1599 1600 vd = NULL; 1601 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1602 if (guid == 1603 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1604 vd = spa->spa_l2cache.sav_vdevs[j]; 1605 break; 1606 } 1607 } 1608 ASSERT(vd != NULL); 1609 1610 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1611 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1612 vdev_get_stats(vd, vs); 1613 } 1614 } 1615 1616 spa_config_exit(spa, FTAG); 1617 } 1618 1619 int 1620 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1621 { 1622 int error; 1623 spa_t *spa; 1624 1625 *config = NULL; 1626 error = spa_open_common(name, &spa, FTAG, config); 1627 1628 if (spa && *config != NULL) { 1629 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1630 spa_get_errlog_size(spa)) == 0); 1631 1632 spa_add_spares(spa, *config); 1633 spa_add_l2cache(spa, *config); 1634 } 1635 1636 /* 1637 * We want to get the alternate root even for faulted pools, so we cheat 1638 * and call spa_lookup() directly. 
1639 */ 1640 if (altroot) { 1641 if (spa == NULL) { 1642 mutex_enter(&spa_namespace_lock); 1643 spa = spa_lookup(name); 1644 if (spa) 1645 spa_altroot(spa, altroot, buflen); 1646 else 1647 altroot[0] = '\0'; 1648 spa = NULL; 1649 mutex_exit(&spa_namespace_lock); 1650 } else { 1651 spa_altroot(spa, altroot, buflen); 1652 } 1653 } 1654 1655 if (spa != NULL) 1656 spa_close(spa, FTAG); 1657 1658 return (error); 1659 } 1660 1661 /* 1662 * Validate that the auxiliary device array is well formed. We must have an 1663 * array of nvlists, each which describes a valid leaf vdev. If this is an 1664 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 1665 * specified, as long as they are well-formed. 1666 */ 1667 static int 1668 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 1669 spa_aux_vdev_t *sav, const char *config, uint64_t version, 1670 vdev_labeltype_t label) 1671 { 1672 nvlist_t **dev; 1673 uint_t i, ndev; 1674 vdev_t *vd; 1675 int error; 1676 1677 /* 1678 * It's acceptable to have no devs specified. 1679 */ 1680 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 1681 return (0); 1682 1683 if (ndev == 0) 1684 return (EINVAL); 1685 1686 /* 1687 * Make sure the pool is formatted with a version that supports this 1688 * device type. 1689 */ 1690 if (spa_version(spa) < version) 1691 return (ENOTSUP); 1692 1693 /* 1694 * Set the pending device list so we correctly handle device in-use 1695 * checking. 1696 */ 1697 sav->sav_pending = dev; 1698 sav->sav_npending = ndev; 1699 1700 for (i = 0; i < ndev; i++) { 1701 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 1702 mode)) != 0) 1703 goto out; 1704 1705 if (!vd->vdev_ops->vdev_op_leaf) { 1706 vdev_free(vd); 1707 error = EINVAL; 1708 goto out; 1709 } 1710 1711 /* 1712 * The L2ARC currently only supports disk devices. 1713 */ 1714 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 1715 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 1716 error = ENOTBLK; 1717 goto out; 1718 } 1719 1720 vd->vdev_top = vd; 1721 1722 if ((error = vdev_open(vd)) == 0 && 1723 (error = vdev_label_init(vd, crtxg, label)) == 0) { 1724 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 1725 vd->vdev_guid) == 0); 1726 } 1727 1728 vdev_free(vd); 1729 1730 if (error && 1731 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 1732 goto out; 1733 else 1734 error = 0; 1735 } 1736 1737 out: 1738 sav->sav_pending = NULL; 1739 sav->sav_npending = 0; 1740 return (error); 1741 } 1742 1743 static int 1744 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1745 { 1746 int error; 1747 1748 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 1749 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 1750 VDEV_LABEL_SPARE)) != 0) { 1751 return (error); 1752 } 1753 1754 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 1755 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 1756 VDEV_LABEL_L2CACHE)); 1757 } 1758 1759 static void 1760 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 1761 const char *config) 1762 { 1763 int i; 1764 1765 if (sav->sav_config != NULL) { 1766 nvlist_t **olddevs; 1767 uint_t oldndevs; 1768 nvlist_t **newdevs; 1769 1770 /* 1771 * Generate new dev list by concatentating with the 1772 * current dev list. 
1773 */ 1774 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 1775 &olddevs, &oldndevs) == 0); 1776 1777 newdevs = kmem_alloc(sizeof (void *) * 1778 (ndevs + oldndevs), KM_SLEEP); 1779 for (i = 0; i < oldndevs; i++) 1780 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 1781 KM_SLEEP) == 0); 1782 for (i = 0; i < ndevs; i++) 1783 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 1784 KM_SLEEP) == 0); 1785 1786 VERIFY(nvlist_remove(sav->sav_config, config, 1787 DATA_TYPE_NVLIST_ARRAY) == 0); 1788 1789 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1790 config, newdevs, ndevs + oldndevs) == 0); 1791 for (i = 0; i < oldndevs + ndevs; i++) 1792 nvlist_free(newdevs[i]); 1793 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 1794 } else { 1795 /* 1796 * Generate a new dev list. 1797 */ 1798 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 1799 KM_SLEEP) == 0); 1800 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 1801 devs, ndevs) == 0); 1802 } 1803 } 1804 1805 /* 1806 * Stop and drop level 2 ARC devices 1807 */ 1808 void 1809 spa_l2cache_drop(spa_t *spa) 1810 { 1811 vdev_t *vd; 1812 int i; 1813 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1814 1815 for (i = 0; i < sav->sav_count; i++) { 1816 uint64_t pool; 1817 1818 vd = sav->sav_vdevs[i]; 1819 ASSERT(vd != NULL); 1820 1821 if (spa_mode & FWRITE && 1822 spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && 1823 l2arc_vdev_present(vd)) { 1824 l2arc_remove_vdev(vd); 1825 } 1826 if (vd->vdev_isl2cache) 1827 spa_l2cache_remove(vd); 1828 vdev_clear_stats(vd); 1829 (void) vdev_close(vd); 1830 } 1831 } 1832 1833 /* 1834 * Pool Creation 1835 */ 1836 int 1837 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 1838 const char *history_str) 1839 { 1840 spa_t *spa; 1841 char *altroot = NULL; 1842 vdev_t *rvd; 1843 dsl_pool_t *dp; 1844 dmu_tx_t *tx; 1845 int c, error = 0; 1846 uint64_t txg = TXG_INITIAL; 1847 nvlist_t **spares, **l2cache; 1848 uint_t nspares, nl2cache; 1849 uint64_t version; 1850 1851 /* 1852 * If this pool already exists, return failure. 1853 */ 1854 mutex_enter(&spa_namespace_lock); 1855 if (spa_lookup(pool) != NULL) { 1856 mutex_exit(&spa_namespace_lock); 1857 return (EEXIST); 1858 } 1859 1860 /* 1861 * Allocate a new spa_t structure. 1862 */ 1863 (void) nvlist_lookup_string(props, 1864 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 1865 spa = spa_add(pool, altroot); 1866 spa_activate(spa); 1867 1868 spa->spa_uberblock.ub_txg = txg - 1; 1869 1870 if (props && (error = spa_prop_validate(spa, props))) { 1871 spa_unload(spa); 1872 spa_deactivate(spa); 1873 spa_remove(spa); 1874 mutex_exit(&spa_namespace_lock); 1875 return (error); 1876 } 1877 1878 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 1879 &version) != 0) 1880 version = SPA_VERSION; 1881 ASSERT(version <= SPA_VERSION); 1882 spa->spa_uberblock.ub_version = version; 1883 spa->spa_ubsync = spa->spa_uberblock; 1884 1885 /* 1886 * Create the root vdev. 
1887 */ 1888 spa_config_enter(spa, RW_WRITER, FTAG); 1889 1890 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1891 1892 ASSERT(error != 0 || rvd != NULL); 1893 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1894 1895 if (error == 0 && !zfs_allocatable_devs(nvroot)) 1896 error = EINVAL; 1897 1898 if (error == 0 && 1899 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1900 (error = spa_validate_aux(spa, nvroot, txg, 1901 VDEV_ALLOC_ADD)) == 0) { 1902 for (c = 0; c < rvd->vdev_children; c++) 1903 vdev_init(rvd->vdev_child[c], txg); 1904 vdev_config_dirty(rvd); 1905 } 1906 1907 spa_config_exit(spa, FTAG); 1908 1909 if (error != 0) { 1910 spa_unload(spa); 1911 spa_deactivate(spa); 1912 spa_remove(spa); 1913 mutex_exit(&spa_namespace_lock); 1914 return (error); 1915 } 1916 1917 /* 1918 * Get the list of spares, if specified. 1919 */ 1920 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1921 &spares, &nspares) == 0) { 1922 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 1923 KM_SLEEP) == 0); 1924 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1925 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1926 spa_config_enter(spa, RW_WRITER, FTAG); 1927 spa_load_spares(spa); 1928 spa_config_exit(spa, FTAG); 1929 spa->spa_spares.sav_sync = B_TRUE; 1930 } 1931 1932 /* 1933 * Get the list of level 2 cache devices, if specified. 1934 */ 1935 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1936 &l2cache, &nl2cache) == 0) { 1937 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 1938 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1939 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 1940 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1941 spa_config_enter(spa, RW_WRITER, FTAG); 1942 spa_load_l2cache(spa); 1943 spa_config_exit(spa, FTAG); 1944 spa->spa_l2cache.sav_sync = B_TRUE; 1945 } 1946 1947 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1948 spa->spa_meta_objset = dp->dp_meta_objset; 1949 1950 tx = dmu_tx_create_assigned(dp, txg); 1951 1952 /* 1953 * Create the pool config object. 1954 */ 1955 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1956 DMU_OT_PACKED_NVLIST, 1 << 14, 1957 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1958 1959 if (zap_add(spa->spa_meta_objset, 1960 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1961 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1962 cmn_err(CE_PANIC, "failed to add pool config"); 1963 } 1964 1965 /* Newly created pools with the right version are always deflated. */ 1966 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 1967 spa->spa_deflate = TRUE; 1968 if (zap_add(spa->spa_meta_objset, 1969 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1970 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1971 cmn_err(CE_PANIC, "failed to add deflate"); 1972 } 1973 } 1974 1975 /* 1976 * Create the deferred-free bplist object. Turn off compression 1977 * because sync-to-convergence takes longer if the blocksize 1978 * keeps changing. 1979 */ 1980 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1981 1 << 14, tx); 1982 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1983 ZIO_COMPRESS_OFF, tx); 1984 1985 if (zap_add(spa->spa_meta_objset, 1986 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1987 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1988 cmn_err(CE_PANIC, "failed to add bplist"); 1989 } 1990 1991 /* 1992 * Create the pool's history object. 
1993 */ 1994 if (version >= SPA_VERSION_ZPOOL_HISTORY) 1995 spa_history_create_obj(spa, tx); 1996 1997 /* 1998 * Set pool properties. 1999 */ 2000 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2001 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2002 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2003 if (props) 2004 spa_sync_props(spa, props, CRED(), tx); 2005 2006 dmu_tx_commit(tx); 2007 2008 spa->spa_sync_on = B_TRUE; 2009 txg_sync_start(spa->spa_dsl_pool); 2010 2011 /* 2012 * We explicitly wait for the first transaction to complete so that our 2013 * bean counters are appropriately updated. 2014 */ 2015 txg_wait_synced(spa->spa_dsl_pool, txg); 2016 2017 spa_config_sync(spa, B_FALSE, B_TRUE); 2018 2019 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2020 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2021 2022 mutex_exit(&spa_namespace_lock); 2023 2024 return (0); 2025 } 2026 2027 /* 2028 * Import the given pool into the system. We set up the necessary spa_t and 2029 * then call spa_load() to do the dirty work. 2030 */ 2031 static int 2032 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, 2033 boolean_t isroot, boolean_t allowfaulted) 2034 { 2035 spa_t *spa; 2036 char *altroot = NULL; 2037 int error, loaderr; 2038 nvlist_t *nvroot; 2039 nvlist_t **spares, **l2cache; 2040 uint_t nspares, nl2cache; 2041 int mosconfig = isroot? B_FALSE : B_TRUE; 2042 2043 /* 2044 * If a pool with this name exists, return failure. 2045 */ 2046 mutex_enter(&spa_namespace_lock); 2047 if (spa_lookup(pool) != NULL) { 2048 mutex_exit(&spa_namespace_lock); 2049 return (EEXIST); 2050 } 2051 2052 /* 2053 * Create and initialize the spa structure. 2054 */ 2055 (void) nvlist_lookup_string(props, 2056 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2057 spa = spa_add(pool, altroot); 2058 spa_activate(spa); 2059 2060 if (allowfaulted) 2061 spa->spa_import_faulted = B_TRUE; 2062 spa->spa_is_root = isroot; 2063 2064 /* 2065 * Pass off the heavy lifting to spa_load(). 2066 * Pass TRUE for mosconfig because the user-supplied config 2067 * is actually the one to trust when doing an import. 2068 */ 2069 loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, mosconfig); 2070 2071 spa_config_enter(spa, RW_WRITER, FTAG); 2072 /* 2073 * Toss any existing sparelist, as it doesn't have any validity anymore, 2074 * and conflicts with spa_has_spare(). 2075 */ 2076 if (!isroot && spa->spa_spares.sav_config) { 2077 nvlist_free(spa->spa_spares.sav_config); 2078 spa->spa_spares.sav_config = NULL; 2079 spa_load_spares(spa); 2080 } 2081 if (!isroot && spa->spa_l2cache.sav_config) { 2082 nvlist_free(spa->spa_l2cache.sav_config); 2083 spa->spa_l2cache.sav_config = NULL; 2084 spa_load_l2cache(spa); 2085 } 2086 2087 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2088 &nvroot) == 0); 2089 if (error == 0) 2090 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); 2091 if (error == 0) 2092 error = spa_validate_aux(spa, nvroot, -1ULL, 2093 VDEV_ALLOC_L2CACHE); 2094 spa_config_exit(spa, FTAG); 2095 2096 if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 2097 if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { 2098 /* 2099 * If we failed to load the pool, but 'allowfaulted' is 2100 * set, then manually set the config as if the config 2101 * passed in was specified in the cache file. 
2102 */ 2103 error = 0; 2104 spa->spa_import_faulted = B_FALSE; 2105 if (spa->spa_config == NULL) { 2106 spa_config_enter(spa, RW_READER, FTAG); 2107 spa->spa_config = spa_config_generate(spa, 2108 NULL, -1ULL, B_TRUE); 2109 spa_config_exit(spa, FTAG); 2110 } 2111 spa_unload(spa); 2112 spa_deactivate(spa); 2113 spa_config_sync(spa, B_FALSE, B_TRUE); 2114 } else { 2115 spa_unload(spa); 2116 spa_deactivate(spa); 2117 spa_remove(spa); 2118 } 2119 mutex_exit(&spa_namespace_lock); 2120 return (error); 2121 } 2122 2123 /* 2124 * Override any spares and level 2 cache devices as specified by 2125 * the user, as these may have correct device names/devids, etc. 2126 */ 2127 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2128 &spares, &nspares) == 0) { 2129 if (spa->spa_spares.sav_config) 2130 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2131 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2132 else 2133 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2134 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2135 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2136 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2137 spa_config_enter(spa, RW_WRITER, FTAG); 2138 spa_load_spares(spa); 2139 spa_config_exit(spa, FTAG); 2140 spa->spa_spares.sav_sync = B_TRUE; 2141 } 2142 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2143 &l2cache, &nl2cache) == 0) { 2144 if (spa->spa_l2cache.sav_config) 2145 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2146 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2147 else 2148 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2149 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2150 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2151 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2152 spa_config_enter(spa, RW_WRITER, FTAG); 2153 spa_load_l2cache(spa); 2154 spa_config_exit(spa, FTAG); 2155 spa->spa_l2cache.sav_sync = B_TRUE; 2156 } 2157 2158 if (spa_mode & FWRITE) { 2159 /* 2160 * Update the config cache to include the newly-imported pool. 2161 */ 2162 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); 2163 2164 /* 2165 * Resilver anything that's out of date. 2166 */ 2167 if (!isroot) 2168 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, 2169 B_TRUE) == 0); 2170 } 2171 2172 spa->spa_import_faulted = B_FALSE; 2173 mutex_exit(&spa_namespace_lock); 2174 2175 return (0); 2176 } 2177 2178 #ifdef _KERNEL 2179 /* 2180 * Build a "root" vdev for a top level vdev read in from a rootpool 2181 * device label. 2182 */ 2183 static void 2184 spa_build_rootpool_config(nvlist_t *config) 2185 { 2186 nvlist_t *nvtop, *nvroot; 2187 uint64_t pgid; 2188 2189 /* 2190 * Add this top-level vdev to the child array. 2191 */ 2192 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2193 == 0); 2194 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2195 == 0); 2196 2197 /* 2198 * Put this pool's top-level vdevs into a root vdev. 2199 */ 2200 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2201 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2202 == 0); 2203 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2204 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2205 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2206 &nvtop, 1) == 0); 2207 2208 /* 2209 * Replace the existing vdev_tree with the new root vdev in 2210 * this pool's configuration (remove the old, add the new). 
 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
}

/*
 * Get the root pool information from the root disk, then import the root pool
 * at system boot time.
 */
extern nvlist_t *vdev_disk_read_rootlabel(char *);

void
spa_check_rootconf(char *devpath, char **bestdev, nvlist_t **bestconf,
    uint64_t *besttxg)
{
	nvlist_t *config;
	uint64_t txg;

	if ((config = vdev_disk_read_rootlabel(devpath)) == NULL)
		return;

	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (txg > *besttxg) {
		*besttxg = txg;
		if (*bestconf != NULL)
			nvlist_free(*bestconf);
		*bestconf = config;
		*bestdev = devpath;
	}
}

boolean_t
spa_rootdev_validate(nvlist_t *nv)
{
	uint64_t ival;

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &ival) == 0 ||
	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Import a root pool.
 *
 * For x86, devpath_list consists of the physpath name of the vdev in a
 * single-disk root pool, or a list of physpath names for the vdevs in a
 * mirrored root pool, e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a /pci@1f,0/ide@d/disk@2,0:a"
 *
 * For SPARC, devpath_list consists of the physpath name of the booting device,
 * regardless of whether the root pool is a single-device pool or a mirrored
 * pool, e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
int
spa_import_rootpool(char *devpath_list)
{
	nvlist_t *conf = NULL;
	char *dev = NULL;
	char *pname;
	int error;

	/*
	 * Get the vdev pathname and configuration from the most
	 * recently updated vdev (highest txg).
	 */
	if (error = spa_get_rootconf(devpath_list, &dev, &conf))
		goto msg_out;

	/*
	 * Add type "root" vdev to the config.
	 */
	spa_build_rootpool_config(conf);

	VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);

	/*
	 * We specify 'allowfaulted' for this to be treated like spa_open()
	 * instead of spa_import().  This prevents us from marking vdevs as
	 * persistently unavailable, and generates FMA ereports as if it were a
	 * pool open, not import.
	 */
	error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE);
	if (error == EEXIST)
		error = 0;

	nvlist_free(conf);
	return (error);

msg_out:
	cmn_err(CE_NOTE, "\n\n"
	    "  ***************************************************  \n"
	    "  *  This device is not bootable!                   *  \n"
	    "  *  It is either offlined or detached or faulted.  *  \n"
	    "  *  Please try to boot from a different device.    *  \n"
	    "  ***************************************************  \n\n");

	return (error);
}
#endif
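
/*
 * Illustrative sketch, not part of the original source: one way a boot-time
 * caller could use spa_check_rootconf() to pick the candidate device whose
 * label carries the highest txg, then sanity-check it with
 * spa_rootdev_validate().  The candidate-list handling is hypothetical, and
 * the block is kept out of the build with #if 0.
 */
#if 0
static int
example_pick_root_device(char **candidates, int ncandidates,
    char **bestdev, nvlist_t **bestconf)
{
	nvlist_t *nvtop;
	uint64_t besttxg = 0;
	int i;

	*bestdev = NULL;
	*bestconf = NULL;

	/* Keep whichever label reports the highest pool txg. */
	for (i = 0; i < ncandidates; i++)
		spa_check_rootconf(candidates[i], bestdev, bestconf, &besttxg);

	if (*bestconf == NULL)
		return (ENOENT);

	/* Reject devices that are offlined, faulted, degraded, or removed. */
	VERIFY(nvlist_lookup_nvlist(*bestconf, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	if (!spa_rootdev_validate(nvtop)) {
		nvlist_free(*bestconf);
		*bestconf = NULL;
		return (EINVAL);
	}

	return (0);
}
#endif	/* illustrative only */

/*
 * Import a non-root pool into the system.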
2319 */ 2320 int 2321 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2322 { 2323 return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); 2324 } 2325 2326 int 2327 spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) 2328 { 2329 return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); 2330 } 2331 2332 2333 /* 2334 * This (illegal) pool name is used when temporarily importing a spa_t in order 2335 * to get the vdev stats associated with the imported devices. 2336 */ 2337 #define TRYIMPORT_NAME "$import" 2338 2339 nvlist_t * 2340 spa_tryimport(nvlist_t *tryconfig) 2341 { 2342 nvlist_t *config = NULL; 2343 char *poolname; 2344 spa_t *spa; 2345 uint64_t state; 2346 2347 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2348 return (NULL); 2349 2350 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2351 return (NULL); 2352 2353 /* 2354 * Create and initialize the spa structure. 2355 */ 2356 mutex_enter(&spa_namespace_lock); 2357 spa = spa_add(TRYIMPORT_NAME, NULL); 2358 spa_activate(spa); 2359 2360 /* 2361 * Pass off the heavy lifting to spa_load(). 2362 * Pass TRUE for mosconfig because the user-supplied config 2363 * is actually the one to trust when doing an import. 2364 */ 2365 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2366 2367 /* 2368 * If 'tryconfig' was at least parsable, return the current config. 2369 */ 2370 if (spa->spa_root_vdev != NULL) { 2371 spa_config_enter(spa, RW_READER, FTAG); 2372 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2373 spa_config_exit(spa, FTAG); 2374 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2375 poolname) == 0); 2376 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2377 state) == 0); 2378 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2379 spa->spa_uberblock.ub_timestamp) == 0); 2380 2381 /* 2382 * If the bootfs property exists on this pool then we 2383 * copy it out so that external consumers can tell which 2384 * pools are bootable. 2385 */ 2386 if (spa->spa_bootfs) { 2387 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2388 2389 /* 2390 * We have to play games with the name since the 2391 * pool was opened as TRYIMPORT_NAME. 2392 */ 2393 if (dsl_dsobj_to_dsname(spa->spa_name, 2394 spa->spa_bootfs, tmpname) == 0) { 2395 char *cp; 2396 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2397 2398 cp = strchr(tmpname, '/'); 2399 if (cp == NULL) { 2400 (void) strlcpy(dsname, tmpname, 2401 MAXPATHLEN); 2402 } else { 2403 (void) snprintf(dsname, MAXPATHLEN, 2404 "%s/%s", poolname, ++cp); 2405 } 2406 VERIFY(nvlist_add_string(config, 2407 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2408 kmem_free(dsname, MAXPATHLEN); 2409 } 2410 kmem_free(tmpname, MAXPATHLEN); 2411 } 2412 2413 /* 2414 * Add the list of hot spares and level 2 cache devices. 2415 */ 2416 spa_add_spares(spa, config); 2417 spa_add_l2cache(spa, config); 2418 } 2419 2420 spa_unload(spa); 2421 spa_deactivate(spa); 2422 spa_remove(spa); 2423 mutex_exit(&spa_namespace_lock); 2424 2425 return (config); 2426 } 2427 2428 /* 2429 * Pool export/destroy 2430 * 2431 * The act of destroying or exporting a pool is very simple. We make sure there 2432 * is no more pending I/O and any references to the pool are gone. Then, we 2433 * update the pool state and sync all the labels to disk, removing the 2434 * configuration from the cache afterwards. 
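 *
 * If the pool still has active references (or a fault-injection reference
 * and we are not just resetting it), the operation fails with EBUSY and the
 * pool is left untouched.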
2435 */ 2436 static int 2437 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 2438 { 2439 spa_t *spa; 2440 2441 if (oldconfig) 2442 *oldconfig = NULL; 2443 2444 if (!(spa_mode & FWRITE)) 2445 return (EROFS); 2446 2447 mutex_enter(&spa_namespace_lock); 2448 if ((spa = spa_lookup(pool)) == NULL) { 2449 mutex_exit(&spa_namespace_lock); 2450 return (ENOENT); 2451 } 2452 2453 /* 2454 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2455 * reacquire the namespace lock, and see if we can export. 2456 */ 2457 spa_open_ref(spa, FTAG); 2458 mutex_exit(&spa_namespace_lock); 2459 spa_async_suspend(spa); 2460 mutex_enter(&spa_namespace_lock); 2461 spa_close(spa, FTAG); 2462 2463 /* 2464 * The pool will be in core if it's openable, 2465 * in which case we can modify its state. 2466 */ 2467 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2468 /* 2469 * Objsets may be open only because they're dirty, so we 2470 * have to force it to sync before checking spa_refcnt. 2471 */ 2472 spa_scrub_suspend(spa); 2473 txg_wait_synced(spa->spa_dsl_pool, 0); 2474 2475 /* 2476 * A pool cannot be exported or destroyed if there are active 2477 * references. If we are resetting a pool, allow references by 2478 * fault injection handlers. 2479 */ 2480 if (!spa_refcount_zero(spa) || 2481 (spa->spa_inject_ref != 0 && 2482 new_state != POOL_STATE_UNINITIALIZED)) { 2483 spa_scrub_resume(spa); 2484 spa_async_resume(spa); 2485 mutex_exit(&spa_namespace_lock); 2486 return (EBUSY); 2487 } 2488 2489 spa_scrub_resume(spa); 2490 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2491 2492 /* 2493 * We want this to be reflected on every label, 2494 * so mark them all dirty. spa_unload() will do the 2495 * final sync that pushes these changes out. 2496 */ 2497 if (new_state != POOL_STATE_UNINITIALIZED) { 2498 spa_config_enter(spa, RW_WRITER, FTAG); 2499 spa->spa_state = new_state; 2500 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2501 vdev_config_dirty(spa->spa_root_vdev); 2502 spa_config_exit(spa, FTAG); 2503 } 2504 } 2505 2506 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2507 2508 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2509 spa_unload(spa); 2510 spa_deactivate(spa); 2511 } 2512 2513 if (oldconfig && spa->spa_config) 2514 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2515 2516 if (new_state != POOL_STATE_UNINITIALIZED) { 2517 spa_config_sync(spa, B_TRUE, B_TRUE); 2518 spa_remove(spa); 2519 } 2520 mutex_exit(&spa_namespace_lock); 2521 2522 return (0); 2523 } 2524 2525 /* 2526 * Destroy a storage pool. 2527 */ 2528 int 2529 spa_destroy(char *pool) 2530 { 2531 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 2532 } 2533 2534 /* 2535 * Export a storage pool. 2536 */ 2537 int 2538 spa_export(char *pool, nvlist_t **oldconfig) 2539 { 2540 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 2541 } 2542 2543 /* 2544 * Similar to spa_export(), this unloads the spa_t without actually removing it 2545 * from the namespace in any way. 2546 */ 2547 int 2548 spa_reset(char *pool) 2549 { 2550 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 2551 } 2552 2553 2554 /* 2555 * ========================================================================== 2556 * Device manipulation 2557 * ========================================================================== 2558 */ 2559 2560 /* 2561 * Add a device to a storage pool. 
2562 */ 2563 int 2564 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2565 { 2566 uint64_t txg; 2567 int c, error; 2568 vdev_t *rvd = spa->spa_root_vdev; 2569 vdev_t *vd, *tvd; 2570 nvlist_t **spares, **l2cache; 2571 uint_t nspares, nl2cache; 2572 2573 txg = spa_vdev_enter(spa); 2574 2575 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2576 VDEV_ALLOC_ADD)) != 0) 2577 return (spa_vdev_exit(spa, NULL, txg, error)); 2578 2579 spa->spa_pending_vdev = vd; 2580 2581 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2582 &nspares) != 0) 2583 nspares = 0; 2584 2585 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2586 &nl2cache) != 0) 2587 nl2cache = 0; 2588 2589 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) { 2590 spa->spa_pending_vdev = NULL; 2591 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2592 } 2593 2594 if (vd->vdev_children != 0) { 2595 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 2596 spa->spa_pending_vdev = NULL; 2597 return (spa_vdev_exit(spa, vd, txg, error)); 2598 } 2599 } 2600 2601 /* 2602 * We must validate the spares and l2cache devices after checking the 2603 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2604 */ 2605 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) { 2606 spa->spa_pending_vdev = NULL; 2607 return (spa_vdev_exit(spa, vd, txg, error)); 2608 } 2609 2610 spa->spa_pending_vdev = NULL; 2611 2612 /* 2613 * Transfer each new top-level vdev from vd to rvd. 2614 */ 2615 for (c = 0; c < vd->vdev_children; c++) { 2616 tvd = vd->vdev_child[c]; 2617 vdev_remove_child(vd, tvd); 2618 tvd->vdev_id = rvd->vdev_children; 2619 vdev_add_child(rvd, tvd); 2620 vdev_config_dirty(tvd); 2621 } 2622 2623 if (nspares != 0) { 2624 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2625 ZPOOL_CONFIG_SPARES); 2626 spa_load_spares(spa); 2627 spa->spa_spares.sav_sync = B_TRUE; 2628 } 2629 2630 if (nl2cache != 0) { 2631 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2632 ZPOOL_CONFIG_L2CACHE); 2633 spa_load_l2cache(spa); 2634 spa->spa_l2cache.sav_sync = B_TRUE; 2635 } 2636 2637 /* 2638 * We have to be careful when adding new vdevs to an existing pool. 2639 * If other threads start allocating from these vdevs before we 2640 * sync the config cache, and we lose power, then upon reboot we may 2641 * fail to open the pool because there are DVAs that the config cache 2642 * can't translate. Therefore, we first add the vdevs without 2643 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2644 * and then let spa_config_update() initialize the new metaslabs. 2645 * 2646 * spa_load() checks for added-but-not-initialized vdevs, so that 2647 * if we lose power at any point in this sequence, the remaining 2648 * steps will be completed the next time we load the pool. 2649 */ 2650 (void) spa_vdev_exit(spa, vd, txg, 0); 2651 2652 mutex_enter(&spa_namespace_lock); 2653 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2654 mutex_exit(&spa_namespace_lock); 2655 2656 return (0); 2657 } 2658 2659 /* 2660 * Attach a device to a mirror. The arguments are the path to any device 2661 * in the mirror, and the nvroot for the new device. If the path specifies 2662 * a device that is not mirrored, we automatically insert the mirror vdev. 
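 * (That is, the existing device and the new device become a two-way mirror.)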
2663 * 2664 * If 'replacing' is specified, the new device is intended to replace the 2665 * existing device; in this case the two devices are made into their own 2666 * mirror using the 'replacing' vdev, which is functionally identical to 2667 * the mirror vdev (it actually reuses all the same ops) but has a few 2668 * extra rules: you can't attach to it after it's been created, and upon 2669 * completion of resilvering, the first disk (the one being replaced) 2670 * is automatically detached. 2671 */ 2672 int 2673 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2674 { 2675 uint64_t txg, open_txg; 2676 int error; 2677 vdev_t *rvd = spa->spa_root_vdev; 2678 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2679 vdev_ops_t *pvops; 2680 int is_log; 2681 2682 txg = spa_vdev_enter(spa); 2683 2684 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2685 2686 if (oldvd == NULL) 2687 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2688 2689 if (!oldvd->vdev_ops->vdev_op_leaf) 2690 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2691 2692 pvd = oldvd->vdev_parent; 2693 2694 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2695 VDEV_ALLOC_ADD)) != 0) 2696 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2697 2698 if (newrootvd->vdev_children != 1) 2699 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2700 2701 newvd = newrootvd->vdev_child[0]; 2702 2703 if (!newvd->vdev_ops->vdev_op_leaf) 2704 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2705 2706 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2707 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2708 2709 /* 2710 * Spares can't replace logs 2711 */ 2712 is_log = oldvd->vdev_islog; 2713 if (is_log && newvd->vdev_isspare) 2714 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2715 2716 if (!replacing) { 2717 /* 2718 * For attach, the only allowable parent is a mirror or the root 2719 * vdev. 2720 */ 2721 if (pvd->vdev_ops != &vdev_mirror_ops && 2722 pvd->vdev_ops != &vdev_root_ops) 2723 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2724 2725 pvops = &vdev_mirror_ops; 2726 } else { 2727 /* 2728 * Active hot spares can only be replaced by inactive hot 2729 * spares. 2730 */ 2731 if (pvd->vdev_ops == &vdev_spare_ops && 2732 pvd->vdev_child[1] == oldvd && 2733 !spa_has_spare(spa, newvd->vdev_guid)) 2734 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2735 2736 /* 2737 * If the source is a hot spare, and the parent isn't already a 2738 * spare, then we want to create a new hot spare. Otherwise, we 2739 * want to create a replacing vdev. The user is not allowed to 2740 * attach to a spared vdev child unless the 'isspare' state is 2741 * the same (spare replaces spare, non-spare replaces 2742 * non-spare). 2743 */ 2744 if (pvd->vdev_ops == &vdev_replacing_ops) 2745 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2746 else if (pvd->vdev_ops == &vdev_spare_ops && 2747 newvd->vdev_isspare != oldvd->vdev_isspare) 2748 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2749 else if (pvd->vdev_ops != &vdev_spare_ops && 2750 newvd->vdev_isspare) 2751 pvops = &vdev_spare_ops; 2752 else 2753 pvops = &vdev_replacing_ops; 2754 } 2755 2756 /* 2757 * Compare the new device size with the replaceable/attachable 2758 * device size. 2759 */ 2760 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2761 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2762 2763 /* 2764 * The new device cannot have a higher alignment requirement 2765 * than the top-level vdev. 
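	 * Allocations in this top-level vdev are already aligned to its
	 * ashift, so a child that needs a larger alignment could not service
	 * them (hence EDOM below).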
2766 */ 2767 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2768 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2769 2770 /* 2771 * If this is an in-place replacement, update oldvd's path and devid 2772 * to make it distinguishable from newvd, and unopenable from now on. 2773 */ 2774 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2775 spa_strfree(oldvd->vdev_path); 2776 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2777 KM_SLEEP); 2778 (void) sprintf(oldvd->vdev_path, "%s/%s", 2779 newvd->vdev_path, "old"); 2780 if (oldvd->vdev_devid != NULL) { 2781 spa_strfree(oldvd->vdev_devid); 2782 oldvd->vdev_devid = NULL; 2783 } 2784 } 2785 2786 /* 2787 * If the parent is not a mirror, or if we're replacing, insert the new 2788 * mirror/replacing/spare vdev above oldvd. 2789 */ 2790 if (pvd->vdev_ops != pvops) 2791 pvd = vdev_add_parent(oldvd, pvops); 2792 2793 ASSERT(pvd->vdev_top->vdev_parent == rvd); 2794 ASSERT(pvd->vdev_ops == pvops); 2795 ASSERT(oldvd->vdev_parent == pvd); 2796 2797 /* 2798 * Extract the new device from its root and add it to pvd. 2799 */ 2800 vdev_remove_child(newrootvd, newvd); 2801 newvd->vdev_id = pvd->vdev_children; 2802 vdev_add_child(pvd, newvd); 2803 2804 /* 2805 * If newvd is smaller than oldvd, but larger than its rsize, 2806 * the addition of newvd may have decreased our parent's asize. 2807 */ 2808 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 2809 2810 tvd = newvd->vdev_top; 2811 ASSERT(pvd->vdev_top == tvd); 2812 ASSERT(tvd->vdev_parent == rvd); 2813 2814 vdev_config_dirty(tvd); 2815 2816 /* 2817 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2818 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2819 */ 2820 open_txg = txg + TXG_CONCURRENT_STATES - 1; 2821 2822 mutex_enter(&newvd->vdev_dtl_lock); 2823 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2824 open_txg - TXG_INITIAL + 1); 2825 mutex_exit(&newvd->vdev_dtl_lock); 2826 2827 if (newvd->vdev_isspare) 2828 spa_spare_activate(newvd); 2829 2830 /* 2831 * Mark newvd's DTL dirty in this txg. 2832 */ 2833 vdev_dirty(tvd, VDD_DTL, newvd, txg); 2834 2835 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2836 2837 /* 2838 * Kick off a resilver to update newvd. We need to grab the namespace 2839 * lock because spa_scrub() needs to post a sysevent with the pool name. 2840 */ 2841 mutex_enter(&spa_namespace_lock); 2842 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2843 mutex_exit(&spa_namespace_lock); 2844 2845 return (0); 2846 } 2847 2848 /* 2849 * Detach a device from a mirror or replacing vdev. 2850 * If 'replace_done' is specified, only detach if the parent 2851 * is a replacing vdev. 2852 */ 2853 int 2854 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2855 { 2856 uint64_t txg; 2857 int c, t, error; 2858 vdev_t *rvd = spa->spa_root_vdev; 2859 vdev_t *vd, *pvd, *cvd, *tvd; 2860 boolean_t unspare = B_FALSE; 2861 uint64_t unspare_guid; 2862 size_t len; 2863 2864 txg = spa_vdev_enter(spa); 2865 2866 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 2867 2868 if (vd == NULL) 2869 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2870 2871 if (!vd->vdev_ops->vdev_op_leaf) 2872 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2873 2874 pvd = vd->vdev_parent; 2875 2876 /* 2877 * If replace_done is specified, only remove this device if it's 2878 * the first child of a replacing vdev. For the 'spare' vdev, either 2879 * disk can be removed. 
2880 */ 2881 if (replace_done) { 2882 if (pvd->vdev_ops == &vdev_replacing_ops) { 2883 if (vd->vdev_id != 0) 2884 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2885 } else if (pvd->vdev_ops != &vdev_spare_ops) { 2886 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2887 } 2888 } 2889 2890 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 2891 spa_version(spa) >= SPA_VERSION_SPARES); 2892 2893 /* 2894 * Only mirror, replacing, and spare vdevs support detach. 2895 */ 2896 if (pvd->vdev_ops != &vdev_replacing_ops && 2897 pvd->vdev_ops != &vdev_mirror_ops && 2898 pvd->vdev_ops != &vdev_spare_ops) 2899 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2900 2901 /* 2902 * If there's only one replica, you can't detach it. 2903 */ 2904 if (pvd->vdev_children <= 1) 2905 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2906 2907 /* 2908 * If all siblings have non-empty DTLs, this device may have the only 2909 * valid copy of the data, which means we cannot safely detach it. 2910 * 2911 * XXX -- as in the vdev_offline() case, we really want a more 2912 * precise DTL check. 2913 */ 2914 for (c = 0; c < pvd->vdev_children; c++) { 2915 uint64_t dirty; 2916 2917 cvd = pvd->vdev_child[c]; 2918 if (cvd == vd) 2919 continue; 2920 if (vdev_is_dead(cvd)) 2921 continue; 2922 mutex_enter(&cvd->vdev_dtl_lock); 2923 dirty = cvd->vdev_dtl_map.sm_space | 2924 cvd->vdev_dtl_scrub.sm_space; 2925 mutex_exit(&cvd->vdev_dtl_lock); 2926 if (!dirty) 2927 break; 2928 } 2929 2930 /* 2931 * If we are a replacing or spare vdev, then we can always detach the 2932 * latter child, as that is how one cancels the operation. 2933 */ 2934 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 2935 c == pvd->vdev_children) 2936 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2937 2938 /* 2939 * If we are detaching the second disk from a replacing vdev, then 2940 * check to see if we changed the original vdev's path to have "/old" 2941 * at the end in spa_vdev_attach(). If so, undo that change now. 2942 */ 2943 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 2944 pvd->vdev_child[0]->vdev_path != NULL && 2945 pvd->vdev_child[1]->vdev_path != NULL) { 2946 ASSERT(pvd->vdev_child[1] == vd); 2947 cvd = pvd->vdev_child[0]; 2948 len = strlen(vd->vdev_path); 2949 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 2950 strcmp(cvd->vdev_path + len, "/old") == 0) { 2951 spa_strfree(cvd->vdev_path); 2952 cvd->vdev_path = spa_strdup(vd->vdev_path); 2953 } 2954 } 2955 2956 /* 2957 * If we are detaching the original disk from a spare, then it implies 2958 * that the spare should become a real disk, and be removed from the 2959 * active spare list for the pool. 2960 */ 2961 if (pvd->vdev_ops == &vdev_spare_ops && 2962 vd->vdev_id == 0) 2963 unspare = B_TRUE; 2964 2965 /* 2966 * Erase the disk labels so the disk can be used for other things. 2967 * This must be done after all other error cases are handled, 2968 * but before we disembowel vd (so we can still do I/O to it). 2969 * But if we can't do it, don't treat the error as fatal -- 2970 * it may be that the unwritability of the disk is the reason 2971 * it's being detached! 2972 */ 2973 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 2974 2975 /* 2976 * Remove vd from its parent and compact the parent's children. 2977 */ 2978 vdev_remove_child(pvd, vd); 2979 vdev_compact_children(pvd); 2980 2981 /* 2982 * Remember one of the remaining children so we can get tvd below. 
2983 */ 2984 cvd = pvd->vdev_child[0]; 2985 2986 /* 2987 * If we need to remove the remaining child from the list of hot spares, 2988 * do it now, marking the vdev as no longer a spare in the process. We 2989 * must do this before vdev_remove_parent(), because that can change the 2990 * GUID if it creates a new toplevel GUID. 2991 */ 2992 if (unspare) { 2993 ASSERT(cvd->vdev_isspare); 2994 spa_spare_remove(cvd); 2995 unspare_guid = cvd->vdev_guid; 2996 } 2997 2998 /* 2999 * If the parent mirror/replacing vdev only has one child, 3000 * the parent is no longer needed. Remove it from the tree. 3001 */ 3002 if (pvd->vdev_children == 1) 3003 vdev_remove_parent(cvd); 3004 3005 /* 3006 * We don't set tvd until now because the parent we just removed 3007 * may have been the previous top-level vdev. 3008 */ 3009 tvd = cvd->vdev_top; 3010 ASSERT(tvd->vdev_parent == rvd); 3011 3012 /* 3013 * Reevaluate the parent vdev state. 3014 */ 3015 vdev_propagate_state(cvd); 3016 3017 /* 3018 * If the device we just detached was smaller than the others, it may be 3019 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3020 * can't fail because the existing metaslabs are already in core, so 3021 * there's nothing to read from disk. 3022 */ 3023 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3024 3025 vdev_config_dirty(tvd); 3026 3027 /* 3028 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3029 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3030 * But first make sure we're not on any *other* txg's DTL list, to 3031 * prevent vd from being accessed after it's freed. 3032 */ 3033 for (t = 0; t < TXG_SIZE; t++) 3034 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3035 vd->vdev_detached = B_TRUE; 3036 vdev_dirty(tvd, VDD_DTL, vd, txg); 3037 3038 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3039 3040 error = spa_vdev_exit(spa, vd, txg, 0); 3041 3042 /* 3043 * If this was the removal of the original device in a hot spare vdev, 3044 * then we want to go through and remove the device from the hot spare 3045 * list of every other pool. 3046 */ 3047 if (unspare) { 3048 spa = NULL; 3049 mutex_enter(&spa_namespace_lock); 3050 while ((spa = spa_next(spa)) != NULL) { 3051 if (spa->spa_state != POOL_STATE_ACTIVE) 3052 continue; 3053 3054 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3055 } 3056 mutex_exit(&spa_namespace_lock); 3057 } 3058 3059 return (error); 3060 } 3061 3062 /* 3063 * Remove a spares vdev from the nvlist config. 3064 */ 3065 static int 3066 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare, 3067 nvlist_t **spares, int nspares, vdev_t *vd) 3068 { 3069 nvlist_t *nv, **newspares; 3070 int i, j; 3071 3072 nv = NULL; 3073 for (i = 0; i < nspares; i++) { 3074 uint64_t theguid; 3075 3076 VERIFY(nvlist_lookup_uint64(spares[i], 3077 ZPOOL_CONFIG_GUID, &theguid) == 0); 3078 if (theguid == guid) { 3079 nv = spares[i]; 3080 break; 3081 } 3082 } 3083 3084 /* 3085 * Only remove the hot spare if it's not currently in use in this pool. 
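	 * 'nv' is the matching entry in this pool's spares config and 'vd'
	 * is the matching active vdev, so: neither found means ENOENT,
	 * active but not in the config means ENOTSUP, and both found means
	 * EBUSY unless 'unspare' was requested.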
3086 */ 3087 if (nv == NULL && vd == NULL) 3088 return (ENOENT); 3089 3090 if (nv == NULL && vd != NULL) 3091 return (ENOTSUP); 3092 3093 if (!unspare && nv != NULL && vd != NULL) 3094 return (EBUSY); 3095 3096 if (nspares == 1) { 3097 newspares = NULL; 3098 } else { 3099 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 3100 KM_SLEEP); 3101 for (i = 0, j = 0; i < nspares; i++) { 3102 if (spares[i] != nv) 3103 VERIFY(nvlist_dup(spares[i], 3104 &newspares[j++], KM_SLEEP) == 0); 3105 } 3106 } 3107 3108 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES, 3109 DATA_TYPE_NVLIST_ARRAY) == 0); 3110 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3111 ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0); 3112 for (i = 0; i < nspares - 1; i++) 3113 nvlist_free(newspares[i]); 3114 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 3115 3116 return (0); 3117 } 3118 3119 /* 3120 * Remove an l2cache vdev from the nvlist config. 3121 */ 3122 static int 3123 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache, 3124 int nl2cache, vdev_t *vd) 3125 { 3126 nvlist_t *nv, **newl2cache; 3127 int i, j; 3128 3129 nv = NULL; 3130 for (i = 0; i < nl2cache; i++) { 3131 uint64_t theguid; 3132 3133 VERIFY(nvlist_lookup_uint64(l2cache[i], 3134 ZPOOL_CONFIG_GUID, &theguid) == 0); 3135 if (theguid == guid) { 3136 nv = l2cache[i]; 3137 break; 3138 } 3139 } 3140 3141 if (vd == NULL) { 3142 for (i = 0; i < nl2cache; i++) { 3143 if (sav->sav_vdevs[i]->vdev_guid == guid) { 3144 vd = sav->sav_vdevs[i]; 3145 break; 3146 } 3147 } 3148 } 3149 3150 if (nv == NULL && vd == NULL) 3151 return (ENOENT); 3152 3153 if (nv == NULL && vd != NULL) 3154 return (ENOTSUP); 3155 3156 if (nl2cache == 1) { 3157 newl2cache = NULL; 3158 } else { 3159 newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *), 3160 KM_SLEEP); 3161 for (i = 0, j = 0; i < nl2cache; i++) { 3162 if (l2cache[i] != nv) 3163 VERIFY(nvlist_dup(l2cache[i], 3164 &newl2cache[j++], KM_SLEEP) == 0); 3165 } 3166 } 3167 3168 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 3169 DATA_TYPE_NVLIST_ARRAY) == 0); 3170 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3171 ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0); 3172 for (i = 0; i < nl2cache - 1; i++) 3173 nvlist_free(newl2cache[i]); 3174 kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *)); 3175 3176 return (0); 3177 } 3178 3179 /* 3180 * Remove a device from the pool. Currently, this supports removing only hot 3181 * spares and level 2 ARC devices. 
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	int error = 0;

	spa_config_enter(spa, RW_WRITER, FTAG);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    spa_spare_exists(guid, NULL) &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
		if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
		    spares, nspares, vd)) != 0)
			goto out;
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
		goto out;
	}

	if (spa->spa_l2cache.sav_vdevs != NULL &&
	    spa_l2cache_exists(guid, NULL) &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
		if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
		    l2cache, nl2cache, vd)) != 0)
			goto out;
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

out:
	spa_config_exit(spa, FTAG);
	return (error);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_unspare &&
		    newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			newvd->vdev_unspare = 0;
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
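		 * We cannot call spa_vdev_detach() while still holding the
		 * config lock here, so the guids are captured now and the
		 * config lock is dropped before the two detach calls below;
		 * pguid (if set) identifies the spare that also has to be
		 * detached.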
3292 */ 3293 pvd = vd->vdev_parent; 3294 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3295 pvd->vdev_id == 0) { 3296 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3297 ASSERT(pvd->vdev_parent->vdev_children == 2); 3298 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 3299 } 3300 spa_config_exit(spa, FTAG); 3301 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 3302 return; 3303 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 3304 return; 3305 spa_config_enter(spa, RW_READER, FTAG); 3306 } 3307 3308 spa_config_exit(spa, FTAG); 3309 } 3310 3311 /* 3312 * Update the stored path for this vdev. Dirty the vdev configuration, relying 3313 * on spa_vdev_enter/exit() to synchronize the labels and cache. 3314 */ 3315 int 3316 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3317 { 3318 vdev_t *vd; 3319 uint64_t txg; 3320 3321 txg = spa_vdev_enter(spa); 3322 3323 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { 3324 /* 3325 * Determine if this is a reference to a hot spare device. If 3326 * it is, update the path manually as there is no associated 3327 * vdev_t that can be synced to disk. 3328 */ 3329 nvlist_t **spares; 3330 uint_t i, nspares; 3331 3332 if (spa->spa_spares.sav_config != NULL) { 3333 VERIFY(nvlist_lookup_nvlist_array( 3334 spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 3335 &spares, &nspares) == 0); 3336 for (i = 0; i < nspares; i++) { 3337 uint64_t theguid; 3338 VERIFY(nvlist_lookup_uint64(spares[i], 3339 ZPOOL_CONFIG_GUID, &theguid) == 0); 3340 if (theguid == guid) { 3341 VERIFY(nvlist_add_string(spares[i], 3342 ZPOOL_CONFIG_PATH, newpath) == 0); 3343 spa_load_spares(spa); 3344 spa->spa_spares.sav_sync = B_TRUE; 3345 return (spa_vdev_exit(spa, NULL, txg, 3346 0)); 3347 } 3348 } 3349 } 3350 3351 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3352 } 3353 3354 if (!vd->vdev_ops->vdev_op_leaf) 3355 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3356 3357 spa_strfree(vd->vdev_path); 3358 vd->vdev_path = spa_strdup(newpath); 3359 3360 vdev_config_dirty(vd->vdev_top); 3361 3362 return (spa_vdev_exit(spa, NULL, txg, 0)); 3363 } 3364 3365 /* 3366 * ========================================================================== 3367 * SPA Scrubbing 3368 * ========================================================================== 3369 */ 3370 3371 static void 3372 spa_scrub_io_done(zio_t *zio) 3373 { 3374 spa_t *spa = zio->io_spa; 3375 3376 arc_data_buf_free(zio->io_data, zio->io_size); 3377 3378 mutex_enter(&spa->spa_scrub_lock); 3379 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3380 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 3381 spa->spa_scrub_errors++; 3382 mutex_enter(&vd->vdev_stat_lock); 3383 vd->vdev_stat.vs_scrub_errors++; 3384 mutex_exit(&vd->vdev_stat_lock); 3385 } 3386 3387 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 3388 cv_broadcast(&spa->spa_scrub_io_cv); 3389 3390 ASSERT(spa->spa_scrub_inflight >= 0); 3391 3392 mutex_exit(&spa->spa_scrub_lock); 3393 } 3394 3395 static void 3396 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 3397 zbookmark_t *zb) 3398 { 3399 size_t size = BP_GET_LSIZE(bp); 3400 void *data; 3401 3402 mutex_enter(&spa->spa_scrub_lock); 3403 /* 3404 * Do not give too much work to vdev(s). 
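	 * Block here until the number of in-flight scrub I/Os drops below
	 * spa_scrub_maxinflight; spa_scrub_io_done() signals the cv as I/Os
	 * complete.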
3405 */ 3406 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 3407 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3408 } 3409 spa->spa_scrub_inflight++; 3410 mutex_exit(&spa->spa_scrub_lock); 3411 3412 data = arc_data_buf_alloc(size); 3413 3414 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 3415 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 3416 3417 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 3418 3419 zio_nowait(zio_read(NULL, spa, bp, data, size, 3420 spa_scrub_io_done, NULL, priority, flags, zb)); 3421 } 3422 3423 /* ARGSUSED */ 3424 static int 3425 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 3426 { 3427 blkptr_t *bp = &bc->bc_blkptr; 3428 vdev_t *vd = spa->spa_root_vdev; 3429 dva_t *dva = bp->blk_dva; 3430 int needs_resilver = B_FALSE; 3431 int d; 3432 3433 if (bc->bc_errno) { 3434 /* 3435 * We can't scrub this block, but we can continue to scrub 3436 * the rest of the pool. Note the error and move along. 3437 */ 3438 mutex_enter(&spa->spa_scrub_lock); 3439 spa->spa_scrub_errors++; 3440 mutex_exit(&spa->spa_scrub_lock); 3441 3442 mutex_enter(&vd->vdev_stat_lock); 3443 vd->vdev_stat.vs_scrub_errors++; 3444 mutex_exit(&vd->vdev_stat_lock); 3445 3446 return (ERESTART); 3447 } 3448 3449 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 3450 3451 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 3452 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 3453 3454 ASSERT(vd != NULL); 3455 3456 /* 3457 * Keep track of how much data we've examined so that 3458 * zpool(1M) status can make useful progress reports. 3459 */ 3460 mutex_enter(&vd->vdev_stat_lock); 3461 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 3462 mutex_exit(&vd->vdev_stat_lock); 3463 3464 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 3465 if (DVA_GET_GANG(&dva[d])) { 3466 /* 3467 * Gang members may be spread across multiple 3468 * vdevs, so the best we can do is look at the 3469 * pool-wide DTL. 3470 * XXX -- it would be better to change our 3471 * allocation policy to ensure that this can't 3472 * happen. 3473 */ 3474 vd = spa->spa_root_vdev; 3475 } 3476 if (vdev_dtl_contains(&vd->vdev_dtl_map, 3477 bp->blk_birth, 1)) 3478 needs_resilver = B_TRUE; 3479 } 3480 } 3481 3482 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 3483 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 3484 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 3485 else if (needs_resilver) 3486 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 3487 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 3488 3489 return (0); 3490 } 3491 3492 static void 3493 spa_scrub_thread(spa_t *spa) 3494 { 3495 callb_cpr_t cprinfo; 3496 traverse_handle_t *th = spa->spa_scrub_th; 3497 vdev_t *rvd = spa->spa_root_vdev; 3498 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 3499 int error = 0; 3500 boolean_t complete; 3501 3502 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 3503 3504 /* 3505 * If we're restarting due to a snapshot create/delete, 3506 * wait for that to complete. 3507 */ 3508 txg_wait_synced(spa_get_dsl(spa), 0); 3509 3510 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 3511 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 3512 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 3513 3514 spa_config_enter(spa, RW_WRITER, FTAG); 3515 vdev_reopen(rvd); /* purge all vdev caches */ 3516 vdev_config_dirty(rvd); /* rewrite all disk labels */ 3517 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 3518 spa_config_exit(spa, FTAG); 3519 3520 mutex_enter(&spa->spa_scrub_lock); 3521 spa->spa_scrub_errors = 0; 3522 spa->spa_scrub_active = 1; 3523 ASSERT(spa->spa_scrub_inflight == 0); 3524 3525 while (!spa->spa_scrub_stop) { 3526 CALLB_CPR_SAFE_BEGIN(&cprinfo); 3527 while (spa->spa_scrub_suspended) { 3528 spa->spa_scrub_active = 0; 3529 cv_broadcast(&spa->spa_scrub_cv); 3530 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3531 spa->spa_scrub_active = 1; 3532 } 3533 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 3534 3535 if (spa->spa_scrub_restart_txg != 0) 3536 break; 3537 3538 mutex_exit(&spa->spa_scrub_lock); 3539 error = traverse_more(th); 3540 mutex_enter(&spa->spa_scrub_lock); 3541 if (error != EAGAIN) 3542 break; 3543 } 3544 3545 while (spa->spa_scrub_inflight) 3546 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3547 3548 spa->spa_scrub_active = 0; 3549 cv_broadcast(&spa->spa_scrub_cv); 3550 3551 mutex_exit(&spa->spa_scrub_lock); 3552 3553 spa_config_enter(spa, RW_WRITER, FTAG); 3554 3555 mutex_enter(&spa->spa_scrub_lock); 3556 3557 /* 3558 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 3559 * AND the spa config lock to synchronize with any config changes 3560 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 3561 */ 3562 if (spa->spa_scrub_restart_txg != 0) 3563 error = ERESTART; 3564 3565 if (spa->spa_scrub_stop) 3566 error = EINTR; 3567 3568 /* 3569 * Even if there were uncorrectable errors, we consider the scrub 3570 * completed. The downside is that if there is a transient error during 3571 * a resilver, we won't resilver the data properly to the target. But 3572 * if the damage is permanent (more likely) we will resilver forever, 3573 * which isn't really acceptable. Since there is enough information for 3574 * the user to know what has failed and why, this seems like a more 3575 * tractable approach. 3576 */ 3577 complete = (error == 0); 3578 3579 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 3580 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 3581 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 3582 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 3583 3584 mutex_exit(&spa->spa_scrub_lock); 3585 3586 /* 3587 * If the scrub/resilver completed, update all DTLs to reflect this. 3588 * Whether it succeeded or not, vacate all temporary scrub DTLs. 3589 */ 3590 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 3591 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 3592 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 3593 spa_errlog_rotate(spa); 3594 3595 if (scrub_type == POOL_SCRUB_RESILVER && complete) 3596 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 3597 3598 spa_config_exit(spa, FTAG); 3599 3600 mutex_enter(&spa->spa_scrub_lock); 3601 3602 /* 3603 * We may have finished replacing a device. 3604 * Let the async thread assess this and handle the detach. 3605 */ 3606 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3607 3608 /* 3609 * If we were told to restart, our final act is to start a new scrub. 3610 */ 3611 if (error == ERESTART) 3612 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
3613 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 3614 3615 spa->spa_scrub_type = POOL_SCRUB_NONE; 3616 spa->spa_scrub_active = 0; 3617 spa->spa_scrub_thread = NULL; 3618 cv_broadcast(&spa->spa_scrub_cv); 3619 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 3620 thread_exit(); 3621 } 3622 3623 void 3624 spa_scrub_suspend(spa_t *spa) 3625 { 3626 mutex_enter(&spa->spa_scrub_lock); 3627 spa->spa_scrub_suspended++; 3628 while (spa->spa_scrub_active) { 3629 cv_broadcast(&spa->spa_scrub_cv); 3630 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3631 } 3632 while (spa->spa_scrub_inflight) 3633 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3634 mutex_exit(&spa->spa_scrub_lock); 3635 } 3636 3637 void 3638 spa_scrub_resume(spa_t *spa) 3639 { 3640 mutex_enter(&spa->spa_scrub_lock); 3641 ASSERT(spa->spa_scrub_suspended != 0); 3642 if (--spa->spa_scrub_suspended == 0) 3643 cv_broadcast(&spa->spa_scrub_cv); 3644 mutex_exit(&spa->spa_scrub_lock); 3645 } 3646 3647 void 3648 spa_scrub_restart(spa_t *spa, uint64_t txg) 3649 { 3650 /* 3651 * Something happened (e.g. snapshot create/delete) that means 3652 * we must restart any in-progress scrubs. The itinerary will 3653 * fix this properly. 3654 */ 3655 mutex_enter(&spa->spa_scrub_lock); 3656 spa->spa_scrub_restart_txg = txg; 3657 mutex_exit(&spa->spa_scrub_lock); 3658 } 3659 3660 int 3661 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 3662 { 3663 space_seg_t *ss; 3664 uint64_t mintxg, maxtxg; 3665 vdev_t *rvd = spa->spa_root_vdev; 3666 3667 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3668 ASSERT(!spa_config_held(spa, RW_WRITER)); 3669 3670 if ((uint_t)type >= POOL_SCRUB_TYPES) 3671 return (ENOTSUP); 3672 3673 mutex_enter(&spa->spa_scrub_lock); 3674 3675 /* 3676 * If there's a scrub or resilver already in progress, stop it. 3677 */ 3678 while (spa->spa_scrub_thread != NULL) { 3679 /* 3680 * Don't stop a resilver unless forced. 3681 */ 3682 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 3683 mutex_exit(&spa->spa_scrub_lock); 3684 return (EBUSY); 3685 } 3686 spa->spa_scrub_stop = 1; 3687 cv_broadcast(&spa->spa_scrub_cv); 3688 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3689 } 3690 3691 /* 3692 * Terminate the previous traverse. 3693 */ 3694 if (spa->spa_scrub_th != NULL) { 3695 traverse_fini(spa->spa_scrub_th); 3696 spa->spa_scrub_th = NULL; 3697 } 3698 3699 if (rvd == NULL) { 3700 ASSERT(spa->spa_scrub_stop == 0); 3701 ASSERT(spa->spa_scrub_type == type); 3702 ASSERT(spa->spa_scrub_restart_txg == 0); 3703 mutex_exit(&spa->spa_scrub_lock); 3704 return (0); 3705 } 3706 3707 mintxg = TXG_INITIAL - 1; 3708 maxtxg = spa_last_synced_txg(spa) + 1; 3709 3710 mutex_enter(&rvd->vdev_dtl_lock); 3711 3712 if (rvd->vdev_dtl_map.sm_space == 0) { 3713 /* 3714 * The pool-wide DTL is empty. 3715 * If this is a resilver, there's nothing to do except 3716 * check whether any in-progress replacements have completed. 3717 */ 3718 if (type == POOL_SCRUB_RESILVER) { 3719 type = POOL_SCRUB_NONE; 3720 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3721 } 3722 } else { 3723 /* 3724 * The pool-wide DTL is non-empty. 3725 * If this is a normal scrub, upgrade to a resilver instead. 3726 */ 3727 if (type == POOL_SCRUB_EVERYTHING) 3728 type = POOL_SCRUB_RESILVER; 3729 } 3730 3731 if (type == POOL_SCRUB_RESILVER) { 3732 /* 3733 * Determine the resilvering boundaries. 3734 * 3735 * Note: (mintxg, maxtxg) is an open interval, 3736 * i.e. mintxg and maxtxg themselves are not included. 
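		 * (That is why mintxg is set to ss_start - 1 below: it keeps
		 * ss_start itself inside the resilvered range.)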
3737 * 3738 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 3739 * so we don't claim to resilver a txg that's still changing. 3740 */ 3741 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 3742 mintxg = ss->ss_start - 1; 3743 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 3744 maxtxg = MIN(ss->ss_end, maxtxg); 3745 3746 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 3747 } 3748 3749 mutex_exit(&rvd->vdev_dtl_lock); 3750 3751 spa->spa_scrub_stop = 0; 3752 spa->spa_scrub_type = type; 3753 spa->spa_scrub_restart_txg = 0; 3754 3755 if (type != POOL_SCRUB_NONE) { 3756 spa->spa_scrub_mintxg = mintxg; 3757 spa->spa_scrub_maxtxg = maxtxg; 3758 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 3759 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 3760 ZIO_FLAG_CANFAIL); 3761 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 3762 spa->spa_scrub_thread = thread_create(NULL, 0, 3763 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 3764 } 3765 3766 mutex_exit(&spa->spa_scrub_lock); 3767 3768 return (0); 3769 } 3770 3771 /* 3772 * ========================================================================== 3773 * SPA async task processing 3774 * ========================================================================== 3775 */ 3776 3777 static void 3778 spa_async_remove(spa_t *spa, vdev_t *vd) 3779 { 3780 vdev_t *tvd; 3781 int c; 3782 3783 for (c = 0; c < vd->vdev_children; c++) { 3784 tvd = vd->vdev_child[c]; 3785 if (tvd->vdev_remove_wanted) { 3786 tvd->vdev_remove_wanted = 0; 3787 vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 3788 VDEV_AUX_NONE); 3789 vdev_clear(spa, tvd, B_TRUE); 3790 vdev_config_dirty(tvd->vdev_top); 3791 } 3792 spa_async_remove(spa, tvd); 3793 } 3794 } 3795 3796 static void 3797 spa_async_thread(spa_t *spa) 3798 { 3799 int tasks; 3800 uint64_t txg; 3801 3802 ASSERT(spa->spa_sync_on); 3803 3804 mutex_enter(&spa->spa_async_lock); 3805 tasks = spa->spa_async_tasks; 3806 spa->spa_async_tasks = 0; 3807 mutex_exit(&spa->spa_async_lock); 3808 3809 /* 3810 * See if the config needs to be updated. 3811 */ 3812 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3813 mutex_enter(&spa_namespace_lock); 3814 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3815 mutex_exit(&spa_namespace_lock); 3816 } 3817 3818 /* 3819 * See if any devices need to be marked REMOVED. 3820 * 3821 * XXX - We avoid doing this when we are in 3822 * I/O failure state since spa_vdev_enter() grabs 3823 * the namespace lock and would not be able to obtain 3824 * the writer config lock. 3825 */ 3826 if (tasks & SPA_ASYNC_REMOVE && 3827 spa_state(spa) != POOL_STATE_IO_FAILURE) { 3828 txg = spa_vdev_enter(spa); 3829 spa_async_remove(spa, spa->spa_root_vdev); 3830 (void) spa_vdev_exit(spa, NULL, txg, 0); 3831 } 3832 3833 /* 3834 * If any devices are done replacing, detach them. 3835 */ 3836 if (tasks & SPA_ASYNC_RESILVER_DONE) 3837 spa_vdev_resilver_done(spa); 3838 3839 /* 3840 * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 3841 * scrub which can become a resilver), we need to hold 3842 * spa_namespace_lock() because the sysevent we post via 3843 * spa_event_notify() needs to get the name of the pool. 3844 */ 3845 if (tasks & SPA_ASYNC_SCRUB) { 3846 mutex_enter(&spa_namespace_lock); 3847 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 3848 mutex_exit(&spa_namespace_lock); 3849 } 3850 3851 /* 3852 * Kick off a resilver. 
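	 * (SPA_ASYNC_RESILVER is requested, for example, when an interrupted
	 * scrub or resilver has to be restarted; see spa_scrub_thread().)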
3853 */ 3854 if (tasks & SPA_ASYNC_RESILVER) { 3855 mutex_enter(&spa_namespace_lock); 3856 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 3857 mutex_exit(&spa_namespace_lock); 3858 } 3859 3860 /* 3861 * Let the world know that we're done. 3862 */ 3863 mutex_enter(&spa->spa_async_lock); 3864 spa->spa_async_thread = NULL; 3865 cv_broadcast(&spa->spa_async_cv); 3866 mutex_exit(&spa->spa_async_lock); 3867 thread_exit(); 3868 } 3869 3870 void 3871 spa_async_suspend(spa_t *spa) 3872 { 3873 mutex_enter(&spa->spa_async_lock); 3874 spa->spa_async_suspended++; 3875 while (spa->spa_async_thread != NULL) 3876 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3877 mutex_exit(&spa->spa_async_lock); 3878 } 3879 3880 void 3881 spa_async_resume(spa_t *spa) 3882 { 3883 mutex_enter(&spa->spa_async_lock); 3884 ASSERT(spa->spa_async_suspended != 0); 3885 spa->spa_async_suspended--; 3886 mutex_exit(&spa->spa_async_lock); 3887 } 3888 3889 static void 3890 spa_async_dispatch(spa_t *spa) 3891 { 3892 mutex_enter(&spa->spa_async_lock); 3893 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3894 spa->spa_async_thread == NULL && 3895 rootdir != NULL && !vn_is_readonly(rootdir)) 3896 spa->spa_async_thread = thread_create(NULL, 0, 3897 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3898 mutex_exit(&spa->spa_async_lock); 3899 } 3900 3901 void 3902 spa_async_request(spa_t *spa, int task) 3903 { 3904 mutex_enter(&spa->spa_async_lock); 3905 spa->spa_async_tasks |= task; 3906 mutex_exit(&spa->spa_async_lock); 3907 } 3908 3909 /* 3910 * ========================================================================== 3911 * SPA syncing routines 3912 * ========================================================================== 3913 */ 3914 3915 static void 3916 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3917 { 3918 bplist_t *bpl = &spa->spa_sync_bplist; 3919 dmu_tx_t *tx; 3920 blkptr_t blk; 3921 uint64_t itor = 0; 3922 zio_t *zio; 3923 int error; 3924 uint8_t c = 1; 3925 3926 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3927 3928 while (bplist_iterate(bpl, &itor, &blk) == 0) 3929 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3930 3931 error = zio_wait(zio); 3932 ASSERT3U(error, ==, 0); 3933 3934 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3935 bplist_vacate(bpl, tx); 3936 3937 /* 3938 * Pre-dirty the first block so we sync to convergence faster. 3939 * (Usually only the first block is needed.) 
3940 */ 3941 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3942 dmu_tx_commit(tx); 3943 } 3944 3945 static void 3946 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3947 { 3948 char *packed = NULL; 3949 size_t nvsize = 0; 3950 dmu_buf_t *db; 3951 3952 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3953 3954 packed = kmem_alloc(nvsize, KM_SLEEP); 3955 3956 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3957 KM_SLEEP) == 0); 3958 3959 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 3960 3961 kmem_free(packed, nvsize); 3962 3963 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3964 dmu_buf_will_dirty(db, tx); 3965 *(uint64_t *)db->db_data = nvsize; 3966 dmu_buf_rele(db, FTAG); 3967 } 3968 3969 static void 3970 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3971 const char *config, const char *entry) 3972 { 3973 nvlist_t *nvroot; 3974 nvlist_t **list; 3975 int i; 3976 3977 if (!sav->sav_sync) 3978 return; 3979 3980 /* 3981 * Update the MOS nvlist describing the list of available devices. 3982 * spa_validate_aux() will have already made sure this nvlist is 3983 * valid and the vdevs are labeled appropriately. 3984 */ 3985 if (sav->sav_object == 0) { 3986 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3987 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3988 sizeof (uint64_t), tx); 3989 VERIFY(zap_update(spa->spa_meta_objset, 3990 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3991 &sav->sav_object, tx) == 0); 3992 } 3993 3994 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3995 if (sav->sav_count == 0) { 3996 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3997 } else { 3998 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3999 for (i = 0; i < sav->sav_count; i++) 4000 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 4001 B_FALSE, B_FALSE, B_TRUE); 4002 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 4003 sav->sav_count) == 0); 4004 for (i = 0; i < sav->sav_count; i++) 4005 nvlist_free(list[i]); 4006 kmem_free(list, sav->sav_count * sizeof (void *)); 4007 } 4008 4009 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 4010 nvlist_free(nvroot); 4011 4012 sav->sav_sync = B_FALSE; 4013 } 4014 4015 static void 4016 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 4017 { 4018 nvlist_t *config; 4019 4020 if (list_is_empty(&spa->spa_dirty_list)) 4021 return; 4022 4023 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 4024 4025 if (spa->spa_config_syncing) 4026 nvlist_free(spa->spa_config_syncing); 4027 spa->spa_config_syncing = config; 4028 4029 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 4030 } 4031 4032 /* 4033 * Set zpool properties. 4034 */ 4035 static void 4036 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 4037 { 4038 spa_t *spa = arg1; 4039 objset_t *mos = spa->spa_meta_objset; 4040 nvlist_t *nvp = arg2; 4041 nvpair_t *elem; 4042 uint64_t intval; 4043 char *strval; 4044 zpool_prop_t prop; 4045 const char *propname; 4046 zprop_type_t proptype; 4047 spa_config_dirent_t *dp; 4048 4049 elem = NULL; 4050 while ((elem = nvlist_next_nvpair(nvp, elem))) { 4051 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 4052 case ZPOOL_PROP_VERSION: 4053 /* 4054 * Only set version for non-zpool-creation cases 4055 * (set/import). spa_create() needs special care 4056 * for version setting. 
static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;
	spa_config_dirent_t *dp;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set the version for non-zpool-creation cases
			 * (set/import).  spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property.  It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'cachefile' is also a non-persistent property; we
			 * just record the new path and queue an async request
			 * so that the config cache gets updated.
			 */
			VERIFY(nvpair_value_string(elem, &strval) == 0);

			dp = kmem_alloc(sizeof (spa_config_dirent_t),
			    KM_SLEEP);

			if (strval[0] == '\0')
				dp->scd_path = spa_strdup(spa_config_path);
			else if (strcmp(strval, "none") == 0)
				dp->scd_path = NULL;
			else
				dp->scd_path = spa_strdup(strval);

			list_insert_head(&spa->spa_config_list, dp);
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			mutex_enter(&spa->spa_props_lock);
			if (spa->spa_pool_props_object == 0) {
				objset_t *mos = spa->spa_meta_objset;

				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}
			mutex_exit(&spa->spa_props_lock);

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa->spa_name);
		}
	}
}
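/*
 * The nvlist handed to spa_sync_props() as arg2 is a flat list of
 * (property name, value) pairs; a caller setting delegation and cachefile
 * would build something like this (sketch only):
 *
 *	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(nvp,
 *	    zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1) == 0);
 *	VERIFY(nvlist_add_string(nvp,
 *	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") == 0);
 *
 * In the default case above, string pairs must correspond to
 * PROP_TYPE_STRING properties and uint64 pairs to numeric or index
 * properties; any other nvpair type trips the ASSERT(0).
 */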
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for details).  If there *are*
	 * dirty vdevs -- or if the sync to our random subset fails --
	 * then sync the uberblock to all vdevs.
	 */
	if (list_is_empty(&spa->spa_dirty_list)) {
		vdev_t *svd[SPA_DVAS_PER_BP];
		int svdcount = 0;
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
				continue;
			svd[svdcount++] = vd;
			if (svdcount == SPA_DVAS_PER_BP)
				break;
		}
		vdev_config_sync(svd, svdcount, txg);
	} else {
		vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg);
	}
	dmu_tx_commit(tx);
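	/*
	 * At this point the transaction group is committed: vdev_config_sync()
	 * above has written the new labels and uberblock to stable storage.
	 * Everything below is post-commit bookkeeping -- publishing the new
	 * config, rolling spa_ubsync forward, and cleaning per-txg lists.
	 */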
	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
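/*
 * Note: spa_sync() itself is normally driven once per transaction group by
 * the pool's txg sync thread (see txg.c).  Callers that simply need
 * "everything so far is on disk" use txg_wait_synced(), as spa_sync_allpools()
 * does above.  Sketch for a single pool:
 *
 *	txg_wait_synced(spa_get_dsl(spa), 0);
 *
 * where a txg argument of 0 means the currently open txg.
 */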
/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (l2cache) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}
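/*
 * Typical usage of spa_upgrade() (sketch only): bump a pool to the newest
 * supported on-disk version, e.g.
 *
 *	spa_upgrade(spa, SPA_VERSION);
 *
 * The trailing txg_wait_synced() above ensures the new version number is on
 * stable storage before spa_upgrade() returns.
 */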
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
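/*
 * Example (sketch only; assumes the named ESC_ZFS_* subclass is defined in
 * sys/sysevent/eventdefs.h on this build):
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * Passing vd == NULL posts a pool-level event carrying only the pool name
 * and guid in its payload.
 */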