/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

#include "zfs_prop.h"

int zio_taskq_threads = 8;

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static int
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;
	int err = 0;

	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
		goto out;

	if (strval != NULL) {
		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
			goto out;
	} else {
		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
			goto out;
	}

	err = nvlist_add_nvlist(nvl, propname, propval);
out:
	nvlist_free(propval);
	return (err);
}

/*
 * Get property values from the spa configuration.
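 * The read-only values (name, size, used, available, capacity, guid, health)
 * come from the in-core spa_t and the root vdev rather than the MOS.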
 */
static int
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size = spa_get_space(spa);
	uint64_t used = spa_get_alloc(spa);
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	int err;

	/*
	 * readonly properties
	 */
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
	    0, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
	    size - used, src))
		return (err);

	cap = (size == 0) ? 0 : (used * 100 / size);
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
	    spa_guid(spa), src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
	    spa->spa_root_vdev->vdev_state, src))
		return (err);

	/*
	 * settable properties that are not stored in the pool property object.
	 */
	version = spa_version(spa);
	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
	    version, src))
		return (err);

	if (spa->spa_root != NULL) {
		src = ZPROP_SRC_LOCAL;
		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
		    spa->spa_root, 0, src))
			return (err);
	}

	if (spa->spa_temporary ==
	    zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_TEMPORARY, NULL,
	    spa->spa_temporary, src))
		return (err);

	return (0);
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	/*
	 * Get properties from the spa config.
	 */
	if (err = spa_prop_get_config(spa, nvp))
		goto out;

	mutex_enter(&spa->spa_props_lock);
	/* If there is no pool property object, there is nothing more to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
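	 * These are the values that have been set with "zpool set" and
	 * synced out by spa_sync_props().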
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			err = spa_prop_add_list(*nvp, prop, strval,
			    intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * so that it contains the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		vdev_t *rvdev;
		char *vdev_type;
		objset_t *os;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * A bootable filesystem cannot be on a RAIDZ pool
			 * or a striped pool with more than one device.
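			 * The check below therefore requires a root vdev with
			 * a single child that is neither raidz nor missing.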
			 */
			rvdev = spa->spa_root_vdev;
			vdev_type =
			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
			if (rvdev->vdev_children > 1 ||
			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
					break;
				objnum = dmu_objset_id(os);
				dmu_objset_close(os);
			}
			break;
		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
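 * The caller must hold spa_errlist_lock; ownership of the existing entries
 * passes to the 'last' and 'scrub' trees supplied by the caller.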
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));
	list_create(&spa->spa_zio_list, sizeof (zio_t),
	    offsetof(zio_t, zio_link_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);
	list_destroy(&spa->spa_zio_list);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
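 * On success, *vdp points to the newly constructed (sub)tree; on failure the
 * partially constructed tree is freed and *vdp is set to NULL.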
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_spare_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
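 * If 'mosconfig' is false, the supplied config came from the vdev labels;
 * once the MOS is readable, spa_load() re-invokes itself with the trusted
 * copy of the config stored there.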
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;
	uint64_t autoreplace = 0;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
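	 * This opens the DSL pool, which makes the MOS available through
	 * spa_meta_objset for the lookups that follow.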
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if (hostid != 0 && myhostid != 0 &&
			    (unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
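		 * The claim tx is assigned to the pool's first txg, so the
		 * claims are part of the first sync below.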
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), one of the vdevs indicates that the pool
			 * has been exported or destroyed.  If this is the
			 * case, the config cache is out of sync and we should
			 * remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
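			 * The config generated below carries the per-vdev
			 * state gathered during the failed open attempt.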
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
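		 * A spare that is currently in use is reported as CANT_OPEN
		 * with an aux state of SPARED.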
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.  If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < SPA_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;
	uint64_t version;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
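	 * They were already validated above by spa_validate_spares(), so here
	 * we just stash them in spa_sparelist and open them.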
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_temporary = zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	if (props)
		spa_sync_props(spa, props, CRED(), tx);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
{
	spa_t *spa;
	char *altroot = NULL;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity anymore,
	 * and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0) {
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	}
	spa_config_exit(spa, FTAG);

	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	if (spa_mode & FWRITE)
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
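	 * The return value is intentionally ignored; even a failed load may
	 * leave enough vdev state behind to generate a useful config below.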
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and that any references to the pool are gone.  Then,
 * we update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares;
	uint_t i, nspares;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		nspares = 0;

	if (vd->vdev_children == 0 && nspares == 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, EINVAL));
	}

	if (vd->vdev_children != 0) {
		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
			spa->spa_pending_vdev = NULL;
			return (spa_vdev_exit(spa, vd, txg, error));
		}
	}

	/*
	 * We must validate the spares after checking the children.  Otherwise,
	 * vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) != 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	spa->spa_pending_vdev = NULL;

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		if (spa->spa_sparelist != NULL) {
			nvlist_t **oldspares;
			uint_t oldnspares;
			nvlist_t **newspares;

			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

			newspares = kmem_alloc(sizeof (void *) *
			    (nspares + oldnspares), KM_SLEEP);
			for (i = 0; i < oldnspares; i++)
				VERIFY(nvlist_dup(oldspares[i],
				    &newspares[i], KM_SLEEP) == 0);
			for (i = 0; i < nspares; i++)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[i + oldnspares],
				    KM_SLEEP) == 0);

			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, newspares,
			    nspares + oldnspares) == 0);
			for (i = 0; i < oldnspares + nspares; i++)
				nvlist_free(newspares[i]);
			kmem_free(newspares, (oldnspares + nspares) *
			    sizeof (void *));
		} else {
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		}

		spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
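 *
 * The entire operation is bracketed by spa_vdev_enter()/spa_vdev_exit(), and
 * a resilver of the new device is kicked off before returning.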
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	int is_log;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	is_log = oldvd->vdev_islog;
	if (is_log && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.  The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
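	 * The old path simply gains an "/old" suffix and any devid is
	 * discarded.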
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	if (newvd->vdev_isspare)
		spa_spare_activate(newvd);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.  We need to grab the namespace
	 * lock because spa_scrub() needs to post a sysevent with the pool name.
	 */
	mutex_enter(&spa_namespace_lock);
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.  For the 'spare' vdev, either
	 * disk can be removed.
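	 * Any other replace_done request is rejected with ENOTSUP.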
	 */
	if (replace_done) {
		if (pvd->vdev_ops == &vdev_replacing_ops) {
			if (vd->vdev_id != 0)
				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		} else if (pvd->vdev_ops != &vdev_spare_ops) {
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		}
	}

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}

	/*
	 * If we are a replacing or spare vdev, then we can always detach the
	 * latter child, as that is how one cancels the operation.
	 */
	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
	    c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.  We
	 * must do this before vdev_remove_parent(), because that can change the
	 * GUID if it creates a new toplevel GUID.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
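	 * tvd is needed below to reinitialize the metaslabs and to dirty
	 * the surviving top-level config.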
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the device we just detached was smaller than the others, it may be
	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
	 * can't fail because the existing metaslabs are already in core, so
	 * there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

	error = spa_vdev_exit(spa, vd, txg, 0);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa = NULL;
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa->spa_state != POOL_STATE_ACTIVE)
				continue;

			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, *nv, **newspares;
	uint_t i, j, nspares;
	int ret = 0;

	spa_config_enter(spa, RW_WRITER, FTAG);

	vd = spa_lookup_by_guid(spa, guid);

	nv = NULL;
	if (spa->spa_spares != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			uint64_t theguid;

			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &theguid) == 0);
			if (theguid == guid) {
				nv = spares[i];
				break;
			}
		}
	}

	/*
	 * We only support removing a hot spare, and only if it's not currently
	 * in use in this pool.
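	 * ENOENT means the guid was not found at all, ENOTSUP means it names
	 * an ordinary (non-spare) vdev, and EBUSY means the spare is active
	 * here and 'unspare' was not requested.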
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_unspare &&
		    newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			newvd->vdev_unspare = 0;
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
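		 * Its guid is remembered in pguid and detached after the
		 * replaced device itself.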
		 */
		pvd = vd->vdev_parent;
		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(pvd->vdev_parent->vdev_children == 2);
			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		/*
		 * Determine if this is a reference to a hot spare.  In that
		 * case, update the path as stored in the spare list.
		 */
		nvlist_t **spares;
		uint_t i, nspares;
		if (spa->spa_sparelist != NULL) {
			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
			for (i = 0; i < nspares; i++) {
				uint64_t theguid;
				VERIFY(nvlist_lookup_uint64(spares[i],
				    ZPOOL_CONFIG_GUID, &theguid) == 0);
				if (theguid == guid)
					break;
			}

			if (i == nspares)
				return (spa_vdev_exit(spa, NULL, txg, ENOENT));

			VERIFY(nvlist_add_string(spares[i],
			    ZPOOL_CONFIG_PATH, newpath) == 0);
			spa_load_spares(spa);
			spa->spa_sync_spares = B_TRUE;
			return (spa_vdev_exit(spa, NULL, txg, 0));
		} else {
			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	arc_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
		cv_broadcast(&spa->spa_scrub_io_cv);

	ASSERT(spa->spa_scrub_inflight >= 0);

	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data;

	mutex_enter(&spa->spa_scrub_lock);
	/*
	 * Do not give too much work to vdev(s).
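	 * Throttle new scrub I/O until the in-flight count drops below
	 * spa_scrub_maxinflight.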
	 */
	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	data = arc_data_buf_alloc(size);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ?
"resilver" : "scrub", 2846 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2847 2848 spa_config_enter(spa, RW_WRITER, FTAG); 2849 vdev_reopen(rvd); /* purge all vdev caches */ 2850 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2851 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2852 spa_config_exit(spa, FTAG); 2853 2854 mutex_enter(&spa->spa_scrub_lock); 2855 spa->spa_scrub_errors = 0; 2856 spa->spa_scrub_active = 1; 2857 ASSERT(spa->spa_scrub_inflight == 0); 2858 2859 while (!spa->spa_scrub_stop) { 2860 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2861 while (spa->spa_scrub_suspended) { 2862 spa->spa_scrub_active = 0; 2863 cv_broadcast(&spa->spa_scrub_cv); 2864 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2865 spa->spa_scrub_active = 1; 2866 } 2867 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2868 2869 if (spa->spa_scrub_restart_txg != 0) 2870 break; 2871 2872 mutex_exit(&spa->spa_scrub_lock); 2873 error = traverse_more(th); 2874 mutex_enter(&spa->spa_scrub_lock); 2875 if (error != EAGAIN) 2876 break; 2877 } 2878 2879 while (spa->spa_scrub_inflight) 2880 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2881 2882 spa->spa_scrub_active = 0; 2883 cv_broadcast(&spa->spa_scrub_cv); 2884 2885 mutex_exit(&spa->spa_scrub_lock); 2886 2887 spa_config_enter(spa, RW_WRITER, FTAG); 2888 2889 mutex_enter(&spa->spa_scrub_lock); 2890 2891 /* 2892 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2893 * AND the spa config lock to synchronize with any config changes 2894 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2895 */ 2896 if (spa->spa_scrub_restart_txg != 0) 2897 error = ERESTART; 2898 2899 if (spa->spa_scrub_stop) 2900 error = EINTR; 2901 2902 /* 2903 * Even if there were uncorrectable errors, we consider the scrub 2904 * completed. The downside is that if there is a transient error during 2905 * a resilver, we won't resilver the data properly to the target. But 2906 * if the damage is permanent (more likely) we will resilver forever, 2907 * which isn't really acceptable. Since there is enough information for 2908 * the user to know what has failed and why, this seems like a more 2909 * tractable approach. 2910 */ 2911 complete = (error == 0); 2912 2913 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2914 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2915 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2916 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2917 2918 mutex_exit(&spa->spa_scrub_lock); 2919 2920 /* 2921 * If the scrub/resilver completed, update all DTLs to reflect this. 2922 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2923 */ 2924 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2925 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2926 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2927 spa_errlog_rotate(spa); 2928 2929 if (scrub_type == POOL_SCRUB_RESILVER && complete) 2930 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 2931 2932 spa_config_exit(spa, FTAG); 2933 2934 mutex_enter(&spa->spa_scrub_lock); 2935 2936 /* 2937 * We may have finished replacing a device. 2938 * Let the async thread assess this and handle the detach. 2939 */ 2940 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2941 2942 /* 2943 * If we were told to restart, our final act is to start a new scrub. 2944 */ 2945 if (error == ERESTART) 2946 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(spa, RW_WRITER));

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);

		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	vdev_t *tvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		if (tvd->vdev_remove_wanted) {
			tvd->vdev_remove_wanted = 0;
			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
			    VDEV_AUX_NONE);
			vdev_clear(spa, tvd, B_TRUE);
			vdev_config_dirty(tvd->vdev_top);
		}
		spa_async_remove(spa, tvd);
	}
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;
	uint64_t txg;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 *
	 * XXX - We avoid doing this when we are in
	 * I/O failure state since spa_vdev_enter() grabs
	 * the namespace lock and would not be able to obtain
	 * the writer config lock.
	 */
	if (tasks & SPA_ASYNC_REMOVE &&
	    spa_state(spa) != POOL_STATE_IO_FAILURE) {
		txg = spa_vdev_enter(spa);
		spa_async_remove(spa, spa->spa_root_vdev);
		(void) spa_vdev_exit(spa, NULL, txg, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
	 * scrub which can become a resilver), we need to hold
	 * spa_namespace_lock() because the sysevent we post via
	 * spa_event_notify() needs to get the name of the pool.
	 */
	if (tasks & SPA_ASYNC_SCRUB) {
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Kick off a resilver.
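	 * As with the scrub above, spa_namespace_lock is held so the
	 * resilver-start sysevent can carry the pool name.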
	 */
	if (tasks & SPA_ASYNC_RESILVER) {
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *nvroot;
	nvlist_t **spares;
	int i;

	if (!spa->spa_sync_spares)
		return;

	/*
	 * Update the MOS nvlist describing the list of available spares.
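	 * The list is stored as a packed nvlist in spa_spares_object.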
	 * spa_validate_spares() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
	nvlist_free(nvroot);

	spa->spa_sync_spares = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import).  spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property.  It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_TEMPORARY:
			/*
			 * 'temporary' is a non-persistent property.
			 */
			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
			spa->spa_temporary = intval;
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
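			 * The object is created lazily the first time a
			 * persistent property is set.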
			 */
			mutex_enter(&spa->spa_props_lock);
			if (spa->spa_pool_props_object == 0) {
				objset_t *mos = spa->spa_meta_objset;

				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}
			mutex_exit(&spa->spa_props_lock);

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa->spa_name);
		}
	}
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
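	 * (The deferred list holds blocks whose frees were postponed from
	 * the previous txg.)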
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
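 * txg_wait_synced() does the actual work for each active pool.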
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
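 * The payload always carries the pool name and guid; the vdev guid (and path,
 * when known) are added if a vdev is supplied.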
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}