/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#ifdef	_KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

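/*
 * How to read the table above: rows follow the zio_type_t order given in
 * the comment (NULL, READ, WRITE, FREE, CLAIM, IOCTL) and columns follow
 * zio_taskq_types[].  Each entry is handed to spa_taskq_create() by
 * spa_create_zio_taskqs() below; a rough sketch of what happens for a
 * single entry (illustrative only, not a separate code path):
 *
 *	const zio_taskq_info_t *ztip =
 *	    &zio_taskqs[ZIO_TYPE_READ][ZIO_TASKQ_ISSUE];
 *	char name[32];
 *
 *	(void) snprintf(name, sizeof (name), "%s_%s",
 *	    zio_type_name[ZIO_TYPE_READ], zio_taskq_types[ZIO_TASKQ_ISSUE]);
 *	spa->spa_zio_taskq[ZIO_TYPE_READ][ZIO_TASKQ_ISSUE] =
 *	    spa_taskq_create(spa, name, ztip->zti_mode, ztip->zti_value);
 *
 * With the ZTI_FIX(8) entry above this creates a "read_issue" taskq with
 * eight threads; ZTI_BATCH entries are instead sized as a percentage of
 * online CPUs (zio_taskq_batch_pct), and ZTI_NULL entries create no
 * taskq at all.
 */
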
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); 115 static boolean_t spa_has_active_shared_spare(spa_t *spa); 116 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 117 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 118 char **ereport); 119 120 uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 121 id_t zio_taskq_psrset_bind = PS_NONE; 122 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 123 uint_t zio_taskq_basedc = 80; /* base duty cycle */ 124 125 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 126 127 /* 128 * This (illegal) pool name is used when temporarily importing a spa_t in order 129 * to get the vdev stats associated with the imported devices. 130 */ 131 #define TRYIMPORT_NAME "$import" 132 133 /* 134 * ========================================================================== 135 * SPA properties routines 136 * ========================================================================== 137 */ 138 139 /* 140 * Add a (source=src, propname=propval) list to an nvlist. 141 */ 142 static void 143 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 144 uint64_t intval, zprop_source_t src) 145 { 146 const char *propname = zpool_prop_to_name(prop); 147 nvlist_t *propval; 148 149 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 150 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 151 152 if (strval != NULL) 153 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 154 else 155 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 156 157 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 158 nvlist_free(propval); 159 } 160 161 /* 162 * Get property values from the spa configuration. 163 */ 164 static void 165 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 166 { 167 uint64_t size; 168 uint64_t alloc; 169 uint64_t cap, version; 170 zprop_source_t src = ZPROP_SRC_NONE; 171 spa_config_dirent_t *dp; 172 173 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 174 175 if (spa->spa_root_vdev != NULL) { 176 alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 177 size = metaslab_class_get_space(spa_normal_class(spa)); 178 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 179 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 180 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 181 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 182 size - alloc, src); 183 184 cap = (size == 0) ? 
0 : (alloc * 100 / size); 185 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 186 187 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 188 ddt_get_pool_dedup_ratio(spa), src); 189 190 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 191 spa->spa_root_vdev->vdev_state, src); 192 193 version = spa_version(spa); 194 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 195 src = ZPROP_SRC_DEFAULT; 196 else 197 src = ZPROP_SRC_LOCAL; 198 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 199 } 200 201 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 202 203 if (spa->spa_root != NULL) 204 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 205 0, ZPROP_SRC_LOCAL); 206 207 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 208 if (dp->scd_path == NULL) { 209 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 210 "none", 0, ZPROP_SRC_LOCAL); 211 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 212 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 213 dp->scd_path, 0, ZPROP_SRC_LOCAL); 214 } 215 } 216 } 217 218 /* 219 * Get zpool property values. 220 */ 221 int 222 spa_prop_get(spa_t *spa, nvlist_t **nvp) 223 { 224 objset_t *mos = spa->spa_meta_objset; 225 zap_cursor_t zc; 226 zap_attribute_t za; 227 int err; 228 229 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 230 231 mutex_enter(&spa->spa_props_lock); 232 233 /* 234 * Get properties from the spa config. 235 */ 236 spa_prop_get_config(spa, nvp); 237 238 /* If no pool property object, no more prop to get. */ 239 if (spa->spa_pool_props_object == 0) { 240 mutex_exit(&spa->spa_props_lock); 241 return (0); 242 } 243 244 /* 245 * Get properties from the MOS pool property object. 246 */ 247 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 248 (err = zap_cursor_retrieve(&zc, &za)) == 0; 249 zap_cursor_advance(&zc)) { 250 uint64_t intval = 0; 251 char *strval = NULL; 252 zprop_source_t src = ZPROP_SRC_DEFAULT; 253 zpool_prop_t prop; 254 255 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 256 continue; 257 258 switch (za.za_integer_length) { 259 case 8: 260 /* integer property */ 261 if (za.za_first_integer != 262 zpool_prop_default_numeric(prop)) 263 src = ZPROP_SRC_LOCAL; 264 265 if (prop == ZPOOL_PROP_BOOTFS) { 266 dsl_pool_t *dp; 267 dsl_dataset_t *ds = NULL; 268 269 dp = spa_get_dsl(spa); 270 rw_enter(&dp->dp_config_rwlock, RW_READER); 271 if (err = dsl_dataset_hold_obj(dp, 272 za.za_first_integer, FTAG, &ds)) { 273 rw_exit(&dp->dp_config_rwlock); 274 break; 275 } 276 277 strval = kmem_alloc( 278 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 279 KM_SLEEP); 280 dsl_dataset_name(ds, strval); 281 dsl_dataset_rele(ds, FTAG); 282 rw_exit(&dp->dp_config_rwlock); 283 } else { 284 strval = NULL; 285 intval = za.za_first_integer; 286 } 287 288 spa_prop_add_list(*nvp, prop, strval, intval, src); 289 290 if (strval != NULL) 291 kmem_free(strval, 292 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 293 294 break; 295 296 case 1: 297 /* string property */ 298 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 299 err = zap_lookup(mos, spa->spa_pool_props_object, 300 za.za_name, 1, za.za_num_integers, strval); 301 if (err) { 302 kmem_free(strval, za.za_num_integers); 303 break; 304 } 305 spa_prop_add_list(*nvp, prop, strval, 0, src); 306 kmem_free(strval, za.za_num_integers); 307 break; 308 309 default: 310 break; 311 } 312 } 313 zap_cursor_fini(&zc); 314 mutex_exit(&spa->spa_props_lock); 315 out: 316 if (err && err != ENOENT) { 317 nvlist_free(*nvp); 318 *nvp = NULL; 319 
return (err); 320 } 321 322 return (0); 323 } 324 325 /* 326 * Validate the given pool properties nvlist and modify the list 327 * for the property values to be set. 328 */ 329 static int 330 spa_prop_validate(spa_t *spa, nvlist_t *props) 331 { 332 nvpair_t *elem; 333 int error = 0, reset_bootfs = 0; 334 uint64_t objnum; 335 336 elem = NULL; 337 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 338 zpool_prop_t prop; 339 char *propname, *strval; 340 uint64_t intval; 341 objset_t *os; 342 char *slash; 343 344 propname = nvpair_name(elem); 345 346 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 347 return (EINVAL); 348 349 switch (prop) { 350 case ZPOOL_PROP_VERSION: 351 error = nvpair_value_uint64(elem, &intval); 352 if (!error && 353 (intval < spa_version(spa) || intval > SPA_VERSION)) 354 error = EINVAL; 355 break; 356 357 case ZPOOL_PROP_DELEGATION: 358 case ZPOOL_PROP_AUTOREPLACE: 359 case ZPOOL_PROP_LISTSNAPS: 360 case ZPOOL_PROP_AUTOEXPAND: 361 error = nvpair_value_uint64(elem, &intval); 362 if (!error && intval > 1) 363 error = EINVAL; 364 break; 365 366 case ZPOOL_PROP_BOOTFS: 367 /* 368 * If the pool version is less than SPA_VERSION_BOOTFS, 369 * or the pool is still being created (version == 0), 370 * the bootfs property cannot be set. 371 */ 372 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 373 error = ENOTSUP; 374 break; 375 } 376 377 /* 378 * Make sure the vdev config is bootable 379 */ 380 if (!vdev_is_bootable(spa->spa_root_vdev)) { 381 error = ENOTSUP; 382 break; 383 } 384 385 reset_bootfs = 1; 386 387 error = nvpair_value_string(elem, &strval); 388 389 if (!error) { 390 uint64_t compress; 391 392 if (strval == NULL || strval[0] == '\0') { 393 objnum = zpool_prop_default_numeric( 394 ZPOOL_PROP_BOOTFS); 395 break; 396 } 397 398 if (error = dmu_objset_hold(strval, FTAG, &os)) 399 break; 400 401 /* Must be ZPL and not gzip compressed. */ 402 403 if (dmu_objset_type(os) != DMU_OST_ZFS) { 404 error = ENOTSUP; 405 } else if ((error = dsl_prop_get_integer(strval, 406 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 407 &compress, NULL)) == 0 && 408 !BOOTFS_COMPRESS_VALID(compress)) { 409 error = ENOTSUP; 410 } else { 411 objnum = dmu_objset_id(os); 412 } 413 dmu_objset_rele(os, FTAG); 414 } 415 break; 416 417 case ZPOOL_PROP_FAILUREMODE: 418 error = nvpair_value_uint64(elem, &intval); 419 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 420 intval > ZIO_FAILURE_MODE_PANIC)) 421 error = EINVAL; 422 423 /* 424 * This is a special case which only occurs when 425 * the pool has completely failed. This allows 426 * the user to change the in-core failmode property 427 * without syncing it out to disk (I/Os might 428 * currently be blocked). We do this by returning 429 * EIO to the caller (spa_prop_set) to trick it 430 * into thinking we encountered a property validation 431 * error. 
432 */ 433 if (!error && spa_suspended(spa)) { 434 spa->spa_failmode = intval; 435 error = EIO; 436 } 437 break; 438 439 case ZPOOL_PROP_CACHEFILE: 440 if ((error = nvpair_value_string(elem, &strval)) != 0) 441 break; 442 443 if (strval[0] == '\0') 444 break; 445 446 if (strcmp(strval, "none") == 0) 447 break; 448 449 if (strval[0] != '/') { 450 error = EINVAL; 451 break; 452 } 453 454 slash = strrchr(strval, '/'); 455 ASSERT(slash != NULL); 456 457 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 458 strcmp(slash, "/..") == 0) 459 error = EINVAL; 460 break; 461 462 case ZPOOL_PROP_DEDUPDITTO: 463 if (spa_version(spa) < SPA_VERSION_DEDUP) 464 error = ENOTSUP; 465 else 466 error = nvpair_value_uint64(elem, &intval); 467 if (error == 0 && 468 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 469 error = EINVAL; 470 break; 471 } 472 473 if (error) 474 break; 475 } 476 477 if (!error && reset_bootfs) { 478 error = nvlist_remove(props, 479 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 480 481 if (!error) { 482 error = nvlist_add_uint64(props, 483 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 484 } 485 } 486 487 return (error); 488 } 489 490 void 491 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 492 { 493 char *cachefile; 494 spa_config_dirent_t *dp; 495 496 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 497 &cachefile) != 0) 498 return; 499 500 dp = kmem_alloc(sizeof (spa_config_dirent_t), 501 KM_SLEEP); 502 503 if (cachefile[0] == '\0') 504 dp->scd_path = spa_strdup(spa_config_path); 505 else if (strcmp(cachefile, "none") == 0) 506 dp->scd_path = NULL; 507 else 508 dp->scd_path = spa_strdup(cachefile); 509 510 list_insert_head(&spa->spa_config_list, dp); 511 if (need_sync) 512 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 513 } 514 515 int 516 spa_prop_set(spa_t *spa, nvlist_t *nvp) 517 { 518 int error; 519 nvpair_t *elem; 520 boolean_t need_sync = B_FALSE; 521 zpool_prop_t prop; 522 523 if ((error = spa_prop_validate(spa, nvp)) != 0) 524 return (error); 525 526 elem = NULL; 527 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 528 if ((prop = zpool_name_to_prop( 529 nvpair_name(elem))) == ZPROP_INVAL) 530 return (EINVAL); 531 532 if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 533 continue; 534 535 need_sync = B_TRUE; 536 break; 537 } 538 539 if (need_sync) 540 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 541 spa, nvp, 3)); 542 else 543 return (0); 544 } 545 546 /* 547 * If the bootfs property value is dsobj, clear it. 
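 * Callers pass the dataset's object number along with an open tx, so this
 * runs in syncing context; the property is removed from the MOS pool-props
 * ZAP (see the zap_remove() below) rather than being rewritten as zero.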
548 */ 549 void 550 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 551 { 552 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 553 VERIFY(zap_remove(spa->spa_meta_objset, 554 spa->spa_pool_props_object, 555 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 556 spa->spa_bootfs = 0; 557 } 558 } 559 560 /* 561 * ========================================================================== 562 * SPA state manipulation (open/create/destroy/import/export) 563 * ========================================================================== 564 */ 565 566 static int 567 spa_error_entry_compare(const void *a, const void *b) 568 { 569 spa_error_entry_t *sa = (spa_error_entry_t *)a; 570 spa_error_entry_t *sb = (spa_error_entry_t *)b; 571 int ret; 572 573 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 574 sizeof (zbookmark_t)); 575 576 if (ret < 0) 577 return (-1); 578 else if (ret > 0) 579 return (1); 580 else 581 return (0); 582 } 583 584 /* 585 * Utility function which retrieves copies of the current logs and 586 * re-initializes them in the process. 587 */ 588 void 589 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 590 { 591 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 592 593 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 594 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 595 596 avl_create(&spa->spa_errlist_scrub, 597 spa_error_entry_compare, sizeof (spa_error_entry_t), 598 offsetof(spa_error_entry_t, se_avl)); 599 avl_create(&spa->spa_errlist_last, 600 spa_error_entry_compare, sizeof (spa_error_entry_t), 601 offsetof(spa_error_entry_t, se_avl)); 602 } 603 604 static taskq_t * 605 spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, 606 uint_t value) 607 { 608 uint_t flags = TASKQ_PREPOPULATE; 609 boolean_t batch = B_FALSE; 610 611 switch (mode) { 612 case zti_mode_null: 613 return (NULL); /* no taskq needed */ 614 615 case zti_mode_fixed: 616 ASSERT3U(value, >=, 1); 617 value = MAX(value, 1); 618 break; 619 620 case zti_mode_batch: 621 batch = B_TRUE; 622 flags |= TASKQ_THREADS_CPU_PCT; 623 value = zio_taskq_batch_pct; 624 break; 625 626 case zti_mode_online_percent: 627 flags |= TASKQ_THREADS_CPU_PCT; 628 break; 629 630 default: 631 panic("unrecognized mode for %s taskq (%u:%u) in " 632 "spa_activate()", 633 name, mode, value); 634 break; 635 } 636 637 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 638 if (batch) 639 flags |= TASKQ_DC_BATCH; 640 641 return (taskq_create_sysdc(name, value, 50, INT_MAX, 642 spa->spa_proc, zio_taskq_basedc, flags)); 643 } 644 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, 645 spa->spa_proc, flags)); 646 } 647 648 static void 649 spa_create_zio_taskqs(spa_t *spa) 650 { 651 for (int t = 0; t < ZIO_TYPES; t++) { 652 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 653 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 654 enum zti_modes mode = ztip->zti_mode; 655 uint_t value = ztip->zti_value; 656 char name[32]; 657 658 (void) snprintf(name, sizeof (name), 659 "%s_%s", zio_type_name[t], zio_taskq_types[q]); 660 661 spa->spa_zio_taskq[t][q] = 662 spa_taskq_create(spa, name, mode, value); 663 } 664 } 665 } 666 667 #ifdef _KERNEL 668 static void 669 spa_thread(void *arg) 670 { 671 callb_cpr_t cprinfo; 672 673 spa_t *spa = arg; 674 user_t *pu = PTOU(curproc); 675 676 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 677 spa->spa_name); 678 679 ASSERT(curproc != &p0); 680 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 681 "zpool-%s", spa->spa_name); 682 
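	/*
	 * With u_psargs and u_comm set this way, the covering process is
	 * visible as "zpool-<poolname>" in ps(1)/prstat output.
	 */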
(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 683 684 /* bind this thread to the requested psrset */ 685 if (zio_taskq_psrset_bind != PS_NONE) { 686 pool_lock(); 687 mutex_enter(&cpu_lock); 688 mutex_enter(&pidlock); 689 mutex_enter(&curproc->p_lock); 690 691 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 692 0, NULL, NULL) == 0) { 693 curthread->t_bind_pset = zio_taskq_psrset_bind; 694 } else { 695 cmn_err(CE_WARN, 696 "Couldn't bind process for zfs pool \"%s\" to " 697 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 698 } 699 700 mutex_exit(&curproc->p_lock); 701 mutex_exit(&pidlock); 702 mutex_exit(&cpu_lock); 703 pool_unlock(); 704 } 705 706 if (zio_taskq_sysdc) { 707 sysdc_thread_enter(curthread, 100, 0); 708 } 709 710 spa->spa_proc = curproc; 711 spa->spa_did = curthread->t_did; 712 713 spa_create_zio_taskqs(spa); 714 715 mutex_enter(&spa->spa_proc_lock); 716 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 717 718 spa->spa_proc_state = SPA_PROC_ACTIVE; 719 cv_broadcast(&spa->spa_proc_cv); 720 721 CALLB_CPR_SAFE_BEGIN(&cprinfo); 722 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 723 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 724 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 725 726 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 727 spa->spa_proc_state = SPA_PROC_GONE; 728 spa->spa_proc = &p0; 729 cv_broadcast(&spa->spa_proc_cv); 730 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 731 732 mutex_enter(&curproc->p_lock); 733 lwp_exit(); 734 } 735 #endif 736 737 /* 738 * Activate an uninitialized pool. 739 */ 740 static void 741 spa_activate(spa_t *spa, int mode) 742 { 743 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 744 745 spa->spa_state = POOL_STATE_ACTIVE; 746 spa->spa_mode = mode; 747 748 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 749 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 750 751 /* Try to create a covering process */ 752 mutex_enter(&spa->spa_proc_lock); 753 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 754 ASSERT(spa->spa_proc == &p0); 755 spa->spa_did = 0; 756 757 /* Only create a process if we're going to be around a while. */ 758 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 759 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 760 NULL, 0) == 0) { 761 spa->spa_proc_state = SPA_PROC_CREATED; 762 while (spa->spa_proc_state == SPA_PROC_CREATED) { 763 cv_wait(&spa->spa_proc_cv, 764 &spa->spa_proc_lock); 765 } 766 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 767 ASSERT(spa->spa_proc != &p0); 768 ASSERT(spa->spa_did != 0); 769 } else { 770 #ifdef _KERNEL 771 cmn_err(CE_WARN, 772 "Couldn't create process for zfs pool \"%s\"\n", 773 spa->spa_name); 774 #endif 775 } 776 } 777 mutex_exit(&spa->spa_proc_lock); 778 779 /* If we didn't create a process, we need to create our taskqs. 
*/ 780 if (spa->spa_proc == &p0) { 781 spa_create_zio_taskqs(spa); 782 } 783 784 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 785 offsetof(vdev_t, vdev_config_dirty_node)); 786 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 787 offsetof(vdev_t, vdev_state_dirty_node)); 788 789 txg_list_create(&spa->spa_vdev_txg_list, 790 offsetof(struct vdev, vdev_txg_node)); 791 792 avl_create(&spa->spa_errlist_scrub, 793 spa_error_entry_compare, sizeof (spa_error_entry_t), 794 offsetof(spa_error_entry_t, se_avl)); 795 avl_create(&spa->spa_errlist_last, 796 spa_error_entry_compare, sizeof (spa_error_entry_t), 797 offsetof(spa_error_entry_t, se_avl)); 798 } 799 800 /* 801 * Opposite of spa_activate(). 802 */ 803 static void 804 spa_deactivate(spa_t *spa) 805 { 806 ASSERT(spa->spa_sync_on == B_FALSE); 807 ASSERT(spa->spa_dsl_pool == NULL); 808 ASSERT(spa->spa_root_vdev == NULL); 809 ASSERT(spa->spa_async_zio_root == NULL); 810 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 811 812 txg_list_destroy(&spa->spa_vdev_txg_list); 813 814 list_destroy(&spa->spa_config_dirty_list); 815 list_destroy(&spa->spa_state_dirty_list); 816 817 for (int t = 0; t < ZIO_TYPES; t++) { 818 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 819 if (spa->spa_zio_taskq[t][q] != NULL) 820 taskq_destroy(spa->spa_zio_taskq[t][q]); 821 spa->spa_zio_taskq[t][q] = NULL; 822 } 823 } 824 825 metaslab_class_destroy(spa->spa_normal_class); 826 spa->spa_normal_class = NULL; 827 828 metaslab_class_destroy(spa->spa_log_class); 829 spa->spa_log_class = NULL; 830 831 /* 832 * If this was part of an import or the open otherwise failed, we may 833 * still have errors left in the queues. Empty them just in case. 834 */ 835 spa_errlog_drain(spa); 836 837 avl_destroy(&spa->spa_errlist_scrub); 838 avl_destroy(&spa->spa_errlist_last); 839 840 spa->spa_state = POOL_STATE_UNINITIALIZED; 841 842 mutex_enter(&spa->spa_proc_lock); 843 if (spa->spa_proc_state != SPA_PROC_NONE) { 844 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 845 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 846 cv_broadcast(&spa->spa_proc_cv); 847 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 848 ASSERT(spa->spa_proc != &p0); 849 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 850 } 851 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 852 spa->spa_proc_state = SPA_PROC_NONE; 853 } 854 ASSERT(spa->spa_proc == &p0); 855 mutex_exit(&spa->spa_proc_lock); 856 857 /* 858 * We want to make sure spa_thread() has actually exited the ZFS 859 * module, so that the module can't be unloaded out from underneath 860 * it. 861 */ 862 if (spa->spa_did != 0) { 863 thread_join(spa->spa_did); 864 spa->spa_did = 0; 865 } 866 } 867 868 /* 869 * Verify a pool configuration, and construct the vdev tree appropriately. This 870 * will create all the necessary vdevs in the appropriate layout, with each vdev 871 * in the CLOSED state. This will prep the pool before open/creation/import. 872 * All vdev validation is done by the vdev_alloc() routine. 
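 *
 * As an illustrative sketch (not a literal config), a single two-way mirror
 * parses into a tree of the following shape:
 *
 *	root vdev
 *	 `-- mirror			(interior vdev, id 0)
 *	      |-- disk c0t0d0s0		(leaf, id 0)
 *	      `-- disk c0t1d0s0		(leaf, id 1)
 *
 * The recursion below walks ZPOOL_CONFIG_CHILDREN to build the interior
 * nodes, stops at leaves (vdev_op_leaf), and frees the partial tree if any
 * child fails to parse.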
873 */ 874 static int 875 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 876 uint_t id, int atype) 877 { 878 nvlist_t **child; 879 uint_t children; 880 int error; 881 882 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 883 return (error); 884 885 if ((*vdp)->vdev_ops->vdev_op_leaf) 886 return (0); 887 888 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 889 &child, &children); 890 891 if (error == ENOENT) 892 return (0); 893 894 if (error) { 895 vdev_free(*vdp); 896 *vdp = NULL; 897 return (EINVAL); 898 } 899 900 for (int c = 0; c < children; c++) { 901 vdev_t *vd; 902 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 903 atype)) != 0) { 904 vdev_free(*vdp); 905 *vdp = NULL; 906 return (error); 907 } 908 } 909 910 ASSERT(*vdp != NULL); 911 912 return (0); 913 } 914 915 /* 916 * Opposite of spa_load(). 917 */ 918 static void 919 spa_unload(spa_t *spa) 920 { 921 int i; 922 923 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 924 925 /* 926 * Stop async tasks. 927 */ 928 spa_async_suspend(spa); 929 930 /* 931 * Stop syncing. 932 */ 933 if (spa->spa_sync_on) { 934 txg_sync_stop(spa->spa_dsl_pool); 935 spa->spa_sync_on = B_FALSE; 936 } 937 938 /* 939 * Wait for any outstanding async I/O to complete. 940 */ 941 if (spa->spa_async_zio_root != NULL) { 942 (void) zio_wait(spa->spa_async_zio_root); 943 spa->spa_async_zio_root = NULL; 944 } 945 946 /* 947 * Close the dsl pool. 948 */ 949 if (spa->spa_dsl_pool) { 950 dsl_pool_close(spa->spa_dsl_pool); 951 spa->spa_dsl_pool = NULL; 952 } 953 954 ddt_unload(spa); 955 956 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 957 958 /* 959 * Drop and purge level 2 cache 960 */ 961 spa_l2cache_drop(spa); 962 963 /* 964 * Close all vdevs. 965 */ 966 if (spa->spa_root_vdev) 967 vdev_free(spa->spa_root_vdev); 968 ASSERT(spa->spa_root_vdev == NULL); 969 970 for (i = 0; i < spa->spa_spares.sav_count; i++) 971 vdev_free(spa->spa_spares.sav_vdevs[i]); 972 if (spa->spa_spares.sav_vdevs) { 973 kmem_free(spa->spa_spares.sav_vdevs, 974 spa->spa_spares.sav_count * sizeof (void *)); 975 spa->spa_spares.sav_vdevs = NULL; 976 } 977 if (spa->spa_spares.sav_config) { 978 nvlist_free(spa->spa_spares.sav_config); 979 spa->spa_spares.sav_config = NULL; 980 } 981 spa->spa_spares.sav_count = 0; 982 983 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 984 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 985 if (spa->spa_l2cache.sav_vdevs) { 986 kmem_free(spa->spa_l2cache.sav_vdevs, 987 spa->spa_l2cache.sav_count * sizeof (void *)); 988 spa->spa_l2cache.sav_vdevs = NULL; 989 } 990 if (spa->spa_l2cache.sav_config) { 991 nvlist_free(spa->spa_l2cache.sav_config); 992 spa->spa_l2cache.sav_config = NULL; 993 } 994 spa->spa_l2cache.sav_count = 0; 995 996 spa->spa_async_suspended = 0; 997 998 spa_config_exit(spa, SCL_ALL, FTAG); 999 } 1000 1001 /* 1002 * Load (or re-load) the current list of vdevs describing the active spares for 1003 * this pool. When this is called, we have some form of basic information in 1004 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1005 * then re-generate a more complete list including status information. 1006 */ 1007 static void 1008 spa_load_spares(spa_t *spa) 1009 { 1010 nvlist_t **spares; 1011 uint_t nspares; 1012 int i; 1013 vdev_t *vd, *tvd; 1014 1015 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1016 1017 /* 1018 * First, close and free any existing spare vdevs. 
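	 * A spare that is currently spared in also has a twin in the active
	 * vdev tree; spa_lookup_by_guid() finds that twin so its isspare
	 * marking can be undone before this copy is freed.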
1019 */ 1020 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1021 vd = spa->spa_spares.sav_vdevs[i]; 1022 1023 /* Undo the call to spa_activate() below */ 1024 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1025 B_FALSE)) != NULL && tvd->vdev_isspare) 1026 spa_spare_remove(tvd); 1027 vdev_close(vd); 1028 vdev_free(vd); 1029 } 1030 1031 if (spa->spa_spares.sav_vdevs) 1032 kmem_free(spa->spa_spares.sav_vdevs, 1033 spa->spa_spares.sav_count * sizeof (void *)); 1034 1035 if (spa->spa_spares.sav_config == NULL) 1036 nspares = 0; 1037 else 1038 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1039 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1040 1041 spa->spa_spares.sav_count = (int)nspares; 1042 spa->spa_spares.sav_vdevs = NULL; 1043 1044 if (nspares == 0) 1045 return; 1046 1047 /* 1048 * Construct the array of vdevs, opening them to get status in the 1049 * process. For each spare, there is potentially two different vdev_t 1050 * structures associated with it: one in the list of spares (used only 1051 * for basic validation purposes) and one in the active vdev 1052 * configuration (if it's spared in). During this phase we open and 1053 * validate each vdev on the spare list. If the vdev also exists in the 1054 * active configuration, then we also mark this vdev as an active spare. 1055 */ 1056 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1057 KM_SLEEP); 1058 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1059 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1060 VDEV_ALLOC_SPARE) == 0); 1061 ASSERT(vd != NULL); 1062 1063 spa->spa_spares.sav_vdevs[i] = vd; 1064 1065 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1066 B_FALSE)) != NULL) { 1067 if (!tvd->vdev_isspare) 1068 spa_spare_add(tvd); 1069 1070 /* 1071 * We only mark the spare active if we were successfully 1072 * able to load the vdev. Otherwise, importing a pool 1073 * with a bad active spare would result in strange 1074 * behavior, because multiple pool would think the spare 1075 * is actively in use. 1076 * 1077 * There is a vulnerability here to an equally bizarre 1078 * circumstance, where a dead active spare is later 1079 * brought back to life (onlined or otherwise). Given 1080 * the rarity of this scenario, and the extra complexity 1081 * it adds, we ignore the possibility. 1082 */ 1083 if (!vdev_is_dead(tvd)) 1084 spa_spare_activate(tvd); 1085 } 1086 1087 vd->vdev_top = vd; 1088 vd->vdev_aux = &spa->spa_spares; 1089 1090 if (vdev_open(vd) != 0) 1091 continue; 1092 1093 if (vdev_validate_aux(vd) == 0) 1094 spa_spare_add(vd); 1095 } 1096 1097 /* 1098 * Recompute the stashed list of spares, with status information 1099 * this time. 1100 */ 1101 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1102 DATA_TYPE_NVLIST_ARRAY) == 0); 1103 1104 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1105 KM_SLEEP); 1106 for (i = 0; i < spa->spa_spares.sav_count; i++) 1107 spares[i] = vdev_config_generate(spa, 1108 spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 1109 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1110 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1111 for (i = 0; i < spa->spa_spares.sav_count; i++) 1112 nvlist_free(spares[i]); 1113 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1114 } 1115 1116 /* 1117 * Load (or re-load) the current list of vdevs describing the active l2cache for 1118 * this pool. 
When this is called, we have some form of basic information in 1119 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1120 * then re-generate a more complete list including status information. 1121 * Devices which are already active have their details maintained, and are 1122 * not re-opened. 1123 */ 1124 static void 1125 spa_load_l2cache(spa_t *spa) 1126 { 1127 nvlist_t **l2cache; 1128 uint_t nl2cache; 1129 int i, j, oldnvdevs; 1130 uint64_t guid; 1131 vdev_t *vd, **oldvdevs, **newvdevs; 1132 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1133 1134 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1135 1136 if (sav->sav_config != NULL) { 1137 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1138 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1139 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1140 } else { 1141 nl2cache = 0; 1142 } 1143 1144 oldvdevs = sav->sav_vdevs; 1145 oldnvdevs = sav->sav_count; 1146 sav->sav_vdevs = NULL; 1147 sav->sav_count = 0; 1148 1149 /* 1150 * Process new nvlist of vdevs. 1151 */ 1152 for (i = 0; i < nl2cache; i++) { 1153 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1154 &guid) == 0); 1155 1156 newvdevs[i] = NULL; 1157 for (j = 0; j < oldnvdevs; j++) { 1158 vd = oldvdevs[j]; 1159 if (vd != NULL && guid == vd->vdev_guid) { 1160 /* 1161 * Retain previous vdev for add/remove ops. 1162 */ 1163 newvdevs[i] = vd; 1164 oldvdevs[j] = NULL; 1165 break; 1166 } 1167 } 1168 1169 if (newvdevs[i] == NULL) { 1170 /* 1171 * Create new vdev 1172 */ 1173 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1174 VDEV_ALLOC_L2CACHE) == 0); 1175 ASSERT(vd != NULL); 1176 newvdevs[i] = vd; 1177 1178 /* 1179 * Commit this vdev as an l2cache device, 1180 * even if it fails to open. 1181 */ 1182 spa_l2cache_add(vd); 1183 1184 vd->vdev_top = vd; 1185 vd->vdev_aux = sav; 1186 1187 spa_l2cache_activate(vd); 1188 1189 if (vdev_open(vd) != 0) 1190 continue; 1191 1192 (void) vdev_validate_aux(vd); 1193 1194 if (!vdev_is_dead(vd)) 1195 l2arc_add_vdev(spa, vd); 1196 } 1197 } 1198 1199 /* 1200 * Purge vdevs that were dropped 1201 */ 1202 for (i = 0; i < oldnvdevs; i++) { 1203 uint64_t pool; 1204 1205 vd = oldvdevs[i]; 1206 if (vd != NULL) { 1207 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1208 pool != 0ULL && l2arc_vdev_present(vd)) 1209 l2arc_remove_vdev(vd); 1210 (void) vdev_close(vd); 1211 spa_l2cache_remove(vd); 1212 } 1213 } 1214 1215 if (oldvdevs) 1216 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1217 1218 if (sav->sav_config == NULL) 1219 goto out; 1220 1221 sav->sav_vdevs = newvdevs; 1222 sav->sav_count = (int)nl2cache; 1223 1224 /* 1225 * Recompute the stashed list of l2cache devices, with status 1226 * information this time. 
1227 */ 1228 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1229 DATA_TYPE_NVLIST_ARRAY) == 0); 1230 1231 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1232 for (i = 0; i < sav->sav_count; i++) 1233 l2cache[i] = vdev_config_generate(spa, 1234 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 1235 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1236 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1237 out: 1238 for (i = 0; i < sav->sav_count; i++) 1239 nvlist_free(l2cache[i]); 1240 if (sav->sav_count) 1241 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1242 } 1243 1244 static int 1245 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1246 { 1247 dmu_buf_t *db; 1248 char *packed = NULL; 1249 size_t nvsize = 0; 1250 int error; 1251 *value = NULL; 1252 1253 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1254 nvsize = *(uint64_t *)db->db_data; 1255 dmu_buf_rele(db, FTAG); 1256 1257 packed = kmem_alloc(nvsize, KM_SLEEP); 1258 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1259 DMU_READ_PREFETCH); 1260 if (error == 0) 1261 error = nvlist_unpack(packed, nvsize, value, 0); 1262 kmem_free(packed, nvsize); 1263 1264 return (error); 1265 } 1266 1267 /* 1268 * Checks to see if the given vdev could not be opened, in which case we post a 1269 * sysevent to notify the autoreplace code that the device has been removed. 1270 */ 1271 static void 1272 spa_check_removed(vdev_t *vd) 1273 { 1274 for (int c = 0; c < vd->vdev_children; c++) 1275 spa_check_removed(vd->vdev_child[c]); 1276 1277 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1278 zfs_post_autoreplace(vd->vdev_spa, vd); 1279 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1280 } 1281 } 1282 1283 /* 1284 * Load the slog device state from the config object since it's possible 1285 * that the label does not contain the most up-to-date information. 1286 */ 1287 void 1288 spa_load_log_state(spa_t *spa, nvlist_t *nv) 1289 { 1290 vdev_t *ovd, *rvd = spa->spa_root_vdev; 1291 1292 /* 1293 * Load the original root vdev tree from the passed config. 
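	 * The temporary tree (ovd) mirrors the live root vdev (rvd) child
	 * for child, so each log top-level vdev can copy its state from the
	 * passed-in config before the temporary tree is freed again.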
1294 */ 1295 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1296 VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1297 1298 for (int c = 0; c < rvd->vdev_children; c++) { 1299 vdev_t *cvd = rvd->vdev_child[c]; 1300 if (cvd->vdev_islog) 1301 vdev_load_log_state(cvd, ovd->vdev_child[c]); 1302 } 1303 vdev_free(ovd); 1304 spa_config_exit(spa, SCL_ALL, FTAG); 1305 } 1306 1307 /* 1308 * Check for missing log devices 1309 */ 1310 int 1311 spa_check_logs(spa_t *spa) 1312 { 1313 switch (spa->spa_log_state) { 1314 case SPA_LOG_MISSING: 1315 /* need to recheck in case slog has been restored */ 1316 case SPA_LOG_UNKNOWN: 1317 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1318 DS_FIND_CHILDREN)) { 1319 spa_set_log_state(spa, SPA_LOG_MISSING); 1320 return (1); 1321 } 1322 break; 1323 } 1324 return (0); 1325 } 1326 1327 static boolean_t 1328 spa_passivate_log(spa_t *spa) 1329 { 1330 vdev_t *rvd = spa->spa_root_vdev; 1331 boolean_t slog_found = B_FALSE; 1332 1333 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1334 1335 if (!spa_has_slogs(spa)) 1336 return (B_FALSE); 1337 1338 for (int c = 0; c < rvd->vdev_children; c++) { 1339 vdev_t *tvd = rvd->vdev_child[c]; 1340 metaslab_group_t *mg = tvd->vdev_mg; 1341 1342 if (tvd->vdev_islog) { 1343 metaslab_group_passivate(mg); 1344 slog_found = B_TRUE; 1345 } 1346 } 1347 1348 return (slog_found); 1349 } 1350 1351 static void 1352 spa_activate_log(spa_t *spa) 1353 { 1354 vdev_t *rvd = spa->spa_root_vdev; 1355 1356 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1357 1358 for (int c = 0; c < rvd->vdev_children; c++) { 1359 vdev_t *tvd = rvd->vdev_child[c]; 1360 metaslab_group_t *mg = tvd->vdev_mg; 1361 1362 if (tvd->vdev_islog) 1363 metaslab_group_activate(mg); 1364 } 1365 } 1366 1367 int 1368 spa_offline_log(spa_t *spa) 1369 { 1370 int error = 0; 1371 1372 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1373 NULL, DS_FIND_CHILDREN)) == 0) { 1374 1375 /* 1376 * We successfully offlined the log device, sync out the 1377 * current txg so that the "stubby" block can be removed 1378 * by zil_sync(). 
1379 */ 1380 txg_wait_synced(spa->spa_dsl_pool, 0); 1381 } 1382 return (error); 1383 } 1384 1385 static void 1386 spa_aux_check_removed(spa_aux_vdev_t *sav) 1387 { 1388 for (int i = 0; i < sav->sav_count; i++) 1389 spa_check_removed(sav->sav_vdevs[i]); 1390 } 1391 1392 void 1393 spa_claim_notify(zio_t *zio) 1394 { 1395 spa_t *spa = zio->io_spa; 1396 1397 if (zio->io_error) 1398 return; 1399 1400 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1401 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1402 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1403 mutex_exit(&spa->spa_props_lock); 1404 } 1405 1406 typedef struct spa_load_error { 1407 uint64_t sle_metadata_count; 1408 uint64_t sle_data_count; 1409 } spa_load_error_t; 1410 1411 static void 1412 spa_load_verify_done(zio_t *zio) 1413 { 1414 blkptr_t *bp = zio->io_bp; 1415 spa_load_error_t *sle = zio->io_private; 1416 dmu_object_type_t type = BP_GET_TYPE(bp); 1417 int error = zio->io_error; 1418 1419 if (error) { 1420 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && 1421 type != DMU_OT_INTENT_LOG) 1422 atomic_add_64(&sle->sle_metadata_count, 1); 1423 else 1424 atomic_add_64(&sle->sle_data_count, 1); 1425 } 1426 zio_data_buf_free(zio->io_data, zio->io_size); 1427 } 1428 1429 /*ARGSUSED*/ 1430 static int 1431 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1432 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1433 { 1434 if (bp != NULL) { 1435 zio_t *rio = arg; 1436 size_t size = BP_GET_PSIZE(bp); 1437 void *data = zio_data_buf_alloc(size); 1438 1439 zio_nowait(zio_read(rio, spa, bp, data, size, 1440 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1441 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1442 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1443 } 1444 return (0); 1445 } 1446 1447 static int 1448 spa_load_verify(spa_t *spa) 1449 { 1450 zio_t *rio; 1451 spa_load_error_t sle = { 0 }; 1452 zpool_rewind_policy_t policy; 1453 boolean_t verify_ok = B_FALSE; 1454 int error; 1455 1456 rio = zio_root(spa, NULL, &sle, 1457 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1458 1459 error = traverse_pool(spa, spa->spa_verify_min_txg, 1460 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1461 1462 (void) zio_wait(rio); 1463 1464 zpool_get_rewind_policy(spa->spa_config, &policy); 1465 1466 spa->spa_load_meta_errors = sle.sle_metadata_count; 1467 spa->spa_load_data_errors = sle.sle_data_count; 1468 1469 if (!error && sle.sle_metadata_count <= policy.zrp_maxmeta && 1470 sle.sle_data_count <= policy.zrp_maxdata) { 1471 verify_ok = B_TRUE; 1472 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1473 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1474 } else { 1475 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1476 } 1477 1478 if (error) { 1479 if (error != ENXIO && error != EIO) 1480 error = EIO; 1481 return (error); 1482 } 1483 1484 return (verify_ok ? 0 : EIO); 1485 } 1486 1487 /* 1488 * Find a value in the pool props object. 1489 */ 1490 static void 1491 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1492 { 1493 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1494 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1495 } 1496 1497 /* 1498 * Find a value in the pool directory object. 
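 * For example, spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object)
 * (used in spa_load_impl() below) fetches the object number of the packed
 * config nvlist; ENOENT simply means the entry is absent, which is
 * legitimate for features an older pool never wrote.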
1499 */ 1500 static int 1501 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1502 { 1503 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1504 name, sizeof (uint64_t), 1, val)); 1505 } 1506 1507 static int 1508 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1509 { 1510 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1511 return (err); 1512 } 1513 1514 /* 1515 * Fix up config after a partly-completed split. This is done with the 1516 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1517 * pool have that entry in their config, but only the splitting one contains 1518 * a list of all the guids of the vdevs that are being split off. 1519 * 1520 * This function determines what to do with that list: either rejoin 1521 * all the disks to the pool, or complete the splitting process. To attempt 1522 * the rejoin, each disk that is offlined is marked online again, and 1523 * we do a reopen() call. If the vdev label for every disk that was 1524 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1525 * then we call vdev_split() on each disk, and complete the split. 1526 * 1527 * Otherwise we leave the config alone, with all the vdevs in place in 1528 * the original pool. 1529 */ 1530 static void 1531 spa_try_repair(spa_t *spa, nvlist_t *config) 1532 { 1533 uint_t extracted; 1534 uint64_t *glist; 1535 uint_t i, gcount; 1536 nvlist_t *nvl; 1537 vdev_t **vd; 1538 boolean_t attempt_reopen; 1539 1540 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1541 return; 1542 1543 /* check that the config is complete */ 1544 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1545 &glist, &gcount) != 0) 1546 return; 1547 1548 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1549 1550 /* attempt to online all the vdevs & validate */ 1551 attempt_reopen = B_TRUE; 1552 for (i = 0; i < gcount; i++) { 1553 if (glist[i] == 0) /* vdev is hole */ 1554 continue; 1555 1556 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1557 if (vd[i] == NULL) { 1558 /* 1559 * Don't bother attempting to reopen the disks; 1560 * just do the split. 1561 */ 1562 attempt_reopen = B_FALSE; 1563 } else { 1564 /* attempt to re-online it */ 1565 vd[i]->vdev_offline = B_FALSE; 1566 } 1567 } 1568 1569 if (attempt_reopen) { 1570 vdev_reopen(spa->spa_root_vdev); 1571 1572 /* check each device to see what state it's in */ 1573 for (extracted = 0, i = 0; i < gcount; i++) { 1574 if (vd[i] != NULL && 1575 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1576 break; 1577 ++extracted; 1578 } 1579 } 1580 1581 /* 1582 * If every disk has been moved to the new pool, or if we never 1583 * even attempted to look at them, then we split them off for 1584 * good. 1585 */ 1586 if (!attempt_reopen || gcount == extracted) { 1587 for (i = 0; i < gcount; i++) 1588 if (vd[i] != NULL) 1589 vdev_split(vd[i]); 1590 vdev_reopen(spa->spa_root_vdev); 1591 } 1592 1593 kmem_free(vd, gcount * sizeof (vdev_t *)); 1594 } 1595 1596 static int 1597 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1598 boolean_t mosconfig) 1599 { 1600 nvlist_t *config = spa->spa_config; 1601 char *ereport = FM_EREPORT_ZFS_POOL; 1602 int error; 1603 uint64_t pool_guid; 1604 nvlist_t *nvl; 1605 1606 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1607 return (EINVAL); 1608 1609 /* 1610 * Versioning wasn't explicitly added to the label until later, so if 1611 * it's not present treat it as the initial version. 
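	 * (SPA_VERSION_INITIAL is the oldest supported on-disk version, so
	 * labels written before ZPOOL_CONFIG_VERSION existed remain usable.)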
1612 */ 1613 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1614 &spa->spa_ubsync.ub_version) != 0) 1615 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1616 1617 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1618 &spa->spa_config_txg); 1619 1620 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1621 spa_guid_exists(pool_guid, 0)) { 1622 error = EEXIST; 1623 } else { 1624 spa->spa_load_guid = pool_guid; 1625 1626 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1627 &nvl) == 0) { 1628 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1629 KM_SLEEP) == 0); 1630 } 1631 1632 error = spa_load_impl(spa, pool_guid, config, state, type, 1633 mosconfig, &ereport); 1634 } 1635 1636 spa->spa_minref = refcount_count(&spa->spa_refcount); 1637 if (error && error != EBADF) 1638 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1639 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1640 spa->spa_ena = 0; 1641 1642 return (error); 1643 } 1644 1645 /* 1646 * Load an existing storage pool, using the pool's builtin spa_config as a 1647 * source of configuration information. 1648 */ 1649 static int 1650 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1651 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1652 char **ereport) 1653 { 1654 int error = 0; 1655 nvlist_t *nvconfig, *nvroot = NULL; 1656 vdev_t *rvd; 1657 uberblock_t *ub = &spa->spa_uberblock; 1658 uint64_t config_cache_txg = spa->spa_config_txg; 1659 int orig_mode = spa->spa_mode; 1660 int parse; 1661 1662 /* 1663 * If this is an untrusted config, access the pool in read-only mode. 1664 * This prevents things like resilvering recently removed devices. 1665 */ 1666 if (!mosconfig) 1667 spa->spa_mode = FREAD; 1668 1669 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1670 1671 spa->spa_load_state = state; 1672 1673 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1674 return (EINVAL); 1675 1676 parse = (type == SPA_IMPORT_EXISTING ? 1677 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1678 1679 /* 1680 * Create "The Godfather" zio to hold all async IOs 1681 */ 1682 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1683 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1684 1685 /* 1686 * Parse the configuration into a vdev tree. We explicitly set the 1687 * value that will be returned by spa_version() since parsing the 1688 * configuration requires knowing the version number. 1689 */ 1690 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1691 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 1692 spa_config_exit(spa, SCL_ALL, FTAG); 1693 1694 if (error != 0) 1695 return (error); 1696 1697 ASSERT(spa->spa_root_vdev == rvd); 1698 1699 if (type != SPA_IMPORT_ASSEMBLE) { 1700 ASSERT(spa_guid(spa) == pool_guid); 1701 } 1702 1703 /* 1704 * Try to open all vdevs, loading each label in the process. 1705 */ 1706 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1707 error = vdev_open(rvd); 1708 spa_config_exit(spa, SCL_ALL, FTAG); 1709 if (error != 0) 1710 return (error); 1711 1712 /* 1713 * We need to validate the vdev labels against the configuration that 1714 * we have in hand, which is dependent on the setting of mosconfig. If 1715 * mosconfig is true then we're validating the vdev labels based on 1716 * that config. Otherwise, we're validating against the cached config 1717 * (zpool.cache) that was read when we loaded the zfs module, and then 1718 * later we will recursively call spa_load() and validate against 1719 * the vdev config. 
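	 * (That second pass is the spa_load() call at the end of the
	 * !mosconfig block below, made after spa_config_set() installs the
	 * config that was read back from the MOS.)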
1720 * 1721 * If we're assembling a new pool that's been split off from an 1722 * existing pool, the labels haven't yet been updated so we skip 1723 * validation for now. 1724 */ 1725 if (type != SPA_IMPORT_ASSEMBLE) { 1726 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1727 error = vdev_validate(rvd); 1728 spa_config_exit(spa, SCL_ALL, FTAG); 1729 1730 if (error != 0) 1731 return (error); 1732 1733 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1734 return (ENXIO); 1735 } 1736 1737 /* 1738 * Find the best uberblock. 1739 */ 1740 vdev_uberblock_load(NULL, rvd, ub); 1741 1742 /* 1743 * If we weren't able to find a single valid uberblock, return failure. 1744 */ 1745 if (ub->ub_txg == 0) 1746 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1747 1748 /* 1749 * If the pool is newer than the code, we can't open it. 1750 */ 1751 if (ub->ub_version > SPA_VERSION) 1752 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1753 1754 /* 1755 * If the vdev guid sum doesn't match the uberblock, we have an 1756 * incomplete configuration. 1757 */ 1758 if (mosconfig && type != SPA_IMPORT_ASSEMBLE && 1759 rvd->vdev_guid_sum != ub->ub_guid_sum) 1760 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1761 1762 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1763 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1764 spa_try_repair(spa, config); 1765 spa_config_exit(spa, SCL_ALL, FTAG); 1766 nvlist_free(spa->spa_config_splitting); 1767 spa->spa_config_splitting = NULL; 1768 } 1769 1770 /* 1771 * Initialize internal SPA structures. 1772 */ 1773 spa->spa_state = POOL_STATE_ACTIVE; 1774 spa->spa_ubsync = spa->spa_uberblock; 1775 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 1776 TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE; 1777 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 1778 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 1779 spa->spa_claim_max_txg = spa->spa_first_txg; 1780 1781 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1782 if (error) 1783 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1784 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1785 1786 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 1787 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1788 1789 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 1790 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1791 1792 if (!mosconfig) { 1793 uint64_t hostid; 1794 1795 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 1796 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1797 char *hostname; 1798 unsigned long myhostid = 0; 1799 1800 VERIFY(nvlist_lookup_string(nvconfig, 1801 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1802 1803 #ifdef _KERNEL 1804 myhostid = zone_get_hostid(NULL); 1805 #else /* _KERNEL */ 1806 /* 1807 * We're emulating the system's hostid in userland, so 1808 * we can't use zone_get_hostid(). 1809 */ 1810 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1811 #endif /* _KERNEL */ 1812 if (hostid != 0 && myhostid != 0 && 1813 hostid != myhostid) { 1814 cmn_err(CE_WARN, "pool '%s' could not be " 1815 "loaded as it was last accessed by " 1816 "another system (host: %s hostid: 0x%lx). 
" 1817 "See: http://www.sun.com/msg/ZFS-8000-EY", 1818 spa_name(spa), hostname, 1819 (unsigned long)hostid); 1820 return (EBADF); 1821 } 1822 } 1823 1824 spa_config_set(spa, nvconfig); 1825 spa_unload(spa); 1826 spa_deactivate(spa); 1827 spa_activate(spa, orig_mode); 1828 1829 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 1830 } 1831 1832 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, 1833 &spa->spa_deferred_bplist_obj) != 0) 1834 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1835 1836 /* 1837 * Load the bit that tells us to use the new accounting function 1838 * (raid-z deflation). If we have an older pool, this will not 1839 * be present. 1840 */ 1841 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 1842 if (error != 0 && error != ENOENT) 1843 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1844 1845 /* 1846 * Load the persistent error log. If we have an older pool, this will 1847 * not be present. 1848 */ 1849 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 1850 if (error != 0 && error != ENOENT) 1851 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1852 1853 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 1854 &spa->spa_errlog_scrub); 1855 if (error != 0 && error != ENOENT) 1856 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1857 1858 /* 1859 * Load the history object. If we have an older pool, this 1860 * will not be present. 1861 */ 1862 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 1863 if (error != 0 && error != ENOENT) 1864 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1865 1866 /* 1867 * If we're assembling the pool from the split-off vdevs of 1868 * an existing pool, we don't want to attach the spares & cache 1869 * devices. 1870 */ 1871 1872 /* 1873 * Load any hot spares for this pool. 1874 */ 1875 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 1876 if (error != 0 && error != ENOENT) 1877 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1878 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 1879 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1880 if (load_nvlist(spa, spa->spa_spares.sav_object, 1881 &spa->spa_spares.sav_config) != 0) 1882 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1883 1884 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1885 spa_load_spares(spa); 1886 spa_config_exit(spa, SCL_ALL, FTAG); 1887 } else if (error == 0) { 1888 spa->spa_spares.sav_sync = B_TRUE; 1889 } 1890 1891 /* 1892 * Load any level 2 ARC devices for this pool. 
1893 */ 1894 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 1895 &spa->spa_l2cache.sav_object); 1896 if (error != 0 && error != ENOENT) 1897 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1898 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 1899 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1900 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1901 &spa->spa_l2cache.sav_config) != 0) 1902 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1903 1904 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1905 spa_load_l2cache(spa); 1906 spa_config_exit(spa, SCL_ALL, FTAG); 1907 } else if (error == 0) { 1908 spa->spa_l2cache.sav_sync = B_TRUE; 1909 } 1910 1911 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1912 1913 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 1914 if (error && error != ENOENT) 1915 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1916 1917 if (error == 0) { 1918 uint64_t autoreplace; 1919 1920 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 1921 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 1922 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 1923 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 1924 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 1925 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 1926 &spa->spa_dedup_ditto); 1927 1928 spa->spa_autoreplace = (autoreplace != 0); 1929 } 1930 1931 /* 1932 * If the 'autoreplace' property is set, then post a resource notifying 1933 * the ZFS DE that it should not issue any faults for unopenable 1934 * devices. We also iterate over the vdevs, and post a sysevent for any 1935 * unopenable vdevs so that the normal autoreplace handler can take 1936 * over. 1937 */ 1938 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 1939 spa_check_removed(spa->spa_root_vdev); 1940 /* 1941 * For the import case, this is done in spa_import(), because 1942 * at this point we're using the spare definitions from 1943 * the MOS config, not necessarily from the userland config. 1944 */ 1945 if (state != SPA_LOAD_IMPORT) { 1946 spa_aux_check_removed(&spa->spa_spares); 1947 spa_aux_check_removed(&spa->spa_l2cache); 1948 } 1949 } 1950 1951 /* 1952 * Load the vdev state for all toplevel vdevs. 1953 */ 1954 vdev_load(rvd); 1955 1956 /* 1957 * Propagate the leaf DTLs we just loaded all the way up the tree. 1958 */ 1959 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1960 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1961 spa_config_exit(spa, SCL_ALL, FTAG); 1962 1963 /* 1964 * Check the state of the root vdev. If it can't be opened, it 1965 * indicates one or more toplevel vdevs are faulted. 1966 */ 1967 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1968 return (ENXIO); 1969 1970 /* 1971 * Load the DDTs (dedup tables). 1972 */ 1973 error = ddt_load(spa); 1974 if (error != 0) 1975 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1976 1977 spa_update_dspace(spa); 1978 1979 if (state != SPA_LOAD_TRYIMPORT) { 1980 error = spa_load_verify(spa); 1981 if (error) 1982 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 1983 error)); 1984 } 1985 1986 /* 1987 * Load the intent log state and check log integrity. If we're 1988 * assembling a pool from a split, the log is not transferred over. 
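	 * spa_check_logs() walks every dataset's ZIL chain (via
	 * zil_check_log_chain()); if a chain is broken the ereport becomes
	 * FM_EREPORT_ZFS_LOG_REPLAY and the load fails with
	 * VDEV_AUX_BAD_LOG.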
1989 */ 1990 if (type != SPA_IMPORT_ASSEMBLE) { 1991 VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, 1992 &nvroot) == 0); 1993 spa_load_log_state(spa, nvroot); 1994 nvlist_free(nvconfig); 1995 1996 if (spa_check_logs(spa)) { 1997 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 1998 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 1999 } 2000 } 2001 2002 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2003 spa->spa_load_max_txg == UINT64_MAX)) { 2004 dmu_tx_t *tx; 2005 int need_update = B_FALSE; 2006 2007 ASSERT(state != SPA_LOAD_TRYIMPORT); 2008 2009 /* 2010 * Claim log blocks that haven't been committed yet. 2011 * This must all happen in a single txg. 2012 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2013 * invoked from zil_claim_log_block()'s i/o done callback. 2014 * Price of rollback is that we abandon the log. 2015 */ 2016 spa->spa_claiming = B_TRUE; 2017 2018 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2019 spa_first_txg(spa)); 2020 (void) dmu_objset_find(spa_name(spa), 2021 zil_claim, tx, DS_FIND_CHILDREN); 2022 dmu_tx_commit(tx); 2023 2024 spa->spa_claiming = B_FALSE; 2025 2026 spa_set_log_state(spa, SPA_LOG_GOOD); 2027 spa->spa_sync_on = B_TRUE; 2028 txg_sync_start(spa->spa_dsl_pool); 2029 2030 /* 2031 * Wait for all claims to sync. We sync up to the highest 2032 * claimed log block birth time so that claimed log blocks 2033 * don't appear to be from the future. spa_claim_max_txg 2034 * will have been set for us by either zil_check_log_chain() 2035 * (invoked from spa_check_logs()) or zil_claim() above. 2036 */ 2037 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2038 2039 /* 2040 * If the config cache is stale, or we have uninitialized 2041 * metaslabs (see spa_vdev_add()), then update the config. 2042 * 2043 * If spa_load_verbatim is true, trust the current 2044 * in-core spa_config and update the disk labels. 2045 */ 2046 if (config_cache_txg != spa->spa_config_txg || 2047 state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || 2048 state == SPA_LOAD_RECOVER) 2049 need_update = B_TRUE; 2050 2051 for (int c = 0; c < rvd->vdev_children; c++) 2052 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2053 need_update = B_TRUE; 2054 2055 /* 2056 * Update the config cache asynchronously in case we're the 2057 * root pool, in which case the config cache isn't writable yet. 2058 */ 2059 if (need_update) 2060 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2061 2062 /* 2063 * Check all DTLs to see if anything needs resilvering. 2064 */ 2065 if (vdev_resilver_needed(rvd, NULL, NULL)) 2066 spa_async_request(spa, SPA_ASYNC_RESILVER); 2067 2068 /* 2069 * Delete any inconsistent datasets. 2070 */ 2071 (void) dmu_objset_find(spa_name(spa), 2072 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2073 2074 /* 2075 * Clean up any stale temporary dataset userrefs. 
2076 */ 2077 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2078 } 2079 2080 return (0); 2081 } 2082 2083 static int 2084 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2085 { 2086 spa_unload(spa); 2087 spa_deactivate(spa); 2088 2089 spa->spa_load_max_txg--; 2090 2091 spa_activate(spa, spa_mode_global); 2092 spa_async_suspend(spa); 2093 2094 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2095 } 2096 2097 static int 2098 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2099 uint64_t max_request, boolean_t extreme) 2100 { 2101 nvlist_t *config = NULL; 2102 int load_error, rewind_error; 2103 uint64_t safe_rollback_txg; 2104 uint64_t min_txg; 2105 2106 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2107 spa->spa_load_max_txg = spa->spa_load_txg; 2108 spa_set_log_state(spa, SPA_LOG_CLEAR); 2109 } else { 2110 spa->spa_load_max_txg = max_request; 2111 } 2112 2113 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2114 mosconfig); 2115 if (load_error == 0) 2116 return (0); 2117 2118 if (spa->spa_root_vdev != NULL) 2119 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2120 2121 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2122 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2123 2124 /* specific txg requested */ 2125 if (spa->spa_load_max_txg != UINT64_MAX && !extreme) { 2126 nvlist_free(config); 2127 return (load_error); 2128 } 2129 2130 /* Price of rolling back is discarding txgs, including log */ 2131 if (state == SPA_LOAD_RECOVER) 2132 spa_set_log_state(spa, SPA_LOG_CLEAR); 2133 2134 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2135 safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE; 2136 2137 min_txg = extreme ? TXG_INITIAL : safe_rollback_txg; 2138 while (rewind_error && (spa->spa_uberblock.ub_txg >= min_txg)) { 2139 if (spa->spa_load_max_txg < safe_rollback_txg) 2140 spa->spa_extreme_rewind = B_TRUE; 2141 rewind_error = spa_load_retry(spa, state, mosconfig); 2142 } 2143 2144 if (config) 2145 spa_rewind_data_to_nvlist(spa, config); 2146 2147 spa->spa_extreme_rewind = B_FALSE; 2148 spa->spa_load_max_txg = UINT64_MAX; 2149 2150 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2151 spa_config_set(spa, config); 2152 2153 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); 2154 } 2155 2156 /* 2157 * Pool Open/Import 2158 * 2159 * The import case is identical to an open except that the configuration is sent 2160 * down from userland, instead of grabbed from the configuration cache. For the 2161 * case of an open, the pool configuration will exist in the 2162 * POOL_STATE_UNINITIALIZED state. 2163 * 2164 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2165 * the same time open the pool, without having to keep around the spa_t in some 2166 * ambiguous state. 
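 *
 * As a rough usage sketch (the pool name, the FTAG-style tag, and the
 * error handling below are placeholders for illustration only), a caller
 * typically brackets its work with spa_open() and spa_close():
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open("tank", &spa, FTAG)) != 0)
 *		return (error);
 *	(use the open pool, e.g. spa_get_errlog_size(spa))
 *	spa_close(spa, FTAG);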
2167 */ 2168 static int 2169 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2170 nvlist_t **config) 2171 { 2172 spa_t *spa; 2173 boolean_t norewind; 2174 boolean_t extreme; 2175 zpool_rewind_policy_t policy; 2176 spa_load_state_t state = SPA_LOAD_OPEN; 2177 int error; 2178 int locked = B_FALSE; 2179 2180 *spapp = NULL; 2181 2182 zpool_get_rewind_policy(nvpolicy, &policy); 2183 if (policy.zrp_request & ZPOOL_DO_REWIND) 2184 state = SPA_LOAD_RECOVER; 2185 norewind = (policy.zrp_request == ZPOOL_NO_REWIND); 2186 extreme = ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0); 2187 2188 /* 2189 * As disgusting as this is, we need to support recursive calls to this 2190 * function because dsl_dir_open() is called during spa_load(), and ends 2191 * up calling spa_open() again. The real fix is to figure out how to 2192 * avoid dsl_dir_open() calling this in the first place. 2193 */ 2194 if (mutex_owner(&spa_namespace_lock) != curthread) { 2195 mutex_enter(&spa_namespace_lock); 2196 locked = B_TRUE; 2197 } 2198 2199 if ((spa = spa_lookup(pool)) == NULL) { 2200 if (locked) 2201 mutex_exit(&spa_namespace_lock); 2202 return (ENOENT); 2203 } 2204 2205 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2206 2207 spa_activate(spa, spa_mode_global); 2208 2209 if (spa->spa_last_open_failed && norewind) { 2210 if (config != NULL && spa->spa_config) 2211 VERIFY(nvlist_dup(spa->spa_config, 2212 config, KM_SLEEP) == 0); 2213 spa_deactivate(spa); 2214 if (locked) 2215 mutex_exit(&spa_namespace_lock); 2216 return (spa->spa_last_open_failed); 2217 } 2218 2219 if (state != SPA_LOAD_RECOVER) 2220 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2221 2222 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2223 extreme); 2224 2225 if (error == EBADF) { 2226 /* 2227 * If vdev_validate() returns failure (indicated by 2228 * EBADF), it means that one of the vdevs indicates 2229 * that the pool has been exported or destroyed. If 2230 * this is the case, the config cache is out of sync and 2231 * we should remove the pool from the namespace. 2232 */ 2233 spa_unload(spa); 2234 spa_deactivate(spa); 2235 spa_config_sync(spa, B_TRUE, B_TRUE); 2236 spa_remove(spa); 2237 if (locked) 2238 mutex_exit(&spa_namespace_lock); 2239 return (ENOENT); 2240 } 2241 2242 if (error) { 2243 /* 2244 * We can't open the pool, but we still have useful 2245 * information: the state of each vdev after the 2246 * attempted vdev_open(). Return this to the user. 
2247 */ 2248 if (config != NULL && spa->spa_config) 2249 VERIFY(nvlist_dup(spa->spa_config, config, 2250 KM_SLEEP) == 0); 2251 spa_unload(spa); 2252 spa_deactivate(spa); 2253 spa->spa_last_open_failed = error; 2254 if (locked) 2255 mutex_exit(&spa_namespace_lock); 2256 *spapp = NULL; 2257 return (error); 2258 } 2259 2260 } 2261 2262 spa_open_ref(spa, tag); 2263 2264 2265 if (config != NULL) 2266 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2267 2268 if (locked) { 2269 spa->spa_last_open_failed = 0; 2270 spa->spa_last_ubsync_txg = 0; 2271 spa->spa_load_txg = 0; 2272 mutex_exit(&spa_namespace_lock); 2273 } 2274 2275 *spapp = spa; 2276 2277 return (0); 2278 } 2279 2280 int 2281 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2282 nvlist_t **config) 2283 { 2284 return (spa_open_common(name, spapp, tag, policy, config)); 2285 } 2286 2287 int 2288 spa_open(const char *name, spa_t **spapp, void *tag) 2289 { 2290 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2291 } 2292 2293 /* 2294 * Lookup the given spa_t, incrementing the inject count in the process, 2295 * preventing it from being exported or destroyed. 2296 */ 2297 spa_t * 2298 spa_inject_addref(char *name) 2299 { 2300 spa_t *spa; 2301 2302 mutex_enter(&spa_namespace_lock); 2303 if ((spa = spa_lookup(name)) == NULL) { 2304 mutex_exit(&spa_namespace_lock); 2305 return (NULL); 2306 } 2307 spa->spa_inject_ref++; 2308 mutex_exit(&spa_namespace_lock); 2309 2310 return (spa); 2311 } 2312 2313 void 2314 spa_inject_delref(spa_t *spa) 2315 { 2316 mutex_enter(&spa_namespace_lock); 2317 spa->spa_inject_ref--; 2318 mutex_exit(&spa_namespace_lock); 2319 } 2320 2321 /* 2322 * Add spares device information to the nvlist. 2323 */ 2324 static void 2325 spa_add_spares(spa_t *spa, nvlist_t *config) 2326 { 2327 nvlist_t **spares; 2328 uint_t i, nspares; 2329 nvlist_t *nvroot; 2330 uint64_t guid; 2331 vdev_stat_t *vs; 2332 uint_t vsc; 2333 uint64_t pool; 2334 2335 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2336 2337 if (spa->spa_spares.sav_count == 0) 2338 return; 2339 2340 VERIFY(nvlist_lookup_nvlist(config, 2341 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2342 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2343 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2344 if (nspares != 0) { 2345 VERIFY(nvlist_add_nvlist_array(nvroot, 2346 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2347 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2348 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2349 2350 /* 2351 * Go through and find any spares which have since been 2352 * repurposed as an active spare. If this is the case, update 2353 * their status appropriately. 2354 */ 2355 for (i = 0; i < nspares; i++) { 2356 VERIFY(nvlist_lookup_uint64(spares[i], 2357 ZPOOL_CONFIG_GUID, &guid) == 0); 2358 if (spa_spare_exists(guid, &pool, NULL) && 2359 pool != 0ULL) { 2360 VERIFY(nvlist_lookup_uint64_array( 2361 spares[i], ZPOOL_CONFIG_STATS, 2362 (uint64_t **)&vs, &vsc) == 0); 2363 vs->vs_state = VDEV_STATE_CANT_OPEN; 2364 vs->vs_aux = VDEV_AUX_SPARED; 2365 } 2366 } 2367 } 2368 } 2369 2370 /* 2371 * Add l2cache device information to the nvlist, including vdev stats. 
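 *
 * These arrays hang off the root of the vdev tree in the caller's config;
 * a consumer would read them back roughly as follows (variable names are
 * placeholders and error handling is trimmed):
 *
 *	nvlist_t *nvroot, **l2cache;
 *	uint_t nl2cache;
 *
 *	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 *	    &nvroot) == 0);
 *	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 *	    &l2cache, &nl2cache) == 0) {
 *		(each entry carries ZPOOL_CONFIG_GUID plus a
 *		ZPOOL_CONFIG_STATS array readable as a vdev_stat_t)
 *	}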
2372 */ 2373 static void 2374 spa_add_l2cache(spa_t *spa, nvlist_t *config) 2375 { 2376 nvlist_t **l2cache; 2377 uint_t i, j, nl2cache; 2378 nvlist_t *nvroot; 2379 uint64_t guid; 2380 vdev_t *vd; 2381 vdev_stat_t *vs; 2382 uint_t vsc; 2383 2384 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2385 2386 if (spa->spa_l2cache.sav_count == 0) 2387 return; 2388 2389 VERIFY(nvlist_lookup_nvlist(config, 2390 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2391 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2392 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2393 if (nl2cache != 0) { 2394 VERIFY(nvlist_add_nvlist_array(nvroot, 2395 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2396 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2397 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2398 2399 /* 2400 * Update level 2 cache device stats. 2401 */ 2402 2403 for (i = 0; i < nl2cache; i++) { 2404 VERIFY(nvlist_lookup_uint64(l2cache[i], 2405 ZPOOL_CONFIG_GUID, &guid) == 0); 2406 2407 vd = NULL; 2408 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2409 if (guid == 2410 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2411 vd = spa->spa_l2cache.sav_vdevs[j]; 2412 break; 2413 } 2414 } 2415 ASSERT(vd != NULL); 2416 2417 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2418 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 2419 vdev_get_stats(vd, vs); 2420 } 2421 } 2422 } 2423 2424 int 2425 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2426 { 2427 int error; 2428 spa_t *spa; 2429 2430 *config = NULL; 2431 error = spa_open_common(name, &spa, FTAG, NULL, config); 2432 2433 if (spa != NULL) { 2434 /* 2435 * This still leaves a window of inconsistency where the spares 2436 * or l2cache devices could change and the config would be 2437 * self-inconsistent. 2438 */ 2439 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2440 2441 if (*config != NULL) { 2442 VERIFY(nvlist_add_uint64(*config, 2443 ZPOOL_CONFIG_ERRCOUNT, 2444 spa_get_errlog_size(spa)) == 0); 2445 2446 if (spa_suspended(spa)) 2447 VERIFY(nvlist_add_uint64(*config, 2448 ZPOOL_CONFIG_SUSPENDED, 2449 spa->spa_failmode) == 0); 2450 2451 spa_add_spares(spa, *config); 2452 spa_add_l2cache(spa, *config); 2453 } 2454 } 2455 2456 /* 2457 * We want to get the alternate root even for faulted pools, so we cheat 2458 * and call spa_lookup() directly. 2459 */ 2460 if (altroot) { 2461 if (spa == NULL) { 2462 mutex_enter(&spa_namespace_lock); 2463 spa = spa_lookup(name); 2464 if (spa) 2465 spa_altroot(spa, altroot, buflen); 2466 else 2467 altroot[0] = '\0'; 2468 spa = NULL; 2469 mutex_exit(&spa_namespace_lock); 2470 } else { 2471 spa_altroot(spa, altroot, buflen); 2472 } 2473 } 2474 2475 if (spa != NULL) { 2476 spa_config_exit(spa, SCL_CONFIG, FTAG); 2477 spa_close(spa, FTAG); 2478 } 2479 2480 return (error); 2481 } 2482 2483 /* 2484 * Validate that the auxiliary device array is well formed. We must have an 2485 * array of nvlists, each which describes a valid leaf vdev. If this is an 2486 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2487 * specified, as long as they are well-formed. 2488 */ 2489 static int 2490 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2491 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2492 vdev_labeltype_t label) 2493 { 2494 nvlist_t **dev; 2495 uint_t i, ndev; 2496 vdev_t *vd; 2497 int error; 2498 2499 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2500 2501 /* 2502 * It's acceptable to have no devs specified. 
2503 */ 2504 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2505 return (0); 2506 2507 if (ndev == 0) 2508 return (EINVAL); 2509 2510 /* 2511 * Make sure the pool is formatted with a version that supports this 2512 * device type. 2513 */ 2514 if (spa_version(spa) < version) 2515 return (ENOTSUP); 2516 2517 /* 2518 * Set the pending device list so we correctly handle device in-use 2519 * checking. 2520 */ 2521 sav->sav_pending = dev; 2522 sav->sav_npending = ndev; 2523 2524 for (i = 0; i < ndev; i++) { 2525 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2526 mode)) != 0) 2527 goto out; 2528 2529 if (!vd->vdev_ops->vdev_op_leaf) { 2530 vdev_free(vd); 2531 error = EINVAL; 2532 goto out; 2533 } 2534 2535 /* 2536 * The L2ARC currently only supports disk devices in 2537 * kernel context. For user-level testing, we allow it. 2538 */ 2539 #ifdef _KERNEL 2540 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2541 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2542 error = ENOTBLK; 2543 goto out; 2544 } 2545 #endif 2546 vd->vdev_top = vd; 2547 2548 if ((error = vdev_open(vd)) == 0 && 2549 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2550 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2551 vd->vdev_guid) == 0); 2552 } 2553 2554 vdev_free(vd); 2555 2556 if (error && 2557 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2558 goto out; 2559 else 2560 error = 0; 2561 } 2562 2563 out: 2564 sav->sav_pending = NULL; 2565 sav->sav_npending = 0; 2566 return (error); 2567 } 2568 2569 static int 2570 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2571 { 2572 int error; 2573 2574 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2575 2576 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2577 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2578 VDEV_LABEL_SPARE)) != 0) { 2579 return (error); 2580 } 2581 2582 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2583 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2584 VDEV_LABEL_L2CACHE)); 2585 } 2586 2587 static void 2588 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2589 const char *config) 2590 { 2591 int i; 2592 2593 if (sav->sav_config != NULL) { 2594 nvlist_t **olddevs; 2595 uint_t oldndevs; 2596 nvlist_t **newdevs; 2597 2598 /* 2599 * Generate new dev list by concatentating with the 2600 * current dev list. 2601 */ 2602 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2603 &olddevs, &oldndevs) == 0); 2604 2605 newdevs = kmem_alloc(sizeof (void *) * 2606 (ndevs + oldndevs), KM_SLEEP); 2607 for (i = 0; i < oldndevs; i++) 2608 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2609 KM_SLEEP) == 0); 2610 for (i = 0; i < ndevs; i++) 2611 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2612 KM_SLEEP) == 0); 2613 2614 VERIFY(nvlist_remove(sav->sav_config, config, 2615 DATA_TYPE_NVLIST_ARRAY) == 0); 2616 2617 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2618 config, newdevs, ndevs + oldndevs) == 0); 2619 for (i = 0; i < oldndevs + ndevs; i++) 2620 nvlist_free(newdevs[i]); 2621 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2622 } else { 2623 /* 2624 * Generate a new dev list. 
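 * (No concatenation is needed here; the branch above, by contrast, would
 * turn two existing spares plus one incoming spare into a three-entry
 * replacement array, freeing its temporary duplicates once
 * nvlist_add_nvlist_array() has copied them into sav_config.)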
2625 */ 2626 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2627 KM_SLEEP) == 0); 2628 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2629 devs, ndevs) == 0); 2630 } 2631 } 2632 2633 /* 2634 * Stop and drop level 2 ARC devices 2635 */ 2636 void 2637 spa_l2cache_drop(spa_t *spa) 2638 { 2639 vdev_t *vd; 2640 int i; 2641 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2642 2643 for (i = 0; i < sav->sav_count; i++) { 2644 uint64_t pool; 2645 2646 vd = sav->sav_vdevs[i]; 2647 ASSERT(vd != NULL); 2648 2649 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2650 pool != 0ULL && l2arc_vdev_present(vd)) 2651 l2arc_remove_vdev(vd); 2652 if (vd->vdev_isl2cache) 2653 spa_l2cache_remove(vd); 2654 vdev_clear_stats(vd); 2655 (void) vdev_close(vd); 2656 } 2657 } 2658 2659 /* 2660 * Pool Creation 2661 */ 2662 int 2663 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2664 const char *history_str, nvlist_t *zplprops) 2665 { 2666 spa_t *spa; 2667 char *altroot = NULL; 2668 vdev_t *rvd; 2669 dsl_pool_t *dp; 2670 dmu_tx_t *tx; 2671 int error = 0; 2672 uint64_t txg = TXG_INITIAL; 2673 nvlist_t **spares, **l2cache; 2674 uint_t nspares, nl2cache; 2675 uint64_t version; 2676 2677 /* 2678 * If this pool already exists, return failure. 2679 */ 2680 mutex_enter(&spa_namespace_lock); 2681 if (spa_lookup(pool) != NULL) { 2682 mutex_exit(&spa_namespace_lock); 2683 return (EEXIST); 2684 } 2685 2686 /* 2687 * Allocate a new spa_t structure. 2688 */ 2689 (void) nvlist_lookup_string(props, 2690 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2691 spa = spa_add(pool, NULL, altroot); 2692 spa_activate(spa, spa_mode_global); 2693 2694 if (props && (error = spa_prop_validate(spa, props))) { 2695 spa_deactivate(spa); 2696 spa_remove(spa); 2697 mutex_exit(&spa_namespace_lock); 2698 return (error); 2699 } 2700 2701 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2702 &version) != 0) 2703 version = SPA_VERSION; 2704 ASSERT(version <= SPA_VERSION); 2705 2706 spa->spa_first_txg = txg; 2707 spa->spa_uberblock.ub_txg = txg - 1; 2708 spa->spa_uberblock.ub_version = version; 2709 spa->spa_ubsync = spa->spa_uberblock; 2710 2711 /* 2712 * Create "The Godfather" zio to hold all async IOs 2713 */ 2714 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2715 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2716 2717 /* 2718 * Create the root vdev. 2719 */ 2720 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2721 2722 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2723 2724 ASSERT(error != 0 || rvd != NULL); 2725 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2726 2727 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2728 error = EINVAL; 2729 2730 if (error == 0 && 2731 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2732 (error = spa_validate_aux(spa, nvroot, txg, 2733 VDEV_ALLOC_ADD)) == 0) { 2734 for (int c = 0; c < rvd->vdev_children; c++) { 2735 vdev_metaslab_set_size(rvd->vdev_child[c]); 2736 vdev_expand(rvd->vdev_child[c], txg); 2737 } 2738 } 2739 2740 spa_config_exit(spa, SCL_ALL, FTAG); 2741 2742 if (error != 0) { 2743 spa_unload(spa); 2744 spa_deactivate(spa); 2745 spa_remove(spa); 2746 mutex_exit(&spa_namespace_lock); 2747 return (error); 2748 } 2749 2750 /* 2751 * Get the list of spares, if specified. 
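 *
 * For reference, the nvroot handed to spa_create() is expected to look
 * roughly like the sketch below: a root nvlist whose ZPOOL_CONFIG_CHILDREN
 * array holds the top-level vdevs, with optional spare and l2cache arrays
 * alongside it (the leaf fields shown are illustrative only).
 *
 *	ZPOOL_CONFIG_TYPE	= VDEV_TYPE_ROOT
 *	ZPOOL_CONFIG_CHILDREN	= [ { type = "mirror", children = [ ... ] } ]
 *	ZPOOL_CONFIG_SPARES	= [ { type = "disk", path = "..." } ]
 *	ZPOOL_CONFIG_L2CACHE	= [ { type = "disk", path = "..." } ]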
2752 */ 2753 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2754 &spares, &nspares) == 0) { 2755 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2756 KM_SLEEP) == 0); 2757 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2758 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2759 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2760 spa_load_spares(spa); 2761 spa_config_exit(spa, SCL_ALL, FTAG); 2762 spa->spa_spares.sav_sync = B_TRUE; 2763 } 2764 2765 /* 2766 * Get the list of level 2 cache devices, if specified. 2767 */ 2768 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2769 &l2cache, &nl2cache) == 0) { 2770 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2771 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2772 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2773 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2774 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2775 spa_load_l2cache(spa); 2776 spa_config_exit(spa, SCL_ALL, FTAG); 2777 spa->spa_l2cache.sav_sync = B_TRUE; 2778 } 2779 2780 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2781 spa->spa_meta_objset = dp->dp_meta_objset; 2782 2783 /* 2784 * Create DDTs (dedup tables). 2785 */ 2786 ddt_create(spa); 2787 2788 spa_update_dspace(spa); 2789 2790 tx = dmu_tx_create_assigned(dp, txg); 2791 2792 /* 2793 * Create the pool config object. 2794 */ 2795 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2796 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2797 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2798 2799 if (zap_add(spa->spa_meta_objset, 2800 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2801 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2802 cmn_err(CE_PANIC, "failed to add pool config"); 2803 } 2804 2805 /* Newly created pools with the right version are always deflated. */ 2806 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2807 spa->spa_deflate = TRUE; 2808 if (zap_add(spa->spa_meta_objset, 2809 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2810 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2811 cmn_err(CE_PANIC, "failed to add deflate"); 2812 } 2813 } 2814 2815 /* 2816 * Create the deferred-free bplist object. Turn off compression 2817 * because sync-to-convergence takes longer if the blocksize 2818 * keeps changing. 2819 */ 2820 spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, 2821 1 << 14, tx); 2822 dmu_object_set_compress(spa->spa_meta_objset, 2823 spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); 2824 2825 if (zap_add(spa->spa_meta_objset, 2826 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2827 sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { 2828 cmn_err(CE_PANIC, "failed to add bplist"); 2829 } 2830 2831 /* 2832 * Create the pool's history object. 2833 */ 2834 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2835 spa_history_create_obj(spa, tx); 2836 2837 /* 2838 * Set pool properties. 
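 *
 * The optional props nvlist applied just below simply maps pool property
 * names to values; a caller might have built it along these lines
 * (illustrative only, with error checking trimmed):
 *
 *	nvlist_t *props;
 *
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), "/mnt") == 0);
 *	VERIFY(nvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);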
2839 */ 2840 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2841 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2842 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2843 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2844 2845 if (props != NULL) { 2846 spa_configfile_set(spa, props, B_FALSE); 2847 spa_sync_props(spa, props, CRED(), tx); 2848 } 2849 2850 dmu_tx_commit(tx); 2851 2852 spa->spa_sync_on = B_TRUE; 2853 txg_sync_start(spa->spa_dsl_pool); 2854 2855 /* 2856 * We explicitly wait for the first transaction to complete so that our 2857 * bean counters are appropriately updated. 2858 */ 2859 txg_wait_synced(spa->spa_dsl_pool, txg); 2860 2861 spa_config_sync(spa, B_FALSE, B_TRUE); 2862 2863 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2864 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2865 spa_history_log_version(spa, LOG_POOL_CREATE); 2866 2867 spa->spa_minref = refcount_count(&spa->spa_refcount); 2868 2869 mutex_exit(&spa_namespace_lock); 2870 2871 return (0); 2872 } 2873 2874 #ifdef _KERNEL 2875 /* 2876 * Get the root pool information from the root disk, then import the root pool 2877 * during the system boot up time. 2878 */ 2879 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2880 2881 static nvlist_t * 2882 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 2883 { 2884 nvlist_t *config; 2885 nvlist_t *nvtop, *nvroot; 2886 uint64_t pgid; 2887 2888 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 2889 return (NULL); 2890 2891 /* 2892 * Add this top-level vdev to the child array. 2893 */ 2894 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2895 &nvtop) == 0); 2896 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 2897 &pgid) == 0); 2898 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 2899 2900 /* 2901 * Put this pool's top-level vdevs into a root vdev. 2902 */ 2903 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2904 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 2905 VDEV_TYPE_ROOT) == 0); 2906 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2907 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2908 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2909 &nvtop, 1) == 0); 2910 2911 /* 2912 * Replace the existing vdev_tree with the new root vdev in 2913 * this pool's configuration (remove the old, add the new). 2914 */ 2915 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2916 nvlist_free(nvroot); 2917 return (config); 2918 } 2919 2920 /* 2921 * Walk the vdev tree and see if we can find a device with "better" 2922 * configuration. A configuration is "better" if the label on that 2923 * device has a more recent txg. 2924 */ 2925 static void 2926 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 2927 { 2928 for (int c = 0; c < vd->vdev_children; c++) 2929 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 2930 2931 if (vd->vdev_ops->vdev_op_leaf) { 2932 nvlist_t *label; 2933 uint64_t label_txg; 2934 2935 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 2936 &label) != 0) 2937 return; 2938 2939 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 2940 &label_txg) == 0); 2941 2942 /* 2943 * Do we have a better boot device? 
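 *
 * Concretely: if the best label seen so far carried txg 100 and this
 * leaf's label carries txg 120, the check below records this leaf as the
 * preferred boot device; an equal or older label leaves *avd unchanged.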
2944 */ 2945 if (label_txg > *txg) { 2946 *txg = label_txg; 2947 *avd = vd; 2948 } 2949 nvlist_free(label); 2950 } 2951 } 2952 2953 /* 2954 * Import a root pool. 2955 * 2956 * For x86. devpath_list will consist of devid and/or physpath name of 2957 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2958 * The GRUB "findroot" command will return the vdev we should boot. 2959 * 2960 * For Sparc, devpath_list consists the physpath name of the booting device 2961 * no matter the rootpool is a single device pool or a mirrored pool. 2962 * e.g. 2963 * "/pci@1f,0/ide@d/disk@0,0:a" 2964 */ 2965 int 2966 spa_import_rootpool(char *devpath, char *devid) 2967 { 2968 spa_t *spa; 2969 vdev_t *rvd, *bvd, *avd = NULL; 2970 nvlist_t *config, *nvtop; 2971 uint64_t guid, txg; 2972 char *pname; 2973 int error; 2974 2975 /* 2976 * Read the label from the boot device and generate a configuration. 2977 */ 2978 config = spa_generate_rootconf(devpath, devid, &guid); 2979 #if defined(_OBP) && defined(_KERNEL) 2980 if (config == NULL) { 2981 if (strstr(devpath, "/iscsi/ssd") != NULL) { 2982 /* iscsi boot */ 2983 get_iscsi_bootpath_phy(devpath); 2984 config = spa_generate_rootconf(devpath, devid, &guid); 2985 } 2986 } 2987 #endif 2988 if (config == NULL) { 2989 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 2990 devpath); 2991 return (EIO); 2992 } 2993 2994 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 2995 &pname) == 0); 2996 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2997 2998 mutex_enter(&spa_namespace_lock); 2999 if ((spa = spa_lookup(pname)) != NULL) { 3000 /* 3001 * Remove the existing root pool from the namespace so that we 3002 * can replace it with the correct config we just read in. 3003 */ 3004 spa_remove(spa); 3005 } 3006 3007 spa = spa_add(pname, config, NULL); 3008 spa->spa_is_root = B_TRUE; 3009 spa->spa_load_verbatim = B_TRUE; 3010 3011 /* 3012 * Build up a vdev tree based on the boot device's label config. 3013 */ 3014 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3015 &nvtop) == 0); 3016 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3017 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3018 VDEV_ALLOC_ROOTPOOL); 3019 spa_config_exit(spa, SCL_ALL, FTAG); 3020 if (error) { 3021 mutex_exit(&spa_namespace_lock); 3022 nvlist_free(config); 3023 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3024 pname); 3025 return (error); 3026 } 3027 3028 /* 3029 * Get the boot vdev. 3030 */ 3031 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3032 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3033 (u_longlong_t)guid); 3034 error = ENOENT; 3035 goto out; 3036 } 3037 3038 /* 3039 * Determine if there is a better boot device. 3040 */ 3041 avd = bvd; 3042 spa_alt_rootvdev(rvd, &avd, &txg); 3043 if (avd != bvd) { 3044 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3045 "try booting from '%s'", avd->vdev_path); 3046 error = EINVAL; 3047 goto out; 3048 } 3049 3050 /* 3051 * If the boot device is part of a spare vdev then ensure that 3052 * we're booting off the active spare. 3053 */ 3054 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3055 !bvd->vdev_isspare) { 3056 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3057 "try booting from '%s'", 3058 bvd->vdev_parent->vdev_child[1]->vdev_path); 3059 error = EINVAL; 3060 goto out; 3061 } 3062 3063 error = 0; 3064 spa_history_log_version(spa, LOG_POOL_IMPORT); 3065 out: 3066 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3067 vdev_free(rvd); 3068 spa_config_exit(spa, SCL_ALL, FTAG); 3069 mutex_exit(&spa_namespace_lock); 3070 3071 nvlist_free(config); 3072 return (error); 3073 } 3074 3075 #endif 3076 3077 /* 3078 * Take a pool and insert it into the namespace as if it had been loaded at 3079 * boot. 3080 */ 3081 int 3082 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 3083 { 3084 spa_t *spa; 3085 zpool_rewind_policy_t policy; 3086 char *altroot = NULL; 3087 3088 mutex_enter(&spa_namespace_lock); 3089 if (spa_lookup(pool) != NULL) { 3090 mutex_exit(&spa_namespace_lock); 3091 return (EEXIST); 3092 } 3093 3094 (void) nvlist_lookup_string(props, 3095 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3096 spa = spa_add(pool, config, altroot); 3097 3098 zpool_get_rewind_policy(config, &policy); 3099 spa->spa_load_max_txg = policy.zrp_txg; 3100 3101 spa->spa_load_verbatim = B_TRUE; 3102 3103 if (props != NULL) 3104 spa_configfile_set(spa, props, B_FALSE); 3105 3106 spa_config_sync(spa, B_FALSE, B_TRUE); 3107 3108 mutex_exit(&spa_namespace_lock); 3109 spa_history_log_version(spa, LOG_POOL_IMPORT); 3110 3111 return (0); 3112 } 3113 3114 /* 3115 * Import a non-root pool into the system. 3116 */ 3117 int 3118 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 3119 { 3120 spa_t *spa; 3121 char *altroot = NULL; 3122 spa_load_state_t state = SPA_LOAD_IMPORT; 3123 zpool_rewind_policy_t policy; 3124 int error; 3125 nvlist_t *nvroot; 3126 nvlist_t **spares, **l2cache; 3127 uint_t nspares, nl2cache; 3128 3129 /* 3130 * If a pool with this name exists, return failure. 3131 */ 3132 mutex_enter(&spa_namespace_lock); 3133 if (spa_lookup(pool) != NULL) { 3134 mutex_exit(&spa_namespace_lock); 3135 return (EEXIST); 3136 } 3137 3138 zpool_get_rewind_policy(config, &policy); 3139 if (policy.zrp_request & ZPOOL_DO_REWIND) 3140 state = SPA_LOAD_RECOVER; 3141 3142 /* 3143 * Create and initialize the spa structure. 3144 */ 3145 (void) nvlist_lookup_string(props, 3146 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3147 spa = spa_add(pool, config, altroot); 3148 spa_activate(spa, spa_mode_global); 3149 3150 /* 3151 * Don't start async tasks until we know everything is healthy. 3152 */ 3153 spa_async_suspend(spa); 3154 3155 /* 3156 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3157 * because the user-supplied config is actually the one to trust when 3158 * doing an import. 3159 */ 3160 if (state != SPA_LOAD_RECOVER) 3161 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3162 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3163 ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0)); 3164 3165 /* 3166 * Propagate anything learned about failing or best txgs 3167 * back to caller 3168 */ 3169 spa_rewind_data_to_nvlist(spa, config); 3170 3171 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3172 /* 3173 * Toss any existing sparelist, as it doesn't have any validity 3174 * anymore, and conflicts with spa_has_spare(). 
3175 */ 3176 if (spa->spa_spares.sav_config) { 3177 nvlist_free(spa->spa_spares.sav_config); 3178 spa->spa_spares.sav_config = NULL; 3179 spa_load_spares(spa); 3180 } 3181 if (spa->spa_l2cache.sav_config) { 3182 nvlist_free(spa->spa_l2cache.sav_config); 3183 spa->spa_l2cache.sav_config = NULL; 3184 spa_load_l2cache(spa); 3185 } 3186 3187 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3188 &nvroot) == 0); 3189 if (error == 0) 3190 error = spa_validate_aux(spa, nvroot, -1ULL, 3191 VDEV_ALLOC_SPARE); 3192 if (error == 0) 3193 error = spa_validate_aux(spa, nvroot, -1ULL, 3194 VDEV_ALLOC_L2CACHE); 3195 spa_config_exit(spa, SCL_ALL, FTAG); 3196 3197 if (props != NULL) 3198 spa_configfile_set(spa, props, B_FALSE); 3199 3200 if (error != 0 || (props && spa_writeable(spa) && 3201 (error = spa_prop_set(spa, props)))) { 3202 spa_unload(spa); 3203 spa_deactivate(spa); 3204 spa_remove(spa); 3205 mutex_exit(&spa_namespace_lock); 3206 return (error); 3207 } 3208 3209 spa_async_resume(spa); 3210 3211 /* 3212 * Override any spares and level 2 cache devices as specified by 3213 * the user, as these may have correct device names/devids, etc. 3214 */ 3215 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3216 &spares, &nspares) == 0) { 3217 if (spa->spa_spares.sav_config) 3218 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3219 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3220 else 3221 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3222 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3223 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3224 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3225 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3226 spa_load_spares(spa); 3227 spa_config_exit(spa, SCL_ALL, FTAG); 3228 spa->spa_spares.sav_sync = B_TRUE; 3229 } 3230 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3231 &l2cache, &nl2cache) == 0) { 3232 if (spa->spa_l2cache.sav_config) 3233 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3234 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3235 else 3236 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3237 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3238 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3239 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3240 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3241 spa_load_l2cache(spa); 3242 spa_config_exit(spa, SCL_ALL, FTAG); 3243 spa->spa_l2cache.sav_sync = B_TRUE; 3244 } 3245 3246 /* 3247 * Check for any removed devices. 3248 */ 3249 if (spa->spa_autoreplace) { 3250 spa_aux_check_removed(&spa->spa_spares); 3251 spa_aux_check_removed(&spa->spa_l2cache); 3252 } 3253 3254 if (spa_writeable(spa)) { 3255 /* 3256 * Update the config cache to include the newly-imported pool. 3257 */ 3258 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3259 } 3260 3261 /* 3262 * It's possible that the pool was expanded while it was exported. 3263 * We kick off an async task to handle this for us. 
3264 */ 3265 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3266 3267 mutex_exit(&spa_namespace_lock); 3268 spa_history_log_version(spa, LOG_POOL_IMPORT); 3269 3270 return (0); 3271 } 3272 3273 nvlist_t * 3274 spa_tryimport(nvlist_t *tryconfig) 3275 { 3276 nvlist_t *config = NULL; 3277 char *poolname; 3278 spa_t *spa; 3279 uint64_t state; 3280 int error; 3281 3282 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3283 return (NULL); 3284 3285 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3286 return (NULL); 3287 3288 /* 3289 * Create and initialize the spa structure. 3290 */ 3291 mutex_enter(&spa_namespace_lock); 3292 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3293 spa_activate(spa, FREAD); 3294 3295 /* 3296 * Pass off the heavy lifting to spa_load(). 3297 * Pass TRUE for mosconfig because the user-supplied config 3298 * is actually the one to trust when doing an import. 3299 */ 3300 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3301 3302 /* 3303 * If 'tryconfig' was at least parsable, return the current config. 3304 */ 3305 if (spa->spa_root_vdev != NULL) { 3306 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3307 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3308 poolname) == 0); 3309 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3310 state) == 0); 3311 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3312 spa->spa_uberblock.ub_timestamp) == 0); 3313 3314 /* 3315 * If the bootfs property exists on this pool then we 3316 * copy it out so that external consumers can tell which 3317 * pools are bootable. 3318 */ 3319 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3320 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3321 3322 /* 3323 * We have to play games with the name since the 3324 * pool was opened as TRYIMPORT_NAME. 3325 */ 3326 if (dsl_dsobj_to_dsname(spa_name(spa), 3327 spa->spa_bootfs, tmpname) == 0) { 3328 char *cp; 3329 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3330 3331 cp = strchr(tmpname, '/'); 3332 if (cp == NULL) { 3333 (void) strlcpy(dsname, tmpname, 3334 MAXPATHLEN); 3335 } else { 3336 (void) snprintf(dsname, MAXPATHLEN, 3337 "%s/%s", poolname, ++cp); 3338 } 3339 VERIFY(nvlist_add_string(config, 3340 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3341 kmem_free(dsname, MAXPATHLEN); 3342 } 3343 kmem_free(tmpname, MAXPATHLEN); 3344 } 3345 3346 /* 3347 * Add the list of hot spares and level 2 cache devices. 3348 */ 3349 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3350 spa_add_spares(spa, config); 3351 spa_add_l2cache(spa, config); 3352 spa_config_exit(spa, SCL_CONFIG, FTAG); 3353 } 3354 3355 spa_unload(spa); 3356 spa_deactivate(spa); 3357 spa_remove(spa); 3358 mutex_exit(&spa_namespace_lock); 3359 3360 return (config); 3361 } 3362 3363 /* 3364 * Pool export/destroy 3365 * 3366 * The act of destroying or exporting a pool is very simple. We make sure there 3367 * is no more pending I/O and any references to the pool are gone. Then, we 3368 * update the pool state and sync all the labels to disk, removing the 3369 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3370 * we don't sync the labels or remove the configuration cache. 
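 *
 * As a rough usage sketch (the pool name is a placeholder and error
 * handling is trimmed), a caller that wants the final configuration back
 * might do:
 *
 *	nvlist_t *oldconfig = NULL;
 *
 *	error = spa_export("tank", &oldconfig, B_FALSE, B_FALSE);
 *	if (error == 0 && oldconfig != NULL)
 *		nvlist_free(oldconfig);
 *
 * EBUSY here means the pool still has active references; EXDEV means an
 * active shared spare would be stranded (see spa_export_common() below).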
3371 */ 3372 static int 3373 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3374 boolean_t force, boolean_t hardforce) 3375 { 3376 spa_t *spa; 3377 3378 if (oldconfig) 3379 *oldconfig = NULL; 3380 3381 if (!(spa_mode_global & FWRITE)) 3382 return (EROFS); 3383 3384 mutex_enter(&spa_namespace_lock); 3385 if ((spa = spa_lookup(pool)) == NULL) { 3386 mutex_exit(&spa_namespace_lock); 3387 return (ENOENT); 3388 } 3389 3390 /* 3391 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3392 * reacquire the namespace lock, and see if we can export. 3393 */ 3394 spa_open_ref(spa, FTAG); 3395 mutex_exit(&spa_namespace_lock); 3396 spa_async_suspend(spa); 3397 mutex_enter(&spa_namespace_lock); 3398 spa_close(spa, FTAG); 3399 3400 /* 3401 * The pool will be in core if it's openable, 3402 * in which case we can modify its state. 3403 */ 3404 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3405 /* 3406 * Objsets may be open only because they're dirty, so we 3407 * have to force it to sync before checking spa_refcnt. 3408 */ 3409 txg_wait_synced(spa->spa_dsl_pool, 0); 3410 3411 /* 3412 * A pool cannot be exported or destroyed if there are active 3413 * references. If we are resetting a pool, allow references by 3414 * fault injection handlers. 3415 */ 3416 if (!spa_refcount_zero(spa) || 3417 (spa->spa_inject_ref != 0 && 3418 new_state != POOL_STATE_UNINITIALIZED)) { 3419 spa_async_resume(spa); 3420 mutex_exit(&spa_namespace_lock); 3421 return (EBUSY); 3422 } 3423 3424 /* 3425 * A pool cannot be exported if it has an active shared spare. 3426 * This is to prevent other pools stealing the active spare 3427 * from an exported pool. At user's own will, such pool can 3428 * be forcedly exported. 3429 */ 3430 if (!force && new_state == POOL_STATE_EXPORTED && 3431 spa_has_active_shared_spare(spa)) { 3432 spa_async_resume(spa); 3433 mutex_exit(&spa_namespace_lock); 3434 return (EXDEV); 3435 } 3436 3437 /* 3438 * We want this to be reflected on every label, 3439 * so mark them all dirty. spa_unload() will do the 3440 * final sync that pushes these changes out. 3441 */ 3442 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3443 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3444 spa->spa_state = new_state; 3445 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 3446 vdev_config_dirty(spa->spa_root_vdev); 3447 spa_config_exit(spa, SCL_ALL, FTAG); 3448 } 3449 } 3450 3451 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3452 3453 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3454 spa_unload(spa); 3455 spa_deactivate(spa); 3456 } 3457 3458 if (oldconfig && spa->spa_config) 3459 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3460 3461 if (new_state != POOL_STATE_UNINITIALIZED) { 3462 if (!hardforce) 3463 spa_config_sync(spa, B_TRUE, B_TRUE); 3464 spa_remove(spa); 3465 } 3466 mutex_exit(&spa_namespace_lock); 3467 3468 return (0); 3469 } 3470 3471 /* 3472 * Destroy a storage pool. 3473 */ 3474 int 3475 spa_destroy(char *pool) 3476 { 3477 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3478 B_FALSE, B_FALSE)); 3479 } 3480 3481 /* 3482 * Export a storage pool. 3483 */ 3484 int 3485 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3486 boolean_t hardforce) 3487 { 3488 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3489 force, hardforce)); 3490 } 3491 3492 /* 3493 * Similar to spa_export(), this unloads the spa_t without actually removing it 3494 * from the namespace in any way. 
3495 */ 3496 int 3497 spa_reset(char *pool) 3498 { 3499 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3500 B_FALSE, B_FALSE)); 3501 } 3502 3503 /* 3504 * ========================================================================== 3505 * Device manipulation 3506 * ========================================================================== 3507 */ 3508 3509 /* 3510 * Add a device to a storage pool. 3511 */ 3512 int 3513 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3514 { 3515 uint64_t txg, id; 3516 int error; 3517 vdev_t *rvd = spa->spa_root_vdev; 3518 vdev_t *vd, *tvd; 3519 nvlist_t **spares, **l2cache; 3520 uint_t nspares, nl2cache; 3521 3522 txg = spa_vdev_enter(spa); 3523 3524 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3525 VDEV_ALLOC_ADD)) != 0) 3526 return (spa_vdev_exit(spa, NULL, txg, error)); 3527 3528 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3529 3530 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3531 &nspares) != 0) 3532 nspares = 0; 3533 3534 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3535 &nl2cache) != 0) 3536 nl2cache = 0; 3537 3538 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3539 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3540 3541 if (vd->vdev_children != 0 && 3542 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3543 return (spa_vdev_exit(spa, vd, txg, error)); 3544 3545 /* 3546 * We must validate the spares and l2cache devices after checking the 3547 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3548 */ 3549 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3550 return (spa_vdev_exit(spa, vd, txg, error)); 3551 3552 /* 3553 * Transfer each new top-level vdev from vd to rvd. 3554 */ 3555 for (int c = 0; c < vd->vdev_children; c++) { 3556 3557 /* 3558 * Set the vdev id to the first hole, if one exists. 3559 */ 3560 for (id = 0; id < rvd->vdev_children; id++) { 3561 if (rvd->vdev_child[id]->vdev_ishole) { 3562 vdev_free(rvd->vdev_child[id]); 3563 break; 3564 } 3565 } 3566 tvd = vd->vdev_child[c]; 3567 vdev_remove_child(vd, tvd); 3568 tvd->vdev_id = id; 3569 vdev_add_child(rvd, tvd); 3570 vdev_config_dirty(tvd); 3571 } 3572 3573 if (nspares != 0) { 3574 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3575 ZPOOL_CONFIG_SPARES); 3576 spa_load_spares(spa); 3577 spa->spa_spares.sav_sync = B_TRUE; 3578 } 3579 3580 if (nl2cache != 0) { 3581 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3582 ZPOOL_CONFIG_L2CACHE); 3583 spa_load_l2cache(spa); 3584 spa->spa_l2cache.sav_sync = B_TRUE; 3585 } 3586 3587 /* 3588 * We have to be careful when adding new vdevs to an existing pool. 3589 * If other threads start allocating from these vdevs before we 3590 * sync the config cache, and we lose power, then upon reboot we may 3591 * fail to open the pool because there are DVAs that the config cache 3592 * can't translate. Therefore, we first add the vdevs without 3593 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3594 * and then let spa_config_update() initialize the new metaslabs. 3595 * 3596 * spa_load() checks for added-but-not-initialized vdevs, so that 3597 * if we lose power at any point in this sequence, the remaining 3598 * steps will be completed the next time we load the pool. 
3599 */ 3600 (void) spa_vdev_exit(spa, vd, txg, 0); 3601 3602 mutex_enter(&spa_namespace_lock); 3603 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3604 mutex_exit(&spa_namespace_lock); 3605 3606 return (0); 3607 } 3608 3609 /* 3610 * Attach a device to a mirror. The arguments are the path to any device 3611 * in the mirror, and the nvroot for the new device. If the path specifies 3612 * a device that is not mirrored, we automatically insert the mirror vdev. 3613 * 3614 * If 'replacing' is specified, the new device is intended to replace the 3615 * existing device; in this case the two devices are made into their own 3616 * mirror using the 'replacing' vdev, which is functionally identical to 3617 * the mirror vdev (it actually reuses all the same ops) but has a few 3618 * extra rules: you can't attach to it after it's been created, and upon 3619 * completion of resilvering, the first disk (the one being replaced) 3620 * is automatically detached. 3621 */ 3622 int 3623 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3624 { 3625 uint64_t txg, open_txg; 3626 vdev_t *rvd = spa->spa_root_vdev; 3627 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3628 vdev_ops_t *pvops; 3629 char *oldvdpath, *newvdpath; 3630 int newvd_isspare; 3631 int error; 3632 3633 txg = spa_vdev_enter(spa); 3634 3635 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3636 3637 if (oldvd == NULL) 3638 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3639 3640 if (!oldvd->vdev_ops->vdev_op_leaf) 3641 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3642 3643 pvd = oldvd->vdev_parent; 3644 3645 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3646 VDEV_ALLOC_ADD)) != 0) 3647 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3648 3649 if (newrootvd->vdev_children != 1) 3650 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3651 3652 newvd = newrootvd->vdev_child[0]; 3653 3654 if (!newvd->vdev_ops->vdev_op_leaf) 3655 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3656 3657 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3658 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3659 3660 /* 3661 * Spares can't replace logs 3662 */ 3663 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3664 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3665 3666 if (!replacing) { 3667 /* 3668 * For attach, the only allowable parent is a mirror or the root 3669 * vdev. 3670 */ 3671 if (pvd->vdev_ops != &vdev_mirror_ops && 3672 pvd->vdev_ops != &vdev_root_ops) 3673 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3674 3675 pvops = &vdev_mirror_ops; 3676 } else { 3677 /* 3678 * Active hot spares can only be replaced by inactive hot 3679 * spares. 3680 */ 3681 if (pvd->vdev_ops == &vdev_spare_ops && 3682 pvd->vdev_child[1] == oldvd && 3683 !spa_has_spare(spa, newvd->vdev_guid)) 3684 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3685 3686 /* 3687 * If the source is a hot spare, and the parent isn't already a 3688 * spare, then we want to create a new hot spare. Otherwise, we 3689 * want to create a replacing vdev. The user is not allowed to 3690 * attach to a spared vdev child unless the 'isspare' state is 3691 * the same (spare replaces spare, non-spare replaces 3692 * non-spare). 
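 *
 * The checks below reduce to roughly this table for the replacing case:
 *
 *	parent is a replacing vdev              -> ENOTSUP
 *	parent is a spare, isspare mismatch     -> ENOTSUP
 *	parent not a spare, new disk is a spare -> pvops = vdev_spare_ops
 *	otherwise                               -> pvops = vdev_replacing_ops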
3693 */ 3694 if (pvd->vdev_ops == &vdev_replacing_ops) 3695 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3696 else if (pvd->vdev_ops == &vdev_spare_ops && 3697 newvd->vdev_isspare != oldvd->vdev_isspare) 3698 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3699 else if (pvd->vdev_ops != &vdev_spare_ops && 3700 newvd->vdev_isspare) 3701 pvops = &vdev_spare_ops; 3702 else 3703 pvops = &vdev_replacing_ops; 3704 } 3705 3706 /* 3707 * Make sure the new device is big enough. 3708 */ 3709 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3710 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3711 3712 /* 3713 * The new device cannot have a higher alignment requirement 3714 * than the top-level vdev. 3715 */ 3716 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3717 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3718 3719 /* 3720 * If this is an in-place replacement, update oldvd's path and devid 3721 * to make it distinguishable from newvd, and unopenable from now on. 3722 */ 3723 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3724 spa_strfree(oldvd->vdev_path); 3725 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3726 KM_SLEEP); 3727 (void) sprintf(oldvd->vdev_path, "%s/%s", 3728 newvd->vdev_path, "old"); 3729 if (oldvd->vdev_devid != NULL) { 3730 spa_strfree(oldvd->vdev_devid); 3731 oldvd->vdev_devid = NULL; 3732 } 3733 } 3734 3735 /* 3736 * If the parent is not a mirror, or if we're replacing, insert the new 3737 * mirror/replacing/spare vdev above oldvd. 3738 */ 3739 if (pvd->vdev_ops != pvops) 3740 pvd = vdev_add_parent(oldvd, pvops); 3741 3742 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3743 ASSERT(pvd->vdev_ops == pvops); 3744 ASSERT(oldvd->vdev_parent == pvd); 3745 3746 /* 3747 * Extract the new device from its root and add it to pvd. 3748 */ 3749 vdev_remove_child(newrootvd, newvd); 3750 newvd->vdev_id = pvd->vdev_children; 3751 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3752 vdev_add_child(pvd, newvd); 3753 3754 tvd = newvd->vdev_top; 3755 ASSERT(pvd->vdev_top == tvd); 3756 ASSERT(tvd->vdev_parent == rvd); 3757 3758 vdev_config_dirty(tvd); 3759 3760 /* 3761 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3762 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3763 */ 3764 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3765 3766 vdev_dtl_dirty(newvd, DTL_MISSING, 3767 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3768 3769 if (newvd->vdev_isspare) { 3770 spa_spare_activate(newvd); 3771 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3772 } 3773 3774 oldvdpath = spa_strdup(oldvd->vdev_path); 3775 newvdpath = spa_strdup(newvd->vdev_path); 3776 newvd_isspare = newvd->vdev_isspare; 3777 3778 /* 3779 * Mark newvd's DTL dirty in this txg. 3780 */ 3781 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3782 3783 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3784 3785 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3786 CRED(), "%s vdev=%s %s vdev=%s", 3787 replacing && newvd_isspare ? "spare in" : 3788 replacing ? "replace" : "attach", newvdpath, 3789 replacing ? "for" : "to", oldvdpath); 3790 3791 spa_strfree(oldvdpath); 3792 spa_strfree(newvdpath); 3793 3794 /* 3795 * Kick off a resilver to update newvd. 3796 */ 3797 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3798 3799 return (0); 3800 } 3801 3802 /* 3803 * Detach a device from a mirror or replacing vdev. 3804 * If 'replace_done' is specified, only detach if the parent 3805 * is a replacing vdev. 
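 *
 * Callers that captured the parent's guid earlier pass it as 'pguid' so
 * the detach is refused (EBUSY) if the vdev tree changed underneath them,
 * as in the M(A,R(B,C)) discussion below; passing pguid == 0 skips that
 * check, e.g. (sketch only):
 *
 *	error = spa_vdev_detach(spa, guid, 0ULL, B_FALSE);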
3806 */ 3807 int 3808 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3809 { 3810 uint64_t txg; 3811 int error; 3812 vdev_t *rvd = spa->spa_root_vdev; 3813 vdev_t *vd, *pvd, *cvd, *tvd; 3814 boolean_t unspare = B_FALSE; 3815 uint64_t unspare_guid; 3816 size_t len; 3817 char *vdpath; 3818 3819 txg = spa_vdev_enter(spa); 3820 3821 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3822 3823 if (vd == NULL) 3824 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3825 3826 if (!vd->vdev_ops->vdev_op_leaf) 3827 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3828 3829 pvd = vd->vdev_parent; 3830 3831 /* 3832 * If the parent/child relationship is not as expected, don't do it. 3833 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3834 * vdev that's replacing B with C. The user's intent in replacing 3835 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3836 * the replace by detaching C, the expected behavior is to end up 3837 * M(A,B). But suppose that right after deciding to detach C, 3838 * the replacement of B completes. We would have M(A,C), and then 3839 * ask to detach C, which would leave us with just A -- not what 3840 * the user wanted. To prevent this, we make sure that the 3841 * parent/child relationship hasn't changed -- in this example, 3842 * that C's parent is still the replacing vdev R. 3843 */ 3844 if (pvd->vdev_guid != pguid && pguid != 0) 3845 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3846 3847 /* 3848 * If replace_done is specified, only remove this device if it's 3849 * the first child of a replacing vdev. For the 'spare' vdev, either 3850 * disk can be removed. 3851 */ 3852 if (replace_done) { 3853 if (pvd->vdev_ops == &vdev_replacing_ops) { 3854 if (vd->vdev_id != 0) 3855 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3856 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3857 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3858 } 3859 } 3860 3861 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3862 spa_version(spa) >= SPA_VERSION_SPARES); 3863 3864 /* 3865 * Only mirror, replacing, and spare vdevs support detach. 3866 */ 3867 if (pvd->vdev_ops != &vdev_replacing_ops && 3868 pvd->vdev_ops != &vdev_mirror_ops && 3869 pvd->vdev_ops != &vdev_spare_ops) 3870 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3871 3872 /* 3873 * If this device has the only valid copy of some data, 3874 * we cannot safely detach it. 3875 */ 3876 if (vdev_dtl_required(vd)) 3877 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3878 3879 ASSERT(pvd->vdev_children >= 2); 3880 3881 /* 3882 * If we are detaching the second disk from a replacing vdev, then 3883 * check to see if we changed the original vdev's path to have "/old" 3884 * at the end in spa_vdev_attach(). If so, undo that change now. 3885 */ 3886 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3887 pvd->vdev_child[0]->vdev_path != NULL && 3888 pvd->vdev_child[1]->vdev_path != NULL) { 3889 ASSERT(pvd->vdev_child[1] == vd); 3890 cvd = pvd->vdev_child[0]; 3891 len = strlen(vd->vdev_path); 3892 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3893 strcmp(cvd->vdev_path + len, "/old") == 0) { 3894 spa_strfree(cvd->vdev_path); 3895 cvd->vdev_path = spa_strdup(vd->vdev_path); 3896 } 3897 } 3898 3899 /* 3900 * If we are detaching the original disk from a spare, then it implies 3901 * that the spare should become a real disk, and be removed from the 3902 * active spare list for the pool. 
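 *
 * For example, detaching original disk A out of spare(A, S) leaves S in
 * A's place as an ordinary member of the tree, and the 'unspare' path
 * below also removes S from the pool's active spare list in the same txg.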
3903 */ 3904 if (pvd->vdev_ops == &vdev_spare_ops && 3905 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3906 unspare = B_TRUE; 3907 3908 /* 3909 * Erase the disk labels so the disk can be used for other things. 3910 * This must be done after all other error cases are handled, 3911 * but before we disembowel vd (so we can still do I/O to it). 3912 * But if we can't do it, don't treat the error as fatal -- 3913 * it may be that the unwritability of the disk is the reason 3914 * it's being detached! 3915 */ 3916 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3917 3918 /* 3919 * Remove vd from its parent and compact the parent's children. 3920 */ 3921 vdev_remove_child(pvd, vd); 3922 vdev_compact_children(pvd); 3923 3924 /* 3925 * Remember one of the remaining children so we can get tvd below. 3926 */ 3927 cvd = pvd->vdev_child[0]; 3928 3929 /* 3930 * If we need to remove the remaining child from the list of hot spares, 3931 * do it now, marking the vdev as no longer a spare in the process. 3932 * We must do this before vdev_remove_parent(), because that can 3933 * change the GUID if it creates a new toplevel GUID. For a similar 3934 * reason, we must remove the spare now, in the same txg as the detach; 3935 * otherwise someone could attach a new sibling, change the GUID, and 3936 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3937 */ 3938 if (unspare) { 3939 ASSERT(cvd->vdev_isspare); 3940 spa_spare_remove(cvd); 3941 unspare_guid = cvd->vdev_guid; 3942 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3943 } 3944 3945 /* 3946 * If the parent mirror/replacing vdev only has one child, 3947 * the parent is no longer needed. Remove it from the tree. 3948 */ 3949 if (pvd->vdev_children == 1) 3950 vdev_remove_parent(cvd); 3951 3952 /* 3953 * We don't set tvd until now because the parent we just removed 3954 * may have been the previous top-level vdev. 3955 */ 3956 tvd = cvd->vdev_top; 3957 ASSERT(tvd->vdev_parent == rvd); 3958 3959 /* 3960 * Reevaluate the parent vdev state. 3961 */ 3962 vdev_propagate_state(cvd); 3963 3964 /* 3965 * If the 'autoexpand' property is set on the pool then automatically 3966 * try to expand the size of the pool. For example if the device we 3967 * just detached was smaller than the others, it may be possible to 3968 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3969 * first so that we can obtain the updated sizes of the leaf vdevs. 3970 */ 3971 if (spa->spa_autoexpand) { 3972 vdev_reopen(tvd); 3973 vdev_expand(tvd, txg); 3974 } 3975 3976 vdev_config_dirty(tvd); 3977 3978 /* 3979 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3980 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3981 * But first make sure we're not on any *other* txg's DTL list, to 3982 * prevent vd from being accessed after it's freed. 3983 */ 3984 vdpath = spa_strdup(vd->vdev_path); 3985 for (int t = 0; t < TXG_SIZE; t++) 3986 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3987 vd->vdev_detached = B_TRUE; 3988 vdev_dirty(tvd, VDD_DTL, vd, txg); 3989 3990 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3991 3992 error = spa_vdev_exit(spa, vd, txg, 0); 3993 3994 spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), 3995 "vdev=%s", vdpath); 3996 spa_strfree(vdpath); 3997 3998 /* 3999 * If this was the removal of the original device in a hot spare vdev, 4000 * then we want to go through and remove the device from the hot spare 4001 * list of every other pool. 
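 *
 * (Hot spares may be shared between pools, so the same spare guid can
 * appear in several pools' spare lists; each of those entries has to be
 * cleaned up, not just the one belonging to this pool.)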
4002 */ 4003 if (unspare) { 4004 spa_t *myspa = spa; 4005 spa = NULL; 4006 mutex_enter(&spa_namespace_lock); 4007 while ((spa = spa_next(spa)) != NULL) { 4008 if (spa->spa_state != POOL_STATE_ACTIVE) 4009 continue; 4010 if (spa == myspa) 4011 continue; 4012 spa_open_ref(spa, FTAG); 4013 mutex_exit(&spa_namespace_lock); 4014 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4015 mutex_enter(&spa_namespace_lock); 4016 spa_close(spa, FTAG); 4017 } 4018 mutex_exit(&spa_namespace_lock); 4019 } 4020 4021 return (error); 4022 } 4023 4024 /* 4025 * Split a set of devices from their mirrors, and create a new pool from them. 4026 */ 4027 int 4028 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4029 nvlist_t *props, boolean_t exp) 4030 { 4031 int error = 0; 4032 uint64_t txg, *glist; 4033 spa_t *newspa; 4034 uint_t c, children, lastlog; 4035 nvlist_t **child, *nvl, *tmp; 4036 dmu_tx_t *tx; 4037 char *altroot = NULL; 4038 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4039 boolean_t activate_slog; 4040 4041 if (!spa_writeable(spa)) 4042 return (EROFS); 4043 4044 txg = spa_vdev_enter(spa); 4045 4046 /* clear the log and flush everything up to now */ 4047 activate_slog = spa_passivate_log(spa); 4048 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4049 error = spa_offline_log(spa); 4050 txg = spa_vdev_config_enter(spa); 4051 4052 if (activate_slog) 4053 spa_activate_log(spa); 4054 4055 if (error != 0) 4056 return (spa_vdev_exit(spa, NULL, txg, error)); 4057 4058 /* check new spa name before going any further */ 4059 if (spa_lookup(newname) != NULL) 4060 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4061 4062 /* 4063 * scan through all the children to ensure they're all mirrors 4064 */ 4065 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4066 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4067 &children) != 0) 4068 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4069 4070 /* first, check to ensure we've got the right child count */ 4071 rvd = spa->spa_root_vdev; 4072 lastlog = 0; 4073 for (c = 0; c < rvd->vdev_children; c++) { 4074 vdev_t *vd = rvd->vdev_child[c]; 4075 4076 /* don't count the holes & logs as children */ 4077 if (vd->vdev_islog || vd->vdev_ishole) { 4078 if (lastlog == 0) 4079 lastlog = c; 4080 continue; 4081 } 4082 4083 lastlog = 0; 4084 } 4085 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4086 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4087 4088 /* next, ensure no spare or cache devices are part of the split */ 4089 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4090 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4091 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4092 4093 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4094 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4095 4096 /* then, loop over each vdev and validate it */ 4097 for (c = 0; c < children; c++) { 4098 uint64_t is_hole = 0; 4099 4100 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4101 &is_hole); 4102 4103 if (is_hole != 0) { 4104 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4105 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4106 continue; 4107 } else { 4108 error = EINVAL; 4109 break; 4110 } 4111 } 4112 4113 /* which disk is going to be split? 
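 * Each child nvlist in the caller-supplied config names it via
 * ZPOOL_CONFIG_GUID, which is resolved against the live vdev tree just
 * below.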
*/ 4114 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4115 &glist[c]) != 0) { 4116 error = EINVAL; 4117 break; 4118 } 4119 4120 /* look it up in the spa */ 4121 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4122 if (vml[c] == NULL) { 4123 error = ENODEV; 4124 break; 4125 } 4126 4127 /* make sure there's nothing stopping the split */ 4128 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4129 vml[c]->vdev_islog || 4130 vml[c]->vdev_ishole || 4131 vml[c]->vdev_isspare || 4132 vml[c]->vdev_isl2cache || 4133 !vdev_writeable(vml[c]) || 4134 vml[c]->vdev_children != 0 || 4135 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4136 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4137 error = EINVAL; 4138 break; 4139 } 4140 4141 if (vdev_dtl_required(vml[c])) { 4142 error = EBUSY; 4143 break; 4144 } 4145 4146 /* we need certain info from the top level */ 4147 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4148 vml[c]->vdev_top->vdev_ms_array) == 0); 4149 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4150 vml[c]->vdev_top->vdev_ms_shift) == 0); 4151 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4152 vml[c]->vdev_top->vdev_asize) == 0); 4153 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4154 vml[c]->vdev_top->vdev_ashift) == 0); 4155 } 4156 4157 if (error != 0) { 4158 kmem_free(vml, children * sizeof (vdev_t *)); 4159 kmem_free(glist, children * sizeof (uint64_t)); 4160 return (spa_vdev_exit(spa, NULL, txg, error)); 4161 } 4162 4163 /* stop writers from using the disks */ 4164 for (c = 0; c < children; c++) { 4165 if (vml[c] != NULL) 4166 vml[c]->vdev_offline = B_TRUE; 4167 } 4168 vdev_reopen(spa->spa_root_vdev); 4169 4170 /* 4171 * Temporarily record the splitting vdevs in the spa config. This 4172 * will disappear once the config is regenerated. 4173 */ 4174 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4175 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4176 glist, children) == 0); 4177 kmem_free(glist, children * sizeof (uint64_t)); 4178 4179 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4180 nvl) == 0); 4181 spa->spa_config_splitting = nvl; 4182 vdev_config_dirty(spa->spa_root_vdev); 4183 4184 /* configure and create the new pool */ 4185 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4186 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4187 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4188 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4189 spa_version(spa)) == 0); 4190 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4191 spa->spa_config_txg) == 0); 4192 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4193 spa_generate_guid(NULL)) == 0); 4194 (void) nvlist_lookup_string(props, 4195 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4196 4197 /* add the new pool to the namespace */ 4198 newspa = spa_add(newname, config, altroot); 4199 newspa->spa_config_txg = spa->spa_config_txg; 4200 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4201 4202 /* release the spa config lock, retaining the namespace lock */ 4203 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4204 4205 if (zio_injection_enabled) 4206 zio_handle_panic_injection(spa, FTAG, 1); 4207 4208 spa_activate(newspa, spa_mode_global); 4209 spa_async_suspend(newspa); 4210 4211 /* create the new pool from the disks of the original pool */ 4212 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4213 if (error) 4214 goto out; 4215 4216 /* if that worked, generate a real config for the new pool */ 4217 if (newspa->spa_root_vdev != NULL) { 4218 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4219 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4220 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4221 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4222 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4223 B_TRUE)); 4224 } 4225 4226 /* set the props */ 4227 if (props != NULL) { 4228 spa_configfile_set(newspa, props, B_FALSE); 4229 error = spa_prop_set(newspa, props); 4230 if (error) 4231 goto out; 4232 } 4233 4234 /* flush everything */ 4235 txg = spa_vdev_config_enter(newspa); 4236 vdev_config_dirty(newspa->spa_root_vdev); 4237 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4238 4239 if (zio_injection_enabled) 4240 zio_handle_panic_injection(spa, FTAG, 2); 4241 4242 spa_async_resume(newspa); 4243 4244 /* finally, update the original pool's config */ 4245 txg = spa_vdev_config_enter(spa); 4246 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4247 error = dmu_tx_assign(tx, TXG_WAIT); 4248 if (error != 0) 4249 dmu_tx_abort(tx); 4250 for (c = 0; c < children; c++) { 4251 if (vml[c] != NULL) { 4252 vdev_split(vml[c]); 4253 if (error == 0) 4254 spa_history_internal_log(LOG_POOL_VDEV_DETACH, 4255 spa, tx, CRED(), "vdev=%s", 4256 vml[c]->vdev_path); 4257 vdev_free(vml[c]); 4258 } 4259 } 4260 vdev_config_dirty(spa->spa_root_vdev); 4261 spa->spa_config_splitting = NULL; 4262 nvlist_free(nvl); 4263 if (error == 0) 4264 dmu_tx_commit(tx); 4265 (void) spa_vdev_exit(spa, NULL, txg, 0); 4266 4267 if (zio_injection_enabled) 4268 zio_handle_panic_injection(spa, FTAG, 3); 4269 4270 /* split is complete; log a history record */ 4271 spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), 4272 "split new pool %s from pool %s", newname, spa_name(spa)); 4273 4274 kmem_free(vml, children * sizeof (vdev_t *)); 4275 4276 /* if we're not going to mount the filesystems in userland, export */ 4277 if (exp) 4278 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4279 B_FALSE, B_FALSE); 4280 4281 return (error); 4282 4283 out: 4284 spa_unload(newspa); 4285 spa_deactivate(newspa); 4286 spa_remove(newspa); 4287 4288 txg = spa_vdev_config_enter(spa); 4289 nvlist_free(spa->spa_config_splitting); 4290 spa->spa_config_splitting = NULL; 4291 (void) spa_vdev_exit(spa, NULL, txg, error); 4292 4293 kmem_free(vml, children * sizeof (vdev_t 
*)); 4294 return (error); 4295 } 4296 4297 static nvlist_t * 4298 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4299 { 4300 for (int i = 0; i < count; i++) { 4301 uint64_t guid; 4302 4303 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4304 &guid) == 0); 4305 4306 if (guid == target_guid) 4307 return (nvpp[i]); 4308 } 4309 4310 return (NULL); 4311 } 4312 4313 static void 4314 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4315 nvlist_t *dev_to_remove) 4316 { 4317 nvlist_t **newdev = NULL; 4318 4319 if (count > 1) 4320 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4321 4322 for (int i = 0, j = 0; i < count; i++) { 4323 if (dev[i] == dev_to_remove) 4324 continue; 4325 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4326 } 4327 4328 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4329 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4330 4331 for (int i = 0; i < count - 1; i++) 4332 nvlist_free(newdev[i]); 4333 4334 if (count > 1) 4335 kmem_free(newdev, (count - 1) * sizeof (void *)); 4336 } 4337 4338 /* 4339 * Removing a device from the vdev namespace requires several steps 4340 * and can take a significant amount of time. As a result we use 4341 * the spa_vdev_config_[enter/exit] functions which allow us to 4342 * grab and release the spa_config_lock while still holding the namespace 4343 * lock. During each step the configuration is synced out. 4344 */ 4345 4346 /* 4347 * Evacuate the device. 4348 */ 4349 int 4350 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4351 { 4352 int error = 0; 4353 uint64_t txg; 4354 4355 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4356 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4357 ASSERT(vd == vd->vdev_top); 4358 4359 /* 4360 * Evacuate the device. We don't hold the config lock as writer 4361 * since we need to do I/O but we do keep the 4362 * spa_namespace_lock held. Once this completes the device 4363 * should no longer have any blocks allocated on it. 4364 */ 4365 if (vd->vdev_islog) { 4366 error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 4367 NULL, DS_FIND_CHILDREN); 4368 } else { 4369 error = ENOTSUP; /* until we have bp rewrite */ 4370 } 4371 4372 txg_wait_synced(spa_get_dsl(spa), 0); 4373 4374 if (error) 4375 return (error); 4376 4377 /* 4378 * The evacuation succeeded. Remove any remaining MOS metadata 4379 * associated with this vdev, and wait for these changes to sync. 4380 */ 4381 txg = spa_vdev_config_enter(spa); 4382 vd->vdev_removing = B_TRUE; 4383 vdev_dirty(vd, 0, NULL, txg); 4384 vdev_config_dirty(vd); 4385 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4386 4387 return (0); 4388 } 4389 4390 /* 4391 * Complete the removal by cleaning up the namespace. 
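 *
 * A sketch of the overall removal sequence for a top-level log device
 * (illustrative only, with the config-lock dance omitted; see
 * spa_vdev_remove() below for the real caller):
 *
 *	metaslab_group_passivate(vd->vdev_mg);		(stop new allocations)
 *	error = spa_vdev_remove_evacuate(spa, vd);	(drain remaining blocks)
 *	if (error == 0)
 *		spa_vdev_remove_from_namespace(spa, vd);	(drop the vdev)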
4392 */ 4393 void 4394 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4395 { 4396 vdev_t *rvd = spa->spa_root_vdev; 4397 uint64_t id = vd->vdev_id; 4398 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4399 4400 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4401 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4402 ASSERT(vd == vd->vdev_top); 4403 4404 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4405 4406 if (list_link_active(&vd->vdev_state_dirty_node)) 4407 vdev_state_clean(vd); 4408 if (list_link_active(&vd->vdev_config_dirty_node)) 4409 vdev_config_clean(vd); 4410 4411 vdev_free(vd); 4412 4413 if (last_vdev) { 4414 vdev_compact_children(rvd); 4415 } else { 4416 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4417 vdev_add_child(rvd, vd); 4418 } 4419 vdev_config_dirty(rvd); 4420 4421 /* 4422 * Reassess the health of our root vdev. 4423 */ 4424 vdev_reopen(rvd); 4425 } 4426 4427 /* 4428 * Remove a device from the pool. Currently, this supports removing only hot 4429 * spares, slogs, and level 2 ARC devices. 4430 */ 4431 int 4432 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4433 { 4434 vdev_t *vd; 4435 metaslab_group_t *mg; 4436 nvlist_t **spares, **l2cache, *nv; 4437 uint64_t txg = 0; 4438 uint_t nspares, nl2cache; 4439 int error = 0; 4440 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4441 4442 if (!locked) 4443 txg = spa_vdev_enter(spa); 4444 4445 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4446 4447 if (spa->spa_spares.sav_vdevs != NULL && 4448 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4449 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4450 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4451 /* 4452 * Only remove the hot spare if it's not currently in use 4453 * in this pool. 4454 */ 4455 if (vd == NULL || unspare) { 4456 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4457 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4458 spa_load_spares(spa); 4459 spa->spa_spares.sav_sync = B_TRUE; 4460 } else { 4461 error = EBUSY; 4462 } 4463 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4464 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4465 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4466 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4467 /* 4468 * Cache devices can always be removed. 4469 */ 4470 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4471 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4472 spa_load_l2cache(spa); 4473 spa->spa_l2cache.sav_sync = B_TRUE; 4474 } else if (vd != NULL && vd->vdev_islog) { 4475 ASSERT(!locked); 4476 ASSERT(vd == vd->vdev_top); 4477 4478 /* 4479 * XXX - Once we have bp-rewrite this should 4480 * become the common case. 4481 */ 4482 4483 mg = vd->vdev_mg; 4484 4485 /* 4486 * Stop allocating from this vdev. 4487 */ 4488 metaslab_group_passivate(mg); 4489 4490 /* 4491 * Wait for the youngest allocations and frees to sync, 4492 * and then wait for the deferral of those frees to finish. 4493 */ 4494 spa_vdev_config_exit(spa, NULL, 4495 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4496 4497 /* 4498 * Attempt to evacuate the vdev. 4499 */ 4500 error = spa_vdev_remove_evacuate(spa, vd); 4501 4502 txg = spa_vdev_config_enter(spa); 4503 4504 /* 4505 * If we couldn't evacuate the vdev, unwind. 4506 */ 4507 if (error) { 4508 metaslab_group_activate(mg); 4509 return (spa_vdev_exit(spa, NULL, txg, error)); 4510 } 4511 4512 /* 4513 * Clean up the vdev namespace. 
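 * At this point the config lock has been reacquired as writer via
 * spa_vdev_config_enter() above, which spa_vdev_remove_from_namespace()
 * asserts.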
4514 */
4515 spa_vdev_remove_from_namespace(spa, vd);
4516
4517 } else if (vd != NULL) {
4518 /*
4519 * Normal vdevs cannot be removed (yet).
4520 */
4521 error = ENOTSUP;
4522 } else {
4523 /*
4524 * There is no vdev of any kind with the specified guid.
4525 */
4526 error = ENOENT;
4527 }
4528
4529 if (!locked)
4530 return (spa_vdev_exit(spa, NULL, txg, error));
4531
4532 return (error);
4533 }
4534
4535 /*
4536 * Find any device that's done replacing, or a vdev marked 'unspare' that's
4537 * currently spared, so we can detach it.
4538 */
4539 static vdev_t *
4540 spa_vdev_resilver_done_hunt(vdev_t *vd)
4541 {
4542 vdev_t *newvd, *oldvd;
4543
4544 for (int c = 0; c < vd->vdev_children; c++) {
4545 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4546 if (oldvd != NULL)
4547 return (oldvd);
4548 }
4549
4550 /*
4551 * Check for a completed replacement.
4552 */
4553 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
4554 oldvd = vd->vdev_child[0];
4555 newvd = vd->vdev_child[1];
4556
4557 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4558 !vdev_dtl_required(oldvd))
4559 return (oldvd);
4560 }
4561
4562 /*
4563 * Check for a completed resilver with the 'unspare' flag set.
4564 */
4565 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
4566 newvd = vd->vdev_child[0];
4567 oldvd = vd->vdev_child[1];
4568
4569 if (newvd->vdev_unspare &&
4570 vdev_dtl_empty(newvd, DTL_MISSING) &&
4571 !vdev_dtl_required(oldvd)) {
4572 newvd->vdev_unspare = 0;
4573 return (oldvd);
4574 }
4575 }
4576
4577 return (NULL);
4578 }
4579
4580 static void
4581 spa_vdev_resilver_done(spa_t *spa)
4582 {
4583 vdev_t *vd, *pvd, *ppvd;
4584 uint64_t guid, sguid, pguid, ppguid;
4585
4586 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4587
4588 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
4589 pvd = vd->vdev_parent;
4590 ppvd = pvd->vdev_parent;
4591 guid = vd->vdev_guid;
4592 pguid = pvd->vdev_guid;
4593 ppguid = ppvd->vdev_guid;
4594 sguid = 0;
4595 /*
4596 * If we have just finished replacing a hot spared device, then
4597 * we need to detach the parent's first child (the original hot
4598 * spare) as well.
4599 */
4600 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
4601 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
4602 ASSERT(ppvd->vdev_children == 2);
4603 sguid = ppvd->vdev_child[1]->vdev_guid;
4604 }
4605 spa_config_exit(spa, SCL_ALL, FTAG);
4606 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
4607 return;
4608 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
4609 return;
4610 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4611 }
4612
4613 spa_config_exit(spa, SCL_ALL, FTAG);
4614 }
4615
4616 /*
4617 * Update the stored path or FRU for this vdev.
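 *
 * The public wrappers below are thin veneers over spa_vdev_set_common().
 * An illustrative call (sketch only; the device path is hypothetical):
 *
 *	(void) spa_vdev_setpath(spa, guid, "/dev/dsk/c2t0d0s0");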
4618 */ 4619 int 4620 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4621 boolean_t ispath) 4622 { 4623 vdev_t *vd; 4624 4625 spa_vdev_state_enter(spa, SCL_ALL); 4626 4627 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4628 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 4629 4630 if (!vd->vdev_ops->vdev_op_leaf) 4631 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 4632 4633 if (ispath) { 4634 spa_strfree(vd->vdev_path); 4635 vd->vdev_path = spa_strdup(value); 4636 } else { 4637 if (vd->vdev_fru != NULL) 4638 spa_strfree(vd->vdev_fru); 4639 vd->vdev_fru = spa_strdup(value); 4640 } 4641 4642 return (spa_vdev_state_exit(spa, vd, 0)); 4643 } 4644 4645 int 4646 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 4647 { 4648 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 4649 } 4650 4651 int 4652 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 4653 { 4654 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 4655 } 4656 4657 /* 4658 * ========================================================================== 4659 * SPA Scrubbing 4660 * ========================================================================== 4661 */ 4662 4663 int 4664 spa_scrub(spa_t *spa, pool_scrub_type_t type) 4665 { 4666 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4667 4668 if ((uint_t)type >= POOL_SCRUB_TYPES) 4669 return (ENOTSUP); 4670 4671 /* 4672 * If a resilver was requested, but there is no DTL on a 4673 * writeable leaf device, we have nothing to do. 4674 */ 4675 if (type == POOL_SCRUB_RESILVER && 4676 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4677 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 4678 return (0); 4679 } 4680 4681 if (type == POOL_SCRUB_EVERYTHING && 4682 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 4683 spa->spa_dsl_pool->dp_scrub_isresilver) 4684 return (EBUSY); 4685 4686 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 4687 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 4688 } else if (type == POOL_SCRUB_NONE) { 4689 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 4690 } else { 4691 return (EINVAL); 4692 } 4693 } 4694 4695 /* 4696 * ========================================================================== 4697 * SPA async task processing 4698 * ========================================================================== 4699 */ 4700 4701 static void 4702 spa_async_remove(spa_t *spa, vdev_t *vd) 4703 { 4704 if (vd->vdev_remove_wanted) { 4705 vd->vdev_remove_wanted = 0; 4706 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 4707 4708 /* 4709 * We want to clear the stats, but we don't want to do a full 4710 * vdev_clear() as that will cause us to throw away 4711 * degraded/faulted state as well as attempt to reopen the 4712 * device, all of which is a waste. 
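 * (vdev_remove_wanted itself is set elsewhere, e.g. when I/O to the
 * device indicates it has gone away, paired with a
 * spa_async_request(spa, SPA_ASYNC_REMOVE) to schedule this handler.)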
4713 */ 4714 vd->vdev_stat.vs_read_errors = 0; 4715 vd->vdev_stat.vs_write_errors = 0; 4716 vd->vdev_stat.vs_checksum_errors = 0; 4717 4718 vdev_state_dirty(vd->vdev_top); 4719 } 4720 4721 for (int c = 0; c < vd->vdev_children; c++) 4722 spa_async_remove(spa, vd->vdev_child[c]); 4723 } 4724 4725 static void 4726 spa_async_probe(spa_t *spa, vdev_t *vd) 4727 { 4728 if (vd->vdev_probe_wanted) { 4729 vd->vdev_probe_wanted = 0; 4730 vdev_reopen(vd); /* vdev_open() does the actual probe */ 4731 } 4732 4733 for (int c = 0; c < vd->vdev_children; c++) 4734 spa_async_probe(spa, vd->vdev_child[c]); 4735 } 4736 4737 static void 4738 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 4739 { 4740 sysevent_id_t eid; 4741 nvlist_t *attr; 4742 char *physpath; 4743 4744 if (!spa->spa_autoexpand) 4745 return; 4746 4747 for (int c = 0; c < vd->vdev_children; c++) { 4748 vdev_t *cvd = vd->vdev_child[c]; 4749 spa_async_autoexpand(spa, cvd); 4750 } 4751 4752 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 4753 return; 4754 4755 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 4756 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 4757 4758 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4759 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 4760 4761 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 4762 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 4763 4764 nvlist_free(attr); 4765 kmem_free(physpath, MAXPATHLEN); 4766 } 4767 4768 static void 4769 spa_async_thread(spa_t *spa) 4770 { 4771 int tasks; 4772 4773 ASSERT(spa->spa_sync_on); 4774 4775 mutex_enter(&spa->spa_async_lock); 4776 tasks = spa->spa_async_tasks; 4777 spa->spa_async_tasks = 0; 4778 mutex_exit(&spa->spa_async_lock); 4779 4780 /* 4781 * See if the config needs to be updated. 4782 */ 4783 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 4784 uint64_t old_space, new_space; 4785 4786 mutex_enter(&spa_namespace_lock); 4787 old_space = metaslab_class_get_space(spa_normal_class(spa)); 4788 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4789 new_space = metaslab_class_get_space(spa_normal_class(spa)); 4790 mutex_exit(&spa_namespace_lock); 4791 4792 /* 4793 * If the pool grew as a result of the config update, 4794 * then log an internal history event. 4795 */ 4796 if (new_space != old_space) { 4797 spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 4798 spa, NULL, CRED(), 4799 "pool '%s' size: %llu(+%llu)", 4800 spa_name(spa), new_space, new_space - old_space); 4801 } 4802 } 4803 4804 /* 4805 * See if any devices need to be marked REMOVED. 4806 */ 4807 if (tasks & SPA_ASYNC_REMOVE) { 4808 spa_vdev_state_enter(spa, SCL_NONE); 4809 spa_async_remove(spa, spa->spa_root_vdev); 4810 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 4811 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 4812 for (int i = 0; i < spa->spa_spares.sav_count; i++) 4813 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 4814 (void) spa_vdev_state_exit(spa, NULL, 0); 4815 } 4816 4817 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 4818 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4819 spa_async_autoexpand(spa, spa->spa_root_vdev); 4820 spa_config_exit(spa, SCL_CONFIG, FTAG); 4821 } 4822 4823 /* 4824 * See if any devices need to be probed. 4825 */ 4826 if (tasks & SPA_ASYNC_PROBE) { 4827 spa_vdev_state_enter(spa, SCL_NONE); 4828 spa_async_probe(spa, spa->spa_root_vdev); 4829 (void) spa_vdev_state_exit(spa, NULL, 0); 4830 } 4831 4832 /* 4833 * If any devices are done replacing, detach them. 
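 * (spa_vdev_resilver_done() re-checks the DTLs itself and only detaches
 * vdevs whose copy is complete, so handling this task is safe even if
 * another resilver has started since it was queued.)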
4834 */ 4835 if (tasks & SPA_ASYNC_RESILVER_DONE) 4836 spa_vdev_resilver_done(spa); 4837 4838 /* 4839 * Kick off a resilver. 4840 */ 4841 if (tasks & SPA_ASYNC_RESILVER) 4842 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 4843 4844 /* 4845 * Let the world know that we're done. 4846 */ 4847 mutex_enter(&spa->spa_async_lock); 4848 spa->spa_async_thread = NULL; 4849 cv_broadcast(&spa->spa_async_cv); 4850 mutex_exit(&spa->spa_async_lock); 4851 thread_exit(); 4852 } 4853 4854 void 4855 spa_async_suspend(spa_t *spa) 4856 { 4857 mutex_enter(&spa->spa_async_lock); 4858 spa->spa_async_suspended++; 4859 while (spa->spa_async_thread != NULL) 4860 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 4861 mutex_exit(&spa->spa_async_lock); 4862 } 4863 4864 void 4865 spa_async_resume(spa_t *spa) 4866 { 4867 mutex_enter(&spa->spa_async_lock); 4868 ASSERT(spa->spa_async_suspended != 0); 4869 spa->spa_async_suspended--; 4870 mutex_exit(&spa->spa_async_lock); 4871 } 4872 4873 static void 4874 spa_async_dispatch(spa_t *spa) 4875 { 4876 mutex_enter(&spa->spa_async_lock); 4877 if (spa->spa_async_tasks && !spa->spa_async_suspended && 4878 spa->spa_async_thread == NULL && 4879 rootdir != NULL && !vn_is_readonly(rootdir)) 4880 spa->spa_async_thread = thread_create(NULL, 0, 4881 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 4882 mutex_exit(&spa->spa_async_lock); 4883 } 4884 4885 void 4886 spa_async_request(spa_t *spa, int task) 4887 { 4888 mutex_enter(&spa->spa_async_lock); 4889 spa->spa_async_tasks |= task; 4890 mutex_exit(&spa->spa_async_lock); 4891 } 4892 4893 /* 4894 * ========================================================================== 4895 * SPA syncing routines 4896 * ========================================================================== 4897 */ 4898 static void 4899 spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg) 4900 { 4901 blkptr_t blk; 4902 uint64_t itor = 0; 4903 uint8_t c = 1; 4904 4905 while (bplist_iterate(bpl, &itor, &blk) == 0) { 4906 ASSERT(blk.blk_birth < txg); 4907 zio_free(spa, txg, &blk); 4908 } 4909 4910 bplist_vacate(bpl, tx); 4911 4912 /* 4913 * Pre-dirty the first block so we sync to convergence faster. 4914 * (Usually only the first block is needed.) 4915 */ 4916 dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx); 4917 } 4918 4919 static void 4920 spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 4921 { 4922 zio_t *zio = arg; 4923 4924 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 4925 zio->io_flags)); 4926 } 4927 4928 static void 4929 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 4930 { 4931 char *packed = NULL; 4932 size_t bufsize; 4933 size_t nvsize = 0; 4934 dmu_buf_t *db; 4935 4936 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 4937 4938 /* 4939 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 4940 * information. This avoids the dbuf_will_dirty() path and 4941 * saves us a pre-read to get data we don't actually care about. 
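 * For example, assuming SPA_CONFIG_BLOCKSIZE is 16K, a 5,000-byte packed
 * nvlist is rounded up by P2ROUNDUP() to a single 16384-byte buffer, and
 * the 11,384 trailing bytes are bzero()ed before the write below.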
4942 */ 4943 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 4944 packed = kmem_alloc(bufsize, KM_SLEEP); 4945 4946 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 4947 KM_SLEEP) == 0); 4948 bzero(packed + nvsize, bufsize - nvsize); 4949 4950 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 4951 4952 kmem_free(packed, bufsize); 4953 4954 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4955 dmu_buf_will_dirty(db, tx); 4956 *(uint64_t *)db->db_data = nvsize; 4957 dmu_buf_rele(db, FTAG); 4958 } 4959 4960 static void 4961 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 4962 const char *config, const char *entry) 4963 { 4964 nvlist_t *nvroot; 4965 nvlist_t **list; 4966 int i; 4967 4968 if (!sav->sav_sync) 4969 return; 4970 4971 /* 4972 * Update the MOS nvlist describing the list of available devices. 4973 * spa_validate_aux() will have already made sure this nvlist is 4974 * valid and the vdevs are labeled appropriately. 4975 */ 4976 if (sav->sav_object == 0) { 4977 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 4978 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 4979 sizeof (uint64_t), tx); 4980 VERIFY(zap_update(spa->spa_meta_objset, 4981 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 4982 &sav->sav_object, tx) == 0); 4983 } 4984 4985 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4986 if (sav->sav_count == 0) { 4987 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 4988 } else { 4989 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 4990 for (i = 0; i < sav->sav_count; i++) 4991 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 4992 B_FALSE, B_FALSE, B_TRUE); 4993 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 4994 sav->sav_count) == 0); 4995 for (i = 0; i < sav->sav_count; i++) 4996 nvlist_free(list[i]); 4997 kmem_free(list, sav->sav_count * sizeof (void *)); 4998 } 4999 5000 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5001 nvlist_free(nvroot); 5002 5003 sav->sav_sync = B_FALSE; 5004 } 5005 5006 static void 5007 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5008 { 5009 nvlist_t *config; 5010 5011 if (list_is_empty(&spa->spa_config_dirty_list)) 5012 return; 5013 5014 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5015 5016 config = spa_config_generate(spa, spa->spa_root_vdev, 5017 dmu_tx_get_txg(tx), B_FALSE); 5018 5019 spa_config_exit(spa, SCL_STATE, FTAG); 5020 5021 if (spa->spa_config_syncing) 5022 nvlist_free(spa->spa_config_syncing); 5023 spa->spa_config_syncing = config; 5024 5025 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5026 } 5027 5028 /* 5029 * Set zpool properties. 5030 */ 5031 static void 5032 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 5033 { 5034 spa_t *spa = arg1; 5035 objset_t *mos = spa->spa_meta_objset; 5036 nvlist_t *nvp = arg2; 5037 nvpair_t *elem; 5038 uint64_t intval; 5039 char *strval; 5040 zpool_prop_t prop; 5041 const char *propname; 5042 zprop_type_t proptype; 5043 5044 mutex_enter(&spa->spa_props_lock); 5045 5046 elem = NULL; 5047 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5048 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5049 case ZPOOL_PROP_VERSION: 5050 /* 5051 * Only set version for non-zpool-creation cases 5052 * (set/import). spa_create() needs special care 5053 * for version setting. 
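 * (Pool creation syncs its first transaction group as TXG_INITIAL,
 * which is what the tx_txg check below keys off of.)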
5054 */
5055 if (tx->tx_txg != TXG_INITIAL) {
5056 VERIFY(nvpair_value_uint64(elem,
5057 &intval) == 0);
5058 ASSERT(intval <= SPA_VERSION);
5059 ASSERT(intval >= spa_version(spa));
5060 spa->spa_uberblock.ub_version = intval;
5061 vdev_config_dirty(spa->spa_root_vdev);
5062 }
5063 break;
5064
5065 case ZPOOL_PROP_ALTROOT:
5066 /*
5067 * 'altroot' is a non-persistent property. It should
5068 * have been set temporarily at creation or import time.
5069 */
5070 ASSERT(spa->spa_root != NULL);
5071 break;
5072
5073 case ZPOOL_PROP_CACHEFILE:
5074 /*
5075 * 'cachefile' is also a non-persistent property.
5076 */
5077 break;
5078 default:
5079 /*
5080 * Set pool property values in the poolprops mos object.
5081 */
5082 if (spa->spa_pool_props_object == 0) {
5083 VERIFY((spa->spa_pool_props_object =
5084 zap_create(mos, DMU_OT_POOL_PROPS,
5085 DMU_OT_NONE, 0, tx)) > 0);
5086
5087 VERIFY(zap_update(mos,
5088 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5089 8, 1, &spa->spa_pool_props_object, tx)
5090 == 0);
5091 }
5092
5093 /* normalize the property name */
5094 propname = zpool_prop_to_name(prop);
5095 proptype = zpool_prop_get_type(prop);
5096
5097 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5098 ASSERT(proptype == PROP_TYPE_STRING);
5099 VERIFY(nvpair_value_string(elem, &strval) == 0);
5100 VERIFY(zap_update(mos,
5101 spa->spa_pool_props_object, propname,
5102 1, strlen(strval) + 1, strval, tx) == 0);
5103
5104 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5105 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5106
5107 if (proptype == PROP_TYPE_INDEX) {
5108 const char *unused;
5109 VERIFY(zpool_prop_index_to_string(
5110 prop, intval, &unused) == 0);
5111 }
5112 VERIFY(zap_update(mos,
5113 spa->spa_pool_props_object, propname,
5114 8, 1, &intval, tx) == 0);
5115 } else {
5116 ASSERT(0); /* not allowed */
5117 }
5118
5119 switch (prop) {
5120 case ZPOOL_PROP_DELEGATION:
5121 spa->spa_delegation = intval;
5122 break;
5123 case ZPOOL_PROP_BOOTFS:
5124 spa->spa_bootfs = intval;
5125 break;
5126 case ZPOOL_PROP_FAILUREMODE:
5127 spa->spa_failmode = intval;
5128 break;
5129 case ZPOOL_PROP_AUTOEXPAND:
5130 spa->spa_autoexpand = intval;
5131 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5132 break;
5133 case ZPOOL_PROP_DEDUPDITTO:
5134 spa->spa_dedup_ditto = intval;
5135 break;
5136 default:
5137 break;
5138 }
5139 }
5140
5141 /* log internal history if this is not a zpool create */
5142 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5143 tx->tx_txg != TXG_INITIAL) {
5144 spa_history_internal_log(LOG_POOL_PROPSET,
5145 spa, tx, cr, "%s %lld %s",
5146 nvpair_name(elem), intval, spa_name(spa));
5147 }
5148 }
5149
5150 mutex_exit(&spa->spa_props_lock);
5151 }
5152
5153 /*
5154 * Sync the specified transaction group. New blocks may be dirtied as
5155 * part of the process, so we iterate until it converges.
5156 */
5157 void
5158 spa_sync(spa_t *spa, uint64_t txg)
5159 {
5160 dsl_pool_t *dp = spa->spa_dsl_pool;
5161 objset_t *mos = spa->spa_meta_objset;
5162 bplist_t *defer_bpl = &spa->spa_deferred_bplist;
5163 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5164 vdev_t *rvd = spa->spa_root_vdev;
5165 vdev_t *vd;
5166 dmu_tx_t *tx;
5167 int error;
5168
5169 /*
5170 * Lock out configuration changes.
5171 */
5172 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5173
5174 spa->spa_syncing_txg = txg;
5175 spa->spa_sync_pass = 0;
5176
5177 /*
5178 * If there are any pending vdev state changes, convert them
5179 * into config changes that go out with this transaction group.
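 * For example, a leaf marked DEGRADED earlier in this txg moves from the
 * state-dirty list to the config-dirty list here, so the labels and
 * config object written later in this function reflect its new state.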
5180 */ 5181 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5182 while (list_head(&spa->spa_state_dirty_list) != NULL) { 5183 /* 5184 * We need the write lock here because, for aux vdevs, 5185 * calling vdev_config_dirty() modifies sav_config. 5186 * This is ugly and will become unnecessary when we 5187 * eliminate the aux vdev wart by integrating all vdevs 5188 * into the root vdev tree. 5189 */ 5190 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5191 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5192 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5193 vdev_state_clean(vd); 5194 vdev_config_dirty(vd); 5195 } 5196 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5197 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5198 } 5199 spa_config_exit(spa, SCL_STATE, FTAG); 5200 5201 VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); 5202 5203 tx = dmu_tx_create_assigned(dp, txg); 5204 5205 /* 5206 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5207 * set spa_deflate if we have no raid-z vdevs. 5208 */ 5209 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5210 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5211 int i; 5212 5213 for (i = 0; i < rvd->vdev_children; i++) { 5214 vd = rvd->vdev_child[i]; 5215 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 5216 break; 5217 } 5218 if (i == rvd->vdev_children) { 5219 spa->spa_deflate = TRUE; 5220 VERIFY(0 == zap_add(spa->spa_meta_objset, 5221 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5222 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 5223 } 5224 } 5225 5226 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5227 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5228 dsl_pool_create_origin(dp, tx); 5229 5230 /* Keeping the origin open increases spa_minref */ 5231 spa->spa_minref += 3; 5232 } 5233 5234 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5235 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5236 dsl_pool_upgrade_clones(dp, tx); 5237 } 5238 5239 /* 5240 * If anything has changed in this txg, push the deferred frees 5241 * from the previous txg. If not, leave them alone so that we 5242 * don't generate work on an otherwise idle system. 5243 */ 5244 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 5245 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 5246 !txg_list_empty(&dp->dp_sync_tasks, txg)) 5247 spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); 5248 5249 /* 5250 * Iterate to convergence. 5251 */ 5252 do { 5253 int pass = ++spa->spa_sync_pass; 5254 5255 spa_sync_config_object(spa, tx); 5256 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 5257 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 5258 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 5259 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 5260 spa_errlog_sync(spa, txg); 5261 dsl_pool_sync(dp, txg); 5262 5263 if (pass <= SYNC_PASS_DEFERRED_FREE) { 5264 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5265 bplist_sync(free_bpl, spa_sync_free, zio, tx); 5266 VERIFY(zio_wait(zio) == 0); 5267 } else { 5268 bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); 5269 } 5270 5271 ddt_sync(spa, txg); 5272 5273 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 5274 vdev_sync(vd, txg); 5275 5276 } while (dmu_objset_is_dirty(mos, txg)); 5277 5278 ASSERT(free_bpl->bpl_queue == NULL); 5279 5280 bplist_close(defer_bpl); 5281 5282 /* 5283 * Rewrite the vdev configuration (which includes the uberblock) 5284 * to commit the transaction group. 
5285 * 5286 * If there are no dirty vdevs, we sync the uberblock to a few 5287 * random top-level vdevs that are known to be visible in the 5288 * config cache (see spa_vdev_add() for a complete description). 5289 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5290 */ 5291 for (;;) { 5292 /* 5293 * We hold SCL_STATE to prevent vdev open/close/etc. 5294 * while we're attempting to write the vdev labels. 5295 */ 5296 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5297 5298 if (list_is_empty(&spa->spa_config_dirty_list)) { 5299 vdev_t *svd[SPA_DVAS_PER_BP]; 5300 int svdcount = 0; 5301 int children = rvd->vdev_children; 5302 int c0 = spa_get_random(children); 5303 5304 for (int c = 0; c < children; c++) { 5305 vd = rvd->vdev_child[(c0 + c) % children]; 5306 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 5307 continue; 5308 svd[svdcount++] = vd; 5309 if (svdcount == SPA_DVAS_PER_BP) 5310 break; 5311 } 5312 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 5313 if (error != 0) 5314 error = vdev_config_sync(svd, svdcount, txg, 5315 B_TRUE); 5316 } else { 5317 error = vdev_config_sync(rvd->vdev_child, 5318 rvd->vdev_children, txg, B_FALSE); 5319 if (error != 0) 5320 error = vdev_config_sync(rvd->vdev_child, 5321 rvd->vdev_children, txg, B_TRUE); 5322 } 5323 5324 spa_config_exit(spa, SCL_STATE, FTAG); 5325 5326 if (error == 0) 5327 break; 5328 zio_suspend(spa, NULL); 5329 zio_resume_wait(spa); 5330 } 5331 dmu_tx_commit(tx); 5332 5333 /* 5334 * Clear the dirty config list. 5335 */ 5336 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 5337 vdev_config_clean(vd); 5338 5339 /* 5340 * Now that the new config has synced transactionally, 5341 * let it become visible to the config cache. 5342 */ 5343 if (spa->spa_config_syncing != NULL) { 5344 spa_config_set(spa, spa->spa_config_syncing); 5345 spa->spa_config_txg = txg; 5346 spa->spa_config_syncing = NULL; 5347 } 5348 5349 spa->spa_ubsync = spa->spa_uberblock; 5350 5351 dsl_pool_sync_done(dp, txg); 5352 5353 /* 5354 * Update usable space statistics. 5355 */ 5356 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5357 vdev_sync_done(vd, txg); 5358 5359 spa_update_dspace(spa); 5360 5361 /* 5362 * It had better be the case that we didn't dirty anything 5363 * since vdev_config_sync(). 5364 */ 5365 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5366 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5367 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 5368 ASSERT(defer_bpl->bpl_queue == NULL); 5369 ASSERT(free_bpl->bpl_queue == NULL); 5370 5371 spa->spa_sync_pass = 0; 5372 5373 spa_config_exit(spa, SCL_CONFIG, FTAG); 5374 5375 spa_handle_ignored_writes(spa); 5376 5377 /* 5378 * If any async tasks have been requested, kick them off. 5379 */ 5380 spa_async_dispatch(spa); 5381 } 5382 5383 /* 5384 * Sync all pools. We don't want to hold the namespace lock across these 5385 * operations, so we take a reference on the spa_t and drop the lock during the 5386 * sync. 
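 * (The same open-ref/drop-lock pattern is used in spa_vdev_detach()
 * above when a shared spare has to be removed from every other active
 * pool.)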
5387 */ 5388 void 5389 spa_sync_allpools(void) 5390 { 5391 spa_t *spa = NULL; 5392 mutex_enter(&spa_namespace_lock); 5393 while ((spa = spa_next(spa)) != NULL) { 5394 if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) 5395 continue; 5396 spa_open_ref(spa, FTAG); 5397 mutex_exit(&spa_namespace_lock); 5398 txg_wait_synced(spa_get_dsl(spa), 0); 5399 mutex_enter(&spa_namespace_lock); 5400 spa_close(spa, FTAG); 5401 } 5402 mutex_exit(&spa_namespace_lock); 5403 } 5404 5405 /* 5406 * ========================================================================== 5407 * Miscellaneous routines 5408 * ========================================================================== 5409 */ 5410 5411 /* 5412 * Remove all pools in the system. 5413 */ 5414 void 5415 spa_evict_all(void) 5416 { 5417 spa_t *spa; 5418 5419 /* 5420 * Remove all cached state. All pools should be closed now, 5421 * so every spa in the AVL tree should be unreferenced. 5422 */ 5423 mutex_enter(&spa_namespace_lock); 5424 while ((spa = spa_next(NULL)) != NULL) { 5425 /* 5426 * Stop async tasks. The async thread may need to detach 5427 * a device that's been replaced, which requires grabbing 5428 * spa_namespace_lock, so we must drop it here. 5429 */ 5430 spa_open_ref(spa, FTAG); 5431 mutex_exit(&spa_namespace_lock); 5432 spa_async_suspend(spa); 5433 mutex_enter(&spa_namespace_lock); 5434 spa_close(spa, FTAG); 5435 5436 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5437 spa_unload(spa); 5438 spa_deactivate(spa); 5439 } 5440 spa_remove(spa); 5441 } 5442 mutex_exit(&spa_namespace_lock); 5443 } 5444 5445 vdev_t * 5446 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 5447 { 5448 vdev_t *vd; 5449 int i; 5450 5451 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 5452 return (vd); 5453 5454 if (aux) { 5455 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 5456 vd = spa->spa_l2cache.sav_vdevs[i]; 5457 if (vd->vdev_guid == guid) 5458 return (vd); 5459 } 5460 5461 for (i = 0; i < spa->spa_spares.sav_count; i++) { 5462 vd = spa->spa_spares.sav_vdevs[i]; 5463 if (vd->vdev_guid == guid) 5464 return (vd); 5465 } 5466 } 5467 5468 return (NULL); 5469 } 5470 5471 void 5472 spa_upgrade(spa_t *spa, uint64_t version) 5473 { 5474 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5475 5476 /* 5477 * This should only be called for a non-faulted pool, and since a 5478 * future version would result in an unopenable pool, this shouldn't be 5479 * possible. 5480 */ 5481 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 5482 ASSERT(version >= spa->spa_uberblock.ub_version); 5483 5484 spa->spa_uberblock.ub_version = version; 5485 vdev_config_dirty(spa->spa_root_vdev); 5486 5487 spa_config_exit(spa, SCL_ALL, FTAG); 5488 5489 txg_wait_synced(spa_get_dsl(spa), 0); 5490 } 5491 5492 boolean_t 5493 spa_has_spare(spa_t *spa, uint64_t guid) 5494 { 5495 int i; 5496 uint64_t spareguid; 5497 spa_aux_vdev_t *sav = &spa->spa_spares; 5498 5499 for (i = 0; i < sav->sav_count; i++) 5500 if (sav->sav_vdevs[i]->vdev_guid == guid) 5501 return (B_TRUE); 5502 5503 for (i = 0; i < sav->sav_npending; i++) { 5504 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 5505 &spareguid) == 0 && spareguid == guid) 5506 return (B_TRUE); 5507 } 5508 5509 return (B_FALSE); 5510 } 5511 5512 /* 5513 * Check if a pool has an active shared spare device. 
5514 * Note: the reference count of an active spare is 2, once as a listed spare and once as an active replacement.
5515 */
5516 static boolean_t
5517 spa_has_active_shared_spare(spa_t *spa)
5518 {
5519 int i, refcnt;
5520 uint64_t pool;
5521 spa_aux_vdev_t *sav = &spa->spa_spares;
5522
5523 for (i = 0; i < sav->sav_count; i++) {
5524 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
5525 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
5526 refcnt > 2)
5527 return (B_TRUE);
5528 }
5529
5530 return (B_FALSE);
5531 }
5532
5533 /*
5534 * Post a sysevent corresponding to the given event. The 'name' must be one of
5535 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
5536 * filled in from the spa and (optionally) the vdev. This doesn't do anything
5537 * in the userland libzpool, as we don't want consumers to misinterpret ztest
5538 * or zdb as real changes.
5539 */
5540 void
5541 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
5542 {
5543 #ifdef _KERNEL
5544 sysevent_t *ev;
5545 sysevent_attr_list_t *attr = NULL;
5546 sysevent_value_t value;
5547 sysevent_id_t eid;
5548
5549 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
5550 SE_SLEEP);
5551
5552 value.value_type = SE_DATA_TYPE_STRING;
5553 value.value.sv_string = spa_name(spa);
5554 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
5555 goto done;
5556
5557 value.value_type = SE_DATA_TYPE_UINT64;
5558 value.value.sv_uint64 = spa_guid(spa);
5559 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
5560 goto done;
5561
5562 if (vd) {
5563 value.value_type = SE_DATA_TYPE_UINT64;
5564 value.value.sv_uint64 = vd->vdev_guid;
5565 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
5566 SE_SLEEP) != 0)
5567 goto done;
5568
5569 if (vd->vdev_path) {
5570 value.value_type = SE_DATA_TYPE_STRING;
5571 value.value.sv_string = vd->vdev_path;
5572 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
5573 &value, SE_SLEEP) != 0)
5574 goto done;
5575 }
5576 }
5577
5578 if (sysevent_attach_attributes(ev, attr) != 0)
5579 goto done;
5580 attr = NULL;
5581
5582 (void) log_sysevent(ev, SE_SLEEP, &eid);
5583
5584 done:
5585 if (attr)
5586 sysevent_free_attr(attr);
5587 sysevent_free(ev);
5588 #endif
5589 }
5590