/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>

#ifdef _KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,		/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,		/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,		/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
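 * The value added under propname is itself an nvlist carrying both the
 * property value (ZPROP_VALUE, as either a string or a uint64) and its
 * source (ZPROP_SOURCE).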
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools before this version, freedir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more props to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    &propval)) == 0 &&
				    propval > SPA_OLD_MAXBLOCKSIZE) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check. For this kernel check, we merely
				 * check ASCII apart from DEL. Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool. This is done so that we can later
 * re-import a pool built from a clone of our own vdevs. We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty. Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool. We are also going to issue a
 * sysevent to update any watchers.
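 *
 * The online requirement is enforced by spa_change_guid_check() above, which
 * fails the sync task with ENXIO unless the root vdev reports a healthy state.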
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive. Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
		}

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error != 0)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
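 * The check recurses through all children, so calling it on a top-level vdev
 * covers that entire subtree.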
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

static void
spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
{
	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);

	vd->vdev_top_zap = mvd->vdev_top_zap;
	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else {
			if (mtvd->vdev_islog) {
				/*
				 * Load the slog device's state from the MOS
				 * config since it's possible that the label
				 * does not contain the most up-to-date
				 * information.
				 */
				vdev_load_log_state(tvd, mtvd);
				vdev_reopen(tvd);
			}

			/*
			 * Per-vdev ZAP info is stored exclusively in the MOS.
			 */
			spa_config_valid_zaps(tvd, mtvd);
		}
	}

	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;
	dsl_pool_t *dp = spa_get_dsl(spa);

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	for (int i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;
	spa_t *spa = zio->io_spa;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_inc_64(&sle->sle_meta_count);
		else
			atomic_inc_64(&sle->sle_data_count);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Maximum number of concurrent scrub i/os to create while verifying
 * a pool during import.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return (0);
	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set. However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);
	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
		return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);
	void *data = zio_data_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, data, size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error = 0;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	if (spa_load_verify_metadata) {
		error = traverse_pool(spa, spa->spa_verify_min_txg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    spa_load_verify_cb, rio);
	}

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val));
}

static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}

/*
 * Fix up config after a partly-completed split. This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
2017 * 2018 * This function determines what to do with that list: either rejoin 2019 * all the disks to the pool, or complete the splitting process. To attempt 2020 * the rejoin, each disk that is offlined is marked online again, and 2021 * we do a reopen() call. If the vdev label for every disk that was 2022 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2023 * then we call vdev_split() on each disk, and complete the split. 2024 * 2025 * Otherwise we leave the config alone, with all the vdevs in place in 2026 * the original pool. 2027 */ 2028 static void 2029 spa_try_repair(spa_t *spa, nvlist_t *config) 2030 { 2031 uint_t extracted; 2032 uint64_t *glist; 2033 uint_t i, gcount; 2034 nvlist_t *nvl; 2035 vdev_t **vd; 2036 boolean_t attempt_reopen; 2037 2038 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2039 return; 2040 2041 /* check that the config is complete */ 2042 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2043 &glist, &gcount) != 0) 2044 return; 2045 2046 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2047 2048 /* attempt to online all the vdevs & validate */ 2049 attempt_reopen = B_TRUE; 2050 for (i = 0; i < gcount; i++) { 2051 if (glist[i] == 0) /* vdev is hole */ 2052 continue; 2053 2054 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2055 if (vd[i] == NULL) { 2056 /* 2057 * Don't bother attempting to reopen the disks; 2058 * just do the split. 2059 */ 2060 attempt_reopen = B_FALSE; 2061 } else { 2062 /* attempt to re-online it */ 2063 vd[i]->vdev_offline = B_FALSE; 2064 } 2065 } 2066 2067 if (attempt_reopen) { 2068 vdev_reopen(spa->spa_root_vdev); 2069 2070 /* check each device to see what state it's in */ 2071 for (extracted = 0, i = 0; i < gcount; i++) { 2072 if (vd[i] != NULL && 2073 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2074 break; 2075 ++extracted; 2076 } 2077 } 2078 2079 /* 2080 * If every disk has been moved to the new pool, or if we never 2081 * even attempted to look at them, then we split them off for 2082 * good. 2083 */ 2084 if (!attempt_reopen || gcount == extracted) { 2085 for (i = 0; i < gcount; i++) 2086 if (vd[i] != NULL) 2087 vdev_split(vd[i]); 2088 vdev_reopen(spa->spa_root_vdev); 2089 } 2090 2091 kmem_free(vd, gcount * sizeof (vdev_t *)); 2092 } 2093 2094 static int 2095 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2096 boolean_t mosconfig) 2097 { 2098 nvlist_t *config = spa->spa_config; 2099 char *ereport = FM_EREPORT_ZFS_POOL; 2100 char *comment; 2101 int error; 2102 uint64_t pool_guid; 2103 nvlist_t *nvl; 2104 2105 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2106 return (SET_ERROR(EINVAL)); 2107 2108 ASSERT(spa->spa_comment == NULL); 2109 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2110 spa->spa_comment = spa_strdup(comment); 2111 2112 /* 2113 * Versioning wasn't explicitly added to the label until later, so if 2114 * it's not present treat it as the initial version. 
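 * (Such labels predate ZPOOL_CONFIG_VERSION entirely, which is why the
 * lookup below is allowed to fail and SPA_VERSION_INITIAL is assumed.)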
2115 */ 2116 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2117 &spa->spa_ubsync.ub_version) != 0) 2118 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2119 2120 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2121 &spa->spa_config_txg); 2122 2123 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2124 spa_guid_exists(pool_guid, 0)) { 2125 error = SET_ERROR(EEXIST); 2126 } else { 2127 spa->spa_config_guid = pool_guid; 2128 2129 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2130 &nvl) == 0) { 2131 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2132 KM_SLEEP) == 0); 2133 } 2134 2135 nvlist_free(spa->spa_load_info); 2136 spa->spa_load_info = fnvlist_alloc(); 2137 2138 gethrestime(&spa->spa_loaded_ts); 2139 error = spa_load_impl(spa, pool_guid, config, state, type, 2140 mosconfig, &ereport); 2141 } 2142 2143 /* 2144 * Don't count references from objsets that are already closed 2145 * and are making their way through the eviction process. 2146 */ 2147 spa_evicting_os_wait(spa); 2148 spa->spa_minref = refcount_count(&spa->spa_refcount); 2149 if (error) { 2150 if (error != EEXIST) { 2151 spa->spa_loaded_ts.tv_sec = 0; 2152 spa->spa_loaded_ts.tv_nsec = 0; 2153 } 2154 if (error != EBADF) { 2155 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2156 } 2157 } 2158 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2159 spa->spa_ena = 0; 2160 2161 return (error); 2162 } 2163 2164 /* 2165 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2166 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2167 * spa's per-vdev ZAP list. 2168 */ 2169 static uint64_t 2170 vdev_count_verify_zaps(vdev_t *vd) 2171 { 2172 spa_t *spa = vd->vdev_spa; 2173 uint64_t total = 0; 2174 if (vd->vdev_top_zap != 0) { 2175 total++; 2176 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2177 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2178 } 2179 if (vd->vdev_leaf_zap != 0) { 2180 total++; 2181 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2182 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2183 } 2184 2185 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2186 total += vdev_count_verify_zaps(vd->vdev_child[i]); 2187 } 2188 2189 return (total); 2190 } 2191 2192 /* 2193 * Load an existing storage pool, using the pool's builtin spa_config as a 2194 * source of configuration information. 2195 */ 2196 static int 2197 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2198 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2199 char **ereport) 2200 { 2201 int error = 0; 2202 nvlist_t *nvroot = NULL; 2203 nvlist_t *label; 2204 vdev_t *rvd; 2205 uberblock_t *ub = &spa->spa_uberblock; 2206 uint64_t children, config_cache_txg = spa->spa_config_txg; 2207 int orig_mode = spa->spa_mode; 2208 int parse; 2209 uint64_t obj; 2210 boolean_t missing_feat_write = B_FALSE; 2211 2212 /* 2213 * If this is an untrusted config, access the pool in read-only mode. 2214 * This prevents things like resilvering recently removed devices. 2215 */ 2216 if (!mosconfig) 2217 spa->spa_mode = FREAD; 2218 2219 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2220 2221 spa->spa_load_state = state; 2222 2223 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2224 return (SET_ERROR(EINVAL)); 2225 2226 parse = (type == SPA_IMPORT_EXISTING ? 
2227 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2228 2229 /* 2230 * Create "The Godfather" zio to hold all async IOs 2231 */ 2232 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2233 KM_SLEEP); 2234 for (int i = 0; i < max_ncpus; i++) { 2235 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2236 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2237 ZIO_FLAG_GODFATHER); 2238 } 2239 2240 /* 2241 * Parse the configuration into a vdev tree. We explicitly set the 2242 * value that will be returned by spa_version() since parsing the 2243 * configuration requires knowing the version number. 2244 */ 2245 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2246 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2247 spa_config_exit(spa, SCL_ALL, FTAG); 2248 2249 if (error != 0) 2250 return (error); 2251 2252 ASSERT(spa->spa_root_vdev == rvd); 2253 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2254 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2255 2256 if (type != SPA_IMPORT_ASSEMBLE) { 2257 ASSERT(spa_guid(spa) == pool_guid); 2258 } 2259 2260 /* 2261 * Try to open all vdevs, loading each label in the process. 2262 */ 2263 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2264 error = vdev_open(rvd); 2265 spa_config_exit(spa, SCL_ALL, FTAG); 2266 if (error != 0) 2267 return (error); 2268 2269 /* 2270 * We need to validate the vdev labels against the configuration that 2271 * we have in hand, which is dependent on the setting of mosconfig. If 2272 * mosconfig is true then we're validating the vdev labels based on 2273 * that config. Otherwise, we're validating against the cached config 2274 * (zpool.cache) that was read when we loaded the zfs module, and then 2275 * later we will recursively call spa_load() and validate against 2276 * the vdev config. 2277 * 2278 * If we're assembling a new pool that's been split off from an 2279 * existing pool, the labels haven't yet been updated so we skip 2280 * validation for now. 2281 */ 2282 if (type != SPA_IMPORT_ASSEMBLE) { 2283 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2284 error = vdev_validate(rvd, mosconfig); 2285 spa_config_exit(spa, SCL_ALL, FTAG); 2286 2287 if (error != 0) 2288 return (error); 2289 2290 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2291 return (SET_ERROR(ENXIO)); 2292 } 2293 2294 /* 2295 * Find the best uberblock. 2296 */ 2297 vdev_uberblock_load(rvd, ub, &label); 2298 2299 /* 2300 * If we weren't able to find a single valid uberblock, return failure. 2301 */ 2302 if (ub->ub_txg == 0) { 2303 nvlist_free(label); 2304 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2305 } 2306 2307 /* 2308 * If the pool has an unsupported version we can't open it. 2309 */ 2310 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2311 nvlist_free(label); 2312 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2313 } 2314 2315 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2316 nvlist_t *features; 2317 2318 /* 2319 * If we weren't able to find what's necessary for reading the 2320 * MOS in the label, return failure. 2321 */ 2322 if (label == NULL || nvlist_lookup_nvlist(label, 2323 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2324 nvlist_free(label); 2325 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2326 ENXIO)); 2327 } 2328 2329 /* 2330 * Update our in-core representation with the definitive values 2331 * from the label. 
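 * The spa_label_features list replaced here is what the
 * features_for_read walk a few lines below consults when deciding
 * whether this pool can be opened at all.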
2332 */ 2333 nvlist_free(spa->spa_label_features); 2334 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2335 } 2336 2337 nvlist_free(label); 2338 2339 /* 2340 * Look through entries in the label nvlist's features_for_read. If 2341 * there is a feature listed there which we don't understand then we 2342 * cannot open a pool. 2343 */ 2344 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2345 nvlist_t *unsup_feat; 2346 2347 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2348 0); 2349 2350 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2351 NULL); nvp != NULL; 2352 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2353 if (!zfeature_is_supported(nvpair_name(nvp))) { 2354 VERIFY(nvlist_add_string(unsup_feat, 2355 nvpair_name(nvp), "") == 0); 2356 } 2357 } 2358 2359 if (!nvlist_empty(unsup_feat)) { 2360 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2361 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2362 nvlist_free(unsup_feat); 2363 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2364 ENOTSUP)); 2365 } 2366 2367 nvlist_free(unsup_feat); 2368 } 2369 2370 /* 2371 * If the vdev guid sum doesn't match the uberblock, we have an 2372 * incomplete configuration. We first check to see if the pool 2373 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2374 * If it is, defer the vdev_guid_sum check till later so we 2375 * can handle missing vdevs. 2376 */ 2377 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2378 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2379 rvd->vdev_guid_sum != ub->ub_guid_sum) 2380 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2381 2382 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2383 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2384 spa_try_repair(spa, config); 2385 spa_config_exit(spa, SCL_ALL, FTAG); 2386 nvlist_free(spa->spa_config_splitting); 2387 spa->spa_config_splitting = NULL; 2388 } 2389 2390 /* 2391 * Initialize internal SPA structures. 2392 */ 2393 spa->spa_state = POOL_STATE_ACTIVE; 2394 spa->spa_ubsync = spa->spa_uberblock; 2395 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2396 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2397 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2398 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2399 spa->spa_claim_max_txg = spa->spa_first_txg; 2400 spa->spa_prev_software_version = ub->ub_software_version; 2401 2402 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2403 if (error) 2404 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2405 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2406 2407 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2408 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2409 2410 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2411 boolean_t missing_feat_read = B_FALSE; 2412 nvlist_t *unsup_feat, *enabled_feat; 2413 2414 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2415 &spa->spa_feat_for_read_obj) != 0) { 2416 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2417 } 2418 2419 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2420 &spa->spa_feat_for_write_obj) != 0) { 2421 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2422 } 2423 2424 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2425 &spa->spa_feat_desc_obj) != 0) { 2426 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2427 } 2428 2429 enabled_feat = fnvlist_alloc(); 2430 unsup_feat = fnvlist_alloc(); 2431 2432 if (!spa_features_check(spa, B_FALSE, 2433 unsup_feat, enabled_feat)) 2434 missing_feat_read = B_TRUE; 2435 2436 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2437 if (!spa_features_check(spa, B_TRUE, 2438 unsup_feat, enabled_feat)) { 2439 missing_feat_write = B_TRUE; 2440 } 2441 } 2442 2443 fnvlist_add_nvlist(spa->spa_load_info, 2444 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2445 2446 if (!nvlist_empty(unsup_feat)) { 2447 fnvlist_add_nvlist(spa->spa_load_info, 2448 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2449 } 2450 2451 fnvlist_free(enabled_feat); 2452 fnvlist_free(unsup_feat); 2453 2454 if (!missing_feat_read) { 2455 fnvlist_add_boolean(spa->spa_load_info, 2456 ZPOOL_CONFIG_CAN_RDONLY); 2457 } 2458 2459 /* 2460 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2461 * twofold: to determine whether the pool is available for 2462 * import in read-write mode and (if it is not) whether the 2463 * pool is available for import in read-only mode. If the pool 2464 * is available for import in read-write mode, it is displayed 2465 * as available in userland; if it is not available for import 2466 * in read-only mode, it is displayed as unavailable in 2467 * userland. If the pool is available for import in read-only 2468 * mode but not read-write mode, it is displayed as unavailable 2469 * in userland with a special note that the pool is actually 2470 * available for open in read-only mode. 2471 * 2472 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2473 * missing a feature for write, we must first determine whether 2474 * the pool can be opened read-only before returning to 2475 * userland in order to know whether to display the 2476 * abovementioned note. 2477 */ 2478 if (missing_feat_read || (missing_feat_write && 2479 spa_writeable(spa))) { 2480 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2481 ENOTSUP)); 2482 } 2483 2484 /* 2485 * Load refcounts for ZFS features from disk into an in-memory 2486 * cache during SPA initialization. 
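 * A feature whose refcount object does not exist on disk (ENOTSUP from
 * feature_get_refcount_from_disk()) is recorded as
 * SPA_FEATURE_DISABLED; presumably this lets later feature queries be
 * answered from the cache without another MOS lookup.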
2487 */ 2488 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2489 uint64_t refcount; 2490 2491 error = feature_get_refcount_from_disk(spa, 2492 &spa_feature_table[i], &refcount); 2493 if (error == 0) { 2494 spa->spa_feat_refcount_cache[i] = refcount; 2495 } else if (error == ENOTSUP) { 2496 spa->spa_feat_refcount_cache[i] = 2497 SPA_FEATURE_DISABLED; 2498 } else { 2499 return (spa_vdev_err(rvd, 2500 VDEV_AUX_CORRUPT_DATA, EIO)); 2501 } 2502 } 2503 } 2504 2505 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2506 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2507 &spa->spa_feat_enabled_txg_obj) != 0) 2508 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2509 } 2510 2511 spa->spa_is_initializing = B_TRUE; 2512 error = dsl_pool_open(spa->spa_dsl_pool); 2513 spa->spa_is_initializing = B_FALSE; 2514 if (error != 0) 2515 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2516 2517 if (!mosconfig) { 2518 uint64_t hostid; 2519 nvlist_t *policy = NULL, *nvconfig; 2520 2521 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2522 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2523 2524 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2525 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2526 char *hostname; 2527 unsigned long myhostid = 0; 2528 2529 VERIFY(nvlist_lookup_string(nvconfig, 2530 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2531 2532 #ifdef _KERNEL 2533 myhostid = zone_get_hostid(NULL); 2534 #else /* _KERNEL */ 2535 /* 2536 * We're emulating the system's hostid in userland, so 2537 * we can't use zone_get_hostid(). 2538 */ 2539 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2540 #endif /* _KERNEL */ 2541 if (hostid != 0 && myhostid != 0 && 2542 hostid != myhostid) { 2543 nvlist_free(nvconfig); 2544 cmn_err(CE_WARN, "pool '%s' could not be " 2545 "loaded as it was last accessed by " 2546 "another system (host: %s hostid: 0x%lx). " 2547 "See: http://illumos.org/msg/ZFS-8000-EY", 2548 spa_name(spa), hostname, 2549 (unsigned long)hostid); 2550 return (SET_ERROR(EBADF)); 2551 } 2552 } 2553 if (nvlist_lookup_nvlist(spa->spa_config, 2554 ZPOOL_REWIND_POLICY, &policy) == 0) 2555 VERIFY(nvlist_add_nvlist(nvconfig, 2556 ZPOOL_REWIND_POLICY, policy) == 0); 2557 2558 spa_config_set(spa, nvconfig); 2559 spa_unload(spa); 2560 spa_deactivate(spa); 2561 spa_activate(spa, orig_mode); 2562 2563 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2564 } 2565 2566 /* Grab the secret checksum salt from the MOS. */ 2567 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2568 DMU_POOL_CHECKSUM_SALT, 1, 2569 sizeof (spa->spa_cksum_salt.zcs_bytes), 2570 spa->spa_cksum_salt.zcs_bytes); 2571 if (error == ENOENT) { 2572 /* Generate a new salt for subsequent use */ 2573 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2574 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2575 } else if (error != 0) { 2576 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2577 } 2578 2579 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2580 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2581 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2582 if (error != 0) 2583 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2584 2585 /* 2586 * Load the bit that tells us to use the new accounting function 2587 * (raid-z deflation). If we have an older pool, this will not 2588 * be present. 
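 * As with the other pool-directory lookups, ENOENT is tolerated here;
 * only a genuine lookup failure is treated as corrupt data.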
2589 */ 2590 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2591 if (error != 0 && error != ENOENT) 2592 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2593 2594 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2595 &spa->spa_creation_version); 2596 if (error != 0 && error != ENOENT) 2597 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2598 2599 /* 2600 * Load the persistent error log. If we have an older pool, this will 2601 * not be present. 2602 */ 2603 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2604 if (error != 0 && error != ENOENT) 2605 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2606 2607 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2608 &spa->spa_errlog_scrub); 2609 if (error != 0 && error != ENOENT) 2610 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2611 2612 /* 2613 * Load the history object. If we have an older pool, this 2614 * will not be present. 2615 */ 2616 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2617 if (error != 0 && error != ENOENT) 2618 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2619 2620 /* 2621 * Load the per-vdev ZAP map. If we have an older pool, this will not 2622 * be present; in this case, defer its creation to a later time to 2623 * avoid dirtying the MOS this early / out of sync context. See 2624 * spa_sync_config_object. 2625 */ 2626 2627 /* The sentinel is only available in the MOS config. */ 2628 nvlist_t *mos_config; 2629 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) 2630 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2631 2632 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 2633 &spa->spa_all_vdev_zaps); 2634 2635 if (error != ENOENT && error != 0) { 2636 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2637 } else if (error == 0 && !nvlist_exists(mos_config, 2638 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 2639 /* 2640 * An older version of ZFS overwrote the sentinel value, so 2641 * we have orphaned per-vdev ZAPs in the MOS. Defer their 2642 * destruction to later; see spa_sync_config_object. 2643 */ 2644 spa->spa_avz_action = AVZ_ACTION_DESTROY; 2645 /* 2646 * We're assuming that no vdevs have had their ZAPs created 2647 * before this. Better be sure of it. 2648 */ 2649 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2650 } 2651 nvlist_free(mos_config); 2652 2653 /* 2654 * If we're assembling the pool from the split-off vdevs of 2655 * an existing pool, we don't want to attach the spares & cache 2656 * devices. 2657 */ 2658 2659 /* 2660 * Load any hot spares for this pool. 2661 */ 2662 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2663 if (error != 0 && error != ENOENT) 2664 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2665 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2666 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2667 if (load_nvlist(spa, spa->spa_spares.sav_object, 2668 &spa->spa_spares.sav_config) != 0) 2669 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2670 2671 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2672 spa_load_spares(spa); 2673 spa_config_exit(spa, SCL_ALL, FTAG); 2674 } else if (error == 0) { 2675 spa->spa_spares.sav_sync = B_TRUE; 2676 } 2677 2678 /* 2679 * Load any level 2 ARC devices for this pool. 
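 * This mirrors the hot-spare handling above: ENOENT simply means the
 * pool has no cache devices, and when assembling a split pool the
 * devices are not attached here; the list is merely flagged for a
 * later sync.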
2680 */ 2681 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2682 &spa->spa_l2cache.sav_object); 2683 if (error != 0 && error != ENOENT) 2684 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2685 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2686 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2687 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2688 &spa->spa_l2cache.sav_config) != 0) 2689 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2690 2691 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2692 spa_load_l2cache(spa); 2693 spa_config_exit(spa, SCL_ALL, FTAG); 2694 } else if (error == 0) { 2695 spa->spa_l2cache.sav_sync = B_TRUE; 2696 } 2697 2698 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2699 2700 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2701 if (error && error != ENOENT) 2702 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2703 2704 if (error == 0) { 2705 uint64_t autoreplace; 2706 2707 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2708 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2709 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2710 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2711 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2712 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2713 &spa->spa_dedup_ditto); 2714 2715 spa->spa_autoreplace = (autoreplace != 0); 2716 } 2717 2718 /* 2719 * If the 'autoreplace' property is set, then post a resource notifying 2720 * the ZFS DE that it should not issue any faults for unopenable 2721 * devices. We also iterate over the vdevs, and post a sysevent for any 2722 * unopenable vdevs so that the normal autoreplace handler can take 2723 * over. 2724 */ 2725 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2726 spa_check_removed(spa->spa_root_vdev); 2727 /* 2728 * For the import case, this is done in spa_import(), because 2729 * at this point we're using the spare definitions from 2730 * the MOS config, not necessarily from the userland config. 2731 */ 2732 if (state != SPA_LOAD_IMPORT) { 2733 spa_aux_check_removed(&spa->spa_spares); 2734 spa_aux_check_removed(&spa->spa_l2cache); 2735 } 2736 } 2737 2738 /* 2739 * Load the vdev state for all toplevel vdevs. 2740 */ 2741 vdev_load(rvd); 2742 2743 /* 2744 * Propagate the leaf DTLs we just loaded all the way up the tree. 2745 */ 2746 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2747 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2748 spa_config_exit(spa, SCL_ALL, FTAG); 2749 2750 /* 2751 * Load the DDTs (dedup tables). 2752 */ 2753 error = ddt_load(spa); 2754 if (error != 0) 2755 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2756 2757 spa_update_dspace(spa); 2758 2759 /* 2760 * Validate the config, using the MOS config to fill in any 2761 * information which might be missing. If we fail to validate 2762 * the config then declare the pool unfit for use. If we're 2763 * assembling a pool from a split, the log is not transferred 2764 * over. 2765 */ 2766 if (type != SPA_IMPORT_ASSEMBLE) { 2767 nvlist_t *nvconfig; 2768 2769 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2770 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2771 2772 if (!spa_config_valid(spa, nvconfig)) { 2773 nvlist_free(nvconfig); 2774 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2775 ENXIO)); 2776 } 2777 nvlist_free(nvconfig); 2778 2779 /* 2780 * Now that we've validated the config, check the state of the 2781 * root vdev. 
If it can't be opened, it indicates one or 2782 * more toplevel vdevs are faulted. 2783 */ 2784 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2785 return (SET_ERROR(ENXIO)); 2786 2787 if (spa_writeable(spa) && spa_check_logs(spa)) { 2788 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2789 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2790 } 2791 } 2792 2793 if (missing_feat_write) { 2794 ASSERT(state == SPA_LOAD_TRYIMPORT); 2795 2796 /* 2797 * At this point, we know that we can open the pool in 2798 * read-only mode but not read-write mode. We now have enough 2799 * information and can return to userland. 2800 */ 2801 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2802 } 2803 2804 /* 2805 * We've successfully opened the pool, verify that we're ready 2806 * to start pushing transactions. 2807 */ 2808 if (state != SPA_LOAD_TRYIMPORT) { 2809 if (error = spa_load_verify(spa)) 2810 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2811 error)); 2812 } 2813 2814 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2815 spa->spa_load_max_txg == UINT64_MAX)) { 2816 dmu_tx_t *tx; 2817 int need_update = B_FALSE; 2818 dsl_pool_t *dp = spa_get_dsl(spa); 2819 2820 ASSERT(state != SPA_LOAD_TRYIMPORT); 2821 2822 /* 2823 * Claim log blocks that haven't been committed yet. 2824 * This must all happen in a single txg. 2825 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2826 * invoked from zil_claim_log_block()'s i/o done callback. 2827 * Price of rollback is that we abandon the log. 2828 */ 2829 spa->spa_claiming = B_TRUE; 2830 2831 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 2832 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2833 zil_claim, tx, DS_FIND_CHILDREN); 2834 dmu_tx_commit(tx); 2835 2836 spa->spa_claiming = B_FALSE; 2837 2838 spa_set_log_state(spa, SPA_LOG_GOOD); 2839 spa->spa_sync_on = B_TRUE; 2840 txg_sync_start(spa->spa_dsl_pool); 2841 2842 /* 2843 * Wait for all claims to sync. We sync up to the highest 2844 * claimed log block birth time so that claimed log blocks 2845 * don't appear to be from the future. spa_claim_max_txg 2846 * will have been set for us by either zil_check_log_chain() 2847 * (invoked from spa_check_logs()) or zil_claim() above. 2848 */ 2849 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2850 2851 /* 2852 * If the config cache is stale, or we have uninitialized 2853 * metaslabs (see spa_vdev_add()), then update the config. 2854 * 2855 * If this is a verbatim import, trust the current 2856 * in-core spa_config and update the disk labels. 2857 */ 2858 if (config_cache_txg != spa->spa_config_txg || 2859 state == SPA_LOAD_IMPORT || 2860 state == SPA_LOAD_RECOVER || 2861 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2862 need_update = B_TRUE; 2863 2864 for (int c = 0; c < rvd->vdev_children; c++) 2865 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2866 need_update = B_TRUE; 2867 2868 /* 2869 * Update the config cache asychronously in case we're the 2870 * root pool, in which case the config cache isn't writable yet. 2871 */ 2872 if (need_update) 2873 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2874 2875 /* 2876 * Check all DTLs to see if anything needs resilvering. 2877 */ 2878 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2879 vdev_resilver_needed(rvd, NULL, NULL)) 2880 spa_async_request(spa, SPA_ASYNC_RESILVER); 2881 2882 /* 2883 * Log the fact that we booted up (so that we can detect if 2884 * we rebooted in the middle of an operation). 
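 * spa_history_log_version() below records an "open" event in the
 * pool's history object; pool creation logs a matching "create" event
 * the same way.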
2885 */ 2886 spa_history_log_version(spa, "open"); 2887 2888 /* 2889 * Delete any inconsistent datasets. 2890 */ 2891 (void) dmu_objset_find(spa_name(spa), 2892 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2893 2894 /* 2895 * Clean up any stale temporary dataset userrefs. 2896 */ 2897 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2898 } 2899 2900 return (0); 2901 } 2902 2903 static int 2904 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2905 { 2906 int mode = spa->spa_mode; 2907 2908 spa_unload(spa); 2909 spa_deactivate(spa); 2910 2911 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2912 2913 spa_activate(spa, mode); 2914 spa_async_suspend(spa); 2915 2916 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2917 } 2918 2919 /* 2920 * If spa_load() fails this function will try loading prior txg's. If 2921 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2922 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2923 * function will not rewind the pool and will return the same error as 2924 * spa_load(). 2925 */ 2926 static int 2927 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2928 uint64_t max_request, int rewind_flags) 2929 { 2930 nvlist_t *loadinfo = NULL; 2931 nvlist_t *config = NULL; 2932 int load_error, rewind_error; 2933 uint64_t safe_rewind_txg; 2934 uint64_t min_txg; 2935 2936 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2937 spa->spa_load_max_txg = spa->spa_load_txg; 2938 spa_set_log_state(spa, SPA_LOG_CLEAR); 2939 } else { 2940 spa->spa_load_max_txg = max_request; 2941 if (max_request != UINT64_MAX) 2942 spa->spa_extreme_rewind = B_TRUE; 2943 } 2944 2945 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2946 mosconfig); 2947 if (load_error == 0) 2948 return (0); 2949 2950 if (spa->spa_root_vdev != NULL) 2951 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2952 2953 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2954 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2955 2956 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2957 nvlist_free(config); 2958 return (load_error); 2959 } 2960 2961 if (state == SPA_LOAD_RECOVER) { 2962 /* Price of rolling back is discarding txgs, including log */ 2963 spa_set_log_state(spa, SPA_LOG_CLEAR); 2964 } else { 2965 /* 2966 * If we aren't rolling back save the load info from our first 2967 * import attempt so that we can restore it after attempting 2968 * to rewind. 2969 */ 2970 loadinfo = spa->spa_load_info; 2971 spa->spa_load_info = fnvlist_alloc(); 2972 } 2973 2974 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2975 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2976 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2977 TXG_INITIAL : safe_rewind_txg; 2978 2979 /* 2980 * Continue as long as we're finding errors, we're still within 2981 * the acceptable rewind range, and we're still finding uberblocks 2982 */ 2983 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2984 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2985 if (spa->spa_load_max_txg < safe_rewind_txg) 2986 spa->spa_extreme_rewind = B_TRUE; 2987 rewind_error = spa_load_retry(spa, state, mosconfig); 2988 } 2989 2990 spa->spa_extreme_rewind = B_FALSE; 2991 spa->spa_load_max_txg = UINT64_MAX; 2992 2993 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2994 spa_config_set(spa, config); 2995 2996 if (state == SPA_LOAD_RECOVER) { 2997 ASSERT3P(loadinfo, ==, NULL); 2998 return (rewind_error); 2999 } else { 3000 /* Store the rewind info as part of the initial load info */ 3001 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 3002 spa->spa_load_info); 3003 3004 /* Restore the initial load info */ 3005 fnvlist_free(spa->spa_load_info); 3006 spa->spa_load_info = loadinfo; 3007 3008 return (load_error); 3009 } 3010 } 3011 3012 /* 3013 * Pool Open/Import 3014 * 3015 * The import case is identical to an open except that the configuration is sent 3016 * down from userland, instead of grabbed from the configuration cache. For the 3017 * case of an open, the pool configuration will exist in the 3018 * POOL_STATE_UNINITIALIZED state. 3019 * 3020 * The stats information (gen/count/ustats) is used to gather vdev statistics at 3021 * the same time open the pool, without having to keep around the spa_t in some 3022 * ambiguous state. 3023 */ 3024 static int 3025 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 3026 nvlist_t **config) 3027 { 3028 spa_t *spa; 3029 spa_load_state_t state = SPA_LOAD_OPEN; 3030 int error; 3031 int locked = B_FALSE; 3032 3033 *spapp = NULL; 3034 3035 /* 3036 * As disgusting as this is, we need to support recursive calls to this 3037 * function because dsl_dir_open() is called during spa_load(), and ends 3038 * up calling spa_open() again. The real fix is to figure out how to 3039 * avoid dsl_dir_open() calling this in the first place. 3040 */ 3041 if (mutex_owner(&spa_namespace_lock) != curthread) { 3042 mutex_enter(&spa_namespace_lock); 3043 locked = B_TRUE; 3044 } 3045 3046 if ((spa = spa_lookup(pool)) == NULL) { 3047 if (locked) 3048 mutex_exit(&spa_namespace_lock); 3049 return (SET_ERROR(ENOENT)); 3050 } 3051 3052 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3053 zpool_rewind_policy_t policy; 3054 3055 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 3056 &policy); 3057 if (policy.zrp_request & ZPOOL_DO_REWIND) 3058 state = SPA_LOAD_RECOVER; 3059 3060 spa_activate(spa, spa_mode_global); 3061 3062 if (state != SPA_LOAD_RECOVER) 3063 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3064 3065 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3066 policy.zrp_request); 3067 3068 if (error == EBADF) { 3069 /* 3070 * If vdev_validate() returns failure (indicated by 3071 * EBADF), it indicates that one of the vdevs indicates 3072 * that the pool has been exported or destroyed. If 3073 * this is the case, the config cache is out of sync and 3074 * we should remove the pool from the namespace. 
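 * In that case the stale entry is synced out of the cache file and
 * dropped via spa_remove(), and the caller simply sees ENOENT, as if
 * the pool had never been known to this system.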
3075 */ 3076 spa_unload(spa); 3077 spa_deactivate(spa); 3078 spa_config_sync(spa, B_TRUE, B_TRUE); 3079 spa_remove(spa); 3080 if (locked) 3081 mutex_exit(&spa_namespace_lock); 3082 return (SET_ERROR(ENOENT)); 3083 } 3084 3085 if (error) { 3086 /* 3087 * We can't open the pool, but we still have useful 3088 * information: the state of each vdev after the 3089 * attempted vdev_open(). Return this to the user. 3090 */ 3091 if (config != NULL && spa->spa_config) { 3092 VERIFY(nvlist_dup(spa->spa_config, config, 3093 KM_SLEEP) == 0); 3094 VERIFY(nvlist_add_nvlist(*config, 3095 ZPOOL_CONFIG_LOAD_INFO, 3096 spa->spa_load_info) == 0); 3097 } 3098 spa_unload(spa); 3099 spa_deactivate(spa); 3100 spa->spa_last_open_failed = error; 3101 if (locked) 3102 mutex_exit(&spa_namespace_lock); 3103 *spapp = NULL; 3104 return (error); 3105 } 3106 } 3107 3108 spa_open_ref(spa, tag); 3109 3110 if (config != NULL) 3111 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3112 3113 /* 3114 * If we've recovered the pool, pass back any information we 3115 * gathered while doing the load. 3116 */ 3117 if (state == SPA_LOAD_RECOVER) { 3118 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3119 spa->spa_load_info) == 0); 3120 } 3121 3122 if (locked) { 3123 spa->spa_last_open_failed = 0; 3124 spa->spa_last_ubsync_txg = 0; 3125 spa->spa_load_txg = 0; 3126 mutex_exit(&spa_namespace_lock); 3127 } 3128 3129 *spapp = spa; 3130 3131 return (0); 3132 } 3133 3134 int 3135 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3136 nvlist_t **config) 3137 { 3138 return (spa_open_common(name, spapp, tag, policy, config)); 3139 } 3140 3141 int 3142 spa_open(const char *name, spa_t **spapp, void *tag) 3143 { 3144 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3145 } 3146 3147 /* 3148 * Lookup the given spa_t, incrementing the inject count in the process, 3149 * preventing it from being exported or destroyed. 3150 */ 3151 spa_t * 3152 spa_inject_addref(char *name) 3153 { 3154 spa_t *spa; 3155 3156 mutex_enter(&spa_namespace_lock); 3157 if ((spa = spa_lookup(name)) == NULL) { 3158 mutex_exit(&spa_namespace_lock); 3159 return (NULL); 3160 } 3161 spa->spa_inject_ref++; 3162 mutex_exit(&spa_namespace_lock); 3163 3164 return (spa); 3165 } 3166 3167 void 3168 spa_inject_delref(spa_t *spa) 3169 { 3170 mutex_enter(&spa_namespace_lock); 3171 spa->spa_inject_ref--; 3172 mutex_exit(&spa_namespace_lock); 3173 } 3174 3175 /* 3176 * Add spares device information to the nvlist. 3177 */ 3178 static void 3179 spa_add_spares(spa_t *spa, nvlist_t *config) 3180 { 3181 nvlist_t **spares; 3182 uint_t i, nspares; 3183 nvlist_t *nvroot; 3184 uint64_t guid; 3185 vdev_stat_t *vs; 3186 uint_t vsc; 3187 uint64_t pool; 3188 3189 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3190 3191 if (spa->spa_spares.sav_count == 0) 3192 return; 3193 3194 VERIFY(nvlist_lookup_nvlist(config, 3195 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3196 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3197 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3198 if (nspares != 0) { 3199 VERIFY(nvlist_add_nvlist_array(nvroot, 3200 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3201 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3202 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3203 3204 /* 3205 * Go through and find any spares which have since been 3206 * repurposed as an active spare. If this is the case, update 3207 * their status appropriately. 
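 * A spare that is currently in use by some pool is reported as
 * VDEV_STATE_CANT_OPEN with VDEV_AUX_SPARED, presumably so that
 * userland can tell it apart from a spare that is merely available.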
3208 */ 3209 for (i = 0; i < nspares; i++) { 3210 VERIFY(nvlist_lookup_uint64(spares[i], 3211 ZPOOL_CONFIG_GUID, &guid) == 0); 3212 if (spa_spare_exists(guid, &pool, NULL) && 3213 pool != 0ULL) { 3214 VERIFY(nvlist_lookup_uint64_array( 3215 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3216 (uint64_t **)&vs, &vsc) == 0); 3217 vs->vs_state = VDEV_STATE_CANT_OPEN; 3218 vs->vs_aux = VDEV_AUX_SPARED; 3219 } 3220 } 3221 } 3222 } 3223 3224 /* 3225 * Add l2cache device information to the nvlist, including vdev stats. 3226 */ 3227 static void 3228 spa_add_l2cache(spa_t *spa, nvlist_t *config) 3229 { 3230 nvlist_t **l2cache; 3231 uint_t i, j, nl2cache; 3232 nvlist_t *nvroot; 3233 uint64_t guid; 3234 vdev_t *vd; 3235 vdev_stat_t *vs; 3236 uint_t vsc; 3237 3238 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3239 3240 if (spa->spa_l2cache.sav_count == 0) 3241 return; 3242 3243 VERIFY(nvlist_lookup_nvlist(config, 3244 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3245 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3246 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3247 if (nl2cache != 0) { 3248 VERIFY(nvlist_add_nvlist_array(nvroot, 3249 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3250 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3251 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3252 3253 /* 3254 * Update level 2 cache device stats. 3255 */ 3256 3257 for (i = 0; i < nl2cache; i++) { 3258 VERIFY(nvlist_lookup_uint64(l2cache[i], 3259 ZPOOL_CONFIG_GUID, &guid) == 0); 3260 3261 vd = NULL; 3262 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3263 if (guid == 3264 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3265 vd = spa->spa_l2cache.sav_vdevs[j]; 3266 break; 3267 } 3268 } 3269 ASSERT(vd != NULL); 3270 3271 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3272 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3273 == 0); 3274 vdev_get_stats(vd, vs); 3275 } 3276 } 3277 } 3278 3279 static void 3280 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3281 { 3282 nvlist_t *features; 3283 zap_cursor_t zc; 3284 zap_attribute_t za; 3285 3286 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3287 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3288 3289 if (spa->spa_feat_for_read_obj != 0) { 3290 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3291 spa->spa_feat_for_read_obj); 3292 zap_cursor_retrieve(&zc, &za) == 0; 3293 zap_cursor_advance(&zc)) { 3294 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3295 za.za_num_integers == 1); 3296 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3297 za.za_first_integer)); 3298 } 3299 zap_cursor_fini(&zc); 3300 } 3301 3302 if (spa->spa_feat_for_write_obj != 0) { 3303 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3304 spa->spa_feat_for_write_obj); 3305 zap_cursor_retrieve(&zc, &za) == 0; 3306 zap_cursor_advance(&zc)) { 3307 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3308 za.za_num_integers == 1); 3309 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3310 za.za_first_integer)); 3311 } 3312 zap_cursor_fini(&zc); 3313 } 3314 3315 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3316 features) == 0); 3317 nvlist_free(features); 3318 } 3319 3320 int 3321 spa_get_stats(const char *name, nvlist_t **config, 3322 char *altroot, size_t buflen) 3323 { 3324 int error; 3325 spa_t *spa; 3326 3327 *config = NULL; 3328 error = spa_open_common(name, &spa, FTAG, NULL, config); 3329 3330 if (spa != NULL) { 3331 /* 3332 * This still leaves a window of inconsistency where the spares 3333 * or l2cache devices could change and 
the config would be 3334 * self-inconsistent. 3335 */ 3336 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3337 3338 if (*config != NULL) { 3339 uint64_t loadtimes[2]; 3340 3341 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3342 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3343 VERIFY(nvlist_add_uint64_array(*config, 3344 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3345 3346 VERIFY(nvlist_add_uint64(*config, 3347 ZPOOL_CONFIG_ERRCOUNT, 3348 spa_get_errlog_size(spa)) == 0); 3349 3350 if (spa_suspended(spa)) 3351 VERIFY(nvlist_add_uint64(*config, 3352 ZPOOL_CONFIG_SUSPENDED, 3353 spa->spa_failmode) == 0); 3354 3355 spa_add_spares(spa, *config); 3356 spa_add_l2cache(spa, *config); 3357 spa_add_feature_stats(spa, *config); 3358 } 3359 } 3360 3361 /* 3362 * We want to get the alternate root even for faulted pools, so we cheat 3363 * and call spa_lookup() directly. 3364 */ 3365 if (altroot) { 3366 if (spa == NULL) { 3367 mutex_enter(&spa_namespace_lock); 3368 spa = spa_lookup(name); 3369 if (spa) 3370 spa_altroot(spa, altroot, buflen); 3371 else 3372 altroot[0] = '\0'; 3373 spa = NULL; 3374 mutex_exit(&spa_namespace_lock); 3375 } else { 3376 spa_altroot(spa, altroot, buflen); 3377 } 3378 } 3379 3380 if (spa != NULL) { 3381 spa_config_exit(spa, SCL_CONFIG, FTAG); 3382 spa_close(spa, FTAG); 3383 } 3384 3385 return (error); 3386 } 3387 3388 /* 3389 * Validate that the auxiliary device array is well formed. We must have an 3390 * array of nvlists, each which describes a valid leaf vdev. If this is an 3391 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3392 * specified, as long as they are well-formed. 3393 */ 3394 static int 3395 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3396 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3397 vdev_labeltype_t label) 3398 { 3399 nvlist_t **dev; 3400 uint_t i, ndev; 3401 vdev_t *vd; 3402 int error; 3403 3404 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3405 3406 /* 3407 * It's acceptable to have no devs specified. 3408 */ 3409 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3410 return (0); 3411 3412 if (ndev == 0) 3413 return (SET_ERROR(EINVAL)); 3414 3415 /* 3416 * Make sure the pool is formatted with a version that supports this 3417 * device type. 3418 */ 3419 if (spa_version(spa) < version) 3420 return (SET_ERROR(ENOTSUP)); 3421 3422 /* 3423 * Set the pending device list so we correctly handle device in-use 3424 * checking. 3425 */ 3426 sav->sav_pending = dev; 3427 sav->sav_npending = ndev; 3428 3429 for (i = 0; i < ndev; i++) { 3430 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3431 mode)) != 0) 3432 goto out; 3433 3434 if (!vd->vdev_ops->vdev_op_leaf) { 3435 vdev_free(vd); 3436 error = SET_ERROR(EINVAL); 3437 goto out; 3438 } 3439 3440 /* 3441 * The L2ARC currently only supports disk devices in 3442 * kernel context. For user-level testing, we allow it. 
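 * (The #ifdef _KERNEL check below enforces this by rejecting any
 * non-disk l2cache leaf with ENOTBLK; user-level consumers, such as
 * the test harness, skip the check entirely.)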
3443 */ 3444 #ifdef _KERNEL 3445 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3446 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3447 error = SET_ERROR(ENOTBLK); 3448 vdev_free(vd); 3449 goto out; 3450 } 3451 #endif 3452 vd->vdev_top = vd; 3453 3454 if ((error = vdev_open(vd)) == 0 && 3455 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3456 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3457 vd->vdev_guid) == 0); 3458 } 3459 3460 vdev_free(vd); 3461 3462 if (error && 3463 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3464 goto out; 3465 else 3466 error = 0; 3467 } 3468 3469 out: 3470 sav->sav_pending = NULL; 3471 sav->sav_npending = 0; 3472 return (error); 3473 } 3474 3475 static int 3476 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3477 { 3478 int error; 3479 3480 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3481 3482 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3483 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3484 VDEV_LABEL_SPARE)) != 0) { 3485 return (error); 3486 } 3487 3488 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3489 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3490 VDEV_LABEL_L2CACHE)); 3491 } 3492 3493 static void 3494 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3495 const char *config) 3496 { 3497 int i; 3498 3499 if (sav->sav_config != NULL) { 3500 nvlist_t **olddevs; 3501 uint_t oldndevs; 3502 nvlist_t **newdevs; 3503 3504 /* 3505 * Generate new dev list by concatentating with the 3506 * current dev list. 3507 */ 3508 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3509 &olddevs, &oldndevs) == 0); 3510 3511 newdevs = kmem_alloc(sizeof (void *) * 3512 (ndevs + oldndevs), KM_SLEEP); 3513 for (i = 0; i < oldndevs; i++) 3514 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3515 KM_SLEEP) == 0); 3516 for (i = 0; i < ndevs; i++) 3517 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3518 KM_SLEEP) == 0); 3519 3520 VERIFY(nvlist_remove(sav->sav_config, config, 3521 DATA_TYPE_NVLIST_ARRAY) == 0); 3522 3523 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3524 config, newdevs, ndevs + oldndevs) == 0); 3525 for (i = 0; i < oldndevs + ndevs; i++) 3526 nvlist_free(newdevs[i]); 3527 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3528 } else { 3529 /* 3530 * Generate a new dev list. 3531 */ 3532 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3533 KM_SLEEP) == 0); 3534 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3535 devs, ndevs) == 0); 3536 } 3537 } 3538 3539 /* 3540 * Stop and drop level 2 ARC devices 3541 */ 3542 void 3543 spa_l2cache_drop(spa_t *spa) 3544 { 3545 vdev_t *vd; 3546 int i; 3547 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3548 3549 for (i = 0; i < sav->sav_count; i++) { 3550 uint64_t pool; 3551 3552 vd = sav->sav_vdevs[i]; 3553 ASSERT(vd != NULL); 3554 3555 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3556 pool != 0ULL && l2arc_vdev_present(vd)) 3557 l2arc_remove_vdev(vd); 3558 } 3559 } 3560 3561 /* 3562 * Pool Creation 3563 */ 3564 int 3565 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3566 nvlist_t *zplprops) 3567 { 3568 spa_t *spa; 3569 char *altroot = NULL; 3570 vdev_t *rvd; 3571 dsl_pool_t *dp; 3572 dmu_tx_t *tx; 3573 int error = 0; 3574 uint64_t txg = TXG_INITIAL; 3575 nvlist_t **spares, **l2cache; 3576 uint_t nspares, nl2cache; 3577 uint64_t version, obj; 3578 boolean_t has_features; 3579 3580 /* 3581 * If this pool already exists, return failure. 
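 * The lookup is done with spa_namespace_lock held, mirroring
 * spa_import(), so creating and importing a pool of the same name
 * cannot race.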
3582 */ 3583 mutex_enter(&spa_namespace_lock); 3584 if (spa_lookup(pool) != NULL) { 3585 mutex_exit(&spa_namespace_lock); 3586 return (SET_ERROR(EEXIST)); 3587 } 3588 3589 /* 3590 * Allocate a new spa_t structure. 3591 */ 3592 (void) nvlist_lookup_string(props, 3593 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3594 spa = spa_add(pool, NULL, altroot); 3595 spa_activate(spa, spa_mode_global); 3596 3597 if (props && (error = spa_prop_validate(spa, props))) { 3598 spa_deactivate(spa); 3599 spa_remove(spa); 3600 mutex_exit(&spa_namespace_lock); 3601 return (error); 3602 } 3603 3604 has_features = B_FALSE; 3605 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3606 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3607 if (zpool_prop_feature(nvpair_name(elem))) 3608 has_features = B_TRUE; 3609 } 3610 3611 if (has_features || nvlist_lookup_uint64(props, 3612 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3613 version = SPA_VERSION; 3614 } 3615 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3616 3617 spa->spa_first_txg = txg; 3618 spa->spa_uberblock.ub_txg = txg - 1; 3619 spa->spa_uberblock.ub_version = version; 3620 spa->spa_ubsync = spa->spa_uberblock; 3621 3622 /* 3623 * Create "The Godfather" zio to hold all async IOs 3624 */ 3625 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3626 KM_SLEEP); 3627 for (int i = 0; i < max_ncpus; i++) { 3628 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3629 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3630 ZIO_FLAG_GODFATHER); 3631 } 3632 3633 /* 3634 * Create the root vdev. 3635 */ 3636 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3637 3638 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3639 3640 ASSERT(error != 0 || rvd != NULL); 3641 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3642 3643 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3644 error = SET_ERROR(EINVAL); 3645 3646 if (error == 0 && 3647 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3648 (error = spa_validate_aux(spa, nvroot, txg, 3649 VDEV_ALLOC_ADD)) == 0) { 3650 for (int c = 0; c < rvd->vdev_children; c++) { 3651 vdev_metaslab_set_size(rvd->vdev_child[c]); 3652 vdev_expand(rvd->vdev_child[c], txg); 3653 } 3654 } 3655 3656 spa_config_exit(spa, SCL_ALL, FTAG); 3657 3658 if (error != 0) { 3659 spa_unload(spa); 3660 spa_deactivate(spa); 3661 spa_remove(spa); 3662 mutex_exit(&spa_namespace_lock); 3663 return (error); 3664 } 3665 3666 /* 3667 * Get the list of spares, if specified. 3668 */ 3669 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3670 &spares, &nspares) == 0) { 3671 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3672 KM_SLEEP) == 0); 3673 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3674 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3675 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3676 spa_load_spares(spa); 3677 spa_config_exit(spa, SCL_ALL, FTAG); 3678 spa->spa_spares.sav_sync = B_TRUE; 3679 } 3680 3681 /* 3682 * Get the list of level 2 cache devices, if specified. 
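 * As with the spares above, the caller-supplied list becomes the
 * in-core sav_config and sav_sync is set so that it is written out
 * during the pool's first sync.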
3683 */ 3684 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3685 &l2cache, &nl2cache) == 0) { 3686 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3687 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3688 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3689 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3690 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3691 spa_load_l2cache(spa); 3692 spa_config_exit(spa, SCL_ALL, FTAG); 3693 spa->spa_l2cache.sav_sync = B_TRUE; 3694 } 3695 3696 spa->spa_is_initializing = B_TRUE; 3697 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3698 spa->spa_meta_objset = dp->dp_meta_objset; 3699 spa->spa_is_initializing = B_FALSE; 3700 3701 /* 3702 * Create DDTs (dedup tables). 3703 */ 3704 ddt_create(spa); 3705 3706 spa_update_dspace(spa); 3707 3708 tx = dmu_tx_create_assigned(dp, txg); 3709 3710 /* 3711 * Create the pool config object. 3712 */ 3713 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3714 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3715 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3716 3717 if (zap_add(spa->spa_meta_objset, 3718 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3719 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3720 cmn_err(CE_PANIC, "failed to add pool config"); 3721 } 3722 3723 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3724 spa_feature_create_zap_objects(spa, tx); 3725 3726 if (zap_add(spa->spa_meta_objset, 3727 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3728 sizeof (uint64_t), 1, &version, tx) != 0) { 3729 cmn_err(CE_PANIC, "failed to add pool version"); 3730 } 3731 3732 /* Newly created pools with the right version are always deflated. */ 3733 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3734 spa->spa_deflate = TRUE; 3735 if (zap_add(spa->spa_meta_objset, 3736 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3737 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3738 cmn_err(CE_PANIC, "failed to add deflate"); 3739 } 3740 } 3741 3742 /* 3743 * Create the deferred-free bpobj. Turn off compression 3744 * because sync-to-convergence takes longer if the blocksize 3745 * keeps changing. 3746 */ 3747 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3748 dmu_object_set_compress(spa->spa_meta_objset, obj, 3749 ZIO_COMPRESS_OFF, tx); 3750 if (zap_add(spa->spa_meta_objset, 3751 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3752 sizeof (uint64_t), 1, &obj, tx) != 0) { 3753 cmn_err(CE_PANIC, "failed to add bpobj"); 3754 } 3755 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3756 spa->spa_meta_objset, obj)); 3757 3758 /* 3759 * Create the pool's history object. 3760 */ 3761 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3762 spa_history_create_obj(spa, tx); 3763 3764 /* 3765 * Generate some random noise for salted checksums to operate on. 3766 */ 3767 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3768 sizeof (spa->spa_cksum_salt.zcs_bytes)); 3769 3770 /* 3771 * Set pool properties. 
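 * The defaults below come from zpool_prop_default_numeric(); any
 * caller-supplied properties are then applied through spa_sync_props()
 * inside the same assigned transaction. For illustration only (the
 * pool name and nvlists in this sketch are hypothetical, not part of
 * this file):
 *
 *     nvlist_t *props = fnvlist_alloc();
 *     fnvlist_add_uint64(props,
 *         zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1);
 *     error = spa_create("tank", nvroot, props, zplprops);
 *     nvlist_free(props);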
3772 */ 3773 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3774 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3775 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3776 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3777 3778 if (props != NULL) { 3779 spa_configfile_set(spa, props, B_FALSE); 3780 spa_sync_props(props, tx); 3781 } 3782 3783 dmu_tx_commit(tx); 3784 3785 spa->spa_sync_on = B_TRUE; 3786 txg_sync_start(spa->spa_dsl_pool); 3787 3788 /* 3789 * We explicitly wait for the first transaction to complete so that our 3790 * bean counters are appropriately updated. 3791 */ 3792 txg_wait_synced(spa->spa_dsl_pool, txg); 3793 3794 spa_config_sync(spa, B_FALSE, B_TRUE); 3795 spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); 3796 3797 spa_history_log_version(spa, "create"); 3798 3799 /* 3800 * Don't count references from objsets that are already closed 3801 * and are making their way through the eviction process. 3802 */ 3803 spa_evicting_os_wait(spa); 3804 spa->spa_minref = refcount_count(&spa->spa_refcount); 3805 3806 mutex_exit(&spa_namespace_lock); 3807 3808 return (0); 3809 } 3810 3811 #ifdef _KERNEL 3812 /* 3813 * Get the root pool information from the root disk, then import the root pool 3814 * during the system boot up time. 3815 */ 3816 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3817 3818 static nvlist_t * 3819 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3820 { 3821 nvlist_t *config; 3822 nvlist_t *nvtop, *nvroot; 3823 uint64_t pgid; 3824 3825 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3826 return (NULL); 3827 3828 /* 3829 * Add this top-level vdev to the child array. 3830 */ 3831 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3832 &nvtop) == 0); 3833 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3834 &pgid) == 0); 3835 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3836 3837 /* 3838 * Put this pool's top-level vdevs into a root vdev. 3839 */ 3840 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3841 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3842 VDEV_TYPE_ROOT) == 0); 3843 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3844 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3845 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3846 &nvtop, 1) == 0); 3847 3848 /* 3849 * Replace the existing vdev_tree with the new root vdev in 3850 * this pool's configuration (remove the old, add the new). 3851 */ 3852 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3853 nvlist_free(nvroot); 3854 return (config); 3855 } 3856 3857 /* 3858 * Walk the vdev tree and see if we can find a device with "better" 3859 * configuration. A configuration is "better" if the label on that 3860 * device has a more recent txg. 3861 */ 3862 static void 3863 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3864 { 3865 for (int c = 0; c < vd->vdev_children; c++) 3866 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3867 3868 if (vd->vdev_ops->vdev_op_leaf) { 3869 nvlist_t *label; 3870 uint64_t label_txg; 3871 3872 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3873 &label) != 0) 3874 return; 3875 3876 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3877 &label_txg) == 0); 3878 3879 /* 3880 * Do we have a better boot device? 
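 * ("Better" here simply means the label carries a strictly greater
 * txg, per the comment above spa_alt_rootvdev().)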
3881 */ 3882 if (label_txg > *txg) { 3883 *txg = label_txg; 3884 *avd = vd; 3885 } 3886 nvlist_free(label); 3887 } 3888 } 3889 3890 /* 3891 * Import a root pool. 3892 * 3893 * For x86, devpath_list will consist of the devid and/or physpath name of 3894 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3895 * The GRUB "findroot" command will return the vdev we should boot. 3896 * 3897 * For Sparc, devpath_list consists of the physpath name of the booting device, 3898 * regardless of whether the root pool is a single-device pool or a mirrored pool. 3899 * e.g. 3900 * "/pci@1f,0/ide@d/disk@0,0:a" 3901 */ 3902 int 3903 spa_import_rootpool(char *devpath, char *devid) 3904 { 3905 spa_t *spa; 3906 vdev_t *rvd, *bvd, *avd = NULL; 3907 nvlist_t *config, *nvtop; 3908 uint64_t guid, txg; 3909 char *pname; 3910 int error; 3911 3912 /* 3913 * Read the label from the boot device and generate a configuration. 3914 */ 3915 config = spa_generate_rootconf(devpath, devid, &guid); 3916 #if defined(_OBP) && defined(_KERNEL) 3917 if (config == NULL) { 3918 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3919 /* iscsi boot */ 3920 get_iscsi_bootpath_phy(devpath); 3921 config = spa_generate_rootconf(devpath, devid, &guid); 3922 } 3923 } 3924 #endif 3925 if (config == NULL) { 3926 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3927 devpath); 3928 return (SET_ERROR(EIO)); 3929 } 3930 3931 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3932 &pname) == 0); 3933 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3934 3935 mutex_enter(&spa_namespace_lock); 3936 if ((spa = spa_lookup(pname)) != NULL) { 3937 /* 3938 * Remove the existing root pool from the namespace so that we 3939 * can replace it with the correct config we just read in. 3940 */ 3941 spa_remove(spa); 3942 } 3943 3944 spa = spa_add(pname, config, NULL); 3945 spa->spa_is_root = B_TRUE; 3946 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3947 3948 /* 3949 * Build up a vdev tree based on the boot device's label config. 3950 */ 3951 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3952 &nvtop) == 0); 3953 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3954 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3955 VDEV_ALLOC_ROOTPOOL); 3956 spa_config_exit(spa, SCL_ALL, FTAG); 3957 if (error) { 3958 mutex_exit(&spa_namespace_lock); 3959 nvlist_free(config); 3960 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3961 pname); 3962 return (error); 3963 } 3964 3965 /* 3966 * Get the boot vdev. 3967 */ 3968 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3969 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3970 (u_longlong_t)guid); 3971 error = SET_ERROR(ENOENT); 3972 goto out; 3973 } 3974 3975 /* 3976 * Determine if there is a better boot device. 3977 */ 3978 avd = bvd; 3979 spa_alt_rootvdev(rvd, &avd, &txg); 3980 if (avd != bvd) { 3981 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3982 "try booting from '%s'", avd->vdev_path); 3983 error = SET_ERROR(EINVAL); 3984 goto out; 3985 } 3986 3987 /* 3988 * If the boot device is part of a spare vdev then ensure that 3989 * we're booting off the active spare. 3990 */ 3991 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3992 !bvd->vdev_isspare) { 3993 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3994 "try booting from '%s'", 3995 bvd->vdev_parent-> 3996 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3997 error = SET_ERROR(EINVAL); 3998 goto out; 3999 } 4000 4001 error = 0; 4002 out: 4003 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4004 vdev_free(rvd); 4005 spa_config_exit(spa, SCL_ALL, FTAG); 4006 mutex_exit(&spa_namespace_lock); 4007 4008 nvlist_free(config); 4009 return (error); 4010 } 4011 4012 #endif 4013 4014 /* 4015 * Import a non-root pool into the system. 4016 */ 4017 int 4018 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4019 { 4020 spa_t *spa; 4021 char *altroot = NULL; 4022 spa_load_state_t state = SPA_LOAD_IMPORT; 4023 zpool_rewind_policy_t policy; 4024 uint64_t mode = spa_mode_global; 4025 uint64_t readonly = B_FALSE; 4026 int error; 4027 nvlist_t *nvroot; 4028 nvlist_t **spares, **l2cache; 4029 uint_t nspares, nl2cache; 4030 4031 /* 4032 * If a pool with this name exists, return failure. 4033 */ 4034 mutex_enter(&spa_namespace_lock); 4035 if (spa_lookup(pool) != NULL) { 4036 mutex_exit(&spa_namespace_lock); 4037 return (SET_ERROR(EEXIST)); 4038 } 4039 4040 /* 4041 * Create and initialize the spa structure. 4042 */ 4043 (void) nvlist_lookup_string(props, 4044 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4045 (void) nvlist_lookup_uint64(props, 4046 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4047 if (readonly) 4048 mode = FREAD; 4049 spa = spa_add(pool, config, altroot); 4050 spa->spa_import_flags = flags; 4051 4052 /* 4053 * Verbatim import - Take a pool and insert it into the namespace 4054 * as if it had been loaded at boot. 4055 */ 4056 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4057 if (props != NULL) 4058 spa_configfile_set(spa, props, B_FALSE); 4059 4060 spa_config_sync(spa, B_FALSE, B_TRUE); 4061 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4062 4063 mutex_exit(&spa_namespace_lock); 4064 return (0); 4065 } 4066 4067 spa_activate(spa, mode); 4068 4069 /* 4070 * Don't start async tasks until we know everything is healthy. 4071 */ 4072 spa_async_suspend(spa); 4073 4074 zpool_get_rewind_policy(config, &policy); 4075 if (policy.zrp_request & ZPOOL_DO_REWIND) 4076 state = SPA_LOAD_RECOVER; 4077 4078 /* 4079 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4080 * because the user-supplied config is actually the one to trust when 4081 * doing an import. 4082 */ 4083 if (state != SPA_LOAD_RECOVER) 4084 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4085 4086 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4087 policy.zrp_request); 4088 4089 /* 4090 * Propagate anything learned while loading the pool and pass it 4091 * back to caller (i.e. rewind info, missing devices, etc). 4092 */ 4093 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4094 spa->spa_load_info) == 0); 4095 4096 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4097 /* 4098 * Toss any existing sparelist, as it doesn't have any validity 4099 * anymore, and conflicts with spa_has_spare(). 
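 *
 * Illustrative sketch (not part of the original code): the user-supplied
 * nvroot may carry its own auxiliary device lists, which take precedence
 * over whatever the on-disk labels recorded.  Roughly, it looks like
 *
 *	vdev_tree
 *	    children[]			top-level data vdevs
 *	    ZPOOL_CONFIG_SPARES[]	e.g. disk c3t0d0 (hypothetical)
 *	    ZPOOL_CONFIG_L2CACHE[]	e.g. disk c4t0d0 (hypothetical)
 *
 * The code further below copies those arrays into spa_spares/spa_l2cache
 * and reloads them, so the imported pool trusts the caller's device paths
 * rather than any stale sav_config carried over from spa_load().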
4100 */ 4101 if (spa->spa_spares.sav_config) { 4102 nvlist_free(spa->spa_spares.sav_config); 4103 spa->spa_spares.sav_config = NULL; 4104 spa_load_spares(spa); 4105 } 4106 if (spa->spa_l2cache.sav_config) { 4107 nvlist_free(spa->spa_l2cache.sav_config); 4108 spa->spa_l2cache.sav_config = NULL; 4109 spa_load_l2cache(spa); 4110 } 4111 4112 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4113 &nvroot) == 0); 4114 if (error == 0) 4115 error = spa_validate_aux(spa, nvroot, -1ULL, 4116 VDEV_ALLOC_SPARE); 4117 if (error == 0) 4118 error = spa_validate_aux(spa, nvroot, -1ULL, 4119 VDEV_ALLOC_L2CACHE); 4120 spa_config_exit(spa, SCL_ALL, FTAG); 4121 4122 if (props != NULL) 4123 spa_configfile_set(spa, props, B_FALSE); 4124 4125 if (error != 0 || (props && spa_writeable(spa) && 4126 (error = spa_prop_set(spa, props)))) { 4127 spa_unload(spa); 4128 spa_deactivate(spa); 4129 spa_remove(spa); 4130 mutex_exit(&spa_namespace_lock); 4131 return (error); 4132 } 4133 4134 spa_async_resume(spa); 4135 4136 /* 4137 * Override any spares and level 2 cache devices as specified by 4138 * the user, as these may have correct device names/devids, etc. 4139 */ 4140 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4141 &spares, &nspares) == 0) { 4142 if (spa->spa_spares.sav_config) 4143 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4144 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4145 else 4146 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4147 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4148 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4149 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4150 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4151 spa_load_spares(spa); 4152 spa_config_exit(spa, SCL_ALL, FTAG); 4153 spa->spa_spares.sav_sync = B_TRUE; 4154 } 4155 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4156 &l2cache, &nl2cache) == 0) { 4157 if (spa->spa_l2cache.sav_config) 4158 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4159 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4160 else 4161 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4162 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4163 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4164 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4165 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4166 spa_load_l2cache(spa); 4167 spa_config_exit(spa, SCL_ALL, FTAG); 4168 spa->spa_l2cache.sav_sync = B_TRUE; 4169 } 4170 4171 /* 4172 * Check for any removed devices. 4173 */ 4174 if (spa->spa_autoreplace) { 4175 spa_aux_check_removed(&spa->spa_spares); 4176 spa_aux_check_removed(&spa->spa_l2cache); 4177 } 4178 4179 if (spa_writeable(spa)) { 4180 /* 4181 * Update the config cache to include the newly-imported pool. 4182 */ 4183 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4184 } 4185 4186 /* 4187 * It's possible that the pool was expanded while it was exported. 4188 * We kick off an async task to handle this for us. 
4189 */ 4190 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4191 4192 spa_history_log_version(spa, "import"); 4193 4194 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4195 4196 mutex_exit(&spa_namespace_lock); 4197 4198 return (0); 4199 } 4200 4201 nvlist_t * 4202 spa_tryimport(nvlist_t *tryconfig) 4203 { 4204 nvlist_t *config = NULL; 4205 char *poolname; 4206 spa_t *spa; 4207 uint64_t state; 4208 int error; 4209 4210 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4211 return (NULL); 4212 4213 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4214 return (NULL); 4215 4216 /* 4217 * Create and initialize the spa structure. 4218 */ 4219 mutex_enter(&spa_namespace_lock); 4220 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4221 spa_activate(spa, FREAD); 4222 4223 /* 4224 * Pass off the heavy lifting to spa_load(). 4225 * Pass TRUE for mosconfig because the user-supplied config 4226 * is actually the one to trust when doing an import. 4227 */ 4228 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4229 4230 /* 4231 * If 'tryconfig' was at least parsable, return the current config. 4232 */ 4233 if (spa->spa_root_vdev != NULL) { 4234 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4235 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4236 poolname) == 0); 4237 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4238 state) == 0); 4239 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4240 spa->spa_uberblock.ub_timestamp) == 0); 4241 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4242 spa->spa_load_info) == 0); 4243 4244 /* 4245 * If the bootfs property exists on this pool then we 4246 * copy it out so that external consumers can tell which 4247 * pools are bootable. 4248 */ 4249 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4250 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4251 4252 /* 4253 * We have to play games with the name since the 4254 * pool was opened as TRYIMPORT_NAME. 4255 */ 4256 if (dsl_dsobj_to_dsname(spa_name(spa), 4257 spa->spa_bootfs, tmpname) == 0) { 4258 char *cp; 4259 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4260 4261 cp = strchr(tmpname, '/'); 4262 if (cp == NULL) { 4263 (void) strlcpy(dsname, tmpname, 4264 MAXPATHLEN); 4265 } else { 4266 (void) snprintf(dsname, MAXPATHLEN, 4267 "%s/%s", poolname, ++cp); 4268 } 4269 VERIFY(nvlist_add_string(config, 4270 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4271 kmem_free(dsname, MAXPATHLEN); 4272 } 4273 kmem_free(tmpname, MAXPATHLEN); 4274 } 4275 4276 /* 4277 * Add the list of hot spares and level 2 cache devices. 4278 */ 4279 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4280 spa_add_spares(spa, config); 4281 spa_add_l2cache(spa, config); 4282 spa_config_exit(spa, SCL_CONFIG, FTAG); 4283 } 4284 4285 spa_unload(spa); 4286 spa_deactivate(spa); 4287 spa_remove(spa); 4288 mutex_exit(&spa_namespace_lock); 4289 4290 return (config); 4291 } 4292 4293 /* 4294 * Pool export/destroy 4295 * 4296 * The act of destroying or exporting a pool is very simple. We make sure there 4297 * is no more pending I/O and any references to the pool are gone. Then, we 4298 * update the pool state and sync all the labels to disk, removing the 4299 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4300 * we don't sync the labels or remove the configuration cache. 
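 *
 * Illustrative summary (not part of the original code) of how the public
 * entry points below map onto spa_export_common():
 *
 *	spa_destroy(pool)		new_state = POOL_STATE_DESTROYED
 *	spa_export(pool, ...)		new_state = POOL_STATE_EXPORTED
 *	spa_reset(pool)			new_state = POOL_STATE_UNINITIALIZED
 *
 * Only spa_export() lets the caller choose 'force' and 'hardforce'; the
 * other two always pass B_FALSE for both.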
4301 */ 4302 static int 4303 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4304 boolean_t force, boolean_t hardforce) 4305 { 4306 spa_t *spa; 4307 4308 if (oldconfig) 4309 *oldconfig = NULL; 4310 4311 if (!(spa_mode_global & FWRITE)) 4312 return (SET_ERROR(EROFS)); 4313 4314 mutex_enter(&spa_namespace_lock); 4315 if ((spa = spa_lookup(pool)) == NULL) { 4316 mutex_exit(&spa_namespace_lock); 4317 return (SET_ERROR(ENOENT)); 4318 } 4319 4320 /* 4321 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4322 * reacquire the namespace lock, and see if we can export. 4323 */ 4324 spa_open_ref(spa, FTAG); 4325 mutex_exit(&spa_namespace_lock); 4326 spa_async_suspend(spa); 4327 mutex_enter(&spa_namespace_lock); 4328 spa_close(spa, FTAG); 4329 4330 /* 4331 * The pool will be in core if it's openable, 4332 * in which case we can modify its state. 4333 */ 4334 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4335 /* 4336 * Objsets may be open only because they're dirty, so we 4337 * have to force it to sync before checking spa_refcnt. 4338 */ 4339 txg_wait_synced(spa->spa_dsl_pool, 0); 4340 spa_evicting_os_wait(spa); 4341 4342 /* 4343 * A pool cannot be exported or destroyed if there are active 4344 * references. If we are resetting a pool, allow references by 4345 * fault injection handlers. 4346 */ 4347 if (!spa_refcount_zero(spa) || 4348 (spa->spa_inject_ref != 0 && 4349 new_state != POOL_STATE_UNINITIALIZED)) { 4350 spa_async_resume(spa); 4351 mutex_exit(&spa_namespace_lock); 4352 return (SET_ERROR(EBUSY)); 4353 } 4354 4355 /* 4356 * A pool cannot be exported if it has an active shared spare. 4357 * This is to prevent other pools from stealing the active spare 4358 * from an exported pool. At the user's discretion, such a pool 4359 * can still be forcibly exported. 4360 */ 4361 if (!force && new_state == POOL_STATE_EXPORTED && 4362 spa_has_active_shared_spare(spa)) { 4363 spa_async_resume(spa); 4364 mutex_exit(&spa_namespace_lock); 4365 return (SET_ERROR(EXDEV)); 4366 } 4367 4368 /* 4369 * We want this to be reflected on every label, 4370 * so mark them all dirty. spa_unload() will do the 4371 * final sync that pushes these changes out. 4372 */ 4373 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4374 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4375 spa->spa_state = new_state; 4376 spa->spa_final_txg = spa_last_synced_txg(spa) + 4377 TXG_DEFER_SIZE + 1; 4378 vdev_config_dirty(spa->spa_root_vdev); 4379 spa_config_exit(spa, SCL_ALL, FTAG); 4380 } 4381 } 4382 4383 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4384 4385 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4386 spa_unload(spa); 4387 spa_deactivate(spa); 4388 } 4389 4390 if (oldconfig && spa->spa_config) 4391 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4392 4393 if (new_state != POOL_STATE_UNINITIALIZED) { 4394 if (!hardforce) 4395 spa_config_sync(spa, B_TRUE, B_TRUE); 4396 spa_remove(spa); 4397 } 4398 mutex_exit(&spa_namespace_lock); 4399 4400 return (0); 4401 } 4402 4403 /* 4404 * Destroy a storage pool. 4405 */ 4406 int 4407 spa_destroy(char *pool) 4408 { 4409 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4410 B_FALSE, B_FALSE)); 4411 } 4412 4413 /* 4414 * Export a storage pool. 
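 *
 * Illustrative sketch (not part of the original code) of a typical caller,
 * e.g. the pool-export ioctl path (variable names are hypothetical):
 *
 *	nvlist_t *oldconfig = NULL;
 *	int err;
 *
 *	err = spa_export("tank", &oldconfig, B_FALSE, B_FALSE);
 *	if (err == EBUSY)
 *		... the pool still has active references or datasets in use
 *	nvlist_free(oldconfig);
 *
 * On success the last-known configuration is handed back through
 * 'oldconfig' so user land can cache it for a later import.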
4415 */ 4416 int 4417 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4418 boolean_t hardforce) 4419 { 4420 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4421 force, hardforce)); 4422 } 4423 4424 /* 4425 * Similar to spa_export(), this unloads the spa_t without actually removing it 4426 * from the namespace in any way. 4427 */ 4428 int 4429 spa_reset(char *pool) 4430 { 4431 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4432 B_FALSE, B_FALSE)); 4433 } 4434 4435 /* 4436 * ========================================================================== 4437 * Device manipulation 4438 * ========================================================================== 4439 */ 4440 4441 /* 4442 * Add a device to a storage pool. 4443 */ 4444 int 4445 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4446 { 4447 uint64_t txg, id; 4448 int error; 4449 vdev_t *rvd = spa->spa_root_vdev; 4450 vdev_t *vd, *tvd; 4451 nvlist_t **spares, **l2cache; 4452 uint_t nspares, nl2cache; 4453 4454 ASSERT(spa_writeable(spa)); 4455 4456 txg = spa_vdev_enter(spa); 4457 4458 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4459 VDEV_ALLOC_ADD)) != 0) 4460 return (spa_vdev_exit(spa, NULL, txg, error)); 4461 4462 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4463 4464 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4465 &nspares) != 0) 4466 nspares = 0; 4467 4468 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4469 &nl2cache) != 0) 4470 nl2cache = 0; 4471 4472 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4473 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4474 4475 if (vd->vdev_children != 0 && 4476 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4477 return (spa_vdev_exit(spa, vd, txg, error)); 4478 4479 /* 4480 * We must validate the spares and l2cache devices after checking the 4481 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4482 */ 4483 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4484 return (spa_vdev_exit(spa, vd, txg, error)); 4485 4486 /* 4487 * Transfer each new top-level vdev from vd to rvd. 4488 */ 4489 for (int c = 0; c < vd->vdev_children; c++) { 4490 4491 /* 4492 * Set the vdev id to the first hole, if one exists. 4493 */ 4494 for (id = 0; id < rvd->vdev_children; id++) { 4495 if (rvd->vdev_child[id]->vdev_ishole) { 4496 vdev_free(rvd->vdev_child[id]); 4497 break; 4498 } 4499 } 4500 tvd = vd->vdev_child[c]; 4501 vdev_remove_child(vd, tvd); 4502 tvd->vdev_id = id; 4503 vdev_add_child(rvd, tvd); 4504 vdev_config_dirty(tvd); 4505 } 4506 4507 if (nspares != 0) { 4508 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4509 ZPOOL_CONFIG_SPARES); 4510 spa_load_spares(spa); 4511 spa->spa_spares.sav_sync = B_TRUE; 4512 } 4513 4514 if (nl2cache != 0) { 4515 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4516 ZPOOL_CONFIG_L2CACHE); 4517 spa_load_l2cache(spa); 4518 spa->spa_l2cache.sav_sync = B_TRUE; 4519 } 4520 4521 /* 4522 * We have to be careful when adding new vdevs to an existing pool. 4523 * If other threads start allocating from these vdevs before we 4524 * sync the config cache, and we lose power, then upon reboot we may 4525 * fail to open the pool because there are DVAs that the config cache 4526 * can't translate. Therefore, we first add the vdevs without 4527 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4528 * and then let spa_config_update() initialize the new metaslabs. 
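 *
 * (Illustrative recap, not part of the original comment.)  The resulting
 * order of operations below is, roughly:
 *
 *	(void) spa_vdev_exit(spa, vd, txg, 0);		config cache synced;
 *							new vdevs visible but
 *							without metaslabs
 *	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);	metaslabs initialized;
 *							allocation may begin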
4529 * 4530 * spa_load() checks for added-but-not-initialized vdevs, so that 4531 * if we lose power at any point in this sequence, the remaining 4532 * steps will be completed the next time we load the pool. 4533 */ 4534 (void) spa_vdev_exit(spa, vd, txg, 0); 4535 4536 mutex_enter(&spa_namespace_lock); 4537 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4538 spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); 4539 mutex_exit(&spa_namespace_lock); 4540 4541 return (0); 4542 } 4543 4544 /* 4545 * Attach a device to a mirror. The arguments are the path to any device 4546 * in the mirror, and the nvroot for the new device. If the path specifies 4547 * a device that is not mirrored, we automatically insert the mirror vdev. 4548 * 4549 * If 'replacing' is specified, the new device is intended to replace the 4550 * existing device; in this case the two devices are made into their own 4551 * mirror using the 'replacing' vdev, which is functionally identical to 4552 * the mirror vdev (it actually reuses all the same ops) but has a few 4553 * extra rules: you can't attach to it after it's been created, and upon 4554 * completion of resilvering, the first disk (the one being replaced) 4555 * is automatically detached. 4556 */ 4557 int 4558 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4559 { 4560 uint64_t txg, dtl_max_txg; 4561 vdev_t *rvd = spa->spa_root_vdev; 4562 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4563 vdev_ops_t *pvops; 4564 char *oldvdpath, *newvdpath; 4565 int newvd_isspare; 4566 int error; 4567 4568 ASSERT(spa_writeable(spa)); 4569 4570 txg = spa_vdev_enter(spa); 4571 4572 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4573 4574 if (oldvd == NULL) 4575 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4576 4577 if (!oldvd->vdev_ops->vdev_op_leaf) 4578 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4579 4580 pvd = oldvd->vdev_parent; 4581 4582 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4583 VDEV_ALLOC_ATTACH)) != 0) 4584 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4585 4586 if (newrootvd->vdev_children != 1) 4587 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4588 4589 newvd = newrootvd->vdev_child[0]; 4590 4591 if (!newvd->vdev_ops->vdev_op_leaf) 4592 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4593 4594 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4595 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4596 4597 /* 4598 * Spares can't replace logs 4599 */ 4600 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4601 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4602 4603 if (!replacing) { 4604 /* 4605 * For attach, the only allowable parent is a mirror or the root 4606 * vdev. 4607 */ 4608 if (pvd->vdev_ops != &vdev_mirror_ops && 4609 pvd->vdev_ops != &vdev_root_ops) 4610 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4611 4612 pvops = &vdev_mirror_ops; 4613 } else { 4614 /* 4615 * Active hot spares can only be replaced by inactive hot 4616 * spares. 4617 */ 4618 if (pvd->vdev_ops == &vdev_spare_ops && 4619 oldvd->vdev_isspare && 4620 !spa_has_spare(spa, newvd->vdev_guid)) 4621 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4622 4623 /* 4624 * If the source is a hot spare, and the parent isn't already a 4625 * spare, then we want to create a new hot spare. Otherwise, we 4626 * want to create a replacing vdev. 
The user is not allowed to 4627 * attach to a spared vdev child unless the 'isspare' state is 4628 * the same (spare replaces spare, non-spare replaces 4629 * non-spare). 4630 */ 4631 if (pvd->vdev_ops == &vdev_replacing_ops && 4632 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4633 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4634 } else if (pvd->vdev_ops == &vdev_spare_ops && 4635 newvd->vdev_isspare != oldvd->vdev_isspare) { 4636 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4637 } 4638 4639 if (newvd->vdev_isspare) 4640 pvops = &vdev_spare_ops; 4641 else 4642 pvops = &vdev_replacing_ops; 4643 } 4644 4645 /* 4646 * Make sure the new device is big enough. 4647 */ 4648 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4649 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4650 4651 /* 4652 * The new device cannot have a higher alignment requirement 4653 * than the top-level vdev. 4654 */ 4655 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4656 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4657 4658 /* 4659 * If this is an in-place replacement, update oldvd's path and devid 4660 * to make it distinguishable from newvd, and unopenable from now on. 4661 */ 4662 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4663 spa_strfree(oldvd->vdev_path); 4664 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4665 KM_SLEEP); 4666 (void) sprintf(oldvd->vdev_path, "%s/%s", 4667 newvd->vdev_path, "old"); 4668 if (oldvd->vdev_devid != NULL) { 4669 spa_strfree(oldvd->vdev_devid); 4670 oldvd->vdev_devid = NULL; 4671 } 4672 } 4673 4674 /* mark the device being resilvered */ 4675 newvd->vdev_resilver_txg = txg; 4676 4677 /* 4678 * If the parent is not a mirror, or if we're replacing, insert the new 4679 * mirror/replacing/spare vdev above oldvd. 4680 */ 4681 if (pvd->vdev_ops != pvops) 4682 pvd = vdev_add_parent(oldvd, pvops); 4683 4684 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4685 ASSERT(pvd->vdev_ops == pvops); 4686 ASSERT(oldvd->vdev_parent == pvd); 4687 4688 /* 4689 * Extract the new device from its root and add it to pvd. 4690 */ 4691 vdev_remove_child(newrootvd, newvd); 4692 newvd->vdev_id = pvd->vdev_children; 4693 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4694 vdev_add_child(pvd, newvd); 4695 4696 tvd = newvd->vdev_top; 4697 ASSERT(pvd->vdev_top == tvd); 4698 ASSERT(tvd->vdev_parent == rvd); 4699 4700 vdev_config_dirty(tvd); 4701 4702 /* 4703 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4704 * for any dmu_sync-ed blocks. It will propagate upward when 4705 * spa_vdev_exit() calls vdev_dtl_reassess(). 4706 */ 4707 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4708 4709 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4710 dtl_max_txg - TXG_INITIAL); 4711 4712 if (newvd->vdev_isspare) { 4713 spa_spare_activate(newvd); 4714 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4715 } 4716 4717 oldvdpath = spa_strdup(oldvd->vdev_path); 4718 newvdpath = spa_strdup(newvd->vdev_path); 4719 newvd_isspare = newvd->vdev_isspare; 4720 4721 /* 4722 * Mark newvd's DTL dirty in this txg. 4723 */ 4724 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4725 4726 /* 4727 * Schedule the resilver to restart in the future. We do this to 4728 * ensure that dmu_sync-ed blocks have been stitched into the 4729 * respective datasets. 
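 *
 * Illustrative example (not part of the original code): with
 * TXG_CONCURRENT_STATES == 3 and an attach performed in txg 100,
 * dtl_max_txg is 103, newvd's DTL_MISSING range set above covers
 * [TXG_INITIAL, 103), and the resilver is asked to restart at txg 103,
 * i.e. only after every txg that could still contain dmu_sync-ed blocks
 * aimed at the old device has made it to disk.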
4730 */ 4731 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4732 4733 if (spa->spa_bootfs) 4734 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4735 4736 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); 4737 4738 /* 4739 * Commit the config 4740 */ 4741 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4742 4743 spa_history_log_internal(spa, "vdev attach", NULL, 4744 "%s vdev=%s %s vdev=%s", 4745 replacing && newvd_isspare ? "spare in" : 4746 replacing ? "replace" : "attach", newvdpath, 4747 replacing ? "for" : "to", oldvdpath); 4748 4749 spa_strfree(oldvdpath); 4750 spa_strfree(newvdpath); 4751 4752 return (0); 4753 } 4754 4755 /* 4756 * Detach a device from a mirror or replacing vdev. 4757 * 4758 * If 'replace_done' is specified, only detach if the parent 4759 * is a replacing vdev. 4760 */ 4761 int 4762 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4763 { 4764 uint64_t txg; 4765 int error; 4766 vdev_t *rvd = spa->spa_root_vdev; 4767 vdev_t *vd, *pvd, *cvd, *tvd; 4768 boolean_t unspare = B_FALSE; 4769 uint64_t unspare_guid = 0; 4770 char *vdpath; 4771 4772 ASSERT(spa_writeable(spa)); 4773 4774 txg = spa_vdev_enter(spa); 4775 4776 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4777 4778 if (vd == NULL) 4779 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4780 4781 if (!vd->vdev_ops->vdev_op_leaf) 4782 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4783 4784 pvd = vd->vdev_parent; 4785 4786 /* 4787 * If the parent/child relationship is not as expected, don't do it. 4788 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4789 * vdev that's replacing B with C. The user's intent in replacing 4790 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4791 * the replace by detaching C, the expected behavior is to end up 4792 * M(A,B). But suppose that right after deciding to detach C, 4793 * the replacement of B completes. We would have M(A,C), and then 4794 * ask to detach C, which would leave us with just A -- not what 4795 * the user wanted. To prevent this, we make sure that the 4796 * parent/child relationship hasn't changed -- in this example, 4797 * that C's parent is still the replacing vdev R. 4798 */ 4799 if (pvd->vdev_guid != pguid && pguid != 0) 4800 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4801 4802 /* 4803 * Only 'replacing' or 'spare' vdevs can be replaced. 4804 */ 4805 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4806 pvd->vdev_ops != &vdev_spare_ops) 4807 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4808 4809 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4810 spa_version(spa) >= SPA_VERSION_SPARES); 4811 4812 /* 4813 * Only mirror, replacing, and spare vdevs support detach. 4814 */ 4815 if (pvd->vdev_ops != &vdev_replacing_ops && 4816 pvd->vdev_ops != &vdev_mirror_ops && 4817 pvd->vdev_ops != &vdev_spare_ops) 4818 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4819 4820 /* 4821 * If this device has the only valid copy of some data, 4822 * we cannot safely detach it. 4823 */ 4824 if (vdev_dtl_required(vd)) 4825 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4826 4827 ASSERT(pvd->vdev_children >= 2); 4828 4829 /* 4830 * If we are detaching the second disk from a replacing vdev, then 4831 * check to see if we changed the original vdev's path to have "/old" 4832 * at the end in spa_vdev_attach(). If so, undo that change now. 
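 *
 * Illustrative example (not part of the original code): if a disk with the
 * hypothetical path "/dev/dsk/c0t0d0s0" was replaced in place, the original
 * child was renamed to "/dev/dsk/c0t0d0s0/old" by spa_vdev_attach().  When
 * that replacement is cancelled by detaching the new disk, the loop below
 * strips the "/old" suffix so the surviving child is once again addressed
 * as "/dev/dsk/c0t0d0s0".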
4833 */ 4834 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4835 vd->vdev_path != NULL) { 4836 size_t len = strlen(vd->vdev_path); 4837 4838 for (int c = 0; c < pvd->vdev_children; c++) { 4839 cvd = pvd->vdev_child[c]; 4840 4841 if (cvd == vd || cvd->vdev_path == NULL) 4842 continue; 4843 4844 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4845 strcmp(cvd->vdev_path + len, "/old") == 0) { 4846 spa_strfree(cvd->vdev_path); 4847 cvd->vdev_path = spa_strdup(vd->vdev_path); 4848 break; 4849 } 4850 } 4851 } 4852 4853 /* 4854 * If we are detaching the original disk from a spare, then it implies 4855 * that the spare should become a real disk, and be removed from the 4856 * active spare list for the pool. 4857 */ 4858 if (pvd->vdev_ops == &vdev_spare_ops && 4859 vd->vdev_id == 0 && 4860 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4861 unspare = B_TRUE; 4862 4863 /* 4864 * Erase the disk labels so the disk can be used for other things. 4865 * This must be done after all other error cases are handled, 4866 * but before we disembowel vd (so we can still do I/O to it). 4867 * But if we can't do it, don't treat the error as fatal -- 4868 * it may be that the unwritability of the disk is the reason 4869 * it's being detached! 4870 */ 4871 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4872 4873 /* 4874 * Remove vd from its parent and compact the parent's children. 4875 */ 4876 vdev_remove_child(pvd, vd); 4877 vdev_compact_children(pvd); 4878 4879 /* 4880 * Remember one of the remaining children so we can get tvd below. 4881 */ 4882 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4883 4884 /* 4885 * If we need to remove the remaining child from the list of hot spares, 4886 * do it now, marking the vdev as no longer a spare in the process. 4887 * We must do this before vdev_remove_parent(), because that can 4888 * change the GUID if it creates a new toplevel GUID. For a similar 4889 * reason, we must remove the spare now, in the same txg as the detach; 4890 * otherwise someone could attach a new sibling, change the GUID, and 4891 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4892 */ 4893 if (unspare) { 4894 ASSERT(cvd->vdev_isspare); 4895 spa_spare_remove(cvd); 4896 unspare_guid = cvd->vdev_guid; 4897 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4898 cvd->vdev_unspare = B_TRUE; 4899 } 4900 4901 /* 4902 * If the parent mirror/replacing vdev only has one child, 4903 * the parent is no longer needed. Remove it from the tree. 4904 */ 4905 if (pvd->vdev_children == 1) { 4906 if (pvd->vdev_ops == &vdev_spare_ops) 4907 cvd->vdev_unspare = B_FALSE; 4908 vdev_remove_parent(cvd); 4909 } 4910 4911 4912 /* 4913 * We don't set tvd until now because the parent we just removed 4914 * may have been the previous top-level vdev. 4915 */ 4916 tvd = cvd->vdev_top; 4917 ASSERT(tvd->vdev_parent == rvd); 4918 4919 /* 4920 * Reevaluate the parent vdev state. 4921 */ 4922 vdev_propagate_state(cvd); 4923 4924 /* 4925 * If the 'autoexpand' property is set on the pool then automatically 4926 * try to expand the size of the pool. For example if the device we 4927 * just detached was smaller than the others, it may be possible to 4928 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4929 * first so that we can obtain the updated sizes of the leaf vdevs. 4930 */ 4931 if (spa->spa_autoexpand) { 4932 vdev_reopen(tvd); 4933 vdev_expand(tvd, txg); 4934 } 4935 4936 vdev_config_dirty(tvd); 4937 4938 /* 4939 * Mark vd's DTL as dirty in this txg. 
vdev_dtl_sync() will see that 4940 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4941 * But first make sure we're not on any *other* txg's DTL list, to 4942 * prevent vd from being accessed after it's freed. 4943 */ 4944 vdpath = spa_strdup(vd->vdev_path); 4945 for (int t = 0; t < TXG_SIZE; t++) 4946 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4947 vd->vdev_detached = B_TRUE; 4948 vdev_dirty(tvd, VDD_DTL, vd, txg); 4949 4950 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4951 4952 /* hang on to the spa before we release the lock */ 4953 spa_open_ref(spa, FTAG); 4954 4955 error = spa_vdev_exit(spa, vd, txg, 0); 4956 4957 spa_history_log_internal(spa, "detach", NULL, 4958 "vdev=%s", vdpath); 4959 spa_strfree(vdpath); 4960 4961 /* 4962 * If this was the removal of the original device in a hot spare vdev, 4963 * then we want to go through and remove the device from the hot spare 4964 * list of every other pool. 4965 */ 4966 if (unspare) { 4967 spa_t *altspa = NULL; 4968 4969 mutex_enter(&spa_namespace_lock); 4970 while ((altspa = spa_next(altspa)) != NULL) { 4971 if (altspa->spa_state != POOL_STATE_ACTIVE || 4972 altspa == spa) 4973 continue; 4974 4975 spa_open_ref(altspa, FTAG); 4976 mutex_exit(&spa_namespace_lock); 4977 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4978 mutex_enter(&spa_namespace_lock); 4979 spa_close(altspa, FTAG); 4980 } 4981 mutex_exit(&spa_namespace_lock); 4982 4983 /* search the rest of the vdevs for spares to remove */ 4984 spa_vdev_resilver_done(spa); 4985 } 4986 4987 /* all done with the spa; OK to release */ 4988 mutex_enter(&spa_namespace_lock); 4989 spa_close(spa, FTAG); 4990 mutex_exit(&spa_namespace_lock); 4991 4992 return (error); 4993 } 4994 4995 /* 4996 * Split a set of devices from their mirrors, and create a new pool from them. 
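 *
 * Illustrative sketch (not part of the original code): for a pool made of
 * two 2-way mirrors, the caller (normally the "zpool split" ioctl path)
 * passes a config whose vdev tree names exactly one side of each mirror,
 * e.g.
 *
 *	vdev_tree
 *	    children[0] = disk c1t1d0	(one half of mirror-0)
 *	    children[1] = disk c2t1d0	(one half of mirror-1)
 *
 * and then calls, roughly,
 *
 *	error = spa_vdev_split_mirror(spa, "newpool", config, NULL, B_FALSE);
 *
 * The device and pool names above are hypothetical.  Passing exp == B_TRUE
 * leaves the new pool exported rather than active, as handled at the end
 * of this function.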
4997 */ 4998 int 4999 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5000 nvlist_t *props, boolean_t exp) 5001 { 5002 int error = 0; 5003 uint64_t txg, *glist; 5004 spa_t *newspa; 5005 uint_t c, children, lastlog; 5006 nvlist_t **child, *nvl, *tmp; 5007 dmu_tx_t *tx; 5008 char *altroot = NULL; 5009 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5010 boolean_t activate_slog; 5011 5012 ASSERT(spa_writeable(spa)); 5013 5014 txg = spa_vdev_enter(spa); 5015 5016 /* clear the log and flush everything up to now */ 5017 activate_slog = spa_passivate_log(spa); 5018 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5019 error = spa_offline_log(spa); 5020 txg = spa_vdev_config_enter(spa); 5021 5022 if (activate_slog) 5023 spa_activate_log(spa); 5024 5025 if (error != 0) 5026 return (spa_vdev_exit(spa, NULL, txg, error)); 5027 5028 /* check new spa name before going any further */ 5029 if (spa_lookup(newname) != NULL) 5030 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5031 5032 /* 5033 * scan through all the children to ensure they're all mirrors 5034 */ 5035 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5036 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5037 &children) != 0) 5038 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5039 5040 /* first, check to ensure we've got the right child count */ 5041 rvd = spa->spa_root_vdev; 5042 lastlog = 0; 5043 for (c = 0; c < rvd->vdev_children; c++) { 5044 vdev_t *vd = rvd->vdev_child[c]; 5045 5046 /* don't count the holes & logs as children */ 5047 if (vd->vdev_islog || vd->vdev_ishole) { 5048 if (lastlog == 0) 5049 lastlog = c; 5050 continue; 5051 } 5052 5053 lastlog = 0; 5054 } 5055 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 5056 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5057 5058 /* next, ensure no spare or cache devices are part of the split */ 5059 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5060 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5061 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5062 5063 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5064 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5065 5066 /* then, loop over each vdev and validate it */ 5067 for (c = 0; c < children; c++) { 5068 uint64_t is_hole = 0; 5069 5070 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5071 &is_hole); 5072 5073 if (is_hole != 0) { 5074 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5075 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5076 continue; 5077 } else { 5078 error = SET_ERROR(EINVAL); 5079 break; 5080 } 5081 } 5082 5083 /* which disk is going to be split? 
*/ 5084 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5085 &glist[c]) != 0) { 5086 error = SET_ERROR(EINVAL); 5087 break; 5088 } 5089 5090 /* look it up in the spa */ 5091 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5092 if (vml[c] == NULL) { 5093 error = SET_ERROR(ENODEV); 5094 break; 5095 } 5096 5097 /* make sure there's nothing stopping the split */ 5098 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5099 vml[c]->vdev_islog || 5100 vml[c]->vdev_ishole || 5101 vml[c]->vdev_isspare || 5102 vml[c]->vdev_isl2cache || 5103 !vdev_writeable(vml[c]) || 5104 vml[c]->vdev_children != 0 || 5105 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5106 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5107 error = SET_ERROR(EINVAL); 5108 break; 5109 } 5110 5111 if (vdev_dtl_required(vml[c])) { 5112 error = SET_ERROR(EBUSY); 5113 break; 5114 } 5115 5116 /* we need certain info from the top level */ 5117 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5118 vml[c]->vdev_top->vdev_ms_array) == 0); 5119 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5120 vml[c]->vdev_top->vdev_ms_shift) == 0); 5121 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5122 vml[c]->vdev_top->vdev_asize) == 0); 5123 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5124 vml[c]->vdev_top->vdev_ashift) == 0); 5125 5126 /* transfer per-vdev ZAPs */ 5127 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 5128 VERIFY0(nvlist_add_uint64(child[c], 5129 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 5130 5131 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 5132 VERIFY0(nvlist_add_uint64(child[c], 5133 ZPOOL_CONFIG_VDEV_TOP_ZAP, 5134 vml[c]->vdev_parent->vdev_top_zap)); 5135 } 5136 5137 if (error != 0) { 5138 kmem_free(vml, children * sizeof (vdev_t *)); 5139 kmem_free(glist, children * sizeof (uint64_t)); 5140 return (spa_vdev_exit(spa, NULL, txg, error)); 5141 } 5142 5143 /* stop writers from using the disks */ 5144 for (c = 0; c < children; c++) { 5145 if (vml[c] != NULL) 5146 vml[c]->vdev_offline = B_TRUE; 5147 } 5148 vdev_reopen(spa->spa_root_vdev); 5149 5150 /* 5151 * Temporarily record the splitting vdevs in the spa config. This 5152 * will disappear once the config is regenerated. 5153 */ 5154 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5155 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5156 glist, children) == 0); 5157 kmem_free(glist, children * sizeof (uint64_t)); 5158 5159 mutex_enter(&spa->spa_props_lock); 5160 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5161 nvl) == 0); 5162 mutex_exit(&spa->spa_props_lock); 5163 spa->spa_config_splitting = nvl; 5164 vdev_config_dirty(spa->spa_root_vdev); 5165 5166 /* configure and create the new pool */ 5167 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5168 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5169 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5170 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5171 spa_version(spa)) == 0); 5172 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5173 spa->spa_config_txg) == 0); 5174 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5175 spa_generate_guid(NULL)) == 0); 5176 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 5177 (void) nvlist_lookup_string(props, 5178 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5179 5180 /* add the new pool to the namespace */ 5181 newspa = spa_add(newname, config, altroot); 5182 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 5183 newspa->spa_config_txg = spa->spa_config_txg; 5184 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5185 5186 /* release the spa config lock, retaining the namespace lock */ 5187 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5188 5189 if (zio_injection_enabled) 5190 zio_handle_panic_injection(spa, FTAG, 1); 5191 5192 spa_activate(newspa, spa_mode_global); 5193 spa_async_suspend(newspa); 5194 5195 /* create the new pool from the disks of the original pool */ 5196 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5197 if (error) 5198 goto out; 5199 5200 /* if that worked, generate a real config for the new pool */ 5201 if (newspa->spa_root_vdev != NULL) { 5202 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5203 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5204 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5205 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5206 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5207 B_TRUE)); 5208 } 5209 5210 /* set the props */ 5211 if (props != NULL) { 5212 spa_configfile_set(newspa, props, B_FALSE); 5213 error = spa_prop_set(newspa, props); 5214 if (error) 5215 goto out; 5216 } 5217 5218 /* flush everything */ 5219 txg = spa_vdev_config_enter(newspa); 5220 vdev_config_dirty(newspa->spa_root_vdev); 5221 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5222 5223 if (zio_injection_enabled) 5224 zio_handle_panic_injection(spa, FTAG, 2); 5225 5226 spa_async_resume(newspa); 5227 5228 /* finally, update the original pool's config */ 5229 txg = spa_vdev_config_enter(spa); 5230 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5231 error = dmu_tx_assign(tx, TXG_WAIT); 5232 if (error != 0) 5233 dmu_tx_abort(tx); 5234 for (c = 0; c < children; c++) { 5235 if (vml[c] != NULL) { 5236 vdev_split(vml[c]); 5237 if (error == 0) 5238 spa_history_log_internal(spa, "detach", tx, 5239 "vdev=%s", vml[c]->vdev_path); 5240 5241 vdev_free(vml[c]); 5242 } 5243 } 5244 spa->spa_avz_action = AVZ_ACTION_REBUILD; 5245 vdev_config_dirty(spa->spa_root_vdev); 5246 spa->spa_config_splitting = NULL; 5247 nvlist_free(nvl); 5248 if (error == 0) 5249 dmu_tx_commit(tx); 5250 (void) spa_vdev_exit(spa, NULL, txg, 0); 5251 5252 if (zio_injection_enabled) 5253 zio_handle_panic_injection(spa, FTAG, 3); 5254 5255 /* split is complete; log a history record */ 5256 spa_history_log_internal(newspa, "split", NULL, 5257 "from pool %s", spa_name(spa)); 5258 5259 kmem_free(vml, children * sizeof (vdev_t *)); 5260 5261 /* if we're not going to mount the filesystems in userland, export */ 5262 if (exp) 5263 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5264 B_FALSE, B_FALSE); 5265 5266 return (error); 5267 5268 out: 5269 spa_unload(newspa); 5270 spa_deactivate(newspa); 5271 spa_remove(newspa); 5272 5273 txg = spa_vdev_config_enter(spa); 5274 5275 /* re-online all offlined disks */ 5276 for (c = 0; c < children; 
c++) { 5277 if (vml[c] != NULL) 5278 vml[c]->vdev_offline = B_FALSE; 5279 } 5280 vdev_reopen(spa->spa_root_vdev); 5281 5282 nvlist_free(spa->spa_config_splitting); 5283 spa->spa_config_splitting = NULL; 5284 (void) spa_vdev_exit(spa, NULL, txg, error); 5285 5286 kmem_free(vml, children * sizeof (vdev_t *)); 5287 return (error); 5288 } 5289 5290 static nvlist_t * 5291 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5292 { 5293 for (int i = 0; i < count; i++) { 5294 uint64_t guid; 5295 5296 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5297 &guid) == 0); 5298 5299 if (guid == target_guid) 5300 return (nvpp[i]); 5301 } 5302 5303 return (NULL); 5304 } 5305 5306 static void 5307 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5308 nvlist_t *dev_to_remove) 5309 { 5310 nvlist_t **newdev = NULL; 5311 5312 if (count > 1) 5313 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5314 5315 for (int i = 0, j = 0; i < count; i++) { 5316 if (dev[i] == dev_to_remove) 5317 continue; 5318 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5319 } 5320 5321 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5322 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5323 5324 for (int i = 0; i < count - 1; i++) 5325 nvlist_free(newdev[i]); 5326 5327 if (count > 1) 5328 kmem_free(newdev, (count - 1) * sizeof (void *)); 5329 } 5330 5331 /* 5332 * Evacuate the device. 5333 */ 5334 static int 5335 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5336 { 5337 uint64_t txg; 5338 int error = 0; 5339 5340 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5341 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5342 ASSERT(vd == vd->vdev_top); 5343 5344 /* 5345 * Evacuate the device. We don't hold the config lock as writer 5346 * since we need to do I/O but we do keep the 5347 * spa_namespace_lock held. Once this completes the device 5348 * should no longer have any blocks allocated on it. 5349 */ 5350 if (vd->vdev_islog) { 5351 if (vd->vdev_stat.vs_alloc != 0) 5352 error = spa_offline_log(spa); 5353 } else { 5354 error = SET_ERROR(ENOTSUP); 5355 } 5356 5357 if (error) 5358 return (error); 5359 5360 /* 5361 * The evacuation succeeded. Remove any remaining MOS metadata 5362 * associated with this vdev, and wait for these changes to sync. 5363 */ 5364 ASSERT0(vd->vdev_stat.vs_alloc); 5365 txg = spa_vdev_config_enter(spa); 5366 vd->vdev_removing = B_TRUE; 5367 vdev_dirty_leaves(vd, VDD_DTL, txg); 5368 vdev_config_dirty(vd); 5369 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5370 5371 return (0); 5372 } 5373 5374 /* 5375 * Complete the removal by cleaning up the namespace. 5376 */ 5377 static void 5378 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5379 { 5380 vdev_t *rvd = spa->spa_root_vdev; 5381 uint64_t id = vd->vdev_id; 5382 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5383 5384 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5385 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5386 ASSERT(vd == vd->vdev_top); 5387 5388 /* 5389 * Only remove any devices which are empty. 
5390 */ 5391 if (vd->vdev_stat.vs_alloc != 0) 5392 return; 5393 5394 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5395 5396 if (list_link_active(&vd->vdev_state_dirty_node)) 5397 vdev_state_clean(vd); 5398 if (list_link_active(&vd->vdev_config_dirty_node)) 5399 vdev_config_clean(vd); 5400 5401 vdev_free(vd); 5402 5403 if (last_vdev) { 5404 vdev_compact_children(rvd); 5405 } else { 5406 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5407 vdev_add_child(rvd, vd); 5408 } 5409 vdev_config_dirty(rvd); 5410 5411 /* 5412 * Reassess the health of our root vdev. 5413 */ 5414 vdev_reopen(rvd); 5415 } 5416 5417 /* 5418 * Remove a device from the pool - 5419 * 5420 * Removing a device from the vdev namespace requires several steps 5421 * and can take a significant amount of time. As a result we use 5422 * the spa_vdev_config_[enter/exit] functions which allow us to 5423 * grab and release the spa_config_lock while still holding the namespace 5424 * lock. During each step the configuration is synced out. 5425 * 5426 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5427 * devices. 5428 */ 5429 int 5430 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5431 { 5432 vdev_t *vd; 5433 metaslab_group_t *mg; 5434 nvlist_t **spares, **l2cache, *nv; 5435 uint64_t txg = 0; 5436 uint_t nspares, nl2cache; 5437 int error = 0; 5438 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5439 5440 ASSERT(spa_writeable(spa)); 5441 5442 if (!locked) 5443 txg = spa_vdev_enter(spa); 5444 5445 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5446 5447 if (spa->spa_spares.sav_vdevs != NULL && 5448 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5449 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5450 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5451 /* 5452 * Only remove the hot spare if it's not currently in use 5453 * in this pool. 5454 */ 5455 if (vd == NULL || unspare) { 5456 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5457 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5458 spa_load_spares(spa); 5459 spa->spa_spares.sav_sync = B_TRUE; 5460 } else { 5461 error = SET_ERROR(EBUSY); 5462 } 5463 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5464 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5465 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5466 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5467 /* 5468 * Cache devices can always be removed. 5469 */ 5470 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5471 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5472 spa_load_l2cache(spa); 5473 spa->spa_l2cache.sav_sync = B_TRUE; 5474 } else if (vd != NULL && vd->vdev_islog) { 5475 ASSERT(!locked); 5476 ASSERT(vd == vd->vdev_top); 5477 5478 mg = vd->vdev_mg; 5479 5480 /* 5481 * Stop allocating from this vdev. 5482 */ 5483 metaslab_group_passivate(mg); 5484 5485 /* 5486 * Wait for the youngest allocations and frees to sync, 5487 * and then wait for the deferral of those frees to finish. 5488 */ 5489 spa_vdev_config_exit(spa, NULL, 5490 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5491 5492 /* 5493 * Attempt to evacuate the vdev. 5494 */ 5495 error = spa_vdev_remove_evacuate(spa, vd); 5496 5497 txg = spa_vdev_config_enter(spa); 5498 5499 /* 5500 * If we couldn't evacuate the vdev, unwind. 5501 */ 5502 if (error) { 5503 metaslab_group_activate(mg); 5504 return (spa_vdev_exit(spa, NULL, txg, error)); 5505 } 5506 5507 /* 5508 * Clean up the vdev namespace. 
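 *
 * Illustrative recap (not part of the original code) of the log-device
 * removal sequence driven above:
 *
 *	metaslab_group_passivate(mg);			stop new allocations
 *	spa_vdev_config_exit(spa, NULL, txg + ...);	wait for outstanding
 *							allocations and frees
 *	spa_vdev_remove_evacuate(spa, vd);		empty the device
 *	spa_vdev_remove_from_namespace(spa, vd);	drop it from the tree
 *
 * If evacuation fails, the metaslab group is reactivated instead and the
 * log device stays exactly where it was.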
5509 */ 5510 spa_vdev_remove_from_namespace(spa, vd); 5511 5512 } else if (vd != NULL) { 5513 /* 5514 * Normal vdevs cannot be removed (yet). 5515 */ 5516 error = SET_ERROR(ENOTSUP); 5517 } else { 5518 /* 5519 * There is no vdev of any kind with the specified guid. 5520 */ 5521 error = SET_ERROR(ENOENT); 5522 } 5523 5524 if (!locked) 5525 return (spa_vdev_exit(spa, NULL, txg, error)); 5526 5527 return (error); 5528 } 5529 5530 /* 5531 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5532 * currently spared, so we can detach it. 5533 */ 5534 static vdev_t * 5535 spa_vdev_resilver_done_hunt(vdev_t *vd) 5536 { 5537 vdev_t *newvd, *oldvd; 5538 5539 for (int c = 0; c < vd->vdev_children; c++) { 5540 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5541 if (oldvd != NULL) 5542 return (oldvd); 5543 } 5544 5545 /* 5546 * Check for a completed replacement. We always consider the first 5547 * vdev in the list to be the oldest vdev, and the last one to be 5548 * the newest (see spa_vdev_attach() for how that works). In 5549 * the case where the newest vdev is faulted, we will not automatically 5550 * remove it after a resilver completes. This is OK as it will require 5551 * user intervention to determine which disk the admin wishes to keep. 5552 */ 5553 if (vd->vdev_ops == &vdev_replacing_ops) { 5554 ASSERT(vd->vdev_children > 1); 5555 5556 newvd = vd->vdev_child[vd->vdev_children - 1]; 5557 oldvd = vd->vdev_child[0]; 5558 5559 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5560 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5561 !vdev_dtl_required(oldvd)) 5562 return (oldvd); 5563 } 5564 5565 /* 5566 * Check for a completed resilver with the 'unspare' flag set. 5567 */ 5568 if (vd->vdev_ops == &vdev_spare_ops) { 5569 vdev_t *first = vd->vdev_child[0]; 5570 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5571 5572 if (last->vdev_unspare) { 5573 oldvd = first; 5574 newvd = last; 5575 } else if (first->vdev_unspare) { 5576 oldvd = last; 5577 newvd = first; 5578 } else { 5579 oldvd = NULL; 5580 } 5581 5582 if (oldvd != NULL && 5583 vdev_dtl_empty(newvd, DTL_MISSING) && 5584 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5585 !vdev_dtl_required(oldvd)) 5586 return (oldvd); 5587 5588 /* 5589 * If there are more than two spares attached to a disk, 5590 * and those spares are not required, then we want to 5591 * attempt to free them up now so that they can be used 5592 * by other pools. Once we're back down to a single 5593 * disk+spare, we stop removing them. 5594 */ 5595 if (vd->vdev_children > 2) { 5596 newvd = vd->vdev_child[1]; 5597 5598 if (newvd->vdev_isspare && last->vdev_isspare && 5599 vdev_dtl_empty(last, DTL_MISSING) && 5600 vdev_dtl_empty(last, DTL_OUTAGE) && 5601 !vdev_dtl_required(newvd)) 5602 return (newvd); 5603 } 5604 } 5605 5606 return (NULL); 5607 } 5608 5609 static void 5610 spa_vdev_resilver_done(spa_t *spa) 5611 { 5612 vdev_t *vd, *pvd, *ppvd; 5613 uint64_t guid, sguid, pguid, ppguid; 5614 5615 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5616 5617 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5618 pvd = vd->vdev_parent; 5619 ppvd = pvd->vdev_parent; 5620 guid = vd->vdev_guid; 5621 pguid = pvd->vdev_guid; 5622 ppguid = ppvd->vdev_guid; 5623 sguid = 0; 5624 /* 5625 * If we have just finished replacing a hot spared device, then 5626 * we need to detach the parent's first child (the original hot 5627 * spare) as well. 
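 *
 * Illustrative example (not part of the original code): after a hot spare
 * has covered for a failed disk and the failed disk is then replaced, the
 * subtree looks like
 *
 *	spare (ppvd)
 *	    replacing (pvd)
 *	        old-disk	vd: the replaced disk, detached first
 *	        new-disk	the permanent replacement
 *	    spare-disk		ppvd->vdev_child[1]: detached second, which
 *				makes the spare available again
 *
 * The device names are hypothetical; only the shape of the tree matters.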
5628 */ 5629 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5630 ppvd->vdev_children == 2) { 5631 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5632 sguid = ppvd->vdev_child[1]->vdev_guid; 5633 } 5634 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5635 5636 spa_config_exit(spa, SCL_ALL, FTAG); 5637 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5638 return; 5639 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5640 return; 5641 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5642 } 5643 5644 spa_config_exit(spa, SCL_ALL, FTAG); 5645 } 5646 5647 /* 5648 * Update the stored path or FRU for this vdev. 5649 */ 5650 int 5651 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5652 boolean_t ispath) 5653 { 5654 vdev_t *vd; 5655 boolean_t sync = B_FALSE; 5656 5657 ASSERT(spa_writeable(spa)); 5658 5659 spa_vdev_state_enter(spa, SCL_ALL); 5660 5661 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5662 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5663 5664 if (!vd->vdev_ops->vdev_op_leaf) 5665 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5666 5667 if (ispath) { 5668 if (strcmp(value, vd->vdev_path) != 0) { 5669 spa_strfree(vd->vdev_path); 5670 vd->vdev_path = spa_strdup(value); 5671 sync = B_TRUE; 5672 } 5673 } else { 5674 if (vd->vdev_fru == NULL) { 5675 vd->vdev_fru = spa_strdup(value); 5676 sync = B_TRUE; 5677 } else if (strcmp(value, vd->vdev_fru) != 0) { 5678 spa_strfree(vd->vdev_fru); 5679 vd->vdev_fru = spa_strdup(value); 5680 sync = B_TRUE; 5681 } 5682 } 5683 5684 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5685 } 5686 5687 int 5688 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5689 { 5690 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5691 } 5692 5693 int 5694 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5695 { 5696 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5697 } 5698 5699 /* 5700 * ========================================================================== 5701 * SPA Scanning 5702 * ========================================================================== 5703 */ 5704 5705 int 5706 spa_scan_stop(spa_t *spa) 5707 { 5708 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5709 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5710 return (SET_ERROR(EBUSY)); 5711 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5712 } 5713 5714 int 5715 spa_scan(spa_t *spa, pool_scan_func_t func) 5716 { 5717 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5718 5719 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5720 return (SET_ERROR(ENOTSUP)); 5721 5722 /* 5723 * If a resilver was requested, but there is no DTL on a 5724 * writeable leaf device, we have nothing to do. 
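 *
 * Illustrative sketch (not part of the original code): the "zpool scrub"
 * ioctl path reduces to roughly
 *
 *	error = spa_scan(spa, POOL_SCAN_SCRUB);		start a scrub
 *	error = spa_scan_stop(spa);			cancel it again
 *
 * spa_scan_stop() above refuses with EBUSY while a resilver is running,
 * since cancelling a resilver would leave the pool's redundancy
 * unrestored.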
5725 */ 5726 if (func == POOL_SCAN_RESILVER && 5727 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5728 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5729 return (0); 5730 } 5731 5732 return (dsl_scan(spa->spa_dsl_pool, func)); 5733 } 5734 5735 /* 5736 * ========================================================================== 5737 * SPA async task processing 5738 * ========================================================================== 5739 */ 5740 5741 static void 5742 spa_async_remove(spa_t *spa, vdev_t *vd) 5743 { 5744 if (vd->vdev_remove_wanted) { 5745 vd->vdev_remove_wanted = B_FALSE; 5746 vd->vdev_delayed_close = B_FALSE; 5747 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5748 5749 /* 5750 * We want to clear the stats, but we don't want to do a full 5751 * vdev_clear() as that will cause us to throw away 5752 * degraded/faulted state as well as attempt to reopen the 5753 * device, all of which is a waste. 5754 */ 5755 vd->vdev_stat.vs_read_errors = 0; 5756 vd->vdev_stat.vs_write_errors = 0; 5757 vd->vdev_stat.vs_checksum_errors = 0; 5758 5759 vdev_state_dirty(vd->vdev_top); 5760 } 5761 5762 for (int c = 0; c < vd->vdev_children; c++) 5763 spa_async_remove(spa, vd->vdev_child[c]); 5764 } 5765 5766 static void 5767 spa_async_probe(spa_t *spa, vdev_t *vd) 5768 { 5769 if (vd->vdev_probe_wanted) { 5770 vd->vdev_probe_wanted = B_FALSE; 5771 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5772 } 5773 5774 for (int c = 0; c < vd->vdev_children; c++) 5775 spa_async_probe(spa, vd->vdev_child[c]); 5776 } 5777 5778 static void 5779 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5780 { 5781 sysevent_id_t eid; 5782 nvlist_t *attr; 5783 char *physpath; 5784 5785 if (!spa->spa_autoexpand) 5786 return; 5787 5788 for (int c = 0; c < vd->vdev_children; c++) { 5789 vdev_t *cvd = vd->vdev_child[c]; 5790 spa_async_autoexpand(spa, cvd); 5791 } 5792 5793 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5794 return; 5795 5796 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5797 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5798 5799 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5800 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5801 5802 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5803 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 5804 5805 nvlist_free(attr); 5806 kmem_free(physpath, MAXPATHLEN); 5807 } 5808 5809 static void 5810 spa_async_thread(spa_t *spa) 5811 { 5812 int tasks; 5813 5814 ASSERT(spa->spa_sync_on); 5815 5816 mutex_enter(&spa->spa_async_lock); 5817 tasks = spa->spa_async_tasks; 5818 spa->spa_async_tasks = 0; 5819 mutex_exit(&spa->spa_async_lock); 5820 5821 /* 5822 * See if the config needs to be updated. 5823 */ 5824 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5825 uint64_t old_space, new_space; 5826 5827 mutex_enter(&spa_namespace_lock); 5828 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5829 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5830 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5831 mutex_exit(&spa_namespace_lock); 5832 5833 /* 5834 * If the pool grew as a result of the config update, 5835 * then log an internal history event. 5836 */ 5837 if (new_space != old_space) { 5838 spa_history_log_internal(spa, "vdev online", NULL, 5839 "pool '%s' size: %llu(+%llu)", 5840 spa_name(spa), new_space, new_space - old_space); 5841 } 5842 } 5843 5844 /* 5845 * See if any devices need to be marked REMOVED. 
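* This walks the root vdev tree as well as the L2ARC and spare auxiliary vdevs.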
5846 */ 5847 if (tasks & SPA_ASYNC_REMOVE) { 5848 spa_vdev_state_enter(spa, SCL_NONE); 5849 spa_async_remove(spa, spa->spa_root_vdev); 5850 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5851 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5852 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5853 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5854 (void) spa_vdev_state_exit(spa, NULL, 0); 5855 } 5856 5857 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5858 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5859 spa_async_autoexpand(spa, spa->spa_root_vdev); 5860 spa_config_exit(spa, SCL_CONFIG, FTAG); 5861 } 5862 5863 /* 5864 * See if any devices need to be probed. 5865 */ 5866 if (tasks & SPA_ASYNC_PROBE) { 5867 spa_vdev_state_enter(spa, SCL_NONE); 5868 spa_async_probe(spa, spa->spa_root_vdev); 5869 (void) spa_vdev_state_exit(spa, NULL, 0); 5870 } 5871 5872 /* 5873 * If any devices are done replacing, detach them. 5874 */ 5875 if (tasks & SPA_ASYNC_RESILVER_DONE) 5876 spa_vdev_resilver_done(spa); 5877 5878 /* 5879 * Kick off a resilver. 5880 */ 5881 if (tasks & SPA_ASYNC_RESILVER) 5882 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5883 5884 /* 5885 * Let the world know that we're done. 5886 */ 5887 mutex_enter(&spa->spa_async_lock); 5888 spa->spa_async_thread = NULL; 5889 cv_broadcast(&spa->spa_async_cv); 5890 mutex_exit(&spa->spa_async_lock); 5891 thread_exit(); 5892 } 5893 5894 void 5895 spa_async_suspend(spa_t *spa) 5896 { 5897 mutex_enter(&spa->spa_async_lock); 5898 spa->spa_async_suspended++; 5899 while (spa->spa_async_thread != NULL) 5900 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5901 mutex_exit(&spa->spa_async_lock); 5902 } 5903 5904 void 5905 spa_async_resume(spa_t *spa) 5906 { 5907 mutex_enter(&spa->spa_async_lock); 5908 ASSERT(spa->spa_async_suspended != 0); 5909 spa->spa_async_suspended--; 5910 mutex_exit(&spa->spa_async_lock); 5911 } 5912 5913 static boolean_t 5914 spa_async_tasks_pending(spa_t *spa) 5915 { 5916 uint_t non_config_tasks; 5917 uint_t config_task; 5918 boolean_t config_task_suspended; 5919 5920 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 5921 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 5922 if (spa->spa_ccw_fail_time == 0) { 5923 config_task_suspended = B_FALSE; 5924 } else { 5925 config_task_suspended = 5926 (gethrtime() - spa->spa_ccw_fail_time) < 5927 (zfs_ccw_retry_interval * NANOSEC); 5928 } 5929 5930 return (non_config_tasks || (config_task && !config_task_suspended)); 5931 } 5932 5933 static void 5934 spa_async_dispatch(spa_t *spa) 5935 { 5936 mutex_enter(&spa->spa_async_lock); 5937 if (spa_async_tasks_pending(spa) && 5938 !spa->spa_async_suspended && 5939 spa->spa_async_thread == NULL && 5940 rootdir != NULL) 5941 spa->spa_async_thread = thread_create(NULL, 0, 5942 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5943 mutex_exit(&spa->spa_async_lock); 5944 } 5945 5946 void 5947 spa_async_request(spa_t *spa, int task) 5948 { 5949 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5950 mutex_enter(&spa->spa_async_lock); 5951 spa->spa_async_tasks |= task; 5952 mutex_exit(&spa->spa_async_lock); 5953 } 5954 5955 /* 5956 * ========================================================================== 5957 * SPA syncing routines 5958 * ========================================================================== 5959 */ 5960 5961 static int 5962 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5963 { 5964 bpobj_t *bpo = arg; 5965 bpobj_enqueue(bpo, bp, 
tx); 5966 return (0); 5967 } 5968 5969 static int 5970 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5971 { 5972 zio_t *zio = arg; 5973 5974 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5975 zio->io_flags)); 5976 return (0); 5977 } 5978 5979 /* 5980 * Note: this simple function is not inlined to make it easier to dtrace the 5981 * amount of time spent syncing frees. 5982 */ 5983 static void 5984 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 5985 { 5986 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5987 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 5988 VERIFY(zio_wait(zio) == 0); 5989 } 5990 5991 /* 5992 * Note: this simple function is not inlined to make it easier to dtrace the 5993 * amount of time spent syncing deferred frees. 5994 */ 5995 static void 5996 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 5997 { 5998 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5999 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6000 spa_free_sync_cb, zio, tx), ==, 0); 6001 VERIFY0(zio_wait(zio)); 6002 } 6003 6004 6005 static void 6006 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6007 { 6008 char *packed = NULL; 6009 size_t bufsize; 6010 size_t nvsize = 0; 6011 dmu_buf_t *db; 6012 6013 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6014 6015 /* 6016 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6017 * information. This avoids the dmu_buf_will_dirty() path and 6018 * saves us a pre-read to get data we don't actually care about. 6019 */ 6020 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6021 packed = kmem_alloc(bufsize, KM_SLEEP); 6022 6023 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6024 KM_SLEEP) == 0); 6025 bzero(packed + nvsize, bufsize - nvsize); 6026 6027 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6028 6029 kmem_free(packed, bufsize); 6030 6031 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6032 dmu_buf_will_dirty(db, tx); 6033 *(uint64_t *)db->db_data = nvsize; 6034 dmu_buf_rele(db, FTAG); 6035 } 6036 6037 static void 6038 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6039 const char *config, const char *entry) 6040 { 6041 nvlist_t *nvroot; 6042 nvlist_t **list; 6043 int i; 6044 6045 if (!sav->sav_sync) 6046 return; 6047 6048 /* 6049 * Update the MOS nvlist describing the list of available devices. 6050 * spa_validate_aux() will have already made sure this nvlist is 6051 * valid and the vdevs are labeled appropriately. 
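* On the first sync of this list we also allocate the packed-nvlist object that backs it and record it under 'entry' in the MOS directory.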
6052 */ 6053 if (sav->sav_object == 0) { 6054 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6055 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6056 sizeof (uint64_t), tx); 6057 VERIFY(zap_update(spa->spa_meta_objset, 6058 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6059 &sav->sav_object, tx) == 0); 6060 } 6061 6062 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6063 if (sav->sav_count == 0) { 6064 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6065 } else { 6066 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6067 for (i = 0; i < sav->sav_count; i++) 6068 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6069 B_FALSE, VDEV_CONFIG_L2CACHE); 6070 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6071 sav->sav_count) == 0); 6072 for (i = 0; i < sav->sav_count; i++) 6073 nvlist_free(list[i]); 6074 kmem_free(list, sav->sav_count * sizeof (void *)); 6075 } 6076 6077 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6078 nvlist_free(nvroot); 6079 6080 sav->sav_sync = B_FALSE; 6081 } 6082 6083 /* 6084 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 6085 * The all-vdev ZAP must be empty. 6086 */ 6087 static void 6088 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 6089 { 6090 spa_t *spa = vd->vdev_spa; 6091 if (vd->vdev_top_zap != 0) { 6092 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6093 vd->vdev_top_zap, tx)); 6094 } 6095 if (vd->vdev_leaf_zap != 0) { 6096 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6097 vd->vdev_leaf_zap, tx)); 6098 } 6099 for (uint64_t i = 0; i < vd->vdev_children; i++) { 6100 spa_avz_build(vd->vdev_child[i], avz, tx); 6101 } 6102 } 6103 6104 static void 6105 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6106 { 6107 nvlist_t *config; 6108 6109 /* 6110 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 6111 * its config may not be dirty but we still need to build per-vdev ZAPs. 6112 * Similarly, if the pool is being assembled (e.g. after a split), we 6113 * need to rebuild the AVZ although the config may not be dirty. 
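* In either case spa_avz_action records what is required; if it is AVZ_ACTION_NONE and the config is clean, there is nothing to do.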
6114 */ 6115 if (list_is_empty(&spa->spa_config_dirty_list) && 6116 spa->spa_avz_action == AVZ_ACTION_NONE) 6117 return; 6118 6119 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6120 6121 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 6122 spa->spa_all_vdev_zaps != 0); 6123 6124 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 6125 /* Make and build the new AVZ */ 6126 uint64_t new_avz = zap_create(spa->spa_meta_objset, 6127 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 6128 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 6129 6130 /* Diff old AVZ with new one */ 6131 zap_cursor_t zc; 6132 zap_attribute_t za; 6133 6134 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6135 spa->spa_all_vdev_zaps); 6136 zap_cursor_retrieve(&zc, &za) == 0; 6137 zap_cursor_advance(&zc)) { 6138 uint64_t vdzap = za.za_first_integer; 6139 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 6140 vdzap) == ENOENT) { 6141 /* 6142 * ZAP is listed in old AVZ but not in new one; 6143 * destroy it 6144 */ 6145 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 6146 tx)); 6147 } 6148 } 6149 6150 zap_cursor_fini(&zc); 6151 6152 /* Destroy the old AVZ */ 6153 VERIFY0(zap_destroy(spa->spa_meta_objset, 6154 spa->spa_all_vdev_zaps, tx)); 6155 6156 /* Replace the old AVZ in the dir obj with the new one */ 6157 VERIFY0(zap_update(spa->spa_meta_objset, 6158 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 6159 sizeof (new_avz), 1, &new_avz, tx)); 6160 6161 spa->spa_all_vdev_zaps = new_avz; 6162 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 6163 zap_cursor_t zc; 6164 zap_attribute_t za; 6165 6166 /* Walk through the AVZ and destroy all listed ZAPs */ 6167 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6168 spa->spa_all_vdev_zaps); 6169 zap_cursor_retrieve(&zc, &za) == 0; 6170 zap_cursor_advance(&zc)) { 6171 uint64_t zap = za.za_first_integer; 6172 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 6173 } 6174 6175 zap_cursor_fini(&zc); 6176 6177 /* Destroy and unlink the AVZ itself */ 6178 VERIFY0(zap_destroy(spa->spa_meta_objset, 6179 spa->spa_all_vdev_zaps, tx)); 6180 VERIFY0(zap_remove(spa->spa_meta_objset, 6181 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 6182 spa->spa_all_vdev_zaps = 0; 6183 } 6184 6185 if (spa->spa_all_vdev_zaps == 0) { 6186 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 6187 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 6188 DMU_POOL_VDEV_ZAP_MAP, tx); 6189 } 6190 spa->spa_avz_action = AVZ_ACTION_NONE; 6191 6192 /* Create ZAPs for vdevs that don't have them. */ 6193 vdev_construct_zaps(spa->spa_root_vdev, tx); 6194 6195 config = spa_config_generate(spa, spa->spa_root_vdev, 6196 dmu_tx_get_txg(tx), B_FALSE); 6197 6198 /* 6199 * If we're upgrading the spa version then make sure that 6200 * the config object gets updated with the correct version. 6201 */ 6202 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6203 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6204 spa->spa_uberblock.ub_version); 6205 6206 spa_config_exit(spa, SCL_STATE, FTAG); 6207 6208 nvlist_free(spa->spa_config_syncing); 6209 spa->spa_config_syncing = config; 6210 6211 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6212 } 6213 6214 static void 6215 spa_sync_version(void *arg, dmu_tx_t *tx) 6216 { 6217 uint64_t *versionp = arg; 6218 uint64_t version = *versionp; 6219 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6220 6221 /* 6222 * Setting the version is special cased when first creating the pool. 
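* By the time this sync task runs the pool already exists, so the txg cannot be the initial one.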
6223 */ 6224 ASSERT(tx->tx_txg != TXG_INITIAL); 6225 6226 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6227 ASSERT(version >= spa_version(spa)); 6228 6229 spa->spa_uberblock.ub_version = version; 6230 vdev_config_dirty(spa->spa_root_vdev); 6231 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6232 } 6233 6234 /* 6235 * Set zpool properties. 6236 */ 6237 static void 6238 spa_sync_props(void *arg, dmu_tx_t *tx) 6239 { 6240 nvlist_t *nvp = arg; 6241 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6242 objset_t *mos = spa->spa_meta_objset; 6243 nvpair_t *elem = NULL; 6244 6245 mutex_enter(&spa->spa_props_lock); 6246 6247 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6248 uint64_t intval; 6249 char *strval, *fname; 6250 zpool_prop_t prop; 6251 const char *propname; 6252 zprop_type_t proptype; 6253 spa_feature_t fid; 6254 6255 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6256 case ZPROP_INVAL: 6257 /* 6258 * We checked this earlier in spa_prop_validate(). 6259 */ 6260 ASSERT(zpool_prop_feature(nvpair_name(elem))); 6261 6262 fname = strchr(nvpair_name(elem), '@') + 1; 6263 VERIFY0(zfeature_lookup_name(fname, &fid)); 6264 6265 spa_feature_enable(spa, fid, tx); 6266 spa_history_log_internal(spa, "set", tx, 6267 "%s=enabled", nvpair_name(elem)); 6268 break; 6269 6270 case ZPOOL_PROP_VERSION: 6271 intval = fnvpair_value_uint64(elem); 6272 /* 6273 * The version is synced separately before other 6274 * properties and should be correct by now. 6275 */ 6276 ASSERT3U(spa_version(spa), >=, intval); 6277 break; 6278 6279 case ZPOOL_PROP_ALTROOT: 6280 /* 6281 * 'altroot' is a non-persistent property. It should 6282 * have been set temporarily at creation or import time. 6283 */ 6284 ASSERT(spa->spa_root != NULL); 6285 break; 6286 6287 case ZPOOL_PROP_READONLY: 6288 case ZPOOL_PROP_CACHEFILE: 6289 /* 6290 * 'readonly' and 'cachefile' are also non-persistent 6291 * properties. 6292 */ 6293 break; 6294 case ZPOOL_PROP_COMMENT: 6295 strval = fnvpair_value_string(elem); 6296 if (spa->spa_comment != NULL) 6297 spa_strfree(spa->spa_comment); 6298 spa->spa_comment = spa_strdup(strval); 6299 /* 6300 * We need to dirty the configuration on all the vdevs 6301 * so that their labels get updated. It's unnecessary 6302 * to do this for pool creation since the vdev's 6303 * configuration has already been dirtied. 6304 */ 6305 if (tx->tx_txg != TXG_INITIAL) 6306 vdev_config_dirty(spa->spa_root_vdev); 6307 spa_history_log_internal(spa, "set", tx, 6308 "%s=%s", nvpair_name(elem), strval); 6309 break; 6310 default: 6311 /* 6312 * Set pool property values in the poolprops mos object.
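* The props ZAP object is created on demand the first time a persistent property is set.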
6313 */ 6314 if (spa->spa_pool_props_object == 0) { 6315 spa->spa_pool_props_object = 6316 zap_create_link(mos, DMU_OT_POOL_PROPS, 6317 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6318 tx); 6319 } 6320 6321 /* normalize the property name */ 6322 propname = zpool_prop_to_name(prop); 6323 proptype = zpool_prop_get_type(prop); 6324 6325 if (nvpair_type(elem) == DATA_TYPE_STRING) { 6326 ASSERT(proptype == PROP_TYPE_STRING); 6327 strval = fnvpair_value_string(elem); 6328 VERIFY0(zap_update(mos, 6329 spa->spa_pool_props_object, propname, 6330 1, strlen(strval) + 1, strval, tx)); 6331 spa_history_log_internal(spa, "set", tx, 6332 "%s=%s", nvpair_name(elem), strval); 6333 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6334 intval = fnvpair_value_uint64(elem); 6335 6336 if (proptype == PROP_TYPE_INDEX) { 6337 const char *unused; 6338 VERIFY0(zpool_prop_index_to_string( 6339 prop, intval, &unused)); 6340 } 6341 VERIFY0(zap_update(mos, 6342 spa->spa_pool_props_object, propname, 6343 8, 1, &intval, tx)); 6344 spa_history_log_internal(spa, "set", tx, 6345 "%s=%lld", nvpair_name(elem), intval); 6346 } else { 6347 ASSERT(0); /* not allowed */ 6348 } 6349 6350 switch (prop) { 6351 case ZPOOL_PROP_DELEGATION: 6352 spa->spa_delegation = intval; 6353 break; 6354 case ZPOOL_PROP_BOOTFS: 6355 spa->spa_bootfs = intval; 6356 break; 6357 case ZPOOL_PROP_FAILUREMODE: 6358 spa->spa_failmode = intval; 6359 break; 6360 case ZPOOL_PROP_AUTOEXPAND: 6361 spa->spa_autoexpand = intval; 6362 if (tx->tx_txg != TXG_INITIAL) 6363 spa_async_request(spa, 6364 SPA_ASYNC_AUTOEXPAND); 6365 break; 6366 case ZPOOL_PROP_DEDUPDITTO: 6367 spa->spa_dedup_ditto = intval; 6368 break; 6369 default: 6370 break; 6371 } 6372 } 6373 6374 } 6375 6376 mutex_exit(&spa->spa_props_lock); 6377 } 6378 6379 /* 6380 * Perform one-time upgrade on-disk changes. spa_version() does not 6381 * reflect the new version this txg, so there must be no changes this 6382 * txg to anything that the upgrade code depends on after it executes. 6383 * Therefore this must be called after dsl_pool_sync() does the sync 6384 * tasks. 
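* It runs at most once per txg, during the first sync pass.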
6385 */ 6386 static void 6387 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6388 { 6389 dsl_pool_t *dp = spa->spa_dsl_pool; 6390 6391 ASSERT(spa->spa_sync_pass == 1); 6392 6393 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6394 6395 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6396 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6397 dsl_pool_create_origin(dp, tx); 6398 6399 /* Keeping the origin open increases spa_minref */ 6400 spa->spa_minref += 3; 6401 } 6402 6403 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6404 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6405 dsl_pool_upgrade_clones(dp, tx); 6406 } 6407 6408 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6409 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6410 dsl_pool_upgrade_dir_clones(dp, tx); 6411 6412 /* Keeping the freedir open increases spa_minref */ 6413 spa->spa_minref += 3; 6414 } 6415 6416 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6417 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6418 spa_feature_create_zap_objects(spa, tx); 6419 } 6420 6421 /* 6422 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 6423 * when the possibility to use lz4 compression for metadata was added. 6424 * Old pools that have this feature enabled must be upgraded to have 6425 * this feature active. 6426 */ 6427 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6428 boolean_t lz4_en = spa_feature_is_enabled(spa, 6429 SPA_FEATURE_LZ4_COMPRESS); 6430 boolean_t lz4_ac = spa_feature_is_active(spa, 6431 SPA_FEATURE_LZ4_COMPRESS); 6432 6433 if (lz4_en && !lz4_ac) 6434 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6435 } 6436 6437 /* 6438 * If we haven't written the salt, do so now. Note that the 6439 * feature may not be activated yet, but that's fine since 6440 * the presence of this ZAP entry is backwards compatible. 6441 */ 6442 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6443 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 6444 VERIFY0(zap_add(spa->spa_meta_objset, 6445 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 6446 sizeof (spa->spa_cksum_salt.zcs_bytes), 6447 spa->spa_cksum_salt.zcs_bytes, tx)); 6448 } 6449 6450 rrw_exit(&dp->dp_config_rwlock, FTAG); 6451 } 6452 6453 /* 6454 * Sync the specified transaction group. New blocks may be dirtied as 6455 * part of the process, so we iterate until it converges. 6456 */ 6457 void 6458 spa_sync(spa_t *spa, uint64_t txg) 6459 { 6460 dsl_pool_t *dp = spa->spa_dsl_pool; 6461 objset_t *mos = spa->spa_meta_objset; 6462 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6463 vdev_t *rvd = spa->spa_root_vdev; 6464 vdev_t *vd; 6465 dmu_tx_t *tx; 6466 int error; 6467 6468 VERIFY(spa_writeable(spa)); 6469 6470 /* 6471 * Lock out configuration changes. 6472 */ 6473 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6474 6475 spa->spa_syncing_txg = txg; 6476 spa->spa_sync_pass = 0; 6477 6478 /* 6479 * If there are any pending vdev state changes, convert them 6480 * into config changes that go out with this transaction group. 6481 */ 6482 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6483 while (list_head(&spa->spa_state_dirty_list) != NULL) { 6484 /* 6485 * We need the write lock here because, for aux vdevs, 6486 * calling vdev_config_dirty() modifies sav_config. 6487 * This is ugly and will become unnecessary when we 6488 * eliminate the aux vdev wart by integrating all vdevs 6489 * into the root vdev tree.
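* So we drop the read locks, re-take them as writer, drain the dirty state list, and then drop back to reader before checking again.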
6490 */ 6491 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6492 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6493 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6494 vdev_state_clean(vd); 6495 vdev_config_dirty(vd); 6496 } 6497 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6498 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6499 } 6500 spa_config_exit(spa, SCL_STATE, FTAG); 6501 6502 tx = dmu_tx_create_assigned(dp, txg); 6503 6504 spa->spa_sync_starttime = gethrtime(); 6505 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6506 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6507 6508 /* 6509 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6510 * set spa_deflate if we have no raid-z vdevs. 6511 */ 6512 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6513 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6514 int i; 6515 6516 for (i = 0; i < rvd->vdev_children; i++) { 6517 vd = rvd->vdev_child[i]; 6518 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6519 break; 6520 } 6521 if (i == rvd->vdev_children) { 6522 spa->spa_deflate = TRUE; 6523 VERIFY(0 == zap_add(spa->spa_meta_objset, 6524 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6525 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6526 } 6527 } 6528 6529 /* 6530 * Iterate to convergence. 6531 */ 6532 do { 6533 int pass = ++spa->spa_sync_pass; 6534 6535 spa_sync_config_object(spa, tx); 6536 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6537 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6538 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6539 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6540 spa_errlog_sync(spa, txg); 6541 dsl_pool_sync(dp, txg); 6542 6543 if (pass < zfs_sync_pass_deferred_free) { 6544 spa_sync_frees(spa, free_bpl, tx); 6545 } else { 6546 /* 6547 * We can not defer frees in pass 1, because 6548 * we sync the deferred frees later in pass 1. 6549 */ 6550 ASSERT3U(pass, >, 1); 6551 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6552 &spa->spa_deferred_bpobj, tx); 6553 } 6554 6555 ddt_sync(spa, txg); 6556 dsl_scan_sync(dp, tx); 6557 6558 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6559 vdev_sync(vd, txg); 6560 6561 if (pass == 1) { 6562 spa_sync_upgrades(spa, tx); 6563 ASSERT3U(txg, >=, 6564 spa->spa_uberblock.ub_rootbp.blk_birth); 6565 /* 6566 * Note: We need to check if the MOS is dirty 6567 * because we could have marked the MOS dirty 6568 * without updating the uberblock (e.g. if we 6569 * have sync tasks but no dirty user data). We 6570 * need to check the uberblock's rootbp because 6571 * it is updated if we have synced out dirty 6572 * data (though in this case the MOS will most 6573 * likely also be dirty due to second order 6574 * effects, we don't want to rely on that here). 6575 */ 6576 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 6577 !dmu_objset_is_dirty(mos, txg)) { 6578 /* 6579 * Nothing changed on the first pass, 6580 * therefore this TXG is a no-op. Avoid 6581 * syncing deferred frees, so that we 6582 * can keep this TXG as a no-op. 6583 */ 6584 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 6585 txg)); 6586 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6587 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 6588 break; 6589 } 6590 spa_sync_deferred_frees(spa, tx); 6591 } 6592 6593 } while (dmu_objset_is_dirty(mos, txg)); 6594 6595 if (!list_is_empty(&spa->spa_config_dirty_list)) { 6596 /* 6597 * Make sure that the number of ZAPs for all the vdevs matches 6598 * the number of ZAPs in the per-vdev ZAP list. 
This only gets 6599 * called if the config is dirty; otherwise there may be 6600 * outstanding AVZ operations that weren't completed in 6601 * spa_sync_config_object. 6602 */ 6603 uint64_t all_vdev_zap_entry_count; 6604 ASSERT0(zap_count(spa->spa_meta_objset, 6605 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 6606 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 6607 all_vdev_zap_entry_count); 6608 } 6609 6610 /* 6611 * Rewrite the vdev configuration (which includes the uberblock) 6612 * to commit the transaction group. 6613 * 6614 * If there are no dirty vdevs, we sync the uberblock to a few 6615 * random top-level vdevs that are known to be visible in the 6616 * config cache (see spa_vdev_add() for a complete description). 6617 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6618 */ 6619 for (;;) { 6620 /* 6621 * We hold SCL_STATE to prevent vdev open/close/etc. 6622 * while we're attempting to write the vdev labels. 6623 */ 6624 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6625 6626 if (list_is_empty(&spa->spa_config_dirty_list)) { 6627 vdev_t *svd[SPA_DVAS_PER_BP]; 6628 int svdcount = 0; 6629 int children = rvd->vdev_children; 6630 int c0 = spa_get_random(children); 6631 6632 for (int c = 0; c < children; c++) { 6633 vd = rvd->vdev_child[(c0 + c) % children]; 6634 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6635 continue; 6636 svd[svdcount++] = vd; 6637 if (svdcount == SPA_DVAS_PER_BP) 6638 break; 6639 } 6640 error = vdev_config_sync(svd, svdcount, txg); 6641 } else { 6642 error = vdev_config_sync(rvd->vdev_child, 6643 rvd->vdev_children, txg); 6644 } 6645 6646 if (error == 0) 6647 spa->spa_last_synced_guid = rvd->vdev_guid; 6648 6649 spa_config_exit(spa, SCL_STATE, FTAG); 6650 6651 if (error == 0) 6652 break; 6653 zio_suspend(spa, NULL); 6654 zio_resume_wait(spa); 6655 } 6656 dmu_tx_commit(tx); 6657 6658 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6659 6660 /* 6661 * Clear the dirty config list. 6662 */ 6663 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6664 vdev_config_clean(vd); 6665 6666 /* 6667 * Now that the new config has synced transactionally, 6668 * let it become visible to the config cache. 6669 */ 6670 if (spa->spa_config_syncing != NULL) { 6671 spa_config_set(spa, spa->spa_config_syncing); 6672 spa->spa_config_txg = txg; 6673 spa->spa_config_syncing = NULL; 6674 } 6675 6676 spa->spa_ubsync = spa->spa_uberblock; 6677 6678 dsl_pool_sync_done(dp, txg); 6679 6680 /* 6681 * Update usable space statistics. 6682 */ 6683 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6684 vdev_sync_done(vd, txg); 6685 6686 spa_update_dspace(spa); 6687 6688 /* 6689 * It had better be the case that we didn't dirty anything 6690 * since vdev_config_sync(). 6691 */ 6692 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6693 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6694 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6695 6696 spa->spa_sync_pass = 0; 6697 6698 spa_config_exit(spa, SCL_CONFIG, FTAG); 6699 6700 spa_handle_ignored_writes(spa); 6701 6702 /* 6703 * If any async tasks have been requested, kick them off. 6704 */ 6705 spa_async_dispatch(spa); 6706 } 6707 6708 /* 6709 * Sync all pools. We don't want to hold the namespace lock across these 6710 * operations, so we take a reference on the spa_t and drop the lock during the 6711 * sync. 
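* Pools that are not active, not writeable, or currently suspended are skipped.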
6712 */ 6713 void 6714 spa_sync_allpools(void) 6715 { 6716 spa_t *spa = NULL; 6717 mutex_enter(&spa_namespace_lock); 6718 while ((spa = spa_next(spa)) != NULL) { 6719 if (spa_state(spa) != POOL_STATE_ACTIVE || 6720 !spa_writeable(spa) || spa_suspended(spa)) 6721 continue; 6722 spa_open_ref(spa, FTAG); 6723 mutex_exit(&spa_namespace_lock); 6724 txg_wait_synced(spa_get_dsl(spa), 0); 6725 mutex_enter(&spa_namespace_lock); 6726 spa_close(spa, FTAG); 6727 } 6728 mutex_exit(&spa_namespace_lock); 6729 } 6730 6731 /* 6732 * ========================================================================== 6733 * Miscellaneous routines 6734 * ========================================================================== 6735 */ 6736 6737 /* 6738 * Remove all pools in the system. 6739 */ 6740 void 6741 spa_evict_all(void) 6742 { 6743 spa_t *spa; 6744 6745 /* 6746 * Remove all cached state. All pools should be closed now, 6747 * so every spa in the AVL tree should be unreferenced. 6748 */ 6749 mutex_enter(&spa_namespace_lock); 6750 while ((spa = spa_next(NULL)) != NULL) { 6751 /* 6752 * Stop async tasks. The async thread may need to detach 6753 * a device that's been replaced, which requires grabbing 6754 * spa_namespace_lock, so we must drop it here. 6755 */ 6756 spa_open_ref(spa, FTAG); 6757 mutex_exit(&spa_namespace_lock); 6758 spa_async_suspend(spa); 6759 mutex_enter(&spa_namespace_lock); 6760 spa_close(spa, FTAG); 6761 6762 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6763 spa_unload(spa); 6764 spa_deactivate(spa); 6765 } 6766 spa_remove(spa); 6767 } 6768 mutex_exit(&spa_namespace_lock); 6769 } 6770 6771 vdev_t * 6772 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6773 { 6774 vdev_t *vd; 6775 int i; 6776 6777 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6778 return (vd); 6779 6780 if (aux) { 6781 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6782 vd = spa->spa_l2cache.sav_vdevs[i]; 6783 if (vd->vdev_guid == guid) 6784 return (vd); 6785 } 6786 6787 for (i = 0; i < spa->spa_spares.sav_count; i++) { 6788 vd = spa->spa_spares.sav_vdevs[i]; 6789 if (vd->vdev_guid == guid) 6790 return (vd); 6791 } 6792 } 6793 6794 return (NULL); 6795 } 6796 6797 void 6798 spa_upgrade(spa_t *spa, uint64_t version) 6799 { 6800 ASSERT(spa_writeable(spa)); 6801 6802 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6803 6804 /* 6805 * This should only be called for a non-faulted pool, and since a 6806 * future version would result in an unopenable pool, this shouldn't be 6807 * possible. 6808 */ 6809 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6810 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6811 6812 spa->spa_uberblock.ub_version = version; 6813 vdev_config_dirty(spa->spa_root_vdev); 6814 6815 spa_config_exit(spa, SCL_ALL, FTAG); 6816 6817 txg_wait_synced(spa_get_dsl(spa), 0); 6818 } 6819 6820 boolean_t 6821 spa_has_spare(spa_t *spa, uint64_t guid) 6822 { 6823 int i; 6824 uint64_t spareguid; 6825 spa_aux_vdev_t *sav = &spa->spa_spares; 6826 6827 for (i = 0; i < sav->sav_count; i++) 6828 if (sav->sav_vdevs[i]->vdev_guid == guid) 6829 return (B_TRUE); 6830 6831 for (i = 0; i < sav->sav_npending; i++) { 6832 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6833 &spareguid) == 0 && spareguid == guid) 6834 return (B_TRUE); 6835 } 6836 6837 return (B_FALSE); 6838 } 6839 6840 /* 6841 * Check if a pool has an active shared spare device. 
6842 * Note: reference count of an active spare is 2, as a spare and as a replace 6843 */ 6844 static boolean_t 6845 spa_has_active_shared_spare(spa_t *spa) 6846 { 6847 int i, refcnt; 6848 uint64_t pool; 6849 spa_aux_vdev_t *sav = &spa->spa_spares; 6850 6851 for (i = 0; i < sav->sav_count; i++) { 6852 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6853 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6854 refcnt > 2) 6855 return (B_TRUE); 6856 } 6857 6858 return (B_FALSE); 6859 } 6860 6861 /* 6862 * Post a sysevent corresponding to the given event. The 'name' must be one of 6863 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6864 * filled in from the spa and (optionally) the vdev. This doesn't do anything 6865 * in the userland libzpool, as we don't want consumers to misinterpret ztest 6866 * or zdb as real changes. 6867 */ 6868 void 6869 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6870 { 6871 #ifdef _KERNEL 6872 sysevent_t *ev; 6873 sysevent_attr_list_t *attr = NULL; 6874 sysevent_value_t value; 6875 sysevent_id_t eid; 6876 6877 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6878 SE_SLEEP); 6879 6880 value.value_type = SE_DATA_TYPE_STRING; 6881 value.value.sv_string = spa_name(spa); 6882 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6883 goto done; 6884 6885 value.value_type = SE_DATA_TYPE_UINT64; 6886 value.value.sv_uint64 = spa_guid(spa); 6887 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6888 goto done; 6889 6890 if (vd) { 6891 value.value_type = SE_DATA_TYPE_UINT64; 6892 value.value.sv_uint64 = vd->vdev_guid; 6893 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6894 SE_SLEEP) != 0) 6895 goto done; 6896 6897 if (vd->vdev_path) { 6898 value.value_type = SE_DATA_TYPE_STRING; 6899 value.value.sv_string = vd->vdev_path; 6900 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6901 &value, SE_SLEEP) != 0) 6902 goto done; 6903 } 6904 } 6905 6906 if (sysevent_attach_attributes(ev, attr) != 0) 6907 goto done; 6908 attr = NULL; 6909 6910 (void) log_sysevent(ev, SE_SLEEP, &eid); 6911 6912 done: 6913 if (attr) 6914 sysevent_free_attr(attr); 6915 sysevent_free(ev); 6916 #endif 6917 } 6918