1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 */ 31 32 /* 33 * SPA: Storage Pool Allocator 34 * 35 * This file contains all the routines used when modifying on-disk SPA state. 36 * This includes opening, importing, destroying, exporting a pool, and syncing a 37 * pool. 38 */ 39 40 #include <sys/zfs_context.h> 41 #include <sys/fm/fs/zfs.h> 42 #include <sys/spa_impl.h> 43 #include <sys/zio.h> 44 #include <sys/zio_checksum.h> 45 #include <sys/dmu.h> 46 #include <sys/dmu_tx.h> 47 #include <sys/zap.h> 48 #include <sys/zil.h> 49 #include <sys/ddt.h> 50 #include <sys/vdev_impl.h> 51 #include <sys/metaslab.h> 52 #include <sys/metaslab_impl.h> 53 #include <sys/uberblock_impl.h> 54 #include <sys/txg.h> 55 #include <sys/avl.h> 56 #include <sys/dmu_traverse.h> 57 #include <sys/dmu_objset.h> 58 #include <sys/unique.h> 59 #include <sys/dsl_pool.h> 60 #include <sys/dsl_dataset.h> 61 #include <sys/dsl_dir.h> 62 #include <sys/dsl_prop.h> 63 #include <sys/dsl_synctask.h> 64 #include <sys/fs/zfs.h> 65 #include <sys/arc.h> 66 #include <sys/callb.h> 67 #include <sys/systeminfo.h> 68 #include <sys/spa_boot.h> 69 #include <sys/zfs_ioctl.h> 70 #include <sys/dsl_scan.h> 71 #include <sys/zfeature.h> 72 #include <sys/dsl_destroy.h> 73 74 #ifdef _KERNEL 75 #include <sys/bootprops.h> 76 #include <sys/callb.h> 77 #include <sys/cpupart.h> 78 #include <sys/pool.h> 79 #include <sys/sysdc.h> 80 #include <sys/zone.h> 81 #endif /* _KERNEL */ 82 83 #include "zfs_prop.h" 84 #include "zfs_comutil.h" 85 86 /* 87 * The interval, in seconds, at which failed configuration cache file writes 88 * should be retried. 
89 */ 90 static int zfs_ccw_retry_interval = 300; 91 92 typedef enum zti_modes { 93 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 94 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 95 ZTI_MODE_NULL, /* don't create a taskq */ 96 ZTI_NMODES 97 } zti_modes_t; 98 99 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 100 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 101 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 102 103 #define ZTI_N(n) ZTI_P(n, 1) 104 #define ZTI_ONE ZTI_N(1) 105 106 typedef struct zio_taskq_info { 107 zti_modes_t zti_mode; 108 uint_t zti_value; 109 uint_t zti_count; 110 } zio_taskq_info_t; 111 112 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 113 "issue", "issue_high", "intr", "intr_high" 114 }; 115 116 /* 117 * This table defines the taskq settings for each ZFS I/O type. When 118 * initializing a pool, we use this table to create an appropriately sized 119 * taskq. Some operations are low volume and therefore have a small, static 120 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 121 * macros. Other operations process a large amount of data; the ZTI_BATCH 122 * macro causes us to create a taskq oriented for throughput. Some operations 123 * are so high frequency and short-lived that the taskq itself can become a 124 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 125 * additional degree of parallelism specified by the number of threads per- 126 * taskq and the number of taskqs; when dispatching an event in this case, the 127 * particular taskq is chosen at random (for example, ZTI_P(12, 8) requests eight taskqs of twelve threads each). 128 * 129 * The different taskq priorities are to handle the different contexts (issue 130 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 131 * need to be handled with minimum delay. 132 */ 133 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 134 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 135 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 136 { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ 137 { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ 138 { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 139 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 140 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 141 }; 142 143 static void spa_sync_version(void *arg, dmu_tx_t *tx); 144 static void spa_sync_props(void *arg, dmu_tx_t *tx); 145 static boolean_t spa_has_active_shared_spare(spa_t *spa); 146 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 147 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 148 char **ereport); 149 static void spa_vdev_resilver_done(spa_t *spa); 150 151 uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ 152 id_t zio_taskq_psrset_bind = PS_NONE; 153 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 154 uint_t zio_taskq_basedc = 80; /* base duty cycle */ 155 156 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 157 extern int zfs_sync_pass_deferred_free; 158 159 /* 160 * This (illegal) pool name is used when temporarily importing a spa_t in order 161 * to get the vdev stats associated with the imported devices. 162 */ 163 #define TRYIMPORT_NAME "$import" 164 165 /* 166 * ========================================================================== 167 * SPA properties routines 168 * ========================================================================== 169 */ 170 171 /* 172 * Add a (source=src, propname=propval) list to an nvlist.
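 *
 * As a usage sketch (both calls appear verbatim in spa_prop_get_config()
 * below), a numeric property passes its value through intval:
 *
 *	spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 *
 * while a string property passes strval and leaves intval unused:
 *
 *	spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);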
173 */ 174 static void 175 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 176 uint64_t intval, zprop_source_t src) 177 { 178 const char *propname = zpool_prop_to_name(prop); 179 nvlist_t *propval; 180 181 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 182 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 183 184 if (strval != NULL) 185 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 186 else 187 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 188 189 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 190 nvlist_free(propval); 191 } 192 193 /* 194 * Get property values from the spa configuration. 195 */ 196 static void 197 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 198 { 199 vdev_t *rvd = spa->spa_root_vdev; 200 dsl_pool_t *pool = spa->spa_dsl_pool; 201 uint64_t size, alloc, cap, version; 202 zprop_source_t src = ZPROP_SRC_NONE; 203 spa_config_dirent_t *dp; 204 metaslab_class_t *mc = spa_normal_class(spa); 205 206 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 207 208 if (rvd != NULL) { 209 alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 210 size = metaslab_class_get_space(spa_normal_class(spa)); 211 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 212 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 213 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 214 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 215 size - alloc, src); 216 217 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 218 metaslab_class_fragmentation(mc), src); 219 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 220 metaslab_class_expandable_space(mc), src); 221 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 222 (spa_mode(spa) == FREAD), src); 223 224 cap = (size == 0) ? 0 : (alloc * 100 / size); 225 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 226 227 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 228 ddt_get_pool_dedup_ratio(spa), src); 229 230 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 231 rvd->vdev_state, src); 232 233 version = spa_version(spa); 234 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 235 src = ZPROP_SRC_DEFAULT; 236 else 237 src = ZPROP_SRC_LOCAL; 238 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 239 } 240 241 if (pool != NULL) { 242 /* 243 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 244 * when opening pools before this version freedir will be NULL. 
245 */ 246 if (pool->dp_free_dir != NULL) { 247 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 248 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 249 src); 250 } else { 251 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 252 NULL, 0, src); 253 } 254 255 if (pool->dp_leak_dir != NULL) { 256 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 257 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 258 src); 259 } else { 260 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 261 NULL, 0, src); 262 } 263 } 264 265 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 266 267 if (spa->spa_comment != NULL) { 268 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 269 0, ZPROP_SRC_LOCAL); 270 } 271 272 if (spa->spa_root != NULL) 273 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 274 0, ZPROP_SRC_LOCAL); 275 276 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 277 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 278 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 279 } else { 280 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 281 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 282 } 283 284 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 285 if (dp->scd_path == NULL) { 286 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 287 "none", 0, ZPROP_SRC_LOCAL); 288 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 289 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 290 dp->scd_path, 0, ZPROP_SRC_LOCAL); 291 } 292 } 293 } 294 295 /* 296 * Get zpool property values. 297 */ 298 int 299 spa_prop_get(spa_t *spa, nvlist_t **nvp) 300 { 301 objset_t *mos = spa->spa_meta_objset; 302 zap_cursor_t zc; 303 zap_attribute_t za; 304 int err; 305 306 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 307 308 mutex_enter(&spa->spa_props_lock); 309 310 /* 311 * Get properties from the spa config. 312 */ 313 spa_prop_get_config(spa, nvp); 314 315 /* If no pool property object, no more prop to get. */ 316 if (mos == NULL || spa->spa_pool_props_object == 0) { 317 mutex_exit(&spa->spa_props_lock); 318 return (0); 319 } 320 321 /* 322 * Get properties from the MOS pool property object. 
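 * The property object is a ZAP, so we walk it below with the usual
 * zap_cursor_init() / zap_cursor_retrieve() / zap_cursor_advance() /
 * zap_cursor_fini() pattern; 8-byte attributes are numeric properties
 * and 1-byte attributes are string properties.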
323 */ 324 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 325 (err = zap_cursor_retrieve(&zc, &za)) == 0; 326 zap_cursor_advance(&zc)) { 327 uint64_t intval = 0; 328 char *strval = NULL; 329 zprop_source_t src = ZPROP_SRC_DEFAULT; 330 zpool_prop_t prop; 331 332 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 333 continue; 334 335 switch (za.za_integer_length) { 336 case 8: 337 /* integer property */ 338 if (za.za_first_integer != 339 zpool_prop_default_numeric(prop)) 340 src = ZPROP_SRC_LOCAL; 341 342 if (prop == ZPOOL_PROP_BOOTFS) { 343 dsl_pool_t *dp; 344 dsl_dataset_t *ds = NULL; 345 346 dp = spa_get_dsl(spa); 347 dsl_pool_config_enter(dp, FTAG); 348 if (err = dsl_dataset_hold_obj(dp, 349 za.za_first_integer, FTAG, &ds)) { 350 dsl_pool_config_exit(dp, FTAG); 351 break; 352 } 353 354 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 355 KM_SLEEP); 356 dsl_dataset_name(ds, strval); 357 dsl_dataset_rele(ds, FTAG); 358 dsl_pool_config_exit(dp, FTAG); 359 } else { 360 strval = NULL; 361 intval = za.za_first_integer; 362 } 363 364 spa_prop_add_list(*nvp, prop, strval, intval, src); 365 366 if (strval != NULL) 367 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 368 369 break; 370 371 case 1: 372 /* string property */ 373 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 374 err = zap_lookup(mos, spa->spa_pool_props_object, 375 za.za_name, 1, za.za_num_integers, strval); 376 if (err) { 377 kmem_free(strval, za.za_num_integers); 378 break; 379 } 380 spa_prop_add_list(*nvp, prop, strval, 0, src); 381 kmem_free(strval, za.za_num_integers); 382 break; 383 384 default: 385 break; 386 } 387 } 388 zap_cursor_fini(&zc); 389 mutex_exit(&spa->spa_props_lock); 390 out: 391 if (err && err != ENOENT) { 392 nvlist_free(*nvp); 393 *nvp = NULL; 394 return (err); 395 } 396 397 return (0); 398 } 399 400 /* 401 * Validate the given pool properties nvlist and modify the list 402 * for the property values to be set. 403 */ 404 static int 405 spa_prop_validate(spa_t *spa, nvlist_t *props) 406 { 407 nvpair_t *elem; 408 int error = 0, reset_bootfs = 0; 409 uint64_t objnum = 0; 410 boolean_t has_feature = B_FALSE; 411 412 elem = NULL; 413 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 414 uint64_t intval; 415 char *strval, *slash, *check, *fname; 416 const char *propname = nvpair_name(elem); 417 zpool_prop_t prop = zpool_name_to_prop(propname); 418 419 switch (prop) { 420 case ZPROP_INVAL: 421 if (!zpool_prop_feature(propname)) { 422 error = SET_ERROR(EINVAL); 423 break; 424 } 425 426 /* 427 * Sanitize the input. 
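 * A feature property arrives as a pair whose name has the form
 * "feature@<feature name>" and whose value must be a uint64 of 0,
 * the only accepted setting, which requests that the feature be
 * enabled; for example ("feature@async_destroy", 0). Anything else
 * is rejected below.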
428 */ 429 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 430 error = SET_ERROR(EINVAL); 431 break; 432 } 433 434 if (nvpair_value_uint64(elem, &intval) != 0) { 435 error = SET_ERROR(EINVAL); 436 break; 437 } 438 439 if (intval != 0) { 440 error = SET_ERROR(EINVAL); 441 break; 442 } 443 444 fname = strchr(propname, '@') + 1; 445 if (zfeature_lookup_name(fname, NULL) != 0) { 446 error = SET_ERROR(EINVAL); 447 break; 448 } 449 450 has_feature = B_TRUE; 451 break; 452 453 case ZPOOL_PROP_VERSION: 454 error = nvpair_value_uint64(elem, &intval); 455 if (!error && 456 (intval < spa_version(spa) || 457 intval > SPA_VERSION_BEFORE_FEATURES || 458 has_feature)) 459 error = SET_ERROR(EINVAL); 460 break; 461 462 case ZPOOL_PROP_DELEGATION: 463 case ZPOOL_PROP_AUTOREPLACE: 464 case ZPOOL_PROP_LISTSNAPS: 465 case ZPOOL_PROP_AUTOEXPAND: 466 error = nvpair_value_uint64(elem, &intval); 467 if (!error && intval > 1) 468 error = SET_ERROR(EINVAL); 469 break; 470 471 case ZPOOL_PROP_BOOTFS: 472 /* 473 * If the pool version is less than SPA_VERSION_BOOTFS, 474 * or the pool is still being created (version == 0), 475 * the bootfs property cannot be set. 476 */ 477 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 478 error = SET_ERROR(ENOTSUP); 479 break; 480 } 481 482 /* 483 * Make sure the vdev config is bootable 484 */ 485 if (!vdev_is_bootable(spa->spa_root_vdev)) { 486 error = SET_ERROR(ENOTSUP); 487 break; 488 } 489 490 reset_bootfs = 1; 491 492 error = nvpair_value_string(elem, &strval); 493 494 if (!error) { 495 objset_t *os; 496 uint64_t propval; 497 498 if (strval == NULL || strval[0] == '\0') { 499 objnum = zpool_prop_default_numeric( 500 ZPOOL_PROP_BOOTFS); 501 break; 502 } 503 504 if (error = dmu_objset_hold(strval, FTAG, &os)) 505 break; 506 507 /* 508 * Must be ZPL, and its property settings 509 * must be supported by GRUB (compression 510 * is not gzip, and large blocks are not used). 511 */ 512 513 if (dmu_objset_type(os) != DMU_OST_ZFS) { 514 error = SET_ERROR(ENOTSUP); 515 } else if ((error = 516 dsl_prop_get_int_ds(dmu_objset_ds(os), 517 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 518 &propval)) == 0 && 519 !BOOTFS_COMPRESS_VALID(propval)) { 520 error = SET_ERROR(ENOTSUP); 521 } else { 522 objnum = dmu_objset_id(os); 523 } 524 dmu_objset_rele(os, FTAG); 525 } 526 break; 527 528 case ZPOOL_PROP_FAILUREMODE: 529 error = nvpair_value_uint64(elem, &intval); 530 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 531 intval > ZIO_FAILURE_MODE_PANIC)) 532 error = SET_ERROR(EINVAL); 533 534 /* 535 * This is a special case which only occurs when 536 * the pool has completely failed. This allows 537 * the user to change the in-core failmode property 538 * without syncing it out to disk (I/Os might 539 * currently be blocked). We do this by returning 540 * EIO to the caller (spa_prop_set) to trick it 541 * into thinking we encountered a property validation 542 * error. 
543 */ 544 if (!error && spa_suspended(spa)) { 545 spa->spa_failmode = intval; 546 error = SET_ERROR(EIO); 547 } 548 break; 549 550 case ZPOOL_PROP_CACHEFILE: 551 if ((error = nvpair_value_string(elem, &strval)) != 0) 552 break; 553 554 if (strval[0] == '\0') 555 break; 556 557 if (strcmp(strval, "none") == 0) 558 break; 559 560 if (strval[0] != '/') { 561 error = SET_ERROR(EINVAL); 562 break; 563 } 564 565 slash = strrchr(strval, '/'); 566 ASSERT(slash != NULL); 567 568 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 569 strcmp(slash, "/..") == 0) 570 error = SET_ERROR(EINVAL); 571 break; 572 573 case ZPOOL_PROP_COMMENT: 574 if ((error = nvpair_value_string(elem, &strval)) != 0) 575 break; 576 for (check = strval; *check != '\0'; check++) { 577 /* 578 * The kernel doesn't have an easy isprint() 579 * check. For this kernel check, we merely 580 * check ASCII apart from DEL. Fix this if 581 * there is an easy-to-use kernel isprint(). 582 */ 583 if (*check >= 0x7f) { 584 error = SET_ERROR(EINVAL); 585 break; 586 } 587 } 588 if (strlen(strval) > ZPROP_MAX_COMMENT) 589 error = E2BIG; 590 break; 591 592 case ZPOOL_PROP_DEDUPDITTO: 593 if (spa_version(spa) < SPA_VERSION_DEDUP) 594 error = SET_ERROR(ENOTSUP); 595 else 596 error = nvpair_value_uint64(elem, &intval); 597 if (error == 0 && 598 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 599 error = SET_ERROR(EINVAL); 600 break; 601 } 602 603 if (error) 604 break; 605 } 606 607 if (!error && reset_bootfs) { 608 error = nvlist_remove(props, 609 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 610 611 if (!error) { 612 error = nvlist_add_uint64(props, 613 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 614 } 615 } 616 617 return (error); 618 } 619 620 void 621 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 622 { 623 char *cachefile; 624 spa_config_dirent_t *dp; 625 626 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 627 &cachefile) != 0) 628 return; 629 630 dp = kmem_alloc(sizeof (spa_config_dirent_t), 631 KM_SLEEP); 632 633 if (cachefile[0] == '\0') 634 dp->scd_path = spa_strdup(spa_config_path); 635 else if (strcmp(cachefile, "none") == 0) 636 dp->scd_path = NULL; 637 else 638 dp->scd_path = spa_strdup(cachefile); 639 640 list_insert_head(&spa->spa_config_list, dp); 641 if (need_sync) 642 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 643 } 644 645 int 646 spa_prop_set(spa_t *spa, nvlist_t *nvp) 647 { 648 int error; 649 nvpair_t *elem = NULL; 650 boolean_t need_sync = B_FALSE; 651 652 if ((error = spa_prop_validate(spa, nvp)) != 0) 653 return (error); 654 655 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 656 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 657 658 if (prop == ZPOOL_PROP_CACHEFILE || 659 prop == ZPOOL_PROP_ALTROOT || 660 prop == ZPOOL_PROP_READONLY) 661 continue; 662 663 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 664 uint64_t ver; 665 666 if (prop == ZPOOL_PROP_VERSION) { 667 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 668 } else { 669 ASSERT(zpool_prop_feature(nvpair_name(elem))); 670 ver = SPA_VERSION_FEATURES; 671 need_sync = B_TRUE; 672 } 673 674 /* Save time if the version is already set. */ 675 if (ver == spa_version(spa)) 676 continue; 677 678 /* 679 * In addition to the pool directory object, we might 680 * create the pool properties object, the features for 681 * read object, the features for write object, or the 682 * feature descriptions object. 
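 * (That is what the "blocks modified" estimate of 6 passed to the
 * dsl_sync_task() call below is meant to cover.)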
683 */ 684 error = dsl_sync_task(spa->spa_name, NULL, 685 spa_sync_version, &ver, 686 6, ZFS_SPACE_CHECK_RESERVED); 687 if (error) 688 return (error); 689 continue; 690 } 691 692 need_sync = B_TRUE; 693 break; 694 } 695 696 if (need_sync) { 697 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 698 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 699 } 700 701 return (0); 702 } 703 704 /* 705 * If the bootfs property value is dsobj, clear it. 706 */ 707 void 708 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 709 { 710 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 711 VERIFY(zap_remove(spa->spa_meta_objset, 712 spa->spa_pool_props_object, 713 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 714 spa->spa_bootfs = 0; 715 } 716 } 717 718 /*ARGSUSED*/ 719 static int 720 spa_change_guid_check(void *arg, dmu_tx_t *tx) 721 { 722 uint64_t *newguid = arg; 723 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 724 vdev_t *rvd = spa->spa_root_vdev; 725 uint64_t vdev_state; 726 727 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 728 vdev_state = rvd->vdev_state; 729 spa_config_exit(spa, SCL_STATE, FTAG); 730 731 if (vdev_state != VDEV_STATE_HEALTHY) 732 return (SET_ERROR(ENXIO)); 733 734 ASSERT3U(spa_guid(spa), !=, *newguid); 735 736 return (0); 737 } 738 739 static void 740 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 741 { 742 uint64_t *newguid = arg; 743 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 744 uint64_t oldguid; 745 vdev_t *rvd = spa->spa_root_vdev; 746 747 oldguid = spa_guid(spa); 748 749 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 750 rvd->vdev_guid = *newguid; 751 rvd->vdev_guid_sum += (*newguid - oldguid); 752 vdev_config_dirty(rvd); 753 spa_config_exit(spa, SCL_STATE, FTAG); 754 755 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 756 oldguid, *newguid); 757 } 758 759 /* 760 * Change the GUID for the pool. This is done so that we can later 761 * re-import a pool built from a clone of our own vdevs. We will modify 762 * the root vdev's guid, our own pool guid, and then mark all of our 763 * vdevs dirty. Note that we must make sure that all our vdevs are 764 * online when we do this, or else any vdevs that weren't present 765 * would be orphaned from our pool. We are also going to issue a 766 * sysevent to update any watchers. 
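 *
 * (From userland this is normally reached via "zpool reguid".)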
767 */ 768 int 769 spa_change_guid(spa_t *spa) 770 { 771 int error; 772 uint64_t guid; 773 774 mutex_enter(&spa->spa_vdev_top_lock); 775 mutex_enter(&spa_namespace_lock); 776 guid = spa_generate_guid(NULL); 777 778 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 779 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 780 781 if (error == 0) { 782 spa_config_sync(spa, B_FALSE, B_TRUE); 783 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 784 } 785 786 mutex_exit(&spa_namespace_lock); 787 mutex_exit(&spa->spa_vdev_top_lock); 788 789 return (error); 790 } 791 792 /* 793 * ========================================================================== 794 * SPA state manipulation (open/create/destroy/import/export) 795 * ========================================================================== 796 */ 797 798 static int 799 spa_error_entry_compare(const void *a, const void *b) 800 { 801 spa_error_entry_t *sa = (spa_error_entry_t *)a; 802 spa_error_entry_t *sb = (spa_error_entry_t *)b; 803 int ret; 804 805 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 806 sizeof (zbookmark_phys_t)); 807 808 if (ret < 0) 809 return (-1); 810 else if (ret > 0) 811 return (1); 812 else 813 return (0); 814 } 815 816 /* 817 * Utility function which retrieves copies of the current logs and 818 * re-initializes them in the process. 819 */ 820 void 821 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 822 { 823 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 824 825 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 826 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 827 828 avl_create(&spa->spa_errlist_scrub, 829 spa_error_entry_compare, sizeof (spa_error_entry_t), 830 offsetof(spa_error_entry_t, se_avl)); 831 avl_create(&spa->spa_errlist_last, 832 spa_error_entry_compare, sizeof (spa_error_entry_t), 833 offsetof(spa_error_entry_t, se_avl)); 834 } 835 836 static void 837 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 838 { 839 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 840 enum zti_modes mode = ztip->zti_mode; 841 uint_t value = ztip->zti_value; 842 uint_t count = ztip->zti_count; 843 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 844 char name[32]; 845 uint_t flags = 0; 846 boolean_t batch = B_FALSE; 847 848 if (mode == ZTI_MODE_NULL) { 849 tqs->stqs_count = 0; 850 tqs->stqs_taskq = NULL; 851 return; 852 } 853 854 ASSERT3U(count, >, 0); 855 856 tqs->stqs_count = count; 857 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 858 859 switch (mode) { 860 case ZTI_MODE_FIXED: 861 ASSERT3U(value, >=, 1); 862 value = MAX(value, 1); 863 break; 864 865 case ZTI_MODE_BATCH: 866 batch = B_TRUE; 867 flags |= TASKQ_THREADS_CPU_PCT; 868 value = zio_taskq_batch_pct; 869 break; 870 871 default: 872 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 873 "spa_activate()", 874 zio_type_name[t], zio_taskq_types[q], mode, value); 875 break; 876 } 877 878 for (uint_t i = 0; i < count; i++) { 879 taskq_t *tq; 880 881 if (count > 1) { 882 (void) snprintf(name, sizeof (name), "%s_%s_%u", 883 zio_type_name[t], zio_taskq_types[q], i); 884 } else { 885 (void) snprintf(name, sizeof (name), "%s_%s", 886 zio_type_name[t], zio_taskq_types[q]); 887 } 888 889 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 890 if (batch) 891 flags |= TASKQ_DC_BATCH; 892 893 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 894 spa->spa_proc, zio_taskq_basedc, flags); 895 } else { 896 pri_t pri = maxclsyspri; 897 /* 898 * The write issue taskq can be extremely CPU 899 * intensive. 
Run it at slightly lower priority 900 * than the other taskqs. 901 */ 902 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) 903 pri--; 904 905 tq = taskq_create_proc(name, value, pri, 50, 906 INT_MAX, spa->spa_proc, flags); 907 } 908 909 tqs->stqs_taskq[i] = tq; 910 } 911 } 912 913 static void 914 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 915 { 916 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 917 918 if (tqs->stqs_taskq == NULL) { 919 ASSERT0(tqs->stqs_count); 920 return; 921 } 922 923 for (uint_t i = 0; i < tqs->stqs_count; i++) { 924 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 925 taskq_destroy(tqs->stqs_taskq[i]); 926 } 927 928 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 929 tqs->stqs_taskq = NULL; 930 } 931 932 /* 933 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 934 * Note that a type may have multiple discrete taskqs to avoid lock contention 935 * on the taskq itself. In that case we choose which taskq at random by using 936 * the low bits of gethrtime(). 937 */ 938 void 939 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 940 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 941 { 942 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 943 taskq_t *tq; 944 945 ASSERT3P(tqs->stqs_taskq, !=, NULL); 946 ASSERT3U(tqs->stqs_count, !=, 0); 947 948 if (tqs->stqs_count == 1) { 949 tq = tqs->stqs_taskq[0]; 950 } else { 951 tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; 952 } 953 954 taskq_dispatch_ent(tq, func, arg, flags, ent); 955 } 956 957 static void 958 spa_create_zio_taskqs(spa_t *spa) 959 { 960 for (int t = 0; t < ZIO_TYPES; t++) { 961 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 962 spa_taskqs_init(spa, t, q); 963 } 964 } 965 } 966 967 #ifdef _KERNEL 968 static void 969 spa_thread(void *arg) 970 { 971 callb_cpr_t cprinfo; 972 973 spa_t *spa = arg; 974 user_t *pu = PTOU(curproc); 975 976 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 977 spa->spa_name); 978 979 ASSERT(curproc != &p0); 980 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 981 "zpool-%s", spa->spa_name); 982 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 983 984 /* bind this thread to the requested psrset */ 985 if (zio_taskq_psrset_bind != PS_NONE) { 986 pool_lock(); 987 mutex_enter(&cpu_lock); 988 mutex_enter(&pidlock); 989 mutex_enter(&curproc->p_lock); 990 991 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 992 0, NULL, NULL) == 0) { 993 curthread->t_bind_pset = zio_taskq_psrset_bind; 994 } else { 995 cmn_err(CE_WARN, 996 "Couldn't bind process for zfs pool \"%s\" to " 997 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 998 } 999 1000 mutex_exit(&curproc->p_lock); 1001 mutex_exit(&pidlock); 1002 mutex_exit(&cpu_lock); 1003 pool_unlock(); 1004 } 1005 1006 if (zio_taskq_sysdc) { 1007 sysdc_thread_enter(curthread, 100, 0); 1008 } 1009 1010 spa->spa_proc = curproc; 1011 spa->spa_did = curthread->t_did; 1012 1013 spa_create_zio_taskqs(spa); 1014 1015 mutex_enter(&spa->spa_proc_lock); 1016 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1017 1018 spa->spa_proc_state = SPA_PROC_ACTIVE; 1019 cv_broadcast(&spa->spa_proc_cv); 1020 1021 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1022 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1023 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1024 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1025 1026 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1027 spa->spa_proc_state = SPA_PROC_GONE; 1028 spa->spa_proc = &p0; 1029 
cv_broadcast(&spa->spa_proc_cv); 1030 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1031 1032 mutex_enter(&curproc->p_lock); 1033 lwp_exit(); 1034 } 1035 #endif 1036 1037 /* 1038 * Activate an uninitialized pool. 1039 */ 1040 static void 1041 spa_activate(spa_t *spa, int mode) 1042 { 1043 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1044 1045 spa->spa_state = POOL_STATE_ACTIVE; 1046 spa->spa_mode = mode; 1047 1048 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 1049 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 1050 1051 /* Try to create a covering process */ 1052 mutex_enter(&spa->spa_proc_lock); 1053 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1054 ASSERT(spa->spa_proc == &p0); 1055 spa->spa_did = 0; 1056 1057 /* Only create a process if we're going to be around a while. */ 1058 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1059 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1060 NULL, 0) == 0) { 1061 spa->spa_proc_state = SPA_PROC_CREATED; 1062 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1063 cv_wait(&spa->spa_proc_cv, 1064 &spa->spa_proc_lock); 1065 } 1066 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1067 ASSERT(spa->spa_proc != &p0); 1068 ASSERT(spa->spa_did != 0); 1069 } else { 1070 #ifdef _KERNEL 1071 cmn_err(CE_WARN, 1072 "Couldn't create process for zfs pool \"%s\"\n", 1073 spa->spa_name); 1074 #endif 1075 } 1076 } 1077 mutex_exit(&spa->spa_proc_lock); 1078 1079 /* If we didn't create a process, we need to create our taskqs. */ 1080 if (spa->spa_proc == &p0) { 1081 spa_create_zio_taskqs(spa); 1082 } 1083 1084 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1085 offsetof(vdev_t, vdev_config_dirty_node)); 1086 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1087 offsetof(objset_t, os_evicting_node)); 1088 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1089 offsetof(vdev_t, vdev_state_dirty_node)); 1090 1091 txg_list_create(&spa->spa_vdev_txg_list, 1092 offsetof(struct vdev, vdev_txg_node)); 1093 1094 avl_create(&spa->spa_errlist_scrub, 1095 spa_error_entry_compare, sizeof (spa_error_entry_t), 1096 offsetof(spa_error_entry_t, se_avl)); 1097 avl_create(&spa->spa_errlist_last, 1098 spa_error_entry_compare, sizeof (spa_error_entry_t), 1099 offsetof(spa_error_entry_t, se_avl)); 1100 } 1101 1102 /* 1103 * Opposite of spa_activate(). 1104 */ 1105 static void 1106 spa_deactivate(spa_t *spa) 1107 { 1108 ASSERT(spa->spa_sync_on == B_FALSE); 1109 ASSERT(spa->spa_dsl_pool == NULL); 1110 ASSERT(spa->spa_root_vdev == NULL); 1111 ASSERT(spa->spa_async_zio_root == NULL); 1112 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1113 1114 spa_evicting_os_wait(spa); 1115 1116 txg_list_destroy(&spa->spa_vdev_txg_list); 1117 1118 list_destroy(&spa->spa_config_dirty_list); 1119 list_destroy(&spa->spa_evicting_os_list); 1120 list_destroy(&spa->spa_state_dirty_list); 1121 1122 for (int t = 0; t < ZIO_TYPES; t++) { 1123 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1124 spa_taskqs_fini(spa, t, q); 1125 } 1126 } 1127 1128 metaslab_class_destroy(spa->spa_normal_class); 1129 spa->spa_normal_class = NULL; 1130 1131 metaslab_class_destroy(spa->spa_log_class); 1132 spa->spa_log_class = NULL; 1133 1134 /* 1135 * If this was part of an import or the open otherwise failed, we may 1136 * still have errors left in the queues. Empty them just in case. 
1137 */ 1138 spa_errlog_drain(spa); 1139 1140 avl_destroy(&spa->spa_errlist_scrub); 1141 avl_destroy(&spa->spa_errlist_last); 1142 1143 spa->spa_state = POOL_STATE_UNINITIALIZED; 1144 1145 mutex_enter(&spa->spa_proc_lock); 1146 if (spa->spa_proc_state != SPA_PROC_NONE) { 1147 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1148 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1149 cv_broadcast(&spa->spa_proc_cv); 1150 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1151 ASSERT(spa->spa_proc != &p0); 1152 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1153 } 1154 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1155 spa->spa_proc_state = SPA_PROC_NONE; 1156 } 1157 ASSERT(spa->spa_proc == &p0); 1158 mutex_exit(&spa->spa_proc_lock); 1159 1160 /* 1161 * We want to make sure spa_thread() has actually exited the ZFS 1162 * module, so that the module can't be unloaded out from underneath 1163 * it. 1164 */ 1165 if (spa->spa_did != 0) { 1166 thread_join(spa->spa_did); 1167 spa->spa_did = 0; 1168 } 1169 } 1170 1171 /* 1172 * Verify a pool configuration, and construct the vdev tree appropriately. This 1173 * will create all the necessary vdevs in the appropriate layout, with each vdev 1174 * in the CLOSED state. This will prep the pool before open/creation/import. 1175 * All vdev validation is done by the vdev_alloc() routine. 1176 */ 1177 static int 1178 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1179 uint_t id, int atype) 1180 { 1181 nvlist_t **child; 1182 uint_t children; 1183 int error; 1184 1185 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1186 return (error); 1187 1188 if ((*vdp)->vdev_ops->vdev_op_leaf) 1189 return (0); 1190 1191 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1192 &child, &children); 1193 1194 if (error == ENOENT) 1195 return (0); 1196 1197 if (error) { 1198 vdev_free(*vdp); 1199 *vdp = NULL; 1200 return (SET_ERROR(EINVAL)); 1201 } 1202 1203 for (int c = 0; c < children; c++) { 1204 vdev_t *vd; 1205 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1206 atype)) != 0) { 1207 vdev_free(*vdp); 1208 *vdp = NULL; 1209 return (error); 1210 } 1211 } 1212 1213 ASSERT(*vdp != NULL); 1214 1215 return (0); 1216 } 1217 1218 /* 1219 * Opposite of spa_load(). 1220 */ 1221 static void 1222 spa_unload(spa_t *spa) 1223 { 1224 int i; 1225 1226 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1227 1228 /* 1229 * Stop async tasks. 1230 */ 1231 spa_async_suspend(spa); 1232 1233 /* 1234 * Stop syncing. 1235 */ 1236 if (spa->spa_sync_on) { 1237 txg_sync_stop(spa->spa_dsl_pool); 1238 spa->spa_sync_on = B_FALSE; 1239 } 1240 1241 /* 1242 * Wait for any outstanding async I/O to complete. 1243 */ 1244 if (spa->spa_async_zio_root != NULL) { 1245 for (int i = 0; i < max_ncpus; i++) 1246 (void) zio_wait(spa->spa_async_zio_root[i]); 1247 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1248 spa->spa_async_zio_root = NULL; 1249 } 1250 1251 bpobj_close(&spa->spa_deferred_bpobj); 1252 1253 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1254 1255 /* 1256 * Close all vdevs. 1257 */ 1258 if (spa->spa_root_vdev) 1259 vdev_free(spa->spa_root_vdev); 1260 ASSERT(spa->spa_root_vdev == NULL); 1261 1262 /* 1263 * Close the dsl pool. 
1264 */ 1265 if (spa->spa_dsl_pool) { 1266 dsl_pool_close(spa->spa_dsl_pool); 1267 spa->spa_dsl_pool = NULL; 1268 spa->spa_meta_objset = NULL; 1269 } 1270 1271 ddt_unload(spa); 1272 1273 1274 /* 1275 * Drop and purge level 2 cache 1276 */ 1277 spa_l2cache_drop(spa); 1278 1279 for (i = 0; i < spa->spa_spares.sav_count; i++) 1280 vdev_free(spa->spa_spares.sav_vdevs[i]); 1281 if (spa->spa_spares.sav_vdevs) { 1282 kmem_free(spa->spa_spares.sav_vdevs, 1283 spa->spa_spares.sav_count * sizeof (void *)); 1284 spa->spa_spares.sav_vdevs = NULL; 1285 } 1286 if (spa->spa_spares.sav_config) { 1287 nvlist_free(spa->spa_spares.sav_config); 1288 spa->spa_spares.sav_config = NULL; 1289 } 1290 spa->spa_spares.sav_count = 0; 1291 1292 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1293 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1294 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1295 } 1296 if (spa->spa_l2cache.sav_vdevs) { 1297 kmem_free(spa->spa_l2cache.sav_vdevs, 1298 spa->spa_l2cache.sav_count * sizeof (void *)); 1299 spa->spa_l2cache.sav_vdevs = NULL; 1300 } 1301 if (spa->spa_l2cache.sav_config) { 1302 nvlist_free(spa->spa_l2cache.sav_config); 1303 spa->spa_l2cache.sav_config = NULL; 1304 } 1305 spa->spa_l2cache.sav_count = 0; 1306 1307 spa->spa_async_suspended = 0; 1308 1309 if (spa->spa_comment != NULL) { 1310 spa_strfree(spa->spa_comment); 1311 spa->spa_comment = NULL; 1312 } 1313 1314 spa_config_exit(spa, SCL_ALL, FTAG); 1315 } 1316 1317 /* 1318 * Load (or re-load) the current list of vdevs describing the active spares for 1319 * this pool. When this is called, we have some form of basic information in 1320 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1321 * then re-generate a more complete list including status information. 1322 */ 1323 static void 1324 spa_load_spares(spa_t *spa) 1325 { 1326 nvlist_t **spares; 1327 uint_t nspares; 1328 int i; 1329 vdev_t *vd, *tvd; 1330 1331 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1332 1333 /* 1334 * First, close and free any existing spare vdevs. 1335 */ 1336 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1337 vd = spa->spa_spares.sav_vdevs[i]; 1338 1339 /* Undo the call to spa_activate() below */ 1340 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1341 B_FALSE)) != NULL && tvd->vdev_isspare) 1342 spa_spare_remove(tvd); 1343 vdev_close(vd); 1344 vdev_free(vd); 1345 } 1346 1347 if (spa->spa_spares.sav_vdevs) 1348 kmem_free(spa->spa_spares.sav_vdevs, 1349 spa->spa_spares.sav_count * sizeof (void *)); 1350 1351 if (spa->spa_spares.sav_config == NULL) 1352 nspares = 0; 1353 else 1354 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1355 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1356 1357 spa->spa_spares.sav_count = (int)nspares; 1358 spa->spa_spares.sav_vdevs = NULL; 1359 1360 if (nspares == 0) 1361 return; 1362 1363 /* 1364 * Construct the array of vdevs, opening them to get status in the 1365 * process. For each spare, there are potentially two different vdev_t 1366 * structures associated with it: one in the list of spares (used only 1367 * for basic validation purposes) and one in the active vdev 1368 * configuration (if it's spared in). During this phase we open and 1369 * validate each vdev on the spare list. If the vdev also exists in the 1370 * active configuration, then we also mark this vdev as an active spare.
1371 */ 1372 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1373 KM_SLEEP); 1374 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1375 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1376 VDEV_ALLOC_SPARE) == 0); 1377 ASSERT(vd != NULL); 1378 1379 spa->spa_spares.sav_vdevs[i] = vd; 1380 1381 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1382 B_FALSE)) != NULL) { 1383 if (!tvd->vdev_isspare) 1384 spa_spare_add(tvd); 1385 1386 /* 1387 * We only mark the spare active if we were successfully 1388 * able to load the vdev. Otherwise, importing a pool 1389 * with a bad active spare would result in strange 1390 * behavior, because multiple pools would think the spare 1391 * is actively in use. 1392 * 1393 * There is a vulnerability here to an equally bizarre 1394 * circumstance, where a dead active spare is later 1395 * brought back to life (onlined or otherwise). Given 1396 * the rarity of this scenario, and the extra complexity 1397 * it adds, we ignore the possibility. 1398 */ 1399 if (!vdev_is_dead(tvd)) 1400 spa_spare_activate(tvd); 1401 } 1402 1403 vd->vdev_top = vd; 1404 vd->vdev_aux = &spa->spa_spares; 1405 1406 if (vdev_open(vd) != 0) 1407 continue; 1408 1409 if (vdev_validate_aux(vd) == 0) 1410 spa_spare_add(vd); 1411 } 1412 1413 /* 1414 * Recompute the stashed list of spares, with status information 1415 * this time. 1416 */ 1417 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1418 DATA_TYPE_NVLIST_ARRAY) == 0); 1419 1420 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1421 KM_SLEEP); 1422 for (i = 0; i < spa->spa_spares.sav_count; i++) 1423 spares[i] = vdev_config_generate(spa, 1424 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1425 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1426 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1427 for (i = 0; i < spa->spa_spares.sav_count; i++) 1428 nvlist_free(spares[i]); 1429 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1430 } 1431 1432 /* 1433 * Load (or re-load) the current list of vdevs describing the active l2cache for 1434 * this pool. When this is called, we have some form of basic information in 1435 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1436 * then re-generate a more complete list including status information. 1437 * Devices which are already active have their details maintained, and are 1438 * not re-opened. 1439 */ 1440 static void 1441 spa_load_l2cache(spa_t *spa) 1442 { 1443 nvlist_t **l2cache; 1444 uint_t nl2cache; 1445 int i, j, oldnvdevs; 1446 uint64_t guid; 1447 vdev_t *vd, **oldvdevs, **newvdevs; 1448 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1449 1450 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1451 1452 if (sav->sav_config != NULL) { 1453 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1454 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1455 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1456 } else { 1457 nl2cache = 0; 1458 newvdevs = NULL; 1459 } 1460 1461 oldvdevs = sav->sav_vdevs; 1462 oldnvdevs = sav->sav_count; 1463 sav->sav_vdevs = NULL; 1464 sav->sav_count = 0; 1465 1466 /* 1467 * Process new nvlist of vdevs.
1468 */ 1469 for (i = 0; i < nl2cache; i++) { 1470 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1471 &guid) == 0); 1472 1473 newvdevs[i] = NULL; 1474 for (j = 0; j < oldnvdevs; j++) { 1475 vd = oldvdevs[j]; 1476 if (vd != NULL && guid == vd->vdev_guid) { 1477 /* 1478 * Retain previous vdev for add/remove ops. 1479 */ 1480 newvdevs[i] = vd; 1481 oldvdevs[j] = NULL; 1482 break; 1483 } 1484 } 1485 1486 if (newvdevs[i] == NULL) { 1487 /* 1488 * Create new vdev 1489 */ 1490 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1491 VDEV_ALLOC_L2CACHE) == 0); 1492 ASSERT(vd != NULL); 1493 newvdevs[i] = vd; 1494 1495 /* 1496 * Commit this vdev as an l2cache device, 1497 * even if it fails to open. 1498 */ 1499 spa_l2cache_add(vd); 1500 1501 vd->vdev_top = vd; 1502 vd->vdev_aux = sav; 1503 1504 spa_l2cache_activate(vd); 1505 1506 if (vdev_open(vd) != 0) 1507 continue; 1508 1509 (void) vdev_validate_aux(vd); 1510 1511 if (!vdev_is_dead(vd)) 1512 l2arc_add_vdev(spa, vd); 1513 } 1514 } 1515 1516 /* 1517 * Purge vdevs that were dropped 1518 */ 1519 for (i = 0; i < oldnvdevs; i++) { 1520 uint64_t pool; 1521 1522 vd = oldvdevs[i]; 1523 if (vd != NULL) { 1524 ASSERT(vd->vdev_isl2cache); 1525 1526 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1527 pool != 0ULL && l2arc_vdev_present(vd)) 1528 l2arc_remove_vdev(vd); 1529 vdev_clear_stats(vd); 1530 vdev_free(vd); 1531 } 1532 } 1533 1534 if (oldvdevs) 1535 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1536 1537 if (sav->sav_config == NULL) 1538 goto out; 1539 1540 sav->sav_vdevs = newvdevs; 1541 sav->sav_count = (int)nl2cache; 1542 1543 /* 1544 * Recompute the stashed list of l2cache devices, with status 1545 * information this time. 1546 */ 1547 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1548 DATA_TYPE_NVLIST_ARRAY) == 0); 1549 1550 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1551 for (i = 0; i < sav->sav_count; i++) 1552 l2cache[i] = vdev_config_generate(spa, 1553 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1554 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1555 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1556 out: 1557 for (i = 0; i < sav->sav_count; i++) 1558 nvlist_free(l2cache[i]); 1559 if (sav->sav_count) 1560 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1561 } 1562 1563 static int 1564 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1565 { 1566 dmu_buf_t *db; 1567 char *packed = NULL; 1568 size_t nvsize = 0; 1569 int error; 1570 *value = NULL; 1571 1572 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1573 if (error != 0) 1574 return (error); 1575 1576 nvsize = *(uint64_t *)db->db_data; 1577 dmu_buf_rele(db, FTAG); 1578 1579 packed = kmem_alloc(nvsize, KM_SLEEP); 1580 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1581 DMU_READ_PREFETCH); 1582 if (error == 0) 1583 error = nvlist_unpack(packed, nvsize, value, 0); 1584 kmem_free(packed, nvsize); 1585 1586 return (error); 1587 } 1588 1589 /* 1590 * Checks to see if the given vdev could not be opened, in which case we post a 1591 * sysevent to notify the autoreplace code that the device has been removed. 
1592 */ 1593 static void 1594 spa_check_removed(vdev_t *vd) 1595 { 1596 for (int c = 0; c < vd->vdev_children; c++) 1597 spa_check_removed(vd->vdev_child[c]); 1598 1599 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1600 !vd->vdev_ishole) { 1601 zfs_post_autoreplace(vd->vdev_spa, vd); 1602 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1603 } 1604 } 1605 1606 /* 1607 * Validate the current config against the MOS config 1608 */ 1609 static boolean_t 1610 spa_config_valid(spa_t *spa, nvlist_t *config) 1611 { 1612 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1613 nvlist_t *nv; 1614 1615 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1616 1617 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1618 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1619 1620 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1621 1622 /* 1623 * If we're doing a normal import, then build up any additional 1624 * diagnostic information about missing devices in this config. 1625 * We'll pass this up to the user for further processing. 1626 */ 1627 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1628 nvlist_t **child, *nv; 1629 uint64_t idx = 0; 1630 1631 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1632 KM_SLEEP); 1633 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1634 1635 for (int c = 0; c < rvd->vdev_children; c++) { 1636 vdev_t *tvd = rvd->vdev_child[c]; 1637 vdev_t *mtvd = mrvd->vdev_child[c]; 1638 1639 if (tvd->vdev_ops == &vdev_missing_ops && 1640 mtvd->vdev_ops != &vdev_missing_ops && 1641 mtvd->vdev_islog) 1642 child[idx++] = vdev_config_generate(spa, mtvd, 1643 B_FALSE, 0); 1644 } 1645 1646 if (idx) { 1647 VERIFY(nvlist_add_nvlist_array(nv, 1648 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1649 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1650 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1651 1652 for (int i = 0; i < idx; i++) 1653 nvlist_free(child[i]); 1654 } 1655 nvlist_free(nv); 1656 kmem_free(child, rvd->vdev_children * sizeof (char **)); 1657 } 1658 1659 /* 1660 * Compare the root vdev tree with the information we have 1661 * from the MOS config (mrvd). Check each top-level vdev 1662 * with the corresponding MOS config top-level (mtvd). 1663 */ 1664 for (int c = 0; c < rvd->vdev_children; c++) { 1665 vdev_t *tvd = rvd->vdev_child[c]; 1666 vdev_t *mtvd = mrvd->vdev_child[c]; 1667 1668 /* 1669 * Resolve any "missing" vdevs in the current configuration. 1670 * If we find that the MOS config has more accurate information 1671 * about the top-level vdev then use that vdev instead. 1672 */ 1673 if (tvd->vdev_ops == &vdev_missing_ops && 1674 mtvd->vdev_ops != &vdev_missing_ops) { 1675 1676 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1677 continue; 1678 1679 /* 1680 * Device specific actions. 1681 */ 1682 if (mtvd->vdev_islog) { 1683 spa_set_log_state(spa, SPA_LOG_CLEAR); 1684 } else { 1685 /* 1686 * XXX - once we have 'readonly' pool 1687 * support we should be able to handle 1688 * missing data devices by transitioning 1689 * the pool to readonly. 1690 */ 1691 continue; 1692 } 1693 1694 /* 1695 * Swap the missing vdev with the data we were 1696 * able to obtain from the MOS config. 
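 * Each child is detached from its current tree and re-attached under
 * the other one, so the MOS copy (mtvd) ends up in the live tree while
 * the missing vdev (tvd) moves into the temporary mrvd tree, which is
 * freed once this loop completes.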
1697 */ 1698 vdev_remove_child(rvd, tvd); 1699 vdev_remove_child(mrvd, mtvd); 1700 1701 vdev_add_child(rvd, mtvd); 1702 vdev_add_child(mrvd, tvd); 1703 1704 spa_config_exit(spa, SCL_ALL, FTAG); 1705 vdev_load(mtvd); 1706 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1707 1708 vdev_reopen(rvd); 1709 } else if (mtvd->vdev_islog) { 1710 /* 1711 * Load the slog device's state from the MOS config 1712 * since it's possible that the label does not 1713 * contain the most up-to-date information. 1714 */ 1715 vdev_load_log_state(tvd, mtvd); 1716 vdev_reopen(tvd); 1717 } 1718 } 1719 vdev_free(mrvd); 1720 spa_config_exit(spa, SCL_ALL, FTAG); 1721 1722 /* 1723 * Ensure we were able to validate the config. 1724 */ 1725 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1726 } 1727 1728 /* 1729 * Check for missing log devices 1730 */ 1731 static boolean_t 1732 spa_check_logs(spa_t *spa) 1733 { 1734 boolean_t rv = B_FALSE; 1735 dsl_pool_t *dp = spa_get_dsl(spa); 1736 1737 switch (spa->spa_log_state) { 1738 case SPA_LOG_MISSING: 1739 /* need to recheck in case slog has been restored */ 1740 case SPA_LOG_UNKNOWN: 1741 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1742 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 1743 if (rv) 1744 spa_set_log_state(spa, SPA_LOG_MISSING); 1745 break; 1746 } 1747 return (rv); 1748 } 1749 1750 static boolean_t 1751 spa_passivate_log(spa_t *spa) 1752 { 1753 vdev_t *rvd = spa->spa_root_vdev; 1754 boolean_t slog_found = B_FALSE; 1755 1756 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1757 1758 if (!spa_has_slogs(spa)) 1759 return (B_FALSE); 1760 1761 for (int c = 0; c < rvd->vdev_children; c++) { 1762 vdev_t *tvd = rvd->vdev_child[c]; 1763 metaslab_group_t *mg = tvd->vdev_mg; 1764 1765 if (tvd->vdev_islog) { 1766 metaslab_group_passivate(mg); 1767 slog_found = B_TRUE; 1768 } 1769 } 1770 1771 return (slog_found); 1772 } 1773 1774 static void 1775 spa_activate_log(spa_t *spa) 1776 { 1777 vdev_t *rvd = spa->spa_root_vdev; 1778 1779 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1780 1781 for (int c = 0; c < rvd->vdev_children; c++) { 1782 vdev_t *tvd = rvd->vdev_child[c]; 1783 metaslab_group_t *mg = tvd->vdev_mg; 1784 1785 if (tvd->vdev_islog) 1786 metaslab_group_activate(mg); 1787 } 1788 } 1789 1790 int 1791 spa_offline_log(spa_t *spa) 1792 { 1793 int error; 1794 1795 error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1796 NULL, DS_FIND_CHILDREN); 1797 if (error == 0) { 1798 /* 1799 * We successfully offlined the log device, sync out the 1800 * current txg so that the "stubby" block can be removed 1801 * by zil_sync(). 
1802 */ 1803 txg_wait_synced(spa->spa_dsl_pool, 0); 1804 } 1805 return (error); 1806 } 1807 1808 static void 1809 spa_aux_check_removed(spa_aux_vdev_t *sav) 1810 { 1811 for (int i = 0; i < sav->sav_count; i++) 1812 spa_check_removed(sav->sav_vdevs[i]); 1813 } 1814 1815 void 1816 spa_claim_notify(zio_t *zio) 1817 { 1818 spa_t *spa = zio->io_spa; 1819 1820 if (zio->io_error) 1821 return; 1822 1823 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1824 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1825 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1826 mutex_exit(&spa->spa_props_lock); 1827 } 1828 1829 typedef struct spa_load_error { 1830 uint64_t sle_meta_count; 1831 uint64_t sle_data_count; 1832 } spa_load_error_t; 1833 1834 static void 1835 spa_load_verify_done(zio_t *zio) 1836 { 1837 blkptr_t *bp = zio->io_bp; 1838 spa_load_error_t *sle = zio->io_private; 1839 dmu_object_type_t type = BP_GET_TYPE(bp); 1840 int error = zio->io_error; 1841 spa_t *spa = zio->io_spa; 1842 1843 if (error) { 1844 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1845 type != DMU_OT_INTENT_LOG) 1846 atomic_inc_64(&sle->sle_meta_count); 1847 else 1848 atomic_inc_64(&sle->sle_data_count); 1849 } 1850 zio_data_buf_free(zio->io_data, zio->io_size); 1851 1852 mutex_enter(&spa->spa_scrub_lock); 1853 spa->spa_scrub_inflight--; 1854 cv_broadcast(&spa->spa_scrub_io_cv); 1855 mutex_exit(&spa->spa_scrub_lock); 1856 } 1857 1858 /* 1859 * Maximum number of concurrent scrub i/os to create while verifying 1860 * a pool while importing it. 1861 */ 1862 int spa_load_verify_maxinflight = 10000; 1863 boolean_t spa_load_verify_metadata = B_TRUE; 1864 boolean_t spa_load_verify_data = B_TRUE; 1865 1866 /*ARGSUSED*/ 1867 static int 1868 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1869 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 1870 { 1871 if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 1872 return (0); 1873 /* 1874 * Note: normally this routine will not be called if 1875 * spa_load_verify_metadata is not set. However, it may be useful 1876 * to manually set the flag after the traversal has begun. 
1877 */ 1878 if (!spa_load_verify_metadata) 1879 return (0); 1880 if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) 1881 return (0); 1882 1883 zio_t *rio = arg; 1884 size_t size = BP_GET_PSIZE(bp); 1885 void *data = zio_data_buf_alloc(size); 1886 1887 mutex_enter(&spa->spa_scrub_lock); 1888 while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) 1889 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1890 spa->spa_scrub_inflight++; 1891 mutex_exit(&spa->spa_scrub_lock); 1892 1893 zio_nowait(zio_read(rio, spa, bp, data, size, 1894 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1895 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1896 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1897 return (0); 1898 } 1899 1900 static int 1901 spa_load_verify(spa_t *spa) 1902 { 1903 zio_t *rio; 1904 spa_load_error_t sle = { 0 }; 1905 zpool_rewind_policy_t policy; 1906 boolean_t verify_ok = B_FALSE; 1907 int error = 0; 1908 1909 zpool_get_rewind_policy(spa->spa_config, &policy); 1910 1911 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1912 return (0); 1913 1914 rio = zio_root(spa, NULL, &sle, 1915 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1916 1917 if (spa_load_verify_metadata) { 1918 error = traverse_pool(spa, spa->spa_verify_min_txg, 1919 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 1920 spa_load_verify_cb, rio); 1921 } 1922 1923 (void) zio_wait(rio); 1924 1925 spa->spa_load_meta_errors = sle.sle_meta_count; 1926 spa->spa_load_data_errors = sle.sle_data_count; 1927 1928 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1929 sle.sle_data_count <= policy.zrp_maxdata) { 1930 int64_t loss = 0; 1931 1932 verify_ok = B_TRUE; 1933 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1934 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1935 1936 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1937 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1938 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1939 VERIFY(nvlist_add_int64(spa->spa_load_info, 1940 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1941 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1942 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1943 } else { 1944 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1945 } 1946 1947 if (error) { 1948 if (error != ENXIO && error != EIO) 1949 error = SET_ERROR(EIO); 1950 return (error); 1951 } 1952 1953 return (verify_ok ? 0 : EIO); 1954 } 1955 1956 /* 1957 * Find a value in the pool props object. 1958 */ 1959 static void 1960 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1961 { 1962 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1963 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1964 } 1965 1966 /* 1967 * Find a value in the pool directory object. 1968 */ 1969 static int 1970 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1971 { 1972 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1973 name, sizeof (uint64_t), 1, val)); 1974 } 1975 1976 static int 1977 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1978 { 1979 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1980 return (err); 1981 } 1982 1983 /* 1984 * Fix up config after a partly-completed split. This is done with the 1985 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1986 * pool have that entry in their config, but only the splitting one contains 1987 * a list of all the guids of the vdevs that are being split off. 
1988 * 1989 * This function determines what to do with that list: either rejoin 1990 * all the disks to the pool, or complete the splitting process. To attempt 1991 * the rejoin, each disk that is offlined is marked online again, and 1992 * we do a reopen() call. If the vdev label for every disk that was 1993 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1994 * then we call vdev_split() on each disk, and complete the split. 1995 * 1996 * Otherwise we leave the config alone, with all the vdevs in place in 1997 * the original pool. 1998 */ 1999 static void 2000 spa_try_repair(spa_t *spa, nvlist_t *config) 2001 { 2002 uint_t extracted; 2003 uint64_t *glist; 2004 uint_t i, gcount; 2005 nvlist_t *nvl; 2006 vdev_t **vd; 2007 boolean_t attempt_reopen; 2008 2009 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2010 return; 2011 2012 /* check that the config is complete */ 2013 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2014 &glist, &gcount) != 0) 2015 return; 2016 2017 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2018 2019 /* attempt to online all the vdevs & validate */ 2020 attempt_reopen = B_TRUE; 2021 for (i = 0; i < gcount; i++) { 2022 if (glist[i] == 0) /* vdev is hole */ 2023 continue; 2024 2025 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2026 if (vd[i] == NULL) { 2027 /* 2028 * Don't bother attempting to reopen the disks; 2029 * just do the split. 2030 */ 2031 attempt_reopen = B_FALSE; 2032 } else { 2033 /* attempt to re-online it */ 2034 vd[i]->vdev_offline = B_FALSE; 2035 } 2036 } 2037 2038 if (attempt_reopen) { 2039 vdev_reopen(spa->spa_root_vdev); 2040 2041 /* check each device to see what state it's in */ 2042 for (extracted = 0, i = 0; i < gcount; i++) { 2043 if (vd[i] != NULL && 2044 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2045 break; 2046 ++extracted; 2047 } 2048 } 2049 2050 /* 2051 * If every disk has been moved to the new pool, or if we never 2052 * even attempted to look at them, then we split them off for 2053 * good. 2054 */ 2055 if (!attempt_reopen || gcount == extracted) { 2056 for (i = 0; i < gcount; i++) 2057 if (vd[i] != NULL) 2058 vdev_split(vd[i]); 2059 vdev_reopen(spa->spa_root_vdev); 2060 } 2061 2062 kmem_free(vd, gcount * sizeof (vdev_t *)); 2063 } 2064 2065 static int 2066 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2067 boolean_t mosconfig) 2068 { 2069 nvlist_t *config = spa->spa_config; 2070 char *ereport = FM_EREPORT_ZFS_POOL; 2071 char *comment; 2072 int error; 2073 uint64_t pool_guid; 2074 nvlist_t *nvl; 2075 2076 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2077 return (SET_ERROR(EINVAL)); 2078 2079 ASSERT(spa->spa_comment == NULL); 2080 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2081 spa->spa_comment = spa_strdup(comment); 2082 2083 /* 2084 * Versioning wasn't explicitly added to the label until later, so if 2085 * it's not present treat it as the initial version. 
2086 */ 2087 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2088 &spa->spa_ubsync.ub_version) != 0) 2089 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2090 2091 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2092 &spa->spa_config_txg); 2093 2094 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2095 spa_guid_exists(pool_guid, 0)) { 2096 error = SET_ERROR(EEXIST); 2097 } else { 2098 spa->spa_config_guid = pool_guid; 2099 2100 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2101 &nvl) == 0) { 2102 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2103 KM_SLEEP) == 0); 2104 } 2105 2106 nvlist_free(spa->spa_load_info); 2107 spa->spa_load_info = fnvlist_alloc(); 2108 2109 gethrestime(&spa->spa_loaded_ts); 2110 error = spa_load_impl(spa, pool_guid, config, state, type, 2111 mosconfig, &ereport); 2112 } 2113 2114 /* 2115 * Don't count references from objsets that are already closed 2116 * and are making their way through the eviction process. 2117 */ 2118 spa_evicting_os_wait(spa); 2119 spa->spa_minref = refcount_count(&spa->spa_refcount); 2120 if (error) { 2121 if (error != EEXIST) { 2122 spa->spa_loaded_ts.tv_sec = 0; 2123 spa->spa_loaded_ts.tv_nsec = 0; 2124 } 2125 if (error != EBADF) { 2126 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2127 } 2128 } 2129 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2130 spa->spa_ena = 0; 2131 2132 return (error); 2133 } 2134 2135 /* 2136 * Load an existing storage pool, using the pool's builtin spa_config as a 2137 * source of configuration information. 2138 */ 2139 static int 2140 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2141 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2142 char **ereport) 2143 { 2144 int error = 0; 2145 nvlist_t *nvroot = NULL; 2146 nvlist_t *label; 2147 vdev_t *rvd; 2148 uberblock_t *ub = &spa->spa_uberblock; 2149 uint64_t children, config_cache_txg = spa->spa_config_txg; 2150 int orig_mode = spa->spa_mode; 2151 int parse; 2152 uint64_t obj; 2153 boolean_t missing_feat_write = B_FALSE; 2154 2155 /* 2156 * If this is an untrusted config, access the pool in read-only mode. 2157 * This prevents things like resilvering recently removed devices. 2158 */ 2159 if (!mosconfig) 2160 spa->spa_mode = FREAD; 2161 2162 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2163 2164 spa->spa_load_state = state; 2165 2166 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2167 return (SET_ERROR(EINVAL)); 2168 2169 parse = (type == SPA_IMPORT_EXISTING ? 2170 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2171 2172 /* 2173 * Create "The Godfather" zio to hold all async IOs 2174 */ 2175 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2176 KM_SLEEP); 2177 for (int i = 0; i < max_ncpus; i++) { 2178 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2179 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2180 ZIO_FLAG_GODFATHER); 2181 } 2182 2183 /* 2184 * Parse the configuration into a vdev tree. We explicitly set the 2185 * value that will be returned by spa_version() since parsing the 2186 * configuration requires knowing the version number. 
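 *
 * The parse type chosen above matters here: VDEV_ALLOC_LOAD is used
 * for an existing pool, while VDEV_ALLOC_SPLIT is used when assembling
 * a pool from vdevs that were split off another one.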
2187 */ 2188 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2189 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2190 spa_config_exit(spa, SCL_ALL, FTAG); 2191 2192 if (error != 0) 2193 return (error); 2194 2195 ASSERT(spa->spa_root_vdev == rvd); 2196 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2197 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2198 2199 if (type != SPA_IMPORT_ASSEMBLE) { 2200 ASSERT(spa_guid(spa) == pool_guid); 2201 } 2202 2203 /* 2204 * Try to open all vdevs, loading each label in the process. 2205 */ 2206 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2207 error = vdev_open(rvd); 2208 spa_config_exit(spa, SCL_ALL, FTAG); 2209 if (error != 0) 2210 return (error); 2211 2212 /* 2213 * We need to validate the vdev labels against the configuration that 2214 * we have in hand, which is dependent on the setting of mosconfig. If 2215 * mosconfig is true then we're validating the vdev labels based on 2216 * that config. Otherwise, we're validating against the cached config 2217 * (zpool.cache) that was read when we loaded the zfs module, and then 2218 * later we will recursively call spa_load() and validate against 2219 * the vdev config. 2220 * 2221 * If we're assembling a new pool that's been split off from an 2222 * existing pool, the labels haven't yet been updated so we skip 2223 * validation for now. 2224 */ 2225 if (type != SPA_IMPORT_ASSEMBLE) { 2226 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2227 error = vdev_validate(rvd, mosconfig); 2228 spa_config_exit(spa, SCL_ALL, FTAG); 2229 2230 if (error != 0) 2231 return (error); 2232 2233 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2234 return (SET_ERROR(ENXIO)); 2235 } 2236 2237 /* 2238 * Find the best uberblock. 2239 */ 2240 vdev_uberblock_load(rvd, ub, &label); 2241 2242 /* 2243 * If we weren't able to find a single valid uberblock, return failure. 2244 */ 2245 if (ub->ub_txg == 0) { 2246 nvlist_free(label); 2247 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2248 } 2249 2250 /* 2251 * If the pool has an unsupported version we can't open it. 2252 */ 2253 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2254 nvlist_free(label); 2255 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2256 } 2257 2258 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2259 nvlist_t *features; 2260 2261 /* 2262 * If we weren't able to find what's necessary for reading the 2263 * MOS in the label, return failure. 2264 */ 2265 if (label == NULL || nvlist_lookup_nvlist(label, 2266 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2267 nvlist_free(label); 2268 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2269 ENXIO)); 2270 } 2271 2272 /* 2273 * Update our in-core representation with the definitive values 2274 * from the label. 2275 */ 2276 nvlist_free(spa->spa_label_features); 2277 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2278 } 2279 2280 nvlist_free(label); 2281 2282 /* 2283 * Look through entries in the label nvlist's features_for_read. If 2284 * there is a feature listed there which we don't understand then we 2285 * cannot open a pool. 
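 *
 * Any feature names we do not recognize are collected into an nvlist
 * and passed back to userland via ZPOOL_CONFIG_UNSUP_FEAT in
 * spa_load_info, so the caller can report exactly which features are
 * missing rather than just a bare ENOTSUP.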
2286 */ 2287 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2288 nvlist_t *unsup_feat; 2289 2290 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2291 0); 2292 2293 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2294 NULL); nvp != NULL; 2295 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2296 if (!zfeature_is_supported(nvpair_name(nvp))) { 2297 VERIFY(nvlist_add_string(unsup_feat, 2298 nvpair_name(nvp), "") == 0); 2299 } 2300 } 2301 2302 if (!nvlist_empty(unsup_feat)) { 2303 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2304 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2305 nvlist_free(unsup_feat); 2306 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2307 ENOTSUP)); 2308 } 2309 2310 nvlist_free(unsup_feat); 2311 } 2312 2313 /* 2314 * If the vdev guid sum doesn't match the uberblock, we have an 2315 * incomplete configuration. We first check to see if the pool 2316 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2317 * If it is, defer the vdev_guid_sum check till later so we 2318 * can handle missing vdevs. 2319 */ 2320 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2321 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2322 rvd->vdev_guid_sum != ub->ub_guid_sum) 2323 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2324 2325 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2326 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2327 spa_try_repair(spa, config); 2328 spa_config_exit(spa, SCL_ALL, FTAG); 2329 nvlist_free(spa->spa_config_splitting); 2330 spa->spa_config_splitting = NULL; 2331 } 2332 2333 /* 2334 * Initialize internal SPA structures. 2335 */ 2336 spa->spa_state = POOL_STATE_ACTIVE; 2337 spa->spa_ubsync = spa->spa_uberblock; 2338 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2339 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2340 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2341 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2342 spa->spa_claim_max_txg = spa->spa_first_txg; 2343 spa->spa_prev_software_version = ub->ub_software_version; 2344 2345 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2346 if (error) 2347 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2348 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2349 2350 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2351 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2352 2353 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2354 boolean_t missing_feat_read = B_FALSE; 2355 nvlist_t *unsup_feat, *enabled_feat; 2356 2357 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2358 &spa->spa_feat_for_read_obj) != 0) { 2359 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2360 } 2361 2362 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2363 &spa->spa_feat_for_write_obj) != 0) { 2364 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2365 } 2366 2367 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2368 &spa->spa_feat_desc_obj) != 0) { 2369 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2370 } 2371 2372 enabled_feat = fnvlist_alloc(); 2373 unsup_feat = fnvlist_alloc(); 2374 2375 if (!spa_features_check(spa, B_FALSE, 2376 unsup_feat, enabled_feat)) 2377 missing_feat_read = B_TRUE; 2378 2379 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2380 if (!spa_features_check(spa, B_TRUE, 2381 unsup_feat, enabled_feat)) { 2382 missing_feat_write = B_TRUE; 2383 } 2384 } 2385 2386 fnvlist_add_nvlist(spa->spa_load_info, 2387 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2388 2389 if (!nvlist_empty(unsup_feat)) { 2390 fnvlist_add_nvlist(spa->spa_load_info, 2391 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2392 } 2393 2394 fnvlist_free(enabled_feat); 2395 fnvlist_free(unsup_feat); 2396 2397 if (!missing_feat_read) { 2398 fnvlist_add_boolean(spa->spa_load_info, 2399 ZPOOL_CONFIG_CAN_RDONLY); 2400 } 2401 2402 /* 2403 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2404 * twofold: to determine whether the pool is available for 2405 * import in read-write mode and (if it is not) whether the 2406 * pool is available for import in read-only mode. If the pool 2407 * is available for import in read-write mode, it is displayed 2408 * as available in userland; if it is not available for import 2409 * in read-only mode, it is displayed as unavailable in 2410 * userland. If the pool is available for import in read-only 2411 * mode but not read-write mode, it is displayed as unavailable 2412 * in userland with a special note that the pool is actually 2413 * available for open in read-only mode. 2414 * 2415 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2416 * missing a feature for write, we must first determine whether 2417 * the pool can be opened read-only before returning to 2418 * userland in order to know whether to display the 2419 * abovementioned note. 2420 */ 2421 if (missing_feat_read || (missing_feat_write && 2422 spa_writeable(spa))) { 2423 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2424 ENOTSUP)); 2425 } 2426 2427 /* 2428 * Load refcounts for ZFS features from disk into an in-memory 2429 * cache during SPA initialization. 
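 *
 * A feature with no refcount object on disk (ENOTSUP from
 * feature_get_refcount_from_disk()) is cached as SPA_FEATURE_DISABLED;
 * any other lookup error is treated as corrupt pool metadata.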
2430 */ 2431 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2432 uint64_t refcount; 2433 2434 error = feature_get_refcount_from_disk(spa, 2435 &spa_feature_table[i], &refcount); 2436 if (error == 0) { 2437 spa->spa_feat_refcount_cache[i] = refcount; 2438 } else if (error == ENOTSUP) { 2439 spa->spa_feat_refcount_cache[i] = 2440 SPA_FEATURE_DISABLED; 2441 } else { 2442 return (spa_vdev_err(rvd, 2443 VDEV_AUX_CORRUPT_DATA, EIO)); 2444 } 2445 } 2446 } 2447 2448 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2449 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2450 &spa->spa_feat_enabled_txg_obj) != 0) 2451 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2452 } 2453 2454 spa->spa_is_initializing = B_TRUE; 2455 error = dsl_pool_open(spa->spa_dsl_pool); 2456 spa->spa_is_initializing = B_FALSE; 2457 if (error != 0) 2458 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2459 2460 if (!mosconfig) { 2461 uint64_t hostid; 2462 nvlist_t *policy = NULL, *nvconfig; 2463 2464 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2465 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2466 2467 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2468 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2469 char *hostname; 2470 unsigned long myhostid = 0; 2471 2472 VERIFY(nvlist_lookup_string(nvconfig, 2473 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2474 2475 #ifdef _KERNEL 2476 myhostid = zone_get_hostid(NULL); 2477 #else /* _KERNEL */ 2478 /* 2479 * We're emulating the system's hostid in userland, so 2480 * we can't use zone_get_hostid(). 2481 */ 2482 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2483 #endif /* _KERNEL */ 2484 if (hostid != 0 && myhostid != 0 && 2485 hostid != myhostid) { 2486 nvlist_free(nvconfig); 2487 cmn_err(CE_WARN, "pool '%s' could not be " 2488 "loaded as it was last accessed by " 2489 "another system (host: %s hostid: 0x%lx). " 2490 "See: http://illumos.org/msg/ZFS-8000-EY", 2491 spa_name(spa), hostname, 2492 (unsigned long)hostid); 2493 return (SET_ERROR(EBADF)); 2494 } 2495 } 2496 if (nvlist_lookup_nvlist(spa->spa_config, 2497 ZPOOL_REWIND_POLICY, &policy) == 0) 2498 VERIFY(nvlist_add_nvlist(nvconfig, 2499 ZPOOL_REWIND_POLICY, policy) == 0); 2500 2501 spa_config_set(spa, nvconfig); 2502 spa_unload(spa); 2503 spa_deactivate(spa); 2504 spa_activate(spa, orig_mode); 2505 2506 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2507 } 2508 2509 /* Grab the secret checksum salt from the MOS. */ 2510 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2511 DMU_POOL_CHECKSUM_SALT, 1, 2512 sizeof (spa->spa_cksum_salt.zcs_bytes), 2513 spa->spa_cksum_salt.zcs_bytes); 2514 if (error == ENOENT) { 2515 /* Generate a new salt for subsequent use */ 2516 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2517 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2518 } else if (error != 0) { 2519 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2520 } 2521 2522 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2523 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2524 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2525 if (error != 0) 2526 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2527 2528 /* 2529 * Load the bit that tells us to use the new accounting function 2530 * (raid-z deflation). If we have an older pool, this will not 2531 * be present. 
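 *
 * ENOENT is therefore expected and tolerated here, as it is for the
 * other optional directory objects loaded below (creation version,
 * error logs, history); any other error marks the pool's data corrupt.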
2532 */ 2533 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2534 if (error != 0 && error != ENOENT) 2535 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2536 2537 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2538 &spa->spa_creation_version); 2539 if (error != 0 && error != ENOENT) 2540 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2541 2542 /* 2543 * Load the persistent error log. If we have an older pool, this will 2544 * not be present. 2545 */ 2546 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2547 if (error != 0 && error != ENOENT) 2548 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2549 2550 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2551 &spa->spa_errlog_scrub); 2552 if (error != 0 && error != ENOENT) 2553 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2554 2555 /* 2556 * Load the history object. If we have an older pool, this 2557 * will not be present. 2558 */ 2559 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2560 if (error != 0 && error != ENOENT) 2561 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2562 2563 /* 2564 * If we're assembling the pool from the split-off vdevs of 2565 * an existing pool, we don't want to attach the spares & cache 2566 * devices. 2567 */ 2568 2569 /* 2570 * Load any hot spares for this pool. 2571 */ 2572 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2573 if (error != 0 && error != ENOENT) 2574 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2575 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2576 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2577 if (load_nvlist(spa, spa->spa_spares.sav_object, 2578 &spa->spa_spares.sav_config) != 0) 2579 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2580 2581 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2582 spa_load_spares(spa); 2583 spa_config_exit(spa, SCL_ALL, FTAG); 2584 } else if (error == 0) { 2585 spa->spa_spares.sav_sync = B_TRUE; 2586 } 2587 2588 /* 2589 * Load any level 2 ARC devices for this pool. 
2590 */ 2591 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2592 &spa->spa_l2cache.sav_object); 2593 if (error != 0 && error != ENOENT) 2594 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2595 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2596 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2597 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2598 &spa->spa_l2cache.sav_config) != 0) 2599 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2600 2601 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2602 spa_load_l2cache(spa); 2603 spa_config_exit(spa, SCL_ALL, FTAG); 2604 } else if (error == 0) { 2605 spa->spa_l2cache.sav_sync = B_TRUE; 2606 } 2607 2608 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2609 2610 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2611 if (error && error != ENOENT) 2612 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2613 2614 if (error == 0) { 2615 uint64_t autoreplace; 2616 2617 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2618 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2619 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2620 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2621 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2622 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2623 &spa->spa_dedup_ditto); 2624 2625 spa->spa_autoreplace = (autoreplace != 0); 2626 } 2627 2628 /* 2629 * If the 'autoreplace' property is set, then post a resource notifying 2630 * the ZFS DE that it should not issue any faults for unopenable 2631 * devices. We also iterate over the vdevs, and post a sysevent for any 2632 * unopenable vdevs so that the normal autoreplace handler can take 2633 * over. 2634 */ 2635 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2636 spa_check_removed(spa->spa_root_vdev); 2637 /* 2638 * For the import case, this is done in spa_import(), because 2639 * at this point we're using the spare definitions from 2640 * the MOS config, not necessarily from the userland config. 2641 */ 2642 if (state != SPA_LOAD_IMPORT) { 2643 spa_aux_check_removed(&spa->spa_spares); 2644 spa_aux_check_removed(&spa->spa_l2cache); 2645 } 2646 } 2647 2648 /* 2649 * Load the vdev state for all toplevel vdevs. 2650 */ 2651 vdev_load(rvd); 2652 2653 /* 2654 * Propagate the leaf DTLs we just loaded all the way up the tree. 2655 */ 2656 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2657 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2658 spa_config_exit(spa, SCL_ALL, FTAG); 2659 2660 /* 2661 * Load the DDTs (dedup tables). 2662 */ 2663 error = ddt_load(spa); 2664 if (error != 0) 2665 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2666 2667 spa_update_dspace(spa); 2668 2669 /* 2670 * Validate the config, using the MOS config to fill in any 2671 * information which might be missing. If we fail to validate 2672 * the config then declare the pool unfit for use. If we're 2673 * assembling a pool from a split, the log is not transferred 2674 * over. 2675 */ 2676 if (type != SPA_IMPORT_ASSEMBLE) { 2677 nvlist_t *nvconfig; 2678 2679 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2680 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2681 2682 if (!spa_config_valid(spa, nvconfig)) { 2683 nvlist_free(nvconfig); 2684 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2685 ENXIO)); 2686 } 2687 nvlist_free(nvconfig); 2688 2689 /* 2690 * Now that we've validated the config, check the state of the 2691 * root vdev. 
If it can't be opened, it indicates one or 2692 * more toplevel vdevs are faulted. 2693 */ 2694 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2695 return (SET_ERROR(ENXIO)); 2696 2697 if (spa_writeable(spa) && spa_check_logs(spa)) { 2698 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2699 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2700 } 2701 } 2702 2703 if (missing_feat_write) { 2704 ASSERT(state == SPA_LOAD_TRYIMPORT); 2705 2706 /* 2707 * At this point, we know that we can open the pool in 2708 * read-only mode but not read-write mode. We now have enough 2709 * information and can return to userland. 2710 */ 2711 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2712 } 2713 2714 /* 2715 * We've successfully opened the pool, verify that we're ready 2716 * to start pushing transactions. 2717 */ 2718 if (state != SPA_LOAD_TRYIMPORT) { 2719 if (error = spa_load_verify(spa)) 2720 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2721 error)); 2722 } 2723 2724 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2725 spa->spa_load_max_txg == UINT64_MAX)) { 2726 dmu_tx_t *tx; 2727 int need_update = B_FALSE; 2728 dsl_pool_t *dp = spa_get_dsl(spa); 2729 2730 ASSERT(state != SPA_LOAD_TRYIMPORT); 2731 2732 /* 2733 * Claim log blocks that haven't been committed yet. 2734 * This must all happen in a single txg. 2735 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2736 * invoked from zil_claim_log_block()'s i/o done callback. 2737 * Price of rollback is that we abandon the log. 2738 */ 2739 spa->spa_claiming = B_TRUE; 2740 2741 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 2742 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2743 zil_claim, tx, DS_FIND_CHILDREN); 2744 dmu_tx_commit(tx); 2745 2746 spa->spa_claiming = B_FALSE; 2747 2748 spa_set_log_state(spa, SPA_LOG_GOOD); 2749 spa->spa_sync_on = B_TRUE; 2750 txg_sync_start(spa->spa_dsl_pool); 2751 2752 /* 2753 * Wait for all claims to sync. We sync up to the highest 2754 * claimed log block birth time so that claimed log blocks 2755 * don't appear to be from the future. spa_claim_max_txg 2756 * will have been set for us by either zil_check_log_chain() 2757 * (invoked from spa_check_logs()) or zil_claim() above. 2758 */ 2759 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2760 2761 /* 2762 * If the config cache is stale, or we have uninitialized 2763 * metaslabs (see spa_vdev_add()), then update the config. 2764 * 2765 * If this is a verbatim import, trust the current 2766 * in-core spa_config and update the disk labels. 2767 */ 2768 if (config_cache_txg != spa->spa_config_txg || 2769 state == SPA_LOAD_IMPORT || 2770 state == SPA_LOAD_RECOVER || 2771 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2772 need_update = B_TRUE; 2773 2774 for (int c = 0; c < rvd->vdev_children; c++) 2775 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2776 need_update = B_TRUE; 2777 2778 /* 2779 * Update the config cache asychronously in case we're the 2780 * root pool, in which case the config cache isn't writable yet. 2781 */ 2782 if (need_update) 2783 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2784 2785 /* 2786 * Check all DTLs to see if anything needs resilvering. 2787 */ 2788 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2789 vdev_resilver_needed(rvd, NULL, NULL)) 2790 spa_async_request(spa, SPA_ASYNC_RESILVER); 2791 2792 /* 2793 * Log the fact that we booted up (so that we can detect if 2794 * we rebooted in the middle of an operation). 
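 *
 * spa_history_log_version() below records this open in the pool's
 * history object, so the event is later visible to tools such as
 * zpool history -i.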
2795 */ 2796 spa_history_log_version(spa, "open"); 2797 2798 /* 2799 * Delete any inconsistent datasets. 2800 */ 2801 (void) dmu_objset_find(spa_name(spa), 2802 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2803 2804 /* 2805 * Clean up any stale temporary dataset userrefs. 2806 */ 2807 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2808 } 2809 2810 return (0); 2811 } 2812 2813 static int 2814 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2815 { 2816 int mode = spa->spa_mode; 2817 2818 spa_unload(spa); 2819 spa_deactivate(spa); 2820 2821 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2822 2823 spa_activate(spa, mode); 2824 spa_async_suspend(spa); 2825 2826 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2827 } 2828 2829 /* 2830 * If spa_load() fails this function will try loading prior txg's. If 2831 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2832 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2833 * function will not rewind the pool and will return the same error as 2834 * spa_load(). 2835 */ 2836 static int 2837 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2838 uint64_t max_request, int rewind_flags) 2839 { 2840 nvlist_t *loadinfo = NULL; 2841 nvlist_t *config = NULL; 2842 int load_error, rewind_error; 2843 uint64_t safe_rewind_txg; 2844 uint64_t min_txg; 2845 2846 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2847 spa->spa_load_max_txg = spa->spa_load_txg; 2848 spa_set_log_state(spa, SPA_LOG_CLEAR); 2849 } else { 2850 spa->spa_load_max_txg = max_request; 2851 if (max_request != UINT64_MAX) 2852 spa->spa_extreme_rewind = B_TRUE; 2853 } 2854 2855 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2856 mosconfig); 2857 if (load_error == 0) 2858 return (0); 2859 2860 if (spa->spa_root_vdev != NULL) 2861 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2862 2863 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2864 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2865 2866 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2867 nvlist_free(config); 2868 return (load_error); 2869 } 2870 2871 if (state == SPA_LOAD_RECOVER) { 2872 /* Price of rolling back is discarding txgs, including log */ 2873 spa_set_log_state(spa, SPA_LOG_CLEAR); 2874 } else { 2875 /* 2876 * If we aren't rolling back save the load info from our first 2877 * import attempt so that we can restore it after attempting 2878 * to rewind. 2879 */ 2880 loadinfo = spa->spa_load_info; 2881 spa->spa_load_info = fnvlist_alloc(); 2882 } 2883 2884 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2885 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2886 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2887 TXG_INITIAL : safe_rewind_txg; 2888 2889 /* 2890 * Continue as long as we're finding errors, we're still within 2891 * the acceptable rewind range, and we're still finding uberblocks 2892 */ 2893 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2894 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2895 if (spa->spa_load_max_txg < safe_rewind_txg) 2896 spa->spa_extreme_rewind = B_TRUE; 2897 rewind_error = spa_load_retry(spa, state, mosconfig); 2898 } 2899 2900 spa->spa_extreme_rewind = B_FALSE; 2901 spa->spa_load_max_txg = UINT64_MAX; 2902 2903 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2904 spa_config_set(spa, config); 2905 2906 if (state == SPA_LOAD_RECOVER) { 2907 ASSERT3P(loadinfo, ==, NULL); 2908 return (rewind_error); 2909 } else { 2910 /* Store the rewind info as part of the initial load info */ 2911 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2912 spa->spa_load_info); 2913 2914 /* Restore the initial load info */ 2915 fnvlist_free(spa->spa_load_info); 2916 spa->spa_load_info = loadinfo; 2917 2918 return (load_error); 2919 } 2920 } 2921 2922 /* 2923 * Pool Open/Import 2924 * 2925 * The import case is identical to an open except that the configuration is sent 2926 * down from userland, instead of grabbed from the configuration cache. For the 2927 * case of an open, the pool configuration will exist in the 2928 * POOL_STATE_UNINITIALIZED state. 2929 * 2930 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2931 * the same time open the pool, without having to keep around the spa_t in some 2932 * ambiguous state. 2933 */ 2934 static int 2935 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2936 nvlist_t **config) 2937 { 2938 spa_t *spa; 2939 spa_load_state_t state = SPA_LOAD_OPEN; 2940 int error; 2941 int locked = B_FALSE; 2942 2943 *spapp = NULL; 2944 2945 /* 2946 * As disgusting as this is, we need to support recursive calls to this 2947 * function because dsl_dir_open() is called during spa_load(), and ends 2948 * up calling spa_open() again. The real fix is to figure out how to 2949 * avoid dsl_dir_open() calling this in the first place. 2950 */ 2951 if (mutex_owner(&spa_namespace_lock) != curthread) { 2952 mutex_enter(&spa_namespace_lock); 2953 locked = B_TRUE; 2954 } 2955 2956 if ((spa = spa_lookup(pool)) == NULL) { 2957 if (locked) 2958 mutex_exit(&spa_namespace_lock); 2959 return (SET_ERROR(ENOENT)); 2960 } 2961 2962 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2963 zpool_rewind_policy_t policy; 2964 2965 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2966 &policy); 2967 if (policy.zrp_request & ZPOOL_DO_REWIND) 2968 state = SPA_LOAD_RECOVER; 2969 2970 spa_activate(spa, spa_mode_global); 2971 2972 if (state != SPA_LOAD_RECOVER) 2973 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2974 2975 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2976 policy.zrp_request); 2977 2978 if (error == EBADF) { 2979 /* 2980 * If vdev_validate() returns failure (indicated by 2981 * EBADF), it indicates that one of the vdevs indicates 2982 * that the pool has been exported or destroyed. If 2983 * this is the case, the config cache is out of sync and 2984 * we should remove the pool from the namespace. 
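 *
 * spa_config_sync() is called below with 'removing' set, so the cached
 * configuration (zpool.cache) is rewritten without this pool before
 * the spa_t itself is removed.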
2985 */ 2986 spa_unload(spa); 2987 spa_deactivate(spa); 2988 spa_config_sync(spa, B_TRUE, B_TRUE); 2989 spa_remove(spa); 2990 if (locked) 2991 mutex_exit(&spa_namespace_lock); 2992 return (SET_ERROR(ENOENT)); 2993 } 2994 2995 if (error) { 2996 /* 2997 * We can't open the pool, but we still have useful 2998 * information: the state of each vdev after the 2999 * attempted vdev_open(). Return this to the user. 3000 */ 3001 if (config != NULL && spa->spa_config) { 3002 VERIFY(nvlist_dup(spa->spa_config, config, 3003 KM_SLEEP) == 0); 3004 VERIFY(nvlist_add_nvlist(*config, 3005 ZPOOL_CONFIG_LOAD_INFO, 3006 spa->spa_load_info) == 0); 3007 } 3008 spa_unload(spa); 3009 spa_deactivate(spa); 3010 spa->spa_last_open_failed = error; 3011 if (locked) 3012 mutex_exit(&spa_namespace_lock); 3013 *spapp = NULL; 3014 return (error); 3015 } 3016 } 3017 3018 spa_open_ref(spa, tag); 3019 3020 if (config != NULL) 3021 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3022 3023 /* 3024 * If we've recovered the pool, pass back any information we 3025 * gathered while doing the load. 3026 */ 3027 if (state == SPA_LOAD_RECOVER) { 3028 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3029 spa->spa_load_info) == 0); 3030 } 3031 3032 if (locked) { 3033 spa->spa_last_open_failed = 0; 3034 spa->spa_last_ubsync_txg = 0; 3035 spa->spa_load_txg = 0; 3036 mutex_exit(&spa_namespace_lock); 3037 } 3038 3039 *spapp = spa; 3040 3041 return (0); 3042 } 3043 3044 int 3045 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3046 nvlist_t **config) 3047 { 3048 return (spa_open_common(name, spapp, tag, policy, config)); 3049 } 3050 3051 int 3052 spa_open(const char *name, spa_t **spapp, void *tag) 3053 { 3054 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3055 } 3056 3057 /* 3058 * Lookup the given spa_t, incrementing the inject count in the process, 3059 * preventing it from being exported or destroyed. 3060 */ 3061 spa_t * 3062 spa_inject_addref(char *name) 3063 { 3064 spa_t *spa; 3065 3066 mutex_enter(&spa_namespace_lock); 3067 if ((spa = spa_lookup(name)) == NULL) { 3068 mutex_exit(&spa_namespace_lock); 3069 return (NULL); 3070 } 3071 spa->spa_inject_ref++; 3072 mutex_exit(&spa_namespace_lock); 3073 3074 return (spa); 3075 } 3076 3077 void 3078 spa_inject_delref(spa_t *spa) 3079 { 3080 mutex_enter(&spa_namespace_lock); 3081 spa->spa_inject_ref--; 3082 mutex_exit(&spa_namespace_lock); 3083 } 3084 3085 /* 3086 * Add spares device information to the nvlist. 3087 */ 3088 static void 3089 spa_add_spares(spa_t *spa, nvlist_t *config) 3090 { 3091 nvlist_t **spares; 3092 uint_t i, nspares; 3093 nvlist_t *nvroot; 3094 uint64_t guid; 3095 vdev_stat_t *vs; 3096 uint_t vsc; 3097 uint64_t pool; 3098 3099 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3100 3101 if (spa->spa_spares.sav_count == 0) 3102 return; 3103 3104 VERIFY(nvlist_lookup_nvlist(config, 3105 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3106 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3107 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3108 if (nspares != 0) { 3109 VERIFY(nvlist_add_nvlist_array(nvroot, 3110 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3111 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3112 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3113 3114 /* 3115 * Go through and find any spares which have since been 3116 * repurposed as an active spare. If this is the case, update 3117 * their status appropriately. 
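 *
 * Concretely, such a spare is reported below as VDEV_STATE_CANT_OPEN
 * with VDEV_AUX_SPARED, which is how userland learns that the device
 * is actively in use as a spare.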
3118 */ 3119 for (i = 0; i < nspares; i++) { 3120 VERIFY(nvlist_lookup_uint64(spares[i], 3121 ZPOOL_CONFIG_GUID, &guid) == 0); 3122 if (spa_spare_exists(guid, &pool, NULL) && 3123 pool != 0ULL) { 3124 VERIFY(nvlist_lookup_uint64_array( 3125 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3126 (uint64_t **)&vs, &vsc) == 0); 3127 vs->vs_state = VDEV_STATE_CANT_OPEN; 3128 vs->vs_aux = VDEV_AUX_SPARED; 3129 } 3130 } 3131 } 3132 } 3133 3134 /* 3135 * Add l2cache device information to the nvlist, including vdev stats. 3136 */ 3137 static void 3138 spa_add_l2cache(spa_t *spa, nvlist_t *config) 3139 { 3140 nvlist_t **l2cache; 3141 uint_t i, j, nl2cache; 3142 nvlist_t *nvroot; 3143 uint64_t guid; 3144 vdev_t *vd; 3145 vdev_stat_t *vs; 3146 uint_t vsc; 3147 3148 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3149 3150 if (spa->spa_l2cache.sav_count == 0) 3151 return; 3152 3153 VERIFY(nvlist_lookup_nvlist(config, 3154 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3155 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3156 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3157 if (nl2cache != 0) { 3158 VERIFY(nvlist_add_nvlist_array(nvroot, 3159 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3160 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3161 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3162 3163 /* 3164 * Update level 2 cache device stats. 3165 */ 3166 3167 for (i = 0; i < nl2cache; i++) { 3168 VERIFY(nvlist_lookup_uint64(l2cache[i], 3169 ZPOOL_CONFIG_GUID, &guid) == 0); 3170 3171 vd = NULL; 3172 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3173 if (guid == 3174 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3175 vd = spa->spa_l2cache.sav_vdevs[j]; 3176 break; 3177 } 3178 } 3179 ASSERT(vd != NULL); 3180 3181 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3182 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3183 == 0); 3184 vdev_get_stats(vd, vs); 3185 } 3186 } 3187 } 3188 3189 static void 3190 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3191 { 3192 nvlist_t *features; 3193 zap_cursor_t zc; 3194 zap_attribute_t za; 3195 3196 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3197 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3198 3199 if (spa->spa_feat_for_read_obj != 0) { 3200 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3201 spa->spa_feat_for_read_obj); 3202 zap_cursor_retrieve(&zc, &za) == 0; 3203 zap_cursor_advance(&zc)) { 3204 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3205 za.za_num_integers == 1); 3206 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3207 za.za_first_integer)); 3208 } 3209 zap_cursor_fini(&zc); 3210 } 3211 3212 if (spa->spa_feat_for_write_obj != 0) { 3213 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3214 spa->spa_feat_for_write_obj); 3215 zap_cursor_retrieve(&zc, &za) == 0; 3216 zap_cursor_advance(&zc)) { 3217 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3218 za.za_num_integers == 1); 3219 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3220 za.za_first_integer)); 3221 } 3222 zap_cursor_fini(&zc); 3223 } 3224 3225 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3226 features) == 0); 3227 nvlist_free(features); 3228 } 3229 3230 int 3231 spa_get_stats(const char *name, nvlist_t **config, 3232 char *altroot, size_t buflen) 3233 { 3234 int error; 3235 spa_t *spa; 3236 3237 *config = NULL; 3238 error = spa_open_common(name, &spa, FTAG, NULL, config); 3239 3240 if (spa != NULL) { 3241 /* 3242 * This still leaves a window of inconsistency where the spares 3243 * or l2cache devices could change and 
the config would be 3244 * self-inconsistent. 3245 */ 3246 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3247 3248 if (*config != NULL) { 3249 uint64_t loadtimes[2]; 3250 3251 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3252 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3253 VERIFY(nvlist_add_uint64_array(*config, 3254 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3255 3256 VERIFY(nvlist_add_uint64(*config, 3257 ZPOOL_CONFIG_ERRCOUNT, 3258 spa_get_errlog_size(spa)) == 0); 3259 3260 if (spa_suspended(spa)) 3261 VERIFY(nvlist_add_uint64(*config, 3262 ZPOOL_CONFIG_SUSPENDED, 3263 spa->spa_failmode) == 0); 3264 3265 spa_add_spares(spa, *config); 3266 spa_add_l2cache(spa, *config); 3267 spa_add_feature_stats(spa, *config); 3268 } 3269 } 3270 3271 /* 3272 * We want to get the alternate root even for faulted pools, so we cheat 3273 * and call spa_lookup() directly. 3274 */ 3275 if (altroot) { 3276 if (spa == NULL) { 3277 mutex_enter(&spa_namespace_lock); 3278 spa = spa_lookup(name); 3279 if (spa) 3280 spa_altroot(spa, altroot, buflen); 3281 else 3282 altroot[0] = '\0'; 3283 spa = NULL; 3284 mutex_exit(&spa_namespace_lock); 3285 } else { 3286 spa_altroot(spa, altroot, buflen); 3287 } 3288 } 3289 3290 if (spa != NULL) { 3291 spa_config_exit(spa, SCL_CONFIG, FTAG); 3292 spa_close(spa, FTAG); 3293 } 3294 3295 return (error); 3296 } 3297 3298 /* 3299 * Validate that the auxiliary device array is well formed. We must have an 3300 * array of nvlists, each which describes a valid leaf vdev. If this is an 3301 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3302 * specified, as long as they are well-formed. 3303 */ 3304 static int 3305 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3306 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3307 vdev_labeltype_t label) 3308 { 3309 nvlist_t **dev; 3310 uint_t i, ndev; 3311 vdev_t *vd; 3312 int error; 3313 3314 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3315 3316 /* 3317 * It's acceptable to have no devs specified. 3318 */ 3319 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3320 return (0); 3321 3322 if (ndev == 0) 3323 return (SET_ERROR(EINVAL)); 3324 3325 /* 3326 * Make sure the pool is formatted with a version that supports this 3327 * device type. 3328 */ 3329 if (spa_version(spa) < version) 3330 return (SET_ERROR(ENOTSUP)); 3331 3332 /* 3333 * Set the pending device list so we correctly handle device in-use 3334 * checking. 3335 */ 3336 sav->sav_pending = dev; 3337 sav->sav_npending = ndev; 3338 3339 for (i = 0; i < ndev; i++) { 3340 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3341 mode)) != 0) 3342 goto out; 3343 3344 if (!vd->vdev_ops->vdev_op_leaf) { 3345 vdev_free(vd); 3346 error = SET_ERROR(EINVAL); 3347 goto out; 3348 } 3349 3350 /* 3351 * The L2ARC currently only supports disk devices in 3352 * kernel context. For user-level testing, we allow it. 
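 *
 * The check below is compiled only under _KERNEL, which is what allows
 * userland consumers such as ztest to use file-backed cache devices.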
3353 */ 3354 #ifdef _KERNEL 3355 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3356 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3357 error = SET_ERROR(ENOTBLK); 3358 vdev_free(vd); 3359 goto out; 3360 } 3361 #endif 3362 vd->vdev_top = vd; 3363 3364 if ((error = vdev_open(vd)) == 0 && 3365 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3366 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3367 vd->vdev_guid) == 0); 3368 } 3369 3370 vdev_free(vd); 3371 3372 if (error && 3373 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3374 goto out; 3375 else 3376 error = 0; 3377 } 3378 3379 out: 3380 sav->sav_pending = NULL; 3381 sav->sav_npending = 0; 3382 return (error); 3383 } 3384 3385 static int 3386 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3387 { 3388 int error; 3389 3390 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3391 3392 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3393 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3394 VDEV_LABEL_SPARE)) != 0) { 3395 return (error); 3396 } 3397 3398 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3399 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3400 VDEV_LABEL_L2CACHE)); 3401 } 3402 3403 static void 3404 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3405 const char *config) 3406 { 3407 int i; 3408 3409 if (sav->sav_config != NULL) { 3410 nvlist_t **olddevs; 3411 uint_t oldndevs; 3412 nvlist_t **newdevs; 3413 3414 /* 3415 * Generate new dev list by concatentating with the 3416 * current dev list. 3417 */ 3418 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3419 &olddevs, &oldndevs) == 0); 3420 3421 newdevs = kmem_alloc(sizeof (void *) * 3422 (ndevs + oldndevs), KM_SLEEP); 3423 for (i = 0; i < oldndevs; i++) 3424 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3425 KM_SLEEP) == 0); 3426 for (i = 0; i < ndevs; i++) 3427 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3428 KM_SLEEP) == 0); 3429 3430 VERIFY(nvlist_remove(sav->sav_config, config, 3431 DATA_TYPE_NVLIST_ARRAY) == 0); 3432 3433 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3434 config, newdevs, ndevs + oldndevs) == 0); 3435 for (i = 0; i < oldndevs + ndevs; i++) 3436 nvlist_free(newdevs[i]); 3437 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3438 } else { 3439 /* 3440 * Generate a new dev list. 3441 */ 3442 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3443 KM_SLEEP) == 0); 3444 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3445 devs, ndevs) == 0); 3446 } 3447 } 3448 3449 /* 3450 * Stop and drop level 2 ARC devices 3451 */ 3452 void 3453 spa_l2cache_drop(spa_t *spa) 3454 { 3455 vdev_t *vd; 3456 int i; 3457 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3458 3459 for (i = 0; i < sav->sav_count; i++) { 3460 uint64_t pool; 3461 3462 vd = sav->sav_vdevs[i]; 3463 ASSERT(vd != NULL); 3464 3465 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3466 pool != 0ULL && l2arc_vdev_present(vd)) 3467 l2arc_remove_vdev(vd); 3468 } 3469 } 3470 3471 /* 3472 * Pool Creation 3473 */ 3474 int 3475 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3476 nvlist_t *zplprops) 3477 { 3478 spa_t *spa; 3479 char *altroot = NULL; 3480 vdev_t *rvd; 3481 dsl_pool_t *dp; 3482 dmu_tx_t *tx; 3483 int error = 0; 3484 uint64_t txg = TXG_INITIAL; 3485 nvlist_t **spares, **l2cache; 3486 uint_t nspares, nl2cache; 3487 uint64_t version, obj; 3488 boolean_t has_features; 3489 3490 /* 3491 * If this pool already exists, return failure. 
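 *
 * The namespace lock taken here is held until the very end of
 * spa_create(), so a concurrent create or import of the same name
 * cannot race with this one.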
3492 */ 3493 mutex_enter(&spa_namespace_lock); 3494 if (spa_lookup(pool) != NULL) { 3495 mutex_exit(&spa_namespace_lock); 3496 return (SET_ERROR(EEXIST)); 3497 } 3498 3499 /* 3500 * Allocate a new spa_t structure. 3501 */ 3502 (void) nvlist_lookup_string(props, 3503 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3504 spa = spa_add(pool, NULL, altroot); 3505 spa_activate(spa, spa_mode_global); 3506 3507 if (props && (error = spa_prop_validate(spa, props))) { 3508 spa_deactivate(spa); 3509 spa_remove(spa); 3510 mutex_exit(&spa_namespace_lock); 3511 return (error); 3512 } 3513 3514 has_features = B_FALSE; 3515 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3516 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3517 if (zpool_prop_feature(nvpair_name(elem))) 3518 has_features = B_TRUE; 3519 } 3520 3521 if (has_features || nvlist_lookup_uint64(props, 3522 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3523 version = SPA_VERSION; 3524 } 3525 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3526 3527 spa->spa_first_txg = txg; 3528 spa->spa_uberblock.ub_txg = txg - 1; 3529 spa->spa_uberblock.ub_version = version; 3530 spa->spa_ubsync = spa->spa_uberblock; 3531 3532 /* 3533 * Create "The Godfather" zio to hold all async IOs 3534 */ 3535 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3536 KM_SLEEP); 3537 for (int i = 0; i < max_ncpus; i++) { 3538 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3539 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3540 ZIO_FLAG_GODFATHER); 3541 } 3542 3543 /* 3544 * Create the root vdev. 3545 */ 3546 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3547 3548 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3549 3550 ASSERT(error != 0 || rvd != NULL); 3551 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3552 3553 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3554 error = SET_ERROR(EINVAL); 3555 3556 if (error == 0 && 3557 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3558 (error = spa_validate_aux(spa, nvroot, txg, 3559 VDEV_ALLOC_ADD)) == 0) { 3560 for (int c = 0; c < rvd->vdev_children; c++) { 3561 vdev_metaslab_set_size(rvd->vdev_child[c]); 3562 vdev_expand(rvd->vdev_child[c], txg); 3563 } 3564 } 3565 3566 spa_config_exit(spa, SCL_ALL, FTAG); 3567 3568 if (error != 0) { 3569 spa_unload(spa); 3570 spa_deactivate(spa); 3571 spa_remove(spa); 3572 mutex_exit(&spa_namespace_lock); 3573 return (error); 3574 } 3575 3576 /* 3577 * Get the list of spares, if specified. 3578 */ 3579 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3580 &spares, &nspares) == 0) { 3581 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3582 KM_SLEEP) == 0); 3583 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3584 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3585 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3586 spa_load_spares(spa); 3587 spa_config_exit(spa, SCL_ALL, FTAG); 3588 spa->spa_spares.sav_sync = B_TRUE; 3589 } 3590 3591 /* 3592 * Get the list of level 2 cache devices, if specified. 
3593 */ 3594 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3595 &l2cache, &nl2cache) == 0) { 3596 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3597 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3598 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3599 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3600 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3601 spa_load_l2cache(spa); 3602 spa_config_exit(spa, SCL_ALL, FTAG); 3603 spa->spa_l2cache.sav_sync = B_TRUE; 3604 } 3605 3606 spa->spa_is_initializing = B_TRUE; 3607 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3608 spa->spa_meta_objset = dp->dp_meta_objset; 3609 spa->spa_is_initializing = B_FALSE; 3610 3611 /* 3612 * Create DDTs (dedup tables). 3613 */ 3614 ddt_create(spa); 3615 3616 spa_update_dspace(spa); 3617 3618 tx = dmu_tx_create_assigned(dp, txg); 3619 3620 /* 3621 * Create the pool config object. 3622 */ 3623 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3624 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3625 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3626 3627 if (zap_add(spa->spa_meta_objset, 3628 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3629 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3630 cmn_err(CE_PANIC, "failed to add pool config"); 3631 } 3632 3633 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3634 spa_feature_create_zap_objects(spa, tx); 3635 3636 if (zap_add(spa->spa_meta_objset, 3637 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3638 sizeof (uint64_t), 1, &version, tx) != 0) { 3639 cmn_err(CE_PANIC, "failed to add pool version"); 3640 } 3641 3642 /* Newly created pools with the right version are always deflated. */ 3643 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3644 spa->spa_deflate = TRUE; 3645 if (zap_add(spa->spa_meta_objset, 3646 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3647 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3648 cmn_err(CE_PANIC, "failed to add deflate"); 3649 } 3650 } 3651 3652 /* 3653 * Create the deferred-free bpobj. Turn off compression 3654 * because sync-to-convergence takes longer if the blocksize 3655 * keeps changing. 3656 */ 3657 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3658 dmu_object_set_compress(spa->spa_meta_objset, obj, 3659 ZIO_COMPRESS_OFF, tx); 3660 if (zap_add(spa->spa_meta_objset, 3661 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3662 sizeof (uint64_t), 1, &obj, tx) != 0) { 3663 cmn_err(CE_PANIC, "failed to add bpobj"); 3664 } 3665 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3666 spa->spa_meta_objset, obj)); 3667 3668 /* 3669 * Create the pool's history object. 3670 */ 3671 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3672 spa_history_create_obj(spa, tx); 3673 3674 /* 3675 * Generate some random noise for salted checksums to operate on. 3676 */ 3677 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3678 sizeof (spa->spa_cksum_salt.zcs_bytes)); 3679 3680 /* 3681 * Set pool properties. 
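 *
 * Defaults come from zpool_prop_default_numeric(); any properties the
 * caller supplied are applied via spa_sync_props() in the same
 * assigned txg as the rest of pool creation.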
3682 */ 3683 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3684 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3685 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3686 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3687 3688 if (props != NULL) { 3689 spa_configfile_set(spa, props, B_FALSE); 3690 spa_sync_props(props, tx); 3691 } 3692 3693 dmu_tx_commit(tx); 3694 3695 spa->spa_sync_on = B_TRUE; 3696 txg_sync_start(spa->spa_dsl_pool); 3697 3698 /* 3699 * We explicitly wait for the first transaction to complete so that our 3700 * bean counters are appropriately updated. 3701 */ 3702 txg_wait_synced(spa->spa_dsl_pool, txg); 3703 3704 spa_config_sync(spa, B_FALSE, B_TRUE); 3705 spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); 3706 3707 spa_history_log_version(spa, "create"); 3708 3709 /* 3710 * Don't count references from objsets that are already closed 3711 * and are making their way through the eviction process. 3712 */ 3713 spa_evicting_os_wait(spa); 3714 spa->spa_minref = refcount_count(&spa->spa_refcount); 3715 3716 mutex_exit(&spa_namespace_lock); 3717 3718 return (0); 3719 } 3720 3721 #ifdef _KERNEL 3722 /* 3723 * Get the root pool information from the root disk, then import the root pool 3724 * during the system boot up time. 3725 */ 3726 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3727 3728 static nvlist_t * 3729 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3730 { 3731 nvlist_t *config; 3732 nvlist_t *nvtop, *nvroot; 3733 uint64_t pgid; 3734 3735 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3736 return (NULL); 3737 3738 /* 3739 * Add this top-level vdev to the child array. 3740 */ 3741 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3742 &nvtop) == 0); 3743 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3744 &pgid) == 0); 3745 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3746 3747 /* 3748 * Put this pool's top-level vdevs into a root vdev. 3749 */ 3750 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3751 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3752 VDEV_TYPE_ROOT) == 0); 3753 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3754 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3755 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3756 &nvtop, 1) == 0); 3757 3758 /* 3759 * Replace the existing vdev_tree with the new root vdev in 3760 * this pool's configuration (remove the old, add the new). 3761 */ 3762 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3763 nvlist_free(nvroot); 3764 return (config); 3765 } 3766 3767 /* 3768 * Walk the vdev tree and see if we can find a device with "better" 3769 * configuration. A configuration is "better" if the label on that 3770 * device has a more recent txg. 3771 */ 3772 static void 3773 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3774 { 3775 for (int c = 0; c < vd->vdev_children; c++) 3776 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3777 3778 if (vd->vdev_ops->vdev_op_leaf) { 3779 nvlist_t *label; 3780 uint64_t label_txg; 3781 3782 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3783 &label) != 0) 3784 return; 3785 3786 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3787 &label_txg) == 0); 3788 3789 /* 3790 * Do we have a better boot device? 
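 *
 * "Better" simply means a leaf whose label carries a strictly larger
 * ZPOOL_CONFIG_POOL_TXG; on a tie we keep the current candidate.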
3791 */ 3792 if (label_txg > *txg) { 3793 *txg = label_txg; 3794 *avd = vd; 3795 } 3796 nvlist_free(label); 3797 } 3798 } 3799 3800 /* 3801 * Import a root pool. 3802 * 3803 * For x86. devpath_list will consist of devid and/or physpath name of 3804 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3805 * The GRUB "findroot" command will return the vdev we should boot. 3806 * 3807 * For Sparc, devpath_list consists the physpath name of the booting device 3808 * no matter the rootpool is a single device pool or a mirrored pool. 3809 * e.g. 3810 * "/pci@1f,0/ide@d/disk@0,0:a" 3811 */ 3812 int 3813 spa_import_rootpool(char *devpath, char *devid) 3814 { 3815 spa_t *spa; 3816 vdev_t *rvd, *bvd, *avd = NULL; 3817 nvlist_t *config, *nvtop; 3818 uint64_t guid, txg; 3819 char *pname; 3820 int error; 3821 3822 /* 3823 * Read the label from the boot device and generate a configuration. 3824 */ 3825 config = spa_generate_rootconf(devpath, devid, &guid); 3826 #if defined(_OBP) && defined(_KERNEL) 3827 if (config == NULL) { 3828 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3829 /* iscsi boot */ 3830 get_iscsi_bootpath_phy(devpath); 3831 config = spa_generate_rootconf(devpath, devid, &guid); 3832 } 3833 } 3834 #endif 3835 if (config == NULL) { 3836 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3837 devpath); 3838 return (SET_ERROR(EIO)); 3839 } 3840 3841 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3842 &pname) == 0); 3843 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3844 3845 mutex_enter(&spa_namespace_lock); 3846 if ((spa = spa_lookup(pname)) != NULL) { 3847 /* 3848 * Remove the existing root pool from the namespace so that we 3849 * can replace it with the correct config we just read in. 3850 */ 3851 spa_remove(spa); 3852 } 3853 3854 spa = spa_add(pname, config, NULL); 3855 spa->spa_is_root = B_TRUE; 3856 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3857 3858 /* 3859 * Build up a vdev tree based on the boot device's label config. 3860 */ 3861 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3862 &nvtop) == 0); 3863 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3864 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3865 VDEV_ALLOC_ROOTPOOL); 3866 spa_config_exit(spa, SCL_ALL, FTAG); 3867 if (error) { 3868 mutex_exit(&spa_namespace_lock); 3869 nvlist_free(config); 3870 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3871 pname); 3872 return (error); 3873 } 3874 3875 /* 3876 * Get the boot vdev. 3877 */ 3878 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3879 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3880 (u_longlong_t)guid); 3881 error = SET_ERROR(ENOENT); 3882 goto out; 3883 } 3884 3885 /* 3886 * Determine if there is a better boot device. 3887 */ 3888 avd = bvd; 3889 spa_alt_rootvdev(rvd, &avd, &txg); 3890 if (avd != bvd) { 3891 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3892 "try booting from '%s'", avd->vdev_path); 3893 error = SET_ERROR(EINVAL); 3894 goto out; 3895 } 3896 3897 /* 3898 * If the boot device is part of a spare vdev then ensure that 3899 * we're booting off the active spare. 3900 */ 3901 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3902 !bvd->vdev_isspare) { 3903 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3904 "try booting from '%s'", 3905 bvd->vdev_parent-> 3906 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3907 error = SET_ERROR(EINVAL); 3908 goto out; 3909 } 3910 3911 error = 0; 3912 out: 3913 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3914 vdev_free(rvd); 3915 spa_config_exit(spa, SCL_ALL, FTAG); 3916 mutex_exit(&spa_namespace_lock); 3917 3918 nvlist_free(config); 3919 return (error); 3920 } 3921 3922 #endif 3923 3924 /* 3925 * Import a non-root pool into the system. 3926 */ 3927 int 3928 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3929 { 3930 spa_t *spa; 3931 char *altroot = NULL; 3932 spa_load_state_t state = SPA_LOAD_IMPORT; 3933 zpool_rewind_policy_t policy; 3934 uint64_t mode = spa_mode_global; 3935 uint64_t readonly = B_FALSE; 3936 int error; 3937 nvlist_t *nvroot; 3938 nvlist_t **spares, **l2cache; 3939 uint_t nspares, nl2cache; 3940 3941 /* 3942 * If a pool with this name exists, return failure. 3943 */ 3944 mutex_enter(&spa_namespace_lock); 3945 if (spa_lookup(pool) != NULL) { 3946 mutex_exit(&spa_namespace_lock); 3947 return (SET_ERROR(EEXIST)); 3948 } 3949 3950 /* 3951 * Create and initialize the spa structure. 3952 */ 3953 (void) nvlist_lookup_string(props, 3954 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3955 (void) nvlist_lookup_uint64(props, 3956 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3957 if (readonly) 3958 mode = FREAD; 3959 spa = spa_add(pool, config, altroot); 3960 spa->spa_import_flags = flags; 3961 3962 /* 3963 * Verbatim import - Take a pool and insert it into the namespace 3964 * as if it had been loaded at boot. 3965 */ 3966 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3967 if (props != NULL) 3968 spa_configfile_set(spa, props, B_FALSE); 3969 3970 spa_config_sync(spa, B_FALSE, B_TRUE); 3971 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 3972 3973 mutex_exit(&spa_namespace_lock); 3974 return (0); 3975 } 3976 3977 spa_activate(spa, mode); 3978 3979 /* 3980 * Don't start async tasks until we know everything is healthy. 3981 */ 3982 spa_async_suspend(spa); 3983 3984 zpool_get_rewind_policy(config, &policy); 3985 if (policy.zrp_request & ZPOOL_DO_REWIND) 3986 state = SPA_LOAD_RECOVER; 3987 3988 /* 3989 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3990 * because the user-supplied config is actually the one to trust when 3991 * doing an import. 3992 */ 3993 if (state != SPA_LOAD_RECOVER) 3994 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3995 3996 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3997 policy.zrp_request); 3998 3999 /* 4000 * Propagate anything learned while loading the pool and pass it 4001 * back to caller (i.e. rewind info, missing devices, etc). 4002 */ 4003 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4004 spa->spa_load_info) == 0); 4005 4006 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4007 /* 4008 * Toss any existing sparelist, as it doesn't have any validity 4009 * anymore, and conflicts with spa_has_spare(). 
4010 */ 4011 if (spa->spa_spares.sav_config) { 4012 nvlist_free(spa->spa_spares.sav_config); 4013 spa->spa_spares.sav_config = NULL; 4014 spa_load_spares(spa); 4015 } 4016 if (spa->spa_l2cache.sav_config) { 4017 nvlist_free(spa->spa_l2cache.sav_config); 4018 spa->spa_l2cache.sav_config = NULL; 4019 spa_load_l2cache(spa); 4020 } 4021 4022 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4023 &nvroot) == 0); 4024 if (error == 0) 4025 error = spa_validate_aux(spa, nvroot, -1ULL, 4026 VDEV_ALLOC_SPARE); 4027 if (error == 0) 4028 error = spa_validate_aux(spa, nvroot, -1ULL, 4029 VDEV_ALLOC_L2CACHE); 4030 spa_config_exit(spa, SCL_ALL, FTAG); 4031 4032 if (props != NULL) 4033 spa_configfile_set(spa, props, B_FALSE); 4034 4035 if (error != 0 || (props && spa_writeable(spa) && 4036 (error = spa_prop_set(spa, props)))) { 4037 spa_unload(spa); 4038 spa_deactivate(spa); 4039 spa_remove(spa); 4040 mutex_exit(&spa_namespace_lock); 4041 return (error); 4042 } 4043 4044 spa_async_resume(spa); 4045 4046 /* 4047 * Override any spares and level 2 cache devices as specified by 4048 * the user, as these may have correct device names/devids, etc. 4049 */ 4050 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4051 &spares, &nspares) == 0) { 4052 if (spa->spa_spares.sav_config) 4053 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4054 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4055 else 4056 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4057 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4058 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4059 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4060 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4061 spa_load_spares(spa); 4062 spa_config_exit(spa, SCL_ALL, FTAG); 4063 spa->spa_spares.sav_sync = B_TRUE; 4064 } 4065 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4066 &l2cache, &nl2cache) == 0) { 4067 if (spa->spa_l2cache.sav_config) 4068 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4069 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4070 else 4071 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4072 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4073 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4074 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4075 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4076 spa_load_l2cache(spa); 4077 spa_config_exit(spa, SCL_ALL, FTAG); 4078 spa->spa_l2cache.sav_sync = B_TRUE; 4079 } 4080 4081 /* 4082 * Check for any removed devices. 4083 */ 4084 if (spa->spa_autoreplace) { 4085 spa_aux_check_removed(&spa->spa_spares); 4086 spa_aux_check_removed(&spa->spa_l2cache); 4087 } 4088 4089 if (spa_writeable(spa)) { 4090 /* 4091 * Update the config cache to include the newly-imported pool. 4092 */ 4093 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4094 } 4095 4096 /* 4097 * It's possible that the pool was expanded while it was exported. 4098 * We kick off an async task to handle this for us. 
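 * (The request is serviced by spa_async_thread(); see
 * spa_async_autoexpand(), which posts an ESC_DEV_DLE sysevent for each
 * leaf vdev's physical path.)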
4099 */ 4100 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4101 4102 spa_history_log_version(spa, "import"); 4103 4104 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4105 4106 mutex_exit(&spa_namespace_lock); 4107 4108 return (0); 4109 } 4110 4111 nvlist_t * 4112 spa_tryimport(nvlist_t *tryconfig) 4113 { 4114 nvlist_t *config = NULL; 4115 char *poolname; 4116 spa_t *spa; 4117 uint64_t state; 4118 int error; 4119 4120 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4121 return (NULL); 4122 4123 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4124 return (NULL); 4125 4126 /* 4127 * Create and initialize the spa structure. 4128 */ 4129 mutex_enter(&spa_namespace_lock); 4130 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4131 spa_activate(spa, FREAD); 4132 4133 /* 4134 * Pass off the heavy lifting to spa_load(). 4135 * Pass TRUE for mosconfig because the user-supplied config 4136 * is actually the one to trust when doing an import. 4137 */ 4138 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4139 4140 /* 4141 * If 'tryconfig' was at least parsable, return the current config. 4142 */ 4143 if (spa->spa_root_vdev != NULL) { 4144 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4145 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4146 poolname) == 0); 4147 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4148 state) == 0); 4149 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4150 spa->spa_uberblock.ub_timestamp) == 0); 4151 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4152 spa->spa_load_info) == 0); 4153 4154 /* 4155 * If the bootfs property exists on this pool then we 4156 * copy it out so that external consumers can tell which 4157 * pools are bootable. 4158 */ 4159 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4160 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4161 4162 /* 4163 * We have to play games with the name since the 4164 * pool was opened as TRYIMPORT_NAME. 4165 */ 4166 if (dsl_dsobj_to_dsname(spa_name(spa), 4167 spa->spa_bootfs, tmpname) == 0) { 4168 char *cp; 4169 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4170 4171 cp = strchr(tmpname, '/'); 4172 if (cp == NULL) { 4173 (void) strlcpy(dsname, tmpname, 4174 MAXPATHLEN); 4175 } else { 4176 (void) snprintf(dsname, MAXPATHLEN, 4177 "%s/%s", poolname, ++cp); 4178 } 4179 VERIFY(nvlist_add_string(config, 4180 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4181 kmem_free(dsname, MAXPATHLEN); 4182 } 4183 kmem_free(tmpname, MAXPATHLEN); 4184 } 4185 4186 /* 4187 * Add the list of hot spares and level 2 cache devices. 4188 */ 4189 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4190 spa_add_spares(spa, config); 4191 spa_add_l2cache(spa, config); 4192 spa_config_exit(spa, SCL_CONFIG, FTAG); 4193 } 4194 4195 spa_unload(spa); 4196 spa_deactivate(spa); 4197 spa_remove(spa); 4198 mutex_exit(&spa_namespace_lock); 4199 4200 return (config); 4201 } 4202 4203 /* 4204 * Pool export/destroy 4205 * 4206 * The act of destroying or exporting a pool is very simple. We make sure there 4207 * is no more pending I/O and any references to the pool are gone. Then, we 4208 * update the pool state and sync all the labels to disk, removing the 4209 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4210 * we don't sync the labels or remove the configuration cache. 
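 *
 * A minimal sketch of how a (hypothetical) caller might drive the public
 * entry points below; error handling is elided and the pool name is
 * illustrative:
 *
 *	nvlist_t *oldconfig = NULL;
 *	int err;
 *
 *	err = spa_export("tank", &oldconfig, B_FALSE, B_FALSE);
 *	if (err == EXDEV)	/* active shared spare; force if desired */
 *		err = spa_export("tank", &oldconfig, B_TRUE, B_FALSE);
 *	nvlist_free(oldconfig);
 *
 * spa_destroy() and spa_reset() are thin wrappers around the same
 * spa_export_common() path, using POOL_STATE_DESTROYED and
 * POOL_STATE_UNINITIALIZED respectively.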
4211 */
4212 static int
4213 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4214 boolean_t force, boolean_t hardforce)
4215 {
4216 spa_t *spa;
4217
4218 if (oldconfig)
4219 *oldconfig = NULL;
4220
4221 if (!(spa_mode_global & FWRITE))
4222 return (SET_ERROR(EROFS));
4223
4224 mutex_enter(&spa_namespace_lock);
4225 if ((spa = spa_lookup(pool)) == NULL) {
4226 mutex_exit(&spa_namespace_lock);
4227 return (SET_ERROR(ENOENT));
4228 }
4229
4230 /*
4231 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4232 * reacquire the namespace lock, and see if we can export.
4233 */
4234 spa_open_ref(spa, FTAG);
4235 mutex_exit(&spa_namespace_lock);
4236 spa_async_suspend(spa);
4237 mutex_enter(&spa_namespace_lock);
4238 spa_close(spa, FTAG);
4239
4240 /*
4241 * The pool will be in core if it's openable,
4242 * in which case we can modify its state.
4243 */
4244 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4245 /*
4246 * Objsets may be open only because they're dirty, so we
4247 * have to force it to sync before checking spa_refcnt.
4248 */
4249 txg_wait_synced(spa->spa_dsl_pool, 0);
4250 spa_evicting_os_wait(spa);
4251
4252 /*
4253 * A pool cannot be exported or destroyed if there are active
4254 * references. If we are resetting a pool, allow references by
4255 * fault injection handlers.
4256 */
4257 if (!spa_refcount_zero(spa) ||
4258 (spa->spa_inject_ref != 0 &&
4259 new_state != POOL_STATE_UNINITIALIZED)) {
4260 spa_async_resume(spa);
4261 mutex_exit(&spa_namespace_lock);
4262 return (SET_ERROR(EBUSY));
4263 }
4264
4265 /*
4266 * A pool cannot be exported if it has an active shared spare.
4267 * This is to prevent other pools stealing the active spare
4268 * from an exported pool. At the user's discretion, such a pool
4269 * can be forcibly exported.
4270 */
4271 if (!force && new_state == POOL_STATE_EXPORTED &&
4272 spa_has_active_shared_spare(spa)) {
4273 spa_async_resume(spa);
4274 mutex_exit(&spa_namespace_lock);
4275 return (SET_ERROR(EXDEV));
4276 }
4277
4278 /*
4279 * We want this to be reflected on every label,
4280 * so mark them all dirty. spa_unload() will do the
4281 * final sync that pushes these changes out.
4282 */
4283 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4284 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4285 spa->spa_state = new_state;
4286 spa->spa_final_txg = spa_last_synced_txg(spa) +
4287 TXG_DEFER_SIZE + 1;
4288 vdev_config_dirty(spa->spa_root_vdev);
4289 spa_config_exit(spa, SCL_ALL, FTAG);
4290 }
4291 }
4292
4293 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4294
4295 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4296 spa_unload(spa);
4297 spa_deactivate(spa);
4298 }
4299
4300 if (oldconfig && spa->spa_config)
4301 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4302
4303 if (new_state != POOL_STATE_UNINITIALIZED) {
4304 if (!hardforce)
4305 spa_config_sync(spa, B_TRUE, B_TRUE);
4306 spa_remove(spa);
4307 }
4308 mutex_exit(&spa_namespace_lock);
4309
4310 return (0);
4311 }
4312
4313 /*
4314 * Destroy a storage pool.
4315 */
4316 int
4317 spa_destroy(char *pool)
4318 {
4319 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4320 B_FALSE, B_FALSE));
4321 }
4322
4323 /*
4324 * Export a storage pool. 
4325 */ 4326 int 4327 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4328 boolean_t hardforce) 4329 { 4330 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4331 force, hardforce)); 4332 } 4333 4334 /* 4335 * Similar to spa_export(), this unloads the spa_t without actually removing it 4336 * from the namespace in any way. 4337 */ 4338 int 4339 spa_reset(char *pool) 4340 { 4341 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4342 B_FALSE, B_FALSE)); 4343 } 4344 4345 /* 4346 * ========================================================================== 4347 * Device manipulation 4348 * ========================================================================== 4349 */ 4350 4351 /* 4352 * Add a device to a storage pool. 4353 */ 4354 int 4355 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4356 { 4357 uint64_t txg, id; 4358 int error; 4359 vdev_t *rvd = spa->spa_root_vdev; 4360 vdev_t *vd, *tvd; 4361 nvlist_t **spares, **l2cache; 4362 uint_t nspares, nl2cache; 4363 4364 ASSERT(spa_writeable(spa)); 4365 4366 txg = spa_vdev_enter(spa); 4367 4368 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4369 VDEV_ALLOC_ADD)) != 0) 4370 return (spa_vdev_exit(spa, NULL, txg, error)); 4371 4372 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4373 4374 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4375 &nspares) != 0) 4376 nspares = 0; 4377 4378 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4379 &nl2cache) != 0) 4380 nl2cache = 0; 4381 4382 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4383 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4384 4385 if (vd->vdev_children != 0 && 4386 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4387 return (spa_vdev_exit(spa, vd, txg, error)); 4388 4389 /* 4390 * We must validate the spares and l2cache devices after checking the 4391 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4392 */ 4393 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4394 return (spa_vdev_exit(spa, vd, txg, error)); 4395 4396 /* 4397 * Transfer each new top-level vdev from vd to rvd. 4398 */ 4399 for (int c = 0; c < vd->vdev_children; c++) { 4400 4401 /* 4402 * Set the vdev id to the first hole, if one exists. 4403 */ 4404 for (id = 0; id < rvd->vdev_children; id++) { 4405 if (rvd->vdev_child[id]->vdev_ishole) { 4406 vdev_free(rvd->vdev_child[id]); 4407 break; 4408 } 4409 } 4410 tvd = vd->vdev_child[c]; 4411 vdev_remove_child(vd, tvd); 4412 tvd->vdev_id = id; 4413 vdev_add_child(rvd, tvd); 4414 vdev_config_dirty(tvd); 4415 } 4416 4417 if (nspares != 0) { 4418 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4419 ZPOOL_CONFIG_SPARES); 4420 spa_load_spares(spa); 4421 spa->spa_spares.sav_sync = B_TRUE; 4422 } 4423 4424 if (nl2cache != 0) { 4425 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4426 ZPOOL_CONFIG_L2CACHE); 4427 spa_load_l2cache(spa); 4428 spa->spa_l2cache.sav_sync = B_TRUE; 4429 } 4430 4431 /* 4432 * We have to be careful when adding new vdevs to an existing pool. 4433 * If other threads start allocating from these vdevs before we 4434 * sync the config cache, and we lose power, then upon reboot we may 4435 * fail to open the pool because there are DVAs that the config cache 4436 * can't translate. Therefore, we first add the vdevs without 4437 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4438 * and then let spa_config_update() initialize the new metaslabs. 
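 * (Concretely: spa_vdev_exit() below syncs a config in which the new
 * top-level vdevs exist but have no metaslab array yet, and the
 * spa_config_update() call that follows creates the metaslabs and syncs
 * the final config.)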
4439 * 4440 * spa_load() checks for added-but-not-initialized vdevs, so that 4441 * if we lose power at any point in this sequence, the remaining 4442 * steps will be completed the next time we load the pool. 4443 */ 4444 (void) spa_vdev_exit(spa, vd, txg, 0); 4445 4446 mutex_enter(&spa_namespace_lock); 4447 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4448 spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); 4449 mutex_exit(&spa_namespace_lock); 4450 4451 return (0); 4452 } 4453 4454 /* 4455 * Attach a device to a mirror. The arguments are the path to any device 4456 * in the mirror, and the nvroot for the new device. If the path specifies 4457 * a device that is not mirrored, we automatically insert the mirror vdev. 4458 * 4459 * If 'replacing' is specified, the new device is intended to replace the 4460 * existing device; in this case the two devices are made into their own 4461 * mirror using the 'replacing' vdev, which is functionally identical to 4462 * the mirror vdev (it actually reuses all the same ops) but has a few 4463 * extra rules: you can't attach to it after it's been created, and upon 4464 * completion of resilvering, the first disk (the one being replaced) 4465 * is automatically detached. 4466 */ 4467 int 4468 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4469 { 4470 uint64_t txg, dtl_max_txg; 4471 vdev_t *rvd = spa->spa_root_vdev; 4472 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4473 vdev_ops_t *pvops; 4474 char *oldvdpath, *newvdpath; 4475 int newvd_isspare; 4476 int error; 4477 4478 ASSERT(spa_writeable(spa)); 4479 4480 txg = spa_vdev_enter(spa); 4481 4482 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4483 4484 if (oldvd == NULL) 4485 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4486 4487 if (!oldvd->vdev_ops->vdev_op_leaf) 4488 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4489 4490 pvd = oldvd->vdev_parent; 4491 4492 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4493 VDEV_ALLOC_ATTACH)) != 0) 4494 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4495 4496 if (newrootvd->vdev_children != 1) 4497 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4498 4499 newvd = newrootvd->vdev_child[0]; 4500 4501 if (!newvd->vdev_ops->vdev_op_leaf) 4502 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4503 4504 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4505 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4506 4507 /* 4508 * Spares can't replace logs 4509 */ 4510 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4511 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4512 4513 if (!replacing) { 4514 /* 4515 * For attach, the only allowable parent is a mirror or the root 4516 * vdev. 4517 */ 4518 if (pvd->vdev_ops != &vdev_mirror_ops && 4519 pvd->vdev_ops != &vdev_root_ops) 4520 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4521 4522 pvops = &vdev_mirror_ops; 4523 } else { 4524 /* 4525 * Active hot spares can only be replaced by inactive hot 4526 * spares. 4527 */ 4528 if (pvd->vdev_ops == &vdev_spare_ops && 4529 oldvd->vdev_isspare && 4530 !spa_has_spare(spa, newvd->vdev_guid)) 4531 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4532 4533 /* 4534 * If the source is a hot spare, and the parent isn't already a 4535 * spare, then we want to create a new hot spare. Otherwise, we 4536 * want to create a replacing vdev. 
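 * (For example, replacing a failed leaf with one of the pool's inactive
 * hot spares yields a spare parent vdev, while replacing it with an
 * ordinary new disk yields a replacing parent vdev.)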
The user is not allowed to 4537 * attach to a spared vdev child unless the 'isspare' state is 4538 * the same (spare replaces spare, non-spare replaces 4539 * non-spare). 4540 */ 4541 if (pvd->vdev_ops == &vdev_replacing_ops && 4542 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4543 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4544 } else if (pvd->vdev_ops == &vdev_spare_ops && 4545 newvd->vdev_isspare != oldvd->vdev_isspare) { 4546 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4547 } 4548 4549 if (newvd->vdev_isspare) 4550 pvops = &vdev_spare_ops; 4551 else 4552 pvops = &vdev_replacing_ops; 4553 } 4554 4555 /* 4556 * Make sure the new device is big enough. 4557 */ 4558 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4559 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4560 4561 /* 4562 * The new device cannot have a higher alignment requirement 4563 * than the top-level vdev. 4564 */ 4565 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4566 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4567 4568 /* 4569 * If this is an in-place replacement, update oldvd's path and devid 4570 * to make it distinguishable from newvd, and unopenable from now on. 4571 */ 4572 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4573 spa_strfree(oldvd->vdev_path); 4574 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4575 KM_SLEEP); 4576 (void) sprintf(oldvd->vdev_path, "%s/%s", 4577 newvd->vdev_path, "old"); 4578 if (oldvd->vdev_devid != NULL) { 4579 spa_strfree(oldvd->vdev_devid); 4580 oldvd->vdev_devid = NULL; 4581 } 4582 } 4583 4584 /* mark the device being resilvered */ 4585 newvd->vdev_resilver_txg = txg; 4586 4587 /* 4588 * If the parent is not a mirror, or if we're replacing, insert the new 4589 * mirror/replacing/spare vdev above oldvd. 4590 */ 4591 if (pvd->vdev_ops != pvops) 4592 pvd = vdev_add_parent(oldvd, pvops); 4593 4594 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4595 ASSERT(pvd->vdev_ops == pvops); 4596 ASSERT(oldvd->vdev_parent == pvd); 4597 4598 /* 4599 * Extract the new device from its root and add it to pvd. 4600 */ 4601 vdev_remove_child(newrootvd, newvd); 4602 newvd->vdev_id = pvd->vdev_children; 4603 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4604 vdev_add_child(pvd, newvd); 4605 4606 tvd = newvd->vdev_top; 4607 ASSERT(pvd->vdev_top == tvd); 4608 ASSERT(tvd->vdev_parent == rvd); 4609 4610 vdev_config_dirty(tvd); 4611 4612 /* 4613 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4614 * for any dmu_sync-ed blocks. It will propagate upward when 4615 * spa_vdev_exit() calls vdev_dtl_reassess(). 4616 */ 4617 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4618 4619 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4620 dtl_max_txg - TXG_INITIAL); 4621 4622 if (newvd->vdev_isspare) { 4623 spa_spare_activate(newvd); 4624 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4625 } 4626 4627 oldvdpath = spa_strdup(oldvd->vdev_path); 4628 newvdpath = spa_strdup(newvd->vdev_path); 4629 newvd_isspare = newvd->vdev_isspare; 4630 4631 /* 4632 * Mark newvd's DTL dirty in this txg. 4633 */ 4634 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4635 4636 /* 4637 * Schedule the resilver to restart in the future. We do this to 4638 * ensure that dmu_sync-ed blocks have been stitched into the 4639 * respective datasets. 
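 * The restart is deferred until dtl_max_txg (txg + TXG_CONCURRENT_STATES),
 * the same upper bound used for newvd's DTL above, so no txg that might
 * still carry dmu_sync-ed blocks is left out of the resilver.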
4640 */ 4641 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4642 4643 if (spa->spa_bootfs) 4644 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4645 4646 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); 4647 4648 /* 4649 * Commit the config 4650 */ 4651 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4652 4653 spa_history_log_internal(spa, "vdev attach", NULL, 4654 "%s vdev=%s %s vdev=%s", 4655 replacing && newvd_isspare ? "spare in" : 4656 replacing ? "replace" : "attach", newvdpath, 4657 replacing ? "for" : "to", oldvdpath); 4658 4659 spa_strfree(oldvdpath); 4660 spa_strfree(newvdpath); 4661 4662 return (0); 4663 } 4664 4665 /* 4666 * Detach a device from a mirror or replacing vdev. 4667 * 4668 * If 'replace_done' is specified, only detach if the parent 4669 * is a replacing vdev. 4670 */ 4671 int 4672 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4673 { 4674 uint64_t txg; 4675 int error; 4676 vdev_t *rvd = spa->spa_root_vdev; 4677 vdev_t *vd, *pvd, *cvd, *tvd; 4678 boolean_t unspare = B_FALSE; 4679 uint64_t unspare_guid = 0; 4680 char *vdpath; 4681 4682 ASSERT(spa_writeable(spa)); 4683 4684 txg = spa_vdev_enter(spa); 4685 4686 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4687 4688 if (vd == NULL) 4689 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4690 4691 if (!vd->vdev_ops->vdev_op_leaf) 4692 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4693 4694 pvd = vd->vdev_parent; 4695 4696 /* 4697 * If the parent/child relationship is not as expected, don't do it. 4698 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4699 * vdev that's replacing B with C. The user's intent in replacing 4700 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4701 * the replace by detaching C, the expected behavior is to end up 4702 * M(A,B). But suppose that right after deciding to detach C, 4703 * the replacement of B completes. We would have M(A,C), and then 4704 * ask to detach C, which would leave us with just A -- not what 4705 * the user wanted. To prevent this, we make sure that the 4706 * parent/child relationship hasn't changed -- in this example, 4707 * that C's parent is still the replacing vdev R. 4708 */ 4709 if (pvd->vdev_guid != pguid && pguid != 0) 4710 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4711 4712 /* 4713 * Only 'replacing' or 'spare' vdevs can be replaced. 4714 */ 4715 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4716 pvd->vdev_ops != &vdev_spare_ops) 4717 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4718 4719 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4720 spa_version(spa) >= SPA_VERSION_SPARES); 4721 4722 /* 4723 * Only mirror, replacing, and spare vdevs support detach. 4724 */ 4725 if (pvd->vdev_ops != &vdev_replacing_ops && 4726 pvd->vdev_ops != &vdev_mirror_ops && 4727 pvd->vdev_ops != &vdev_spare_ops) 4728 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4729 4730 /* 4731 * If this device has the only valid copy of some data, 4732 * we cannot safely detach it. 4733 */ 4734 if (vdev_dtl_required(vd)) 4735 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4736 4737 ASSERT(pvd->vdev_children >= 2); 4738 4739 /* 4740 * If we are detaching the second disk from a replacing vdev, then 4741 * check to see if we changed the original vdev's path to have "/old" 4742 * at the end in spa_vdev_attach(). If so, undo that change now. 
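 * For example, an in-place replacement leaves the original leaf at a path
 * such as "/dev/dsk/c1t0d0s0/old" (see spa_vdev_attach()); cancelling the
 * replacement by detaching the new disk restores the survivor's original
 * path here.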
4743 */ 4744 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4745 vd->vdev_path != NULL) { 4746 size_t len = strlen(vd->vdev_path); 4747 4748 for (int c = 0; c < pvd->vdev_children; c++) { 4749 cvd = pvd->vdev_child[c]; 4750 4751 if (cvd == vd || cvd->vdev_path == NULL) 4752 continue; 4753 4754 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4755 strcmp(cvd->vdev_path + len, "/old") == 0) { 4756 spa_strfree(cvd->vdev_path); 4757 cvd->vdev_path = spa_strdup(vd->vdev_path); 4758 break; 4759 } 4760 } 4761 } 4762 4763 /* 4764 * If we are detaching the original disk from a spare, then it implies 4765 * that the spare should become a real disk, and be removed from the 4766 * active spare list for the pool. 4767 */ 4768 if (pvd->vdev_ops == &vdev_spare_ops && 4769 vd->vdev_id == 0 && 4770 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4771 unspare = B_TRUE; 4772 4773 /* 4774 * Erase the disk labels so the disk can be used for other things. 4775 * This must be done after all other error cases are handled, 4776 * but before we disembowel vd (so we can still do I/O to it). 4777 * But if we can't do it, don't treat the error as fatal -- 4778 * it may be that the unwritability of the disk is the reason 4779 * it's being detached! 4780 */ 4781 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4782 4783 /* 4784 * Remove vd from its parent and compact the parent's children. 4785 */ 4786 vdev_remove_child(pvd, vd); 4787 vdev_compact_children(pvd); 4788 4789 /* 4790 * Remember one of the remaining children so we can get tvd below. 4791 */ 4792 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4793 4794 /* 4795 * If we need to remove the remaining child from the list of hot spares, 4796 * do it now, marking the vdev as no longer a spare in the process. 4797 * We must do this before vdev_remove_parent(), because that can 4798 * change the GUID if it creates a new toplevel GUID. For a similar 4799 * reason, we must remove the spare now, in the same txg as the detach; 4800 * otherwise someone could attach a new sibling, change the GUID, and 4801 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4802 */ 4803 if (unspare) { 4804 ASSERT(cvd->vdev_isspare); 4805 spa_spare_remove(cvd); 4806 unspare_guid = cvd->vdev_guid; 4807 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4808 cvd->vdev_unspare = B_TRUE; 4809 } 4810 4811 /* 4812 * If the parent mirror/replacing vdev only has one child, 4813 * the parent is no longer needed. Remove it from the tree. 4814 */ 4815 if (pvd->vdev_children == 1) { 4816 if (pvd->vdev_ops == &vdev_spare_ops) 4817 cvd->vdev_unspare = B_FALSE; 4818 vdev_remove_parent(cvd); 4819 } 4820 4821 4822 /* 4823 * We don't set tvd until now because the parent we just removed 4824 * may have been the previous top-level vdev. 4825 */ 4826 tvd = cvd->vdev_top; 4827 ASSERT(tvd->vdev_parent == rvd); 4828 4829 /* 4830 * Reevaluate the parent vdev state. 4831 */ 4832 vdev_propagate_state(cvd); 4833 4834 /* 4835 * If the 'autoexpand' property is set on the pool then automatically 4836 * try to expand the size of the pool. For example if the device we 4837 * just detached was smaller than the others, it may be possible to 4838 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4839 * first so that we can obtain the updated sizes of the leaf vdevs. 4840 */ 4841 if (spa->spa_autoexpand) { 4842 vdev_reopen(tvd); 4843 vdev_expand(tvd, txg); 4844 } 4845 4846 vdev_config_dirty(tvd); 4847 4848 /* 4849 * Mark vd's DTL as dirty in this txg. 
vdev_dtl_sync() will see that 4850 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4851 * But first make sure we're not on any *other* txg's DTL list, to 4852 * prevent vd from being accessed after it's freed. 4853 */ 4854 vdpath = spa_strdup(vd->vdev_path); 4855 for (int t = 0; t < TXG_SIZE; t++) 4856 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4857 vd->vdev_detached = B_TRUE; 4858 vdev_dirty(tvd, VDD_DTL, vd, txg); 4859 4860 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4861 4862 /* hang on to the spa before we release the lock */ 4863 spa_open_ref(spa, FTAG); 4864 4865 error = spa_vdev_exit(spa, vd, txg, 0); 4866 4867 spa_history_log_internal(spa, "detach", NULL, 4868 "vdev=%s", vdpath); 4869 spa_strfree(vdpath); 4870 4871 /* 4872 * If this was the removal of the original device in a hot spare vdev, 4873 * then we want to go through and remove the device from the hot spare 4874 * list of every other pool. 4875 */ 4876 if (unspare) { 4877 spa_t *altspa = NULL; 4878 4879 mutex_enter(&spa_namespace_lock); 4880 while ((altspa = spa_next(altspa)) != NULL) { 4881 if (altspa->spa_state != POOL_STATE_ACTIVE || 4882 altspa == spa) 4883 continue; 4884 4885 spa_open_ref(altspa, FTAG); 4886 mutex_exit(&spa_namespace_lock); 4887 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4888 mutex_enter(&spa_namespace_lock); 4889 spa_close(altspa, FTAG); 4890 } 4891 mutex_exit(&spa_namespace_lock); 4892 4893 /* search the rest of the vdevs for spares to remove */ 4894 spa_vdev_resilver_done(spa); 4895 } 4896 4897 /* all done with the spa; OK to release */ 4898 mutex_enter(&spa_namespace_lock); 4899 spa_close(spa, FTAG); 4900 mutex_exit(&spa_namespace_lock); 4901 4902 return (error); 4903 } 4904 4905 /* 4906 * Split a set of devices from their mirrors, and create a new pool from them. 
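 *
 * The caller-supplied 'config' is expected to name exactly one healthy
 * leaf per top-level mirror (logs and holes are skipped), identified by
 * guid. A minimal sketch of its vdev tree, with purely hypothetical guids:
 *
 *	vdev_tree
 *	    children[0]			<- one entry per top-level mirror
 *		guid: 0x1111		<- the leaf to move to the new pool
 *	    children[1]
 *		guid: 0x2222
 *
 * On success the selected leaves are detached from 'spa' and become the
 * top-level vdevs of the new pool 'newname'; if 'exp' is set the new pool
 * is immediately exported.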
4907 */ 4908 int 4909 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4910 nvlist_t *props, boolean_t exp) 4911 { 4912 int error = 0; 4913 uint64_t txg, *glist; 4914 spa_t *newspa; 4915 uint_t c, children, lastlog; 4916 nvlist_t **child, *nvl, *tmp; 4917 dmu_tx_t *tx; 4918 char *altroot = NULL; 4919 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4920 boolean_t activate_slog; 4921 4922 ASSERT(spa_writeable(spa)); 4923 4924 txg = spa_vdev_enter(spa); 4925 4926 /* clear the log and flush everything up to now */ 4927 activate_slog = spa_passivate_log(spa); 4928 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4929 error = spa_offline_log(spa); 4930 txg = spa_vdev_config_enter(spa); 4931 4932 if (activate_slog) 4933 spa_activate_log(spa); 4934 4935 if (error != 0) 4936 return (spa_vdev_exit(spa, NULL, txg, error)); 4937 4938 /* check new spa name before going any further */ 4939 if (spa_lookup(newname) != NULL) 4940 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4941 4942 /* 4943 * scan through all the children to ensure they're all mirrors 4944 */ 4945 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4946 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4947 &children) != 0) 4948 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4949 4950 /* first, check to ensure we've got the right child count */ 4951 rvd = spa->spa_root_vdev; 4952 lastlog = 0; 4953 for (c = 0; c < rvd->vdev_children; c++) { 4954 vdev_t *vd = rvd->vdev_child[c]; 4955 4956 /* don't count the holes & logs as children */ 4957 if (vd->vdev_islog || vd->vdev_ishole) { 4958 if (lastlog == 0) 4959 lastlog = c; 4960 continue; 4961 } 4962 4963 lastlog = 0; 4964 } 4965 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4966 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4967 4968 /* next, ensure no spare or cache devices are part of the split */ 4969 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4970 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4971 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4972 4973 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4974 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4975 4976 /* then, loop over each vdev and validate it */ 4977 for (c = 0; c < children; c++) { 4978 uint64_t is_hole = 0; 4979 4980 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4981 &is_hole); 4982 4983 if (is_hole != 0) { 4984 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4985 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4986 continue; 4987 } else { 4988 error = SET_ERROR(EINVAL); 4989 break; 4990 } 4991 } 4992 4993 /* which disk is going to be split? 
*/ 4994 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4995 &glist[c]) != 0) { 4996 error = SET_ERROR(EINVAL); 4997 break; 4998 } 4999 5000 /* look it up in the spa */ 5001 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5002 if (vml[c] == NULL) { 5003 error = SET_ERROR(ENODEV); 5004 break; 5005 } 5006 5007 /* make sure there's nothing stopping the split */ 5008 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5009 vml[c]->vdev_islog || 5010 vml[c]->vdev_ishole || 5011 vml[c]->vdev_isspare || 5012 vml[c]->vdev_isl2cache || 5013 !vdev_writeable(vml[c]) || 5014 vml[c]->vdev_children != 0 || 5015 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5016 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5017 error = SET_ERROR(EINVAL); 5018 break; 5019 } 5020 5021 if (vdev_dtl_required(vml[c])) { 5022 error = SET_ERROR(EBUSY); 5023 break; 5024 } 5025 5026 /* we need certain info from the top level */ 5027 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5028 vml[c]->vdev_top->vdev_ms_array) == 0); 5029 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5030 vml[c]->vdev_top->vdev_ms_shift) == 0); 5031 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5032 vml[c]->vdev_top->vdev_asize) == 0); 5033 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5034 vml[c]->vdev_top->vdev_ashift) == 0); 5035 } 5036 5037 if (error != 0) { 5038 kmem_free(vml, children * sizeof (vdev_t *)); 5039 kmem_free(glist, children * sizeof (uint64_t)); 5040 return (spa_vdev_exit(spa, NULL, txg, error)); 5041 } 5042 5043 /* stop writers from using the disks */ 5044 for (c = 0; c < children; c++) { 5045 if (vml[c] != NULL) 5046 vml[c]->vdev_offline = B_TRUE; 5047 } 5048 vdev_reopen(spa->spa_root_vdev); 5049 5050 /* 5051 * Temporarily record the splitting vdevs in the spa config. This 5052 * will disappear once the config is regenerated. 5053 */ 5054 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5055 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5056 glist, children) == 0); 5057 kmem_free(glist, children * sizeof (uint64_t)); 5058 5059 mutex_enter(&spa->spa_props_lock); 5060 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5061 nvl) == 0); 5062 mutex_exit(&spa->spa_props_lock); 5063 spa->spa_config_splitting = nvl; 5064 vdev_config_dirty(spa->spa_root_vdev); 5065 5066 /* configure and create the new pool */ 5067 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5068 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5069 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5070 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5071 spa_version(spa)) == 0); 5072 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5073 spa->spa_config_txg) == 0); 5074 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5075 spa_generate_guid(NULL)) == 0); 5076 (void) nvlist_lookup_string(props, 5077 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5078 5079 /* add the new pool to the namespace */ 5080 newspa = spa_add(newname, config, altroot); 5081 newspa->spa_config_txg = spa->spa_config_txg; 5082 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5083 5084 /* release the spa config lock, retaining the namespace lock */ 5085 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5086 5087 if (zio_injection_enabled) 5088 zio_handle_panic_injection(spa, FTAG, 1); 5089 5090 spa_activate(newspa, spa_mode_global); 5091 spa_async_suspend(newspa); 5092 5093 /* create the new pool from the disks of the original pool */ 5094 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5095 if (error) 5096 goto out; 5097 5098 /* if that worked, generate a real config for the new pool */ 5099 if (newspa->spa_root_vdev != NULL) { 5100 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5101 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5102 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5103 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5104 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5105 B_TRUE)); 5106 } 5107 5108 /* set the props */ 5109 if (props != NULL) { 5110 spa_configfile_set(newspa, props, B_FALSE); 5111 error = spa_prop_set(newspa, props); 5112 if (error) 5113 goto out; 5114 } 5115 5116 /* flush everything */ 5117 txg = spa_vdev_config_enter(newspa); 5118 vdev_config_dirty(newspa->spa_root_vdev); 5119 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5120 5121 if (zio_injection_enabled) 5122 zio_handle_panic_injection(spa, FTAG, 2); 5123 5124 spa_async_resume(newspa); 5125 5126 /* finally, update the original pool's config */ 5127 txg = spa_vdev_config_enter(spa); 5128 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5129 error = dmu_tx_assign(tx, TXG_WAIT); 5130 if (error != 0) 5131 dmu_tx_abort(tx); 5132 for (c = 0; c < children; c++) { 5133 if (vml[c] != NULL) { 5134 vdev_split(vml[c]); 5135 if (error == 0) 5136 spa_history_log_internal(spa, "detach", tx, 5137 "vdev=%s", vml[c]->vdev_path); 5138 vdev_free(vml[c]); 5139 } 5140 } 5141 vdev_config_dirty(spa->spa_root_vdev); 5142 spa->spa_config_splitting = NULL; 5143 nvlist_free(nvl); 5144 if (error == 0) 5145 dmu_tx_commit(tx); 5146 (void) spa_vdev_exit(spa, NULL, txg, 0); 5147 5148 if (zio_injection_enabled) 5149 zio_handle_panic_injection(spa, FTAG, 3); 5150 5151 /* split is complete; log a history record */ 5152 spa_history_log_internal(newspa, "split", NULL, 5153 "from pool %s", spa_name(spa)); 5154 5155 kmem_free(vml, children * sizeof (vdev_t *)); 5156 5157 /* if we're not going to mount the filesystems in userland, export */ 5158 if (exp) 5159 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5160 B_FALSE, B_FALSE); 5161 5162 return (error); 5163 5164 out: 5165 spa_unload(newspa); 5166 spa_deactivate(newspa); 5167 spa_remove(newspa); 5168 5169 txg = spa_vdev_config_enter(spa); 5170 5171 /* re-online all offlined disks */ 5172 for (c = 0; c < children; c++) { 5173 if (vml[c] != NULL) 5174 vml[c]->vdev_offline = B_FALSE; 5175 } 5176 vdev_reopen(spa->spa_root_vdev); 5177 5178 nvlist_free(spa->spa_config_splitting); 5179 
spa->spa_config_splitting = NULL; 5180 (void) spa_vdev_exit(spa, NULL, txg, error); 5181 5182 kmem_free(vml, children * sizeof (vdev_t *)); 5183 return (error); 5184 } 5185 5186 static nvlist_t * 5187 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5188 { 5189 for (int i = 0; i < count; i++) { 5190 uint64_t guid; 5191 5192 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5193 &guid) == 0); 5194 5195 if (guid == target_guid) 5196 return (nvpp[i]); 5197 } 5198 5199 return (NULL); 5200 } 5201 5202 static void 5203 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5204 nvlist_t *dev_to_remove) 5205 { 5206 nvlist_t **newdev = NULL; 5207 5208 if (count > 1) 5209 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5210 5211 for (int i = 0, j = 0; i < count; i++) { 5212 if (dev[i] == dev_to_remove) 5213 continue; 5214 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5215 } 5216 5217 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5218 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5219 5220 for (int i = 0; i < count - 1; i++) 5221 nvlist_free(newdev[i]); 5222 5223 if (count > 1) 5224 kmem_free(newdev, (count - 1) * sizeof (void *)); 5225 } 5226 5227 /* 5228 * Evacuate the device. 5229 */ 5230 static int 5231 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5232 { 5233 uint64_t txg; 5234 int error = 0; 5235 5236 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5237 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5238 ASSERT(vd == vd->vdev_top); 5239 5240 /* 5241 * Evacuate the device. We don't hold the config lock as writer 5242 * since we need to do I/O but we do keep the 5243 * spa_namespace_lock held. Once this completes the device 5244 * should no longer have any blocks allocated on it. 5245 */ 5246 if (vd->vdev_islog) { 5247 if (vd->vdev_stat.vs_alloc != 0) 5248 error = spa_offline_log(spa); 5249 } else { 5250 error = SET_ERROR(ENOTSUP); 5251 } 5252 5253 if (error) 5254 return (error); 5255 5256 /* 5257 * The evacuation succeeded. Remove any remaining MOS metadata 5258 * associated with this vdev, and wait for these changes to sync. 5259 */ 5260 ASSERT0(vd->vdev_stat.vs_alloc); 5261 txg = spa_vdev_config_enter(spa); 5262 vd->vdev_removing = B_TRUE; 5263 vdev_dirty_leaves(vd, VDD_DTL, txg); 5264 vdev_config_dirty(vd); 5265 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5266 5267 return (0); 5268 } 5269 5270 /* 5271 * Complete the removal by cleaning up the namespace. 5272 */ 5273 static void 5274 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5275 { 5276 vdev_t *rvd = spa->spa_root_vdev; 5277 uint64_t id = vd->vdev_id; 5278 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5279 5280 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5281 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5282 ASSERT(vd == vd->vdev_top); 5283 5284 /* 5285 * Only remove any devices which are empty. 5286 */ 5287 if (vd->vdev_stat.vs_alloc != 0) 5288 return; 5289 5290 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5291 5292 if (list_link_active(&vd->vdev_state_dirty_node)) 5293 vdev_state_clean(vd); 5294 if (list_link_active(&vd->vdev_config_dirty_node)) 5295 vdev_config_clean(vd); 5296 5297 vdev_free(vd); 5298 5299 if (last_vdev) { 5300 vdev_compact_children(rvd); 5301 } else { 5302 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5303 vdev_add_child(rvd, vd); 5304 } 5305 vdev_config_dirty(rvd); 5306 5307 /* 5308 * Reassess the health of our root vdev. 
5309 */ 5310 vdev_reopen(rvd); 5311 } 5312 5313 /* 5314 * Remove a device from the pool - 5315 * 5316 * Removing a device from the vdev namespace requires several steps 5317 * and can take a significant amount of time. As a result we use 5318 * the spa_vdev_config_[enter/exit] functions which allow us to 5319 * grab and release the spa_config_lock while still holding the namespace 5320 * lock. During each step the configuration is synced out. 5321 * 5322 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5323 * devices. 5324 */ 5325 int 5326 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5327 { 5328 vdev_t *vd; 5329 metaslab_group_t *mg; 5330 nvlist_t **spares, **l2cache, *nv; 5331 uint64_t txg = 0; 5332 uint_t nspares, nl2cache; 5333 int error = 0; 5334 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5335 5336 ASSERT(spa_writeable(spa)); 5337 5338 if (!locked) 5339 txg = spa_vdev_enter(spa); 5340 5341 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5342 5343 if (spa->spa_spares.sav_vdevs != NULL && 5344 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5345 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5346 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5347 /* 5348 * Only remove the hot spare if it's not currently in use 5349 * in this pool. 5350 */ 5351 if (vd == NULL || unspare) { 5352 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5353 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5354 spa_load_spares(spa); 5355 spa->spa_spares.sav_sync = B_TRUE; 5356 } else { 5357 error = SET_ERROR(EBUSY); 5358 } 5359 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5360 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5361 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5362 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5363 /* 5364 * Cache devices can always be removed. 5365 */ 5366 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5367 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5368 spa_load_l2cache(spa); 5369 spa->spa_l2cache.sav_sync = B_TRUE; 5370 } else if (vd != NULL && vd->vdev_islog) { 5371 ASSERT(!locked); 5372 ASSERT(vd == vd->vdev_top); 5373 5374 mg = vd->vdev_mg; 5375 5376 /* 5377 * Stop allocating from this vdev. 5378 */ 5379 metaslab_group_passivate(mg); 5380 5381 /* 5382 * Wait for the youngest allocations and frees to sync, 5383 * and then wait for the deferral of those frees to finish. 5384 */ 5385 spa_vdev_config_exit(spa, NULL, 5386 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5387 5388 /* 5389 * Attempt to evacuate the vdev. 5390 */ 5391 error = spa_vdev_remove_evacuate(spa, vd); 5392 5393 txg = spa_vdev_config_enter(spa); 5394 5395 /* 5396 * If we couldn't evacuate the vdev, unwind. 5397 */ 5398 if (error) { 5399 metaslab_group_activate(mg); 5400 return (spa_vdev_exit(spa, NULL, txg, error)); 5401 } 5402 5403 /* 5404 * Clean up the vdev namespace. 5405 */ 5406 spa_vdev_remove_from_namespace(spa, vd); 5407 5408 } else if (vd != NULL) { 5409 /* 5410 * Normal vdevs cannot be removed (yet). 5411 */ 5412 error = SET_ERROR(ENOTSUP); 5413 } else { 5414 /* 5415 * There is no vdev of any kind with the specified guid. 5416 */ 5417 error = SET_ERROR(ENOENT); 5418 } 5419 5420 if (!locked) 5421 return (spa_vdev_exit(spa, NULL, txg, error)); 5422 5423 return (error); 5424 } 5425 5426 /* 5427 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5428 * currently spared, so we can detach it. 
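 * For example, in a mirror M(A, R(B,C)) where C has finished resilvering,
 * the hunt returns B (the first child of the replacing vdev) so that
 * spa_vdev_resilver_done() can detach it, leaving M(A, C).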
5429 */ 5430 static vdev_t * 5431 spa_vdev_resilver_done_hunt(vdev_t *vd) 5432 { 5433 vdev_t *newvd, *oldvd; 5434 5435 for (int c = 0; c < vd->vdev_children; c++) { 5436 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5437 if (oldvd != NULL) 5438 return (oldvd); 5439 } 5440 5441 /* 5442 * Check for a completed replacement. We always consider the first 5443 * vdev in the list to be the oldest vdev, and the last one to be 5444 * the newest (see spa_vdev_attach() for how that works). In 5445 * the case where the newest vdev is faulted, we will not automatically 5446 * remove it after a resilver completes. This is OK as it will require 5447 * user intervention to determine which disk the admin wishes to keep. 5448 */ 5449 if (vd->vdev_ops == &vdev_replacing_ops) { 5450 ASSERT(vd->vdev_children > 1); 5451 5452 newvd = vd->vdev_child[vd->vdev_children - 1]; 5453 oldvd = vd->vdev_child[0]; 5454 5455 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5456 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5457 !vdev_dtl_required(oldvd)) 5458 return (oldvd); 5459 } 5460 5461 /* 5462 * Check for a completed resilver with the 'unspare' flag set. 5463 */ 5464 if (vd->vdev_ops == &vdev_spare_ops) { 5465 vdev_t *first = vd->vdev_child[0]; 5466 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5467 5468 if (last->vdev_unspare) { 5469 oldvd = first; 5470 newvd = last; 5471 } else if (first->vdev_unspare) { 5472 oldvd = last; 5473 newvd = first; 5474 } else { 5475 oldvd = NULL; 5476 } 5477 5478 if (oldvd != NULL && 5479 vdev_dtl_empty(newvd, DTL_MISSING) && 5480 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5481 !vdev_dtl_required(oldvd)) 5482 return (oldvd); 5483 5484 /* 5485 * If there are more than two spares attached to a disk, 5486 * and those spares are not required, then we want to 5487 * attempt to free them up now so that they can be used 5488 * by other pools. Once we're back down to a single 5489 * disk+spare, we stop removing them. 5490 */ 5491 if (vd->vdev_children > 2) { 5492 newvd = vd->vdev_child[1]; 5493 5494 if (newvd->vdev_isspare && last->vdev_isspare && 5495 vdev_dtl_empty(last, DTL_MISSING) && 5496 vdev_dtl_empty(last, DTL_OUTAGE) && 5497 !vdev_dtl_required(newvd)) 5498 return (newvd); 5499 } 5500 } 5501 5502 return (NULL); 5503 } 5504 5505 static void 5506 spa_vdev_resilver_done(spa_t *spa) 5507 { 5508 vdev_t *vd, *pvd, *ppvd; 5509 uint64_t guid, sguid, pguid, ppguid; 5510 5511 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5512 5513 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5514 pvd = vd->vdev_parent; 5515 ppvd = pvd->vdev_parent; 5516 guid = vd->vdev_guid; 5517 pguid = pvd->vdev_guid; 5518 ppguid = ppvd->vdev_guid; 5519 sguid = 0; 5520 /* 5521 * If we have just finished replacing a hot spared device, then 5522 * we need to detach the parent's first child (the original hot 5523 * spare) as well. 5524 */ 5525 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5526 ppvd->vdev_children == 2) { 5527 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5528 sguid = ppvd->vdev_child[1]->vdev_guid; 5529 } 5530 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5531 5532 spa_config_exit(spa, SCL_ALL, FTAG); 5533 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5534 return; 5535 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5536 return; 5537 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5538 } 5539 5540 spa_config_exit(spa, SCL_ALL, FTAG); 5541 } 5542 5543 /* 5544 * Update the stored path or FRU for this vdev. 
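 * spa_vdev_setpath() and spa_vdev_setfru() below are the public wrappers.
 * A hypothetical caller that has rediscovered a leaf at a new location
 * might do:
 *
 *	(void) spa_vdev_setpath(spa, vd_guid, "/dev/dsk/c2t0d0s0");
 *
 * Only leaf vdevs may be updated, and the vdev is marked for a config
 * sync only when the stored value actually changes.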
5545 */ 5546 int 5547 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5548 boolean_t ispath) 5549 { 5550 vdev_t *vd; 5551 boolean_t sync = B_FALSE; 5552 5553 ASSERT(spa_writeable(spa)); 5554 5555 spa_vdev_state_enter(spa, SCL_ALL); 5556 5557 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5558 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5559 5560 if (!vd->vdev_ops->vdev_op_leaf) 5561 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5562 5563 if (ispath) { 5564 if (strcmp(value, vd->vdev_path) != 0) { 5565 spa_strfree(vd->vdev_path); 5566 vd->vdev_path = spa_strdup(value); 5567 sync = B_TRUE; 5568 } 5569 } else { 5570 if (vd->vdev_fru == NULL) { 5571 vd->vdev_fru = spa_strdup(value); 5572 sync = B_TRUE; 5573 } else if (strcmp(value, vd->vdev_fru) != 0) { 5574 spa_strfree(vd->vdev_fru); 5575 vd->vdev_fru = spa_strdup(value); 5576 sync = B_TRUE; 5577 } 5578 } 5579 5580 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5581 } 5582 5583 int 5584 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5585 { 5586 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5587 } 5588 5589 int 5590 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5591 { 5592 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5593 } 5594 5595 /* 5596 * ========================================================================== 5597 * SPA Scanning 5598 * ========================================================================== 5599 */ 5600 5601 int 5602 spa_scan_stop(spa_t *spa) 5603 { 5604 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5605 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5606 return (SET_ERROR(EBUSY)); 5607 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5608 } 5609 5610 int 5611 spa_scan(spa_t *spa, pool_scan_func_t func) 5612 { 5613 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5614 5615 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5616 return (SET_ERROR(ENOTSUP)); 5617 5618 /* 5619 * If a resilver was requested, but there is no DTL on a 5620 * writeable leaf device, we have nothing to do. 5621 */ 5622 if (func == POOL_SCAN_RESILVER && 5623 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5624 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5625 return (0); 5626 } 5627 5628 return (dsl_scan(spa->spa_dsl_pool, func)); 5629 } 5630 5631 /* 5632 * ========================================================================== 5633 * SPA async task processing 5634 * ========================================================================== 5635 */ 5636 5637 static void 5638 spa_async_remove(spa_t *spa, vdev_t *vd) 5639 { 5640 if (vd->vdev_remove_wanted) { 5641 vd->vdev_remove_wanted = B_FALSE; 5642 vd->vdev_delayed_close = B_FALSE; 5643 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5644 5645 /* 5646 * We want to clear the stats, but we don't want to do a full 5647 * vdev_clear() as that will cause us to throw away 5648 * degraded/faulted state as well as attempt to reopen the 5649 * device, all of which is a waste. 
5650 */ 5651 vd->vdev_stat.vs_read_errors = 0; 5652 vd->vdev_stat.vs_write_errors = 0; 5653 vd->vdev_stat.vs_checksum_errors = 0; 5654 5655 vdev_state_dirty(vd->vdev_top); 5656 } 5657 5658 for (int c = 0; c < vd->vdev_children; c++) 5659 spa_async_remove(spa, vd->vdev_child[c]); 5660 } 5661 5662 static void 5663 spa_async_probe(spa_t *spa, vdev_t *vd) 5664 { 5665 if (vd->vdev_probe_wanted) { 5666 vd->vdev_probe_wanted = B_FALSE; 5667 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5668 } 5669 5670 for (int c = 0; c < vd->vdev_children; c++) 5671 spa_async_probe(spa, vd->vdev_child[c]); 5672 } 5673 5674 static void 5675 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5676 { 5677 sysevent_id_t eid; 5678 nvlist_t *attr; 5679 char *physpath; 5680 5681 if (!spa->spa_autoexpand) 5682 return; 5683 5684 for (int c = 0; c < vd->vdev_children; c++) { 5685 vdev_t *cvd = vd->vdev_child[c]; 5686 spa_async_autoexpand(spa, cvd); 5687 } 5688 5689 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5690 return; 5691 5692 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5693 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5694 5695 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5696 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5697 5698 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5699 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 5700 5701 nvlist_free(attr); 5702 kmem_free(physpath, MAXPATHLEN); 5703 } 5704 5705 static void 5706 spa_async_thread(spa_t *spa) 5707 { 5708 int tasks; 5709 5710 ASSERT(spa->spa_sync_on); 5711 5712 mutex_enter(&spa->spa_async_lock); 5713 tasks = spa->spa_async_tasks; 5714 spa->spa_async_tasks = 0; 5715 mutex_exit(&spa->spa_async_lock); 5716 5717 /* 5718 * See if the config needs to be updated. 5719 */ 5720 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5721 uint64_t old_space, new_space; 5722 5723 mutex_enter(&spa_namespace_lock); 5724 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5725 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5726 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5727 mutex_exit(&spa_namespace_lock); 5728 5729 /* 5730 * If the pool grew as a result of the config update, 5731 * then log an internal history event. 5732 */ 5733 if (new_space != old_space) { 5734 spa_history_log_internal(spa, "vdev online", NULL, 5735 "pool '%s' size: %llu(+%llu)", 5736 spa_name(spa), new_space, new_space - old_space); 5737 } 5738 } 5739 5740 /* 5741 * See if any devices need to be marked REMOVED. 5742 */ 5743 if (tasks & SPA_ASYNC_REMOVE) { 5744 spa_vdev_state_enter(spa, SCL_NONE); 5745 spa_async_remove(spa, spa->spa_root_vdev); 5746 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5747 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5748 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5749 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5750 (void) spa_vdev_state_exit(spa, NULL, 0); 5751 } 5752 5753 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5754 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5755 spa_async_autoexpand(spa, spa->spa_root_vdev); 5756 spa_config_exit(spa, SCL_CONFIG, FTAG); 5757 } 5758 5759 /* 5760 * See if any devices need to be probed. 5761 */ 5762 if (tasks & SPA_ASYNC_PROBE) { 5763 spa_vdev_state_enter(spa, SCL_NONE); 5764 spa_async_probe(spa, spa->spa_root_vdev); 5765 (void) spa_vdev_state_exit(spa, NULL, 0); 5766 } 5767 5768 /* 5769 * If any devices are done replacing, detach them. 
5770 */ 5771 if (tasks & SPA_ASYNC_RESILVER_DONE) 5772 spa_vdev_resilver_done(spa); 5773 5774 /* 5775 * Kick off a resilver. 5776 */ 5777 if (tasks & SPA_ASYNC_RESILVER) 5778 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5779 5780 /* 5781 * Let the world know that we're done. 5782 */ 5783 mutex_enter(&spa->spa_async_lock); 5784 spa->spa_async_thread = NULL; 5785 cv_broadcast(&spa->spa_async_cv); 5786 mutex_exit(&spa->spa_async_lock); 5787 thread_exit(); 5788 } 5789 5790 void 5791 spa_async_suspend(spa_t *spa) 5792 { 5793 mutex_enter(&spa->spa_async_lock); 5794 spa->spa_async_suspended++; 5795 while (spa->spa_async_thread != NULL) 5796 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5797 mutex_exit(&spa->spa_async_lock); 5798 } 5799 5800 void 5801 spa_async_resume(spa_t *spa) 5802 { 5803 mutex_enter(&spa->spa_async_lock); 5804 ASSERT(spa->spa_async_suspended != 0); 5805 spa->spa_async_suspended--; 5806 mutex_exit(&spa->spa_async_lock); 5807 } 5808 5809 static boolean_t 5810 spa_async_tasks_pending(spa_t *spa) 5811 { 5812 uint_t non_config_tasks; 5813 uint_t config_task; 5814 boolean_t config_task_suspended; 5815 5816 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 5817 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 5818 if (spa->spa_ccw_fail_time == 0) { 5819 config_task_suspended = B_FALSE; 5820 } else { 5821 config_task_suspended = 5822 (gethrtime() - spa->spa_ccw_fail_time) < 5823 (zfs_ccw_retry_interval * NANOSEC); 5824 } 5825 5826 return (non_config_tasks || (config_task && !config_task_suspended)); 5827 } 5828 5829 static void 5830 spa_async_dispatch(spa_t *spa) 5831 { 5832 mutex_enter(&spa->spa_async_lock); 5833 if (spa_async_tasks_pending(spa) && 5834 !spa->spa_async_suspended && 5835 spa->spa_async_thread == NULL && 5836 rootdir != NULL) 5837 spa->spa_async_thread = thread_create(NULL, 0, 5838 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5839 mutex_exit(&spa->spa_async_lock); 5840 } 5841 5842 void 5843 spa_async_request(spa_t *spa, int task) 5844 { 5845 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5846 mutex_enter(&spa->spa_async_lock); 5847 spa->spa_async_tasks |= task; 5848 mutex_exit(&spa->spa_async_lock); 5849 } 5850 5851 /* 5852 * ========================================================================== 5853 * SPA syncing routines 5854 * ========================================================================== 5855 */ 5856 5857 static int 5858 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5859 { 5860 bpobj_t *bpo = arg; 5861 bpobj_enqueue(bpo, bp, tx); 5862 return (0); 5863 } 5864 5865 static int 5866 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5867 { 5868 zio_t *zio = arg; 5869 5870 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5871 zio->io_flags)); 5872 return (0); 5873 } 5874 5875 /* 5876 * Note: this simple function is not inlined to make it easier to dtrace the 5877 * amount of time spent syncing frees. 5878 */ 5879 static void 5880 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 5881 { 5882 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5883 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 5884 VERIFY(zio_wait(zio) == 0); 5885 } 5886 5887 /* 5888 * Note: this simple function is not inlined to make it easier to dtrace the 5889 * amount of time spent syncing deferred frees. 
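 * (For example, the time spent per txg can be measured with the DTrace
 * fbt provider using the spa_sync_deferred_frees:entry and :return
 * probes.)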
5890 */ 5891 static void 5892 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 5893 { 5894 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5895 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 5896 spa_free_sync_cb, zio, tx), ==, 0); 5897 VERIFY0(zio_wait(zio)); 5898 } 5899 5900 5901 static void 5902 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5903 { 5904 char *packed = NULL; 5905 size_t bufsize; 5906 size_t nvsize = 0; 5907 dmu_buf_t *db; 5908 5909 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5910 5911 /* 5912 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5913 * information. This avoids the dmu_buf_will_dirty() path and 5914 * saves us a pre-read to get data we don't actually care about. 5915 */ 5916 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5917 packed = kmem_alloc(bufsize, KM_SLEEP); 5918 5919 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5920 KM_SLEEP) == 0); 5921 bzero(packed + nvsize, bufsize - nvsize); 5922 5923 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5924 5925 kmem_free(packed, bufsize); 5926 5927 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5928 dmu_buf_will_dirty(db, tx); 5929 *(uint64_t *)db->db_data = nvsize; 5930 dmu_buf_rele(db, FTAG); 5931 } 5932 5933 static void 5934 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5935 const char *config, const char *entry) 5936 { 5937 nvlist_t *nvroot; 5938 nvlist_t **list; 5939 int i; 5940 5941 if (!sav->sav_sync) 5942 return; 5943 5944 /* 5945 * Update the MOS nvlist describing the list of available devices. 5946 * spa_validate_aux() will have already made sure this nvlist is 5947 * valid and the vdevs are labeled appropriately. 5948 */ 5949 if (sav->sav_object == 0) { 5950 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5951 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5952 sizeof (uint64_t), tx); 5953 VERIFY(zap_update(spa->spa_meta_objset, 5954 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5955 &sav->sav_object, tx) == 0); 5956 } 5957 5958 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5959 if (sav->sav_count == 0) { 5960 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5961 } else { 5962 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5963 for (i = 0; i < sav->sav_count; i++) 5964 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5965 B_FALSE, VDEV_CONFIG_L2CACHE); 5966 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5967 sav->sav_count) == 0); 5968 for (i = 0; i < sav->sav_count; i++) 5969 nvlist_free(list[i]); 5970 kmem_free(list, sav->sav_count * sizeof (void *)); 5971 } 5972 5973 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5974 nvlist_free(nvroot); 5975 5976 sav->sav_sync = B_FALSE; 5977 } 5978 5979 static void 5980 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5981 { 5982 nvlist_t *config; 5983 5984 if (list_is_empty(&spa->spa_config_dirty_list)) 5985 return; 5986 5987 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5988 5989 config = spa_config_generate(spa, spa->spa_root_vdev, 5990 dmu_tx_get_txg(tx), B_FALSE); 5991 5992 /* 5993 * If we're upgrading the spa version then make sure that 5994 * the config object gets updated with the correct version. 
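 * The newly generated config is also stashed in spa_config_syncing so
 * that, once this txg has safely synced, it can be made visible to the
 * config cache (see the end of spa_sync()).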
5995 */ 5996 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 5997 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5998 spa->spa_uberblock.ub_version); 5999 6000 spa_config_exit(spa, SCL_STATE, FTAG); 6001 6002 nvlist_free(spa->spa_config_syncing); 6003 spa->spa_config_syncing = config; 6004 6005 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6006 } 6007 6008 static void 6009 spa_sync_version(void *arg, dmu_tx_t *tx) 6010 { 6011 uint64_t *versionp = arg; 6012 uint64_t version = *versionp; 6013 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6014 6015 /* 6016 * Setting the version is special cased when first creating the pool. 6017 */ 6018 ASSERT(tx->tx_txg != TXG_INITIAL); 6019 6020 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6021 ASSERT(version >= spa_version(spa)); 6022 6023 spa->spa_uberblock.ub_version = version; 6024 vdev_config_dirty(spa->spa_root_vdev); 6025 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6026 } 6027 6028 /* 6029 * Set zpool properties. 6030 */ 6031 static void 6032 spa_sync_props(void *arg, dmu_tx_t *tx) 6033 { 6034 nvlist_t *nvp = arg; 6035 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6036 objset_t *mos = spa->spa_meta_objset; 6037 nvpair_t *elem = NULL; 6038 6039 mutex_enter(&spa->spa_props_lock); 6040 6041 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6042 uint64_t intval; 6043 char *strval, *fname; 6044 zpool_prop_t prop; 6045 const char *propname; 6046 zprop_type_t proptype; 6047 spa_feature_t fid; 6048 6049 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6050 case ZPROP_INVAL: 6051 /* 6052 * We checked this earlier in spa_prop_validate(). 6053 */ 6054 ASSERT(zpool_prop_feature(nvpair_name(elem))); 6055 6056 fname = strchr(nvpair_name(elem), '@') + 1; 6057 VERIFY0(zfeature_lookup_name(fname, &fid)); 6058 6059 spa_feature_enable(spa, fid, tx); 6060 spa_history_log_internal(spa, "set", tx, 6061 "%s=enabled", nvpair_name(elem)); 6062 break; 6063 6064 case ZPOOL_PROP_VERSION: 6065 intval = fnvpair_value_uint64(elem); 6066 /* 6067 * The version is synced separately before other 6068 * properties and should be correct by now. 6069 */ 6070 ASSERT3U(spa_version(spa), >=, intval); 6071 break; 6072 6073 case ZPOOL_PROP_ALTROOT: 6074 /* 6075 * 'altroot' is a non-persistent property. It should 6076 * have been set temporarily at creation or import time. 6077 */ 6078 ASSERT(spa->spa_root != NULL); 6079 break; 6080 6081 case ZPOOL_PROP_READONLY: 6082 case ZPOOL_PROP_CACHEFILE: 6083 /* 6084 * 'readonly' and 'cachefile' are also non-persistent 6085 * properties. 6086 */ 6087 break; 6088 case ZPOOL_PROP_COMMENT: 6089 strval = fnvpair_value_string(elem); 6090 if (spa->spa_comment != NULL) 6091 spa_strfree(spa->spa_comment); 6092 spa->spa_comment = spa_strdup(strval); 6093 /* 6094 * We need to dirty the configuration on all the vdevs 6095 * so that their labels get updated. It's unnecessary 6096 * to do this for pool creation since the vdev's 6097 * configuration has already been dirtied. 6098 */ 6099 if (tx->tx_txg != TXG_INITIAL) 6100 vdev_config_dirty(spa->spa_root_vdev); 6101 spa_history_log_internal(spa, "set", tx, 6102 "%s=%s", nvpair_name(elem), strval); 6103 break; 6104 default: 6105 /* 6106 * Set pool property values in the poolprops mos object.
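 * String properties are stored as ZAP string entries; numeric and
 * index properties are stored as single uint64 entries (see the
 * zap_update() calls below).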
6107 */ 6108 if (spa->spa_pool_props_object == 0) { 6109 spa->spa_pool_props_object = 6110 zap_create_link(mos, DMU_OT_POOL_PROPS, 6111 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6112 tx); 6113 } 6114 6115 /* normalize the property name */ 6116 propname = zpool_prop_to_name(prop); 6117 proptype = zpool_prop_get_type(prop); 6118 6119 if (nvpair_type(elem) == DATA_TYPE_STRING) { 6120 ASSERT(proptype == PROP_TYPE_STRING); 6121 strval = fnvpair_value_string(elem); 6122 VERIFY0(zap_update(mos, 6123 spa->spa_pool_props_object, propname, 6124 1, strlen(strval) + 1, strval, tx)); 6125 spa_history_log_internal(spa, "set", tx, 6126 "%s=%s", nvpair_name(elem), strval); 6127 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6128 intval = fnvpair_value_uint64(elem); 6129 6130 if (proptype == PROP_TYPE_INDEX) { 6131 const char *unused; 6132 VERIFY0(zpool_prop_index_to_string( 6133 prop, intval, &unused)); 6134 } 6135 VERIFY0(zap_update(mos, 6136 spa->spa_pool_props_object, propname, 6137 8, 1, &intval, tx)); 6138 spa_history_log_internal(spa, "set", tx, 6139 "%s=%lld", nvpair_name(elem), intval); 6140 } else { 6141 ASSERT(0); /* not allowed */ 6142 } 6143 6144 switch (prop) { 6145 case ZPOOL_PROP_DELEGATION: 6146 spa->spa_delegation = intval; 6147 break; 6148 case ZPOOL_PROP_BOOTFS: 6149 spa->spa_bootfs = intval; 6150 break; 6151 case ZPOOL_PROP_FAILUREMODE: 6152 spa->spa_failmode = intval; 6153 break; 6154 case ZPOOL_PROP_AUTOEXPAND: 6155 spa->spa_autoexpand = intval; 6156 if (tx->tx_txg != TXG_INITIAL) 6157 spa_async_request(spa, 6158 SPA_ASYNC_AUTOEXPAND); 6159 break; 6160 case ZPOOL_PROP_DEDUPDITTO: 6161 spa->spa_dedup_ditto = intval; 6162 break; 6163 default: 6164 break; 6165 } 6166 } 6167 6168 } 6169 6170 mutex_exit(&spa->spa_props_lock); 6171 } 6172 6173 /* 6174 * Perform one-time upgrade on-disk changes. spa_version() does not 6175 * reflect the new version this txg, so there must be no changes this 6176 * txg to anything that the upgrade code depends on after it executes. 6177 * Therefore this must be called after dsl_pool_sync() does the sync 6178 * tasks. 
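 * (spa_sync() honors this by calling spa_sync_upgrades() only in
 * pass 1, after dsl_pool_sync() has been called for that pass.)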
6179 */ 6180 static void 6181 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6182 { 6183 dsl_pool_t *dp = spa->spa_dsl_pool; 6184 6185 ASSERT(spa->spa_sync_pass == 1); 6186 6187 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6188 6189 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6190 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6191 dsl_pool_create_origin(dp, tx); 6192 6193 /* Keeping the origin open increases spa_minref */ 6194 spa->spa_minref += 3; 6195 } 6196 6197 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6198 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6199 dsl_pool_upgrade_clones(dp, tx); 6200 } 6201 6202 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6203 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6204 dsl_pool_upgrade_dir_clones(dp, tx); 6205 6206 /* Keeping the freedir open increases spa_minref */ 6207 spa->spa_minref += 3; 6208 } 6209 6210 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6211 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6212 spa_feature_create_zap_objects(spa, tx); 6213 } 6214 6215 /* 6216 * The LZ4_COMPRESS feature's behavior was changed to activate_on_enable 6217 * when the ability to use lz4 compression for metadata was added. 6218 * Old pools that have this feature enabled must be upgraded to have 6219 * this feature active. 6220 */ 6221 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6222 boolean_t lz4_en = spa_feature_is_enabled(spa, 6223 SPA_FEATURE_LZ4_COMPRESS); 6224 boolean_t lz4_ac = spa_feature_is_active(spa, 6225 SPA_FEATURE_LZ4_COMPRESS); 6226 6227 if (lz4_en && !lz4_ac) 6228 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6229 } 6230 6231 /* 6232 * If we haven't written the salt, do so now. Note that the 6233 * feature may not be activated yet, but that's fine since 6234 * the presence of this ZAP entry is backwards compatible. 6235 */ 6236 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6237 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 6238 VERIFY0(zap_add(spa->spa_meta_objset, 6239 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 6240 sizeof (spa->spa_cksum_salt.zcs_bytes), 6241 spa->spa_cksum_salt.zcs_bytes, tx)); 6242 } 6243 6244 rrw_exit(&dp->dp_config_rwlock, FTAG); 6245 } 6246 6247 /* 6248 * Sync the specified transaction group. New blocks may be dirtied as 6249 * part of the process, so we iterate until it converges. 6250 */ 6251 void 6252 spa_sync(spa_t *spa, uint64_t txg) 6253 { 6254 dsl_pool_t *dp = spa->spa_dsl_pool; 6255 objset_t *mos = spa->spa_meta_objset; 6256 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6257 vdev_t *rvd = spa->spa_root_vdev; 6258 vdev_t *vd; 6259 dmu_tx_t *tx; 6260 int error; 6261 6262 VERIFY(spa_writeable(spa)); 6263 6264 /* 6265 * Lock out configuration changes. 6266 */ 6267 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6268 6269 spa->spa_syncing_txg = txg; 6270 spa->spa_sync_pass = 0; 6271 6272 /* 6273 * If there are any pending vdev state changes, convert them 6274 * into config changes that go out with this transaction group. 6275 */ 6276 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6277 while (list_head(&spa->spa_state_dirty_list) != NULL) { 6278 /* 6279 * We need the write lock here because, for aux vdevs, 6280 * calling vdev_config_dirty() modifies sav_config. 6281 * This is ugly and will become unnecessary when we 6282 * eliminate the aux vdev wart by integrating all vdevs 6283 * into the root vdev tree.
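 * To do so we drop the locks held as reader, reacquire
 * SCL_CONFIG | SCL_STATE as writer, drain spa_state_dirty_list, and
 * then downgrade back to reader before re-evaluating the loop.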
6284 */ 6285 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6286 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6287 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6288 vdev_state_clean(vd); 6289 vdev_config_dirty(vd); 6290 } 6291 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6292 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6293 } 6294 spa_config_exit(spa, SCL_STATE, FTAG); 6295 6296 tx = dmu_tx_create_assigned(dp, txg); 6297 6298 spa->spa_sync_starttime = gethrtime(); 6299 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6300 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6301 6302 /* 6303 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6304 * set spa_deflate if we have no raid-z vdevs. 6305 */ 6306 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6307 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6308 int i; 6309 6310 for (i = 0; i < rvd->vdev_children; i++) { 6311 vd = rvd->vdev_child[i]; 6312 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6313 break; 6314 } 6315 if (i == rvd->vdev_children) { 6316 spa->spa_deflate = TRUE; 6317 VERIFY(0 == zap_add(spa->spa_meta_objset, 6318 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6319 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6320 } 6321 } 6322 6323 /* 6324 * Iterate to convergence. 6325 */ 6326 do { 6327 int pass = ++spa->spa_sync_pass; 6328 6329 spa_sync_config_object(spa, tx); 6330 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6331 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6332 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6333 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6334 spa_errlog_sync(spa, txg); 6335 dsl_pool_sync(dp, txg); 6336 6337 if (pass < zfs_sync_pass_deferred_free) { 6338 spa_sync_frees(spa, free_bpl, tx); 6339 } else { 6340 /* 6341 * We can not defer frees in pass 1, because 6342 * we sync the deferred frees later in pass 1. 6343 */ 6344 ASSERT3U(pass, >, 1); 6345 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6346 &spa->spa_deferred_bpobj, tx); 6347 } 6348 6349 ddt_sync(spa, txg); 6350 dsl_scan_sync(dp, tx); 6351 6352 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6353 vdev_sync(vd, txg); 6354 6355 if (pass == 1) { 6356 spa_sync_upgrades(spa, tx); 6357 ASSERT3U(txg, >=, 6358 spa->spa_uberblock.ub_rootbp.blk_birth); 6359 /* 6360 * Note: We need to check if the MOS is dirty 6361 * because we could have marked the MOS dirty 6362 * without updating the uberblock (e.g. if we 6363 * have sync tasks but no dirty user data). We 6364 * need to check the uberblock's rootbp because 6365 * it is updated if we have synced out dirty 6366 * data (though in this case the MOS will most 6367 * likely also be dirty due to second order 6368 * effects, we don't want to rely on that here). 6369 */ 6370 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 6371 !dmu_objset_is_dirty(mos, txg)) { 6372 /* 6373 * Nothing changed on the first pass, 6374 * therefore this TXG is a no-op. Avoid 6375 * syncing deferred frees, so that we 6376 * can keep this TXG as a no-op. 6377 */ 6378 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 6379 txg)); 6380 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6381 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 6382 break; 6383 } 6384 spa_sync_deferred_frees(spa, tx); 6385 } 6386 6387 } while (dmu_objset_is_dirty(mos, txg)); 6388 6389 /* 6390 * Rewrite the vdev configuration (which includes the uberblock) 6391 * to commit the transaction group. 
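 * Writing the uberblock is the commit point: once the new uberblock is
 * on stable storage, this txg is durable.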
6392 * 6393 * If there are no dirty vdevs, we sync the uberblock to a few 6394 * random top-level vdevs that are known to be visible in the 6395 * config cache (see spa_vdev_add() for a complete description). 6396 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6397 */ 6398 for (;;) { 6399 /* 6400 * We hold SCL_STATE to prevent vdev open/close/etc. 6401 * while we're attempting to write the vdev labels. 6402 */ 6403 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6404 6405 if (list_is_empty(&spa->spa_config_dirty_list)) { 6406 vdev_t *svd[SPA_DVAS_PER_BP]; 6407 int svdcount = 0; 6408 int children = rvd->vdev_children; 6409 int c0 = spa_get_random(children); 6410 6411 for (int c = 0; c < children; c++) { 6412 vd = rvd->vdev_child[(c0 + c) % children]; 6413 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6414 continue; 6415 svd[svdcount++] = vd; 6416 if (svdcount == SPA_DVAS_PER_BP) 6417 break; 6418 } 6419 error = vdev_config_sync(svd, svdcount, txg); 6420 } else { 6421 error = vdev_config_sync(rvd->vdev_child, 6422 rvd->vdev_children, txg); 6423 } 6424 6425 if (error == 0) 6426 spa->spa_last_synced_guid = rvd->vdev_guid; 6427 6428 spa_config_exit(spa, SCL_STATE, FTAG); 6429 6430 if (error == 0) 6431 break; 6432 zio_suspend(spa, NULL); 6433 zio_resume_wait(spa); 6434 } 6435 dmu_tx_commit(tx); 6436 6437 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6438 6439 /* 6440 * Clear the dirty config list. 6441 */ 6442 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6443 vdev_config_clean(vd); 6444 6445 /* 6446 * Now that the new config has synced transactionally, 6447 * let it become visible to the config cache. 6448 */ 6449 if (spa->spa_config_syncing != NULL) { 6450 spa_config_set(spa, spa->spa_config_syncing); 6451 spa->spa_config_txg = txg; 6452 spa->spa_config_syncing = NULL; 6453 } 6454 6455 spa->spa_ubsync = spa->spa_uberblock; 6456 6457 dsl_pool_sync_done(dp, txg); 6458 6459 /* 6460 * Update usable space statistics. 6461 */ 6462 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6463 vdev_sync_done(vd, txg); 6464 6465 spa_update_dspace(spa); 6466 6467 /* 6468 * It had better be the case that we didn't dirty anything 6469 * since vdev_config_sync(). 6470 */ 6471 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6472 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6473 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6474 6475 spa->spa_sync_pass = 0; 6476 6477 spa_config_exit(spa, SCL_CONFIG, FTAG); 6478 6479 spa_handle_ignored_writes(spa); 6480 6481 /* 6482 * If any async tasks have been requested, kick them off. 6483 */ 6484 spa_async_dispatch(spa); 6485 } 6486 6487 /* 6488 * Sync all pools. We don't want to hold the namespace lock across these 6489 * operations, so we take a reference on the spa_t and drop the lock during the 6490 * sync. 
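 * Pools that are not active, are read-only, or are currently suspended
 * are skipped.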
6491 */ 6492 void 6493 spa_sync_allpools(void) 6494 { 6495 spa_t *spa = NULL; 6496 mutex_enter(&spa_namespace_lock); 6497 while ((spa = spa_next(spa)) != NULL) { 6498 if (spa_state(spa) != POOL_STATE_ACTIVE || 6499 !spa_writeable(spa) || spa_suspended(spa)) 6500 continue; 6501 spa_open_ref(spa, FTAG); 6502 mutex_exit(&spa_namespace_lock); 6503 txg_wait_synced(spa_get_dsl(spa), 0); 6504 mutex_enter(&spa_namespace_lock); 6505 spa_close(spa, FTAG); 6506 } 6507 mutex_exit(&spa_namespace_lock); 6508 } 6509 6510 /* 6511 * ========================================================================== 6512 * Miscellaneous routines 6513 * ========================================================================== 6514 */ 6515 6516 /* 6517 * Remove all pools in the system. 6518 */ 6519 void 6520 spa_evict_all(void) 6521 { 6522 spa_t *spa; 6523 6524 /* 6525 * Remove all cached state. All pools should be closed now, 6526 * so every spa in the AVL tree should be unreferenced. 6527 */ 6528 mutex_enter(&spa_namespace_lock); 6529 while ((spa = spa_next(NULL)) != NULL) { 6530 /* 6531 * Stop async tasks. The async thread may need to detach 6532 * a device that's been replaced, which requires grabbing 6533 * spa_namespace_lock, so we must drop it here. 6534 */ 6535 spa_open_ref(spa, FTAG); 6536 mutex_exit(&spa_namespace_lock); 6537 spa_async_suspend(spa); 6538 mutex_enter(&spa_namespace_lock); 6539 spa_close(spa, FTAG); 6540 6541 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6542 spa_unload(spa); 6543 spa_deactivate(spa); 6544 } 6545 spa_remove(spa); 6546 } 6547 mutex_exit(&spa_namespace_lock); 6548 } 6549 6550 vdev_t * 6551 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6552 { 6553 vdev_t *vd; 6554 int i; 6555 6556 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6557 return (vd); 6558 6559 if (aux) { 6560 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6561 vd = spa->spa_l2cache.sav_vdevs[i]; 6562 if (vd->vdev_guid == guid) 6563 return (vd); 6564 } 6565 6566 for (i = 0; i < spa->spa_spares.sav_count; i++) { 6567 vd = spa->spa_spares.sav_vdevs[i]; 6568 if (vd->vdev_guid == guid) 6569 return (vd); 6570 } 6571 } 6572 6573 return (NULL); 6574 } 6575 6576 void 6577 spa_upgrade(spa_t *spa, uint64_t version) 6578 { 6579 ASSERT(spa_writeable(spa)); 6580 6581 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6582 6583 /* 6584 * This should only be called for a non-faulted pool, and since a 6585 * future version would result in an unopenable pool, this shouldn't be 6586 * possible. 6587 */ 6588 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6589 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6590 6591 spa->spa_uberblock.ub_version = version; 6592 vdev_config_dirty(spa->spa_root_vdev); 6593 6594 spa_config_exit(spa, SCL_ALL, FTAG); 6595 6596 txg_wait_synced(spa_get_dsl(spa), 0); 6597 } 6598 6599 boolean_t 6600 spa_has_spare(spa_t *spa, uint64_t guid) 6601 { 6602 int i; 6603 uint64_t spareguid; 6604 spa_aux_vdev_t *sav = &spa->spa_spares; 6605 6606 for (i = 0; i < sav->sav_count; i++) 6607 if (sav->sav_vdevs[i]->vdev_guid == guid) 6608 return (B_TRUE); 6609 6610 for (i = 0; i < sav->sav_npending; i++) { 6611 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6612 &spareguid) == 0 && spareguid == guid) 6613 return (B_TRUE); 6614 } 6615 6616 return (B_FALSE); 6617 } 6618 6619 /* 6620 * Check if a pool has an active shared spare device. 
6621 * Note: the reference count of an active spare is 2: once as a spare and once as a replacing device. 6622 */ 6623 static boolean_t 6624 spa_has_active_shared_spare(spa_t *spa) 6625 { 6626 int i, refcnt; 6627 uint64_t pool; 6628 spa_aux_vdev_t *sav = &spa->spa_spares; 6629 6630 for (i = 0; i < sav->sav_count; i++) { 6631 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6632 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6633 refcnt > 2) 6634 return (B_TRUE); 6635 } 6636 6637 return (B_FALSE); 6638 } 6639 6640 /* 6641 * Post a sysevent corresponding to the given event. The 'name' must be one of 6642 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6643 * filled in from the spa and (optionally) the vdev. This doesn't do anything 6644 * in the userland libzpool, as we don't want consumers to misinterpret ztest 6645 * or zdb as real changes. 6646 */ 6647 void 6648 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6649 { 6650 #ifdef _KERNEL 6651 sysevent_t *ev; 6652 sysevent_attr_list_t *attr = NULL; 6653 sysevent_value_t value; 6654 sysevent_id_t eid; 6655 6656 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6657 SE_SLEEP); 6658 6659 value.value_type = SE_DATA_TYPE_STRING; 6660 value.value.sv_string = spa_name(spa); 6661 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6662 goto done; 6663 6664 value.value_type = SE_DATA_TYPE_UINT64; 6665 value.value.sv_uint64 = spa_guid(spa); 6666 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6667 goto done; 6668 6669 if (vd) { 6670 value.value_type = SE_DATA_TYPE_UINT64; 6671 value.value.sv_uint64 = vd->vdev_guid; 6672 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6673 SE_SLEEP) != 0) 6674 goto done; 6675 6676 if (vd->vdev_path) { 6677 value.value_type = SE_DATA_TYPE_STRING; 6678 value.value.sv_string = vd->vdev_path; 6679 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6680 &value, SE_SLEEP) != 0) 6681 goto done; 6682 } 6683 } 6684 6685 if (sysevent_attach_attributes(ev, attr) != 0) 6686 goto done; 6687 attr = NULL; 6688 6689 (void) log_sysevent(ev, SE_SLEEP, &eid); 6690 6691 done: 6692 if (attr) 6693 sysevent_free_attr(attr); 6694 sysevent_free(ev); 6695 #endif 6696 } 6697
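/*
 * For example, an in-kernel caller might post an event by passing one
 * of the ESC_ZFS_* names defined in sys/sysevent/eventdefs.h:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * Userland consumers can then observe these events on the EC_ZFS class,
 * for instance via syseventadm(1M) or the libsysevent
 * sysevent_subscribe_event() interface.
 */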