1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 */ 30 31 /* 32 * SPA: Storage Pool Allocator 33 * 34 * This file contains all the routines used when modifying on-disk SPA state. 35 * This includes opening, importing, destroying, exporting a pool, and syncing a 36 * pool. 37 */ 38 39 #include <sys/zfs_context.h> 40 #include <sys/fm/fs/zfs.h> 41 #include <sys/spa_impl.h> 42 #include <sys/zio.h> 43 #include <sys/zio_checksum.h> 44 #include <sys/dmu.h> 45 #include <sys/dmu_tx.h> 46 #include <sys/zap.h> 47 #include <sys/zil.h> 48 #include <sys/ddt.h> 49 #include <sys/vdev_impl.h> 50 #include <sys/metaslab.h> 51 #include <sys/metaslab_impl.h> 52 #include <sys/uberblock_impl.h> 53 #include <sys/txg.h> 54 #include <sys/avl.h> 55 #include <sys/dmu_traverse.h> 56 #include <sys/dmu_objset.h> 57 #include <sys/unique.h> 58 #include <sys/dsl_pool.h> 59 #include <sys/dsl_dataset.h> 60 #include <sys/dsl_dir.h> 61 #include <sys/dsl_prop.h> 62 #include <sys/dsl_synctask.h> 63 #include <sys/fs/zfs.h> 64 #include <sys/arc.h> 65 #include <sys/callb.h> 66 #include <sys/systeminfo.h> 67 #include <sys/spa_boot.h> 68 #include <sys/zfs_ioctl.h> 69 #include <sys/dsl_scan.h> 70 #include <sys/zfeature.h> 71 #include <sys/dsl_destroy.h> 72 73 #ifdef _KERNEL 74 #include <sys/bootprops.h> 75 #include <sys/callb.h> 76 #include <sys/cpupart.h> 77 #include <sys/pool.h> 78 #include <sys/sysdc.h> 79 #include <sys/zone.h> 80 #endif /* _KERNEL */ 81 82 #include "zfs_prop.h" 83 #include "zfs_comutil.h" 84 85 /* 86 * The interval, in seconds, at which failed configuration cache file writes 87 * should be retried. 
88 */ 89 static int zfs_ccw_retry_interval = 300; 90 91 typedef enum zti_modes { 92 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 93 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 94 ZTI_MODE_NULL, /* don't create a taskq */ 95 ZTI_NMODES 96 } zti_modes_t; 97 98 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 99 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 100 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 101 102 #define ZTI_N(n) ZTI_P(n, 1) 103 #define ZTI_ONE ZTI_N(1) 104 105 typedef struct zio_taskq_info { 106 zti_modes_t zti_mode; 107 uint_t zti_value; 108 uint_t zti_count; 109 } zio_taskq_info_t; 110 111 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 112 "issue", "issue_high", "intr", "intr_high" 113 }; 114 115 /* 116 * This table defines the taskq settings for each ZFS I/O type. When 117 * initializing a pool, we use this table to create an appropriately sized 118 * taskq. Some operations are low volume and therefore have a small, static 119 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 120 * macros. Other operations process a large amount of data; the ZTI_BATCH 121 * macro causes us to create a taskq oriented for throughput. Some operations 122 * are so high frequency and short-lived that the taskq itself can become a a 123 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 124 * additional degree of parallelism specified by the number of threads per- 125 * taskq and the number of taskqs; when dispatching an event in this case, the 126 * particular taskq is chosen at random. 127 * 128 * The different taskq priorities are to handle the different contexts (issue 129 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 130 * need to be handled with minimum delay. 131 */ 132 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 133 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 134 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 135 { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ 136 { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ 137 { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 138 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 139 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 140 }; 141 142 static void spa_sync_version(void *arg, dmu_tx_t *tx); 143 static void spa_sync_props(void *arg, dmu_tx_t *tx); 144 static boolean_t spa_has_active_shared_spare(spa_t *spa); 145 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 146 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 147 char **ereport); 148 static void spa_vdev_resilver_done(spa_t *spa); 149 150 uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ 151 id_t zio_taskq_psrset_bind = PS_NONE; 152 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 153 uint_t zio_taskq_basedc = 80; /* base duty cycle */ 154 155 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 156 extern int zfs_sync_pass_deferred_free; 157 158 /* 159 * This (illegal) pool name is used when temporarily importing a spa_t in order 160 * to get the vdev stats associated with the imported devices. 161 */ 162 #define TRYIMPORT_NAME "$import" 163 164 /* 165 * ========================================================================== 166 * SPA properties routines 167 * ========================================================================== 168 */ 169 170 /* 171 * Add a (source=src, propname=propval) list to an nvlist. 
172 */ 173 static void 174 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 175 uint64_t intval, zprop_source_t src) 176 { 177 const char *propname = zpool_prop_to_name(prop); 178 nvlist_t *propval; 179 180 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 181 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 182 183 if (strval != NULL) 184 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 185 else 186 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 187 188 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 189 nvlist_free(propval); 190 } 191 192 /* 193 * Get property values from the spa configuration. 194 */ 195 static void 196 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 197 { 198 vdev_t *rvd = spa->spa_root_vdev; 199 dsl_pool_t *pool = spa->spa_dsl_pool; 200 uint64_t size, alloc, cap, version; 201 zprop_source_t src = ZPROP_SRC_NONE; 202 spa_config_dirent_t *dp; 203 metaslab_class_t *mc = spa_normal_class(spa); 204 205 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 206 207 if (rvd != NULL) { 208 alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 209 size = metaslab_class_get_space(spa_normal_class(spa)); 210 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 211 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 212 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 213 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 214 size - alloc, src); 215 216 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 217 metaslab_class_fragmentation(mc), src); 218 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 219 metaslab_class_expandable_space(mc), src); 220 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 221 (spa_mode(spa) == FREAD), src); 222 223 cap = (size == 0) ? 0 : (alloc * 100 / size); 224 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 225 226 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 227 ddt_get_pool_dedup_ratio(spa), src); 228 229 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 230 rvd->vdev_state, src); 231 232 version = spa_version(spa); 233 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 234 src = ZPROP_SRC_DEFAULT; 235 else 236 src = ZPROP_SRC_LOCAL; 237 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 238 } 239 240 if (pool != NULL) { 241 /* 242 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 243 * when opening pools before this version freedir will be NULL. 
244 */ 245 if (pool->dp_free_dir != NULL) { 246 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 247 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 248 src); 249 } else { 250 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 251 NULL, 0, src); 252 } 253 254 if (pool->dp_leak_dir != NULL) { 255 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 256 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 257 src); 258 } else { 259 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 260 NULL, 0, src); 261 } 262 } 263 264 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 265 266 if (spa->spa_comment != NULL) { 267 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 268 0, ZPROP_SRC_LOCAL); 269 } 270 271 if (spa->spa_root != NULL) 272 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 273 0, ZPROP_SRC_LOCAL); 274 275 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 276 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 277 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 278 } else { 279 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 280 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 281 } 282 283 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 284 if (dp->scd_path == NULL) { 285 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 286 "none", 0, ZPROP_SRC_LOCAL); 287 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 288 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 289 dp->scd_path, 0, ZPROP_SRC_LOCAL); 290 } 291 } 292 } 293 294 /* 295 * Get zpool property values. 296 */ 297 int 298 spa_prop_get(spa_t *spa, nvlist_t **nvp) 299 { 300 objset_t *mos = spa->spa_meta_objset; 301 zap_cursor_t zc; 302 zap_attribute_t za; 303 int err; 304 305 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 306 307 mutex_enter(&spa->spa_props_lock); 308 309 /* 310 * Get properties from the spa config. 311 */ 312 spa_prop_get_config(spa, nvp); 313 314 /* If no pool property object, no more prop to get. */ 315 if (mos == NULL || spa->spa_pool_props_object == 0) { 316 mutex_exit(&spa->spa_props_lock); 317 return (0); 318 } 319 320 /* 321 * Get properties from the MOS pool property object. 
322 */ 323 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 324 (err = zap_cursor_retrieve(&zc, &za)) == 0; 325 zap_cursor_advance(&zc)) { 326 uint64_t intval = 0; 327 char *strval = NULL; 328 zprop_source_t src = ZPROP_SRC_DEFAULT; 329 zpool_prop_t prop; 330 331 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 332 continue; 333 334 switch (za.za_integer_length) { 335 case 8: 336 /* integer property */ 337 if (za.za_first_integer != 338 zpool_prop_default_numeric(prop)) 339 src = ZPROP_SRC_LOCAL; 340 341 if (prop == ZPOOL_PROP_BOOTFS) { 342 dsl_pool_t *dp; 343 dsl_dataset_t *ds = NULL; 344 345 dp = spa_get_dsl(spa); 346 dsl_pool_config_enter(dp, FTAG); 347 if (err = dsl_dataset_hold_obj(dp, 348 za.za_first_integer, FTAG, &ds)) { 349 dsl_pool_config_exit(dp, FTAG); 350 break; 351 } 352 353 strval = kmem_alloc( 354 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 355 KM_SLEEP); 356 dsl_dataset_name(ds, strval); 357 dsl_dataset_rele(ds, FTAG); 358 dsl_pool_config_exit(dp, FTAG); 359 } else { 360 strval = NULL; 361 intval = za.za_first_integer; 362 } 363 364 spa_prop_add_list(*nvp, prop, strval, intval, src); 365 366 if (strval != NULL) 367 kmem_free(strval, 368 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 369 370 break; 371 372 case 1: 373 /* string property */ 374 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 375 err = zap_lookup(mos, spa->spa_pool_props_object, 376 za.za_name, 1, za.za_num_integers, strval); 377 if (err) { 378 kmem_free(strval, za.za_num_integers); 379 break; 380 } 381 spa_prop_add_list(*nvp, prop, strval, 0, src); 382 kmem_free(strval, za.za_num_integers); 383 break; 384 385 default: 386 break; 387 } 388 } 389 zap_cursor_fini(&zc); 390 mutex_exit(&spa->spa_props_lock); 391 out: 392 if (err && err != ENOENT) { 393 nvlist_free(*nvp); 394 *nvp = NULL; 395 return (err); 396 } 397 398 return (0); 399 } 400 401 /* 402 * Validate the given pool properties nvlist and modify the list 403 * for the property values to be set. 404 */ 405 static int 406 spa_prop_validate(spa_t *spa, nvlist_t *props) 407 { 408 nvpair_t *elem; 409 int error = 0, reset_bootfs = 0; 410 uint64_t objnum = 0; 411 boolean_t has_feature = B_FALSE; 412 413 elem = NULL; 414 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 415 uint64_t intval; 416 char *strval, *slash, *check, *fname; 417 const char *propname = nvpair_name(elem); 418 zpool_prop_t prop = zpool_name_to_prop(propname); 419 420 switch (prop) { 421 case ZPROP_INVAL: 422 if (!zpool_prop_feature(propname)) { 423 error = SET_ERROR(EINVAL); 424 break; 425 } 426 427 /* 428 * Sanitize the input. 
429 */ 430 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 431 error = SET_ERROR(EINVAL); 432 break; 433 } 434 435 if (nvpair_value_uint64(elem, &intval) != 0) { 436 error = SET_ERROR(EINVAL); 437 break; 438 } 439 440 if (intval != 0) { 441 error = SET_ERROR(EINVAL); 442 break; 443 } 444 445 fname = strchr(propname, '@') + 1; 446 if (zfeature_lookup_name(fname, NULL) != 0) { 447 error = SET_ERROR(EINVAL); 448 break; 449 } 450 451 has_feature = B_TRUE; 452 break; 453 454 case ZPOOL_PROP_VERSION: 455 error = nvpair_value_uint64(elem, &intval); 456 if (!error && 457 (intval < spa_version(spa) || 458 intval > SPA_VERSION_BEFORE_FEATURES || 459 has_feature)) 460 error = SET_ERROR(EINVAL); 461 break; 462 463 case ZPOOL_PROP_DELEGATION: 464 case ZPOOL_PROP_AUTOREPLACE: 465 case ZPOOL_PROP_LISTSNAPS: 466 case ZPOOL_PROP_AUTOEXPAND: 467 error = nvpair_value_uint64(elem, &intval); 468 if (!error && intval > 1) 469 error = SET_ERROR(EINVAL); 470 break; 471 472 case ZPOOL_PROP_BOOTFS: 473 /* 474 * If the pool version is less than SPA_VERSION_BOOTFS, 475 * or the pool is still being created (version == 0), 476 * the bootfs property cannot be set. 477 */ 478 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 479 error = SET_ERROR(ENOTSUP); 480 break; 481 } 482 483 /* 484 * Make sure the vdev config is bootable 485 */ 486 if (!vdev_is_bootable(spa->spa_root_vdev)) { 487 error = SET_ERROR(ENOTSUP); 488 break; 489 } 490 491 reset_bootfs = 1; 492 493 error = nvpair_value_string(elem, &strval); 494 495 if (!error) { 496 objset_t *os; 497 uint64_t propval; 498 499 if (strval == NULL || strval[0] == '\0') { 500 objnum = zpool_prop_default_numeric( 501 ZPOOL_PROP_BOOTFS); 502 break; 503 } 504 505 if (error = dmu_objset_hold(strval, FTAG, &os)) 506 break; 507 508 /* 509 * Must be ZPL, and its property settings 510 * must be supported by GRUB (compression 511 * is not gzip, and large blocks are not used). 512 */ 513 514 if (dmu_objset_type(os) != DMU_OST_ZFS) { 515 error = SET_ERROR(ENOTSUP); 516 } else if ((error = 517 dsl_prop_get_int_ds(dmu_objset_ds(os), 518 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 519 &propval)) == 0 && 520 !BOOTFS_COMPRESS_VALID(propval)) { 521 error = SET_ERROR(ENOTSUP); 522 } else if ((error = 523 dsl_prop_get_int_ds(dmu_objset_ds(os), 524 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), 525 &propval)) == 0 && 526 propval > SPA_OLD_MAXBLOCKSIZE) { 527 error = SET_ERROR(ENOTSUP); 528 } else { 529 objnum = dmu_objset_id(os); 530 } 531 dmu_objset_rele(os, FTAG); 532 } 533 break; 534 535 case ZPOOL_PROP_FAILUREMODE: 536 error = nvpair_value_uint64(elem, &intval); 537 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 538 intval > ZIO_FAILURE_MODE_PANIC)) 539 error = SET_ERROR(EINVAL); 540 541 /* 542 * This is a special case which only occurs when 543 * the pool has completely failed. This allows 544 * the user to change the in-core failmode property 545 * without syncing it out to disk (I/Os might 546 * currently be blocked). We do this by returning 547 * EIO to the caller (spa_prop_set) to trick it 548 * into thinking we encountered a property validation 549 * error. 
550 */ 551 if (!error && spa_suspended(spa)) { 552 spa->spa_failmode = intval; 553 error = SET_ERROR(EIO); 554 } 555 break; 556 557 case ZPOOL_PROP_CACHEFILE: 558 if ((error = nvpair_value_string(elem, &strval)) != 0) 559 break; 560 561 if (strval[0] == '\0') 562 break; 563 564 if (strcmp(strval, "none") == 0) 565 break; 566 567 if (strval[0] != '/') { 568 error = SET_ERROR(EINVAL); 569 break; 570 } 571 572 slash = strrchr(strval, '/'); 573 ASSERT(slash != NULL); 574 575 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 576 strcmp(slash, "/..") == 0) 577 error = SET_ERROR(EINVAL); 578 break; 579 580 case ZPOOL_PROP_COMMENT: 581 if ((error = nvpair_value_string(elem, &strval)) != 0) 582 break; 583 for (check = strval; *check != '\0'; check++) { 584 /* 585 * The kernel doesn't have an easy isprint() 586 * check. For this kernel check, we merely 587 * check ASCII apart from DEL. Fix this if 588 * there is an easy-to-use kernel isprint(). 589 */ 590 if (*check >= 0x7f) { 591 error = SET_ERROR(EINVAL); 592 break; 593 } 594 } 595 if (strlen(strval) > ZPROP_MAX_COMMENT) 596 error = E2BIG; 597 break; 598 599 case ZPOOL_PROP_DEDUPDITTO: 600 if (spa_version(spa) < SPA_VERSION_DEDUP) 601 error = SET_ERROR(ENOTSUP); 602 else 603 error = nvpair_value_uint64(elem, &intval); 604 if (error == 0 && 605 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 606 error = SET_ERROR(EINVAL); 607 break; 608 } 609 610 if (error) 611 break; 612 } 613 614 if (!error && reset_bootfs) { 615 error = nvlist_remove(props, 616 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 617 618 if (!error) { 619 error = nvlist_add_uint64(props, 620 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 621 } 622 } 623 624 return (error); 625 } 626 627 void 628 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 629 { 630 char *cachefile; 631 spa_config_dirent_t *dp; 632 633 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 634 &cachefile) != 0) 635 return; 636 637 dp = kmem_alloc(sizeof (spa_config_dirent_t), 638 KM_SLEEP); 639 640 if (cachefile[0] == '\0') 641 dp->scd_path = spa_strdup(spa_config_path); 642 else if (strcmp(cachefile, "none") == 0) 643 dp->scd_path = NULL; 644 else 645 dp->scd_path = spa_strdup(cachefile); 646 647 list_insert_head(&spa->spa_config_list, dp); 648 if (need_sync) 649 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 650 } 651 652 int 653 spa_prop_set(spa_t *spa, nvlist_t *nvp) 654 { 655 int error; 656 nvpair_t *elem = NULL; 657 boolean_t need_sync = B_FALSE; 658 659 if ((error = spa_prop_validate(spa, nvp)) != 0) 660 return (error); 661 662 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 663 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 664 665 if (prop == ZPOOL_PROP_CACHEFILE || 666 prop == ZPOOL_PROP_ALTROOT || 667 prop == ZPOOL_PROP_READONLY) 668 continue; 669 670 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 671 uint64_t ver; 672 673 if (prop == ZPOOL_PROP_VERSION) { 674 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 675 } else { 676 ASSERT(zpool_prop_feature(nvpair_name(elem))); 677 ver = SPA_VERSION_FEATURES; 678 need_sync = B_TRUE; 679 } 680 681 /* Save time if the version is already set. */ 682 if (ver == spa_version(spa)) 683 continue; 684 685 /* 686 * In addition to the pool directory object, we might 687 * create the pool properties object, the features for 688 * read object, the features for write object, or the 689 * feature descriptions object. 
690 */ 691 error = dsl_sync_task(spa->spa_name, NULL, 692 spa_sync_version, &ver, 693 6, ZFS_SPACE_CHECK_RESERVED); 694 if (error) 695 return (error); 696 continue; 697 } 698 699 need_sync = B_TRUE; 700 break; 701 } 702 703 if (need_sync) { 704 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 705 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 706 } 707 708 return (0); 709 } 710 711 /* 712 * If the bootfs property value is dsobj, clear it. 713 */ 714 void 715 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 716 { 717 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 718 VERIFY(zap_remove(spa->spa_meta_objset, 719 spa->spa_pool_props_object, 720 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 721 spa->spa_bootfs = 0; 722 } 723 } 724 725 /*ARGSUSED*/ 726 static int 727 spa_change_guid_check(void *arg, dmu_tx_t *tx) 728 { 729 uint64_t *newguid = arg; 730 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 731 vdev_t *rvd = spa->spa_root_vdev; 732 uint64_t vdev_state; 733 734 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 735 vdev_state = rvd->vdev_state; 736 spa_config_exit(spa, SCL_STATE, FTAG); 737 738 if (vdev_state != VDEV_STATE_HEALTHY) 739 return (SET_ERROR(ENXIO)); 740 741 ASSERT3U(spa_guid(spa), !=, *newguid); 742 743 return (0); 744 } 745 746 static void 747 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 748 { 749 uint64_t *newguid = arg; 750 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 751 uint64_t oldguid; 752 vdev_t *rvd = spa->spa_root_vdev; 753 754 oldguid = spa_guid(spa); 755 756 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 757 rvd->vdev_guid = *newguid; 758 rvd->vdev_guid_sum += (*newguid - oldguid); 759 vdev_config_dirty(rvd); 760 spa_config_exit(spa, SCL_STATE, FTAG); 761 762 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 763 oldguid, *newguid); 764 } 765 766 /* 767 * Change the GUID for the pool. This is done so that we can later 768 * re-import a pool built from a clone of our own vdevs. We will modify 769 * the root vdev's guid, our own pool guid, and then mark all of our 770 * vdevs dirty. Note that we must make sure that all our vdevs are 771 * online when we do this, or else any vdevs that weren't present 772 * would be orphaned from our pool. We are also going to issue a 773 * sysevent to update any watchers. 
774 */ 775 int 776 spa_change_guid(spa_t *spa) 777 { 778 int error; 779 uint64_t guid; 780 781 mutex_enter(&spa->spa_vdev_top_lock); 782 mutex_enter(&spa_namespace_lock); 783 guid = spa_generate_guid(NULL); 784 785 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 786 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 787 788 if (error == 0) { 789 spa_config_sync(spa, B_FALSE, B_TRUE); 790 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 791 } 792 793 mutex_exit(&spa_namespace_lock); 794 mutex_exit(&spa->spa_vdev_top_lock); 795 796 return (error); 797 } 798 799 /* 800 * ========================================================================== 801 * SPA state manipulation (open/create/destroy/import/export) 802 * ========================================================================== 803 */ 804 805 static int 806 spa_error_entry_compare(const void *a, const void *b) 807 { 808 spa_error_entry_t *sa = (spa_error_entry_t *)a; 809 spa_error_entry_t *sb = (spa_error_entry_t *)b; 810 int ret; 811 812 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 813 sizeof (zbookmark_phys_t)); 814 815 if (ret < 0) 816 return (-1); 817 else if (ret > 0) 818 return (1); 819 else 820 return (0); 821 } 822 823 /* 824 * Utility function which retrieves copies of the current logs and 825 * re-initializes them in the process. 826 */ 827 void 828 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 829 { 830 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 831 832 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 833 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 834 835 avl_create(&spa->spa_errlist_scrub, 836 spa_error_entry_compare, sizeof (spa_error_entry_t), 837 offsetof(spa_error_entry_t, se_avl)); 838 avl_create(&spa->spa_errlist_last, 839 spa_error_entry_compare, sizeof (spa_error_entry_t), 840 offsetof(spa_error_entry_t, se_avl)); 841 } 842 843 static void 844 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 845 { 846 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 847 enum zti_modes mode = ztip->zti_mode; 848 uint_t value = ztip->zti_value; 849 uint_t count = ztip->zti_count; 850 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 851 char name[32]; 852 uint_t flags = 0; 853 boolean_t batch = B_FALSE; 854 855 if (mode == ZTI_MODE_NULL) { 856 tqs->stqs_count = 0; 857 tqs->stqs_taskq = NULL; 858 return; 859 } 860 861 ASSERT3U(count, >, 0); 862 863 tqs->stqs_count = count; 864 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 865 866 switch (mode) { 867 case ZTI_MODE_FIXED: 868 ASSERT3U(value, >=, 1); 869 value = MAX(value, 1); 870 break; 871 872 case ZTI_MODE_BATCH: 873 batch = B_TRUE; 874 flags |= TASKQ_THREADS_CPU_PCT; 875 value = zio_taskq_batch_pct; 876 break; 877 878 default: 879 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 880 "spa_activate()", 881 zio_type_name[t], zio_taskq_types[q], mode, value); 882 break; 883 } 884 885 for (uint_t i = 0; i < count; i++) { 886 taskq_t *tq; 887 888 if (count > 1) { 889 (void) snprintf(name, sizeof (name), "%s_%s_%u", 890 zio_type_name[t], zio_taskq_types[q], i); 891 } else { 892 (void) snprintf(name, sizeof (name), "%s_%s", 893 zio_type_name[t], zio_taskq_types[q]); 894 } 895 896 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 897 if (batch) 898 flags |= TASKQ_DC_BATCH; 899 900 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 901 spa->spa_proc, zio_taskq_basedc, flags); 902 } else { 903 pri_t pri = maxclsyspri; 904 /* 905 * The write issue taskq can be extremely CPU 906 * intensive. 
Run it at slightly lower priority 907 * than the other taskqs. 908 */ 909 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) 910 pri--; 911 912 tq = taskq_create_proc(name, value, pri, 50, 913 INT_MAX, spa->spa_proc, flags); 914 } 915 916 tqs->stqs_taskq[i] = tq; 917 } 918 } 919 920 static void 921 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 922 { 923 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 924 925 if (tqs->stqs_taskq == NULL) { 926 ASSERT0(tqs->stqs_count); 927 return; 928 } 929 930 for (uint_t i = 0; i < tqs->stqs_count; i++) { 931 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 932 taskq_destroy(tqs->stqs_taskq[i]); 933 } 934 935 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 936 tqs->stqs_taskq = NULL; 937 } 938 939 /* 940 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 941 * Note that a type may have multiple discrete taskqs to avoid lock contention 942 * on the taskq itself. In that case we choose which taskq at random by using 943 * the low bits of gethrtime(). 944 */ 945 void 946 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 947 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 948 { 949 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 950 taskq_t *tq; 951 952 ASSERT3P(tqs->stqs_taskq, !=, NULL); 953 ASSERT3U(tqs->stqs_count, !=, 0); 954 955 if (tqs->stqs_count == 1) { 956 tq = tqs->stqs_taskq[0]; 957 } else { 958 tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; 959 } 960 961 taskq_dispatch_ent(tq, func, arg, flags, ent); 962 } 963 964 static void 965 spa_create_zio_taskqs(spa_t *spa) 966 { 967 for (int t = 0; t < ZIO_TYPES; t++) { 968 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 969 spa_taskqs_init(spa, t, q); 970 } 971 } 972 } 973 974 #ifdef _KERNEL 975 static void 976 spa_thread(void *arg) 977 { 978 callb_cpr_t cprinfo; 979 980 spa_t *spa = arg; 981 user_t *pu = PTOU(curproc); 982 983 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 984 spa->spa_name); 985 986 ASSERT(curproc != &p0); 987 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 988 "zpool-%s", spa->spa_name); 989 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 990 991 /* bind this thread to the requested psrset */ 992 if (zio_taskq_psrset_bind != PS_NONE) { 993 pool_lock(); 994 mutex_enter(&cpu_lock); 995 mutex_enter(&pidlock); 996 mutex_enter(&curproc->p_lock); 997 998 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 999 0, NULL, NULL) == 0) { 1000 curthread->t_bind_pset = zio_taskq_psrset_bind; 1001 } else { 1002 cmn_err(CE_WARN, 1003 "Couldn't bind process for zfs pool \"%s\" to " 1004 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1005 } 1006 1007 mutex_exit(&curproc->p_lock); 1008 mutex_exit(&pidlock); 1009 mutex_exit(&cpu_lock); 1010 pool_unlock(); 1011 } 1012 1013 if (zio_taskq_sysdc) { 1014 sysdc_thread_enter(curthread, 100, 0); 1015 } 1016 1017 spa->spa_proc = curproc; 1018 spa->spa_did = curthread->t_did; 1019 1020 spa_create_zio_taskqs(spa); 1021 1022 mutex_enter(&spa->spa_proc_lock); 1023 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1024 1025 spa->spa_proc_state = SPA_PROC_ACTIVE; 1026 cv_broadcast(&spa->spa_proc_cv); 1027 1028 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1029 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1030 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1031 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1032 1033 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1034 spa->spa_proc_state = SPA_PROC_GONE; 1035 spa->spa_proc = &p0; 1036 
cv_broadcast(&spa->spa_proc_cv); 1037 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1038 1039 mutex_enter(&curproc->p_lock); 1040 lwp_exit(); 1041 } 1042 #endif 1043 1044 /* 1045 * Activate an uninitialized pool. 1046 */ 1047 static void 1048 spa_activate(spa_t *spa, int mode) 1049 { 1050 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1051 1052 spa->spa_state = POOL_STATE_ACTIVE; 1053 spa->spa_mode = mode; 1054 1055 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 1056 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 1057 1058 /* Try to create a covering process */ 1059 mutex_enter(&spa->spa_proc_lock); 1060 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1061 ASSERT(spa->spa_proc == &p0); 1062 spa->spa_did = 0; 1063 1064 /* Only create a process if we're going to be around a while. */ 1065 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1066 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1067 NULL, 0) == 0) { 1068 spa->spa_proc_state = SPA_PROC_CREATED; 1069 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1070 cv_wait(&spa->spa_proc_cv, 1071 &spa->spa_proc_lock); 1072 } 1073 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1074 ASSERT(spa->spa_proc != &p0); 1075 ASSERT(spa->spa_did != 0); 1076 } else { 1077 #ifdef _KERNEL 1078 cmn_err(CE_WARN, 1079 "Couldn't create process for zfs pool \"%s\"\n", 1080 spa->spa_name); 1081 #endif 1082 } 1083 } 1084 mutex_exit(&spa->spa_proc_lock); 1085 1086 /* If we didn't create a process, we need to create our taskqs. */ 1087 if (spa->spa_proc == &p0) { 1088 spa_create_zio_taskqs(spa); 1089 } 1090 1091 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1092 offsetof(vdev_t, vdev_config_dirty_node)); 1093 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1094 offsetof(objset_t, os_evicting_node)); 1095 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1096 offsetof(vdev_t, vdev_state_dirty_node)); 1097 1098 txg_list_create(&spa->spa_vdev_txg_list, 1099 offsetof(struct vdev, vdev_txg_node)); 1100 1101 avl_create(&spa->spa_errlist_scrub, 1102 spa_error_entry_compare, sizeof (spa_error_entry_t), 1103 offsetof(spa_error_entry_t, se_avl)); 1104 avl_create(&spa->spa_errlist_last, 1105 spa_error_entry_compare, sizeof (spa_error_entry_t), 1106 offsetof(spa_error_entry_t, se_avl)); 1107 } 1108 1109 /* 1110 * Opposite of spa_activate(). 1111 */ 1112 static void 1113 spa_deactivate(spa_t *spa) 1114 { 1115 ASSERT(spa->spa_sync_on == B_FALSE); 1116 ASSERT(spa->spa_dsl_pool == NULL); 1117 ASSERT(spa->spa_root_vdev == NULL); 1118 ASSERT(spa->spa_async_zio_root == NULL); 1119 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1120 1121 spa_evicting_os_wait(spa); 1122 1123 txg_list_destroy(&spa->spa_vdev_txg_list); 1124 1125 list_destroy(&spa->spa_config_dirty_list); 1126 list_destroy(&spa->spa_evicting_os_list); 1127 list_destroy(&spa->spa_state_dirty_list); 1128 1129 for (int t = 0; t < ZIO_TYPES; t++) { 1130 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1131 spa_taskqs_fini(spa, t, q); 1132 } 1133 } 1134 1135 metaslab_class_destroy(spa->spa_normal_class); 1136 spa->spa_normal_class = NULL; 1137 1138 metaslab_class_destroy(spa->spa_log_class); 1139 spa->spa_log_class = NULL; 1140 1141 /* 1142 * If this was part of an import or the open otherwise failed, we may 1143 * still have errors left in the queues. Empty them just in case. 
1144 */ 1145 spa_errlog_drain(spa); 1146 1147 avl_destroy(&spa->spa_errlist_scrub); 1148 avl_destroy(&spa->spa_errlist_last); 1149 1150 spa->spa_state = POOL_STATE_UNINITIALIZED; 1151 1152 mutex_enter(&spa->spa_proc_lock); 1153 if (spa->spa_proc_state != SPA_PROC_NONE) { 1154 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1155 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1156 cv_broadcast(&spa->spa_proc_cv); 1157 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1158 ASSERT(spa->spa_proc != &p0); 1159 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1160 } 1161 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1162 spa->spa_proc_state = SPA_PROC_NONE; 1163 } 1164 ASSERT(spa->spa_proc == &p0); 1165 mutex_exit(&spa->spa_proc_lock); 1166 1167 /* 1168 * We want to make sure spa_thread() has actually exited the ZFS 1169 * module, so that the module can't be unloaded out from underneath 1170 * it. 1171 */ 1172 if (spa->spa_did != 0) { 1173 thread_join(spa->spa_did); 1174 spa->spa_did = 0; 1175 } 1176 } 1177 1178 /* 1179 * Verify a pool configuration, and construct the vdev tree appropriately. This 1180 * will create all the necessary vdevs in the appropriate layout, with each vdev 1181 * in the CLOSED state. This will prep the pool before open/creation/import. 1182 * All vdev validation is done by the vdev_alloc() routine. 1183 */ 1184 static int 1185 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1186 uint_t id, int atype) 1187 { 1188 nvlist_t **child; 1189 uint_t children; 1190 int error; 1191 1192 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1193 return (error); 1194 1195 if ((*vdp)->vdev_ops->vdev_op_leaf) 1196 return (0); 1197 1198 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1199 &child, &children); 1200 1201 if (error == ENOENT) 1202 return (0); 1203 1204 if (error) { 1205 vdev_free(*vdp); 1206 *vdp = NULL; 1207 return (SET_ERROR(EINVAL)); 1208 } 1209 1210 for (int c = 0; c < children; c++) { 1211 vdev_t *vd; 1212 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1213 atype)) != 0) { 1214 vdev_free(*vdp); 1215 *vdp = NULL; 1216 return (error); 1217 } 1218 } 1219 1220 ASSERT(*vdp != NULL); 1221 1222 return (0); 1223 } 1224 1225 /* 1226 * Opposite of spa_load(). 1227 */ 1228 static void 1229 spa_unload(spa_t *spa) 1230 { 1231 int i; 1232 1233 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1234 1235 /* 1236 * Stop async tasks. 1237 */ 1238 spa_async_suspend(spa); 1239 1240 /* 1241 * Stop syncing. 1242 */ 1243 if (spa->spa_sync_on) { 1244 txg_sync_stop(spa->spa_dsl_pool); 1245 spa->spa_sync_on = B_FALSE; 1246 } 1247 1248 /* 1249 * Wait for any outstanding async I/O to complete. 1250 */ 1251 if (spa->spa_async_zio_root != NULL) { 1252 for (int i = 0; i < max_ncpus; i++) 1253 (void) zio_wait(spa->spa_async_zio_root[i]); 1254 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1255 spa->spa_async_zio_root = NULL; 1256 } 1257 1258 bpobj_close(&spa->spa_deferred_bpobj); 1259 1260 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1261 1262 /* 1263 * Close all vdevs. 1264 */ 1265 if (spa->spa_root_vdev) 1266 vdev_free(spa->spa_root_vdev); 1267 ASSERT(spa->spa_root_vdev == NULL); 1268 1269 /* 1270 * Close the dsl pool. 
1271 */ 1272 if (spa->spa_dsl_pool) { 1273 dsl_pool_close(spa->spa_dsl_pool); 1274 spa->spa_dsl_pool = NULL; 1275 spa->spa_meta_objset = NULL; 1276 } 1277 1278 ddt_unload(spa); 1279 1280 1281 /* 1282 * Drop and purge level 2 cache 1283 */ 1284 spa_l2cache_drop(spa); 1285 1286 for (i = 0; i < spa->spa_spares.sav_count; i++) 1287 vdev_free(spa->spa_spares.sav_vdevs[i]); 1288 if (spa->spa_spares.sav_vdevs) { 1289 kmem_free(spa->spa_spares.sav_vdevs, 1290 spa->spa_spares.sav_count * sizeof (void *)); 1291 spa->spa_spares.sav_vdevs = NULL; 1292 } 1293 if (spa->spa_spares.sav_config) { 1294 nvlist_free(spa->spa_spares.sav_config); 1295 spa->spa_spares.sav_config = NULL; 1296 } 1297 spa->spa_spares.sav_count = 0; 1298 1299 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1300 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1301 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1302 } 1303 if (spa->spa_l2cache.sav_vdevs) { 1304 kmem_free(spa->spa_l2cache.sav_vdevs, 1305 spa->spa_l2cache.sav_count * sizeof (void *)); 1306 spa->spa_l2cache.sav_vdevs = NULL; 1307 } 1308 if (spa->spa_l2cache.sav_config) { 1309 nvlist_free(spa->spa_l2cache.sav_config); 1310 spa->spa_l2cache.sav_config = NULL; 1311 } 1312 spa->spa_l2cache.sav_count = 0; 1313 1314 spa->spa_async_suspended = 0; 1315 1316 if (spa->spa_comment != NULL) { 1317 spa_strfree(spa->spa_comment); 1318 spa->spa_comment = NULL; 1319 } 1320 1321 spa_config_exit(spa, SCL_ALL, FTAG); 1322 } 1323 1324 /* 1325 * Load (or re-load) the current list of vdevs describing the active spares for 1326 * this pool. When this is called, we have some form of basic information in 1327 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1328 * then re-generate a more complete list including status information. 1329 */ 1330 static void 1331 spa_load_spares(spa_t *spa) 1332 { 1333 nvlist_t **spares; 1334 uint_t nspares; 1335 int i; 1336 vdev_t *vd, *tvd; 1337 1338 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1339 1340 /* 1341 * First, close and free any existing spare vdevs. 1342 */ 1343 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1344 vd = spa->spa_spares.sav_vdevs[i]; 1345 1346 /* Undo the call to spa_activate() below */ 1347 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1348 B_FALSE)) != NULL && tvd->vdev_isspare) 1349 spa_spare_remove(tvd); 1350 vdev_close(vd); 1351 vdev_free(vd); 1352 } 1353 1354 if (spa->spa_spares.sav_vdevs) 1355 kmem_free(spa->spa_spares.sav_vdevs, 1356 spa->spa_spares.sav_count * sizeof (void *)); 1357 1358 if (spa->spa_spares.sav_config == NULL) 1359 nspares = 0; 1360 else 1361 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1362 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1363 1364 spa->spa_spares.sav_count = (int)nspares; 1365 spa->spa_spares.sav_vdevs = NULL; 1366 1367 if (nspares == 0) 1368 return; 1369 1370 /* 1371 * Construct the array of vdevs, opening them to get status in the 1372 * process. For each spare, there is potentially two different vdev_t 1373 * structures associated with it: one in the list of spares (used only 1374 * for basic validation purposes) and one in the active vdev 1375 * configuration (if it's spared in). During this phase we open and 1376 * validate each vdev on the spare list. If the vdev also exists in the 1377 * active configuration, then we also mark this vdev as an active spare. 
1378 */ 1379 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1380 KM_SLEEP); 1381 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1382 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1383 VDEV_ALLOC_SPARE) == 0); 1384 ASSERT(vd != NULL); 1385 1386 spa->spa_spares.sav_vdevs[i] = vd; 1387 1388 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1389 B_FALSE)) != NULL) { 1390 if (!tvd->vdev_isspare) 1391 spa_spare_add(tvd); 1392 1393 /* 1394 * We only mark the spare active if we were successfully 1395 * able to load the vdev. Otherwise, importing a pool 1396 * with a bad active spare would result in strange 1397 * behavior, because multiple pool would think the spare 1398 * is actively in use. 1399 * 1400 * There is a vulnerability here to an equally bizarre 1401 * circumstance, where a dead active spare is later 1402 * brought back to life (onlined or otherwise). Given 1403 * the rarity of this scenario, and the extra complexity 1404 * it adds, we ignore the possibility. 1405 */ 1406 if (!vdev_is_dead(tvd)) 1407 spa_spare_activate(tvd); 1408 } 1409 1410 vd->vdev_top = vd; 1411 vd->vdev_aux = &spa->spa_spares; 1412 1413 if (vdev_open(vd) != 0) 1414 continue; 1415 1416 if (vdev_validate_aux(vd) == 0) 1417 spa_spare_add(vd); 1418 } 1419 1420 /* 1421 * Recompute the stashed list of spares, with status information 1422 * this time. 1423 */ 1424 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1425 DATA_TYPE_NVLIST_ARRAY) == 0); 1426 1427 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1428 KM_SLEEP); 1429 for (i = 0; i < spa->spa_spares.sav_count; i++) 1430 spares[i] = vdev_config_generate(spa, 1431 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1432 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1433 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1434 for (i = 0; i < spa->spa_spares.sav_count; i++) 1435 nvlist_free(spares[i]); 1436 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1437 } 1438 1439 /* 1440 * Load (or re-load) the current list of vdevs describing the active l2cache for 1441 * this pool. When this is called, we have some form of basic information in 1442 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1443 * then re-generate a more complete list including status information. 1444 * Devices which are already active have their details maintained, and are 1445 * not re-opened. 1446 */ 1447 static void 1448 spa_load_l2cache(spa_t *spa) 1449 { 1450 nvlist_t **l2cache; 1451 uint_t nl2cache; 1452 int i, j, oldnvdevs; 1453 uint64_t guid; 1454 vdev_t *vd, **oldvdevs, **newvdevs; 1455 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1456 1457 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1458 1459 if (sav->sav_config != NULL) { 1460 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1461 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1462 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1463 } else { 1464 nl2cache = 0; 1465 newvdevs = NULL; 1466 } 1467 1468 oldvdevs = sav->sav_vdevs; 1469 oldnvdevs = sav->sav_count; 1470 sav->sav_vdevs = NULL; 1471 sav->sav_count = 0; 1472 1473 /* 1474 * Process new nvlist of vdevs. 
1475 */ 1476 for (i = 0; i < nl2cache; i++) { 1477 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1478 &guid) == 0); 1479 1480 newvdevs[i] = NULL; 1481 for (j = 0; j < oldnvdevs; j++) { 1482 vd = oldvdevs[j]; 1483 if (vd != NULL && guid == vd->vdev_guid) { 1484 /* 1485 * Retain previous vdev for add/remove ops. 1486 */ 1487 newvdevs[i] = vd; 1488 oldvdevs[j] = NULL; 1489 break; 1490 } 1491 } 1492 1493 if (newvdevs[i] == NULL) { 1494 /* 1495 * Create new vdev 1496 */ 1497 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1498 VDEV_ALLOC_L2CACHE) == 0); 1499 ASSERT(vd != NULL); 1500 newvdevs[i] = vd; 1501 1502 /* 1503 * Commit this vdev as an l2cache device, 1504 * even if it fails to open. 1505 */ 1506 spa_l2cache_add(vd); 1507 1508 vd->vdev_top = vd; 1509 vd->vdev_aux = sav; 1510 1511 spa_l2cache_activate(vd); 1512 1513 if (vdev_open(vd) != 0) 1514 continue; 1515 1516 (void) vdev_validate_aux(vd); 1517 1518 if (!vdev_is_dead(vd)) 1519 l2arc_add_vdev(spa, vd); 1520 } 1521 } 1522 1523 /* 1524 * Purge vdevs that were dropped 1525 */ 1526 for (i = 0; i < oldnvdevs; i++) { 1527 uint64_t pool; 1528 1529 vd = oldvdevs[i]; 1530 if (vd != NULL) { 1531 ASSERT(vd->vdev_isl2cache); 1532 1533 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1534 pool != 0ULL && l2arc_vdev_present(vd)) 1535 l2arc_remove_vdev(vd); 1536 vdev_clear_stats(vd); 1537 vdev_free(vd); 1538 } 1539 } 1540 1541 if (oldvdevs) 1542 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1543 1544 if (sav->sav_config == NULL) 1545 goto out; 1546 1547 sav->sav_vdevs = newvdevs; 1548 sav->sav_count = (int)nl2cache; 1549 1550 /* 1551 * Recompute the stashed list of l2cache devices, with status 1552 * information this time. 1553 */ 1554 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1555 DATA_TYPE_NVLIST_ARRAY) == 0); 1556 1557 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1558 for (i = 0; i < sav->sav_count; i++) 1559 l2cache[i] = vdev_config_generate(spa, 1560 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1561 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1562 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1563 out: 1564 for (i = 0; i < sav->sav_count; i++) 1565 nvlist_free(l2cache[i]); 1566 if (sav->sav_count) 1567 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1568 } 1569 1570 static int 1571 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1572 { 1573 dmu_buf_t *db; 1574 char *packed = NULL; 1575 size_t nvsize = 0; 1576 int error; 1577 *value = NULL; 1578 1579 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1580 if (error != 0) 1581 return (error); 1582 1583 nvsize = *(uint64_t *)db->db_data; 1584 dmu_buf_rele(db, FTAG); 1585 1586 packed = kmem_alloc(nvsize, KM_SLEEP); 1587 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1588 DMU_READ_PREFETCH); 1589 if (error == 0) 1590 error = nvlist_unpack(packed, nvsize, value, 0); 1591 kmem_free(packed, nvsize); 1592 1593 return (error); 1594 } 1595 1596 /* 1597 * Checks to see if the given vdev could not be opened, in which case we post a 1598 * sysevent to notify the autoreplace code that the device has been removed. 
1599 */ 1600 static void 1601 spa_check_removed(vdev_t *vd) 1602 { 1603 for (int c = 0; c < vd->vdev_children; c++) 1604 spa_check_removed(vd->vdev_child[c]); 1605 1606 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1607 !vd->vdev_ishole) { 1608 zfs_post_autoreplace(vd->vdev_spa, vd); 1609 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1610 } 1611 } 1612 1613 /* 1614 * Validate the current config against the MOS config 1615 */ 1616 static boolean_t 1617 spa_config_valid(spa_t *spa, nvlist_t *config) 1618 { 1619 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1620 nvlist_t *nv; 1621 1622 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1623 1624 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1625 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1626 1627 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1628 1629 /* 1630 * If we're doing a normal import, then build up any additional 1631 * diagnostic information about missing devices in this config. 1632 * We'll pass this up to the user for further processing. 1633 */ 1634 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1635 nvlist_t **child, *nv; 1636 uint64_t idx = 0; 1637 1638 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1639 KM_SLEEP); 1640 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1641 1642 for (int c = 0; c < rvd->vdev_children; c++) { 1643 vdev_t *tvd = rvd->vdev_child[c]; 1644 vdev_t *mtvd = mrvd->vdev_child[c]; 1645 1646 if (tvd->vdev_ops == &vdev_missing_ops && 1647 mtvd->vdev_ops != &vdev_missing_ops && 1648 mtvd->vdev_islog) 1649 child[idx++] = vdev_config_generate(spa, mtvd, 1650 B_FALSE, 0); 1651 } 1652 1653 if (idx) { 1654 VERIFY(nvlist_add_nvlist_array(nv, 1655 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1656 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1657 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1658 1659 for (int i = 0; i < idx; i++) 1660 nvlist_free(child[i]); 1661 } 1662 nvlist_free(nv); 1663 kmem_free(child, rvd->vdev_children * sizeof (char **)); 1664 } 1665 1666 /* 1667 * Compare the root vdev tree with the information we have 1668 * from the MOS config (mrvd). Check each top-level vdev 1669 * with the corresponding MOS config top-level (mtvd). 1670 */ 1671 for (int c = 0; c < rvd->vdev_children; c++) { 1672 vdev_t *tvd = rvd->vdev_child[c]; 1673 vdev_t *mtvd = mrvd->vdev_child[c]; 1674 1675 /* 1676 * Resolve any "missing" vdevs in the current configuration. 1677 * If we find that the MOS config has more accurate information 1678 * about the top-level vdev then use that vdev instead. 1679 */ 1680 if (tvd->vdev_ops == &vdev_missing_ops && 1681 mtvd->vdev_ops != &vdev_missing_ops) { 1682 1683 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1684 continue; 1685 1686 /* 1687 * Device specific actions. 1688 */ 1689 if (mtvd->vdev_islog) { 1690 spa_set_log_state(spa, SPA_LOG_CLEAR); 1691 } else { 1692 /* 1693 * XXX - once we have 'readonly' pool 1694 * support we should be able to handle 1695 * missing data devices by transitioning 1696 * the pool to readonly. 1697 */ 1698 continue; 1699 } 1700 1701 /* 1702 * Swap the missing vdev with the data we were 1703 * able to obtain from the MOS config. 
1704 */ 1705 vdev_remove_child(rvd, tvd); 1706 vdev_remove_child(mrvd, mtvd); 1707 1708 vdev_add_child(rvd, mtvd); 1709 vdev_add_child(mrvd, tvd); 1710 1711 spa_config_exit(spa, SCL_ALL, FTAG); 1712 vdev_load(mtvd); 1713 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1714 1715 vdev_reopen(rvd); 1716 } else if (mtvd->vdev_islog) { 1717 /* 1718 * Load the slog device's state from the MOS config 1719 * since it's possible that the label does not 1720 * contain the most up-to-date information. 1721 */ 1722 vdev_load_log_state(tvd, mtvd); 1723 vdev_reopen(tvd); 1724 } 1725 } 1726 vdev_free(mrvd); 1727 spa_config_exit(spa, SCL_ALL, FTAG); 1728 1729 /* 1730 * Ensure we were able to validate the config. 1731 */ 1732 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1733 } 1734 1735 /* 1736 * Check for missing log devices 1737 */ 1738 static boolean_t 1739 spa_check_logs(spa_t *spa) 1740 { 1741 boolean_t rv = B_FALSE; 1742 dsl_pool_t *dp = spa_get_dsl(spa); 1743 1744 switch (spa->spa_log_state) { 1745 case SPA_LOG_MISSING: 1746 /* need to recheck in case slog has been restored */ 1747 case SPA_LOG_UNKNOWN: 1748 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1749 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 1750 if (rv) 1751 spa_set_log_state(spa, SPA_LOG_MISSING); 1752 break; 1753 } 1754 return (rv); 1755 } 1756 1757 static boolean_t 1758 spa_passivate_log(spa_t *spa) 1759 { 1760 vdev_t *rvd = spa->spa_root_vdev; 1761 boolean_t slog_found = B_FALSE; 1762 1763 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1764 1765 if (!spa_has_slogs(spa)) 1766 return (B_FALSE); 1767 1768 for (int c = 0; c < rvd->vdev_children; c++) { 1769 vdev_t *tvd = rvd->vdev_child[c]; 1770 metaslab_group_t *mg = tvd->vdev_mg; 1771 1772 if (tvd->vdev_islog) { 1773 metaslab_group_passivate(mg); 1774 slog_found = B_TRUE; 1775 } 1776 } 1777 1778 return (slog_found); 1779 } 1780 1781 static void 1782 spa_activate_log(spa_t *spa) 1783 { 1784 vdev_t *rvd = spa->spa_root_vdev; 1785 1786 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1787 1788 for (int c = 0; c < rvd->vdev_children; c++) { 1789 vdev_t *tvd = rvd->vdev_child[c]; 1790 metaslab_group_t *mg = tvd->vdev_mg; 1791 1792 if (tvd->vdev_islog) 1793 metaslab_group_activate(mg); 1794 } 1795 } 1796 1797 int 1798 spa_offline_log(spa_t *spa) 1799 { 1800 int error; 1801 1802 error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1803 NULL, DS_FIND_CHILDREN); 1804 if (error == 0) { 1805 /* 1806 * We successfully offlined the log device, sync out the 1807 * current txg so that the "stubby" block can be removed 1808 * by zil_sync(). 
1809 */ 1810 txg_wait_synced(spa->spa_dsl_pool, 0); 1811 } 1812 return (error); 1813 } 1814 1815 static void 1816 spa_aux_check_removed(spa_aux_vdev_t *sav) 1817 { 1818 for (int i = 0; i < sav->sav_count; i++) 1819 spa_check_removed(sav->sav_vdevs[i]); 1820 } 1821 1822 void 1823 spa_claim_notify(zio_t *zio) 1824 { 1825 spa_t *spa = zio->io_spa; 1826 1827 if (zio->io_error) 1828 return; 1829 1830 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1831 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1832 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1833 mutex_exit(&spa->spa_props_lock); 1834 } 1835 1836 typedef struct spa_load_error { 1837 uint64_t sle_meta_count; 1838 uint64_t sle_data_count; 1839 } spa_load_error_t; 1840 1841 static void 1842 spa_load_verify_done(zio_t *zio) 1843 { 1844 blkptr_t *bp = zio->io_bp; 1845 spa_load_error_t *sle = zio->io_private; 1846 dmu_object_type_t type = BP_GET_TYPE(bp); 1847 int error = zio->io_error; 1848 spa_t *spa = zio->io_spa; 1849 1850 if (error) { 1851 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1852 type != DMU_OT_INTENT_LOG) 1853 atomic_inc_64(&sle->sle_meta_count); 1854 else 1855 atomic_inc_64(&sle->sle_data_count); 1856 } 1857 zio_data_buf_free(zio->io_data, zio->io_size); 1858 1859 mutex_enter(&spa->spa_scrub_lock); 1860 spa->spa_scrub_inflight--; 1861 cv_broadcast(&spa->spa_scrub_io_cv); 1862 mutex_exit(&spa->spa_scrub_lock); 1863 } 1864 1865 /* 1866 * Maximum number of concurrent scrub i/os to create while verifying 1867 * a pool while importing it. 1868 */ 1869 int spa_load_verify_maxinflight = 10000; 1870 boolean_t spa_load_verify_metadata = B_TRUE; 1871 boolean_t spa_load_verify_data = B_TRUE; 1872 1873 /*ARGSUSED*/ 1874 static int 1875 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1876 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 1877 { 1878 if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 1879 return (0); 1880 /* 1881 * Note: normally this routine will not be called if 1882 * spa_load_verify_metadata is not set. However, it may be useful 1883 * to manually set the flag after the traversal has begun. 
1884 */ 1885 if (!spa_load_verify_metadata) 1886 return (0); 1887 if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) 1888 return (0); 1889 1890 zio_t *rio = arg; 1891 size_t size = BP_GET_PSIZE(bp); 1892 void *data = zio_data_buf_alloc(size); 1893 1894 mutex_enter(&spa->spa_scrub_lock); 1895 while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) 1896 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1897 spa->spa_scrub_inflight++; 1898 mutex_exit(&spa->spa_scrub_lock); 1899 1900 zio_nowait(zio_read(rio, spa, bp, data, size, 1901 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1902 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1903 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1904 return (0); 1905 } 1906 1907 static int 1908 spa_load_verify(spa_t *spa) 1909 { 1910 zio_t *rio; 1911 spa_load_error_t sle = { 0 }; 1912 zpool_rewind_policy_t policy; 1913 boolean_t verify_ok = B_FALSE; 1914 int error = 0; 1915 1916 zpool_get_rewind_policy(spa->spa_config, &policy); 1917 1918 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1919 return (0); 1920 1921 rio = zio_root(spa, NULL, &sle, 1922 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1923 1924 if (spa_load_verify_metadata) { 1925 error = traverse_pool(spa, spa->spa_verify_min_txg, 1926 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 1927 spa_load_verify_cb, rio); 1928 } 1929 1930 (void) zio_wait(rio); 1931 1932 spa->spa_load_meta_errors = sle.sle_meta_count; 1933 spa->spa_load_data_errors = sle.sle_data_count; 1934 1935 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1936 sle.sle_data_count <= policy.zrp_maxdata) { 1937 int64_t loss = 0; 1938 1939 verify_ok = B_TRUE; 1940 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1941 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1942 1943 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1944 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1945 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1946 VERIFY(nvlist_add_int64(spa->spa_load_info, 1947 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1948 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1949 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1950 } else { 1951 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1952 } 1953 1954 if (error) { 1955 if (error != ENXIO && error != EIO) 1956 error = SET_ERROR(EIO); 1957 return (error); 1958 } 1959 1960 return (verify_ok ? 0 : EIO); 1961 } 1962 1963 /* 1964 * Find a value in the pool props object. 1965 */ 1966 static void 1967 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1968 { 1969 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1970 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1971 } 1972 1973 /* 1974 * Find a value in the pool directory object. 1975 */ 1976 static int 1977 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1978 { 1979 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1980 name, sizeof (uint64_t), 1, val)); 1981 } 1982 1983 static int 1984 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1985 { 1986 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1987 return (err); 1988 } 1989 1990 /* 1991 * Fix up config after a partly-completed split. This is done with the 1992 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1993 * pool have that entry in their config, but only the splitting one contains 1994 * a list of all the guids of the vdevs that are being split off. 
1995 * 1996 * This function determines what to do with that list: either rejoin 1997 * all the disks to the pool, or complete the splitting process. To attempt 1998 * the rejoin, each disk that is offlined is marked online again, and 1999 * we do a reopen() call. If the vdev label for every disk that was 2000 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2001 * then we call vdev_split() on each disk, and complete the split. 2002 * 2003 * Otherwise we leave the config alone, with all the vdevs in place in 2004 * the original pool. 2005 */ 2006 static void 2007 spa_try_repair(spa_t *spa, nvlist_t *config) 2008 { 2009 uint_t extracted; 2010 uint64_t *glist; 2011 uint_t i, gcount; 2012 nvlist_t *nvl; 2013 vdev_t **vd; 2014 boolean_t attempt_reopen; 2015 2016 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2017 return; 2018 2019 /* check that the config is complete */ 2020 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2021 &glist, &gcount) != 0) 2022 return; 2023 2024 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2025 2026 /* attempt to online all the vdevs & validate */ 2027 attempt_reopen = B_TRUE; 2028 for (i = 0; i < gcount; i++) { 2029 if (glist[i] == 0) /* vdev is hole */ 2030 continue; 2031 2032 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2033 if (vd[i] == NULL) { 2034 /* 2035 * Don't bother attempting to reopen the disks; 2036 * just do the split. 2037 */ 2038 attempt_reopen = B_FALSE; 2039 } else { 2040 /* attempt to re-online it */ 2041 vd[i]->vdev_offline = B_FALSE; 2042 } 2043 } 2044 2045 if (attempt_reopen) { 2046 vdev_reopen(spa->spa_root_vdev); 2047 2048 /* check each device to see what state it's in */ 2049 for (extracted = 0, i = 0; i < gcount; i++) { 2050 if (vd[i] != NULL && 2051 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2052 break; 2053 ++extracted; 2054 } 2055 } 2056 2057 /* 2058 * If every disk has been moved to the new pool, or if we never 2059 * even attempted to look at them, then we split them off for 2060 * good. 2061 */ 2062 if (!attempt_reopen || gcount == extracted) { 2063 for (i = 0; i < gcount; i++) 2064 if (vd[i] != NULL) 2065 vdev_split(vd[i]); 2066 vdev_reopen(spa->spa_root_vdev); 2067 } 2068 2069 kmem_free(vd, gcount * sizeof (vdev_t *)); 2070 } 2071 2072 static int 2073 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2074 boolean_t mosconfig) 2075 { 2076 nvlist_t *config = spa->spa_config; 2077 char *ereport = FM_EREPORT_ZFS_POOL; 2078 char *comment; 2079 int error; 2080 uint64_t pool_guid; 2081 nvlist_t *nvl; 2082 2083 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2084 return (SET_ERROR(EINVAL)); 2085 2086 ASSERT(spa->spa_comment == NULL); 2087 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2088 spa->spa_comment = spa_strdup(comment); 2089 2090 /* 2091 * Versioning wasn't explicitly added to the label until later, so if 2092 * it's not present treat it as the initial version. 
2093 */ 2094 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2095 &spa->spa_ubsync.ub_version) != 0) 2096 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2097 2098 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2099 &spa->spa_config_txg); 2100 2101 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2102 spa_guid_exists(pool_guid, 0)) { 2103 error = SET_ERROR(EEXIST); 2104 } else { 2105 spa->spa_config_guid = pool_guid; 2106 2107 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2108 &nvl) == 0) { 2109 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2110 KM_SLEEP) == 0); 2111 } 2112 2113 nvlist_free(spa->spa_load_info); 2114 spa->spa_load_info = fnvlist_alloc(); 2115 2116 gethrestime(&spa->spa_loaded_ts); 2117 error = spa_load_impl(spa, pool_guid, config, state, type, 2118 mosconfig, &ereport); 2119 } 2120 2121 /* 2122 * Don't count references from objsets that are already closed 2123 * and are making their way through the eviction process. 2124 */ 2125 spa_evicting_os_wait(spa); 2126 spa->spa_minref = refcount_count(&spa->spa_refcount); 2127 if (error) { 2128 if (error != EEXIST) { 2129 spa->spa_loaded_ts.tv_sec = 0; 2130 spa->spa_loaded_ts.tv_nsec = 0; 2131 } 2132 if (error != EBADF) { 2133 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2134 } 2135 } 2136 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2137 spa->spa_ena = 0; 2138 2139 return (error); 2140 } 2141 2142 /* 2143 * Load an existing storage pool, using the pool's builtin spa_config as a 2144 * source of configuration information. 2145 */ 2146 static int 2147 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2148 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2149 char **ereport) 2150 { 2151 int error = 0; 2152 nvlist_t *nvroot = NULL; 2153 nvlist_t *label; 2154 vdev_t *rvd; 2155 uberblock_t *ub = &spa->spa_uberblock; 2156 uint64_t children, config_cache_txg = spa->spa_config_txg; 2157 int orig_mode = spa->spa_mode; 2158 int parse; 2159 uint64_t obj; 2160 boolean_t missing_feat_write = B_FALSE; 2161 2162 /* 2163 * If this is an untrusted config, access the pool in read-only mode. 2164 * This prevents things like resilvering recently removed devices. 2165 */ 2166 if (!mosconfig) 2167 spa->spa_mode = FREAD; 2168 2169 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2170 2171 spa->spa_load_state = state; 2172 2173 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2174 return (SET_ERROR(EINVAL)); 2175 2176 parse = (type == SPA_IMPORT_EXISTING ? 2177 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2178 2179 /* 2180 * Create "The Godfather" zio to hold all async IOs 2181 */ 2182 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2183 KM_SLEEP); 2184 for (int i = 0; i < max_ncpus; i++) { 2185 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2186 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2187 ZIO_FLAG_GODFATHER); 2188 } 2189 2190 /* 2191 * Parse the configuration into a vdev tree. We explicitly set the 2192 * value that will be returned by spa_version() since parsing the 2193 * configuration requires knowing the version number. 
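 * (That value is the version established from the config, or defaulted to
 * SPA_VERSION_INITIAL, near the top of spa_load() above.)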
2194 */ 2195 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2196 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2197 spa_config_exit(spa, SCL_ALL, FTAG); 2198 2199 if (error != 0) 2200 return (error); 2201 2202 ASSERT(spa->spa_root_vdev == rvd); 2203 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2204 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2205 2206 if (type != SPA_IMPORT_ASSEMBLE) { 2207 ASSERT(spa_guid(spa) == pool_guid); 2208 } 2209 2210 /* 2211 * Try to open all vdevs, loading each label in the process. 2212 */ 2213 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2214 error = vdev_open(rvd); 2215 spa_config_exit(spa, SCL_ALL, FTAG); 2216 if (error != 0) 2217 return (error); 2218 2219 /* 2220 * We need to validate the vdev labels against the configuration that 2221 * we have in hand, which is dependent on the setting of mosconfig. If 2222 * mosconfig is true then we're validating the vdev labels based on 2223 * that config. Otherwise, we're validating against the cached config 2224 * (zpool.cache) that was read when we loaded the zfs module, and then 2225 * later we will recursively call spa_load() and validate against 2226 * the vdev config. 2227 * 2228 * If we're assembling a new pool that's been split off from an 2229 * existing pool, the labels haven't yet been updated so we skip 2230 * validation for now. 2231 */ 2232 if (type != SPA_IMPORT_ASSEMBLE) { 2233 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2234 error = vdev_validate(rvd, mosconfig); 2235 spa_config_exit(spa, SCL_ALL, FTAG); 2236 2237 if (error != 0) 2238 return (error); 2239 2240 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2241 return (SET_ERROR(ENXIO)); 2242 } 2243 2244 /* 2245 * Find the best uberblock. 2246 */ 2247 vdev_uberblock_load(rvd, ub, &label); 2248 2249 /* 2250 * If we weren't able to find a single valid uberblock, return failure. 2251 */ 2252 if (ub->ub_txg == 0) { 2253 nvlist_free(label); 2254 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2255 } 2256 2257 /* 2258 * If the pool has an unsupported version we can't open it. 2259 */ 2260 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2261 nvlist_free(label); 2262 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2263 } 2264 2265 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2266 nvlist_t *features; 2267 2268 /* 2269 * If we weren't able to find what's necessary for reading the 2270 * MOS in the label, return failure. 2271 */ 2272 if (label == NULL || nvlist_lookup_nvlist(label, 2273 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2274 nvlist_free(label); 2275 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2276 ENXIO)); 2277 } 2278 2279 /* 2280 * Update our in-core representation with the definitive values 2281 * from the label. 2282 */ 2283 nvlist_free(spa->spa_label_features); 2284 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2285 } 2286 2287 nvlist_free(label); 2288 2289 /* 2290 * Look through entries in the label nvlist's features_for_read. If 2291 * there is a feature listed there which we don't understand then we 2292 * cannot open a pool. 
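 * (Each entry is keyed by a feature guid string, e.g. something of the form
 * "com.example:feature_name"; only the name is consulted, via
 * zfeature_is_supported(), and unknown names are recorded in spa_load_info
 * under ZPOOL_CONFIG_UNSUP_FEAT so userland can report them.)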
2293 */ 2294 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2295 nvlist_t *unsup_feat; 2296 2297 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2298 0); 2299 2300 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2301 NULL); nvp != NULL; 2302 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2303 if (!zfeature_is_supported(nvpair_name(nvp))) { 2304 VERIFY(nvlist_add_string(unsup_feat, 2305 nvpair_name(nvp), "") == 0); 2306 } 2307 } 2308 2309 if (!nvlist_empty(unsup_feat)) { 2310 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2311 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2312 nvlist_free(unsup_feat); 2313 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2314 ENOTSUP)); 2315 } 2316 2317 nvlist_free(unsup_feat); 2318 } 2319 2320 /* 2321 * If the vdev guid sum doesn't match the uberblock, we have an 2322 * incomplete configuration. We first check to see if the pool 2323 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2324 * If it is, defer the vdev_guid_sum check till later so we 2325 * can handle missing vdevs. 2326 */ 2327 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2328 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2329 rvd->vdev_guid_sum != ub->ub_guid_sum) 2330 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2331 2332 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2333 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2334 spa_try_repair(spa, config); 2335 spa_config_exit(spa, SCL_ALL, FTAG); 2336 nvlist_free(spa->spa_config_splitting); 2337 spa->spa_config_splitting = NULL; 2338 } 2339 2340 /* 2341 * Initialize internal SPA structures. 2342 */ 2343 spa->spa_state = POOL_STATE_ACTIVE; 2344 spa->spa_ubsync = spa->spa_uberblock; 2345 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2346 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2347 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2348 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2349 spa->spa_claim_max_txg = spa->spa_first_txg; 2350 spa->spa_prev_software_version = ub->ub_software_version; 2351 2352 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2353 if (error) 2354 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2355 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2356 2357 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2358 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2359 2360 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2361 boolean_t missing_feat_read = B_FALSE; 2362 nvlist_t *unsup_feat, *enabled_feat; 2363 2364 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2365 &spa->spa_feat_for_read_obj) != 0) { 2366 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2367 } 2368 2369 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2370 &spa->spa_feat_for_write_obj) != 0) { 2371 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2372 } 2373 2374 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2375 &spa->spa_feat_desc_obj) != 0) { 2376 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2377 } 2378 2379 enabled_feat = fnvlist_alloc(); 2380 unsup_feat = fnvlist_alloc(); 2381 2382 if (!spa_features_check(spa, B_FALSE, 2383 unsup_feat, enabled_feat)) 2384 missing_feat_read = B_TRUE; 2385 2386 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2387 if (!spa_features_check(spa, B_TRUE, 2388 unsup_feat, enabled_feat)) { 2389 missing_feat_write = B_TRUE; 2390 } 2391 } 2392 2393 fnvlist_add_nvlist(spa->spa_load_info, 2394 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2395 2396 if (!nvlist_empty(unsup_feat)) { 2397 fnvlist_add_nvlist(spa->spa_load_info, 2398 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2399 } 2400 2401 fnvlist_free(enabled_feat); 2402 fnvlist_free(unsup_feat); 2403 2404 if (!missing_feat_read) { 2405 fnvlist_add_boolean(spa->spa_load_info, 2406 ZPOOL_CONFIG_CAN_RDONLY); 2407 } 2408 2409 /* 2410 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2411 * twofold: to determine whether the pool is available for 2412 * import in read-write mode and (if it is not) whether the 2413 * pool is available for import in read-only mode. If the pool 2414 * is available for import in read-write mode, it is displayed 2415 * as available in userland; if it is not available for import 2416 * in read-only mode, it is displayed as unavailable in 2417 * userland. If the pool is available for import in read-only 2418 * mode but not read-write mode, it is displayed as unavailable 2419 * in userland with a special note that the pool is actually 2420 * available for open in read-only mode. 2421 * 2422 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2423 * missing a feature for write, we must first determine whether 2424 * the pool can be opened read-only before returning to 2425 * userland in order to know whether to display the 2426 * abovementioned note. 2427 */ 2428 if (missing_feat_read || (missing_feat_write && 2429 spa_writeable(spa))) { 2430 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2431 ENOTSUP)); 2432 } 2433 2434 /* 2435 * Load refcounts for ZFS features from disk into an in-memory 2436 * cache during SPA initialization. 
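 * (Presumably this spares later spa_feature_is_active()-style queries a ZAP
 * lookup; note that a feature whose on-disk refcount cannot be found, the
 * ENOTSUP case below, is simply cached as SPA_FEATURE_DISABLED.)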
2437 */ 2438 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2439 uint64_t refcount; 2440 2441 error = feature_get_refcount_from_disk(spa, 2442 &spa_feature_table[i], &refcount); 2443 if (error == 0) { 2444 spa->spa_feat_refcount_cache[i] = refcount; 2445 } else if (error == ENOTSUP) { 2446 spa->spa_feat_refcount_cache[i] = 2447 SPA_FEATURE_DISABLED; 2448 } else { 2449 return (spa_vdev_err(rvd, 2450 VDEV_AUX_CORRUPT_DATA, EIO)); 2451 } 2452 } 2453 } 2454 2455 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2456 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2457 &spa->spa_feat_enabled_txg_obj) != 0) 2458 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2459 } 2460 2461 spa->spa_is_initializing = B_TRUE; 2462 error = dsl_pool_open(spa->spa_dsl_pool); 2463 spa->spa_is_initializing = B_FALSE; 2464 if (error != 0) 2465 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2466 2467 if (!mosconfig) { 2468 uint64_t hostid; 2469 nvlist_t *policy = NULL, *nvconfig; 2470 2471 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2472 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2473 2474 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2475 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2476 char *hostname; 2477 unsigned long myhostid = 0; 2478 2479 VERIFY(nvlist_lookup_string(nvconfig, 2480 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2481 2482 #ifdef _KERNEL 2483 myhostid = zone_get_hostid(NULL); 2484 #else /* _KERNEL */ 2485 /* 2486 * We're emulating the system's hostid in userland, so 2487 * we can't use zone_get_hostid(). 2488 */ 2489 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2490 #endif /* _KERNEL */ 2491 if (hostid != 0 && myhostid != 0 && 2492 hostid != myhostid) { 2493 nvlist_free(nvconfig); 2494 cmn_err(CE_WARN, "pool '%s' could not be " 2495 "loaded as it was last accessed by " 2496 "another system (host: %s hostid: 0x%lx). " 2497 "See: http://illumos.org/msg/ZFS-8000-EY", 2498 spa_name(spa), hostname, 2499 (unsigned long)hostid); 2500 return (SET_ERROR(EBADF)); 2501 } 2502 } 2503 if (nvlist_lookup_nvlist(spa->spa_config, 2504 ZPOOL_REWIND_POLICY, &policy) == 0) 2505 VERIFY(nvlist_add_nvlist(nvconfig, 2506 ZPOOL_REWIND_POLICY, policy) == 0); 2507 2508 spa_config_set(spa, nvconfig); 2509 spa_unload(spa); 2510 spa_deactivate(spa); 2511 spa_activate(spa, orig_mode); 2512 2513 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2514 } 2515 2516 /* Grab the secret checksum salt from the MOS. */ 2517 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2518 DMU_POOL_CHECKSUM_SALT, 1, 2519 sizeof (spa->spa_cksum_salt.zcs_bytes), 2520 spa->spa_cksum_salt.zcs_bytes); 2521 if (error == ENOENT) { 2522 /* Generate a new salt for subsequent use */ 2523 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2524 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2525 } else if (error != 0) { 2526 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2527 } 2528 2529 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2530 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2531 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2532 if (error != 0) 2533 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2534 2535 /* 2536 * Load the bit that tells us to use the new accounting function 2537 * (raid-z deflation). If we have an older pool, this will not 2538 * be present. 
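 * (Background, stated loosely: "deflation" normalizes raid-z space
 * accounting so that reported allocations do not swing with parity
 * overhead; only the presence of the DMU_POOL_DEFLATE entry matters to the
 * code here.)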
2539 */ 2540 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2541 if (error != 0 && error != ENOENT) 2542 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2543 2544 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2545 &spa->spa_creation_version); 2546 if (error != 0 && error != ENOENT) 2547 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2548 2549 /* 2550 * Load the persistent error log. If we have an older pool, this will 2551 * not be present. 2552 */ 2553 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2554 if (error != 0 && error != ENOENT) 2555 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2556 2557 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2558 &spa->spa_errlog_scrub); 2559 if (error != 0 && error != ENOENT) 2560 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2561 2562 /* 2563 * Load the history object. If we have an older pool, this 2564 * will not be present. 2565 */ 2566 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2567 if (error != 0 && error != ENOENT) 2568 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2569 2570 /* 2571 * If we're assembling the pool from the split-off vdevs of 2572 * an existing pool, we don't want to attach the spares & cache 2573 * devices. 2574 */ 2575 2576 /* 2577 * Load any hot spares for this pool. 2578 */ 2579 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2580 if (error != 0 && error != ENOENT) 2581 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2582 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2583 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2584 if (load_nvlist(spa, spa->spa_spares.sav_object, 2585 &spa->spa_spares.sav_config) != 0) 2586 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2587 2588 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2589 spa_load_spares(spa); 2590 spa_config_exit(spa, SCL_ALL, FTAG); 2591 } else if (error == 0) { 2592 spa->spa_spares.sav_sync = B_TRUE; 2593 } 2594 2595 /* 2596 * Load any level 2 ARC devices for this pool. 
2597 */ 2598 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2599 &spa->spa_l2cache.sav_object); 2600 if (error != 0 && error != ENOENT) 2601 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2602 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2603 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2604 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2605 &spa->spa_l2cache.sav_config) != 0) 2606 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2607 2608 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2609 spa_load_l2cache(spa); 2610 spa_config_exit(spa, SCL_ALL, FTAG); 2611 } else if (error == 0) { 2612 spa->spa_l2cache.sav_sync = B_TRUE; 2613 } 2614 2615 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2616 2617 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2618 if (error && error != ENOENT) 2619 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2620 2621 if (error == 0) { 2622 uint64_t autoreplace; 2623 2624 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2625 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2626 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2627 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2628 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2629 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2630 &spa->spa_dedup_ditto); 2631 2632 spa->spa_autoreplace = (autoreplace != 0); 2633 } 2634 2635 /* 2636 * If the 'autoreplace' property is set, then post a resource notifying 2637 * the ZFS DE that it should not issue any faults for unopenable 2638 * devices. We also iterate over the vdevs, and post a sysevent for any 2639 * unopenable vdevs so that the normal autoreplace handler can take 2640 * over. 2641 */ 2642 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2643 spa_check_removed(spa->spa_root_vdev); 2644 /* 2645 * For the import case, this is done in spa_import(), because 2646 * at this point we're using the spare definitions from 2647 * the MOS config, not necessarily from the userland config. 2648 */ 2649 if (state != SPA_LOAD_IMPORT) { 2650 spa_aux_check_removed(&spa->spa_spares); 2651 spa_aux_check_removed(&spa->spa_l2cache); 2652 } 2653 } 2654 2655 /* 2656 * Load the vdev state for all toplevel vdevs. 2657 */ 2658 vdev_load(rvd); 2659 2660 /* 2661 * Propagate the leaf DTLs we just loaded all the way up the tree. 2662 */ 2663 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2664 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2665 spa_config_exit(spa, SCL_ALL, FTAG); 2666 2667 /* 2668 * Load the DDTs (dedup tables). 2669 */ 2670 error = ddt_load(spa); 2671 if (error != 0) 2672 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2673 2674 spa_update_dspace(spa); 2675 2676 /* 2677 * Validate the config, using the MOS config to fill in any 2678 * information which might be missing. If we fail to validate 2679 * the config then declare the pool unfit for use. If we're 2680 * assembling a pool from a split, the log is not transferred 2681 * over. 2682 */ 2683 if (type != SPA_IMPORT_ASSEMBLE) { 2684 nvlist_t *nvconfig; 2685 2686 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2687 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2688 2689 if (!spa_config_valid(spa, nvconfig)) { 2690 nvlist_free(nvconfig); 2691 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2692 ENXIO)); 2693 } 2694 nvlist_free(nvconfig); 2695 2696 /* 2697 * Now that we've validated the config, check the state of the 2698 * root vdev. 
2698 * root vdev.
If it can't be opened, it indicates one or 2699 * more toplevel vdevs are faulted. 2700 */ 2701 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2702 return (SET_ERROR(ENXIO)); 2703 2704 if (spa_writeable(spa) && spa_check_logs(spa)) { 2705 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2706 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2707 } 2708 } 2709 2710 if (missing_feat_write) { 2711 ASSERT(state == SPA_LOAD_TRYIMPORT); 2712 2713 /* 2714 * At this point, we know that we can open the pool in 2715 * read-only mode but not read-write mode. We now have enough 2716 * information and can return to userland. 2717 */ 2718 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2719 } 2720 2721 /* 2722 * We've successfully opened the pool, verify that we're ready 2723 * to start pushing transactions. 2724 */ 2725 if (state != SPA_LOAD_TRYIMPORT) { 2726 if (error = spa_load_verify(spa)) 2727 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2728 error)); 2729 } 2730 2731 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2732 spa->spa_load_max_txg == UINT64_MAX)) { 2733 dmu_tx_t *tx; 2734 int need_update = B_FALSE; 2735 dsl_pool_t *dp = spa_get_dsl(spa); 2736 2737 ASSERT(state != SPA_LOAD_TRYIMPORT); 2738 2739 /* 2740 * Claim log blocks that haven't been committed yet. 2741 * This must all happen in a single txg. 2742 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2743 * invoked from zil_claim_log_block()'s i/o done callback. 2744 * Price of rollback is that we abandon the log. 2745 */ 2746 spa->spa_claiming = B_TRUE; 2747 2748 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 2749 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2750 zil_claim, tx, DS_FIND_CHILDREN); 2751 dmu_tx_commit(tx); 2752 2753 spa->spa_claiming = B_FALSE; 2754 2755 spa_set_log_state(spa, SPA_LOG_GOOD); 2756 spa->spa_sync_on = B_TRUE; 2757 txg_sync_start(spa->spa_dsl_pool); 2758 2759 /* 2760 * Wait for all claims to sync. We sync up to the highest 2761 * claimed log block birth time so that claimed log blocks 2762 * don't appear to be from the future. spa_claim_max_txg 2763 * will have been set for us by either zil_check_log_chain() 2764 * (invoked from spa_check_logs()) or zil_claim() above. 2765 */ 2766 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2767 2768 /* 2769 * If the config cache is stale, or we have uninitialized 2770 * metaslabs (see spa_vdev_add()), then update the config. 2771 * 2772 * If this is a verbatim import, trust the current 2773 * in-core spa_config and update the disk labels. 2774 */ 2775 if (config_cache_txg != spa->spa_config_txg || 2776 state == SPA_LOAD_IMPORT || 2777 state == SPA_LOAD_RECOVER || 2778 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2779 need_update = B_TRUE; 2780 2781 for (int c = 0; c < rvd->vdev_children; c++) 2782 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2783 need_update = B_TRUE; 2784 2785 /* 2786 * Update the config cache asychronously in case we're the 2787 * root pool, in which case the config cache isn't writable yet. 2788 */ 2789 if (need_update) 2790 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2791 2792 /* 2793 * Check all DTLs to see if anything needs resilvering. 2794 */ 2795 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2796 vdev_resilver_needed(rvd, NULL, NULL)) 2797 spa_async_request(spa, SPA_ASYNC_RESILVER); 2798 2799 /* 2800 * Log the fact that we booted up (so that we can detect if 2801 * we rebooted in the middle of an operation). 
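 * (The record goes into the pool's internal history object; with stock
 * zpool(1M) tooling it should be visible via "zpool history -i", though
 * nothing in this code depends on that.)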
2802 */ 2803 spa_history_log_version(spa, "open"); 2804 2805 /* 2806 * Delete any inconsistent datasets. 2807 */ 2808 (void) dmu_objset_find(spa_name(spa), 2809 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2810 2811 /* 2812 * Clean up any stale temporary dataset userrefs. 2813 */ 2814 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2815 } 2816 2817 return (0); 2818 } 2819 2820 static int 2821 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2822 { 2823 int mode = spa->spa_mode; 2824 2825 spa_unload(spa); 2826 spa_deactivate(spa); 2827 2828 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2829 2830 spa_activate(spa, mode); 2831 spa_async_suspend(spa); 2832 2833 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2834 } 2835 2836 /* 2837 * If spa_load() fails this function will try loading prior txg's. If 2838 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2839 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2840 * function will not rewind the pool and will return the same error as 2841 * spa_load(). 2842 */ 2843 static int 2844 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2845 uint64_t max_request, int rewind_flags) 2846 { 2847 nvlist_t *loadinfo = NULL; 2848 nvlist_t *config = NULL; 2849 int load_error, rewind_error; 2850 uint64_t safe_rewind_txg; 2851 uint64_t min_txg; 2852 2853 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2854 spa->spa_load_max_txg = spa->spa_load_txg; 2855 spa_set_log_state(spa, SPA_LOG_CLEAR); 2856 } else { 2857 spa->spa_load_max_txg = max_request; 2858 if (max_request != UINT64_MAX) 2859 spa->spa_extreme_rewind = B_TRUE; 2860 } 2861 2862 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2863 mosconfig); 2864 if (load_error == 0) 2865 return (0); 2866 2867 if (spa->spa_root_vdev != NULL) 2868 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2869 2870 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2871 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2872 2873 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2874 nvlist_free(config); 2875 return (load_error); 2876 } 2877 2878 if (state == SPA_LOAD_RECOVER) { 2879 /* Price of rolling back is discarding txgs, including log */ 2880 spa_set_log_state(spa, SPA_LOG_CLEAR); 2881 } else { 2882 /* 2883 * If we aren't rolling back save the load info from our first 2884 * import attempt so that we can restore it after attempting 2885 * to rewind. 2886 */ 2887 loadinfo = spa->spa_load_info; 2888 spa->spa_load_info = fnvlist_alloc(); 2889 } 2890 2891 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2892 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2893 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2894 TXG_INITIAL : safe_rewind_txg; 2895 2896 /* 2897 * Continue as long as we're finding errors, we're still within 2898 * the acceptable rewind range, and we're still finding uberblocks 2899 */ 2900 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2901 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2902 if (spa->spa_load_max_txg < safe_rewind_txg) 2903 spa->spa_extreme_rewind = B_TRUE; 2904 rewind_error = spa_load_retry(spa, state, mosconfig); 2905 } 2906 2907 spa->spa_extreme_rewind = B_FALSE; 2908 spa->spa_load_max_txg = UINT64_MAX; 2909 2910 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2911 spa_config_set(spa, config); 2912 2913 if (state == SPA_LOAD_RECOVER) { 2914 ASSERT3P(loadinfo, ==, NULL); 2915 return (rewind_error); 2916 } else { 2917 /* Store the rewind info as part of the initial load info */ 2918 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2919 spa->spa_load_info); 2920 2921 /* Restore the initial load info */ 2922 fnvlist_free(spa->spa_load_info); 2923 spa->spa_load_info = loadinfo; 2924 2925 return (load_error); 2926 } 2927 } 2928 2929 /* 2930 * Pool Open/Import 2931 * 2932 * The import case is identical to an open except that the configuration is sent 2933 * down from userland, instead of grabbed from the configuration cache. For the 2934 * case of an open, the pool configuration will exist in the 2935 * POOL_STATE_UNINITIALIZED state. 2936 * 2937 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2938 * the same time open the pool, without having to keep around the spa_t in some 2939 * ambiguous state. 2940 */ 2941 static int 2942 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2943 nvlist_t **config) 2944 { 2945 spa_t *spa; 2946 spa_load_state_t state = SPA_LOAD_OPEN; 2947 int error; 2948 int locked = B_FALSE; 2949 2950 *spapp = NULL; 2951 2952 /* 2953 * As disgusting as this is, we need to support recursive calls to this 2954 * function because dsl_dir_open() is called during spa_load(), and ends 2955 * up calling spa_open() again. The real fix is to figure out how to 2956 * avoid dsl_dir_open() calling this in the first place. 2957 */ 2958 if (mutex_owner(&spa_namespace_lock) != curthread) { 2959 mutex_enter(&spa_namespace_lock); 2960 locked = B_TRUE; 2961 } 2962 2963 if ((spa = spa_lookup(pool)) == NULL) { 2964 if (locked) 2965 mutex_exit(&spa_namespace_lock); 2966 return (SET_ERROR(ENOENT)); 2967 } 2968 2969 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2970 zpool_rewind_policy_t policy; 2971 2972 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2973 &policy); 2974 if (policy.zrp_request & ZPOOL_DO_REWIND) 2975 state = SPA_LOAD_RECOVER; 2976 2977 spa_activate(spa, spa_mode_global); 2978 2979 if (state != SPA_LOAD_RECOVER) 2980 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2981 2982 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2983 policy.zrp_request); 2984 2985 if (error == EBADF) { 2986 /* 2987 * If vdev_validate() returns failure (indicated by 2988 * EBADF), it indicates that one of the vdevs indicates 2989 * that the pool has been exported or destroyed. If 2990 * this is the case, the config cache is out of sync and 2991 * we should remove the pool from the namespace. 
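 * (Concretely, the recovery below unloads and deactivates the spa, rewrites
 * the config cache with this pool removed (that is the intent of the
 * spa_config_sync() call, assuming its second argument is the "removing"
 * flag), drops the spa_t, and returns ENOENT.)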
2992 */ 2993 spa_unload(spa); 2994 spa_deactivate(spa); 2995 spa_config_sync(spa, B_TRUE, B_TRUE); 2996 spa_remove(spa); 2997 if (locked) 2998 mutex_exit(&spa_namespace_lock); 2999 return (SET_ERROR(ENOENT)); 3000 } 3001 3002 if (error) { 3003 /* 3004 * We can't open the pool, but we still have useful 3005 * information: the state of each vdev after the 3006 * attempted vdev_open(). Return this to the user. 3007 */ 3008 if (config != NULL && spa->spa_config) { 3009 VERIFY(nvlist_dup(spa->spa_config, config, 3010 KM_SLEEP) == 0); 3011 VERIFY(nvlist_add_nvlist(*config, 3012 ZPOOL_CONFIG_LOAD_INFO, 3013 spa->spa_load_info) == 0); 3014 } 3015 spa_unload(spa); 3016 spa_deactivate(spa); 3017 spa->spa_last_open_failed = error; 3018 if (locked) 3019 mutex_exit(&spa_namespace_lock); 3020 *spapp = NULL; 3021 return (error); 3022 } 3023 } 3024 3025 spa_open_ref(spa, tag); 3026 3027 if (config != NULL) 3028 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3029 3030 /* 3031 * If we've recovered the pool, pass back any information we 3032 * gathered while doing the load. 3033 */ 3034 if (state == SPA_LOAD_RECOVER) { 3035 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3036 spa->spa_load_info) == 0); 3037 } 3038 3039 if (locked) { 3040 spa->spa_last_open_failed = 0; 3041 spa->spa_last_ubsync_txg = 0; 3042 spa->spa_load_txg = 0; 3043 mutex_exit(&spa_namespace_lock); 3044 } 3045 3046 *spapp = spa; 3047 3048 return (0); 3049 } 3050 3051 int 3052 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3053 nvlist_t **config) 3054 { 3055 return (spa_open_common(name, spapp, tag, policy, config)); 3056 } 3057 3058 int 3059 spa_open(const char *name, spa_t **spapp, void *tag) 3060 { 3061 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3062 } 3063 3064 /* 3065 * Lookup the given spa_t, incrementing the inject count in the process, 3066 * preventing it from being exported or destroyed. 3067 */ 3068 spa_t * 3069 spa_inject_addref(char *name) 3070 { 3071 spa_t *spa; 3072 3073 mutex_enter(&spa_namespace_lock); 3074 if ((spa = spa_lookup(name)) == NULL) { 3075 mutex_exit(&spa_namespace_lock); 3076 return (NULL); 3077 } 3078 spa->spa_inject_ref++; 3079 mutex_exit(&spa_namespace_lock); 3080 3081 return (spa); 3082 } 3083 3084 void 3085 spa_inject_delref(spa_t *spa) 3086 { 3087 mutex_enter(&spa_namespace_lock); 3088 spa->spa_inject_ref--; 3089 mutex_exit(&spa_namespace_lock); 3090 } 3091 3092 /* 3093 * Add spares device information to the nvlist. 3094 */ 3095 static void 3096 spa_add_spares(spa_t *spa, nvlist_t *config) 3097 { 3098 nvlist_t **spares; 3099 uint_t i, nspares; 3100 nvlist_t *nvroot; 3101 uint64_t guid; 3102 vdev_stat_t *vs; 3103 uint_t vsc; 3104 uint64_t pool; 3105 3106 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3107 3108 if (spa->spa_spares.sav_count == 0) 3109 return; 3110 3111 VERIFY(nvlist_lookup_nvlist(config, 3112 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3113 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3114 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3115 if (nspares != 0) { 3116 VERIFY(nvlist_add_nvlist_array(nvroot, 3117 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3118 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3119 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3120 3121 /* 3122 * Go through and find any spares which have since been 3123 * repurposed as an active spare. If this is the case, update 3124 * their status appropriately. 
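 * (They are reported with VDEV_STATE_CANT_OPEN and VDEV_AUX_SPARED just
 * below, which userland status tools can use to show the spare as in use
 * by another pool.)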
3125 */ 3126 for (i = 0; i < nspares; i++) { 3127 VERIFY(nvlist_lookup_uint64(spares[i], 3128 ZPOOL_CONFIG_GUID, &guid) == 0); 3129 if (spa_spare_exists(guid, &pool, NULL) && 3130 pool != 0ULL) { 3131 VERIFY(nvlist_lookup_uint64_array( 3132 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3133 (uint64_t **)&vs, &vsc) == 0); 3134 vs->vs_state = VDEV_STATE_CANT_OPEN; 3135 vs->vs_aux = VDEV_AUX_SPARED; 3136 } 3137 } 3138 } 3139 } 3140 3141 /* 3142 * Add l2cache device information to the nvlist, including vdev stats. 3143 */ 3144 static void 3145 spa_add_l2cache(spa_t *spa, nvlist_t *config) 3146 { 3147 nvlist_t **l2cache; 3148 uint_t i, j, nl2cache; 3149 nvlist_t *nvroot; 3150 uint64_t guid; 3151 vdev_t *vd; 3152 vdev_stat_t *vs; 3153 uint_t vsc; 3154 3155 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3156 3157 if (spa->spa_l2cache.sav_count == 0) 3158 return; 3159 3160 VERIFY(nvlist_lookup_nvlist(config, 3161 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3162 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3163 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3164 if (nl2cache != 0) { 3165 VERIFY(nvlist_add_nvlist_array(nvroot, 3166 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3167 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3168 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3169 3170 /* 3171 * Update level 2 cache device stats. 3172 */ 3173 3174 for (i = 0; i < nl2cache; i++) { 3175 VERIFY(nvlist_lookup_uint64(l2cache[i], 3176 ZPOOL_CONFIG_GUID, &guid) == 0); 3177 3178 vd = NULL; 3179 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3180 if (guid == 3181 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3182 vd = spa->spa_l2cache.sav_vdevs[j]; 3183 break; 3184 } 3185 } 3186 ASSERT(vd != NULL); 3187 3188 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3189 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3190 == 0); 3191 vdev_get_stats(vd, vs); 3192 } 3193 } 3194 } 3195 3196 static void 3197 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3198 { 3199 nvlist_t *features; 3200 zap_cursor_t zc; 3201 zap_attribute_t za; 3202 3203 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3204 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3205 3206 if (spa->spa_feat_for_read_obj != 0) { 3207 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3208 spa->spa_feat_for_read_obj); 3209 zap_cursor_retrieve(&zc, &za) == 0; 3210 zap_cursor_advance(&zc)) { 3211 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3212 za.za_num_integers == 1); 3213 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3214 za.za_first_integer)); 3215 } 3216 zap_cursor_fini(&zc); 3217 } 3218 3219 if (spa->spa_feat_for_write_obj != 0) { 3220 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3221 spa->spa_feat_for_write_obj); 3222 zap_cursor_retrieve(&zc, &za) == 0; 3223 zap_cursor_advance(&zc)) { 3224 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3225 za.za_num_integers == 1); 3226 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3227 za.za_first_integer)); 3228 } 3229 zap_cursor_fini(&zc); 3230 } 3231 3232 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3233 features) == 0); 3234 nvlist_free(features); 3235 } 3236 3237 int 3238 spa_get_stats(const char *name, nvlist_t **config, 3239 char *altroot, size_t buflen) 3240 { 3241 int error; 3242 spa_t *spa; 3243 3244 *config = NULL; 3245 error = spa_open_common(name, &spa, FTAG, NULL, config); 3246 3247 if (spa != NULL) { 3248 /* 3249 * This still leaves a window of inconsistency where the spares 3250 * or l2cache devices could change and 
the config would be 3251 * self-inconsistent. 3252 */ 3253 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3254 3255 if (*config != NULL) { 3256 uint64_t loadtimes[2]; 3257 3258 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3259 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3260 VERIFY(nvlist_add_uint64_array(*config, 3261 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3262 3263 VERIFY(nvlist_add_uint64(*config, 3264 ZPOOL_CONFIG_ERRCOUNT, 3265 spa_get_errlog_size(spa)) == 0); 3266 3267 if (spa_suspended(spa)) 3268 VERIFY(nvlist_add_uint64(*config, 3269 ZPOOL_CONFIG_SUSPENDED, 3270 spa->spa_failmode) == 0); 3271 3272 spa_add_spares(spa, *config); 3273 spa_add_l2cache(spa, *config); 3274 spa_add_feature_stats(spa, *config); 3275 } 3276 } 3277 3278 /* 3279 * We want to get the alternate root even for faulted pools, so we cheat 3280 * and call spa_lookup() directly. 3281 */ 3282 if (altroot) { 3283 if (spa == NULL) { 3284 mutex_enter(&spa_namespace_lock); 3285 spa = spa_lookup(name); 3286 if (spa) 3287 spa_altroot(spa, altroot, buflen); 3288 else 3289 altroot[0] = '\0'; 3290 spa = NULL; 3291 mutex_exit(&spa_namespace_lock); 3292 } else { 3293 spa_altroot(spa, altroot, buflen); 3294 } 3295 } 3296 3297 if (spa != NULL) { 3298 spa_config_exit(spa, SCL_CONFIG, FTAG); 3299 spa_close(spa, FTAG); 3300 } 3301 3302 return (error); 3303 } 3304 3305 /* 3306 * Validate that the auxiliary device array is well formed. We must have an 3307 * array of nvlists, each which describes a valid leaf vdev. If this is an 3308 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3309 * specified, as long as they are well-formed. 3310 */ 3311 static int 3312 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3313 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3314 vdev_labeltype_t label) 3315 { 3316 nvlist_t **dev; 3317 uint_t i, ndev; 3318 vdev_t *vd; 3319 int error; 3320 3321 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3322 3323 /* 3324 * It's acceptable to have no devs specified. 3325 */ 3326 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3327 return (0); 3328 3329 if (ndev == 0) 3330 return (SET_ERROR(EINVAL)); 3331 3332 /* 3333 * Make sure the pool is formatted with a version that supports this 3334 * device type. 3335 */ 3336 if (spa_version(spa) < version) 3337 return (SET_ERROR(ENOTSUP)); 3338 3339 /* 3340 * Set the pending device list so we correctly handle device in-use 3341 * checking. 3342 */ 3343 sav->sav_pending = dev; 3344 sav->sav_npending = ndev; 3345 3346 for (i = 0; i < ndev; i++) { 3347 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3348 mode)) != 0) 3349 goto out; 3350 3351 if (!vd->vdev_ops->vdev_op_leaf) { 3352 vdev_free(vd); 3353 error = SET_ERROR(EINVAL); 3354 goto out; 3355 } 3356 3357 /* 3358 * The L2ARC currently only supports disk devices in 3359 * kernel context. For user-level testing, we allow it. 
3360 */ 3361 #ifdef _KERNEL 3362 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3363 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3364 error = SET_ERROR(ENOTBLK); 3365 vdev_free(vd); 3366 goto out; 3367 } 3368 #endif 3369 vd->vdev_top = vd; 3370 3371 if ((error = vdev_open(vd)) == 0 && 3372 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3373 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3374 vd->vdev_guid) == 0); 3375 } 3376 3377 vdev_free(vd); 3378 3379 if (error && 3380 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3381 goto out; 3382 else 3383 error = 0; 3384 } 3385 3386 out: 3387 sav->sav_pending = NULL; 3388 sav->sav_npending = 0; 3389 return (error); 3390 } 3391 3392 static int 3393 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3394 { 3395 int error; 3396 3397 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3398 3399 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3400 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3401 VDEV_LABEL_SPARE)) != 0) { 3402 return (error); 3403 } 3404 3405 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3406 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3407 VDEV_LABEL_L2CACHE)); 3408 } 3409 3410 static void 3411 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3412 const char *config) 3413 { 3414 int i; 3415 3416 if (sav->sav_config != NULL) { 3417 nvlist_t **olddevs; 3418 uint_t oldndevs; 3419 nvlist_t **newdevs; 3420 3421 /* 3422 * Generate new dev list by concatentating with the 3423 * current dev list. 3424 */ 3425 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3426 &olddevs, &oldndevs) == 0); 3427 3428 newdevs = kmem_alloc(sizeof (void *) * 3429 (ndevs + oldndevs), KM_SLEEP); 3430 for (i = 0; i < oldndevs; i++) 3431 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3432 KM_SLEEP) == 0); 3433 for (i = 0; i < ndevs; i++) 3434 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3435 KM_SLEEP) == 0); 3436 3437 VERIFY(nvlist_remove(sav->sav_config, config, 3438 DATA_TYPE_NVLIST_ARRAY) == 0); 3439 3440 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3441 config, newdevs, ndevs + oldndevs) == 0); 3442 for (i = 0; i < oldndevs + ndevs; i++) 3443 nvlist_free(newdevs[i]); 3444 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3445 } else { 3446 /* 3447 * Generate a new dev list. 3448 */ 3449 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3450 KM_SLEEP) == 0); 3451 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3452 devs, ndevs) == 0); 3453 } 3454 } 3455 3456 /* 3457 * Stop and drop level 2 ARC devices 3458 */ 3459 void 3460 spa_l2cache_drop(spa_t *spa) 3461 { 3462 vdev_t *vd; 3463 int i; 3464 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3465 3466 for (i = 0; i < sav->sav_count; i++) { 3467 uint64_t pool; 3468 3469 vd = sav->sav_vdevs[i]; 3470 ASSERT(vd != NULL); 3471 3472 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3473 pool != 0ULL && l2arc_vdev_present(vd)) 3474 l2arc_remove_vdev(vd); 3475 } 3476 } 3477 3478 /* 3479 * Pool Creation 3480 */ 3481 int 3482 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3483 nvlist_t *zplprops) 3484 { 3485 spa_t *spa; 3486 char *altroot = NULL; 3487 vdev_t *rvd; 3488 dsl_pool_t *dp; 3489 dmu_tx_t *tx; 3490 int error = 0; 3491 uint64_t txg = TXG_INITIAL; 3492 nvlist_t **spares, **l2cache; 3493 uint_t nspares, nl2cache; 3494 uint64_t version, obj; 3495 boolean_t has_features; 3496 3497 /* 3498 * If this pool already exists, return failure. 
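 *
 * (A note on the inputs, as a rough sketch only: the caller-supplied nvroot
 * is a vdev-tree nvlist of the same general shape that
 * spa_generate_rootconf() builds later in this file, e.g.
 *
 *	ZPOOL_CONFIG_TYPE = "root"
 *	ZPOOL_CONFIG_CHILDREN = [ top-level vdev nvlists (e.g. a "mirror"
 *	    with its own ZPOOL_CONFIG_CHILDREN of leaf disks), ... ]
 *
 * optionally alongside ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE arrays
 * that are picked up further down in this function.)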
3499 */ 3500 mutex_enter(&spa_namespace_lock); 3501 if (spa_lookup(pool) != NULL) { 3502 mutex_exit(&spa_namespace_lock); 3503 return (SET_ERROR(EEXIST)); 3504 } 3505 3506 /* 3507 * Allocate a new spa_t structure. 3508 */ 3509 (void) nvlist_lookup_string(props, 3510 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3511 spa = spa_add(pool, NULL, altroot); 3512 spa_activate(spa, spa_mode_global); 3513 3514 if (props && (error = spa_prop_validate(spa, props))) { 3515 spa_deactivate(spa); 3516 spa_remove(spa); 3517 mutex_exit(&spa_namespace_lock); 3518 return (error); 3519 } 3520 3521 has_features = B_FALSE; 3522 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3523 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3524 if (zpool_prop_feature(nvpair_name(elem))) 3525 has_features = B_TRUE; 3526 } 3527 3528 if (has_features || nvlist_lookup_uint64(props, 3529 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3530 version = SPA_VERSION; 3531 } 3532 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3533 3534 spa->spa_first_txg = txg; 3535 spa->spa_uberblock.ub_txg = txg - 1; 3536 spa->spa_uberblock.ub_version = version; 3537 spa->spa_ubsync = spa->spa_uberblock; 3538 3539 /* 3540 * Create "The Godfather" zio to hold all async IOs 3541 */ 3542 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3543 KM_SLEEP); 3544 for (int i = 0; i < max_ncpus; i++) { 3545 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3546 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3547 ZIO_FLAG_GODFATHER); 3548 } 3549 3550 /* 3551 * Create the root vdev. 3552 */ 3553 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3554 3555 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3556 3557 ASSERT(error != 0 || rvd != NULL); 3558 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3559 3560 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3561 error = SET_ERROR(EINVAL); 3562 3563 if (error == 0 && 3564 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3565 (error = spa_validate_aux(spa, nvroot, txg, 3566 VDEV_ALLOC_ADD)) == 0) { 3567 for (int c = 0; c < rvd->vdev_children; c++) { 3568 vdev_metaslab_set_size(rvd->vdev_child[c]); 3569 vdev_expand(rvd->vdev_child[c], txg); 3570 } 3571 } 3572 3573 spa_config_exit(spa, SCL_ALL, FTAG); 3574 3575 if (error != 0) { 3576 spa_unload(spa); 3577 spa_deactivate(spa); 3578 spa_remove(spa); 3579 mutex_exit(&spa_namespace_lock); 3580 return (error); 3581 } 3582 3583 /* 3584 * Get the list of spares, if specified. 3585 */ 3586 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3587 &spares, &nspares) == 0) { 3588 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3589 KM_SLEEP) == 0); 3590 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3591 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3592 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3593 spa_load_spares(spa); 3594 spa_config_exit(spa, SCL_ALL, FTAG); 3595 spa->spa_spares.sav_sync = B_TRUE; 3596 } 3597 3598 /* 3599 * Get the list of level 2 cache devices, if specified. 
3600 */ 3601 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3602 &l2cache, &nl2cache) == 0) { 3603 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3604 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3605 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3606 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3607 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3608 spa_load_l2cache(spa); 3609 spa_config_exit(spa, SCL_ALL, FTAG); 3610 spa->spa_l2cache.sav_sync = B_TRUE; 3611 } 3612 3613 spa->spa_is_initializing = B_TRUE; 3614 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3615 spa->spa_meta_objset = dp->dp_meta_objset; 3616 spa->spa_is_initializing = B_FALSE; 3617 3618 /* 3619 * Create DDTs (dedup tables). 3620 */ 3621 ddt_create(spa); 3622 3623 spa_update_dspace(spa); 3624 3625 tx = dmu_tx_create_assigned(dp, txg); 3626 3627 /* 3628 * Create the pool config object. 3629 */ 3630 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3631 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3632 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3633 3634 if (zap_add(spa->spa_meta_objset, 3635 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3636 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3637 cmn_err(CE_PANIC, "failed to add pool config"); 3638 } 3639 3640 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3641 spa_feature_create_zap_objects(spa, tx); 3642 3643 if (zap_add(spa->spa_meta_objset, 3644 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3645 sizeof (uint64_t), 1, &version, tx) != 0) { 3646 cmn_err(CE_PANIC, "failed to add pool version"); 3647 } 3648 3649 /* Newly created pools with the right version are always deflated. */ 3650 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3651 spa->spa_deflate = TRUE; 3652 if (zap_add(spa->spa_meta_objset, 3653 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3654 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3655 cmn_err(CE_PANIC, "failed to add deflate"); 3656 } 3657 } 3658 3659 /* 3660 * Create the deferred-free bpobj. Turn off compression 3661 * because sync-to-convergence takes longer if the blocksize 3662 * keeps changing. 3663 */ 3664 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3665 dmu_object_set_compress(spa->spa_meta_objset, obj, 3666 ZIO_COMPRESS_OFF, tx); 3667 if (zap_add(spa->spa_meta_objset, 3668 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3669 sizeof (uint64_t), 1, &obj, tx) != 0) { 3670 cmn_err(CE_PANIC, "failed to add bpobj"); 3671 } 3672 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3673 spa->spa_meta_objset, obj)); 3674 3675 /* 3676 * Create the pool's history object. 3677 */ 3678 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3679 spa_history_create_obj(spa, tx); 3680 3681 /* 3682 * Generate some random noise for salted checksums to operate on. 3683 */ 3684 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3685 sizeof (spa->spa_cksum_salt.zcs_bytes)); 3686 3687 /* 3688 * Set pool properties. 
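 * (The defaults below come from zpool_prop_default_numeric(); any explicit
 * entries in 'props' are then layered on top via spa_configfile_set() and
 * spa_sync_props() within this same creation transaction.)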
3689 */ 3690 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3691 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3692 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3693 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3694 3695 if (props != NULL) { 3696 spa_configfile_set(spa, props, B_FALSE); 3697 spa_sync_props(props, tx); 3698 } 3699 3700 dmu_tx_commit(tx); 3701 3702 spa->spa_sync_on = B_TRUE; 3703 txg_sync_start(spa->spa_dsl_pool); 3704 3705 /* 3706 * We explicitly wait for the first transaction to complete so that our 3707 * bean counters are appropriately updated. 3708 */ 3709 txg_wait_synced(spa->spa_dsl_pool, txg); 3710 3711 spa_config_sync(spa, B_FALSE, B_TRUE); 3712 spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); 3713 3714 spa_history_log_version(spa, "create"); 3715 3716 /* 3717 * Don't count references from objsets that are already closed 3718 * and are making their way through the eviction process. 3719 */ 3720 spa_evicting_os_wait(spa); 3721 spa->spa_minref = refcount_count(&spa->spa_refcount); 3722 3723 mutex_exit(&spa_namespace_lock); 3724 3725 return (0); 3726 } 3727 3728 #ifdef _KERNEL 3729 /* 3730 * Get the root pool information from the root disk, then import the root pool 3731 * during the system boot up time. 3732 */ 3733 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3734 3735 static nvlist_t * 3736 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3737 { 3738 nvlist_t *config; 3739 nvlist_t *nvtop, *nvroot; 3740 uint64_t pgid; 3741 3742 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3743 return (NULL); 3744 3745 /* 3746 * Add this top-level vdev to the child array. 3747 */ 3748 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3749 &nvtop) == 0); 3750 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3751 &pgid) == 0); 3752 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3753 3754 /* 3755 * Put this pool's top-level vdevs into a root vdev. 3756 */ 3757 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3758 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3759 VDEV_TYPE_ROOT) == 0); 3760 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3761 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3762 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3763 &nvtop, 1) == 0); 3764 3765 /* 3766 * Replace the existing vdev_tree with the new root vdev in 3767 * this pool's configuration (remove the old, add the new). 3768 */ 3769 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3770 nvlist_free(nvroot); 3771 return (config); 3772 } 3773 3774 /* 3775 * Walk the vdev tree and see if we can find a device with "better" 3776 * configuration. A configuration is "better" if the label on that 3777 * device has a more recent txg. 3778 */ 3779 static void 3780 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3781 { 3782 for (int c = 0; c < vd->vdev_children; c++) 3783 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3784 3785 if (vd->vdev_ops->vdev_op_leaf) { 3786 nvlist_t *label; 3787 uint64_t label_txg; 3788 3789 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3790 &label) != 0) 3791 return; 3792 3793 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3794 &label_txg) == 0); 3795 3796 /* 3797 * Do we have a better boot device? 
3798 */ 3799 if (label_txg > *txg) { 3800 *txg = label_txg; 3801 *avd = vd; 3802 } 3803 nvlist_free(label); 3804 } 3805 } 3806 3807 /* 3808 * Import a root pool. 3809 * 3810 * For x86. devpath_list will consist of devid and/or physpath name of 3811 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3812 * The GRUB "findroot" command will return the vdev we should boot. 3813 * 3814 * For Sparc, devpath_list consists the physpath name of the booting device 3815 * no matter the rootpool is a single device pool or a mirrored pool. 3816 * e.g. 3817 * "/pci@1f,0/ide@d/disk@0,0:a" 3818 */ 3819 int 3820 spa_import_rootpool(char *devpath, char *devid) 3821 { 3822 spa_t *spa; 3823 vdev_t *rvd, *bvd, *avd = NULL; 3824 nvlist_t *config, *nvtop; 3825 uint64_t guid, txg; 3826 char *pname; 3827 int error; 3828 3829 /* 3830 * Read the label from the boot device and generate a configuration. 3831 */ 3832 config = spa_generate_rootconf(devpath, devid, &guid); 3833 #if defined(_OBP) && defined(_KERNEL) 3834 if (config == NULL) { 3835 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3836 /* iscsi boot */ 3837 get_iscsi_bootpath_phy(devpath); 3838 config = spa_generate_rootconf(devpath, devid, &guid); 3839 } 3840 } 3841 #endif 3842 if (config == NULL) { 3843 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3844 devpath); 3845 return (SET_ERROR(EIO)); 3846 } 3847 3848 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3849 &pname) == 0); 3850 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3851 3852 mutex_enter(&spa_namespace_lock); 3853 if ((spa = spa_lookup(pname)) != NULL) { 3854 /* 3855 * Remove the existing root pool from the namespace so that we 3856 * can replace it with the correct config we just read in. 3857 */ 3858 spa_remove(spa); 3859 } 3860 3861 spa = spa_add(pname, config, NULL); 3862 spa->spa_is_root = B_TRUE; 3863 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3864 3865 /* 3866 * Build up a vdev tree based on the boot device's label config. 3867 */ 3868 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3869 &nvtop) == 0); 3870 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3871 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3872 VDEV_ALLOC_ROOTPOOL); 3873 spa_config_exit(spa, SCL_ALL, FTAG); 3874 if (error) { 3875 mutex_exit(&spa_namespace_lock); 3876 nvlist_free(config); 3877 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3878 pname); 3879 return (error); 3880 } 3881 3882 /* 3883 * Get the boot vdev. 3884 */ 3885 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3886 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3887 (u_longlong_t)guid); 3888 error = SET_ERROR(ENOENT); 3889 goto out; 3890 } 3891 3892 /* 3893 * Determine if there is a better boot device. 3894 */ 3895 avd = bvd; 3896 spa_alt_rootvdev(rvd, &avd, &txg); 3897 if (avd != bvd) { 3898 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3899 "try booting from '%s'", avd->vdev_path); 3900 error = SET_ERROR(EINVAL); 3901 goto out; 3902 } 3903 3904 /* 3905 * If the boot device is part of a spare vdev then ensure that 3906 * we're booting off the active spare. 3907 */ 3908 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3909 !bvd->vdev_isspare) { 3910 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3911 "try booting from '%s'", 3912 bvd->vdev_parent-> 3913 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3914 error = SET_ERROR(EINVAL); 3915 goto out; 3916 } 3917 3918 error = 0; 3919 out: 3920 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3921 vdev_free(rvd); 3922 spa_config_exit(spa, SCL_ALL, FTAG); 3923 mutex_exit(&spa_namespace_lock); 3924 3925 nvlist_free(config); 3926 return (error); 3927 } 3928 3929 #endif 3930 3931 /* 3932 * Import a non-root pool into the system. 3933 */ 3934 int 3935 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3936 { 3937 spa_t *spa; 3938 char *altroot = NULL; 3939 spa_load_state_t state = SPA_LOAD_IMPORT; 3940 zpool_rewind_policy_t policy; 3941 uint64_t mode = spa_mode_global; 3942 uint64_t readonly = B_FALSE; 3943 int error; 3944 nvlist_t *nvroot; 3945 nvlist_t **spares, **l2cache; 3946 uint_t nspares, nl2cache; 3947 3948 /* 3949 * If a pool with this name exists, return failure. 3950 */ 3951 mutex_enter(&spa_namespace_lock); 3952 if (spa_lookup(pool) != NULL) { 3953 mutex_exit(&spa_namespace_lock); 3954 return (SET_ERROR(EEXIST)); 3955 } 3956 3957 /* 3958 * Create and initialize the spa structure. 3959 */ 3960 (void) nvlist_lookup_string(props, 3961 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3962 (void) nvlist_lookup_uint64(props, 3963 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3964 if (readonly) 3965 mode = FREAD; 3966 spa = spa_add(pool, config, altroot); 3967 spa->spa_import_flags = flags; 3968 3969 /* 3970 * Verbatim import - Take a pool and insert it into the namespace 3971 * as if it had been loaded at boot. 3972 */ 3973 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3974 if (props != NULL) 3975 spa_configfile_set(spa, props, B_FALSE); 3976 3977 spa_config_sync(spa, B_FALSE, B_TRUE); 3978 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 3979 3980 mutex_exit(&spa_namespace_lock); 3981 return (0); 3982 } 3983 3984 spa_activate(spa, mode); 3985 3986 /* 3987 * Don't start async tasks until we know everything is healthy. 3988 */ 3989 spa_async_suspend(spa); 3990 3991 zpool_get_rewind_policy(config, &policy); 3992 if (policy.zrp_request & ZPOOL_DO_REWIND) 3993 state = SPA_LOAD_RECOVER; 3994 3995 /* 3996 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3997 * because the user-supplied config is actually the one to trust when 3998 * doing an import. 3999 */ 4000 if (state != SPA_LOAD_RECOVER) 4001 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4002 4003 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4004 policy.zrp_request); 4005 4006 /* 4007 * Propagate anything learned while loading the pool and pass it 4008 * back to caller (i.e. rewind info, missing devices, etc). 4009 */ 4010 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4011 spa->spa_load_info) == 0); 4012 4013 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4014 /* 4015 * Toss any existing sparelist, as it doesn't have any validity 4016 * anymore, and conflicts with spa_has_spare(). 
4017 */ 4018 if (spa->spa_spares.sav_config) { 4019 nvlist_free(spa->spa_spares.sav_config); 4020 spa->spa_spares.sav_config = NULL; 4021 spa_load_spares(spa); 4022 } 4023 if (spa->spa_l2cache.sav_config) { 4024 nvlist_free(spa->spa_l2cache.sav_config); 4025 spa->spa_l2cache.sav_config = NULL; 4026 spa_load_l2cache(spa); 4027 } 4028 4029 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4030 &nvroot) == 0); 4031 if (error == 0) 4032 error = spa_validate_aux(spa, nvroot, -1ULL, 4033 VDEV_ALLOC_SPARE); 4034 if (error == 0) 4035 error = spa_validate_aux(spa, nvroot, -1ULL, 4036 VDEV_ALLOC_L2CACHE); 4037 spa_config_exit(spa, SCL_ALL, FTAG); 4038 4039 if (props != NULL) 4040 spa_configfile_set(spa, props, B_FALSE); 4041 4042 if (error != 0 || (props && spa_writeable(spa) && 4043 (error = spa_prop_set(spa, props)))) { 4044 spa_unload(spa); 4045 spa_deactivate(spa); 4046 spa_remove(spa); 4047 mutex_exit(&spa_namespace_lock); 4048 return (error); 4049 } 4050 4051 spa_async_resume(spa); 4052 4053 /* 4054 * Override any spares and level 2 cache devices as specified by 4055 * the user, as these may have correct device names/devids, etc. 4056 */ 4057 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4058 &spares, &nspares) == 0) { 4059 if (spa->spa_spares.sav_config) 4060 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4061 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4062 else 4063 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4064 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4065 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4066 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4067 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4068 spa_load_spares(spa); 4069 spa_config_exit(spa, SCL_ALL, FTAG); 4070 spa->spa_spares.sav_sync = B_TRUE; 4071 } 4072 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4073 &l2cache, &nl2cache) == 0) { 4074 if (spa->spa_l2cache.sav_config) 4075 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4076 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4077 else 4078 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4079 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4080 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4081 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4082 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4083 spa_load_l2cache(spa); 4084 spa_config_exit(spa, SCL_ALL, FTAG); 4085 spa->spa_l2cache.sav_sync = B_TRUE; 4086 } 4087 4088 /* 4089 * Check for any removed devices. 4090 */ 4091 if (spa->spa_autoreplace) { 4092 spa_aux_check_removed(&spa->spa_spares); 4093 spa_aux_check_removed(&spa->spa_l2cache); 4094 } 4095 4096 if (spa_writeable(spa)) { 4097 /* 4098 * Update the config cache to include the newly-imported pool. 4099 */ 4100 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4101 } 4102 4103 /* 4104 * It's possible that the pool was expanded while it was exported. 4105 * We kick off an async task to handle this for us. 
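 *
 * (Descriptive note, not part of the original comment: a LUN that was
 * grown while the pool was exported is a typical case.  The
 * SPA_ASYNC_AUTOEXPAND request is serviced by spa_async_autoexpand()
 * below, which posts an ESC_DEV_DLE sysevent for each leaf device's
 * physical path so the expansion can be noticed.)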
4106 */ 4107 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4108 4109 spa_history_log_version(spa, "import"); 4110 4111 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4112 4113 mutex_exit(&spa_namespace_lock); 4114 4115 return (0); 4116 } 4117 4118 nvlist_t * 4119 spa_tryimport(nvlist_t *tryconfig) 4120 { 4121 nvlist_t *config = NULL; 4122 char *poolname; 4123 spa_t *spa; 4124 uint64_t state; 4125 int error; 4126 4127 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4128 return (NULL); 4129 4130 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4131 return (NULL); 4132 4133 /* 4134 * Create and initialize the spa structure. 4135 */ 4136 mutex_enter(&spa_namespace_lock); 4137 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4138 spa_activate(spa, FREAD); 4139 4140 /* 4141 * Pass off the heavy lifting to spa_load(). 4142 * Pass TRUE for mosconfig because the user-supplied config 4143 * is actually the one to trust when doing an import. 4144 */ 4145 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4146 4147 /* 4148 * If 'tryconfig' was at least parsable, return the current config. 4149 */ 4150 if (spa->spa_root_vdev != NULL) { 4151 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4152 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4153 poolname) == 0); 4154 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4155 state) == 0); 4156 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4157 spa->spa_uberblock.ub_timestamp) == 0); 4158 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4159 spa->spa_load_info) == 0); 4160 4161 /* 4162 * If the bootfs property exists on this pool then we 4163 * copy it out so that external consumers can tell which 4164 * pools are bootable. 4165 */ 4166 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4167 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4168 4169 /* 4170 * We have to play games with the name since the 4171 * pool was opened as TRYIMPORT_NAME. 4172 */ 4173 if (dsl_dsobj_to_dsname(spa_name(spa), 4174 spa->spa_bootfs, tmpname) == 0) { 4175 char *cp; 4176 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4177 4178 cp = strchr(tmpname, '/'); 4179 if (cp == NULL) { 4180 (void) strlcpy(dsname, tmpname, 4181 MAXPATHLEN); 4182 } else { 4183 (void) snprintf(dsname, MAXPATHLEN, 4184 "%s/%s", poolname, ++cp); 4185 } 4186 VERIFY(nvlist_add_string(config, 4187 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4188 kmem_free(dsname, MAXPATHLEN); 4189 } 4190 kmem_free(tmpname, MAXPATHLEN); 4191 } 4192 4193 /* 4194 * Add the list of hot spares and level 2 cache devices. 4195 */ 4196 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4197 spa_add_spares(spa, config); 4198 spa_add_l2cache(spa, config); 4199 spa_config_exit(spa, SCL_CONFIG, FTAG); 4200 } 4201 4202 spa_unload(spa); 4203 spa_deactivate(spa); 4204 spa_remove(spa); 4205 mutex_exit(&spa_namespace_lock); 4206 4207 return (config); 4208 } 4209 4210 /* 4211 * Pool export/destroy 4212 * 4213 * The act of destroying or exporting a pool is very simple. We make sure there 4214 * is no more pending I/O and any references to the pool are gone. Then, we 4215 * update the pool state and sync all the labels to disk, removing the 4216 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4217 * we don't sync the labels or remove the configuration cache. 
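 *
 * (Illustrative usage, not from the original source and assuming the
 * usual CLI mapping: "zpool export tank" reaches spa_export_common() with
 * force == B_FALSE, while "zpool export -f tank" passes force == B_TRUE,
 * which allows the export to proceed even if one of the pool's spares is
 * actively shared with another pool; see the EXDEV check below.)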
4218 */ 4219 static int 4220 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4221 boolean_t force, boolean_t hardforce) 4222 { 4223 spa_t *spa; 4224 4225 if (oldconfig) 4226 *oldconfig = NULL; 4227 4228 if (!(spa_mode_global & FWRITE)) 4229 return (SET_ERROR(EROFS)); 4230 4231 mutex_enter(&spa_namespace_lock); 4232 if ((spa = spa_lookup(pool)) == NULL) { 4233 mutex_exit(&spa_namespace_lock); 4234 return (SET_ERROR(ENOENT)); 4235 } 4236 4237 /* 4238 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4239 * reacquire the namespace lock, and see if we can export. 4240 */ 4241 spa_open_ref(spa, FTAG); 4242 mutex_exit(&spa_namespace_lock); 4243 spa_async_suspend(spa); 4244 mutex_enter(&spa_namespace_lock); 4245 spa_close(spa, FTAG); 4246 4247 /* 4248 * The pool will be in core if it's openable, 4249 * in which case we can modify its state. 4250 */ 4251 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4252 /* 4253 * Objsets may be open only because they're dirty, so we 4254 * have to force it to sync before checking spa_refcnt. 4255 */ 4256 txg_wait_synced(spa->spa_dsl_pool, 0); 4257 spa_evicting_os_wait(spa); 4258 4259 /* 4260 * A pool cannot be exported or destroyed if there are active 4261 * references. If we are resetting a pool, allow references by 4262 * fault injection handlers. 4263 */ 4264 if (!spa_refcount_zero(spa) || 4265 (spa->spa_inject_ref != 0 && 4266 new_state != POOL_STATE_UNINITIALIZED)) { 4267 spa_async_resume(spa); 4268 mutex_exit(&spa_namespace_lock); 4269 return (SET_ERROR(EBUSY)); 4270 } 4271 4272 /* 4273 * A pool cannot be exported if it has an active shared spare. 4274 * This is to prevent other pools stealing the active spare 4275 * from an exported pool. At user's own will, such pool can 4276 * be forcedly exported. 4277 */ 4278 if (!force && new_state == POOL_STATE_EXPORTED && 4279 spa_has_active_shared_spare(spa)) { 4280 spa_async_resume(spa); 4281 mutex_exit(&spa_namespace_lock); 4282 return (SET_ERROR(EXDEV)); 4283 } 4284 4285 /* 4286 * We want this to be reflected on every label, 4287 * so mark them all dirty. spa_unload() will do the 4288 * final sync that pushes these changes out. 4289 */ 4290 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4291 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4292 spa->spa_state = new_state; 4293 spa->spa_final_txg = spa_last_synced_txg(spa) + 4294 TXG_DEFER_SIZE + 1; 4295 vdev_config_dirty(spa->spa_root_vdev); 4296 spa_config_exit(spa, SCL_ALL, FTAG); 4297 } 4298 } 4299 4300 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4301 4302 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4303 spa_unload(spa); 4304 spa_deactivate(spa); 4305 } 4306 4307 if (oldconfig && spa->spa_config) 4308 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4309 4310 if (new_state != POOL_STATE_UNINITIALIZED) { 4311 if (!hardforce) 4312 spa_config_sync(spa, B_TRUE, B_TRUE); 4313 spa_remove(spa); 4314 } 4315 mutex_exit(&spa_namespace_lock); 4316 4317 return (0); 4318 } 4319 4320 /* 4321 * Destroy a storage pool. 4322 */ 4323 int 4324 spa_destroy(char *pool) 4325 { 4326 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4327 B_FALSE, B_FALSE)); 4328 } 4329 4330 /* 4331 * Export a storage pool. 
4332 */ 4333 int 4334 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4335 boolean_t hardforce) 4336 { 4337 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4338 force, hardforce)); 4339 } 4340 4341 /* 4342 * Similar to spa_export(), this unloads the spa_t without actually removing it 4343 * from the namespace in any way. 4344 */ 4345 int 4346 spa_reset(char *pool) 4347 { 4348 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4349 B_FALSE, B_FALSE)); 4350 } 4351 4352 /* 4353 * ========================================================================== 4354 * Device manipulation 4355 * ========================================================================== 4356 */ 4357 4358 /* 4359 * Add a device to a storage pool. 4360 */ 4361 int 4362 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4363 { 4364 uint64_t txg, id; 4365 int error; 4366 vdev_t *rvd = spa->spa_root_vdev; 4367 vdev_t *vd, *tvd; 4368 nvlist_t **spares, **l2cache; 4369 uint_t nspares, nl2cache; 4370 4371 ASSERT(spa_writeable(spa)); 4372 4373 txg = spa_vdev_enter(spa); 4374 4375 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4376 VDEV_ALLOC_ADD)) != 0) 4377 return (spa_vdev_exit(spa, NULL, txg, error)); 4378 4379 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4380 4381 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4382 &nspares) != 0) 4383 nspares = 0; 4384 4385 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4386 &nl2cache) != 0) 4387 nl2cache = 0; 4388 4389 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4390 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4391 4392 if (vd->vdev_children != 0 && 4393 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4394 return (spa_vdev_exit(spa, vd, txg, error)); 4395 4396 /* 4397 * We must validate the spares and l2cache devices after checking the 4398 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4399 */ 4400 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4401 return (spa_vdev_exit(spa, vd, txg, error)); 4402 4403 /* 4404 * Transfer each new top-level vdev from vd to rvd. 4405 */ 4406 for (int c = 0; c < vd->vdev_children; c++) { 4407 4408 /* 4409 * Set the vdev id to the first hole, if one exists. 4410 */ 4411 for (id = 0; id < rvd->vdev_children; id++) { 4412 if (rvd->vdev_child[id]->vdev_ishole) { 4413 vdev_free(rvd->vdev_child[id]); 4414 break; 4415 } 4416 } 4417 tvd = vd->vdev_child[c]; 4418 vdev_remove_child(vd, tvd); 4419 tvd->vdev_id = id; 4420 vdev_add_child(rvd, tvd); 4421 vdev_config_dirty(tvd); 4422 } 4423 4424 if (nspares != 0) { 4425 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4426 ZPOOL_CONFIG_SPARES); 4427 spa_load_spares(spa); 4428 spa->spa_spares.sav_sync = B_TRUE; 4429 } 4430 4431 if (nl2cache != 0) { 4432 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4433 ZPOOL_CONFIG_L2CACHE); 4434 spa_load_l2cache(spa); 4435 spa->spa_l2cache.sav_sync = B_TRUE; 4436 } 4437 4438 /* 4439 * We have to be careful when adding new vdevs to an existing pool. 4440 * If other threads start allocating from these vdevs before we 4441 * sync the config cache, and we lose power, then upon reboot we may 4442 * fail to open the pool because there are DVAs that the config cache 4443 * can't translate. Therefore, we first add the vdevs without 4444 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4445 * and then let spa_config_update() initialize the new metaslabs. 
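 *
 * (Illustrative usage, not from the original source: spa_vdev_add() backs
 * "zpool add", e.g. "zpool add tank mirror c2t0d0 c2t1d0" arrives here as
 * an nvroot containing a single mirror child, while hot spares and cache
 * devices show up only in the ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE
 * arrays of nvroot rather than as children.)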
4446 * 4447 * spa_load() checks for added-but-not-initialized vdevs, so that 4448 * if we lose power at any point in this sequence, the remaining 4449 * steps will be completed the next time we load the pool. 4450 */ 4451 (void) spa_vdev_exit(spa, vd, txg, 0); 4452 4453 mutex_enter(&spa_namespace_lock); 4454 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4455 spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); 4456 mutex_exit(&spa_namespace_lock); 4457 4458 return (0); 4459 } 4460 4461 /* 4462 * Attach a device to a mirror. The arguments are the path to any device 4463 * in the mirror, and the nvroot for the new device. If the path specifies 4464 * a device that is not mirrored, we automatically insert the mirror vdev. 4465 * 4466 * If 'replacing' is specified, the new device is intended to replace the 4467 * existing device; in this case the two devices are made into their own 4468 * mirror using the 'replacing' vdev, which is functionally identical to 4469 * the mirror vdev (it actually reuses all the same ops) but has a few 4470 * extra rules: you can't attach to it after it's been created, and upon 4471 * completion of resilvering, the first disk (the one being replaced) 4472 * is automatically detached. 4473 */ 4474 int 4475 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4476 { 4477 uint64_t txg, dtl_max_txg; 4478 vdev_t *rvd = spa->spa_root_vdev; 4479 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4480 vdev_ops_t *pvops; 4481 char *oldvdpath, *newvdpath; 4482 int newvd_isspare; 4483 int error; 4484 4485 ASSERT(spa_writeable(spa)); 4486 4487 txg = spa_vdev_enter(spa); 4488 4489 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4490 4491 if (oldvd == NULL) 4492 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4493 4494 if (!oldvd->vdev_ops->vdev_op_leaf) 4495 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4496 4497 pvd = oldvd->vdev_parent; 4498 4499 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4500 VDEV_ALLOC_ATTACH)) != 0) 4501 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4502 4503 if (newrootvd->vdev_children != 1) 4504 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4505 4506 newvd = newrootvd->vdev_child[0]; 4507 4508 if (!newvd->vdev_ops->vdev_op_leaf) 4509 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4510 4511 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4512 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4513 4514 /* 4515 * Spares can't replace logs 4516 */ 4517 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4518 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4519 4520 if (!replacing) { 4521 /* 4522 * For attach, the only allowable parent is a mirror or the root 4523 * vdev. 4524 */ 4525 if (pvd->vdev_ops != &vdev_mirror_ops && 4526 pvd->vdev_ops != &vdev_root_ops) 4527 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4528 4529 pvops = &vdev_mirror_ops; 4530 } else { 4531 /* 4532 * Active hot spares can only be replaced by inactive hot 4533 * spares. 4534 */ 4535 if (pvd->vdev_ops == &vdev_spare_ops && 4536 oldvd->vdev_isspare && 4537 !spa_has_spare(spa, newvd->vdev_guid)) 4538 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4539 4540 /* 4541 * If the source is a hot spare, and the parent isn't already a 4542 * spare, then we want to create a new hot spare. Otherwise, we 4543 * want to create a replacing vdev. 
The user is not allowed to 4544 * attach to a spared vdev child unless the 'isspare' state is 4545 * the same (spare replaces spare, non-spare replaces 4546 * non-spare). 4547 */ 4548 if (pvd->vdev_ops == &vdev_replacing_ops && 4549 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4550 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4551 } else if (pvd->vdev_ops == &vdev_spare_ops && 4552 newvd->vdev_isspare != oldvd->vdev_isspare) { 4553 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4554 } 4555 4556 if (newvd->vdev_isspare) 4557 pvops = &vdev_spare_ops; 4558 else 4559 pvops = &vdev_replacing_ops; 4560 } 4561 4562 /* 4563 * Make sure the new device is big enough. 4564 */ 4565 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4566 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4567 4568 /* 4569 * The new device cannot have a higher alignment requirement 4570 * than the top-level vdev. 4571 */ 4572 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4573 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4574 4575 /* 4576 * If this is an in-place replacement, update oldvd's path and devid 4577 * to make it distinguishable from newvd, and unopenable from now on. 4578 */ 4579 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4580 spa_strfree(oldvd->vdev_path); 4581 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4582 KM_SLEEP); 4583 (void) sprintf(oldvd->vdev_path, "%s/%s", 4584 newvd->vdev_path, "old"); 4585 if (oldvd->vdev_devid != NULL) { 4586 spa_strfree(oldvd->vdev_devid); 4587 oldvd->vdev_devid = NULL; 4588 } 4589 } 4590 4591 /* mark the device being resilvered */ 4592 newvd->vdev_resilver_txg = txg; 4593 4594 /* 4595 * If the parent is not a mirror, or if we're replacing, insert the new 4596 * mirror/replacing/spare vdev above oldvd. 4597 */ 4598 if (pvd->vdev_ops != pvops) 4599 pvd = vdev_add_parent(oldvd, pvops); 4600 4601 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4602 ASSERT(pvd->vdev_ops == pvops); 4603 ASSERT(oldvd->vdev_parent == pvd); 4604 4605 /* 4606 * Extract the new device from its root and add it to pvd. 4607 */ 4608 vdev_remove_child(newrootvd, newvd); 4609 newvd->vdev_id = pvd->vdev_children; 4610 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4611 vdev_add_child(pvd, newvd); 4612 4613 tvd = newvd->vdev_top; 4614 ASSERT(pvd->vdev_top == tvd); 4615 ASSERT(tvd->vdev_parent == rvd); 4616 4617 vdev_config_dirty(tvd); 4618 4619 /* 4620 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4621 * for any dmu_sync-ed blocks. It will propagate upward when 4622 * spa_vdev_exit() calls vdev_dtl_reassess(). 4623 */ 4624 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4625 4626 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4627 dtl_max_txg - TXG_INITIAL); 4628 4629 if (newvd->vdev_isspare) { 4630 spa_spare_activate(newvd); 4631 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4632 } 4633 4634 oldvdpath = spa_strdup(oldvd->vdev_path); 4635 newvdpath = spa_strdup(newvd->vdev_path); 4636 newvd_isspare = newvd->vdev_isspare; 4637 4638 /* 4639 * Mark newvd's DTL dirty in this txg. 4640 */ 4641 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4642 4643 /* 4644 * Schedule the resilver to restart in the future. We do this to 4645 * ensure that dmu_sync-ed blocks have been stitched into the 4646 * respective datasets. 
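 *
 * (Illustrative usage, not from the original source: spa_vdev_attach()
 * backs both "zpool attach tank c1t0d0 c1t1d0", where replacing == B_FALSE
 * and the two disks end up under a mirror vdev, and "zpool replace tank
 * c1t0d0 c1t1d0", where replacing == B_TRUE, the disks are joined under a
 * replacing vdev, and the outgoing disk is detached automatically once
 * the resilver completes.)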
4647 */ 4648 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4649 4650 if (spa->spa_bootfs) 4651 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4652 4653 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); 4654 4655 /* 4656 * Commit the config 4657 */ 4658 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4659 4660 spa_history_log_internal(spa, "vdev attach", NULL, 4661 "%s vdev=%s %s vdev=%s", 4662 replacing && newvd_isspare ? "spare in" : 4663 replacing ? "replace" : "attach", newvdpath, 4664 replacing ? "for" : "to", oldvdpath); 4665 4666 spa_strfree(oldvdpath); 4667 spa_strfree(newvdpath); 4668 4669 return (0); 4670 } 4671 4672 /* 4673 * Detach a device from a mirror or replacing vdev. 4674 * 4675 * If 'replace_done' is specified, only detach if the parent 4676 * is a replacing vdev. 4677 */ 4678 int 4679 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4680 { 4681 uint64_t txg; 4682 int error; 4683 vdev_t *rvd = spa->spa_root_vdev; 4684 vdev_t *vd, *pvd, *cvd, *tvd; 4685 boolean_t unspare = B_FALSE; 4686 uint64_t unspare_guid = 0; 4687 char *vdpath; 4688 4689 ASSERT(spa_writeable(spa)); 4690 4691 txg = spa_vdev_enter(spa); 4692 4693 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4694 4695 if (vd == NULL) 4696 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4697 4698 if (!vd->vdev_ops->vdev_op_leaf) 4699 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4700 4701 pvd = vd->vdev_parent; 4702 4703 /* 4704 * If the parent/child relationship is not as expected, don't do it. 4705 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4706 * vdev that's replacing B with C. The user's intent in replacing 4707 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4708 * the replace by detaching C, the expected behavior is to end up 4709 * M(A,B). But suppose that right after deciding to detach C, 4710 * the replacement of B completes. We would have M(A,C), and then 4711 * ask to detach C, which would leave us with just A -- not what 4712 * the user wanted. To prevent this, we make sure that the 4713 * parent/child relationship hasn't changed -- in this example, 4714 * that C's parent is still the replacing vdev R. 4715 */ 4716 if (pvd->vdev_guid != pguid && pguid != 0) 4717 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4718 4719 /* 4720 * Only 'replacing' or 'spare' vdevs can be replaced. 4721 */ 4722 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4723 pvd->vdev_ops != &vdev_spare_ops) 4724 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4725 4726 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4727 spa_version(spa) >= SPA_VERSION_SPARES); 4728 4729 /* 4730 * Only mirror, replacing, and spare vdevs support detach. 4731 */ 4732 if (pvd->vdev_ops != &vdev_replacing_ops && 4733 pvd->vdev_ops != &vdev_mirror_ops && 4734 pvd->vdev_ops != &vdev_spare_ops) 4735 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4736 4737 /* 4738 * If this device has the only valid copy of some data, 4739 * we cannot safely detach it. 4740 */ 4741 if (vdev_dtl_required(vd)) 4742 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4743 4744 ASSERT(pvd->vdev_children >= 2); 4745 4746 /* 4747 * If we are detaching the second disk from a replacing vdev, then 4748 * check to see if we changed the original vdev's path to have "/old" 4749 * at the end in spa_vdev_attach(). If so, undo that change now. 
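 *
 * (Illustrative example, not part of the original comment: an in-place
 * "zpool replace tank c1t0d0" gives the outgoing leaf a path like
 * "/dev/dsk/c1t0d0s0/old" in spa_vdev_attach().  If the user then cancels
 * the replacement by detaching the new disk, the loop below restores the
 * surviving sibling's original "/dev/dsk/c1t0d0s0" path.)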
4750 */ 4751 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4752 vd->vdev_path != NULL) { 4753 size_t len = strlen(vd->vdev_path); 4754 4755 for (int c = 0; c < pvd->vdev_children; c++) { 4756 cvd = pvd->vdev_child[c]; 4757 4758 if (cvd == vd || cvd->vdev_path == NULL) 4759 continue; 4760 4761 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4762 strcmp(cvd->vdev_path + len, "/old") == 0) { 4763 spa_strfree(cvd->vdev_path); 4764 cvd->vdev_path = spa_strdup(vd->vdev_path); 4765 break; 4766 } 4767 } 4768 } 4769 4770 /* 4771 * If we are detaching the original disk from a spare, then it implies 4772 * that the spare should become a real disk, and be removed from the 4773 * active spare list for the pool. 4774 */ 4775 if (pvd->vdev_ops == &vdev_spare_ops && 4776 vd->vdev_id == 0 && 4777 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4778 unspare = B_TRUE; 4779 4780 /* 4781 * Erase the disk labels so the disk can be used for other things. 4782 * This must be done after all other error cases are handled, 4783 * but before we disembowel vd (so we can still do I/O to it). 4784 * But if we can't do it, don't treat the error as fatal -- 4785 * it may be that the unwritability of the disk is the reason 4786 * it's being detached! 4787 */ 4788 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4789 4790 /* 4791 * Remove vd from its parent and compact the parent's children. 4792 */ 4793 vdev_remove_child(pvd, vd); 4794 vdev_compact_children(pvd); 4795 4796 /* 4797 * Remember one of the remaining children so we can get tvd below. 4798 */ 4799 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4800 4801 /* 4802 * If we need to remove the remaining child from the list of hot spares, 4803 * do it now, marking the vdev as no longer a spare in the process. 4804 * We must do this before vdev_remove_parent(), because that can 4805 * change the GUID if it creates a new toplevel GUID. For a similar 4806 * reason, we must remove the spare now, in the same txg as the detach; 4807 * otherwise someone could attach a new sibling, change the GUID, and 4808 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4809 */ 4810 if (unspare) { 4811 ASSERT(cvd->vdev_isspare); 4812 spa_spare_remove(cvd); 4813 unspare_guid = cvd->vdev_guid; 4814 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4815 cvd->vdev_unspare = B_TRUE; 4816 } 4817 4818 /* 4819 * If the parent mirror/replacing vdev only has one child, 4820 * the parent is no longer needed. Remove it from the tree. 4821 */ 4822 if (pvd->vdev_children == 1) { 4823 if (pvd->vdev_ops == &vdev_spare_ops) 4824 cvd->vdev_unspare = B_FALSE; 4825 vdev_remove_parent(cvd); 4826 } 4827 4828 4829 /* 4830 * We don't set tvd until now because the parent we just removed 4831 * may have been the previous top-level vdev. 4832 */ 4833 tvd = cvd->vdev_top; 4834 ASSERT(tvd->vdev_parent == rvd); 4835 4836 /* 4837 * Reevaluate the parent vdev state. 4838 */ 4839 vdev_propagate_state(cvd); 4840 4841 /* 4842 * If the 'autoexpand' property is set on the pool then automatically 4843 * try to expand the size of the pool. For example if the device we 4844 * just detached was smaller than the others, it may be possible to 4845 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4846 * first so that we can obtain the updated sizes of the leaf vdevs. 4847 */ 4848 if (spa->spa_autoexpand) { 4849 vdev_reopen(tvd); 4850 vdev_expand(tvd, txg); 4851 } 4852 4853 vdev_config_dirty(tvd); 4854 4855 /* 4856 * Mark vd's DTL as dirty in this txg. 
vdev_dtl_sync() will see that 4857 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4858 * But first make sure we're not on any *other* txg's DTL list, to 4859 * prevent vd from being accessed after it's freed. 4860 */ 4861 vdpath = spa_strdup(vd->vdev_path); 4862 for (int t = 0; t < TXG_SIZE; t++) 4863 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4864 vd->vdev_detached = B_TRUE; 4865 vdev_dirty(tvd, VDD_DTL, vd, txg); 4866 4867 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4868 4869 /* hang on to the spa before we release the lock */ 4870 spa_open_ref(spa, FTAG); 4871 4872 error = spa_vdev_exit(spa, vd, txg, 0); 4873 4874 spa_history_log_internal(spa, "detach", NULL, 4875 "vdev=%s", vdpath); 4876 spa_strfree(vdpath); 4877 4878 /* 4879 * If this was the removal of the original device in a hot spare vdev, 4880 * then we want to go through and remove the device from the hot spare 4881 * list of every other pool. 4882 */ 4883 if (unspare) { 4884 spa_t *altspa = NULL; 4885 4886 mutex_enter(&spa_namespace_lock); 4887 while ((altspa = spa_next(altspa)) != NULL) { 4888 if (altspa->spa_state != POOL_STATE_ACTIVE || 4889 altspa == spa) 4890 continue; 4891 4892 spa_open_ref(altspa, FTAG); 4893 mutex_exit(&spa_namespace_lock); 4894 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4895 mutex_enter(&spa_namespace_lock); 4896 spa_close(altspa, FTAG); 4897 } 4898 mutex_exit(&spa_namespace_lock); 4899 4900 /* search the rest of the vdevs for spares to remove */ 4901 spa_vdev_resilver_done(spa); 4902 } 4903 4904 /* all done with the spa; OK to release */ 4905 mutex_enter(&spa_namespace_lock); 4906 spa_close(spa, FTAG); 4907 mutex_exit(&spa_namespace_lock); 4908 4909 return (error); 4910 } 4911 4912 /* 4913 * Split a set of devices from their mirrors, and create a new pool from them. 
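 *
 * (Illustrative usage, not from the original source: this implements
 * "zpool split", e.g. "zpool split tank newtank" detaches one leg from
 * each top-level mirror in "tank" and assembles those legs into a new
 * pool named "newtank".  With exp == B_TRUE the new pool is exported
 * again at the end instead of being left active.)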
4914 */ 4915 int 4916 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4917 nvlist_t *props, boolean_t exp) 4918 { 4919 int error = 0; 4920 uint64_t txg, *glist; 4921 spa_t *newspa; 4922 uint_t c, children, lastlog; 4923 nvlist_t **child, *nvl, *tmp; 4924 dmu_tx_t *tx; 4925 char *altroot = NULL; 4926 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4927 boolean_t activate_slog; 4928 4929 ASSERT(spa_writeable(spa)); 4930 4931 txg = spa_vdev_enter(spa); 4932 4933 /* clear the log and flush everything up to now */ 4934 activate_slog = spa_passivate_log(spa); 4935 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4936 error = spa_offline_log(spa); 4937 txg = spa_vdev_config_enter(spa); 4938 4939 if (activate_slog) 4940 spa_activate_log(spa); 4941 4942 if (error != 0) 4943 return (spa_vdev_exit(spa, NULL, txg, error)); 4944 4945 /* check new spa name before going any further */ 4946 if (spa_lookup(newname) != NULL) 4947 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4948 4949 /* 4950 * scan through all the children to ensure they're all mirrors 4951 */ 4952 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4953 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4954 &children) != 0) 4955 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4956 4957 /* first, check to ensure we've got the right child count */ 4958 rvd = spa->spa_root_vdev; 4959 lastlog = 0; 4960 for (c = 0; c < rvd->vdev_children; c++) { 4961 vdev_t *vd = rvd->vdev_child[c]; 4962 4963 /* don't count the holes & logs as children */ 4964 if (vd->vdev_islog || vd->vdev_ishole) { 4965 if (lastlog == 0) 4966 lastlog = c; 4967 continue; 4968 } 4969 4970 lastlog = 0; 4971 } 4972 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4973 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4974 4975 /* next, ensure no spare or cache devices are part of the split */ 4976 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4977 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4978 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4979 4980 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4981 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4982 4983 /* then, loop over each vdev and validate it */ 4984 for (c = 0; c < children; c++) { 4985 uint64_t is_hole = 0; 4986 4987 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4988 &is_hole); 4989 4990 if (is_hole != 0) { 4991 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4992 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4993 continue; 4994 } else { 4995 error = SET_ERROR(EINVAL); 4996 break; 4997 } 4998 } 4999 5000 /* which disk is going to be split? 
*/ 5001 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5002 &glist[c]) != 0) { 5003 error = SET_ERROR(EINVAL); 5004 break; 5005 } 5006 5007 /* look it up in the spa */ 5008 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5009 if (vml[c] == NULL) { 5010 error = SET_ERROR(ENODEV); 5011 break; 5012 } 5013 5014 /* make sure there's nothing stopping the split */ 5015 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5016 vml[c]->vdev_islog || 5017 vml[c]->vdev_ishole || 5018 vml[c]->vdev_isspare || 5019 vml[c]->vdev_isl2cache || 5020 !vdev_writeable(vml[c]) || 5021 vml[c]->vdev_children != 0 || 5022 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5023 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5024 error = SET_ERROR(EINVAL); 5025 break; 5026 } 5027 5028 if (vdev_dtl_required(vml[c])) { 5029 error = SET_ERROR(EBUSY); 5030 break; 5031 } 5032 5033 /* we need certain info from the top level */ 5034 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5035 vml[c]->vdev_top->vdev_ms_array) == 0); 5036 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5037 vml[c]->vdev_top->vdev_ms_shift) == 0); 5038 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5039 vml[c]->vdev_top->vdev_asize) == 0); 5040 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5041 vml[c]->vdev_top->vdev_ashift) == 0); 5042 } 5043 5044 if (error != 0) { 5045 kmem_free(vml, children * sizeof (vdev_t *)); 5046 kmem_free(glist, children * sizeof (uint64_t)); 5047 return (spa_vdev_exit(spa, NULL, txg, error)); 5048 } 5049 5050 /* stop writers from using the disks */ 5051 for (c = 0; c < children; c++) { 5052 if (vml[c] != NULL) 5053 vml[c]->vdev_offline = B_TRUE; 5054 } 5055 vdev_reopen(spa->spa_root_vdev); 5056 5057 /* 5058 * Temporarily record the splitting vdevs in the spa config. This 5059 * will disappear once the config is regenerated. 5060 */ 5061 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5062 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5063 glist, children) == 0); 5064 kmem_free(glist, children * sizeof (uint64_t)); 5065 5066 mutex_enter(&spa->spa_props_lock); 5067 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5068 nvl) == 0); 5069 mutex_exit(&spa->spa_props_lock); 5070 spa->spa_config_splitting = nvl; 5071 vdev_config_dirty(spa->spa_root_vdev); 5072 5073 /* configure and create the new pool */ 5074 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5075 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5076 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5077 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5078 spa_version(spa)) == 0); 5079 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5080 spa->spa_config_txg) == 0); 5081 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5082 spa_generate_guid(NULL)) == 0); 5083 (void) nvlist_lookup_string(props, 5084 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5085 5086 /* add the new pool to the namespace */ 5087 newspa = spa_add(newname, config, altroot); 5088 newspa->spa_config_txg = spa->spa_config_txg; 5089 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5090 5091 /* release the spa config lock, retaining the namespace lock */ 5092 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5093 5094 if (zio_injection_enabled) 5095 zio_handle_panic_injection(spa, FTAG, 1); 5096 5097 spa_activate(newspa, spa_mode_global); 5098 spa_async_suspend(newspa); 5099 5100 /* create the new pool from the disks of the original pool */ 5101 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5102 if (error) 5103 goto out; 5104 5105 /* if that worked, generate a real config for the new pool */ 5106 if (newspa->spa_root_vdev != NULL) { 5107 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5108 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5109 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5110 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5111 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5112 B_TRUE)); 5113 } 5114 5115 /* set the props */ 5116 if (props != NULL) { 5117 spa_configfile_set(newspa, props, B_FALSE); 5118 error = spa_prop_set(newspa, props); 5119 if (error) 5120 goto out; 5121 } 5122 5123 /* flush everything */ 5124 txg = spa_vdev_config_enter(newspa); 5125 vdev_config_dirty(newspa->spa_root_vdev); 5126 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5127 5128 if (zio_injection_enabled) 5129 zio_handle_panic_injection(spa, FTAG, 2); 5130 5131 spa_async_resume(newspa); 5132 5133 /* finally, update the original pool's config */ 5134 txg = spa_vdev_config_enter(spa); 5135 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5136 error = dmu_tx_assign(tx, TXG_WAIT); 5137 if (error != 0) 5138 dmu_tx_abort(tx); 5139 for (c = 0; c < children; c++) { 5140 if (vml[c] != NULL) { 5141 vdev_split(vml[c]); 5142 if (error == 0) 5143 spa_history_log_internal(spa, "detach", tx, 5144 "vdev=%s", vml[c]->vdev_path); 5145 vdev_free(vml[c]); 5146 } 5147 } 5148 vdev_config_dirty(spa->spa_root_vdev); 5149 spa->spa_config_splitting = NULL; 5150 nvlist_free(nvl); 5151 if (error == 0) 5152 dmu_tx_commit(tx); 5153 (void) spa_vdev_exit(spa, NULL, txg, 0); 5154 5155 if (zio_injection_enabled) 5156 zio_handle_panic_injection(spa, FTAG, 3); 5157 5158 /* split is complete; log a history record */ 5159 spa_history_log_internal(newspa, "split", NULL, 5160 "from pool %s", spa_name(spa)); 5161 5162 kmem_free(vml, children * sizeof (vdev_t *)); 5163 5164 /* if we're not going to mount the filesystems in userland, export */ 5165 if (exp) 5166 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5167 B_FALSE, B_FALSE); 5168 5169 return (error); 5170 5171 out: 5172 spa_unload(newspa); 5173 spa_deactivate(newspa); 5174 spa_remove(newspa); 5175 5176 txg = spa_vdev_config_enter(spa); 5177 5178 /* re-online all offlined disks */ 5179 for (c = 0; c < children; c++) { 5180 if (vml[c] != NULL) 5181 vml[c]->vdev_offline = B_FALSE; 5182 } 5183 vdev_reopen(spa->spa_root_vdev); 5184 5185 nvlist_free(spa->spa_config_splitting); 5186 
spa->spa_config_splitting = NULL; 5187 (void) spa_vdev_exit(spa, NULL, txg, error); 5188 5189 kmem_free(vml, children * sizeof (vdev_t *)); 5190 return (error); 5191 } 5192 5193 static nvlist_t * 5194 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5195 { 5196 for (int i = 0; i < count; i++) { 5197 uint64_t guid; 5198 5199 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5200 &guid) == 0); 5201 5202 if (guid == target_guid) 5203 return (nvpp[i]); 5204 } 5205 5206 return (NULL); 5207 } 5208 5209 static void 5210 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5211 nvlist_t *dev_to_remove) 5212 { 5213 nvlist_t **newdev = NULL; 5214 5215 if (count > 1) 5216 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5217 5218 for (int i = 0, j = 0; i < count; i++) { 5219 if (dev[i] == dev_to_remove) 5220 continue; 5221 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5222 } 5223 5224 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5225 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5226 5227 for (int i = 0; i < count - 1; i++) 5228 nvlist_free(newdev[i]); 5229 5230 if (count > 1) 5231 kmem_free(newdev, (count - 1) * sizeof (void *)); 5232 } 5233 5234 /* 5235 * Evacuate the device. 5236 */ 5237 static int 5238 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5239 { 5240 uint64_t txg; 5241 int error = 0; 5242 5243 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5244 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5245 ASSERT(vd == vd->vdev_top); 5246 5247 /* 5248 * Evacuate the device. We don't hold the config lock as writer 5249 * since we need to do I/O but we do keep the 5250 * spa_namespace_lock held. Once this completes the device 5251 * should no longer have any blocks allocated on it. 5252 */ 5253 if (vd->vdev_islog) { 5254 if (vd->vdev_stat.vs_alloc != 0) 5255 error = spa_offline_log(spa); 5256 } else { 5257 error = SET_ERROR(ENOTSUP); 5258 } 5259 5260 if (error) 5261 return (error); 5262 5263 /* 5264 * The evacuation succeeded. Remove any remaining MOS metadata 5265 * associated with this vdev, and wait for these changes to sync. 5266 */ 5267 ASSERT0(vd->vdev_stat.vs_alloc); 5268 txg = spa_vdev_config_enter(spa); 5269 vd->vdev_removing = B_TRUE; 5270 vdev_dirty_leaves(vd, VDD_DTL, txg); 5271 vdev_config_dirty(vd); 5272 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5273 5274 return (0); 5275 } 5276 5277 /* 5278 * Complete the removal by cleaning up the namespace. 5279 */ 5280 static void 5281 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5282 { 5283 vdev_t *rvd = spa->spa_root_vdev; 5284 uint64_t id = vd->vdev_id; 5285 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5286 5287 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5288 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5289 ASSERT(vd == vd->vdev_top); 5290 5291 /* 5292 * Only remove any devices which are empty. 5293 */ 5294 if (vd->vdev_stat.vs_alloc != 0) 5295 return; 5296 5297 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5298 5299 if (list_link_active(&vd->vdev_state_dirty_node)) 5300 vdev_state_clean(vd); 5301 if (list_link_active(&vd->vdev_config_dirty_node)) 5302 vdev_config_clean(vd); 5303 5304 vdev_free(vd); 5305 5306 if (last_vdev) { 5307 vdev_compact_children(rvd); 5308 } else { 5309 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5310 vdev_add_child(rvd, vd); 5311 } 5312 vdev_config_dirty(rvd); 5313 5314 /* 5315 * Reassess the health of our root vdev. 
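 *
 * (Descriptive note, not part of the original comment: at this point the
 * removed top-level slot has either been compacted away or replaced with
 * a hole vdev, so reopening the root vdev re-derives its state from the
 * children that remain.)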
5316 */ 5317 vdev_reopen(rvd); 5318 } 5319 5320 /* 5321 * Remove a device from the pool - 5322 * 5323 * Removing a device from the vdev namespace requires several steps 5324 * and can take a significant amount of time. As a result we use 5325 * the spa_vdev_config_[enter/exit] functions which allow us to 5326 * grab and release the spa_config_lock while still holding the namespace 5327 * lock. During each step the configuration is synced out. 5328 * 5329 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5330 * devices. 5331 */ 5332 int 5333 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5334 { 5335 vdev_t *vd; 5336 metaslab_group_t *mg; 5337 nvlist_t **spares, **l2cache, *nv; 5338 uint64_t txg = 0; 5339 uint_t nspares, nl2cache; 5340 int error = 0; 5341 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5342 5343 ASSERT(spa_writeable(spa)); 5344 5345 if (!locked) 5346 txg = spa_vdev_enter(spa); 5347 5348 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5349 5350 if (spa->spa_spares.sav_vdevs != NULL && 5351 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5352 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5353 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5354 /* 5355 * Only remove the hot spare if it's not currently in use 5356 * in this pool. 5357 */ 5358 if (vd == NULL || unspare) { 5359 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5360 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5361 spa_load_spares(spa); 5362 spa->spa_spares.sav_sync = B_TRUE; 5363 } else { 5364 error = SET_ERROR(EBUSY); 5365 } 5366 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5367 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5368 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5369 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5370 /* 5371 * Cache devices can always be removed. 5372 */ 5373 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5374 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5375 spa_load_l2cache(spa); 5376 spa->spa_l2cache.sav_sync = B_TRUE; 5377 } else if (vd != NULL && vd->vdev_islog) { 5378 ASSERT(!locked); 5379 ASSERT(vd == vd->vdev_top); 5380 5381 mg = vd->vdev_mg; 5382 5383 /* 5384 * Stop allocating from this vdev. 5385 */ 5386 metaslab_group_passivate(mg); 5387 5388 /* 5389 * Wait for the youngest allocations and frees to sync, 5390 * and then wait for the deferral of those frees to finish. 5391 */ 5392 spa_vdev_config_exit(spa, NULL, 5393 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5394 5395 /* 5396 * Attempt to evacuate the vdev. 5397 */ 5398 error = spa_vdev_remove_evacuate(spa, vd); 5399 5400 txg = spa_vdev_config_enter(spa); 5401 5402 /* 5403 * If we couldn't evacuate the vdev, unwind. 5404 */ 5405 if (error) { 5406 metaslab_group_activate(mg); 5407 return (spa_vdev_exit(spa, NULL, txg, error)); 5408 } 5409 5410 /* 5411 * Clean up the vdev namespace. 5412 */ 5413 spa_vdev_remove_from_namespace(spa, vd); 5414 5415 } else if (vd != NULL) { 5416 /* 5417 * Normal vdevs cannot be removed (yet). 5418 */ 5419 error = SET_ERROR(ENOTSUP); 5420 } else { 5421 /* 5422 * There is no vdev of any kind with the specified guid. 5423 */ 5424 error = SET_ERROR(ENOENT); 5425 } 5426 5427 if (!locked) 5428 return (spa_vdev_exit(spa, NULL, txg, error)); 5429 5430 return (error); 5431 } 5432 5433 /* 5434 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5435 * currently spared, so we can detach it. 
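 *
 * (Illustrative example, not part of the original comment: after
 * "zpool replace tank c1t0d0 c1t1d0" finishes resilvering, the replacing
 * vdev's newest child (c1t1d0) has an empty DTL, so the hunt returns the
 * oldest child (c1t0d0) and spa_vdev_resilver_done() detaches it,
 * collapsing the one-child replacing vdev back into a plain leaf.)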
5436 */ 5437 static vdev_t * 5438 spa_vdev_resilver_done_hunt(vdev_t *vd) 5439 { 5440 vdev_t *newvd, *oldvd; 5441 5442 for (int c = 0; c < vd->vdev_children; c++) { 5443 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5444 if (oldvd != NULL) 5445 return (oldvd); 5446 } 5447 5448 /* 5449 * Check for a completed replacement. We always consider the first 5450 * vdev in the list to be the oldest vdev, and the last one to be 5451 * the newest (see spa_vdev_attach() for how that works). In 5452 * the case where the newest vdev is faulted, we will not automatically 5453 * remove it after a resilver completes. This is OK as it will require 5454 * user intervention to determine which disk the admin wishes to keep. 5455 */ 5456 if (vd->vdev_ops == &vdev_replacing_ops) { 5457 ASSERT(vd->vdev_children > 1); 5458 5459 newvd = vd->vdev_child[vd->vdev_children - 1]; 5460 oldvd = vd->vdev_child[0]; 5461 5462 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5463 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5464 !vdev_dtl_required(oldvd)) 5465 return (oldvd); 5466 } 5467 5468 /* 5469 * Check for a completed resilver with the 'unspare' flag set. 5470 */ 5471 if (vd->vdev_ops == &vdev_spare_ops) { 5472 vdev_t *first = vd->vdev_child[0]; 5473 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5474 5475 if (last->vdev_unspare) { 5476 oldvd = first; 5477 newvd = last; 5478 } else if (first->vdev_unspare) { 5479 oldvd = last; 5480 newvd = first; 5481 } else { 5482 oldvd = NULL; 5483 } 5484 5485 if (oldvd != NULL && 5486 vdev_dtl_empty(newvd, DTL_MISSING) && 5487 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5488 !vdev_dtl_required(oldvd)) 5489 return (oldvd); 5490 5491 /* 5492 * If there are more than two spares attached to a disk, 5493 * and those spares are not required, then we want to 5494 * attempt to free them up now so that they can be used 5495 * by other pools. Once we're back down to a single 5496 * disk+spare, we stop removing them. 5497 */ 5498 if (vd->vdev_children > 2) { 5499 newvd = vd->vdev_child[1]; 5500 5501 if (newvd->vdev_isspare && last->vdev_isspare && 5502 vdev_dtl_empty(last, DTL_MISSING) && 5503 vdev_dtl_empty(last, DTL_OUTAGE) && 5504 !vdev_dtl_required(newvd)) 5505 return (newvd); 5506 } 5507 } 5508 5509 return (NULL); 5510 } 5511 5512 static void 5513 spa_vdev_resilver_done(spa_t *spa) 5514 { 5515 vdev_t *vd, *pvd, *ppvd; 5516 uint64_t guid, sguid, pguid, ppguid; 5517 5518 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5519 5520 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5521 pvd = vd->vdev_parent; 5522 ppvd = pvd->vdev_parent; 5523 guid = vd->vdev_guid; 5524 pguid = pvd->vdev_guid; 5525 ppguid = ppvd->vdev_guid; 5526 sguid = 0; 5527 /* 5528 * If we have just finished replacing a hot spared device, then 5529 * we need to detach the parent's first child (the original hot 5530 * spare) as well. 5531 */ 5532 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5533 ppvd->vdev_children == 2) { 5534 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5535 sguid = ppvd->vdev_child[1]->vdev_guid; 5536 } 5537 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5538 5539 spa_config_exit(spa, SCL_ALL, FTAG); 5540 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5541 return; 5542 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5543 return; 5544 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5545 } 5546 5547 spa_config_exit(spa, SCL_ALL, FTAG); 5548 } 5549 5550 /* 5551 * Update the stored path or FRU for this vdev. 
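 *
 * (Descriptive note, not part of the original comment: spa_vdev_setpath()
 * and spa_vdev_setfru() below are thin wrappers that pass ispath ==
 * B_TRUE / B_FALSE respectively, and the vdev is only handed back for a
 * config sync when the stored value actually changes.)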
5552 */ 5553 int 5554 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5555 boolean_t ispath) 5556 { 5557 vdev_t *vd; 5558 boolean_t sync = B_FALSE; 5559 5560 ASSERT(spa_writeable(spa)); 5561 5562 spa_vdev_state_enter(spa, SCL_ALL); 5563 5564 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5565 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5566 5567 if (!vd->vdev_ops->vdev_op_leaf) 5568 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5569 5570 if (ispath) { 5571 if (strcmp(value, vd->vdev_path) != 0) { 5572 spa_strfree(vd->vdev_path); 5573 vd->vdev_path = spa_strdup(value); 5574 sync = B_TRUE; 5575 } 5576 } else { 5577 if (vd->vdev_fru == NULL) { 5578 vd->vdev_fru = spa_strdup(value); 5579 sync = B_TRUE; 5580 } else if (strcmp(value, vd->vdev_fru) != 0) { 5581 spa_strfree(vd->vdev_fru); 5582 vd->vdev_fru = spa_strdup(value); 5583 sync = B_TRUE; 5584 } 5585 } 5586 5587 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5588 } 5589 5590 int 5591 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5592 { 5593 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5594 } 5595 5596 int 5597 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5598 { 5599 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5600 } 5601 5602 /* 5603 * ========================================================================== 5604 * SPA Scanning 5605 * ========================================================================== 5606 */ 5607 5608 int 5609 spa_scan_stop(spa_t *spa) 5610 { 5611 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5612 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5613 return (SET_ERROR(EBUSY)); 5614 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5615 } 5616 5617 int 5618 spa_scan(spa_t *spa, pool_scan_func_t func) 5619 { 5620 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5621 5622 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5623 return (SET_ERROR(ENOTSUP)); 5624 5625 /* 5626 * If a resilver was requested, but there is no DTL on a 5627 * writeable leaf device, we have nothing to do. 5628 */ 5629 if (func == POOL_SCAN_RESILVER && 5630 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5631 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5632 return (0); 5633 } 5634 5635 return (dsl_scan(spa->spa_dsl_pool, func)); 5636 } 5637 5638 /* 5639 * ========================================================================== 5640 * SPA async task processing 5641 * ========================================================================== 5642 */ 5643 5644 static void 5645 spa_async_remove(spa_t *spa, vdev_t *vd) 5646 { 5647 if (vd->vdev_remove_wanted) { 5648 vd->vdev_remove_wanted = B_FALSE; 5649 vd->vdev_delayed_close = B_FALSE; 5650 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5651 5652 /* 5653 * We want to clear the stats, but we don't want to do a full 5654 * vdev_clear() as that will cause us to throw away 5655 * degraded/faulted state as well as attempt to reopen the 5656 * device, all of which is a waste. 
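 *
 * (Descriptive note, not part of the original comment: only the read,
 * write and checksum error counters are zeroed below, and the top-level
 * vdev is marked state-dirty so the REMOVED state gets pushed out.)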
5657 */ 5658 vd->vdev_stat.vs_read_errors = 0; 5659 vd->vdev_stat.vs_write_errors = 0; 5660 vd->vdev_stat.vs_checksum_errors = 0; 5661 5662 vdev_state_dirty(vd->vdev_top); 5663 } 5664 5665 for (int c = 0; c < vd->vdev_children; c++) 5666 spa_async_remove(spa, vd->vdev_child[c]); 5667 } 5668 5669 static void 5670 spa_async_probe(spa_t *spa, vdev_t *vd) 5671 { 5672 if (vd->vdev_probe_wanted) { 5673 vd->vdev_probe_wanted = B_FALSE; 5674 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5675 } 5676 5677 for (int c = 0; c < vd->vdev_children; c++) 5678 spa_async_probe(spa, vd->vdev_child[c]); 5679 } 5680 5681 static void 5682 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5683 { 5684 sysevent_id_t eid; 5685 nvlist_t *attr; 5686 char *physpath; 5687 5688 if (!spa->spa_autoexpand) 5689 return; 5690 5691 for (int c = 0; c < vd->vdev_children; c++) { 5692 vdev_t *cvd = vd->vdev_child[c]; 5693 spa_async_autoexpand(spa, cvd); 5694 } 5695 5696 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5697 return; 5698 5699 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5700 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5701 5702 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5703 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5704 5705 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5706 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 5707 5708 nvlist_free(attr); 5709 kmem_free(physpath, MAXPATHLEN); 5710 } 5711 5712 static void 5713 spa_async_thread(spa_t *spa) 5714 { 5715 int tasks; 5716 5717 ASSERT(spa->spa_sync_on); 5718 5719 mutex_enter(&spa->spa_async_lock); 5720 tasks = spa->spa_async_tasks; 5721 spa->spa_async_tasks = 0; 5722 mutex_exit(&spa->spa_async_lock); 5723 5724 /* 5725 * See if the config needs to be updated. 5726 */ 5727 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5728 uint64_t old_space, new_space; 5729 5730 mutex_enter(&spa_namespace_lock); 5731 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5732 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5733 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5734 mutex_exit(&spa_namespace_lock); 5735 5736 /* 5737 * If the pool grew as a result of the config update, 5738 * then log an internal history event. 5739 */ 5740 if (new_space != old_space) { 5741 spa_history_log_internal(spa, "vdev online", NULL, 5742 "pool '%s' size: %llu(+%llu)", 5743 spa_name(spa), new_space, new_space - old_space); 5744 } 5745 } 5746 5747 /* 5748 * See if any devices need to be marked REMOVED. 5749 */ 5750 if (tasks & SPA_ASYNC_REMOVE) { 5751 spa_vdev_state_enter(spa, SCL_NONE); 5752 spa_async_remove(spa, spa->spa_root_vdev); 5753 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5754 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5755 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5756 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5757 (void) spa_vdev_state_exit(spa, NULL, 0); 5758 } 5759 5760 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5761 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5762 spa_async_autoexpand(spa, spa->spa_root_vdev); 5763 spa_config_exit(spa, SCL_CONFIG, FTAG); 5764 } 5765 5766 /* 5767 * See if any devices need to be probed. 5768 */ 5769 if (tasks & SPA_ASYNC_PROBE) { 5770 spa_vdev_state_enter(spa, SCL_NONE); 5771 spa_async_probe(spa, spa->spa_root_vdev); 5772 (void) spa_vdev_state_exit(spa, NULL, 0); 5773 } 5774 5775 /* 5776 * If any devices are done replacing, detach them. 
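 *
 * (Descriptive note, not part of the original comment:
 * SPA_ASYNC_RESILVER_DONE is requested, for example, from spa_scan()
 * above when a resilver is asked for but no writeable leaf has an
 * outstanding DTL; spa_vdev_resilver_done() then walks the vdev tree for
 * completed replacements and finished spares to detach.)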
static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(spa, "vdev online", NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static boolean_t
spa_async_tasks_pending(spa_t *spa)
{
	uint_t non_config_tasks;
	uint_t config_task;
	boolean_t config_task_suspended;

	non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
	if (spa->spa_ccw_fail_time == 0) {
		config_task_suspended = B_FALSE;
	} else {
		config_task_suspended =
		    (gethrtime() - spa->spa_ccw_fail_time) <
		    (zfs_ccw_retry_interval * NANOSEC);
	}

	return (non_config_tasks || (config_task && !config_task_suspended));
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa_async_tasks_pending(spa) &&
	    !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL)
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static int
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	bpobj_t *bpo = arg;
	bpobj_enqueue(bpo, bp, tx);
	return (0);
}

static int
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zio_t *zio = arg;

	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
	    zio->io_flags));
	return (0);
}

/*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing frees.
 */
static void
spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
	VERIFY(zio_wait(zio) == 0);
}

/*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing deferred frees.
 */
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
	    spa_free_sync_cb, zio, tx), ==, 0);
	VERIFY0(zio_wait(zio));
}

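/*
 * Pack the given nvlist with XDR encoding and write it to the specified MOS
 * object, zero-padding out to a multiple of SPA_CONFIG_BLOCKSIZE; the packed
 * size is recorded in the object's bonus buffer.
 */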
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information. This avoids the dmu_buf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = kmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	bzero(packed + nvsize, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	kmem_free(packed, bufsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, VDEV_CONFIG_L2CACHE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

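/*
 * Rewrite the packed config object in the MOS if any vdevs are on the dirty
 * config list. The generated config is also kept in spa_config_syncing so
 * spa_sync() can make it visible to the config cache once the txg commits.
 */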
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_config_dirty_list))
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	/*
	 * If we're upgrading the spa version then make sure that
	 * the config object gets updated with the correct version.
	 */
	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
		    spa->spa_uberblock.ub_version);

	spa_config_exit(spa, SCL_STATE, FTAG);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

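/*
 * Sync-task callback that records a new pool version in the in-core
 * uberblock and dirties the vdev configuration so the labels pick up the
 * change.
 */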
static void
spa_sync_version(void *arg, dmu_tx_t *tx)
{
	uint64_t *versionp = arg;
	uint64_t version = *versionp;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	/*
	 * Setting the version is special cased when first creating the pool.
	 */
	ASSERT(tx->tx_txg != TXG_INITIAL);

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	ASSERT(version >= spa_version(spa));

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);
	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
}

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		zprop_type_t proptype;
		spa_feature_t fid;

		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPROP_INVAL:
			/*
			 * We checked this earlier in spa_prop_validate().
			 */
			ASSERT(zpool_prop_feature(nvpair_name(elem)));

			fname = strchr(nvpair_name(elem), '@') + 1;
			VERIFY0(zfeature_lookup_name(fname, &fid));

			spa_feature_enable(spa, fid, tx);
			spa_history_log_internal(spa, "set", tx,
			    "%s=enabled", nvpair_name(elem));
			break;

		case ZPOOL_PROP_VERSION:
			intval = fnvpair_value_uint64(elem);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			strval = fnvpair_value_string(elem);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated. It's unnecessary
			 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;
		default:
			/*
			 * Set pool property values in the poolprops MOS
			 * object.
			 */
			if (spa->spa_pool_props_object == 0) {
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				strval = fnvpair_value_string(elem);
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", nvpair_name(elem), strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				intval = fnvpair_value_uint64(elem);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY0(zpool_prop_index_to_string(
					    prop, intval, &unused));
				}
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", nvpair_name(elem), intval);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				if (tx->tx_txg != TXG_INITIAL)
					spa_async_request(spa,
					    SPA_ASYNC_AUTOEXPAND);
				break;
			case ZPOOL_PROP_DEDUPDITTO:
				spa->spa_dedup_ditto = intval;
				break;
			default:
				break;
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Perform one-time upgrade on-disk changes. spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	ASSERT(spa->spa_sync_pass == 1);

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}

	/*
	 * The LZ4_COMPRESS feature's behavior was changed to
	 * activate_on_enable when the ability to use lz4 compression for
	 * metadata was added. Old pools that have the feature enabled must
	 * be upgraded to have it active.
	 */
	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		boolean_t lz4_en = spa_feature_is_enabled(spa,
		    SPA_FEATURE_LZ4_COMPRESS);
		boolean_t lz4_ac = spa_feature_is_active(spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (lz4_en && !lz4_ac)
			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}

	/*
	 * If we haven't written the salt, do so now. Note that the
	 * feature may not be activated yet, but that's fine since
	 * the presence of this ZAP entry is backwards compatible.
	 */
	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
		VERIFY0(zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
		    sizeof (spa->spa_cksum_salt.zcs_bytes),
		    spa->spa_cksum_salt.zcs_bytes, tx));
	}

	rrw_exit(&dp->dp_config_rwlock, FTAG);
}

/*
 * Sync the specified transaction group. New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

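	/*
	 * Allocate a tx that is already assigned to this txg and reprogram
	 * the deadman cyclic to fire if the sync takes longer than
	 * spa_deadman_synctime.
	 */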
	tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = gethrtime();
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	    spa->spa_sync_starttime + spa->spa_deadman_synctime));

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free) {
			spa_sync_frees(spa, free_bpl, tx);
		} else {
			/*
			 * We cannot defer frees in pass 1, because
			 * we sync the deferred frees later in pass 1.
			 */
			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    &spa->spa_deferred_bpobj, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL)
			vdev_sync(vd, txg);

		if (pass == 1) {
			spa_sync_upgrades(spa, tx);
			ASSERT3U(txg, >=,
			    spa->spa_uberblock.ub_rootbp.blk_birth);
			/*
			 * Note: We need to check if the MOS is dirty
			 * because we could have marked the MOS dirty
			 * without updating the uberblock (e.g. if we
			 * have sync tasks but no dirty user data). We
			 * need to check the uberblock's rootbp because
			 * it is updated if we have synced out dirty
			 * data (though in this case the MOS will most
			 * likely also be dirty due to second order
			 * effects, we don't want to rely on that here).
			 */
			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
			    !dmu_objset_is_dirty(mos, txg)) {
				/*
				 * Nothing changed on the first pass,
				 * therefore this TXG is a no-op. Avoid
				 * syncing deferred frees, so that we
				 * can keep this TXG as a no-op.
				 */
				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
				    txg));
				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
				break;
			}
			spa_sync_deferred_frees(spa, tx);
		}

	} while (dmu_objset_is_dirty(mos, txg));

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	    != NULL)
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools. We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state. All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks. The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

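/*
 * Look up a vdev by guid. The search covers the root vdev tree and, if
 * 'aux' is set, the L2ARC and spare auxiliary vdev lists as well.
 */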
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

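/*
 * Determine whether the given guid matches one of the pool's configured or
 * pending hot spares.
 */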
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, once as a spare and
 * once as a replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event. The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
 * filled in from the spa and (optionally) the vdev. This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}