/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017 Datto Inc.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/bpobj.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
#include <sys/abd.h>

#ifdef	_KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL },	/* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) },	/* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* IOCTL */
};
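
/*
 * Illustrative sketch (comment only, not compiled): how one entry of the
 * table above is consumed.  For ZIO_TYPE_READ / ZIO_TASKQ_INTR the entry is
 * ZTI_P(12, 8), i.e.
 *
 *	const zio_taskq_info_t *zti =
 *	    &zio_taskqs[ZIO_TYPE_READ][ZIO_TASKQ_INTR];
 *	zti->zti_mode  == ZTI_MODE_FIXED
 *	zti->zti_value == 12		threads per taskq
 *	zti->zti_count == 8		number of taskqs
 *
 * so spa_taskqs_init() (later in this file) creates eight fixed-size taskqs
 * (named along the lines of "zio_read_intr_0" .. "zio_read_intr_7"), and
 * spa_taskq_dispatch_ent() picks one of the eight at random for each event
 * in order to spread lock contention.
 */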
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
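
/*
 * Rough sketch (illustrative only, values invented) of the nvlist shape that
 * spa_prop_add_list() builds.  Each pool property becomes a nested nvlist
 * keyed by its name, carrying the value plus where it came from:
 *
 *	nvlist_t *nvl = fnvlist_alloc();
 *	spa_prop_add_list(nvl, ZPOOL_PROP_COMMENT, "lab pool", 0,
 *	    ZPROP_SRC_LOCAL);
 *	spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE);
 *
 * conceptually yields:
 *
 *	comment  -> { source = ZPROP_SRC_LOCAL, value = "lab pool" }
 *	capacity -> { source = ZPROP_SRC_NONE,  value = 42 }
 */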
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools created before this version, freedir
		 * will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
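
/*
 * Minimal consumer sketch (illustrative only, error handling elided): how a
 * caller might read a single numeric property back out of the nvlist that
 * spa_prop_get() returns.  Each property is a nested nvlist keyed by its
 * name, with the value stored under ZPROP_VALUE:
 *
 *	nvlist_t *props, *propval;
 *	uint64_t cap;
 *
 *	VERIFY0(spa_prop_get(spa, &props));
 *	VERIFY0(nvlist_lookup_nvlist(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_CAPACITY), &propval));
 *	VERIFY0(nvlist_lookup_uint64(propval, ZPROP_VALUE, &cap));
 *	nvlist_free(props);
 */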
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
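
/*
 * Illustrative sketch of the nvlist shape spa_prop_validate() and
 * spa_prop_set() (below) expect from callers such as the zpool ioctl path.
 * Regular properties are keyed by name; enabling a feature uses the
 * "feature@<name>" form with a zero value.  The specific property values
 * shown here are only examples:
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_COMMENT), "scratch pool");
 *	fnvlist_add_uint64(props, "feature@lz4_compress", 0);
 *	error = spa_prop_set(spa, props);
 *	fnvlist_free(props);
 */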
/*
 * Record a new cachefile setting on the spa's config dirent list and, if
 * requested, schedule an async rewrite of the configuration cache.
 */
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_write_cachefile(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Create the taskq(s) for one (zio type, taskq type) pair according to the
 * zio_taskqs table above.
 */
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
		}

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	for (size_t i = 0; i < TXG_SIZE; i++)
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
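
/*
 * Rough lifecycle sketch (illustrative, locking and most error handling
 * elided): spa_activate() and spa_deactivate() bracket every attempt to
 * bring a pool on line.  A caller in the open/import path conceptually does:
 *
 *	spa_activate(spa, mode);
 *	error = spa_load(spa, state, SPA_IMPORT_EXISTING, B_FALSE);
 *	if (error != 0) {
 *		spa_unload(spa);
 *		spa_deactivate(spa);
 *	}
 *
 * so every structure created in spa_activate() has a matching teardown in
 * spa_deactivate() below.
 */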
/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
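
/*
 * For reference, a sketch of the nvlist shape spa_config_parse() walks (the
 * device paths are invented for illustration).  A two-way mirror parses into
 * a root vdev with one mirror child, which in turn has two leaf (disk)
 * children:
 *
 *	vdev_tree
 *	    type = "root"
 *	    children[0]
 *		type = "mirror"
 *		children[0]: type = "disk", path = "/dev/dsk/c1t0d0s0"
 *		children[1]: type = "disk", path = "/dev/dsk/c1t1d0s0"
 *
 * The recursion above allocates one vdev_t per nvlist node, attaches each
 * child to its parent, and leaves every vdev in the CLOSED state.
 */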
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Even though vdev_free() also calls vdev_metaslab_fini, we need
	 * to call it earlier, before we wait for async i/o to complete.
	 * This ensures that there is no async metaslab prefetching, by
	 * calling taskq_wait(mg_taskq).
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa->spa_indirect_vdevs_loaded = B_FALSE;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool.  When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

/*
 * Read a packed nvlist stored in a MOS object: the object's bonus buffer
 * holds the packed size, and the object data holds the packed bytes.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error != 0)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
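
/*
 * Illustrative use (sketch only, roughly what the pool-load path does): the
 * MOS copy of the pool config, and the stashed spare/l2cache configs, are
 * read back through this helper, e.g.:
 *
 *	nvlist_t *mos_config;
 *	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
 *		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 *
 * The same pattern is used with the spa_spares/spa_l2cache sav_object values
 * to repopulate sav_config before spa_load_spares()/spa_load_l2cache() run.
 */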
/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    vdev_is_concrete(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Copy the per-vdev ZAP object numbers from the MOS copy of the vdev tree
 * (mvd) into the in-core tree (vd), recursively.
 */
static void
spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
{
	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);

	vd->vdev_top_zap = mvd->vdev_top_zap;
	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd).  Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * Also trust the MOS config about any "indirect" vdevs.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if ((tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) ||
		    (mtvd->vdev_ops == &vdev_indirect_ops &&
		    tvd->vdev_ops != &vdev_indirect_ops)) {

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				if (!(spa->spa_import_flags &
				    ZFS_IMPORT_MISSING_LOG)) {
					continue;
				}

				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else if (mtvd->vdev_ops != &vdev_indirect_ops) {
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			vdev_reopen(rvd);
		} else {
			if (mtvd->vdev_islog) {
				/*
				 * Load the slog device's state from the MOS
				 * config since it's possible that the label
				 * does not contain the most up-to-date
				 * information.
				 */
				vdev_load_log_state(tvd, mtvd);
				vdev_reopen(tvd);
			}

			/*
			 * Per-vdev ZAP info is stored exclusively in the MOS.
			 */
			spa_config_valid_zaps(tvd, mtvd);
		}

		/*
		 * Never trust this info from userland; always use what's
		 * in the MOS.  This prevents it from getting out of sync
		 * with the rest of the info in the MOS.
		 */
		tvd->vdev_removing = mtvd->vdev_removing;
		tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
	}

	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;
	dsl_pool_t *dp = spa_get_dsl(spa);

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

/*
 * Stop allocating from all log (slog) top-level vdevs.  Returns B_TRUE if at
 * least one slog metaslab group was passivated.
 */
static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

/*
 * Re-enable allocations on all log top-level vdevs.
 */
static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}
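
/*
 * Rough sketch of how the two helpers above and spa_reset_logs() (below) fit
 * together when a log device is being removed (locking and details elided;
 * see the vdev removal code for the authoritative sequence):
 *
 *	if (spa_passivate_log(spa)) {
 *		// stop new allocations to the slog, then rewrite the
 *		// ZIL chains so no "stubby" blocks point at it
 *		error = spa_reset_logs(spa);
 *		if (error != 0)
 *			spa_activate_log(spa);	// back out on failure
 *	}
 */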
int
spa_reset_logs(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_reset,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	for (int i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;
	spa_t *spa = zio->io_spa;

	abd_free(zio->io_abd);
	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_inc_64(&sle->sle_meta_count);
		else
			atomic_inc_64(&sle->sle_data_count);
	}

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Maximum number of concurrent scrub I/Os to issue while verifying a pool
 * during import.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return (0);
	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set.  However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);
	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
		return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}

/* ARGSUSED */
int
verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error = 0;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
	error = dmu_objset_find_dp(spa->spa_dsl_pool,
	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
	    DS_FIND_CHILDREN);
	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
	if (error != 0)
		return (error);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	if (spa_load_verify_metadata) {
		error = traverse_pool(spa, spa->spa_verify_min_txg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    spa_load_verify_cb, rio);
	}

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}
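
/*
 * Illustrative use (sketch only): during pool load, integer-valued pool
 * properties are pulled straight out of the props ZAP into the in-core
 * spa_t, e.g.:
 *
 *	spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 *	spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &spa->spa_autoreplace);
 *	spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 *
 * The lookup is best-effort; a missing entry simply leaves *val untouched.
 */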
2047 */ 2048 static int 2049 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2050 { 2051 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2052 name, sizeof (uint64_t), 1, val)); 2053 } 2054 2055 static int 2056 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2057 { 2058 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2059 return (SET_ERROR(err)); 2060 } 2061 2062 /* 2063 * Fix up config after a partly-completed split. This is done with the 2064 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2065 * pool have that entry in their config, but only the splitting one contains 2066 * a list of all the guids of the vdevs that are being split off. 2067 * 2068 * This function determines what to do with that list: either rejoin 2069 * all the disks to the pool, or complete the splitting process. To attempt 2070 * the rejoin, each disk that is offlined is marked online again, and 2071 * we do a reopen() call. If the vdev label for every disk that was 2072 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2073 * then we call vdev_split() on each disk, and complete the split. 2074 * 2075 * Otherwise we leave the config alone, with all the vdevs in place in 2076 * the original pool. 2077 */ 2078 static void 2079 spa_try_repair(spa_t *spa, nvlist_t *config) 2080 { 2081 uint_t extracted; 2082 uint64_t *glist; 2083 uint_t i, gcount; 2084 nvlist_t *nvl; 2085 vdev_t **vd; 2086 boolean_t attempt_reopen; 2087 2088 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2089 return; 2090 2091 /* check that the config is complete */ 2092 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2093 &glist, &gcount) != 0) 2094 return; 2095 2096 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2097 2098 /* attempt to online all the vdevs & validate */ 2099 attempt_reopen = B_TRUE; 2100 for (i = 0; i < gcount; i++) { 2101 if (glist[i] == 0) /* vdev is hole */ 2102 continue; 2103 2104 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2105 if (vd[i] == NULL) { 2106 /* 2107 * Don't bother attempting to reopen the disks; 2108 * just do the split. 2109 */ 2110 attempt_reopen = B_FALSE; 2111 } else { 2112 /* attempt to re-online it */ 2113 vd[i]->vdev_offline = B_FALSE; 2114 } 2115 } 2116 2117 if (attempt_reopen) { 2118 vdev_reopen(spa->spa_root_vdev); 2119 2120 /* check each device to see what state it's in */ 2121 for (extracted = 0, i = 0; i < gcount; i++) { 2122 if (vd[i] != NULL && 2123 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2124 break; 2125 ++extracted; 2126 } 2127 } 2128 2129 /* 2130 * If every disk has been moved to the new pool, or if we never 2131 * even attempted to look at them, then we split them off for 2132 * good. 
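 *
 * In other words, the split is made permanent below iff
 * (!attempt_reopen || extracted == gcount); in every other case the
 * original configuration is left untouched.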
2133 */ 2134 if (!attempt_reopen || gcount == extracted) { 2135 for (i = 0; i < gcount; i++) 2136 if (vd[i] != NULL) 2137 vdev_split(vd[i]); 2138 vdev_reopen(spa->spa_root_vdev); 2139 } 2140 2141 kmem_free(vd, gcount * sizeof (vdev_t *)); 2142 } 2143 2144 static int 2145 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2146 boolean_t mosconfig) 2147 { 2148 nvlist_t *config = spa->spa_config; 2149 char *ereport = FM_EREPORT_ZFS_POOL; 2150 char *comment; 2151 int error; 2152 uint64_t pool_guid; 2153 nvlist_t *nvl; 2154 2155 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2156 return (SET_ERROR(EINVAL)); 2157 2158 ASSERT(spa->spa_comment == NULL); 2159 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2160 spa->spa_comment = spa_strdup(comment); 2161 2162 /* 2163 * Versioning wasn't explicitly added to the label until later, so if 2164 * it's not present treat it as the initial version. 2165 */ 2166 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2167 &spa->spa_ubsync.ub_version) != 0) 2168 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2169 2170 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2171 &spa->spa_config_txg); 2172 2173 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2174 spa_guid_exists(pool_guid, 0)) { 2175 error = SET_ERROR(EEXIST); 2176 } else { 2177 spa->spa_config_guid = pool_guid; 2178 2179 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2180 &nvl) == 0) { 2181 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2182 KM_SLEEP) == 0); 2183 } 2184 2185 nvlist_free(spa->spa_load_info); 2186 spa->spa_load_info = fnvlist_alloc(); 2187 2188 gethrestime(&spa->spa_loaded_ts); 2189 error = spa_load_impl(spa, pool_guid, config, state, type, 2190 mosconfig, &ereport); 2191 } 2192 2193 /* 2194 * Don't count references from objsets that are already closed 2195 * and are making their way through the eviction process. 2196 */ 2197 spa_evicting_os_wait(spa); 2198 spa->spa_minref = refcount_count(&spa->spa_refcount); 2199 if (error) { 2200 if (error != EEXIST) { 2201 spa->spa_loaded_ts.tv_sec = 0; 2202 spa->spa_loaded_ts.tv_nsec = 0; 2203 } 2204 if (error != EBADF) { 2205 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2206 } 2207 } 2208 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2209 spa->spa_ena = 0; 2210 2211 return (error); 2212 } 2213 2214 /* 2215 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2216 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2217 * spa's per-vdev ZAP list. 2218 */ 2219 static uint64_t 2220 vdev_count_verify_zaps(vdev_t *vd) 2221 { 2222 spa_t *spa = vd->vdev_spa; 2223 uint64_t total = 0; 2224 if (vd->vdev_top_zap != 0) { 2225 total++; 2226 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2227 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2228 } 2229 if (vd->vdev_leaf_zap != 0) { 2230 total++; 2231 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2232 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2233 } 2234 2235 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2236 total += vdev_count_verify_zaps(vd->vdev_child[i]); 2237 } 2238 2239 return (total); 2240 } 2241 2242 /* 2243 * Load an existing storage pool, using the pool's builtin spa_config as a 2244 * source of configuration information. 
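 *
 * Roughly, the steps below are: parse the config into a vdev tree, open
 * the vdevs and read their labels, select the best uberblock, check
 * feature flags, open the DSL pool and the MOS objects it references
 * (spares, l2cache, pool props, error logs, history, per-vdev ZAPs),
 * optionally run spa_load_verify(), and finally claim any uncommitted
 * ZIL blocks if the pool is writable.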
2245 */ 2246 static int 2247 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2248 spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, 2249 char **ereport) 2250 { 2251 int error = 0; 2252 nvlist_t *nvroot = NULL; 2253 nvlist_t *label; 2254 vdev_t *rvd; 2255 uberblock_t *ub = &spa->spa_uberblock; 2256 uint64_t children, config_cache_txg = spa->spa_config_txg; 2257 int orig_mode = spa->spa_mode; 2258 int parse; 2259 uint64_t obj; 2260 boolean_t missing_feat_write = B_FALSE; 2261 2262 /* 2263 * If this is an untrusted config, access the pool in read-only mode. 2264 * This prevents things like resilvering recently removed devices. 2265 */ 2266 if (!trust_config) 2267 spa->spa_mode = FREAD; 2268 2269 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2270 2271 spa->spa_load_state = state; 2272 2273 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2274 return (SET_ERROR(EINVAL)); 2275 2276 parse = (type == SPA_IMPORT_EXISTING ? 2277 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2278 2279 /* 2280 * Create "The Godfather" zio to hold all async IOs 2281 */ 2282 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2283 KM_SLEEP); 2284 for (int i = 0; i < max_ncpus; i++) { 2285 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2286 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2287 ZIO_FLAG_GODFATHER); 2288 } 2289 2290 /* 2291 * Parse the configuration into a vdev tree. We explicitly set the 2292 * value that will be returned by spa_version() since parsing the 2293 * configuration requires knowing the version number. 2294 */ 2295 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2296 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2297 spa_config_exit(spa, SCL_ALL, FTAG); 2298 2299 if (error != 0) 2300 return (error); 2301 2302 ASSERT(spa->spa_root_vdev == rvd); 2303 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2304 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2305 2306 if (type != SPA_IMPORT_ASSEMBLE) { 2307 ASSERT(spa_guid(spa) == pool_guid); 2308 } 2309 2310 /* 2311 * Try to open all vdevs, loading each label in the process. 2312 */ 2313 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2314 error = vdev_open(rvd); 2315 spa_config_exit(spa, SCL_ALL, FTAG); 2316 if (error != 0) 2317 return (error); 2318 2319 /* 2320 * We need to validate the vdev labels against the configuration that 2321 * we have in hand, which is dependent on the setting of mosconfig. If 2322 * mosconfig is true then we're validating the vdev labels based on 2323 * that config. Otherwise, we're validating against the cached config 2324 * (zpool.cache) that was read when we loaded the zfs module, and then 2325 * later we will recursively call spa_load() and validate against 2326 * the vdev config. 2327 * 2328 * If we're assembling a new pool that's been split off from an 2329 * existing pool, the labels haven't yet been updated so we skip 2330 * validation for now. 2331 */ 2332 if (type != SPA_IMPORT_ASSEMBLE) { 2333 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2334 error = vdev_validate(rvd, trust_config); 2335 spa_config_exit(spa, SCL_ALL, FTAG); 2336 2337 if (error != 0) 2338 return (error); 2339 2340 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2341 return (SET_ERROR(ENXIO)); 2342 } 2343 2344 /* 2345 * Find the best uberblock. 2346 */ 2347 vdev_uberblock_load(rvd, ub, &label); 2348 2349 /* 2350 * If we weren't able to find a single valid uberblock, return failure. 
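 * (vdev_uberblock_load() leaves the uberblock zeroed when no valid copy
 * is found, so ub_txg == 0 is the "nothing found" indicator tested here.)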
2351 */ 2352 if (ub->ub_txg == 0) { 2353 nvlist_free(label); 2354 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2355 } 2356 2357 /* 2358 * If the pool has an unsupported version we can't open it. 2359 */ 2360 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2361 nvlist_free(label); 2362 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2363 } 2364 2365 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2366 nvlist_t *features; 2367 2368 /* 2369 * If we weren't able to find what's necessary for reading the 2370 * MOS in the label, return failure. 2371 */ 2372 if (label == NULL || nvlist_lookup_nvlist(label, 2373 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2374 nvlist_free(label); 2375 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2376 ENXIO)); 2377 } 2378 2379 /* 2380 * Update our in-core representation with the definitive values 2381 * from the label. 2382 */ 2383 nvlist_free(spa->spa_label_features); 2384 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2385 } 2386 2387 nvlist_free(label); 2388 2389 /* 2390 * Look through entries in the label nvlist's features_for_read. If 2391 * there is a feature listed there which we don't understand then we 2392 * cannot open a pool. 2393 */ 2394 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2395 nvlist_t *unsup_feat; 2396 2397 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2398 0); 2399 2400 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2401 NULL); nvp != NULL; 2402 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2403 if (!zfeature_is_supported(nvpair_name(nvp))) { 2404 VERIFY(nvlist_add_string(unsup_feat, 2405 nvpair_name(nvp), "") == 0); 2406 } 2407 } 2408 2409 if (!nvlist_empty(unsup_feat)) { 2410 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2411 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2412 nvlist_free(unsup_feat); 2413 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2414 ENOTSUP)); 2415 } 2416 2417 nvlist_free(unsup_feat); 2418 } 2419 2420 /* 2421 * If the vdev guid sum doesn't match the uberblock, we have an 2422 * incomplete configuration. We first check to see if the pool 2423 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2424 * If it is, defer the vdev_guid_sum check till later so we 2425 * can handle missing vdevs. 2426 */ 2427 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2428 &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && 2429 rvd->vdev_guid_sum != ub->ub_guid_sum) 2430 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2431 2432 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2433 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2434 spa_try_repair(spa, config); 2435 spa_config_exit(spa, SCL_ALL, FTAG); 2436 nvlist_free(spa->spa_config_splitting); 2437 spa->spa_config_splitting = NULL; 2438 } 2439 2440 /* 2441 * Initialize internal SPA structures. 2442 */ 2443 spa->spa_state = POOL_STATE_ACTIVE; 2444 spa->spa_ubsync = spa->spa_uberblock; 2445 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2446 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2447 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 2448 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2449 spa->spa_claim_max_txg = spa->spa_first_txg; 2450 spa->spa_prev_software_version = ub->ub_software_version; 2451 2452 /* 2453 * Everything that we read before we do spa_remove_init() must 2454 * have been rewritten after the last device removal was initiated. 
2455 * Otherwise we could be reading from indirect vdevs before 2456 * we have loaded their mappings. 2457 */ 2458 2459 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2460 if (error) 2461 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2462 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2463 2464 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2465 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2466 2467 /* 2468 * Validate the config, using the MOS config to fill in any 2469 * information which might be missing. If we fail to validate 2470 * the config then declare the pool unfit for use. If we're 2471 * assembling a pool from a split, the log is not transferred 2472 * over. 2473 */ 2474 if (type != SPA_IMPORT_ASSEMBLE) { 2475 nvlist_t *mos_config; 2476 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) 2477 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2478 2479 if (!spa_config_valid(spa, mos_config)) { 2480 nvlist_free(mos_config); 2481 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2482 ENXIO)); 2483 } 2484 nvlist_free(mos_config); 2485 2486 /* 2487 * Now that we've validated the config, check the state of the 2488 * root vdev. If it can't be opened, it indicates one or 2489 * more toplevel vdevs are faulted. 2490 */ 2491 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2492 return (SET_ERROR(ENXIO)); 2493 } 2494 2495 /* 2496 * Everything that we read before spa_remove_init() must be stored 2497 * on concreted vdevs. Therefore we do this as early as possible. 2498 */ 2499 if (spa_remove_init(spa) != 0) 2500 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2501 2502 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2503 boolean_t missing_feat_read = B_FALSE; 2504 nvlist_t *unsup_feat, *enabled_feat; 2505 2506 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2507 &spa->spa_feat_for_read_obj) != 0) { 2508 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2509 } 2510 2511 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2512 &spa->spa_feat_for_write_obj) != 0) { 2513 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2514 } 2515 2516 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2517 &spa->spa_feat_desc_obj) != 0) { 2518 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2519 } 2520 2521 enabled_feat = fnvlist_alloc(); 2522 unsup_feat = fnvlist_alloc(); 2523 2524 if (!spa_features_check(spa, B_FALSE, 2525 unsup_feat, enabled_feat)) 2526 missing_feat_read = B_TRUE; 2527 2528 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2529 if (!spa_features_check(spa, B_TRUE, 2530 unsup_feat, enabled_feat)) { 2531 missing_feat_write = B_TRUE; 2532 } 2533 } 2534 2535 fnvlist_add_nvlist(spa->spa_load_info, 2536 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2537 2538 if (!nvlist_empty(unsup_feat)) { 2539 fnvlist_add_nvlist(spa->spa_load_info, 2540 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2541 } 2542 2543 fnvlist_free(enabled_feat); 2544 fnvlist_free(unsup_feat); 2545 2546 if (!missing_feat_read) { 2547 fnvlist_add_boolean(spa->spa_load_info, 2548 ZPOOL_CONFIG_CAN_RDONLY); 2549 } 2550 2551 /* 2552 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2553 * twofold: to determine whether the pool is available for 2554 * import in read-write mode and (if it is not) whether the 2555 * pool is available for import in read-only mode. 
If the pool 2556 * is available for import in read-write mode, it is displayed 2557 * as available in userland; if it is not available for import 2558 * in read-only mode, it is displayed as unavailable in 2559 * userland. If the pool is available for import in read-only 2560 * mode but not read-write mode, it is displayed as unavailable 2561 * in userland with a special note that the pool is actually 2562 * available for open in read-only mode. 2563 * 2564 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2565 * missing a feature for write, we must first determine whether 2566 * the pool can be opened read-only before returning to 2567 * userland in order to know whether to display the 2568 * abovementioned note. 2569 */ 2570 if (missing_feat_read || (missing_feat_write && 2571 spa_writeable(spa))) { 2572 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2573 ENOTSUP)); 2574 } 2575 2576 /* 2577 * Load refcounts for ZFS features from disk into an in-memory 2578 * cache during SPA initialization. 2579 */ 2580 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2581 uint64_t refcount; 2582 2583 error = feature_get_refcount_from_disk(spa, 2584 &spa_feature_table[i], &refcount); 2585 if (error == 0) { 2586 spa->spa_feat_refcount_cache[i] = refcount; 2587 } else if (error == ENOTSUP) { 2588 spa->spa_feat_refcount_cache[i] = 2589 SPA_FEATURE_DISABLED; 2590 } else { 2591 return (spa_vdev_err(rvd, 2592 VDEV_AUX_CORRUPT_DATA, EIO)); 2593 } 2594 } 2595 } 2596 2597 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2598 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2599 &spa->spa_feat_enabled_txg_obj) != 0) 2600 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2601 } 2602 2603 spa->spa_is_initializing = B_TRUE; 2604 error = dsl_pool_open(spa->spa_dsl_pool); 2605 spa->spa_is_initializing = B_FALSE; 2606 if (error != 0) 2607 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2608 2609 if (!trust_config) { 2610 uint64_t hostid; 2611 nvlist_t *policy = NULL; 2612 nvlist_t *mos_config; 2613 2614 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) 2615 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2616 2617 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 2618 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2619 char *hostname; 2620 unsigned long myhostid = 0; 2621 2622 VERIFY(nvlist_lookup_string(mos_config, 2623 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2624 2625 #ifdef _KERNEL 2626 myhostid = zone_get_hostid(NULL); 2627 #else /* _KERNEL */ 2628 /* 2629 * We're emulating the system's hostid in userland, so 2630 * we can't use zone_get_hostid(). 2631 */ 2632 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2633 #endif /* _KERNEL */ 2634 if (hostid != 0 && myhostid != 0 && 2635 hostid != myhostid) { 2636 nvlist_free(mos_config); 2637 cmn_err(CE_WARN, "pool '%s' could not be " 2638 "loaded as it was last accessed by " 2639 "another system (host: %s hostid: 0x%lx). " 2640 "See: http://illumos.org/msg/ZFS-8000-EY", 2641 spa_name(spa), hostname, 2642 (unsigned long)hostid); 2643 return (SET_ERROR(EBADF)); 2644 } 2645 } 2646 if (nvlist_lookup_nvlist(spa->spa_config, 2647 ZPOOL_REWIND_POLICY, &policy) == 0) 2648 VERIFY(nvlist_add_nvlist(mos_config, 2649 ZPOOL_REWIND_POLICY, policy) == 0); 2650 2651 spa_config_set(spa, mos_config); 2652 spa_unload(spa); 2653 spa_deactivate(spa); 2654 spa_activate(spa, orig_mode); 2655 2656 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2657 } 2658 2659 /* Grab the secret checksum salt from the MOS. 
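 * If the entry is missing (zap_lookup() returns ENOENT, e.g. for pools
 * that predate salted checksums), a fresh random salt is generated below
 * instead of failing the load.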
*/ 2660 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2661 DMU_POOL_CHECKSUM_SALT, 1, 2662 sizeof (spa->spa_cksum_salt.zcs_bytes), 2663 spa->spa_cksum_salt.zcs_bytes); 2664 if (error == ENOENT) { 2665 /* Generate a new salt for subsequent use */ 2666 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2667 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2668 } else if (error != 0) { 2669 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2670 } 2671 2672 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2673 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2674 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2675 if (error != 0) 2676 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2677 2678 /* 2679 * Load the bit that tells us to use the new accounting function 2680 * (raid-z deflation). If we have an older pool, this will not 2681 * be present. 2682 */ 2683 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2684 if (error != 0 && error != ENOENT) 2685 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2686 2687 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2688 &spa->spa_creation_version); 2689 if (error != 0 && error != ENOENT) 2690 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2691 2692 /* 2693 * Load the persistent error log. If we have an older pool, this will 2694 * not be present. 2695 */ 2696 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2697 if (error != 0 && error != ENOENT) 2698 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2699 2700 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2701 &spa->spa_errlog_scrub); 2702 if (error != 0 && error != ENOENT) 2703 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2704 2705 /* 2706 * Load the history object. If we have an older pool, this 2707 * will not be present. 2708 */ 2709 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2710 if (error != 0 && error != ENOENT) 2711 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2712 2713 /* 2714 * Load the per-vdev ZAP map. If we have an older pool, this will not 2715 * be present; in this case, defer its creation to a later time to 2716 * avoid dirtying the MOS this early / out of sync context. See 2717 * spa_sync_config_object. 2718 */ 2719 2720 /* The sentinel is only available in the MOS config. */ 2721 nvlist_t *mos_config; 2722 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) 2723 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2724 2725 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 2726 &spa->spa_all_vdev_zaps); 2727 2728 if (error == ENOENT) { 2729 VERIFY(!nvlist_exists(mos_config, 2730 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 2731 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 2732 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2733 } else if (error != 0) { 2734 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2735 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 2736 /* 2737 * An older version of ZFS overwrote the sentinel value, so 2738 * we have orphaned per-vdev ZAPs in the MOS. Defer their 2739 * destruction to later; see spa_sync_config_object. 2740 */ 2741 spa->spa_avz_action = AVZ_ACTION_DESTROY; 2742 /* 2743 * We're assuming that no vdevs have had their ZAPs created 2744 * before this. Better be sure of it. 
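 * The ASSERT0(vdev_count_verify_zaps()) below walks the entire vdev tree
 * counting per-vdev ZAPs, so a violation of that assumption will trip the
 * assertion on debug kernels.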
2745 */ 2746 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2747 } 2748 nvlist_free(mos_config); 2749 2750 /* 2751 * If we're assembling the pool from the split-off vdevs of 2752 * an existing pool, we don't want to attach the spares & cache 2753 * devices. 2754 */ 2755 2756 /* 2757 * Load any hot spares for this pool. 2758 */ 2759 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2760 if (error != 0 && error != ENOENT) 2761 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2762 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2763 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2764 if (load_nvlist(spa, spa->spa_spares.sav_object, 2765 &spa->spa_spares.sav_config) != 0) 2766 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2767 2768 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2769 spa_load_spares(spa); 2770 spa_config_exit(spa, SCL_ALL, FTAG); 2771 } else if (error == 0) { 2772 spa->spa_spares.sav_sync = B_TRUE; 2773 } 2774 2775 /* 2776 * Load any level 2 ARC devices for this pool. 2777 */ 2778 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2779 &spa->spa_l2cache.sav_object); 2780 if (error != 0 && error != ENOENT) 2781 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2782 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2783 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2784 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2785 &spa->spa_l2cache.sav_config) != 0) 2786 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2787 2788 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2789 spa_load_l2cache(spa); 2790 spa_config_exit(spa, SCL_ALL, FTAG); 2791 } else if (error == 0) { 2792 spa->spa_l2cache.sav_sync = B_TRUE; 2793 } 2794 2795 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2796 2797 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2798 if (error && error != ENOENT) 2799 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2800 2801 if (error == 0) { 2802 uint64_t autoreplace; 2803 2804 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2805 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2806 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2807 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2808 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2809 spa_prop_find(spa, ZPOOL_PROP_BOOTSIZE, &spa->spa_bootsize); 2810 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2811 &spa->spa_dedup_ditto); 2812 2813 spa->spa_autoreplace = (autoreplace != 0); 2814 } 2815 2816 /* 2817 * If the 'autoreplace' property is set, then post a resource notifying 2818 * the ZFS DE that it should not issue any faults for unopenable 2819 * devices. We also iterate over the vdevs, and post a sysevent for any 2820 * unopenable vdevs so that the normal autoreplace handler can take 2821 * over. 2822 */ 2823 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2824 spa_check_removed(spa->spa_root_vdev); 2825 /* 2826 * For the import case, this is done in spa_import(), because 2827 * at this point we're using the spare definitions from 2828 * the MOS config, not necessarily from the userland config. 2829 */ 2830 if (state != SPA_LOAD_IMPORT) { 2831 spa_aux_check_removed(&spa->spa_spares); 2832 spa_aux_check_removed(&spa->spa_l2cache); 2833 } 2834 } 2835 2836 /* 2837 * Load the vdev state for all toplevel vdevs. 
2838 */ 2839 error = vdev_load(rvd); 2840 if (error != 0) { 2841 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 2842 } 2843 2844 error = spa_condense_init(spa); 2845 if (error != 0) { 2846 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 2847 } 2848 2849 /* 2850 * Propagate the leaf DTLs we just loaded all the way up the tree. 2851 */ 2852 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2853 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2854 spa_config_exit(spa, SCL_ALL, FTAG); 2855 2856 /* 2857 * Load the DDTs (dedup tables). 2858 */ 2859 error = ddt_load(spa); 2860 if (error != 0) 2861 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2862 2863 spa_update_dspace(spa); 2864 2865 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa) && 2866 spa_check_logs(spa)) { 2867 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2868 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2869 } 2870 2871 if (missing_feat_write) { 2872 ASSERT(state == SPA_LOAD_TRYIMPORT); 2873 2874 /* 2875 * At this point, we know that we can open the pool in 2876 * read-only mode but not read-write mode. We now have enough 2877 * information and can return to userland. 2878 */ 2879 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2880 } 2881 2882 /* 2883 * We've successfully opened the pool, verify that we're ready 2884 * to start pushing transactions. 2885 */ 2886 if (state != SPA_LOAD_TRYIMPORT) { 2887 if (error = spa_load_verify(spa)) 2888 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2889 error)); 2890 } 2891 2892 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2893 spa->spa_load_max_txg == UINT64_MAX)) { 2894 dmu_tx_t *tx; 2895 int need_update = B_FALSE; 2896 dsl_pool_t *dp = spa_get_dsl(spa); 2897 2898 /* 2899 * We must check this before we start the sync thread, because 2900 * we only want to start a condense thread for condense 2901 * operations that were in progress when the pool was 2902 * imported. Once we start syncing, spa_sync() could 2903 * initiate a condense (and start a thread for it). In 2904 * that case it would be wrong to start a second 2905 * condense thread. 2906 */ 2907 boolean_t condense_in_progress = 2908 (spa->spa_condensing_indirect != NULL); 2909 2910 ASSERT(state != SPA_LOAD_TRYIMPORT); 2911 2912 /* 2913 * Claim log blocks that haven't been committed yet. 2914 * This must all happen in a single txg. 2915 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2916 * invoked from zil_claim_log_block()'s i/o done callback. 2917 * Price of rollback is that we abandon the log. 2918 */ 2919 spa->spa_claiming = B_TRUE; 2920 2921 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 2922 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2923 zil_claim, tx, DS_FIND_CHILDREN); 2924 dmu_tx_commit(tx); 2925 2926 spa->spa_claiming = B_FALSE; 2927 2928 spa_set_log_state(spa, SPA_LOG_GOOD); 2929 spa->spa_sync_on = B_TRUE; 2930 txg_sync_start(spa->spa_dsl_pool); 2931 2932 /* 2933 * Wait for all claims to sync. We sync up to the highest 2934 * claimed log block birth time so that claimed log blocks 2935 * don't appear to be from the future. spa_claim_max_txg 2936 * will have been set for us by either zil_check_log_chain() 2937 * (invoked from spa_check_logs()) or zil_claim() above. 2938 */ 2939 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2940 2941 /* 2942 * If the config cache is stale, or we have uninitialized 2943 * metaslabs (see spa_vdev_add()), then update the config. 
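 * (A top-level vdev whose vdev_ms_array is still zero has never had its
 * metaslab array created; the loop below checks for exactly that.)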
2944 * 2945 * If this is a verbatim import, trust the current 2946 * in-core spa_config and update the disk labels. 2947 */ 2948 if (config_cache_txg != spa->spa_config_txg || 2949 state == SPA_LOAD_IMPORT || 2950 state == SPA_LOAD_RECOVER || 2951 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2952 need_update = B_TRUE; 2953 2954 for (int c = 0; c < rvd->vdev_children; c++) 2955 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2956 need_update = B_TRUE; 2957 2958 /* 2959 * Update the config cache asynchronously in case we're the 2960 * root pool, in which case the config cache isn't writable yet. 2961 */ 2962 if (need_update) 2963 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2964 2965 /* 2966 * Check all DTLs to see if anything needs resilvering. 2967 */ 2968 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2969 vdev_resilver_needed(rvd, NULL, NULL)) 2970 spa_async_request(spa, SPA_ASYNC_RESILVER); 2971 2972 /* 2973 * Log the fact that we booted up (so that we can detect if 2974 * we rebooted in the middle of an operation). 2975 */ 2976 spa_history_log_version(spa, "open"); 2977 2978 /* 2979 * Delete any inconsistent datasets. 2980 */ 2981 (void) dmu_objset_find(spa_name(spa), 2982 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2983 2984 /* 2985 * Clean up any stale temporary dataset userrefs. 2986 */ 2987 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2988 2989 /* 2990 * Note: unlike condensing, we don't need an analogous 2991 * "removal_in_progress" dance because no other thread 2992 * can start a removal while we hold the spa_namespace_lock. 2993 */ 2994 spa_restart_removal(spa); 2995 2996 if (condense_in_progress) 2997 spa_condense_indirect_restart(spa); 2998 } 2999 3000 return (0); 3001 } 3002 3003 static int 3004 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 3005 { 3006 int mode = spa->spa_mode; 3007 3008 spa_unload(spa); 3009 spa_deactivate(spa); 3010 3011 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 3012 3013 spa_activate(spa, mode); 3014 spa_async_suspend(spa); 3015 3016 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 3017 } 3018 3019 /* 3020 * If spa_load() fails this function will try loading prior txgs. If 3021 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 3022 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 3023 * function will not rewind the pool and will return the same error as 3024 * spa_load().
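 *
 * The rewind search is bounded: unless ZPOOL_EXTREME_REWIND is requested
 * we never step back past safe_rewind_txg (the txg of the last synced
 * uberblock minus TXG_DEFER_SIZE); with extreme rewind we may walk
 * uberblocks all the way back to TXG_INITIAL.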
3025 */ 3026 static int 3027 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 3028 uint64_t max_request, int rewind_flags) 3029 { 3030 nvlist_t *loadinfo = NULL; 3031 nvlist_t *config = NULL; 3032 int load_error, rewind_error; 3033 uint64_t safe_rewind_txg; 3034 uint64_t min_txg; 3035 3036 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 3037 spa->spa_load_max_txg = spa->spa_load_txg; 3038 spa_set_log_state(spa, SPA_LOG_CLEAR); 3039 } else { 3040 spa->spa_load_max_txg = max_request; 3041 if (max_request != UINT64_MAX) 3042 spa->spa_extreme_rewind = B_TRUE; 3043 } 3044 3045 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 3046 mosconfig); 3047 if (load_error == 0) 3048 return (0); 3049 3050 if (spa->spa_root_vdev != NULL) 3051 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3052 3053 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 3054 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 3055 3056 if (rewind_flags & ZPOOL_NEVER_REWIND) { 3057 nvlist_free(config); 3058 return (load_error); 3059 } 3060 3061 if (state == SPA_LOAD_RECOVER) { 3062 /* Price of rolling back is discarding txgs, including log */ 3063 spa_set_log_state(spa, SPA_LOG_CLEAR); 3064 } else { 3065 /* 3066 * If we aren't rolling back save the load info from our first 3067 * import attempt so that we can restore it after attempting 3068 * to rewind. 3069 */ 3070 loadinfo = spa->spa_load_info; 3071 spa->spa_load_info = fnvlist_alloc(); 3072 } 3073 3074 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 3075 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 3076 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 3077 TXG_INITIAL : safe_rewind_txg; 3078 3079 /* 3080 * Continue as long as we're finding errors, we're still within 3081 * the acceptable rewind range, and we're still finding uberblocks 3082 */ 3083 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 3084 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 3085 if (spa->spa_load_max_txg < safe_rewind_txg) 3086 spa->spa_extreme_rewind = B_TRUE; 3087 rewind_error = spa_load_retry(spa, state, mosconfig); 3088 } 3089 3090 spa->spa_extreme_rewind = B_FALSE; 3091 spa->spa_load_max_txg = UINT64_MAX; 3092 3093 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 3094 spa_config_set(spa, config); 3095 else 3096 nvlist_free(config); 3097 3098 if (state == SPA_LOAD_RECOVER) { 3099 ASSERT3P(loadinfo, ==, NULL); 3100 return (rewind_error); 3101 } else { 3102 /* Store the rewind info as part of the initial load info */ 3103 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 3104 spa->spa_load_info); 3105 3106 /* Restore the initial load info */ 3107 fnvlist_free(spa->spa_load_info); 3108 spa->spa_load_info = loadinfo; 3109 3110 return (load_error); 3111 } 3112 } 3113 3114 /* 3115 * Pool Open/Import 3116 * 3117 * The import case is identical to an open except that the configuration is sent 3118 * down from userland, instead of grabbed from the configuration cache. For the 3119 * case of an open, the pool configuration will exist in the 3120 * POOL_STATE_UNINITIALIZED state. 3121 * 3122 * The stats information (gen/count/ustats) is used to gather vdev statistics at 3123 * the same time open the pool, without having to keep around the spa_t in some 3124 * ambiguous state. 
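 *
 * A typical in-kernel consumer brackets its use of the pool with
 * spa_open()/spa_close(), e.g. (illustrative sketch only; "tank" is just a
 * placeholder pool name):
 *
 *	spa_t *spa;
 *	int error = spa_open("tank", &spa, FTAG);
 *	if (error == 0) {
 *		(... use the pool under the open reference ...)
 *		spa_close(spa, FTAG);
 *	}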
3125 */ 3126 static int 3127 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 3128 nvlist_t **config) 3129 { 3130 spa_t *spa; 3131 spa_load_state_t state = SPA_LOAD_OPEN; 3132 int error; 3133 int locked = B_FALSE; 3134 3135 *spapp = NULL; 3136 3137 /* 3138 * As disgusting as this is, we need to support recursive calls to this 3139 * function because dsl_dir_open() is called during spa_load(), and ends 3140 * up calling spa_open() again. The real fix is to figure out how to 3141 * avoid dsl_dir_open() calling this in the first place. 3142 */ 3143 if (mutex_owner(&spa_namespace_lock) != curthread) { 3144 mutex_enter(&spa_namespace_lock); 3145 locked = B_TRUE; 3146 } 3147 3148 if ((spa = spa_lookup(pool)) == NULL) { 3149 if (locked) 3150 mutex_exit(&spa_namespace_lock); 3151 return (SET_ERROR(ENOENT)); 3152 } 3153 3154 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3155 zpool_rewind_policy_t policy; 3156 3157 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 3158 &policy); 3159 if (policy.zrp_request & ZPOOL_DO_REWIND) 3160 state = SPA_LOAD_RECOVER; 3161 3162 spa_activate(spa, spa_mode_global); 3163 3164 if (state != SPA_LOAD_RECOVER) 3165 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3166 3167 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3168 policy.zrp_request); 3169 3170 if (error == EBADF) { 3171 /* 3172 * If vdev_validate() returns failure (indicated by 3173 * EBADF), it indicates that one of the vdevs indicates 3174 * that the pool has been exported or destroyed. If 3175 * this is the case, the config cache is out of sync and 3176 * we should remove the pool from the namespace. 3177 */ 3178 spa_unload(spa); 3179 spa_deactivate(spa); 3180 spa_write_cachefile(spa, B_TRUE, B_TRUE); 3181 spa_remove(spa); 3182 if (locked) 3183 mutex_exit(&spa_namespace_lock); 3184 return (SET_ERROR(ENOENT)); 3185 } 3186 3187 if (error) { 3188 /* 3189 * We can't open the pool, but we still have useful 3190 * information: the state of each vdev after the 3191 * attempted vdev_open(). Return this to the user. 3192 */ 3193 if (config != NULL && spa->spa_config) { 3194 VERIFY(nvlist_dup(spa->spa_config, config, 3195 KM_SLEEP) == 0); 3196 VERIFY(nvlist_add_nvlist(*config, 3197 ZPOOL_CONFIG_LOAD_INFO, 3198 spa->spa_load_info) == 0); 3199 } 3200 spa_unload(spa); 3201 spa_deactivate(spa); 3202 spa->spa_last_open_failed = error; 3203 if (locked) 3204 mutex_exit(&spa_namespace_lock); 3205 *spapp = NULL; 3206 return (error); 3207 } 3208 } 3209 3210 spa_open_ref(spa, tag); 3211 3212 if (config != NULL) 3213 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3214 3215 /* 3216 * If we've recovered the pool, pass back any information we 3217 * gathered while doing the load. 
3218 */ 3219 if (state == SPA_LOAD_RECOVER) { 3220 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3221 spa->spa_load_info) == 0); 3222 } 3223 3224 if (locked) { 3225 spa->spa_last_open_failed = 0; 3226 spa->spa_last_ubsync_txg = 0; 3227 spa->spa_load_txg = 0; 3228 mutex_exit(&spa_namespace_lock); 3229 } 3230 3231 *spapp = spa; 3232 3233 return (0); 3234 } 3235 3236 int 3237 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3238 nvlist_t **config) 3239 { 3240 return (spa_open_common(name, spapp, tag, policy, config)); 3241 } 3242 3243 int 3244 spa_open(const char *name, spa_t **spapp, void *tag) 3245 { 3246 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3247 } 3248 3249 /* 3250 * Lookup the given spa_t, incrementing the inject count in the process, 3251 * preventing it from being exported or destroyed. 3252 */ 3253 spa_t * 3254 spa_inject_addref(char *name) 3255 { 3256 spa_t *spa; 3257 3258 mutex_enter(&spa_namespace_lock); 3259 if ((spa = spa_lookup(name)) == NULL) { 3260 mutex_exit(&spa_namespace_lock); 3261 return (NULL); 3262 } 3263 spa->spa_inject_ref++; 3264 mutex_exit(&spa_namespace_lock); 3265 3266 return (spa); 3267 } 3268 3269 void 3270 spa_inject_delref(spa_t *spa) 3271 { 3272 mutex_enter(&spa_namespace_lock); 3273 spa->spa_inject_ref--; 3274 mutex_exit(&spa_namespace_lock); 3275 } 3276 3277 /* 3278 * Add spares device information to the nvlist. 3279 */ 3280 static void 3281 spa_add_spares(spa_t *spa, nvlist_t *config) 3282 { 3283 nvlist_t **spares; 3284 uint_t i, nspares; 3285 nvlist_t *nvroot; 3286 uint64_t guid; 3287 vdev_stat_t *vs; 3288 uint_t vsc; 3289 uint64_t pool; 3290 3291 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3292 3293 if (spa->spa_spares.sav_count == 0) 3294 return; 3295 3296 VERIFY(nvlist_lookup_nvlist(config, 3297 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3298 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3299 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3300 if (nspares != 0) { 3301 VERIFY(nvlist_add_nvlist_array(nvroot, 3302 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3303 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3304 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3305 3306 /* 3307 * Go through and find any spares which have since been 3308 * repurposed as an active spare. If this is the case, update 3309 * their status appropriately. 3310 */ 3311 for (i = 0; i < nspares; i++) { 3312 VERIFY(nvlist_lookup_uint64(spares[i], 3313 ZPOOL_CONFIG_GUID, &guid) == 0); 3314 if (spa_spare_exists(guid, &pool, NULL) && 3315 pool != 0ULL) { 3316 VERIFY(nvlist_lookup_uint64_array( 3317 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3318 (uint64_t **)&vs, &vsc) == 0); 3319 vs->vs_state = VDEV_STATE_CANT_OPEN; 3320 vs->vs_aux = VDEV_AUX_SPARED; 3321 } 3322 } 3323 } 3324 } 3325 3326 /* 3327 * Add l2cache device information to the nvlist, including vdev stats. 
3328 */ 3329 static void 3330 spa_add_l2cache(spa_t *spa, nvlist_t *config) 3331 { 3332 nvlist_t **l2cache; 3333 uint_t i, j, nl2cache; 3334 nvlist_t *nvroot; 3335 uint64_t guid; 3336 vdev_t *vd; 3337 vdev_stat_t *vs; 3338 uint_t vsc; 3339 3340 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3341 3342 if (spa->spa_l2cache.sav_count == 0) 3343 return; 3344 3345 VERIFY(nvlist_lookup_nvlist(config, 3346 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3347 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3348 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3349 if (nl2cache != 0) { 3350 VERIFY(nvlist_add_nvlist_array(nvroot, 3351 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3352 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3353 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3354 3355 /* 3356 * Update level 2 cache device stats. 3357 */ 3358 3359 for (i = 0; i < nl2cache; i++) { 3360 VERIFY(nvlist_lookup_uint64(l2cache[i], 3361 ZPOOL_CONFIG_GUID, &guid) == 0); 3362 3363 vd = NULL; 3364 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3365 if (guid == 3366 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3367 vd = spa->spa_l2cache.sav_vdevs[j]; 3368 break; 3369 } 3370 } 3371 ASSERT(vd != NULL); 3372 3373 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3374 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3375 == 0); 3376 vdev_get_stats(vd, vs); 3377 } 3378 } 3379 } 3380 3381 static void 3382 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3383 { 3384 nvlist_t *features; 3385 zap_cursor_t zc; 3386 zap_attribute_t za; 3387 3388 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3389 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3390 3391 if (spa->spa_feat_for_read_obj != 0) { 3392 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3393 spa->spa_feat_for_read_obj); 3394 zap_cursor_retrieve(&zc, &za) == 0; 3395 zap_cursor_advance(&zc)) { 3396 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3397 za.za_num_integers == 1); 3398 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3399 za.za_first_integer)); 3400 } 3401 zap_cursor_fini(&zc); 3402 } 3403 3404 if (spa->spa_feat_for_write_obj != 0) { 3405 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3406 spa->spa_feat_for_write_obj); 3407 zap_cursor_retrieve(&zc, &za) == 0; 3408 zap_cursor_advance(&zc)) { 3409 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3410 za.za_num_integers == 1); 3411 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3412 za.za_first_integer)); 3413 } 3414 zap_cursor_fini(&zc); 3415 } 3416 3417 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3418 features) == 0); 3419 nvlist_free(features); 3420 } 3421 3422 int 3423 spa_get_stats(const char *name, nvlist_t **config, 3424 char *altroot, size_t buflen) 3425 { 3426 int error; 3427 spa_t *spa; 3428 3429 *config = NULL; 3430 error = spa_open_common(name, &spa, FTAG, NULL, config); 3431 3432 if (spa != NULL) { 3433 /* 3434 * This still leaves a window of inconsistency where the spares 3435 * or l2cache devices could change and the config would be 3436 * self-inconsistent. 
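 * The window is between spa_open_common() generating *config above and
 * this thread taking SCL_CONFIG as reader below; spa_add_spares() and
 * spa_add_l2cache() then re-read the aux device lists under that lock.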
3437 */ 3438 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3439 3440 if (*config != NULL) { 3441 uint64_t loadtimes[2]; 3442 3443 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3444 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3445 VERIFY(nvlist_add_uint64_array(*config, 3446 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3447 3448 VERIFY(nvlist_add_uint64(*config, 3449 ZPOOL_CONFIG_ERRCOUNT, 3450 spa_get_errlog_size(spa)) == 0); 3451 3452 if (spa_suspended(spa)) 3453 VERIFY(nvlist_add_uint64(*config, 3454 ZPOOL_CONFIG_SUSPENDED, 3455 spa->spa_failmode) == 0); 3456 3457 spa_add_spares(spa, *config); 3458 spa_add_l2cache(spa, *config); 3459 spa_add_feature_stats(spa, *config); 3460 } 3461 } 3462 3463 /* 3464 * We want to get the alternate root even for faulted pools, so we cheat 3465 * and call spa_lookup() directly. 3466 */ 3467 if (altroot) { 3468 if (spa == NULL) { 3469 mutex_enter(&spa_namespace_lock); 3470 spa = spa_lookup(name); 3471 if (spa) 3472 spa_altroot(spa, altroot, buflen); 3473 else 3474 altroot[0] = '\0'; 3475 spa = NULL; 3476 mutex_exit(&spa_namespace_lock); 3477 } else { 3478 spa_altroot(spa, altroot, buflen); 3479 } 3480 } 3481 3482 if (spa != NULL) { 3483 spa_config_exit(spa, SCL_CONFIG, FTAG); 3484 spa_close(spa, FTAG); 3485 } 3486 3487 return (error); 3488 } 3489 3490 /* 3491 * Validate that the auxiliary device array is well formed. We must have an 3492 * array of nvlists, each which describes a valid leaf vdev. If this is an 3493 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3494 * specified, as long as they are well-formed. 3495 */ 3496 static int 3497 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3498 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3499 vdev_labeltype_t label) 3500 { 3501 nvlist_t **dev; 3502 uint_t i, ndev; 3503 vdev_t *vd; 3504 int error; 3505 3506 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3507 3508 /* 3509 * It's acceptable to have no devs specified. 3510 */ 3511 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3512 return (0); 3513 3514 if (ndev == 0) 3515 return (SET_ERROR(EINVAL)); 3516 3517 /* 3518 * Make sure the pool is formatted with a version that supports this 3519 * device type. 3520 */ 3521 if (spa_version(spa) < version) 3522 return (SET_ERROR(ENOTSUP)); 3523 3524 /* 3525 * Set the pending device list so we correctly handle device in-use 3526 * checking. 3527 */ 3528 sav->sav_pending = dev; 3529 sav->sav_npending = ndev; 3530 3531 for (i = 0; i < ndev; i++) { 3532 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3533 mode)) != 0) 3534 goto out; 3535 3536 if (!vd->vdev_ops->vdev_op_leaf) { 3537 vdev_free(vd); 3538 error = SET_ERROR(EINVAL); 3539 goto out; 3540 } 3541 3542 /* 3543 * The L2ARC currently only supports disk devices in 3544 * kernel context. For user-level testing, we allow it. 
3545 */ 3546 #ifdef _KERNEL 3547 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3548 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3549 error = SET_ERROR(ENOTBLK); 3550 vdev_free(vd); 3551 goto out; 3552 } 3553 #endif 3554 vd->vdev_top = vd; 3555 3556 if ((error = vdev_open(vd)) == 0 && 3557 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3558 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3559 vd->vdev_guid) == 0); 3560 } 3561 3562 vdev_free(vd); 3563 3564 if (error && 3565 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3566 goto out; 3567 else 3568 error = 0; 3569 } 3570 3571 out: 3572 sav->sav_pending = NULL; 3573 sav->sav_npending = 0; 3574 return (error); 3575 } 3576 3577 static int 3578 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3579 { 3580 int error; 3581 3582 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3583 3584 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3585 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3586 VDEV_LABEL_SPARE)) != 0) { 3587 return (error); 3588 } 3589 3590 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3591 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3592 VDEV_LABEL_L2CACHE)); 3593 } 3594 3595 static void 3596 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3597 const char *config) 3598 { 3599 int i; 3600 3601 if (sav->sav_config != NULL) { 3602 nvlist_t **olddevs; 3603 uint_t oldndevs; 3604 nvlist_t **newdevs; 3605 3606 /* 3607 * Generate new dev list by concatentating with the 3608 * current dev list. 3609 */ 3610 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3611 &olddevs, &oldndevs) == 0); 3612 3613 newdevs = kmem_alloc(sizeof (void *) * 3614 (ndevs + oldndevs), KM_SLEEP); 3615 for (i = 0; i < oldndevs; i++) 3616 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3617 KM_SLEEP) == 0); 3618 for (i = 0; i < ndevs; i++) 3619 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3620 KM_SLEEP) == 0); 3621 3622 VERIFY(nvlist_remove(sav->sav_config, config, 3623 DATA_TYPE_NVLIST_ARRAY) == 0); 3624 3625 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3626 config, newdevs, ndevs + oldndevs) == 0); 3627 for (i = 0; i < oldndevs + ndevs; i++) 3628 nvlist_free(newdevs[i]); 3629 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3630 } else { 3631 /* 3632 * Generate a new dev list. 3633 */ 3634 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3635 KM_SLEEP) == 0); 3636 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3637 devs, ndevs) == 0); 3638 } 3639 } 3640 3641 /* 3642 * Stop and drop level 2 ARC devices 3643 */ 3644 void 3645 spa_l2cache_drop(spa_t *spa) 3646 { 3647 vdev_t *vd; 3648 int i; 3649 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3650 3651 for (i = 0; i < sav->sav_count; i++) { 3652 uint64_t pool; 3653 3654 vd = sav->sav_vdevs[i]; 3655 ASSERT(vd != NULL); 3656 3657 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3658 pool != 0ULL && l2arc_vdev_present(vd)) 3659 l2arc_remove_vdev(vd); 3660 } 3661 } 3662 3663 /* 3664 * Pool Creation 3665 */ 3666 int 3667 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3668 nvlist_t *zplprops) 3669 { 3670 spa_t *spa; 3671 char *altroot = NULL; 3672 vdev_t *rvd; 3673 dsl_pool_t *dp; 3674 dmu_tx_t *tx; 3675 int error = 0; 3676 uint64_t txg = TXG_INITIAL; 3677 nvlist_t **spares, **l2cache; 3678 uint_t nspares, nl2cache; 3679 uint64_t version, obj; 3680 boolean_t has_features; 3681 3682 /* 3683 * If this pool already exists, return failure. 
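 * (spa_namespace_lock is taken below and held across the entire creation,
 * so pool creation is serialized against open, import, and export.)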
3684 */ 3685 mutex_enter(&spa_namespace_lock); 3686 if (spa_lookup(pool) != NULL) { 3687 mutex_exit(&spa_namespace_lock); 3688 return (SET_ERROR(EEXIST)); 3689 } 3690 3691 /* 3692 * Allocate a new spa_t structure. 3693 */ 3694 (void) nvlist_lookup_string(props, 3695 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3696 spa = spa_add(pool, NULL, altroot); 3697 spa_activate(spa, spa_mode_global); 3698 3699 if (props && (error = spa_prop_validate(spa, props))) { 3700 spa_deactivate(spa); 3701 spa_remove(spa); 3702 mutex_exit(&spa_namespace_lock); 3703 return (error); 3704 } 3705 3706 has_features = B_FALSE; 3707 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3708 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3709 if (zpool_prop_feature(nvpair_name(elem))) 3710 has_features = B_TRUE; 3711 } 3712 3713 if (has_features || nvlist_lookup_uint64(props, 3714 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3715 version = SPA_VERSION; 3716 } 3717 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3718 3719 spa->spa_first_txg = txg; 3720 spa->spa_uberblock.ub_txg = txg - 1; 3721 spa->spa_uberblock.ub_version = version; 3722 spa->spa_ubsync = spa->spa_uberblock; 3723 spa->spa_load_state = SPA_LOAD_CREATE; 3724 spa->spa_removing_phys.sr_state = DSS_NONE; 3725 spa->spa_removing_phys.sr_removing_vdev = -1; 3726 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 3727 3728 /* 3729 * Create "The Godfather" zio to hold all async IOs 3730 */ 3731 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3732 KM_SLEEP); 3733 for (int i = 0; i < max_ncpus; i++) { 3734 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3735 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3736 ZIO_FLAG_GODFATHER); 3737 } 3738 3739 /* 3740 * Create the root vdev. 3741 */ 3742 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3743 3744 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3745 3746 ASSERT(error != 0 || rvd != NULL); 3747 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3748 3749 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3750 error = SET_ERROR(EINVAL); 3751 3752 if (error == 0 && 3753 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3754 (error = spa_validate_aux(spa, nvroot, txg, 3755 VDEV_ALLOC_ADD)) == 0) { 3756 for (int c = 0; c < rvd->vdev_children; c++) { 3757 vdev_metaslab_set_size(rvd->vdev_child[c]); 3758 vdev_expand(rvd->vdev_child[c], txg); 3759 } 3760 } 3761 3762 spa_config_exit(spa, SCL_ALL, FTAG); 3763 3764 if (error != 0) { 3765 spa_unload(spa); 3766 spa_deactivate(spa); 3767 spa_remove(spa); 3768 mutex_exit(&spa_namespace_lock); 3769 return (error); 3770 } 3771 3772 /* 3773 * Get the list of spares, if specified. 3774 */ 3775 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3776 &spares, &nspares) == 0) { 3777 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3778 KM_SLEEP) == 0); 3779 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3780 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3781 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3782 spa_load_spares(spa); 3783 spa_config_exit(spa, SCL_ALL, FTAG); 3784 spa->spa_spares.sav_sync = B_TRUE; 3785 } 3786 3787 /* 3788 * Get the list of level 2 cache devices, if specified. 
3789 */ 3790 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3791 &l2cache, &nl2cache) == 0) { 3792 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3793 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3794 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3795 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3796 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3797 spa_load_l2cache(spa); 3798 spa_config_exit(spa, SCL_ALL, FTAG); 3799 spa->spa_l2cache.sav_sync = B_TRUE; 3800 } 3801 3802 spa->spa_is_initializing = B_TRUE; 3803 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3804 spa->spa_meta_objset = dp->dp_meta_objset; 3805 spa->spa_is_initializing = B_FALSE; 3806 3807 /* 3808 * Create DDTs (dedup tables). 3809 */ 3810 ddt_create(spa); 3811 3812 spa_update_dspace(spa); 3813 3814 tx = dmu_tx_create_assigned(dp, txg); 3815 3816 /* 3817 * Create the pool config object. 3818 */ 3819 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3820 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3821 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3822 3823 if (zap_add(spa->spa_meta_objset, 3824 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3825 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3826 cmn_err(CE_PANIC, "failed to add pool config"); 3827 } 3828 3829 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3830 spa_feature_create_zap_objects(spa, tx); 3831 3832 if (zap_add(spa->spa_meta_objset, 3833 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3834 sizeof (uint64_t), 1, &version, tx) != 0) { 3835 cmn_err(CE_PANIC, "failed to add pool version"); 3836 } 3837 3838 /* Newly created pools with the right version are always deflated. */ 3839 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3840 spa->spa_deflate = TRUE; 3841 if (zap_add(spa->spa_meta_objset, 3842 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3843 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3844 cmn_err(CE_PANIC, "failed to add deflate"); 3845 } 3846 } 3847 3848 /* 3849 * Create the deferred-free bpobj. Turn off compression 3850 * because sync-to-convergence takes longer if the blocksize 3851 * keeps changing. 3852 */ 3853 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3854 dmu_object_set_compress(spa->spa_meta_objset, obj, 3855 ZIO_COMPRESS_OFF, tx); 3856 if (zap_add(spa->spa_meta_objset, 3857 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3858 sizeof (uint64_t), 1, &obj, tx) != 0) { 3859 cmn_err(CE_PANIC, "failed to add bpobj"); 3860 } 3861 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3862 spa->spa_meta_objset, obj)); 3863 3864 /* 3865 * Create the pool's history object. 3866 */ 3867 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3868 spa_history_create_obj(spa, tx); 3869 3870 /* 3871 * Generate some random noise for salted checksums to operate on. 3872 */ 3873 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3874 sizeof (spa->spa_cksum_salt.zcs_bytes)); 3875 3876 /* 3877 * Set pool properties. 
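 * Numeric defaults come from zpool_prop_default_numeric(); any properties
 * supplied by the caller are then applied on top via spa_sync_props() in
 * this same creation txg.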
3878 */ 3879 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3880 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3881 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3882 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3883 3884 if (props != NULL) { 3885 spa_configfile_set(spa, props, B_FALSE); 3886 spa_sync_props(props, tx); 3887 } 3888 3889 dmu_tx_commit(tx); 3890 3891 spa->spa_sync_on = B_TRUE; 3892 txg_sync_start(spa->spa_dsl_pool); 3893 3894 /* 3895 * We explicitly wait for the first transaction to complete so that our 3896 * bean counters are appropriately updated. 3897 */ 3898 txg_wait_synced(spa->spa_dsl_pool, txg); 3899 3900 spa_write_cachefile(spa, B_FALSE, B_TRUE); 3901 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 3902 3903 spa_history_log_version(spa, "create"); 3904 3905 /* 3906 * Don't count references from objsets that are already closed 3907 * and are making their way through the eviction process. 3908 */ 3909 spa_evicting_os_wait(spa); 3910 spa->spa_minref = refcount_count(&spa->spa_refcount); 3911 spa->spa_load_state = SPA_LOAD_NONE; 3912 3913 mutex_exit(&spa_namespace_lock); 3914 3915 return (0); 3916 } 3917 3918 #ifdef _KERNEL 3919 /* 3920 * Get the root pool information from the root disk, then import the root pool 3921 * during the system boot up time. 3922 */ 3923 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3924 3925 static nvlist_t * 3926 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3927 { 3928 nvlist_t *config; 3929 nvlist_t *nvtop, *nvroot; 3930 uint64_t pgid; 3931 3932 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3933 return (NULL); 3934 3935 /* 3936 * Add this top-level vdev to the child array. 3937 */ 3938 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3939 &nvtop) == 0); 3940 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3941 &pgid) == 0); 3942 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3943 3944 /* 3945 * Put this pool's top-level vdevs into a root vdev. 3946 */ 3947 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3948 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3949 VDEV_TYPE_ROOT) == 0); 3950 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3951 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3952 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3953 &nvtop, 1) == 0); 3954 3955 /* 3956 * Replace the existing vdev_tree with the new root vdev in 3957 * this pool's configuration (remove the old, add the new). 3958 */ 3959 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3960 nvlist_free(nvroot); 3961 return (config); 3962 } 3963 3964 /* 3965 * Walk the vdev tree and see if we can find a device with "better" 3966 * configuration. A configuration is "better" if the label on that 3967 * device has a more recent txg. 3968 */ 3969 static void 3970 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3971 { 3972 for (int c = 0; c < vd->vdev_children; c++) 3973 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3974 3975 if (vd->vdev_ops->vdev_op_leaf) { 3976 nvlist_t *label; 3977 uint64_t label_txg; 3978 3979 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3980 &label) != 0) 3981 return; 3982 3983 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3984 &label_txg) == 0); 3985 3986 /* 3987 * Do we have a better boot device? 
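 * For example, if the boot side of a mirrored root stopped at txg 1000 but
 * the other side's label reads txg 1005, *avd ends up pointing at the
 * txg-1005 leaf, and spa_import_rootpool() below will refuse the import and
 * ask the user to boot from that newer device instead.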
3988 */ 3989 if (label_txg > *txg) { 3990 *txg = label_txg; 3991 *avd = vd; 3992 } 3993 nvlist_free(label); 3994 } 3995 } 3996 3997 /* 3998 * Import a root pool. 3999 * 4000 * For x86, devpath_list will consist of the devid and/or physpath name of 4001 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 4002 * The GRUB "findroot" command will return the vdev we should boot. 4003 * 4004 * For Sparc, devpath_list consists of the physpath name of the booting device, 4005 * regardless of whether the root pool is a single-device pool or a mirrored pool. 4006 * e.g. 4007 * "/pci@1f,0/ide@d/disk@0,0:a" 4008 */ 4009 int 4010 spa_import_rootpool(char *devpath, char *devid) 4011 { 4012 spa_t *spa; 4013 vdev_t *rvd, *bvd, *avd = NULL; 4014 nvlist_t *config, *nvtop; 4015 uint64_t guid, txg; 4016 char *pname; 4017 int error; 4018 4019 /* 4020 * Read the label from the boot device and generate a configuration. 4021 */ 4022 config = spa_generate_rootconf(devpath, devid, &guid); 4023 #if defined(_OBP) && defined(_KERNEL) 4024 if (config == NULL) { 4025 if (strstr(devpath, "/iscsi/ssd") != NULL) { 4026 /* iscsi boot */ 4027 get_iscsi_bootpath_phy(devpath); 4028 config = spa_generate_rootconf(devpath, devid, &guid); 4029 } 4030 } 4031 #endif 4032 if (config == NULL) { 4033 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 4034 devpath); 4035 return (SET_ERROR(EIO)); 4036 } 4037 4038 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4039 &pname) == 0); 4040 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 4041 4042 mutex_enter(&spa_namespace_lock); 4043 if ((spa = spa_lookup(pname)) != NULL) { 4044 /* 4045 * Remove the existing root pool from the namespace so that we 4046 * can replace it with the correct config we just read in. 4047 */ 4048 spa_remove(spa); 4049 } 4050 4051 spa = spa_add(pname, config, NULL); 4052 spa->spa_is_root = B_TRUE; 4053 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4054 4055 /* 4056 * Build up a vdev tree based on the boot device's label config. 4057 */ 4058 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4059 &nvtop) == 0); 4060 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4061 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4062 VDEV_ALLOC_ROOTPOOL); 4063 spa_config_exit(spa, SCL_ALL, FTAG); 4064 if (error) { 4065 mutex_exit(&spa_namespace_lock); 4066 nvlist_free(config); 4067 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4068 pname); 4069 return (error); 4070 } 4071 4072 /* 4073 * Get the boot vdev. 4074 */ 4075 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 4076 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 4077 (u_longlong_t)guid); 4078 error = SET_ERROR(ENOENT); 4079 goto out; 4080 } 4081 4082 /* 4083 * Determine if there is a better boot device. 4084 */ 4085 avd = bvd; 4086 spa_alt_rootvdev(rvd, &avd, &txg); 4087 if (avd != bvd) { 4088 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 4089 "try booting from '%s'", avd->vdev_path); 4090 error = SET_ERROR(EINVAL); 4091 goto out; 4092 } 4093 4094 /* 4095 * If the boot device is part of a spare vdev then ensure that 4096 * we're booting off the active spare. 4097 */ 4098 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 4099 !bvd->vdev_isspare) { 4100 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 4101 "try booting from '%s'", 4102 bvd->vdev_parent-> 4103 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 4104 error = SET_ERROR(EINVAL); 4105 goto out; 4106 } 4107 4108 error = 0; 4109 out: 4110 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4111 vdev_free(rvd); 4112 spa_config_exit(spa, SCL_ALL, FTAG); 4113 mutex_exit(&spa_namespace_lock); 4114 4115 nvlist_free(config); 4116 return (error); 4117 } 4118 4119 #endif 4120 4121 /* 4122 * Import a non-root pool into the system. 4123 */ 4124 int 4125 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4126 { 4127 spa_t *spa; 4128 char *altroot = NULL; 4129 spa_load_state_t state = SPA_LOAD_IMPORT; 4130 zpool_rewind_policy_t policy; 4131 uint64_t mode = spa_mode_global; 4132 uint64_t readonly = B_FALSE; 4133 int error; 4134 nvlist_t *nvroot; 4135 nvlist_t **spares, **l2cache; 4136 uint_t nspares, nl2cache; 4137 4138 /* 4139 * If a pool with this name exists, return failure. 4140 */ 4141 mutex_enter(&spa_namespace_lock); 4142 if (spa_lookup(pool) != NULL) { 4143 mutex_exit(&spa_namespace_lock); 4144 return (SET_ERROR(EEXIST)); 4145 } 4146 4147 /* 4148 * Create and initialize the spa structure. 4149 */ 4150 (void) nvlist_lookup_string(props, 4151 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4152 (void) nvlist_lookup_uint64(props, 4153 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4154 if (readonly) 4155 mode = FREAD; 4156 spa = spa_add(pool, config, altroot); 4157 spa->spa_import_flags = flags; 4158 4159 /* 4160 * Verbatim import - Take a pool and insert it into the namespace 4161 * as if it had been loaded at boot. 4162 */ 4163 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4164 if (props != NULL) 4165 spa_configfile_set(spa, props, B_FALSE); 4166 4167 spa_write_cachefile(spa, B_FALSE, B_TRUE); 4168 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4169 4170 mutex_exit(&spa_namespace_lock); 4171 return (0); 4172 } 4173 4174 spa_activate(spa, mode); 4175 4176 /* 4177 * Don't start async tasks until we know everything is healthy. 4178 */ 4179 spa_async_suspend(spa); 4180 4181 zpool_get_rewind_policy(config, &policy); 4182 if (policy.zrp_request & ZPOOL_DO_REWIND) 4183 state = SPA_LOAD_RECOVER; 4184 4185 /* 4186 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4187 * because the user-supplied config is actually the one to trust when 4188 * doing an import. 4189 */ 4190 if (state != SPA_LOAD_RECOVER) 4191 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4192 4193 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4194 policy.zrp_request); 4195 4196 /* 4197 * Propagate anything learned while loading the pool and pass it 4198 * back to caller (i.e. rewind info, missing devices, etc). 4199 */ 4200 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4201 spa->spa_load_info) == 0); 4202 4203 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4204 /* 4205 * Toss any existing sparelist, as it doesn't have any validity 4206 * anymore, and conflicts with spa_has_spare(). 
4207 */ 4208 if (spa->spa_spares.sav_config) { 4209 nvlist_free(spa->spa_spares.sav_config); 4210 spa->spa_spares.sav_config = NULL; 4211 spa_load_spares(spa); 4212 } 4213 if (spa->spa_l2cache.sav_config) { 4214 nvlist_free(spa->spa_l2cache.sav_config); 4215 spa->spa_l2cache.sav_config = NULL; 4216 spa_load_l2cache(spa); 4217 } 4218 4219 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4220 &nvroot) == 0); 4221 if (error == 0) 4222 error = spa_validate_aux(spa, nvroot, -1ULL, 4223 VDEV_ALLOC_SPARE); 4224 if (error == 0) 4225 error = spa_validate_aux(spa, nvroot, -1ULL, 4226 VDEV_ALLOC_L2CACHE); 4227 spa_config_exit(spa, SCL_ALL, FTAG); 4228 4229 if (props != NULL) 4230 spa_configfile_set(spa, props, B_FALSE); 4231 4232 if (error != 0 || (props && spa_writeable(spa) && 4233 (error = spa_prop_set(spa, props)))) { 4234 spa_unload(spa); 4235 spa_deactivate(spa); 4236 spa_remove(spa); 4237 mutex_exit(&spa_namespace_lock); 4238 return (error); 4239 } 4240 4241 spa_async_resume(spa); 4242 4243 /* 4244 * Override any spares and level 2 cache devices as specified by 4245 * the user, as these may have correct device names/devids, etc. 4246 */ 4247 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4248 &spares, &nspares) == 0) { 4249 if (spa->spa_spares.sav_config) 4250 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4251 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4252 else 4253 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4254 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4255 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4256 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4257 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4258 spa_load_spares(spa); 4259 spa_config_exit(spa, SCL_ALL, FTAG); 4260 spa->spa_spares.sav_sync = B_TRUE; 4261 } 4262 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4263 &l2cache, &nl2cache) == 0) { 4264 if (spa->spa_l2cache.sav_config) 4265 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4266 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4267 else 4268 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4269 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4270 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4271 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4272 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4273 spa_load_l2cache(spa); 4274 spa_config_exit(spa, SCL_ALL, FTAG); 4275 spa->spa_l2cache.sav_sync = B_TRUE; 4276 } 4277 4278 /* 4279 * Check for any removed devices. 4280 */ 4281 if (spa->spa_autoreplace) { 4282 spa_aux_check_removed(&spa->spa_spares); 4283 spa_aux_check_removed(&spa->spa_l2cache); 4284 } 4285 4286 if (spa_writeable(spa)) { 4287 /* 4288 * Update the config cache to include the newly-imported pool. 4289 */ 4290 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4291 } 4292 4293 /* 4294 * It's possible that the pool was expanded while it was exported. 4295 * We kick off an async task to handle this for us. 
4296 */ 4297 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4298 4299 spa_history_log_version(spa, "import"); 4300 4301 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4302 4303 mutex_exit(&spa_namespace_lock); 4304 4305 return (0); 4306 } 4307 4308 nvlist_t * 4309 spa_tryimport(nvlist_t *tryconfig) 4310 { 4311 nvlist_t *config = NULL; 4312 char *poolname; 4313 spa_t *spa; 4314 uint64_t state; 4315 int error; 4316 4317 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4318 return (NULL); 4319 4320 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4321 return (NULL); 4322 4323 /* 4324 * Create and initialize the spa structure. 4325 */ 4326 mutex_enter(&spa_namespace_lock); 4327 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4328 spa_activate(spa, FREAD); 4329 4330 /* 4331 * Pass off the heavy lifting to spa_load(). 4332 * Pass TRUE for mosconfig because the user-supplied config 4333 * is actually the one to trust when doing an import. 4334 */ 4335 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4336 4337 /* 4338 * If 'tryconfig' was at least parsable, return the current config. 4339 */ 4340 if (spa->spa_root_vdev != NULL) { 4341 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4342 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4343 poolname) == 0); 4344 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4345 state) == 0); 4346 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4347 spa->spa_uberblock.ub_timestamp) == 0); 4348 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4349 spa->spa_load_info) == 0); 4350 4351 /* 4352 * If the bootfs property exists on this pool then we 4353 * copy it out so that external consumers can tell which 4354 * pools are bootable. 4355 */ 4356 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4357 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4358 4359 /* 4360 * We have to play games with the name since the 4361 * pool was opened as TRYIMPORT_NAME. 4362 */ 4363 if (dsl_dsobj_to_dsname(spa_name(spa), 4364 spa->spa_bootfs, tmpname) == 0) { 4365 char *cp; 4366 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4367 4368 cp = strchr(tmpname, '/'); 4369 if (cp == NULL) { 4370 (void) strlcpy(dsname, tmpname, 4371 MAXPATHLEN); 4372 } else { 4373 (void) snprintf(dsname, MAXPATHLEN, 4374 "%s/%s", poolname, ++cp); 4375 } 4376 VERIFY(nvlist_add_string(config, 4377 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4378 kmem_free(dsname, MAXPATHLEN); 4379 } 4380 kmem_free(tmpname, MAXPATHLEN); 4381 } 4382 4383 /* 4384 * Add the list of hot spares and level 2 cache devices. 4385 */ 4386 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4387 spa_add_spares(spa, config); 4388 spa_add_l2cache(spa, config); 4389 spa_config_exit(spa, SCL_CONFIG, FTAG); 4390 } 4391 4392 spa_unload(spa); 4393 spa_deactivate(spa); 4394 spa_remove(spa); 4395 mutex_exit(&spa_namespace_lock); 4396 4397 return (config); 4398 } 4399 4400 /* 4401 * Pool export/destroy 4402 * 4403 * The act of destroying or exporting a pool is very simple. We make sure there 4404 * is no more pending I/O and any references to the pool are gone. Then, we 4405 * update the pool state and sync all the labels to disk, removing the 4406 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4407 * we don't sync the labels or remove the configuration cache. 
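 * The 'force' flag, in turn, allows the export to proceed even when the
 * pool still holds an active shared spare (see the EXDEV check below).
 *
 * The entry points that follow all funnel into spa_export_common():
 *
 *	spa_destroy()	new_state = POOL_STATE_DESTROYED
 *	spa_export()	new_state = POOL_STATE_EXPORTED
 *	spa_reset()	new_state = POOL_STATE_UNINITIALIZED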
4408 */ 4409 static int 4410 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4411 boolean_t force, boolean_t hardforce) 4412 { 4413 spa_t *spa; 4414 4415 if (oldconfig) 4416 *oldconfig = NULL; 4417 4418 if (!(spa_mode_global & FWRITE)) 4419 return (SET_ERROR(EROFS)); 4420 4421 mutex_enter(&spa_namespace_lock); 4422 if ((spa = spa_lookup(pool)) == NULL) { 4423 mutex_exit(&spa_namespace_lock); 4424 return (SET_ERROR(ENOENT)); 4425 } 4426 4427 /* 4428 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4429 * reacquire the namespace lock, and see if we can export. 4430 */ 4431 spa_open_ref(spa, FTAG); 4432 mutex_exit(&spa_namespace_lock); 4433 spa_async_suspend(spa); 4434 mutex_enter(&spa_namespace_lock); 4435 spa_close(spa, FTAG); 4436 4437 /* 4438 * The pool will be in core if it's openable, 4439 * in which case we can modify its state. 4440 */ 4441 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4442 /* 4443 * Objsets may be open only because they're dirty, so we 4444 * have to force it to sync before checking spa_refcnt. 4445 */ 4446 txg_wait_synced(spa->spa_dsl_pool, 0); 4447 spa_evicting_os_wait(spa); 4448 4449 /* 4450 * A pool cannot be exported or destroyed if there are active 4451 * references. If we are resetting a pool, allow references by 4452 * fault injection handlers. 4453 */ 4454 if (!spa_refcount_zero(spa) || 4455 (spa->spa_inject_ref != 0 && 4456 new_state != POOL_STATE_UNINITIALIZED)) { 4457 spa_async_resume(spa); 4458 mutex_exit(&spa_namespace_lock); 4459 return (SET_ERROR(EBUSY)); 4460 } 4461 4462 /* 4463 * A pool cannot be exported if it has an active shared spare. 4464 * This is to prevent other pools stealing the active spare 4465 * from an exported pool. Such a pool can 4466 * still be forcibly exported at the user's explicit request. 4467 */ 4468 if (!force && new_state == POOL_STATE_EXPORTED && 4469 spa_has_active_shared_spare(spa)) { 4470 spa_async_resume(spa); 4471 mutex_exit(&spa_namespace_lock); 4472 return (SET_ERROR(EXDEV)); 4473 } 4474 4475 /* 4476 * We want this to be reflected on every label, 4477 * so mark them all dirty. spa_unload() will do the 4478 * final sync that pushes these changes out. 4479 */ 4480 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4481 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4482 spa->spa_state = new_state; 4483 spa->spa_final_txg = spa_last_synced_txg(spa) + 4484 TXG_DEFER_SIZE + 1; 4485 vdev_config_dirty(spa->spa_root_vdev); 4486 spa_config_exit(spa, SCL_ALL, FTAG); 4487 } 4488 } 4489 4490 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 4491 4492 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4493 spa_unload(spa); 4494 spa_deactivate(spa); 4495 } 4496 4497 if (oldconfig && spa->spa_config) 4498 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4499 4500 if (new_state != POOL_STATE_UNINITIALIZED) { 4501 if (!hardforce) 4502 spa_write_cachefile(spa, B_TRUE, B_TRUE); 4503 spa_remove(spa); 4504 } 4505 mutex_exit(&spa_namespace_lock); 4506 4507 return (0); 4508 } 4509 4510 /* 4511 * Destroy a storage pool. 4512 */ 4513 int 4514 spa_destroy(char *pool) 4515 { 4516 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4517 B_FALSE, B_FALSE)); 4518 } 4519 4520 /* 4521 * Export a storage pool. 
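 *
 * A sketch of a typical call from a management path (the pool name and
 * flags are illustrative; 'oldconfig', if non-NULL, receives a copy of the
 * pool's final configuration):
 *
 *	nvlist_t *oldconfig = NULL;
 *	error = spa_export(poolname, &oldconfig, B_FALSE, B_FALSE);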
4522 */ 4523 int 4524 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4525 boolean_t hardforce) 4526 { 4527 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4528 force, hardforce)); 4529 } 4530 4531 /* 4532 * Similar to spa_export(), this unloads the spa_t without actually removing it 4533 * from the namespace in any way. 4534 */ 4535 int 4536 spa_reset(char *pool) 4537 { 4538 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4539 B_FALSE, B_FALSE)); 4540 } 4541 4542 /* 4543 * ========================================================================== 4544 * Device manipulation 4545 * ========================================================================== 4546 */ 4547 4548 /* 4549 * Add a device to a storage pool. 4550 */ 4551 int 4552 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4553 { 4554 uint64_t txg, id; 4555 int error; 4556 vdev_t *rvd = spa->spa_root_vdev; 4557 vdev_t *vd, *tvd; 4558 nvlist_t **spares, **l2cache; 4559 uint_t nspares, nl2cache; 4560 4561 ASSERT(spa_writeable(spa)); 4562 4563 txg = spa_vdev_enter(spa); 4564 4565 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4566 VDEV_ALLOC_ADD)) != 0) 4567 return (spa_vdev_exit(spa, NULL, txg, error)); 4568 4569 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4570 4571 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4572 &nspares) != 0) 4573 nspares = 0; 4574 4575 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4576 &nl2cache) != 0) 4577 nl2cache = 0; 4578 4579 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4580 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4581 4582 if (vd->vdev_children != 0 && 4583 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4584 return (spa_vdev_exit(spa, vd, txg, error)); 4585 4586 /* 4587 * We must validate the spares and l2cache devices after checking the 4588 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4589 */ 4590 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4591 return (spa_vdev_exit(spa, vd, txg, error)); 4592 4593 /* 4594 * If we are in the middle of a device removal, we can only add 4595 * devices which match the existing devices in the pool. 4596 * If we are in the middle of a removal, or have some indirect 4597 * vdevs, we can not add raidz toplevels. 4598 */ 4599 if (spa->spa_vdev_removal != NULL || 4600 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 4601 for (int c = 0; c < vd->vdev_children; c++) { 4602 tvd = vd->vdev_child[c]; 4603 if (spa->spa_vdev_removal != NULL && 4604 tvd->vdev_ashift != 4605 spa->spa_vdev_removal->svr_vdev->vdev_ashift) { 4606 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4607 } 4608 /* Fail if top level vdev is raidz */ 4609 if (tvd->vdev_ops == &vdev_raidz_ops) { 4610 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4611 } 4612 /* 4613 * Need the top level mirror to be 4614 * a mirror of leaf vdevs only 4615 */ 4616 if (tvd->vdev_ops == &vdev_mirror_ops) { 4617 for (uint64_t cid = 0; 4618 cid < tvd->vdev_children; cid++) { 4619 vdev_t *cvd = tvd->vdev_child[cid]; 4620 if (!cvd->vdev_ops->vdev_op_leaf) { 4621 return (spa_vdev_exit(spa, vd, 4622 txg, EINVAL)); 4623 } 4624 } 4625 } 4626 } 4627 } 4628 4629 for (int c = 0; c < vd->vdev_children; c++) { 4630 4631 /* 4632 * Set the vdev id to the first hole, if one exists. 
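 * For example, if the root vdev's children are (c0, <hole>, c2), the hole
 * at index 1 is freed and the new top-level vdev is grafted in with
 * vdev_id 1; if no holes exist, 'id' simply ends up equal to
 * rvd->vdev_children, i.e. the next free slot at the end.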
4633 */ 4634 for (id = 0; id < rvd->vdev_children; id++) { 4635 if (rvd->vdev_child[id]->vdev_ishole) { 4636 vdev_free(rvd->vdev_child[id]); 4637 break; 4638 } 4639 } 4640 tvd = vd->vdev_child[c]; 4641 vdev_remove_child(vd, tvd); 4642 tvd->vdev_id = id; 4643 vdev_add_child(rvd, tvd); 4644 vdev_config_dirty(tvd); 4645 } 4646 4647 if (nspares != 0) { 4648 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4649 ZPOOL_CONFIG_SPARES); 4650 spa_load_spares(spa); 4651 spa->spa_spares.sav_sync = B_TRUE; 4652 } 4653 4654 if (nl2cache != 0) { 4655 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4656 ZPOOL_CONFIG_L2CACHE); 4657 spa_load_l2cache(spa); 4658 spa->spa_l2cache.sav_sync = B_TRUE; 4659 } 4660 4661 /* 4662 * We have to be careful when adding new vdevs to an existing pool. 4663 * If other threads start allocating from these vdevs before we 4664 * sync the config cache, and we lose power, then upon reboot we may 4665 * fail to open the pool because there are DVAs that the config cache 4666 * can't translate. Therefore, we first add the vdevs without 4667 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4668 * and then let spa_config_update() initialize the new metaslabs. 4669 * 4670 * spa_load() checks for added-but-not-initialized vdevs, so that 4671 * if we lose power at any point in this sequence, the remaining 4672 * steps will be completed the next time we load the pool. 4673 */ 4674 (void) spa_vdev_exit(spa, vd, txg, 0); 4675 4676 mutex_enter(&spa_namespace_lock); 4677 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4678 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 4679 mutex_exit(&spa_namespace_lock); 4680 4681 return (0); 4682 } 4683 4684 /* 4685 * Attach a device to a mirror. The arguments are the path to any device 4686 * in the mirror, and the nvroot for the new device. If the path specifies 4687 * a device that is not mirrored, we automatically insert the mirror vdev. 4688 * 4689 * If 'replacing' is specified, the new device is intended to replace the 4690 * existing device; in this case the two devices are made into their own 4691 * mirror using the 'replacing' vdev, which is functionally identical to 4692 * the mirror vdev (it actually reuses all the same ops) but has a few 4693 * extra rules: you can't attach to it after it's been created, and upon 4694 * completion of resilvering, the first disk (the one being replaced) 4695 * is automatically detached. 
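 *
 * Both operations come through this one entry point and differ only in the
 * 'replacing' argument. An illustrative sketch (nvroot is a caller-built
 * config describing the single new disk, and 'guid' names the existing
 * device to attach to or replace):
 *
 *	error = spa_vdev_attach(spa, guid, nvroot, 0);	(attach: mirror)
 *	error = spa_vdev_attach(spa, guid, nvroot, 1);	(replace)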
4696 */ 4697 int 4698 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4699 { 4700 uint64_t txg, dtl_max_txg; 4701 vdev_t *rvd = spa->spa_root_vdev; 4702 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4703 vdev_ops_t *pvops; 4704 char *oldvdpath, *newvdpath; 4705 int newvd_isspare; 4706 int error; 4707 4708 ASSERT(spa_writeable(spa)); 4709 4710 txg = spa_vdev_enter(spa); 4711 4712 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4713 4714 if (spa->spa_vdev_removal != NULL || 4715 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 4716 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4717 } 4718 4719 if (oldvd == NULL) 4720 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4721 4722 if (!oldvd->vdev_ops->vdev_op_leaf) 4723 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4724 4725 pvd = oldvd->vdev_parent; 4726 4727 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4728 VDEV_ALLOC_ATTACH)) != 0) 4729 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4730 4731 if (newrootvd->vdev_children != 1) 4732 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4733 4734 newvd = newrootvd->vdev_child[0]; 4735 4736 if (!newvd->vdev_ops->vdev_op_leaf) 4737 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4738 4739 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4740 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4741 4742 /* 4743 * Spares can't replace logs 4744 */ 4745 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4746 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4747 4748 if (!replacing) { 4749 /* 4750 * For attach, the only allowable parent is a mirror or the root 4751 * vdev. 4752 */ 4753 if (pvd->vdev_ops != &vdev_mirror_ops && 4754 pvd->vdev_ops != &vdev_root_ops) 4755 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4756 4757 pvops = &vdev_mirror_ops; 4758 } else { 4759 /* 4760 * Active hot spares can only be replaced by inactive hot 4761 * spares. 4762 */ 4763 if (pvd->vdev_ops == &vdev_spare_ops && 4764 oldvd->vdev_isspare && 4765 !spa_has_spare(spa, newvd->vdev_guid)) 4766 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4767 4768 /* 4769 * If the source is a hot spare, and the parent isn't already a 4770 * spare, then we want to create a new hot spare. Otherwise, we 4771 * want to create a replacing vdev. The user is not allowed to 4772 * attach to a spared vdev child unless the 'isspare' state is 4773 * the same (spare replaces spare, non-spare replaces 4774 * non-spare). 4775 */ 4776 if (pvd->vdev_ops == &vdev_replacing_ops && 4777 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4778 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4779 } else if (pvd->vdev_ops == &vdev_spare_ops && 4780 newvd->vdev_isspare != oldvd->vdev_isspare) { 4781 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4782 } 4783 4784 if (newvd->vdev_isspare) 4785 pvops = &vdev_spare_ops; 4786 else 4787 pvops = &vdev_replacing_ops; 4788 } 4789 4790 /* 4791 * Make sure the new device is big enough. 4792 */ 4793 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4794 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4795 4796 /* 4797 * The new device cannot have a higher alignment requirement 4798 * than the top-level vdev. 4799 */ 4800 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4801 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4802 4803 /* 4804 * If this is an in-place replacement, update oldvd's path and devid 4805 * to make it distinguishable from newvd, and unopenable from now on. 
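 * For example (with an illustrative path), an in-place replacement of
 * /dev/dsk/c1t0d0s0 leaves the old vdev's path as "/dev/dsk/c1t0d0s0/old"
 * (see the sprintf() below), a name that no longer resolves to a device
 * and so can never be opened in place of the new vdev.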
4806 */ 4807 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4808 spa_strfree(oldvd->vdev_path); 4809 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4810 KM_SLEEP); 4811 (void) sprintf(oldvd->vdev_path, "%s/%s", 4812 newvd->vdev_path, "old"); 4813 if (oldvd->vdev_devid != NULL) { 4814 spa_strfree(oldvd->vdev_devid); 4815 oldvd->vdev_devid = NULL; 4816 } 4817 } 4818 4819 /* mark the device being resilvered */ 4820 newvd->vdev_resilver_txg = txg; 4821 4822 /* 4823 * If the parent is not a mirror, or if we're replacing, insert the new 4824 * mirror/replacing/spare vdev above oldvd. 4825 */ 4826 if (pvd->vdev_ops != pvops) 4827 pvd = vdev_add_parent(oldvd, pvops); 4828 4829 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4830 ASSERT(pvd->vdev_ops == pvops); 4831 ASSERT(oldvd->vdev_parent == pvd); 4832 4833 /* 4834 * Extract the new device from its root and add it to pvd. 4835 */ 4836 vdev_remove_child(newrootvd, newvd); 4837 newvd->vdev_id = pvd->vdev_children; 4838 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4839 vdev_add_child(pvd, newvd); 4840 4841 tvd = newvd->vdev_top; 4842 ASSERT(pvd->vdev_top == tvd); 4843 ASSERT(tvd->vdev_parent == rvd); 4844 4845 vdev_config_dirty(tvd); 4846 4847 /* 4848 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4849 * for any dmu_sync-ed blocks. It will propagate upward when 4850 * spa_vdev_exit() calls vdev_dtl_reassess(). 4851 */ 4852 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4853 4854 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4855 dtl_max_txg - TXG_INITIAL); 4856 4857 if (newvd->vdev_isspare) { 4858 spa_spare_activate(newvd); 4859 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 4860 } 4861 4862 oldvdpath = spa_strdup(oldvd->vdev_path); 4863 newvdpath = spa_strdup(newvd->vdev_path); 4864 newvd_isspare = newvd->vdev_isspare; 4865 4866 /* 4867 * Mark newvd's DTL dirty in this txg. 4868 */ 4869 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4870 4871 /* 4872 * Schedule the resilver to restart in the future. We do this to 4873 * ensure that dmu_sync-ed blocks have been stitched into the 4874 * respective datasets. 4875 */ 4876 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4877 4878 if (spa->spa_bootfs) 4879 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4880 4881 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 4882 4883 /* 4884 * Commit the config 4885 */ 4886 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4887 4888 spa_history_log_internal(spa, "vdev attach", NULL, 4889 "%s vdev=%s %s vdev=%s", 4890 replacing && newvd_isspare ? "spare in" : 4891 replacing ? "replace" : "attach", newvdpath, 4892 replacing ? "for" : "to", oldvdpath); 4893 4894 spa_strfree(oldvdpath); 4895 spa_strfree(newvdpath); 4896 4897 return (0); 4898 } 4899 4900 /* 4901 * Detach a device from a mirror or replacing vdev. 4902 * 4903 * If 'replace_done' is specified, only detach if the parent 4904 * is a replacing vdev. 
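 *
 * An illustrative call (the guids come from the caller; passing 0 for
 * 'pguid' skips the parent-identity check performed below):
 *
 *	error = spa_vdev_detach(spa, child_guid, parent_guid, B_FALSE);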
4905 */ 4906 int 4907 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4908 { 4909 uint64_t txg; 4910 int error; 4911 vdev_t *rvd = spa->spa_root_vdev; 4912 vdev_t *vd, *pvd, *cvd, *tvd; 4913 boolean_t unspare = B_FALSE; 4914 uint64_t unspare_guid = 0; 4915 char *vdpath; 4916 4917 ASSERT(spa_writeable(spa)); 4918 4919 txg = spa_vdev_enter(spa); 4920 4921 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4922 4923 if (vd == NULL) 4924 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4925 4926 if (!vd->vdev_ops->vdev_op_leaf) 4927 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4928 4929 pvd = vd->vdev_parent; 4930 4931 /* 4932 * If the parent/child relationship is not as expected, don't do it. 4933 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4934 * vdev that's replacing B with C. The user's intent in replacing 4935 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4936 * the replace by detaching C, the expected behavior is to end up 4937 * M(A,B). But suppose that right after deciding to detach C, 4938 * the replacement of B completes. We would have M(A,C), and then 4939 * ask to detach C, which would leave us with just A -- not what 4940 * the user wanted. To prevent this, we make sure that the 4941 * parent/child relationship hasn't changed -- in this example, 4942 * that C's parent is still the replacing vdev R. 4943 */ 4944 if (pvd->vdev_guid != pguid && pguid != 0) 4945 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4946 4947 /* 4948 * Only 'replacing' or 'spare' vdevs can be replaced. 4949 */ 4950 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4951 pvd->vdev_ops != &vdev_spare_ops) 4952 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4953 4954 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4955 spa_version(spa) >= SPA_VERSION_SPARES); 4956 4957 /* 4958 * Only mirror, replacing, and spare vdevs support detach. 4959 */ 4960 if (pvd->vdev_ops != &vdev_replacing_ops && 4961 pvd->vdev_ops != &vdev_mirror_ops && 4962 pvd->vdev_ops != &vdev_spare_ops) 4963 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4964 4965 /* 4966 * If this device has the only valid copy of some data, 4967 * we cannot safely detach it. 4968 */ 4969 if (vdev_dtl_required(vd)) 4970 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4971 4972 ASSERT(pvd->vdev_children >= 2); 4973 4974 /* 4975 * If we are detaching the second disk from a replacing vdev, then 4976 * check to see if we changed the original vdev's path to have "/old" 4977 * at the end in spa_vdev_attach(). If so, undo that change now. 4978 */ 4979 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4980 vd->vdev_path != NULL) { 4981 size_t len = strlen(vd->vdev_path); 4982 4983 for (int c = 0; c < pvd->vdev_children; c++) { 4984 cvd = pvd->vdev_child[c]; 4985 4986 if (cvd == vd || cvd->vdev_path == NULL) 4987 continue; 4988 4989 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4990 strcmp(cvd->vdev_path + len, "/old") == 0) { 4991 spa_strfree(cvd->vdev_path); 4992 cvd->vdev_path = spa_strdup(vd->vdev_path); 4993 break; 4994 } 4995 } 4996 } 4997 4998 /* 4999 * If we are detaching the original disk from a spare, then it implies 5000 * that the spare should become a real disk, and be removed from the 5001 * active spare list for the pool. 5002 */ 5003 if (pvd->vdev_ops == &vdev_spare_ops && 5004 vd->vdev_id == 0 && 5005 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5006 unspare = B_TRUE; 5007 5008 /* 5009 * Erase the disk labels so the disk can be used for other things. 
5010 * This must be done after all other error cases are handled, 5011 * but before we disembowel vd (so we can still do I/O to it). 5012 * But if we can't do it, don't treat the error as fatal -- 5013 * it may be that the unwritability of the disk is the reason 5014 * it's being detached! 5015 */ 5016 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5017 5018 /* 5019 * Remove vd from its parent and compact the parent's children. 5020 */ 5021 vdev_remove_child(pvd, vd); 5022 vdev_compact_children(pvd); 5023 5024 /* 5025 * Remember one of the remaining children so we can get tvd below. 5026 */ 5027 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5028 5029 /* 5030 * If we need to remove the remaining child from the list of hot spares, 5031 * do it now, marking the vdev as no longer a spare in the process. 5032 * We must do this before vdev_remove_parent(), because that can 5033 * change the GUID if it creates a new toplevel GUID. For a similar 5034 * reason, we must remove the spare now, in the same txg as the detach; 5035 * otherwise someone could attach a new sibling, change the GUID, and 5036 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5037 */ 5038 if (unspare) { 5039 ASSERT(cvd->vdev_isspare); 5040 spa_spare_remove(cvd); 5041 unspare_guid = cvd->vdev_guid; 5042 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5043 cvd->vdev_unspare = B_TRUE; 5044 } 5045 5046 /* 5047 * If the parent mirror/replacing vdev only has one child, 5048 * the parent is no longer needed. Remove it from the tree. 5049 */ 5050 if (pvd->vdev_children == 1) { 5051 if (pvd->vdev_ops == &vdev_spare_ops) 5052 cvd->vdev_unspare = B_FALSE; 5053 vdev_remove_parent(cvd); 5054 } 5055 5056 5057 /* 5058 * We don't set tvd until now because the parent we just removed 5059 * may have been the previous top-level vdev. 5060 */ 5061 tvd = cvd->vdev_top; 5062 ASSERT(tvd->vdev_parent == rvd); 5063 5064 /* 5065 * Reevaluate the parent vdev state. 5066 */ 5067 vdev_propagate_state(cvd); 5068 5069 /* 5070 * If the 'autoexpand' property is set on the pool then automatically 5071 * try to expand the size of the pool. For example if the device we 5072 * just detached was smaller than the others, it may be possible to 5073 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5074 * first so that we can obtain the updated sizes of the leaf vdevs. 5075 */ 5076 if (spa->spa_autoexpand) { 5077 vdev_reopen(tvd); 5078 vdev_expand(tvd, txg); 5079 } 5080 5081 vdev_config_dirty(tvd); 5082 5083 /* 5084 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5085 * vd->vdev_detached is set and free vd's DTL object in syncing context. 5086 * But first make sure we're not on any *other* txg's DTL list, to 5087 * prevent vd from being accessed after it's freed. 5088 */ 5089 vdpath = spa_strdup(vd->vdev_path); 5090 for (int t = 0; t < TXG_SIZE; t++) 5091 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5092 vd->vdev_detached = B_TRUE; 5093 vdev_dirty(tvd, VDD_DTL, vd, txg); 5094 5095 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 5096 5097 /* hang on to the spa before we release the lock */ 5098 spa_open_ref(spa, FTAG); 5099 5100 error = spa_vdev_exit(spa, vd, txg, 0); 5101 5102 spa_history_log_internal(spa, "detach", NULL, 5103 "vdev=%s", vdpath); 5104 spa_strfree(vdpath); 5105 5106 /* 5107 * If this was the removal of the original device in a hot spare vdev, 5108 * then we want to go through and remove the device from the hot spare 5109 * list of every other pool. 
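 * (A hot spare may be shared: the same device, and hence the same guid,
 * can sit on the spares list of several pools at once, which is why the
 * loop below walks the whole namespace with spa_next() and calls
 * spa_vdev_remove() on every other active pool.)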
5110 */ 5111 if (unspare) { 5112 spa_t *altspa = NULL; 5113 5114 mutex_enter(&spa_namespace_lock); 5115 while ((altspa = spa_next(altspa)) != NULL) { 5116 if (altspa->spa_state != POOL_STATE_ACTIVE || 5117 altspa == spa) 5118 continue; 5119 5120 spa_open_ref(altspa, FTAG); 5121 mutex_exit(&spa_namespace_lock); 5122 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5123 mutex_enter(&spa_namespace_lock); 5124 spa_close(altspa, FTAG); 5125 } 5126 mutex_exit(&spa_namespace_lock); 5127 5128 /* search the rest of the vdevs for spares to remove */ 5129 spa_vdev_resilver_done(spa); 5130 } 5131 5132 /* all done with the spa; OK to release */ 5133 mutex_enter(&spa_namespace_lock); 5134 spa_close(spa, FTAG); 5135 mutex_exit(&spa_namespace_lock); 5136 5137 return (error); 5138 } 5139 5140 /* 5141 * Split a set of devices from their mirrors, and create a new pool from them. 5142 */ 5143 int 5144 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5145 nvlist_t *props, boolean_t exp) 5146 { 5147 int error = 0; 5148 uint64_t txg, *glist; 5149 spa_t *newspa; 5150 uint_t c, children, lastlog; 5151 nvlist_t **child, *nvl, *tmp; 5152 dmu_tx_t *tx; 5153 char *altroot = NULL; 5154 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5155 boolean_t activate_slog; 5156 5157 ASSERT(spa_writeable(spa)); 5158 5159 txg = spa_vdev_enter(spa); 5160 5161 /* clear the log and flush everything up to now */ 5162 activate_slog = spa_passivate_log(spa); 5163 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5164 error = spa_reset_logs(spa); 5165 txg = spa_vdev_config_enter(spa); 5166 5167 if (activate_slog) 5168 spa_activate_log(spa); 5169 5170 if (error != 0) 5171 return (spa_vdev_exit(spa, NULL, txg, error)); 5172 5173 /* check new spa name before going any further */ 5174 if (spa_lookup(newname) != NULL) 5175 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5176 5177 /* 5178 * scan through all the children to ensure they're all mirrors 5179 */ 5180 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5181 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5182 &children) != 0) 5183 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5184 5185 /* first, check to ensure we've got the right child count */ 5186 rvd = spa->spa_root_vdev; 5187 lastlog = 0; 5188 for (c = 0; c < rvd->vdev_children; c++) { 5189 vdev_t *vd = rvd->vdev_child[c]; 5190 5191 /* don't count the holes & logs as children */ 5192 if (vd->vdev_islog || !vdev_is_concrete(vd)) { 5193 if (lastlog == 0) 5194 lastlog = c; 5195 continue; 5196 } 5197 5198 lastlog = 0; 5199 } 5200 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5201 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5202 5203 /* next, ensure no spare or cache devices are part of the split */ 5204 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5205 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5206 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5207 5208 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5209 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5210 5211 /* then, loop over each vdev and validate it */ 5212 for (c = 0; c < children; c++) { 5213 uint64_t is_hole = 0; 5214 5215 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5216 &is_hole); 5217 5218 if (is_hole != 0) { 5219 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5220 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5221 continue; 5222 } else { 5223 error = SET_ERROR(EINVAL); 5224 break; 5225 } 5226 } 5227 5228 /* which disk is going to be split? */ 5229 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5230 &glist[c]) != 0) { 5231 error = SET_ERROR(EINVAL); 5232 break; 5233 } 5234 5235 /* look it up in the spa */ 5236 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5237 if (vml[c] == NULL) { 5238 error = SET_ERROR(ENODEV); 5239 break; 5240 } 5241 5242 /* make sure there's nothing stopping the split */ 5243 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5244 vml[c]->vdev_islog || 5245 !vdev_is_concrete(vml[c]) || 5246 vml[c]->vdev_isspare || 5247 vml[c]->vdev_isl2cache || 5248 !vdev_writeable(vml[c]) || 5249 vml[c]->vdev_children != 0 || 5250 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5251 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5252 error = SET_ERROR(EINVAL); 5253 break; 5254 } 5255 5256 if (vdev_dtl_required(vml[c])) { 5257 error = SET_ERROR(EBUSY); 5258 break; 5259 } 5260 5261 /* we need certain info from the top level */ 5262 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5263 vml[c]->vdev_top->vdev_ms_array) == 0); 5264 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5265 vml[c]->vdev_top->vdev_ms_shift) == 0); 5266 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5267 vml[c]->vdev_top->vdev_asize) == 0); 5268 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5269 vml[c]->vdev_top->vdev_ashift) == 0); 5270 5271 /* transfer per-vdev ZAPs */ 5272 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 5273 VERIFY0(nvlist_add_uint64(child[c], 5274 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 5275 5276 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 5277 VERIFY0(nvlist_add_uint64(child[c], 5278 ZPOOL_CONFIG_VDEV_TOP_ZAP, 5279 vml[c]->vdev_parent->vdev_top_zap)); 5280 } 5281 5282 if (error != 0) { 5283 kmem_free(vml, children * sizeof (vdev_t *)); 5284 kmem_free(glist, children * sizeof (uint64_t)); 5285 return (spa_vdev_exit(spa, NULL, txg, error)); 5286 } 5287 5288 /* stop writers from using the disks */ 5289 for (c = 0; c < children; c++) { 5290 if (vml[c] != NULL) 5291 vml[c]->vdev_offline = B_TRUE; 5292 } 5293 vdev_reopen(spa->spa_root_vdev); 5294 5295 /* 5296 * Temporarily record the splitting vdevs in the spa config. This 5297 * will disappear once the config is regenerated. 
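 * The nvlist stashed under ZPOOL_CONFIG_SPLIT in spa_config looks roughly
 * like this, with one guid per child being split off:
 *
 *	ZPOOL_CONFIG_SPLIT
 *	    ZPOOL_CONFIG_SPLIT_LIST = [ glist[0], glist[1], ... ]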
5298 */ 5299 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5300 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5301 glist, children) == 0); 5302 kmem_free(glist, children * sizeof (uint64_t)); 5303 5304 mutex_enter(&spa->spa_props_lock); 5305 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5306 nvl) == 0); 5307 mutex_exit(&spa->spa_props_lock); 5308 spa->spa_config_splitting = nvl; 5309 vdev_config_dirty(spa->spa_root_vdev); 5310 5311 /* configure and create the new pool */ 5312 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5313 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5314 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5315 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5316 spa_version(spa)) == 0); 5317 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5318 spa->spa_config_txg) == 0); 5319 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5320 spa_generate_guid(NULL)) == 0); 5321 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 5322 (void) nvlist_lookup_string(props, 5323 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5324 5325 /* add the new pool to the namespace */ 5326 newspa = spa_add(newname, config, altroot); 5327 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 5328 newspa->spa_config_txg = spa->spa_config_txg; 5329 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5330 5331 /* release the spa config lock, retaining the namespace lock */ 5332 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5333 5334 if (zio_injection_enabled) 5335 zio_handle_panic_injection(spa, FTAG, 1); 5336 5337 spa_activate(newspa, spa_mode_global); 5338 spa_async_suspend(newspa); 5339 5340 /* create the new pool from the disks of the original pool */ 5341 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5342 if (error) 5343 goto out; 5344 5345 /* if that worked, generate a real config for the new pool */ 5346 if (newspa->spa_root_vdev != NULL) { 5347 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5348 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5349 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5350 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5351 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5352 B_TRUE)); 5353 } 5354 5355 /* set the props */ 5356 if (props != NULL) { 5357 spa_configfile_set(newspa, props, B_FALSE); 5358 error = spa_prop_set(newspa, props); 5359 if (error) 5360 goto out; 5361 } 5362 5363 /* flush everything */ 5364 txg = spa_vdev_config_enter(newspa); 5365 vdev_config_dirty(newspa->spa_root_vdev); 5366 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5367 5368 if (zio_injection_enabled) 5369 zio_handle_panic_injection(spa, FTAG, 2); 5370 5371 spa_async_resume(newspa); 5372 5373 /* finally, update the original pool's config */ 5374 txg = spa_vdev_config_enter(spa); 5375 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5376 error = dmu_tx_assign(tx, TXG_WAIT); 5377 if (error != 0) 5378 dmu_tx_abort(tx); 5379 for (c = 0; c < children; c++) { 5380 if (vml[c] != NULL) { 5381 vdev_split(vml[c]); 5382 if (error == 0) 5383 spa_history_log_internal(spa, "detach", tx, 5384 "vdev=%s", vml[c]->vdev_path); 5385 5386 vdev_free(vml[c]); 5387 } 5388 } 5389 spa->spa_avz_action = AVZ_ACTION_REBUILD; 5390 vdev_config_dirty(spa->spa_root_vdev); 5391 spa->spa_config_splitting = NULL; 5392 nvlist_free(nvl); 5393 if (error == 0) 5394 dmu_tx_commit(tx); 5395 (void) spa_vdev_exit(spa, NULL, txg, 0); 5396 5397 if (zio_injection_enabled) 
5398 zio_handle_panic_injection(spa, FTAG, 3); 5399 5400 /* split is complete; log a history record */ 5401 spa_history_log_internal(newspa, "split", NULL, 5402 "from pool %s", spa_name(spa)); 5403 5404 kmem_free(vml, children * sizeof (vdev_t *)); 5405 5406 /* if we're not going to mount the filesystems in userland, export */ 5407 if (exp) 5408 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5409 B_FALSE, B_FALSE); 5410 5411 return (error); 5412 5413 out: 5414 spa_unload(newspa); 5415 spa_deactivate(newspa); 5416 spa_remove(newspa); 5417 5418 txg = spa_vdev_config_enter(spa); 5419 5420 /* re-online all offlined disks */ 5421 for (c = 0; c < children; c++) { 5422 if (vml[c] != NULL) 5423 vml[c]->vdev_offline = B_FALSE; 5424 } 5425 vdev_reopen(spa->spa_root_vdev); 5426 5427 nvlist_free(spa->spa_config_splitting); 5428 spa->spa_config_splitting = NULL; 5429 (void) spa_vdev_exit(spa, NULL, txg, error); 5430 5431 kmem_free(vml, children * sizeof (vdev_t *)); 5432 return (error); 5433 } 5434 5435 /* 5436 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5437 * currently spared, so we can detach it. 5438 */ 5439 static vdev_t * 5440 spa_vdev_resilver_done_hunt(vdev_t *vd) 5441 { 5442 vdev_t *newvd, *oldvd; 5443 5444 for (int c = 0; c < vd->vdev_children; c++) { 5445 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5446 if (oldvd != NULL) 5447 return (oldvd); 5448 } 5449 5450 /* 5451 * Check for a completed replacement. We always consider the first 5452 * vdev in the list to be the oldest vdev, and the last one to be 5453 * the newest (see spa_vdev_attach() for how that works). In 5454 * the case where the newest vdev is faulted, we will not automatically 5455 * remove it after a resilver completes. This is OK as it will require 5456 * user intervention to determine which disk the admin wishes to keep. 5457 */ 5458 if (vd->vdev_ops == &vdev_replacing_ops) { 5459 ASSERT(vd->vdev_children > 1); 5460 5461 newvd = vd->vdev_child[vd->vdev_children - 1]; 5462 oldvd = vd->vdev_child[0]; 5463 5464 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5465 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5466 !vdev_dtl_required(oldvd)) 5467 return (oldvd); 5468 } 5469 5470 /* 5471 * Check for a completed resilver with the 'unspare' flag set. 5472 */ 5473 if (vd->vdev_ops == &vdev_spare_ops) { 5474 vdev_t *first = vd->vdev_child[0]; 5475 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5476 5477 if (last->vdev_unspare) { 5478 oldvd = first; 5479 newvd = last; 5480 } else if (first->vdev_unspare) { 5481 oldvd = last; 5482 newvd = first; 5483 } else { 5484 oldvd = NULL; 5485 } 5486 5487 if (oldvd != NULL && 5488 vdev_dtl_empty(newvd, DTL_MISSING) && 5489 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5490 !vdev_dtl_required(oldvd)) 5491 return (oldvd); 5492 5493 /* 5494 * If there are more than two spares attached to a disk, 5495 * and those spares are not required, then we want to 5496 * attempt to free them up now so that they can be used 5497 * by other pools. Once we're back down to a single 5498 * disk+spare, we stop removing them. 
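 * For example, with spare(disk0, spare1, spare2) and both spares fully
 * resilvered and not required, child[1] (spare1) is returned first and
 * detached, leaving spare(disk0, spare2); once only a single disk+spare
 * pair remains, the vdev_children > 2 check below no longer fires and
 * nothing further is removed.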
5499 */ 5500 if (vd->vdev_children > 2) { 5501 newvd = vd->vdev_child[1]; 5502 5503 if (newvd->vdev_isspare && last->vdev_isspare && 5504 vdev_dtl_empty(last, DTL_MISSING) && 5505 vdev_dtl_empty(last, DTL_OUTAGE) && 5506 !vdev_dtl_required(newvd)) 5507 return (newvd); 5508 } 5509 } 5510 5511 return (NULL); 5512 } 5513 5514 static void 5515 spa_vdev_resilver_done(spa_t *spa) 5516 { 5517 vdev_t *vd, *pvd, *ppvd; 5518 uint64_t guid, sguid, pguid, ppguid; 5519 5520 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5521 5522 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5523 pvd = vd->vdev_parent; 5524 ppvd = pvd->vdev_parent; 5525 guid = vd->vdev_guid; 5526 pguid = pvd->vdev_guid; 5527 ppguid = ppvd->vdev_guid; 5528 sguid = 0; 5529 /* 5530 * If we have just finished replacing a hot spared device, then 5531 * we need to detach the parent's first child (the original hot 5532 * spare) as well. 5533 */ 5534 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5535 ppvd->vdev_children == 2) { 5536 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5537 sguid = ppvd->vdev_child[1]->vdev_guid; 5538 } 5539 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5540 5541 spa_config_exit(spa, SCL_ALL, FTAG); 5542 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5543 return; 5544 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5545 return; 5546 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5547 } 5548 5549 spa_config_exit(spa, SCL_ALL, FTAG); 5550 } 5551 5552 /* 5553 * Update the stored path or FRU for this vdev. 5554 */ 5555 int 5556 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5557 boolean_t ispath) 5558 { 5559 vdev_t *vd; 5560 boolean_t sync = B_FALSE; 5561 5562 ASSERT(spa_writeable(spa)); 5563 5564 spa_vdev_state_enter(spa, SCL_ALL); 5565 5566 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5567 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5568 5569 if (!vd->vdev_ops->vdev_op_leaf) 5570 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5571 5572 if (ispath) { 5573 if (strcmp(value, vd->vdev_path) != 0) { 5574 spa_strfree(vd->vdev_path); 5575 vd->vdev_path = spa_strdup(value); 5576 sync = B_TRUE; 5577 } 5578 } else { 5579 if (vd->vdev_fru == NULL) { 5580 vd->vdev_fru = spa_strdup(value); 5581 sync = B_TRUE; 5582 } else if (strcmp(value, vd->vdev_fru) != 0) { 5583 spa_strfree(vd->vdev_fru); 5584 vd->vdev_fru = spa_strdup(value); 5585 sync = B_TRUE; 5586 } 5587 } 5588 5589 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5590 } 5591 5592 int 5593 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5594 { 5595 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5596 } 5597 5598 int 5599 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5600 { 5601 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5602 } 5603 5604 /* 5605 * ========================================================================== 5606 * SPA Scanning 5607 * ========================================================================== 5608 */ 5609 int 5610 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 5611 { 5612 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5613 5614 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5615 return (SET_ERROR(EBUSY)); 5616 5617 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 5618 } 5619 5620 int 5621 spa_scan_stop(spa_t *spa) 5622 { 5623 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5624 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5625 return (SET_ERROR(EBUSY)); 5626 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5627 } 5628 5629 int 5630 spa_scan(spa_t *spa, pool_scan_func_t func) 5631 { 5632 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5633 5634 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5635 return (SET_ERROR(ENOTSUP)); 5636 5637 /* 5638 * If a resilver was requested, but there is no DTL on a 5639 * writeable leaf device, we have nothing to do. 5640 */ 5641 if (func == POOL_SCAN_RESILVER && 5642 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5643 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5644 return (0); 5645 } 5646 5647 return (dsl_scan(spa->spa_dsl_pool, func)); 5648 } 5649 5650 /* 5651 * ========================================================================== 5652 * SPA async task processing 5653 * ========================================================================== 5654 */ 5655 5656 static void 5657 spa_async_remove(spa_t *spa, vdev_t *vd) 5658 { 5659 if (vd->vdev_remove_wanted) { 5660 vd->vdev_remove_wanted = B_FALSE; 5661 vd->vdev_delayed_close = B_FALSE; 5662 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5663 5664 /* 5665 * We want to clear the stats, but we don't want to do a full 5666 * vdev_clear() as that will cause us to throw away 5667 * degraded/faulted state as well as attempt to reopen the 5668 * device, all of which is a waste. 
5669 */ 5670 vd->vdev_stat.vs_read_errors = 0; 5671 vd->vdev_stat.vs_write_errors = 0; 5672 vd->vdev_stat.vs_checksum_errors = 0; 5673 5674 vdev_state_dirty(vd->vdev_top); 5675 } 5676 5677 for (int c = 0; c < vd->vdev_children; c++) 5678 spa_async_remove(spa, vd->vdev_child[c]); 5679 } 5680 5681 static void 5682 spa_async_probe(spa_t *spa, vdev_t *vd) 5683 { 5684 if (vd->vdev_probe_wanted) { 5685 vd->vdev_probe_wanted = B_FALSE; 5686 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5687 } 5688 5689 for (int c = 0; c < vd->vdev_children; c++) 5690 spa_async_probe(spa, vd->vdev_child[c]); 5691 } 5692 5693 static void 5694 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5695 { 5696 sysevent_id_t eid; 5697 nvlist_t *attr; 5698 char *physpath; 5699 5700 if (!spa->spa_autoexpand) 5701 return; 5702 5703 for (int c = 0; c < vd->vdev_children; c++) { 5704 vdev_t *cvd = vd->vdev_child[c]; 5705 spa_async_autoexpand(spa, cvd); 5706 } 5707 5708 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5709 return; 5710 5711 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5712 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5713 5714 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5715 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5716 5717 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5718 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 5719 5720 nvlist_free(attr); 5721 kmem_free(physpath, MAXPATHLEN); 5722 } 5723 5724 static void 5725 spa_async_thread(void *arg) 5726 { 5727 spa_t *spa = (spa_t *)arg; 5728 int tasks; 5729 5730 ASSERT(spa->spa_sync_on); 5731 5732 mutex_enter(&spa->spa_async_lock); 5733 tasks = spa->spa_async_tasks; 5734 spa->spa_async_tasks = 0; 5735 mutex_exit(&spa->spa_async_lock); 5736 5737 /* 5738 * See if the config needs to be updated. 5739 */ 5740 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5741 uint64_t old_space, new_space; 5742 5743 mutex_enter(&spa_namespace_lock); 5744 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5745 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5746 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5747 mutex_exit(&spa_namespace_lock); 5748 5749 /* 5750 * If the pool grew as a result of the config update, 5751 * then log an internal history event. 5752 */ 5753 if (new_space != old_space) { 5754 spa_history_log_internal(spa, "vdev online", NULL, 5755 "pool '%s' size: %llu(+%llu)", 5756 spa_name(spa), new_space, new_space - old_space); 5757 } 5758 } 5759 5760 /* 5761 * See if any devices need to be marked REMOVED. 5762 */ 5763 if (tasks & SPA_ASYNC_REMOVE) { 5764 spa_vdev_state_enter(spa, SCL_NONE); 5765 spa_async_remove(spa, spa->spa_root_vdev); 5766 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5767 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5768 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5769 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5770 (void) spa_vdev_state_exit(spa, NULL, 0); 5771 } 5772 5773 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5774 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5775 spa_async_autoexpand(spa, spa->spa_root_vdev); 5776 spa_config_exit(spa, SCL_CONFIG, FTAG); 5777 } 5778 5779 /* 5780 * See if any devices need to be probed. 5781 */ 5782 if (tasks & SPA_ASYNC_PROBE) { 5783 spa_vdev_state_enter(spa, SCL_NONE); 5784 spa_async_probe(spa, spa->spa_root_vdev); 5785 (void) spa_vdev_state_exit(spa, NULL, 0); 5786 } 5787 5788 /* 5789 * If any devices are done replacing, detach them. 
5790 */ 5791 if (tasks & SPA_ASYNC_RESILVER_DONE) 5792 spa_vdev_resilver_done(spa); 5793 5794 /* 5795 * Kick off a resilver. 5796 */ 5797 if (tasks & SPA_ASYNC_RESILVER) 5798 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5799 5800 /* 5801 * Let the world know that we're done. 5802 */ 5803 mutex_enter(&spa->spa_async_lock); 5804 spa->spa_async_thread = NULL; 5805 cv_broadcast(&spa->spa_async_cv); 5806 mutex_exit(&spa->spa_async_lock); 5807 thread_exit(); 5808 } 5809 5810 void 5811 spa_async_suspend(spa_t *spa) 5812 { 5813 mutex_enter(&spa->spa_async_lock); 5814 spa->spa_async_suspended++; 5815 while (spa->spa_async_thread != NULL || 5816 spa->spa_condense_thread != NULL) 5817 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5818 mutex_exit(&spa->spa_async_lock); 5819 5820 spa_vdev_remove_suspend(spa); 5821 } 5822 5823 void 5824 spa_async_resume(spa_t *spa) 5825 { 5826 mutex_enter(&spa->spa_async_lock); 5827 ASSERT(spa->spa_async_suspended != 0); 5828 spa->spa_async_suspended--; 5829 mutex_exit(&spa->spa_async_lock); 5830 spa_restart_removal(spa); 5831 } 5832 5833 static boolean_t 5834 spa_async_tasks_pending(spa_t *spa) 5835 { 5836 uint_t non_config_tasks; 5837 uint_t config_task; 5838 boolean_t config_task_suspended; 5839 5840 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 5841 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 5842 if (spa->spa_ccw_fail_time == 0) { 5843 config_task_suspended = B_FALSE; 5844 } else { 5845 config_task_suspended = 5846 (gethrtime() - spa->spa_ccw_fail_time) < 5847 (zfs_ccw_retry_interval * NANOSEC); 5848 } 5849 5850 return (non_config_tasks || (config_task && !config_task_suspended)); 5851 } 5852 5853 static void 5854 spa_async_dispatch(spa_t *spa) 5855 { 5856 mutex_enter(&spa->spa_async_lock); 5857 if (spa_async_tasks_pending(spa) && 5858 !spa->spa_async_suspended && 5859 spa->spa_async_thread == NULL && 5860 rootdir != NULL) 5861 spa->spa_async_thread = thread_create(NULL, 0, 5862 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5863 mutex_exit(&spa->spa_async_lock); 5864 } 5865 5866 void 5867 spa_async_request(spa_t *spa, int task) 5868 { 5869 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5870 mutex_enter(&spa->spa_async_lock); 5871 spa->spa_async_tasks |= task; 5872 mutex_exit(&spa->spa_async_lock); 5873 } 5874 5875 /* 5876 * ========================================================================== 5877 * SPA syncing routines 5878 * ========================================================================== 5879 */ 5880 5881 static int 5882 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5883 { 5884 bpobj_t *bpo = arg; 5885 bpobj_enqueue(bpo, bp, tx); 5886 return (0); 5887 } 5888 5889 static int 5890 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5891 { 5892 zio_t *zio = arg; 5893 5894 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5895 zio->io_flags)); 5896 return (0); 5897 } 5898 5899 /* 5900 * Note: this simple function is not inlined to make it easier to dtrace the 5901 * amount of time spent syncing frees. 5902 */ 5903 static void 5904 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 5905 { 5906 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5907 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 5908 VERIFY(zio_wait(zio) == 0); 5909 } 5910 5911 /* 5912 * Note: this simple function is not inlined to make it easier to dtrace the 5913 * amount of time spent syncing deferred frees. 
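 * (For example, keeping it out-of-line gives DTrace fbt entry/return
 * probes a stable target for timing how long each sync spends here;
 * the same applies to spa_sync_frees() above.)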
5914 */ 5915 static void 5916 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 5917 { 5918 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5919 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 5920 spa_free_sync_cb, zio, tx), ==, 0); 5921 VERIFY0(zio_wait(zio)); 5922 } 5923 5924 5925 static void 5926 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5927 { 5928 char *packed = NULL; 5929 size_t bufsize; 5930 size_t nvsize = 0; 5931 dmu_buf_t *db; 5932 5933 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5934 5935 /* 5936 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5937 * information. This avoids the dmu_buf_will_dirty() path and 5938 * saves us a pre-read to get data we don't actually care about. 5939 */ 5940 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5941 packed = kmem_alloc(bufsize, KM_SLEEP); 5942 5943 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5944 KM_SLEEP) == 0); 5945 bzero(packed + nvsize, bufsize - nvsize); 5946 5947 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5948 5949 kmem_free(packed, bufsize); 5950 5951 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5952 dmu_buf_will_dirty(db, tx); 5953 *(uint64_t *)db->db_data = nvsize; 5954 dmu_buf_rele(db, FTAG); 5955 } 5956 5957 static void 5958 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5959 const char *config, const char *entry) 5960 { 5961 nvlist_t *nvroot; 5962 nvlist_t **list; 5963 int i; 5964 5965 if (!sav->sav_sync) 5966 return; 5967 5968 /* 5969 * Update the MOS nvlist describing the list of available devices. 5970 * spa_validate_aux() will have already made sure this nvlist is 5971 * valid and the vdevs are labeled appropriately. 5972 */ 5973 if (sav->sav_object == 0) { 5974 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5975 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5976 sizeof (uint64_t), tx); 5977 VERIFY(zap_update(spa->spa_meta_objset, 5978 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5979 &sav->sav_object, tx) == 0); 5980 } 5981 5982 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5983 if (sav->sav_count == 0) { 5984 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5985 } else { 5986 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5987 for (i = 0; i < sav->sav_count; i++) 5988 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5989 B_FALSE, VDEV_CONFIG_L2CACHE); 5990 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5991 sav->sav_count) == 0); 5992 for (i = 0; i < sav->sav_count; i++) 5993 nvlist_free(list[i]); 5994 kmem_free(list, sav->sav_count * sizeof (void *)); 5995 } 5996 5997 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5998 nvlist_free(nvroot); 5999 6000 sav->sav_sync = B_FALSE; 6001 } 6002 6003 /* 6004 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 6005 * The all-vdev ZAP must be empty. 
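 * (spa_avz_build() below is a straightforward pre-order walk: every
 * vdev contributes its vdev_top_zap and/or vdev_leaf_zap object
 * number, when present, as an entry in the AVZ.)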
6006 */ 6007 static void 6008 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 6009 { 6010 spa_t *spa = vd->vdev_spa; 6011 if (vd->vdev_top_zap != 0) { 6012 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6013 vd->vdev_top_zap, tx)); 6014 } 6015 if (vd->vdev_leaf_zap != 0) { 6016 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6017 vd->vdev_leaf_zap, tx)); 6018 } 6019 for (uint64_t i = 0; i < vd->vdev_children; i++) { 6020 spa_avz_build(vd->vdev_child[i], avz, tx); 6021 } 6022 } 6023 6024 static void 6025 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6026 { 6027 nvlist_t *config; 6028 6029 /* 6030 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 6031 * its config may not be dirty but we still need to build per-vdev ZAPs. 6032 * Similarly, if the pool is being assembled (e.g. after a split), we 6033 * need to rebuild the AVZ although the config may not be dirty. 6034 */ 6035 if (list_is_empty(&spa->spa_config_dirty_list) && 6036 spa->spa_avz_action == AVZ_ACTION_NONE) 6037 return; 6038 6039 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6040 6041 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 6042 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 6043 spa->spa_all_vdev_zaps != 0); 6044 6045 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 6046 /* Make and build the new AVZ */ 6047 uint64_t new_avz = zap_create(spa->spa_meta_objset, 6048 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 6049 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 6050 6051 /* Diff old AVZ with new one */ 6052 zap_cursor_t zc; 6053 zap_attribute_t za; 6054 6055 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6056 spa->spa_all_vdev_zaps); 6057 zap_cursor_retrieve(&zc, &za) == 0; 6058 zap_cursor_advance(&zc)) { 6059 uint64_t vdzap = za.za_first_integer; 6060 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 6061 vdzap) == ENOENT) { 6062 /* 6063 * ZAP is listed in old AVZ but not in new one; 6064 * destroy it 6065 */ 6066 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 6067 tx)); 6068 } 6069 } 6070 6071 zap_cursor_fini(&zc); 6072 6073 /* Destroy the old AVZ */ 6074 VERIFY0(zap_destroy(spa->spa_meta_objset, 6075 spa->spa_all_vdev_zaps, tx)); 6076 6077 /* Replace the old AVZ in the dir obj with the new one */ 6078 VERIFY0(zap_update(spa->spa_meta_objset, 6079 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 6080 sizeof (new_avz), 1, &new_avz, tx)); 6081 6082 spa->spa_all_vdev_zaps = new_avz; 6083 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 6084 zap_cursor_t zc; 6085 zap_attribute_t za; 6086 6087 /* Walk through the AVZ and destroy all listed ZAPs */ 6088 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6089 spa->spa_all_vdev_zaps); 6090 zap_cursor_retrieve(&zc, &za) == 0; 6091 zap_cursor_advance(&zc)) { 6092 uint64_t zap = za.za_first_integer; 6093 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 6094 } 6095 6096 zap_cursor_fini(&zc); 6097 6098 /* Destroy and unlink the AVZ itself */ 6099 VERIFY0(zap_destroy(spa->spa_meta_objset, 6100 spa->spa_all_vdev_zaps, tx)); 6101 VERIFY0(zap_remove(spa->spa_meta_objset, 6102 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 6103 spa->spa_all_vdev_zaps = 0; 6104 } 6105 6106 if (spa->spa_all_vdev_zaps == 0) { 6107 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 6108 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 6109 DMU_POOL_VDEV_ZAP_MAP, tx); 6110 } 6111 spa->spa_avz_action = AVZ_ACTION_NONE; 6112 6113 /* Create ZAPs for vdevs that don't have them. 
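 * (Presumably vdev_construct_zaps() also records the newly created
 * ZAPs in the AVZ; spa_sync() later ASSERTs that zap_count() of the
 * AVZ matches vdev_count_verify_zaps().)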
*/ 6114 vdev_construct_zaps(spa->spa_root_vdev, tx); 6115 6116 config = spa_config_generate(spa, spa->spa_root_vdev, 6117 dmu_tx_get_txg(tx), B_FALSE); 6118 6119 /* 6120 * If we're upgrading the spa version then make sure that 6121 * the config object gets updated with the correct version. 6122 */ 6123 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6124 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6125 spa->spa_uberblock.ub_version); 6126 6127 spa_config_exit(spa, SCL_STATE, FTAG); 6128 6129 nvlist_free(spa->spa_config_syncing); 6130 spa->spa_config_syncing = config; 6131 6132 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6133 } 6134 6135 static void 6136 spa_sync_version(void *arg, dmu_tx_t *tx) 6137 { 6138 uint64_t *versionp = arg; 6139 uint64_t version = *versionp; 6140 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6141 6142 /* 6143 * Setting the version is special cased when first creating the pool. 6144 */ 6145 ASSERT(tx->tx_txg != TXG_INITIAL); 6146 6147 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6148 ASSERT(version >= spa_version(spa)); 6149 6150 spa->spa_uberblock.ub_version = version; 6151 vdev_config_dirty(spa->spa_root_vdev); 6152 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6153 } 6154 6155 /* 6156 * Set zpool properties. 6157 */ 6158 static void 6159 spa_sync_props(void *arg, dmu_tx_t *tx) 6160 { 6161 nvlist_t *nvp = arg; 6162 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6163 objset_t *mos = spa->spa_meta_objset; 6164 nvpair_t *elem = NULL; 6165 6166 mutex_enter(&spa->spa_props_lock); 6167 6168 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6169 uint64_t intval; 6170 char *strval, *fname; 6171 zpool_prop_t prop; 6172 const char *propname; 6173 zprop_type_t proptype; 6174 spa_feature_t fid; 6175 6176 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6177 case ZPROP_INVAL: 6178 /* 6179 * We checked this earlier in spa_prop_validate(). 6180 */ 6181 ASSERT(zpool_prop_feature(nvpair_name(elem))); 6182 6183 fname = strchr(nvpair_name(elem), '@') + 1; 6184 VERIFY0(zfeature_lookup_name(fname, &fid)); 6185 6186 spa_feature_enable(spa, fid, tx); 6187 spa_history_log_internal(spa, "set", tx, 6188 "%s=enabled", nvpair_name(elem)); 6189 break; 6190 6191 case ZPOOL_PROP_VERSION: 6192 intval = fnvpair_value_uint64(elem); 6193 /* 6194 * The version is synced separately before other 6195 * properties and should be correct by now. 6196 */ 6197 ASSERT3U(spa_version(spa), >=, intval); 6198 break; 6199 6200 case ZPOOL_PROP_ALTROOT: 6201 /* 6202 * 'altroot' is a non-persistent property. It should 6203 * have been set temporarily at creation or import time. 6204 */ 6205 ASSERT(spa->spa_root != NULL); 6206 break; 6207 6208 case ZPOOL_PROP_READONLY: 6209 case ZPOOL_PROP_CACHEFILE: 6210 /* 6211 * 'readonly' and 'cachefile' are also non-persistent 6212 * properties. 6213 */ 6214 break; 6215 case ZPOOL_PROP_COMMENT: 6216 strval = fnvpair_value_string(elem); 6217 if (spa->spa_comment != NULL) 6218 spa_strfree(spa->spa_comment); 6219 spa->spa_comment = spa_strdup(strval); 6220 /* 6221 * We need to dirty the configuration on all the vdevs 6222 * so that their labels get updated. It's unnecessary 6223 * to do this for pool creation since the vdev's 6224 * configuration has already been dirtied. 6225 */ 6226 if (tx->tx_txg != TXG_INITIAL) 6227 vdev_config_dirty(spa->spa_root_vdev); 6228 spa_history_log_internal(spa, "set", tx, 6229 "%s=%s", nvpair_name(elem), strval); 6230 break; 6231 default: 6232 /* 6233 * Set pool property values in the poolprops mos object.
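 * (Per the zap_update() calls below: string values are stored as
 * byte arrays of length strlen() + 1, numeric values as a single
 * 8-byte integer.)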
6234 */ 6235 if (spa->spa_pool_props_object == 0) { 6236 spa->spa_pool_props_object = 6237 zap_create_link(mos, DMU_OT_POOL_PROPS, 6238 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6239 tx); 6240 } 6241 6242 /* normalize the property name */ 6243 propname = zpool_prop_to_name(prop); 6244 proptype = zpool_prop_get_type(prop); 6245 6246 if (nvpair_type(elem) == DATA_TYPE_STRING) { 6247 ASSERT(proptype == PROP_TYPE_STRING); 6248 strval = fnvpair_value_string(elem); 6249 VERIFY0(zap_update(mos, 6250 spa->spa_pool_props_object, propname, 6251 1, strlen(strval) + 1, strval, tx)); 6252 spa_history_log_internal(spa, "set", tx, 6253 "%s=%s", nvpair_name(elem), strval); 6254 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6255 intval = fnvpair_value_uint64(elem); 6256 6257 if (proptype == PROP_TYPE_INDEX) { 6258 const char *unused; 6259 VERIFY0(zpool_prop_index_to_string( 6260 prop, intval, &unused)); 6261 } 6262 VERIFY0(zap_update(mos, 6263 spa->spa_pool_props_object, propname, 6264 8, 1, &intval, tx)); 6265 spa_history_log_internal(spa, "set", tx, 6266 "%s=%lld", nvpair_name(elem), intval); 6267 } else { 6268 ASSERT(0); /* not allowed */ 6269 } 6270 6271 switch (prop) { 6272 case ZPOOL_PROP_DELEGATION: 6273 spa->spa_delegation = intval; 6274 break; 6275 case ZPOOL_PROP_BOOTFS: 6276 spa->spa_bootfs = intval; 6277 break; 6278 case ZPOOL_PROP_FAILUREMODE: 6279 spa->spa_failmode = intval; 6280 break; 6281 case ZPOOL_PROP_AUTOEXPAND: 6282 spa->spa_autoexpand = intval; 6283 if (tx->tx_txg != TXG_INITIAL) 6284 spa_async_request(spa, 6285 SPA_ASYNC_AUTOEXPAND); 6286 break; 6287 case ZPOOL_PROP_DEDUPDITTO: 6288 spa->spa_dedup_ditto = intval; 6289 break; 6290 default: 6291 break; 6292 } 6293 } 6294 6295 } 6296 6297 mutex_exit(&spa->spa_props_lock); 6298 } 6299 6300 /* 6301 * Perform one-time upgrade on-disk changes. spa_version() does not 6302 * reflect the new version this txg, so there must be no changes this 6303 * txg to anything that the upgrade code depends on after it executes. 6304 * Therefore this must be called after dsl_pool_sync() does the sync 6305 * tasks. 
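 * (See spa_sync() below: spa_sync_upgrades() is invoked only on sync
 * pass 1, after dsl_pool_sync() has run for that pass.)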
6306 */ 6307 static void 6308 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6309 { 6310 dsl_pool_t *dp = spa->spa_dsl_pool; 6311 6312 ASSERT(spa->spa_sync_pass == 1); 6313 6314 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6315 6316 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6317 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6318 dsl_pool_create_origin(dp, tx); 6319 6320 /* Keeping the origin open increases spa_minref */ 6321 spa->spa_minref += 3; 6322 } 6323 6324 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6325 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6326 dsl_pool_upgrade_clones(dp, tx); 6327 } 6328 6329 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6330 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6331 dsl_pool_upgrade_dir_clones(dp, tx); 6332 6333 /* Keeping the freedir open increases spa_minref */ 6334 spa->spa_minref += 3; 6335 } 6336 6337 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6338 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6339 spa_feature_create_zap_objects(spa, tx); 6340 } 6341 6342 /* 6343 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 6344 * when the possibility of using lz4 compression for metadata was added. 6345 * Old pools that have this feature enabled must be upgraded to have 6346 * this feature active. 6347 */ 6348 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6349 boolean_t lz4_en = spa_feature_is_enabled(spa, 6350 SPA_FEATURE_LZ4_COMPRESS); 6351 boolean_t lz4_ac = spa_feature_is_active(spa, 6352 SPA_FEATURE_LZ4_COMPRESS); 6353 6354 if (lz4_en && !lz4_ac) 6355 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6356 } 6357 6358 /* 6359 * If we haven't written the salt, do so now. Note that the 6360 * feature may not be activated yet, but that's fine since 6361 * the presence of this ZAP entry is backwards compatible. 6362 */ 6363 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6364 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 6365 VERIFY0(zap_add(spa->spa_meta_objset, 6366 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 6367 sizeof (spa->spa_cksum_salt.zcs_bytes), 6368 spa->spa_cksum_salt.zcs_bytes, tx)); 6369 } 6370 6371 rrw_exit(&dp->dp_config_rwlock, FTAG); 6372 } 6373 6374 static void 6375 vdev_indirect_state_sync_verify(vdev_t *vd) 6376 { 6377 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6378 vdev_indirect_births_t *vib = vd->vdev_indirect_births; 6379 6380 if (vd->vdev_ops == &vdev_indirect_ops) { 6381 ASSERT(vim != NULL); 6382 ASSERT(vib != NULL); 6383 } 6384 6385 if (vdev_obsolete_sm_object(vd) != 0) { 6386 ASSERT(vd->vdev_obsolete_sm != NULL); 6387 ASSERT(vd->vdev_removing || 6388 vd->vdev_ops == &vdev_indirect_ops); 6389 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 6390 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 6391 6392 ASSERT3U(vdev_obsolete_sm_object(vd), ==, 6393 space_map_object(vd->vdev_obsolete_sm)); 6394 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 6395 space_map_allocated(vd->vdev_obsolete_sm)); 6396 } 6397 ASSERT(vd->vdev_obsolete_segments != NULL); 6398 6399 /* 6400 * Since frees / remaps to an indirect vdev can only 6401 * happen in syncing context, the obsolete segments 6402 * tree must be empty when we start syncing. 6403 */ 6404 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 6405 } 6406 6407 /* 6408 * Sync the specified transaction group. New blocks may be dirtied as 6409 * part of the process, so we iterate until it converges.
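 * In outline (an illustrative sketch of the convergence loop below,
 * not the exact code):
 *
 *	do {
 *		pass = ++spa->spa_sync_pass;
 *		sync the config object and aux (spare/l2cache) nvlists;
 *		sync the error log and the DSL pool;
 *		sync frees, or enqueue them to the deferred bpobj;
 *		sync DDTs, scan state, and any dirty vdevs;
 *	} while (dmu_objset_is_dirty(mos, txg));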
6410 */ 6411 void 6412 spa_sync(spa_t *spa, uint64_t txg) 6413 { 6414 dsl_pool_t *dp = spa->spa_dsl_pool; 6415 objset_t *mos = spa->spa_meta_objset; 6416 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6417 vdev_t *rvd = spa->spa_root_vdev; 6418 vdev_t *vd; 6419 dmu_tx_t *tx; 6420 int error; 6421 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 6422 zfs_vdev_queue_depth_pct / 100; 6423 6424 VERIFY(spa_writeable(spa)); 6425 6426 /* 6427 * Wait for i/os issued in open context that need to complete 6428 * before this txg syncs. 6429 */ 6430 VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); 6431 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); 6432 6433 /* 6434 * Lock out configuration changes. 6435 */ 6436 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6437 6438 spa->spa_syncing_txg = txg; 6439 spa->spa_sync_pass = 0; 6440 6441 mutex_enter(&spa->spa_alloc_lock); 6442 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 6443 mutex_exit(&spa->spa_alloc_lock); 6444 6445 /* 6446 * If there are any pending vdev state changes, convert them 6447 * into config changes that go out with this transaction group. 6448 */ 6449 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6450 while (list_head(&spa->spa_state_dirty_list) != NULL) { 6451 /* 6452 * We need the write lock here because, for aux vdevs, 6453 * calling vdev_config_dirty() modifies sav_config. 6454 * This is ugly and will become unnecessary when we 6455 * eliminate the aux vdev wart by integrating all vdevs 6456 * into the root vdev tree. 6457 */ 6458 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6459 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6460 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6461 vdev_state_clean(vd); 6462 vdev_config_dirty(vd); 6463 } 6464 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6465 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6466 } 6467 spa_config_exit(spa, SCL_STATE, FTAG); 6468 6469 tx = dmu_tx_create_assigned(dp, txg); 6470 6471 spa->spa_sync_starttime = gethrtime(); 6472 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6473 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6474 6475 /* 6476 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6477 * set spa_deflate if we have no raid-z vdevs. 6478 */ 6479 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6480 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6481 int i; 6482 6483 for (i = 0; i < rvd->vdev_children; i++) { 6484 vd = rvd->vdev_child[i]; 6485 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6486 break; 6487 } 6488 if (i == rvd->vdev_children) { 6489 spa->spa_deflate = TRUE; 6490 VERIFY(0 == zap_add(spa->spa_meta_objset, 6491 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6492 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6493 } 6494 } 6495 6496 /* 6497 * Set the top-level vdev's max queue depth. Evaluate each 6498 * top-level's async write queue depth in case it changed. 6499 * The max queue depth will not change in the middle of syncing 6500 * out this txg. 
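 * (Worked example, assuming the stock tunables of
 * zfs_vdev_async_write_max_active = 10 and zfs_vdev_queue_depth_pct
 * = 1000: each allocating top-level vdev gets 10 * 1000 / 100 = 100
 * allocation slots, and mc_alloc_max_slots is the sum of those
 * limits over all eligible top-level vdevs.)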
6501 */ 6502 uint64_t queue_depth_total = 0; 6503 for (int c = 0; c < rvd->vdev_children; c++) { 6504 vdev_t *tvd = rvd->vdev_child[c]; 6505 metaslab_group_t *mg = tvd->vdev_mg; 6506 6507 if (mg == NULL || mg->mg_class != spa_normal_class(spa) || 6508 !metaslab_group_initialized(mg)) 6509 continue; 6510 6511 /* 6512 * It is safe to do a lock-free check here because only async 6513 * allocations look at mg_max_alloc_queue_depth, and async 6514 * allocations all happen from spa_sync(). 6515 */ 6516 ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); 6517 mg->mg_max_alloc_queue_depth = max_queue_depth; 6518 queue_depth_total += mg->mg_max_alloc_queue_depth; 6519 } 6520 metaslab_class_t *mc = spa_normal_class(spa); 6521 ASSERT0(refcount_count(&mc->mc_alloc_slots)); 6522 mc->mc_alloc_max_slots = queue_depth_total; 6523 mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 6524 6525 ASSERT3U(mc->mc_alloc_max_slots, <=, 6526 max_queue_depth * rvd->vdev_children); 6527 6528 for (int c = 0; c < rvd->vdev_children; c++) { 6529 vdev_t *vd = rvd->vdev_child[c]; 6530 vdev_indirect_state_sync_verify(vd); 6531 6532 if (vdev_indirect_should_condense(vd)) { 6533 spa_condense_indirect_start_sync(vd, tx); 6534 break; 6535 } 6536 } 6537 6538 /* 6539 * Iterate to convergence. 6540 */ 6541 do { 6542 int pass = ++spa->spa_sync_pass; 6543 6544 spa_sync_config_object(spa, tx); 6545 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6546 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6547 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6548 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6549 spa_errlog_sync(spa, txg); 6550 dsl_pool_sync(dp, txg); 6551 6552 if (pass < zfs_sync_pass_deferred_free) { 6553 spa_sync_frees(spa, free_bpl, tx); 6554 } else { 6555 /* 6556 * We can not defer frees in pass 1, because 6557 * we sync the deferred frees later in pass 1. 6558 */ 6559 ASSERT3U(pass, >, 1); 6560 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6561 &spa->spa_deferred_bpobj, tx); 6562 } 6563 6564 ddt_sync(spa, txg); 6565 dsl_scan_sync(dp, tx); 6566 6567 if (spa->spa_vdev_removal != NULL) 6568 svr_sync(spa, tx); 6569 6570 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6571 != NULL) 6572 vdev_sync(vd, txg); 6573 6574 if (pass == 1) { 6575 spa_sync_upgrades(spa, tx); 6576 ASSERT3U(txg, >=, 6577 spa->spa_uberblock.ub_rootbp.blk_birth); 6578 /* 6579 * Note: We need to check if the MOS is dirty 6580 * because we could have marked the MOS dirty 6581 * without updating the uberblock (e.g. if we 6582 * have sync tasks but no dirty user data). We 6583 * need to check the uberblock's rootbp because 6584 * it is updated if we have synced out dirty 6585 * data (though in this case the MOS will most 6586 * likely also be dirty due to second order 6587 * effects, we don't want to rely on that here). 6588 */ 6589 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 6590 !dmu_objset_is_dirty(mos, txg)) { 6591 /* 6592 * Nothing changed on the first pass, 6593 * therefore this TXG is a no-op. Avoid 6594 * syncing deferred frees, so that we 6595 * can keep this TXG as a no-op. 
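 * (Breaking out here also skips spa_sync_deferred_frees() for this
 * txg; the deferred bpobj is simply left for a later txg that has
 * real work to do.)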
6596 */ 6597 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 6598 txg)); 6599 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6600 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 6601 break; 6602 } 6603 spa_sync_deferred_frees(spa, tx); 6604 } 6605 6606 } while (dmu_objset_is_dirty(mos, txg)); 6607 6608 if (!list_is_empty(&spa->spa_config_dirty_list)) { 6609 /* 6610 * Make sure that the number of ZAPs for all the vdevs matches 6611 * the number of ZAPs in the per-vdev ZAP list. This only gets 6612 * called if the config is dirty; otherwise there may be 6613 * outstanding AVZ operations that weren't completed in 6614 * spa_sync_config_object. 6615 */ 6616 uint64_t all_vdev_zap_entry_count; 6617 ASSERT0(zap_count(spa->spa_meta_objset, 6618 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 6619 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 6620 all_vdev_zap_entry_count); 6621 } 6622 6623 if (spa->spa_vdev_removal != NULL) { 6624 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 6625 } 6626 6627 /* 6628 * Rewrite the vdev configuration (which includes the uberblock) 6629 * to commit the transaction group. 6630 * 6631 * If there are no dirty vdevs, we sync the uberblock to a few 6632 * random top-level vdevs that are known to be visible in the 6633 * config cache (see spa_vdev_add() for a complete description). 6634 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6635 */ 6636 for (;;) { 6637 /* 6638 * We hold SCL_STATE to prevent vdev open/close/etc. 6639 * while we're attempting to write the vdev labels. 6640 */ 6641 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6642 6643 if (list_is_empty(&spa->spa_config_dirty_list)) { 6644 vdev_t *svd[SPA_DVAS_PER_BP]; 6645 int svdcount = 0; 6646 int children = rvd->vdev_children; 6647 int c0 = spa_get_random(children); 6648 6649 for (int c = 0; c < children; c++) { 6650 vd = rvd->vdev_child[(c0 + c) % children]; 6651 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 6652 !vdev_is_concrete(vd)) 6653 continue; 6654 svd[svdcount++] = vd; 6655 if (svdcount == SPA_DVAS_PER_BP) 6656 break; 6657 } 6658 error = vdev_config_sync(svd, svdcount, txg); 6659 } else { 6660 error = vdev_config_sync(rvd->vdev_child, 6661 rvd->vdev_children, txg); 6662 } 6663 6664 if (error == 0) 6665 spa->spa_last_synced_guid = rvd->vdev_guid; 6666 6667 spa_config_exit(spa, SCL_STATE, FTAG); 6668 6669 if (error == 0) 6670 break; 6671 zio_suspend(spa, NULL); 6672 zio_resume_wait(spa); 6673 } 6674 dmu_tx_commit(tx); 6675 6676 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6677 6678 /* 6679 * Clear the dirty config list. 6680 */ 6681 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6682 vdev_config_clean(vd); 6683 6684 /* 6685 * Now that the new config has synced transactionally, 6686 * let it become visible to the config cache. 6687 */ 6688 if (spa->spa_config_syncing != NULL) { 6689 spa_config_set(spa, spa->spa_config_syncing); 6690 spa->spa_config_txg = txg; 6691 spa->spa_config_syncing = NULL; 6692 } 6693 6694 dsl_pool_sync_done(dp, txg); 6695 6696 mutex_enter(&spa->spa_alloc_lock); 6697 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 6698 mutex_exit(&spa->spa_alloc_lock); 6699 6700 /* 6701 * Update usable space statistics. 6702 */ 6703 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6704 vdev_sync_done(vd, txg); 6705 6706 spa_update_dspace(spa); 6707 6708 /* 6709 * It had better be the case that we didn't dirty anything 6710 * since vdev_config_sync(). 
6711 */ 6712 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6713 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6714 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6715 6716 spa->spa_sync_pass = 0; 6717 6718 /* 6719 * Update the last synced uberblock here. We want to do this at 6720 * the end of spa_sync() so that consumers of spa_last_synced_txg() 6721 * will be guaranteed that all the processing associated with 6722 * that txg has been completed. 6723 */ 6724 spa->spa_ubsync = spa->spa_uberblock; 6725 spa_config_exit(spa, SCL_CONFIG, FTAG); 6726 6727 spa_handle_ignored_writes(spa); 6728 6729 /* 6730 * If any async tasks have been requested, kick them off. 6731 */ 6732 spa_async_dispatch(spa); 6733 } 6734 6735 /* 6736 * Sync all pools. We don't want to hold the namespace lock across these 6737 * operations, so we take a reference on the spa_t and drop the lock during the 6738 * sync. 6739 */ 6740 void 6741 spa_sync_allpools(void) 6742 { 6743 spa_t *spa = NULL; 6744 mutex_enter(&spa_namespace_lock); 6745 while ((spa = spa_next(spa)) != NULL) { 6746 if (spa_state(spa) != POOL_STATE_ACTIVE || 6747 !spa_writeable(spa) || spa_suspended(spa)) 6748 continue; 6749 spa_open_ref(spa, FTAG); 6750 mutex_exit(&spa_namespace_lock); 6751 txg_wait_synced(spa_get_dsl(spa), 0); 6752 mutex_enter(&spa_namespace_lock); 6753 spa_close(spa, FTAG); 6754 } 6755 mutex_exit(&spa_namespace_lock); 6756 } 6757 6758 /* 6759 * ========================================================================== 6760 * Miscellaneous routines 6761 * ========================================================================== 6762 */ 6763 6764 /* 6765 * Remove all pools in the system. 6766 */ 6767 void 6768 spa_evict_all(void) 6769 { 6770 spa_t *spa; 6771 6772 /* 6773 * Remove all cached state. All pools should be closed now, 6774 * so every spa in the AVL tree should be unreferenced. 6775 */ 6776 mutex_enter(&spa_namespace_lock); 6777 while ((spa = spa_next(NULL)) != NULL) { 6778 /* 6779 * Stop async tasks. The async thread may need to detach 6780 * a device that's been replaced, which requires grabbing 6781 * spa_namespace_lock, so we must drop it here. 6782 */ 6783 spa_open_ref(spa, FTAG); 6784 mutex_exit(&spa_namespace_lock); 6785 spa_async_suspend(spa); 6786 mutex_enter(&spa_namespace_lock); 6787 spa_close(spa, FTAG); 6788 6789 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6790 spa_unload(spa); 6791 spa_deactivate(spa); 6792 } 6793 spa_remove(spa); 6794 } 6795 mutex_exit(&spa_namespace_lock); 6796 } 6797 6798 vdev_t * 6799 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6800 { 6801 vdev_t *vd; 6802 int i; 6803 6804 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6805 return (vd); 6806 6807 if (aux) { 6808 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6809 vd = spa->spa_l2cache.sav_vdevs[i]; 6810 if (vd->vdev_guid == guid) 6811 return (vd); 6812 } 6813 6814 for (i = 0; i < spa->spa_spares.sav_count; i++) { 6815 vd = spa->spa_spares.sav_vdevs[i]; 6816 if (vd->vdev_guid == guid) 6817 return (vd); 6818 } 6819 } 6820 6821 return (NULL); 6822 } 6823 6824 void 6825 spa_upgrade(spa_t *spa, uint64_t version) 6826 { 6827 ASSERT(spa_writeable(spa)); 6828 6829 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6830 6831 /* 6832 * This should only be called for a non-faulted pool, and since a 6833 * future version would result in an unopenable pool, this shouldn't be 6834 * possible. 
6835 */ 6836 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6837 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6838 6839 spa->spa_uberblock.ub_version = version; 6840 vdev_config_dirty(spa->spa_root_vdev); 6841 6842 spa_config_exit(spa, SCL_ALL, FTAG); 6843 6844 txg_wait_synced(spa_get_dsl(spa), 0); 6845 } 6846 6847 boolean_t 6848 spa_has_spare(spa_t *spa, uint64_t guid) 6849 { 6850 int i; 6851 uint64_t spareguid; 6852 spa_aux_vdev_t *sav = &spa->spa_spares; 6853 6854 for (i = 0; i < sav->sav_count; i++) 6855 if (sav->sav_vdevs[i]->vdev_guid == guid) 6856 return (B_TRUE); 6857 6858 for (i = 0; i < sav->sav_npending; i++) { 6859 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6860 &spareguid) == 0 && spareguid == guid) 6861 return (B_TRUE); 6862 } 6863 6864 return (B_FALSE); 6865 } 6866 6867 /* 6868 * Check if a pool has an active shared spare device. 6869 * Note: reference count of an active spare is 2, as a spare and as a replace 6870 */ 6871 static boolean_t 6872 spa_has_active_shared_spare(spa_t *spa) 6873 { 6874 int i, refcnt; 6875 uint64_t pool; 6876 spa_aux_vdev_t *sav = &spa->spa_spares; 6877 6878 for (i = 0; i < sav->sav_count; i++) { 6879 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6880 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6881 refcnt > 2) 6882 return (B_TRUE); 6883 } 6884 6885 return (B_FALSE); 6886 } 6887 6888 sysevent_t * 6889 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 6890 { 6891 sysevent_t *ev = NULL; 6892 #ifdef _KERNEL 6893 sysevent_attr_list_t *attr = NULL; 6894 sysevent_value_t value; 6895 6896 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6897 SE_SLEEP); 6898 ASSERT(ev != NULL); 6899 6900 value.value_type = SE_DATA_TYPE_STRING; 6901 value.value.sv_string = spa_name(spa); 6902 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6903 goto done; 6904 6905 value.value_type = SE_DATA_TYPE_UINT64; 6906 value.value.sv_uint64 = spa_guid(spa); 6907 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6908 goto done; 6909 6910 if (vd) { 6911 value.value_type = SE_DATA_TYPE_UINT64; 6912 value.value.sv_uint64 = vd->vdev_guid; 6913 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6914 SE_SLEEP) != 0) 6915 goto done; 6916 6917 if (vd->vdev_path) { 6918 value.value_type = SE_DATA_TYPE_STRING; 6919 value.value.sv_string = vd->vdev_path; 6920 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6921 &value, SE_SLEEP) != 0) 6922 goto done; 6923 } 6924 } 6925 6926 if (hist_nvl != NULL) { 6927 fnvlist_merge((nvlist_t *)attr, hist_nvl); 6928 } 6929 6930 if (sysevent_attach_attributes(ev, attr) != 0) 6931 goto done; 6932 attr = NULL; 6933 6934 done: 6935 if (attr) 6936 sysevent_free_attr(attr); 6937 6938 #endif 6939 return (ev); 6940 } 6941 6942 void 6943 spa_event_post(sysevent_t *ev) 6944 { 6945 #ifdef _KERNEL 6946 sysevent_id_t eid; 6947 6948 (void) log_sysevent(ev, SE_SLEEP, &eid); 6949 sysevent_free(ev); 6950 #endif 6951 } 6952 6953 void 6954 spa_event_discard(sysevent_t *ev) 6955 { 6956 #ifdef _KERNEL 6957 sysevent_free(ev); 6958 #endif 6959 } 6960 6961 /* 6962 * Post a sysevent corresponding to the given event. The 'name' must be one of 6963 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6964 * filled in from the spa and (optionally) the vdev and history nvl. This 6965 * doesn't do anything in the userland libzpool, as we don't want consumers to 6966 * misinterpret ztest or zdb as real changes. 
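 * (Typical call, for illustration only:
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
 * where ESC_ZFS_CONFIG_SYNC is one of the ESC_ZFS_* event names from
 * sys/sysevent/eventdefs.h.)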
6967 */ 6968 void 6969 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 6970 { 6971 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 6972 } 6973