1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 */ 37 38 /* 39 * SPA: Storage Pool Allocator 40 * 41 * This file contains all the routines used when modifying on-disk SPA state. 42 * This includes opening, importing, destroying, exporting a pool, and syncing a 43 * pool. 44 */ 45 46 #include <sys/zfs_context.h> 47 #include <sys/fm/fs/zfs.h> 48 #include <sys/spa_impl.h> 49 #include <sys/zio.h> 50 #include <sys/zio_checksum.h> 51 #include <sys/dmu.h> 52 #include <sys/dmu_tx.h> 53 #include <sys/zap.h> 54 #include <sys/zil.h> 55 #include <sys/brt.h> 56 #include <sys/ddt.h> 57 #include <sys/vdev_impl.h> 58 #include <sys/vdev_removal.h> 59 #include <sys/vdev_indirect_mapping.h> 60 #include <sys/vdev_indirect_births.h> 61 #include <sys/vdev_initialize.h> 62 #include <sys/vdev_rebuild.h> 63 #include <sys/vdev_trim.h> 64 #include <sys/vdev_disk.h> 65 #include <sys/vdev_draid.h> 66 #include <sys/metaslab.h> 67 #include <sys/metaslab_impl.h> 68 #include <sys/mmp.h> 69 #include <sys/uberblock_impl.h> 70 #include <sys/txg.h> 71 #include <sys/avl.h> 72 #include <sys/bpobj.h> 73 #include <sys/dmu_traverse.h> 74 #include <sys/dmu_objset.h> 75 #include <sys/unique.h> 76 #include <sys/dsl_pool.h> 77 #include <sys/dsl_dataset.h> 78 #include <sys/dsl_dir.h> 79 #include <sys/dsl_prop.h> 80 #include <sys/dsl_synctask.h> 81 #include <sys/fs/zfs.h> 82 #include <sys/arc.h> 83 #include <sys/callb.h> 84 #include <sys/systeminfo.h> 85 #include <sys/zfs_ioctl.h> 86 #include <sys/dsl_scan.h> 87 #include <sys/zfeature.h> 88 #include <sys/dsl_destroy.h> 89 #include <sys/zvol.h> 90 91 #ifdef _KERNEL 92 #include <sys/fm/protocol.h> 93 #include <sys/fm/util.h> 94 #include <sys/callb.h> 95 #include <sys/zone.h> 96 #include <sys/vmsystm.h> 97 #endif /* _KERNEL */ 98 99 #include "zfs_prop.h" 100 #include "zfs_comutil.h" 101 102 /* 103 * The interval, in seconds, at which failed configuration cache file writes 104 * should be retried. 
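 * With the default of 300 seconds, a failed write is retried at most once
 * every five minutes; the async config-update logic compares the time of the
 * last failure against this interval before trying again.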
105 */ 106 int zfs_ccw_retry_interval = 300; 107 108 typedef enum zti_modes { 109 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 110 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 111 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ 112 ZTI_MODE_NULL, /* don't create a taskq */ 113 ZTI_NMODES 114 } zti_modes_t; 115 116 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 117 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 118 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 119 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 120 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 121 122 #define ZTI_N(n) ZTI_P(n, 1) 123 #define ZTI_ONE ZTI_N(1) 124 125 typedef struct zio_taskq_info { 126 zti_modes_t zti_mode; 127 uint_t zti_value; 128 uint_t zti_count; 129 } zio_taskq_info_t; 130 131 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 132 "iss", "iss_h", "int", "int_h" 133 }; 134 135 /* 136 * This table defines the taskq settings for each ZFS I/O type. When 137 * initializing a pool, we use this table to create an appropriately sized 138 * taskq. Some operations are low volume and therefore have a small, static 139 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 140 * macros. Other operations process a large amount of data; the ZTI_BATCH 141 * macro causes us to create a taskq oriented for throughput. Some operations 142 * are so high frequency and short-lived that the taskq itself can become a 143 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 144 * additional degree of parallelism specified by the number of threads per- 145 * taskq and the number of taskqs; when dispatching an event in this case, the 146 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, 147 * but with number of taskqs also scaling with number of CPUs. 148 * 149 * The different taskq priorities are to handle the different contexts (issue 150 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 151 * need to be handled with minimum delay. 152 */ 153 static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 154 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 155 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 156 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 157 { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 158 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 159 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 160 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 161 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 162 }; 163 164 static void spa_sync_version(void *arg, dmu_tx_t *tx); 165 static void spa_sync_props(void *arg, dmu_tx_t *tx); 166 static boolean_t spa_has_active_shared_spare(spa_t *spa); 167 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 168 const char **ereport); 169 static void spa_vdev_resilver_done(spa_t *spa); 170 171 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 172 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 173 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 174 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 175 176 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 177 178 /* 179 * Report any spa_load_verify errors found, but do not fail spa_load. 180 * This is used by zdb to analyze non-idle pools. 
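 * (A userland consumer such as zdb can simply set this global through
 * libzpool; with it set, verification errors are reported but the load
 * itself still succeeds.)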
181 */ 182 boolean_t spa_load_verify_dryrun = B_FALSE; 183 184 /* 185 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 186 * This is used by zdb for spacemaps verification. 187 */ 188 boolean_t spa_mode_readable_spacemaps = B_FALSE; 189 190 /* 191 * This (illegal) pool name is used when temporarily importing a spa_t in order 192 * to get the vdev stats associated with the imported devices. 193 */ 194 #define TRYIMPORT_NAME "$import" 195 196 /* 197 * For debugging purposes: print out vdev tree during pool import. 198 */ 199 static int spa_load_print_vdev_tree = B_FALSE; 200 201 /* 202 * A non-zero value for zfs_max_missing_tvds means that we allow importing 203 * pools with missing top-level vdevs. This is strictly intended for advanced 204 * pool recovery cases since missing data is almost inevitable. Pools with 205 * missing devices can only be imported read-only for safety reasons, and their 206 * fail-mode will be automatically set to "continue". 207 * 208 * With 1 missing vdev we should be able to import the pool and mount all 209 * datasets. User data that was not modified after the missing device has been 210 * added should be recoverable. This means that snapshots created prior to the 211 * addition of that device should be completely intact. 212 * 213 * With 2 missing vdevs, some datasets may fail to mount since there are 214 * dataset statistics that are stored as regular metadata. Some data might be 215 * recoverable if those vdevs were added recently. 216 * 217 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 218 * may be missing entirely. Chances of data recovery are very low. Note that 219 * there are also risks of performing an inadvertent rewind as we might be 220 * missing all the vdevs with the latest uberblocks. 221 */ 222 uint64_t zfs_max_missing_tvds = 0; 223 224 /* 225 * The parameters below are similar to zfs_max_missing_tvds but are only 226 * intended for a preliminary open of the pool with an untrusted config which 227 * might be incomplete or out-dated. 228 * 229 * We are more tolerant for pools opened from a cachefile since we could have 230 * an out-dated cachefile where a device removal was not registered. 231 * We could have set the limit arbitrarily high but in the case where devices 232 * are really missing we would want to return the proper error codes; we chose 233 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 234 * and we get a chance to retrieve the trusted config. 235 */ 236 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 237 238 /* 239 * In the case where config was assembled by scanning device paths (/dev/dsks 240 * by default) we are less tolerant since all the existing devices should have 241 * been detected and we want spa_load to return the right error codes. 242 */ 243 uint64_t zfs_max_missing_tvds_scan = 0; 244 245 /* 246 * Debugging aid that pauses spa_sync() towards the end. 247 */ 248 static const boolean_t zfs_pause_spa_sync = B_FALSE; 249 250 /* 251 * Variables to indicate the livelist condense zthr func should wait at certain 252 * points for the livelist to be removed - used to test condense/destroy races 253 */ 254 static int zfs_livelist_condense_zthr_pause = 0; 255 static int zfs_livelist_condense_sync_pause = 0; 256 257 /* 258 * Variables to track whether or not condense cancellation has been 259 * triggered in testing. 
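 * Like the pause knobs above, these exist purely for the test suite: they are
 * bumped when a cancellation is observed so a test can verify that the race
 * it set up was actually exercised.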
260 */ 261 static int zfs_livelist_condense_sync_cancel = 0; 262 static int zfs_livelist_condense_zthr_cancel = 0; 263 264 /* 265 * Variable to track whether or not extra ALLOC blkptrs were added to a 266 * livelist entry while it was being condensed (caused by the way we track 267 * remapped blkptrs in dbuf_remap_impl) 268 */ 269 static int zfs_livelist_condense_new_alloc = 0; 270 271 /* 272 * ========================================================================== 273 * SPA properties routines 274 * ========================================================================== 275 */ 276 277 /* 278 * Add a (source=src, propname=propval) list to an nvlist. 279 */ 280 static void 281 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 282 uint64_t intval, zprop_source_t src) 283 { 284 const char *propname = zpool_prop_to_name(prop); 285 nvlist_t *propval; 286 287 propval = fnvlist_alloc(); 288 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 289 290 if (strval != NULL) 291 fnvlist_add_string(propval, ZPROP_VALUE, strval); 292 else 293 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 294 295 fnvlist_add_nvlist(nvl, propname, propval); 296 nvlist_free(propval); 297 } 298 299 /* 300 * Add a user property (source=src, propname=propval) to an nvlist. 301 */ 302 static void 303 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 304 zprop_source_t src) 305 { 306 nvlist_t *propval; 307 308 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 309 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 310 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 311 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 312 nvlist_free(propval); 313 } 314 315 /* 316 * Get property values from the spa configuration. 317 */ 318 static void 319 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 320 { 321 vdev_t *rvd = spa->spa_root_vdev; 322 dsl_pool_t *pool = spa->spa_dsl_pool; 323 uint64_t size, alloc, cap, version; 324 const zprop_source_t src = ZPROP_SRC_NONE; 325 spa_config_dirent_t *dp; 326 metaslab_class_t *mc = spa_normal_class(spa); 327 328 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 329 330 if (rvd != NULL) { 331 alloc = metaslab_class_get_alloc(mc); 332 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 333 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 334 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 335 336 size = metaslab_class_get_space(mc); 337 size += metaslab_class_get_space(spa_special_class(spa)); 338 size += metaslab_class_get_space(spa_dedup_class(spa)); 339 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 340 341 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 342 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 343 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 344 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 345 size - alloc, src); 346 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 347 spa->spa_checkpoint_info.sci_dspace, src); 348 349 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 350 metaslab_class_fragmentation(mc), src); 351 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 352 metaslab_class_expandable_space(mc), src); 353 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 354 (spa_mode(spa) == SPA_MODE_READ), src); 355 356 cap = (size == 0) ? 
0 : (alloc * 100 / size); 357 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 358 359 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 360 ddt_get_pool_dedup_ratio(spa), src); 361 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, 362 brt_get_used(spa), src); 363 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, 364 brt_get_saved(spa), src); 365 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, 366 brt_get_ratio(spa), src); 367 368 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 369 rvd->vdev_state, src); 370 371 version = spa_version(spa); 372 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 373 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 374 version, ZPROP_SRC_DEFAULT); 375 } else { 376 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 377 version, ZPROP_SRC_LOCAL); 378 } 379 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 380 NULL, spa_load_guid(spa), src); 381 } 382 383 if (pool != NULL) { 384 /* 385 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 386 * when opening pools before this version freedir will be NULL. 387 */ 388 if (pool->dp_free_dir != NULL) { 389 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 390 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 391 src); 392 } else { 393 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 394 NULL, 0, src); 395 } 396 397 if (pool->dp_leak_dir != NULL) { 398 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 399 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 400 src); 401 } else { 402 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 403 NULL, 0, src); 404 } 405 } 406 407 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 408 409 if (spa->spa_comment != NULL) { 410 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 411 0, ZPROP_SRC_LOCAL); 412 } 413 414 if (spa->spa_compatibility != NULL) { 415 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 416 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 417 } 418 419 if (spa->spa_root != NULL) 420 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 421 0, ZPROP_SRC_LOCAL); 422 423 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 424 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 425 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 426 } else { 427 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 428 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 429 } 430 431 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 432 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 433 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 434 } else { 435 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 436 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 437 } 438 439 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 440 if (dp->scd_path == NULL) { 441 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 442 "none", 0, ZPROP_SRC_LOCAL); 443 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 444 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 445 dp->scd_path, 0, ZPROP_SRC_LOCAL); 446 } 447 } 448 } 449 450 /* 451 * Get zpool property values. 452 */ 453 int 454 spa_prop_get(spa_t *spa, nvlist_t **nvp) 455 { 456 objset_t *mos = spa->spa_meta_objset; 457 zap_cursor_t zc; 458 zap_attribute_t za; 459 dsl_pool_t *dp; 460 int err; 461 462 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 463 if (err) 464 return (err); 465 466 dp = spa_get_dsl(spa); 467 dsl_pool_config_enter(dp, FTAG); 468 mutex_enter(&spa->spa_props_lock); 469 470 /* 471 * Get properties from the spa config. 
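 * These are the values that can be derived from in-core state alone (size,
 * allocated space, capacity, health, and so on); properties persisted in the
 * MOS pool-props object are merged in by the loop further below.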
472 */ 473 spa_prop_get_config(spa, nvp); 474 475 /* If no pool property object, no more prop to get. */ 476 if (mos == NULL || spa->spa_pool_props_object == 0) 477 goto out; 478 479 /* 480 * Get properties from the MOS pool property object. 481 */ 482 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 483 (err = zap_cursor_retrieve(&zc, &za)) == 0; 484 zap_cursor_advance(&zc)) { 485 uint64_t intval = 0; 486 char *strval = NULL; 487 zprop_source_t src = ZPROP_SRC_DEFAULT; 488 zpool_prop_t prop; 489 490 if ((prop = zpool_name_to_prop(za.za_name)) == 491 ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) 492 continue; 493 494 switch (za.za_integer_length) { 495 case 8: 496 /* integer property */ 497 if (za.za_first_integer != 498 zpool_prop_default_numeric(prop)) 499 src = ZPROP_SRC_LOCAL; 500 501 if (prop == ZPOOL_PROP_BOOTFS) { 502 dsl_dataset_t *ds = NULL; 503 504 err = dsl_dataset_hold_obj(dp, 505 za.za_first_integer, FTAG, &ds); 506 if (err != 0) 507 break; 508 509 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 510 KM_SLEEP); 511 dsl_dataset_name(ds, strval); 512 dsl_dataset_rele(ds, FTAG); 513 } else { 514 strval = NULL; 515 intval = za.za_first_integer; 516 } 517 518 spa_prop_add_list(*nvp, prop, strval, intval, src); 519 520 if (strval != NULL) 521 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 522 523 break; 524 525 case 1: 526 /* string property */ 527 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 528 err = zap_lookup(mos, spa->spa_pool_props_object, 529 za.za_name, 1, za.za_num_integers, strval); 530 if (err) { 531 kmem_free(strval, za.za_num_integers); 532 break; 533 } 534 if (prop != ZPOOL_PROP_INVAL) { 535 spa_prop_add_list(*nvp, prop, strval, 0, src); 536 } else { 537 src = ZPROP_SRC_LOCAL; 538 spa_prop_add_user(*nvp, za.za_name, strval, 539 src); 540 } 541 kmem_free(strval, za.za_num_integers); 542 break; 543 544 default: 545 break; 546 } 547 } 548 zap_cursor_fini(&zc); 549 out: 550 mutex_exit(&spa->spa_props_lock); 551 dsl_pool_config_exit(dp, FTAG); 552 if (err && err != ENOENT) { 553 nvlist_free(*nvp); 554 *nvp = NULL; 555 return (err); 556 } 557 558 return (0); 559 } 560 561 /* 562 * Validate the given pool properties nvlist and modify the list 563 * for the property values to be set. 564 */ 565 static int 566 spa_prop_validate(spa_t *spa, nvlist_t *props) 567 { 568 nvpair_t *elem; 569 int error = 0, reset_bootfs = 0; 570 uint64_t objnum = 0; 571 boolean_t has_feature = B_FALSE; 572 573 elem = NULL; 574 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 575 uint64_t intval; 576 const char *strval, *slash, *check, *fname; 577 const char *propname = nvpair_name(elem); 578 zpool_prop_t prop = zpool_name_to_prop(propname); 579 580 switch (prop) { 581 case ZPOOL_PROP_INVAL: 582 /* 583 * Sanitize the input. 
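 * ZPOOL_PROP_INVAL covers two legitimate cases: user properties (names
 * containing a colon, e.g. "org.example:note") and feature properties
 * ("feature@<name>"), which must arrive as a uint64 with value 0 (the
 * encoding used for "enabled"). Anything else falls through to EINVAL.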
584 */ 585 if (zfs_prop_user(propname)) { 586 if (strlen(propname) >= ZAP_MAXNAMELEN) { 587 error = SET_ERROR(ENAMETOOLONG); 588 break; 589 } 590 591 if (strlen(fnvpair_value_string(elem)) >= 592 ZAP_MAXVALUELEN) { 593 error = SET_ERROR(E2BIG); 594 break; 595 } 596 } else if (zpool_prop_feature(propname)) { 597 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 598 error = SET_ERROR(EINVAL); 599 break; 600 } 601 602 if (nvpair_value_uint64(elem, &intval) != 0) { 603 error = SET_ERROR(EINVAL); 604 break; 605 } 606 607 if (intval != 0) { 608 error = SET_ERROR(EINVAL); 609 break; 610 } 611 612 fname = strchr(propname, '@') + 1; 613 if (zfeature_lookup_name(fname, NULL) != 0) { 614 error = SET_ERROR(EINVAL); 615 break; 616 } 617 618 has_feature = B_TRUE; 619 } else { 620 error = SET_ERROR(EINVAL); 621 break; 622 } 623 break; 624 625 case ZPOOL_PROP_VERSION: 626 error = nvpair_value_uint64(elem, &intval); 627 if (!error && 628 (intval < spa_version(spa) || 629 intval > SPA_VERSION_BEFORE_FEATURES || 630 has_feature)) 631 error = SET_ERROR(EINVAL); 632 break; 633 634 case ZPOOL_PROP_DELEGATION: 635 case ZPOOL_PROP_AUTOREPLACE: 636 case ZPOOL_PROP_LISTSNAPS: 637 case ZPOOL_PROP_AUTOEXPAND: 638 case ZPOOL_PROP_AUTOTRIM: 639 error = nvpair_value_uint64(elem, &intval); 640 if (!error && intval > 1) 641 error = SET_ERROR(EINVAL); 642 break; 643 644 case ZPOOL_PROP_MULTIHOST: 645 error = nvpair_value_uint64(elem, &intval); 646 if (!error && intval > 1) 647 error = SET_ERROR(EINVAL); 648 649 if (!error) { 650 uint32_t hostid = zone_get_hostid(NULL); 651 if (hostid) 652 spa->spa_hostid = hostid; 653 else 654 error = SET_ERROR(ENOTSUP); 655 } 656 657 break; 658 659 case ZPOOL_PROP_BOOTFS: 660 /* 661 * If the pool version is less than SPA_VERSION_BOOTFS, 662 * or the pool is still being created (version == 0), 663 * the bootfs property cannot be set. 664 */ 665 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 666 error = SET_ERROR(ENOTSUP); 667 break; 668 } 669 670 /* 671 * Make sure the vdev config is bootable 672 */ 673 if (!vdev_is_bootable(spa->spa_root_vdev)) { 674 error = SET_ERROR(ENOTSUP); 675 break; 676 } 677 678 reset_bootfs = 1; 679 680 error = nvpair_value_string(elem, &strval); 681 682 if (!error) { 683 objset_t *os; 684 685 if (strval == NULL || strval[0] == '\0') { 686 objnum = zpool_prop_default_numeric( 687 ZPOOL_PROP_BOOTFS); 688 break; 689 } 690 691 error = dmu_objset_hold(strval, FTAG, &os); 692 if (error != 0) 693 break; 694 695 /* Must be ZPL. */ 696 if (dmu_objset_type(os) != DMU_OST_ZFS) { 697 error = SET_ERROR(ENOTSUP); 698 } else { 699 objnum = dmu_objset_id(os); 700 } 701 dmu_objset_rele(os, FTAG); 702 } 703 break; 704 705 case ZPOOL_PROP_FAILUREMODE: 706 error = nvpair_value_uint64(elem, &intval); 707 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 708 error = SET_ERROR(EINVAL); 709 710 /* 711 * This is a special case which only occurs when 712 * the pool has completely failed. This allows 713 * the user to change the in-core failmode property 714 * without syncing it out to disk (I/Os might 715 * currently be blocked). We do this by returning 716 * EIO to the caller (spa_prop_set) to trick it 717 * into thinking we encountered a property validation 718 * error. 
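 * For example, "zpool set failmode=continue <pool>" issued against a
 * suspended pool lands here: the in-core failmode changes immediately, but
 * EIO is returned and nothing is persisted by this call.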
719 */ 720 if (!error && spa_suspended(spa)) { 721 spa->spa_failmode = intval; 722 error = SET_ERROR(EIO); 723 } 724 break; 725 726 case ZPOOL_PROP_CACHEFILE: 727 if ((error = nvpair_value_string(elem, &strval)) != 0) 728 break; 729 730 if (strval[0] == '\0') 731 break; 732 733 if (strcmp(strval, "none") == 0) 734 break; 735 736 if (strval[0] != '/') { 737 error = SET_ERROR(EINVAL); 738 break; 739 } 740 741 slash = strrchr(strval, '/'); 742 ASSERT(slash != NULL); 743 744 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 745 strcmp(slash, "/..") == 0) 746 error = SET_ERROR(EINVAL); 747 break; 748 749 case ZPOOL_PROP_COMMENT: 750 if ((error = nvpair_value_string(elem, &strval)) != 0) 751 break; 752 for (check = strval; *check != '\0'; check++) { 753 if (!isprint(*check)) { 754 error = SET_ERROR(EINVAL); 755 break; 756 } 757 } 758 if (strlen(strval) > ZPROP_MAX_COMMENT) 759 error = SET_ERROR(E2BIG); 760 break; 761 762 default: 763 break; 764 } 765 766 if (error) 767 break; 768 } 769 770 (void) nvlist_remove_all(props, 771 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 772 773 if (!error && reset_bootfs) { 774 error = nvlist_remove(props, 775 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 776 777 if (!error) { 778 error = nvlist_add_uint64(props, 779 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 780 } 781 } 782 783 return (error); 784 } 785 786 void 787 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 788 { 789 const char *cachefile; 790 spa_config_dirent_t *dp; 791 792 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 793 &cachefile) != 0) 794 return; 795 796 dp = kmem_alloc(sizeof (spa_config_dirent_t), 797 KM_SLEEP); 798 799 if (cachefile[0] == '\0') 800 dp->scd_path = spa_strdup(spa_config_path); 801 else if (strcmp(cachefile, "none") == 0) 802 dp->scd_path = NULL; 803 else 804 dp->scd_path = spa_strdup(cachefile); 805 806 list_insert_head(&spa->spa_config_list, dp); 807 if (need_sync) 808 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 809 } 810 811 int 812 spa_prop_set(spa_t *spa, nvlist_t *nvp) 813 { 814 int error; 815 nvpair_t *elem = NULL; 816 boolean_t need_sync = B_FALSE; 817 818 if ((error = spa_prop_validate(spa, nvp)) != 0) 819 return (error); 820 821 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 822 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 823 824 if (prop == ZPOOL_PROP_CACHEFILE || 825 prop == ZPOOL_PROP_ALTROOT || 826 prop == ZPOOL_PROP_READONLY) 827 continue; 828 829 if (prop == ZPOOL_PROP_INVAL && 830 zfs_prop_user(nvpair_name(elem))) { 831 need_sync = B_TRUE; 832 break; 833 } 834 835 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 836 uint64_t ver = 0; 837 838 if (prop == ZPOOL_PROP_VERSION) { 839 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 840 } else { 841 ASSERT(zpool_prop_feature(nvpair_name(elem))); 842 ver = SPA_VERSION_FEATURES; 843 need_sync = B_TRUE; 844 } 845 846 /* Save time if the version is already set. */ 847 if (ver == spa_version(spa)) 848 continue; 849 850 /* 851 * In addition to the pool directory object, we might 852 * create the pool properties object, the features for 853 * read object, the features for write object, or the 854 * feature descriptions object. 
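 * That is also why the sync task below budgets for six modified blocks (the
 * second-to-last argument to dsl_sync_task()).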
855 */ 856 error = dsl_sync_task(spa->spa_name, NULL, 857 spa_sync_version, &ver, 858 6, ZFS_SPACE_CHECK_RESERVED); 859 if (error) 860 return (error); 861 continue; 862 } 863 864 need_sync = B_TRUE; 865 break; 866 } 867 868 if (need_sync) { 869 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 870 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 871 } 872 873 return (0); 874 } 875 876 /* 877 * If the bootfs property value is dsobj, clear it. 878 */ 879 void 880 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 881 { 882 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 883 VERIFY(zap_remove(spa->spa_meta_objset, 884 spa->spa_pool_props_object, 885 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 886 spa->spa_bootfs = 0; 887 } 888 } 889 890 static int 891 spa_change_guid_check(void *arg, dmu_tx_t *tx) 892 { 893 uint64_t *newguid __maybe_unused = arg; 894 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 895 vdev_t *rvd = spa->spa_root_vdev; 896 uint64_t vdev_state; 897 898 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 899 int error = (spa_has_checkpoint(spa)) ? 900 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 901 return (SET_ERROR(error)); 902 } 903 904 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 905 vdev_state = rvd->vdev_state; 906 spa_config_exit(spa, SCL_STATE, FTAG); 907 908 if (vdev_state != VDEV_STATE_HEALTHY) 909 return (SET_ERROR(ENXIO)); 910 911 ASSERT3U(spa_guid(spa), !=, *newguid); 912 913 return (0); 914 } 915 916 static void 917 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 918 { 919 uint64_t *newguid = arg; 920 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 921 uint64_t oldguid; 922 vdev_t *rvd = spa->spa_root_vdev; 923 924 oldguid = spa_guid(spa); 925 926 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 927 rvd->vdev_guid = *newguid; 928 rvd->vdev_guid_sum += (*newguid - oldguid); 929 vdev_config_dirty(rvd); 930 spa_config_exit(spa, SCL_STATE, FTAG); 931 932 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 933 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 934 } 935 936 /* 937 * Change the GUID for the pool. This is done so that we can later 938 * re-import a pool built from a clone of our own vdevs. We will modify 939 * the root vdev's guid, our own pool guid, and then mark all of our 940 * vdevs dirty. Note that we must make sure that all our vdevs are 941 * online when we do this, or else any vdevs that weren't present 942 * would be orphaned from our pool. We are also going to issue a 943 * sysevent to update any watchers. 944 */ 945 int 946 spa_change_guid(spa_t *spa) 947 { 948 int error; 949 uint64_t guid; 950 951 mutex_enter(&spa->spa_vdev_top_lock); 952 mutex_enter(&spa_namespace_lock); 953 guid = spa_generate_guid(NULL); 954 955 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 956 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 957 958 if (error == 0) { 959 /* 960 * Clear the kobj flag from all the vdevs to allow 961 * vdev_cache_process_kobj_evt() to post events to all the 962 * vdevs since GUID is updated. 
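 * (This is the path behind "zpool reguid": by this point the new GUID has
 * already been committed by the sync task above, so all that remains is to
 * refresh the cache file and notify userspace.)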
963 */ 964 vdev_clear_kobj_evt(spa->spa_root_vdev); 965 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 966 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 967 968 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 969 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 970 } 971 972 mutex_exit(&spa_namespace_lock); 973 mutex_exit(&spa->spa_vdev_top_lock); 974 975 return (error); 976 } 977 978 /* 979 * ========================================================================== 980 * SPA state manipulation (open/create/destroy/import/export) 981 * ========================================================================== 982 */ 983 984 static int 985 spa_error_entry_compare(const void *a, const void *b) 986 { 987 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 988 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 989 int ret; 990 991 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 992 sizeof (zbookmark_phys_t)); 993 994 return (TREE_ISIGN(ret)); 995 } 996 997 /* 998 * Utility function which retrieves copies of the current logs and 999 * re-initializes them in the process. 1000 */ 1001 void 1002 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1003 { 1004 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1005 1006 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1007 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1008 1009 avl_create(&spa->spa_errlist_scrub, 1010 spa_error_entry_compare, sizeof (spa_error_entry_t), 1011 offsetof(spa_error_entry_t, se_avl)); 1012 avl_create(&spa->spa_errlist_last, 1013 spa_error_entry_compare, sizeof (spa_error_entry_t), 1014 offsetof(spa_error_entry_t, se_avl)); 1015 } 1016 1017 static void 1018 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1019 { 1020 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1021 enum zti_modes mode = ztip->zti_mode; 1022 uint_t value = ztip->zti_value; 1023 uint_t count = ztip->zti_count; 1024 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1025 uint_t cpus, flags = TASKQ_DYNAMIC; 1026 boolean_t batch = B_FALSE; 1027 1028 switch (mode) { 1029 case ZTI_MODE_FIXED: 1030 ASSERT3U(value, >, 0); 1031 break; 1032 1033 case ZTI_MODE_BATCH: 1034 batch = B_TRUE; 1035 flags |= TASKQ_THREADS_CPU_PCT; 1036 value = MIN(zio_taskq_batch_pct, 100); 1037 break; 1038 1039 case ZTI_MODE_SCALE: 1040 flags |= TASKQ_THREADS_CPU_PCT; 1041 /* 1042 * We want more taskqs to reduce lock contention, but we want 1043 * less for better request ordering and CPU utilization. 1044 */ 1045 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1046 if (zio_taskq_batch_tpq > 0) { 1047 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 1048 zio_taskq_batch_tpq); 1049 } else { 1050 /* 1051 * Prefer 6 threads per taskq, but no more taskqs 1052 * than threads in them on large systems. For 80%: 1053 * 1054 * taskq taskq total 1055 * cpus taskqs percent threads threads 1056 * ------- ------- ------- ------- ------- 1057 * 1 1 80% 1 1 1058 * 2 1 80% 1 1 1059 * 4 1 80% 3 3 1060 * 8 2 40% 3 6 1061 * 16 3 27% 4 12 1062 * 32 5 16% 5 25 1063 * 64 7 11% 7 49 1064 * 128 10 8% 10 100 1065 * 256 14 6% 15 210 1066 */ 1067 count = 1 + cpus / 6; 1068 while (count * count > cpus) 1069 count--; 1070 } 1071 /* Limit each taskq within 100% to not trigger assertion. 
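 * As a worked example matching the table above: with 32 CPUs and the default
 * zio_taskq_batch_pct of 80, cpus = 25 and count = 1 + 25/6 = 5 (5*5 <= 25),
 * so value = (80 + 2) / 5 = 16, i.e. 5 taskqs at 16% of the CPUs each, about
 * 5 threads per taskq and 25 threads in total.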
*/ 1072 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1073 value = (zio_taskq_batch_pct + count / 2) / count; 1074 break; 1075 1076 case ZTI_MODE_NULL: 1077 tqs->stqs_count = 0; 1078 tqs->stqs_taskq = NULL; 1079 return; 1080 1081 default: 1082 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1083 "spa_activate()", 1084 zio_type_name[t], zio_taskq_types[q], mode, value); 1085 break; 1086 } 1087 1088 ASSERT3U(count, >, 0); 1089 tqs->stqs_count = count; 1090 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1091 1092 for (uint_t i = 0; i < count; i++) { 1093 taskq_t *tq; 1094 char name[32]; 1095 1096 if (count > 1) 1097 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1098 zio_type_name[t], zio_taskq_types[q], i); 1099 else 1100 (void) snprintf(name, sizeof (name), "%s_%s", 1101 zio_type_name[t], zio_taskq_types[q]); 1102 1103 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1104 if (batch) 1105 flags |= TASKQ_DC_BATCH; 1106 1107 (void) zio_taskq_basedc; 1108 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1109 spa->spa_proc, zio_taskq_basedc, flags); 1110 } else { 1111 pri_t pri = maxclsyspri; 1112 /* 1113 * The write issue taskq can be extremely CPU 1114 * intensive. Run it at slightly less important 1115 * priority than the other taskqs. 1116 * 1117 * Under Linux and FreeBSD this means incrementing 1118 * the priority value as opposed to platforms like 1119 * illumos where it should be decremented. 1120 * 1121 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1122 * are equal then a difference between them is 1123 * insignificant. 1124 */ 1125 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1126 #if defined(__linux__) 1127 pri++; 1128 #elif defined(__FreeBSD__) 1129 pri += 4; 1130 #else 1131 #error "unknown OS" 1132 #endif 1133 } 1134 tq = taskq_create_proc(name, value, pri, 50, 1135 INT_MAX, spa->spa_proc, flags); 1136 } 1137 1138 tqs->stqs_taskq[i] = tq; 1139 } 1140 } 1141 1142 static void 1143 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1144 { 1145 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1146 1147 if (tqs->stqs_taskq == NULL) { 1148 ASSERT3U(tqs->stqs_count, ==, 0); 1149 return; 1150 } 1151 1152 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1153 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1154 taskq_destroy(tqs->stqs_taskq[i]); 1155 } 1156 1157 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1158 tqs->stqs_taskq = NULL; 1159 } 1160 1161 /* 1162 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1163 * Note that a type may have multiple discrete taskqs to avoid lock contention 1164 * on the taskq itself. In that case we choose which taskq at random by using 1165 * the low bits of gethrtime(). 1166 */ 1167 void 1168 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1169 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 1170 { 1171 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1172 taskq_t *tq; 1173 1174 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1175 ASSERT3U(tqs->stqs_count, !=, 0); 1176 1177 if (tqs->stqs_count == 1) { 1178 tq = tqs->stqs_taskq[0]; 1179 } else { 1180 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1181 } 1182 1183 taskq_dispatch_ent(tq, func, arg, flags, ent); 1184 } 1185 1186 /* 1187 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 
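 * Illustrative only: a caller that needs some function to run on, say, the
 * write issue taskq and must wait for the result could use
 *
 *	spa_taskq_dispatch_sync(spa, ZIO_TYPE_WRITE, ZIO_TASKQ_ISSUE,
 *	    my_func, my_arg, TQ_SLEEP);
 *
 * where my_func and my_arg are placeholders and TQ_SLEEP merely allows the
 * dispatch itself to block for memory.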
1188 */ 1189 void 1190 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1191 task_func_t *func, void *arg, uint_t flags) 1192 { 1193 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1194 taskq_t *tq; 1195 taskqid_t id; 1196 1197 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1198 ASSERT3U(tqs->stqs_count, !=, 0); 1199 1200 if (tqs->stqs_count == 1) { 1201 tq = tqs->stqs_taskq[0]; 1202 } else { 1203 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1204 } 1205 1206 id = taskq_dispatch(tq, func, arg, flags); 1207 if (id) 1208 taskq_wait_id(tq, id); 1209 } 1210 1211 static void 1212 spa_create_zio_taskqs(spa_t *spa) 1213 { 1214 for (int t = 0; t < ZIO_TYPES; t++) { 1215 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1216 spa_taskqs_init(spa, t, q); 1217 } 1218 } 1219 } 1220 1221 /* 1222 * Disabled until spa_thread() can be adapted for Linux. 1223 */ 1224 #undef HAVE_SPA_THREAD 1225 1226 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1227 static void 1228 spa_thread(void *arg) 1229 { 1230 psetid_t zio_taskq_psrset_bind = PS_NONE; 1231 callb_cpr_t cprinfo; 1232 1233 spa_t *spa = arg; 1234 user_t *pu = PTOU(curproc); 1235 1236 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1237 spa->spa_name); 1238 1239 ASSERT(curproc != &p0); 1240 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1241 "zpool-%s", spa->spa_name); 1242 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1243 1244 /* bind this thread to the requested psrset */ 1245 if (zio_taskq_psrset_bind != PS_NONE) { 1246 pool_lock(); 1247 mutex_enter(&cpu_lock); 1248 mutex_enter(&pidlock); 1249 mutex_enter(&curproc->p_lock); 1250 1251 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1252 0, NULL, NULL) == 0) { 1253 curthread->t_bind_pset = zio_taskq_psrset_bind; 1254 } else { 1255 cmn_err(CE_WARN, 1256 "Couldn't bind process for zfs pool \"%s\" to " 1257 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1258 } 1259 1260 mutex_exit(&curproc->p_lock); 1261 mutex_exit(&pidlock); 1262 mutex_exit(&cpu_lock); 1263 pool_unlock(); 1264 } 1265 1266 if (zio_taskq_sysdc) { 1267 sysdc_thread_enter(curthread, 100, 0); 1268 } 1269 1270 spa->spa_proc = curproc; 1271 spa->spa_did = curthread->t_did; 1272 1273 spa_create_zio_taskqs(spa); 1274 1275 mutex_enter(&spa->spa_proc_lock); 1276 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1277 1278 spa->spa_proc_state = SPA_PROC_ACTIVE; 1279 cv_broadcast(&spa->spa_proc_cv); 1280 1281 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1282 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1283 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1284 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1285 1286 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1287 spa->spa_proc_state = SPA_PROC_GONE; 1288 spa->spa_proc = &p0; 1289 cv_broadcast(&spa->spa_proc_cv); 1290 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1291 1292 mutex_enter(&curproc->p_lock); 1293 lwp_exit(); 1294 } 1295 #endif 1296 1297 /* 1298 * Activate an uninitialized pool. 
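 * Activation creates the metaslab classes, the per-I/O-type zio taskqs (or a
 * covering process where spa_thread() is available), the dirty/evicting
 * lists and error trees, and the auxiliary taskqs (zvol minors, prefetch,
 * upgrade) that the rest of the SPA depends on.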
1299 */ 1300 static void 1301 spa_activate(spa_t *spa, spa_mode_t mode) 1302 { 1303 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1304 1305 spa->spa_state = POOL_STATE_ACTIVE; 1306 spa->spa_mode = mode; 1307 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1308 1309 spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1310 spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1311 spa->spa_embedded_log_class = 1312 metaslab_class_create(spa, &zfs_metaslab_ops); 1313 spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1314 spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1315 1316 /* Try to create a covering process */ 1317 mutex_enter(&spa->spa_proc_lock); 1318 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1319 ASSERT(spa->spa_proc == &p0); 1320 spa->spa_did = 0; 1321 1322 (void) spa_create_process; 1323 #ifdef HAVE_SPA_THREAD 1324 /* Only create a process if we're going to be around a while. */ 1325 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1326 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1327 NULL, 0) == 0) { 1328 spa->spa_proc_state = SPA_PROC_CREATED; 1329 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1330 cv_wait(&spa->spa_proc_cv, 1331 &spa->spa_proc_lock); 1332 } 1333 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1334 ASSERT(spa->spa_proc != &p0); 1335 ASSERT(spa->spa_did != 0); 1336 } else { 1337 #ifdef _KERNEL 1338 cmn_err(CE_WARN, 1339 "Couldn't create process for zfs pool \"%s\"\n", 1340 spa->spa_name); 1341 #endif 1342 } 1343 } 1344 #endif /* HAVE_SPA_THREAD */ 1345 mutex_exit(&spa->spa_proc_lock); 1346 1347 /* If we didn't create a process, we need to create our taskqs. */ 1348 if (spa->spa_proc == &p0) { 1349 spa_create_zio_taskqs(spa); 1350 } 1351 1352 for (size_t i = 0; i < TXG_SIZE; i++) { 1353 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1354 ZIO_FLAG_CANFAIL); 1355 } 1356 1357 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1358 offsetof(vdev_t, vdev_config_dirty_node)); 1359 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1360 offsetof(objset_t, os_evicting_node)); 1361 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1362 offsetof(vdev_t, vdev_state_dirty_node)); 1363 1364 txg_list_create(&spa->spa_vdev_txg_list, spa, 1365 offsetof(struct vdev, vdev_txg_node)); 1366 1367 avl_create(&spa->spa_errlist_scrub, 1368 spa_error_entry_compare, sizeof (spa_error_entry_t), 1369 offsetof(spa_error_entry_t, se_avl)); 1370 avl_create(&spa->spa_errlist_last, 1371 spa_error_entry_compare, sizeof (spa_error_entry_t), 1372 offsetof(spa_error_entry_t, se_avl)); 1373 avl_create(&spa->spa_errlist_healed, 1374 spa_error_entry_compare, sizeof (spa_error_entry_t), 1375 offsetof(spa_error_entry_t, se_avl)); 1376 1377 spa_activate_os(spa); 1378 1379 spa_keystore_init(&spa->spa_keystore); 1380 1381 /* 1382 * This taskq is used to perform zvol-minor-related tasks 1383 * asynchronously. This has several advantages, including easy 1384 * resolution of various deadlocks. 1385 * 1386 * The taskq must be single threaded to ensure tasks are always 1387 * processed in the order in which they were dispatched. 1388 * 1389 * A taskq per pool allows one to keep the pools independent. 1390 * This way if one pool is suspended, it will not impact another. 1391 * 1392 * The preferred location to dispatch a zvol minor task is a sync 1393 * task. 
In this context, there is easy access to the spa_t and minimal 1394 * error handling is required because the sync task must succeed. 1395 */ 1396 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1397 1, INT_MAX, 0); 1398 1399 /* 1400 * Taskq dedicated to prefetcher threads: this is used to prevent the 1401 * pool traverse code from monopolizing the global (and limited) 1402 * system_taskq by inappropriately scheduling long running tasks on it. 1403 */ 1404 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1405 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1406 1407 /* 1408 * The taskq to upgrade datasets in this pool. Currently used by 1409 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1410 */ 1411 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1412 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1413 } 1414 1415 /* 1416 * Opposite of spa_activate(). 1417 */ 1418 static void 1419 spa_deactivate(spa_t *spa) 1420 { 1421 ASSERT(spa->spa_sync_on == B_FALSE); 1422 ASSERT(spa->spa_dsl_pool == NULL); 1423 ASSERT(spa->spa_root_vdev == NULL); 1424 ASSERT(spa->spa_async_zio_root == NULL); 1425 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1426 1427 spa_evicting_os_wait(spa); 1428 1429 if (spa->spa_zvol_taskq) { 1430 taskq_destroy(spa->spa_zvol_taskq); 1431 spa->spa_zvol_taskq = NULL; 1432 } 1433 1434 if (spa->spa_prefetch_taskq) { 1435 taskq_destroy(spa->spa_prefetch_taskq); 1436 spa->spa_prefetch_taskq = NULL; 1437 } 1438 1439 if (spa->spa_upgrade_taskq) { 1440 taskq_destroy(spa->spa_upgrade_taskq); 1441 spa->spa_upgrade_taskq = NULL; 1442 } 1443 1444 txg_list_destroy(&spa->spa_vdev_txg_list); 1445 1446 list_destroy(&spa->spa_config_dirty_list); 1447 list_destroy(&spa->spa_evicting_os_list); 1448 list_destroy(&spa->spa_state_dirty_list); 1449 1450 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1451 1452 for (int t = 0; t < ZIO_TYPES; t++) { 1453 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1454 spa_taskqs_fini(spa, t, q); 1455 } 1456 } 1457 1458 for (size_t i = 0; i < TXG_SIZE; i++) { 1459 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1460 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1461 spa->spa_txg_zio[i] = NULL; 1462 } 1463 1464 metaslab_class_destroy(spa->spa_normal_class); 1465 spa->spa_normal_class = NULL; 1466 1467 metaslab_class_destroy(spa->spa_log_class); 1468 spa->spa_log_class = NULL; 1469 1470 metaslab_class_destroy(spa->spa_embedded_log_class); 1471 spa->spa_embedded_log_class = NULL; 1472 1473 metaslab_class_destroy(spa->spa_special_class); 1474 spa->spa_special_class = NULL; 1475 1476 metaslab_class_destroy(spa->spa_dedup_class); 1477 spa->spa_dedup_class = NULL; 1478 1479 /* 1480 * If this was part of an import or the open otherwise failed, we may 1481 * still have errors left in the queues. Empty them just in case. 
1482 */ 1483 spa_errlog_drain(spa); 1484 avl_destroy(&spa->spa_errlist_scrub); 1485 avl_destroy(&spa->spa_errlist_last); 1486 avl_destroy(&spa->spa_errlist_healed); 1487 1488 spa_keystore_fini(&spa->spa_keystore); 1489 1490 spa->spa_state = POOL_STATE_UNINITIALIZED; 1491 1492 mutex_enter(&spa->spa_proc_lock); 1493 if (spa->spa_proc_state != SPA_PROC_NONE) { 1494 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1495 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1496 cv_broadcast(&spa->spa_proc_cv); 1497 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1498 ASSERT(spa->spa_proc != &p0); 1499 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1500 } 1501 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1502 spa->spa_proc_state = SPA_PROC_NONE; 1503 } 1504 ASSERT(spa->spa_proc == &p0); 1505 mutex_exit(&spa->spa_proc_lock); 1506 1507 /* 1508 * We want to make sure spa_thread() has actually exited the ZFS 1509 * module, so that the module can't be unloaded out from underneath 1510 * it. 1511 */ 1512 if (spa->spa_did != 0) { 1513 thread_join(spa->spa_did); 1514 spa->spa_did = 0; 1515 } 1516 1517 spa_deactivate_os(spa); 1518 1519 } 1520 1521 /* 1522 * Verify a pool configuration, and construct the vdev tree appropriately. This 1523 * will create all the necessary vdevs in the appropriate layout, with each vdev 1524 * in the CLOSED state. This will prep the pool before open/creation/import. 1525 * All vdev validation is done by the vdev_alloc() routine. 1526 */ 1527 int 1528 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1529 uint_t id, int atype) 1530 { 1531 nvlist_t **child; 1532 uint_t children; 1533 int error; 1534 1535 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1536 return (error); 1537 1538 if ((*vdp)->vdev_ops->vdev_op_leaf) 1539 return (0); 1540 1541 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1542 &child, &children); 1543 1544 if (error == ENOENT) 1545 return (0); 1546 1547 if (error) { 1548 vdev_free(*vdp); 1549 *vdp = NULL; 1550 return (SET_ERROR(EINVAL)); 1551 } 1552 1553 for (int c = 0; c < children; c++) { 1554 vdev_t *vd; 1555 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1556 atype)) != 0) { 1557 vdev_free(*vdp); 1558 *vdp = NULL; 1559 return (error); 1560 } 1561 } 1562 1563 ASSERT(*vdp != NULL); 1564 1565 return (0); 1566 } 1567 1568 static boolean_t 1569 spa_should_flush_logs_on_unload(spa_t *spa) 1570 { 1571 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1572 return (B_FALSE); 1573 1574 if (!spa_writeable(spa)) 1575 return (B_FALSE); 1576 1577 if (!spa->spa_sync_on) 1578 return (B_FALSE); 1579 1580 if (spa_state(spa) != POOL_STATE_EXPORTED) 1581 return (B_FALSE); 1582 1583 if (zfs_keep_log_spacemaps_at_export) 1584 return (B_FALSE); 1585 1586 return (B_TRUE); 1587 } 1588 1589 /* 1590 * Opens a transaction that will set the flag that will instruct 1591 * spa_sync to attempt to flush all the metaslabs for that txg. 
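 * It then waits for that txg to sync, so by the time it returns spa_sync has
 * flushed as many metaslabs as it could, allowing log space maps to be
 * destroyed and keeping the next import cheap.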
1592 */ 1593 static void 1594 spa_unload_log_sm_flush_all(spa_t *spa) 1595 { 1596 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1597 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1598 1599 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1600 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1601 1602 dmu_tx_commit(tx); 1603 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1604 } 1605 1606 static void 1607 spa_unload_log_sm_metadata(spa_t *spa) 1608 { 1609 void *cookie = NULL; 1610 spa_log_sm_t *sls; 1611 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1612 &cookie)) != NULL) { 1613 VERIFY0(sls->sls_mscount); 1614 kmem_free(sls, sizeof (spa_log_sm_t)); 1615 } 1616 1617 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); 1618 e != NULL; e = list_head(&spa->spa_log_summary)) { 1619 VERIFY0(e->lse_mscount); 1620 list_remove(&spa->spa_log_summary, e); 1621 kmem_free(e, sizeof (log_summary_entry_t)); 1622 } 1623 1624 spa->spa_unflushed_stats.sus_nblocks = 0; 1625 spa->spa_unflushed_stats.sus_memused = 0; 1626 spa->spa_unflushed_stats.sus_blocklimit = 0; 1627 } 1628 1629 static void 1630 spa_destroy_aux_threads(spa_t *spa) 1631 { 1632 if (spa->spa_condense_zthr != NULL) { 1633 zthr_destroy(spa->spa_condense_zthr); 1634 spa->spa_condense_zthr = NULL; 1635 } 1636 if (spa->spa_checkpoint_discard_zthr != NULL) { 1637 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1638 spa->spa_checkpoint_discard_zthr = NULL; 1639 } 1640 if (spa->spa_livelist_delete_zthr != NULL) { 1641 zthr_destroy(spa->spa_livelist_delete_zthr); 1642 spa->spa_livelist_delete_zthr = NULL; 1643 } 1644 if (spa->spa_livelist_condense_zthr != NULL) { 1645 zthr_destroy(spa->spa_livelist_condense_zthr); 1646 spa->spa_livelist_condense_zthr = NULL; 1647 } 1648 } 1649 1650 /* 1651 * Opposite of spa_load(). 1652 */ 1653 static void 1654 spa_unload(spa_t *spa) 1655 { 1656 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1657 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 1658 1659 spa_import_progress_remove(spa_guid(spa)); 1660 spa_load_note(spa, "UNLOADING"); 1661 1662 spa_wake_waiters(spa); 1663 1664 /* 1665 * If we have set the spa_final_txg, we have already performed the 1666 * tasks below in spa_export_common(). We should not redo it here since 1667 * we delay the final TXGs beyond what spa_final_txg is set at. 1668 */ 1669 if (spa->spa_final_txg == UINT64_MAX) { 1670 /* 1671 * If the log space map feature is enabled and the pool is 1672 * getting exported (but not destroyed), we want to spend some 1673 * time flushing as many metaslabs as we can in an attempt to 1674 * destroy log space maps and save import time. 1675 */ 1676 if (spa_should_flush_logs_on_unload(spa)) 1677 spa_unload_log_sm_flush_all(spa); 1678 1679 /* 1680 * Stop async tasks. 1681 */ 1682 spa_async_suspend(spa); 1683 1684 if (spa->spa_root_vdev) { 1685 vdev_t *root_vdev = spa->spa_root_vdev; 1686 vdev_initialize_stop_all(root_vdev, 1687 VDEV_INITIALIZE_ACTIVE); 1688 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 1689 vdev_autotrim_stop_all(spa); 1690 vdev_rebuild_stop_all(spa); 1691 } 1692 } 1693 1694 /* 1695 * Stop syncing. 1696 */ 1697 if (spa->spa_sync_on) { 1698 txg_sync_stop(spa->spa_dsl_pool); 1699 spa->spa_sync_on = B_FALSE; 1700 } 1701 1702 /* 1703 * This ensures that there is no async metaslab prefetching 1704 * while we attempt to unload the spa. 
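 * We simply wait for each top-level vdev's metaslab group taskq to drain, so
 * any prefetch already queued completes before the teardown continues.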
1705 */ 1706 if (spa->spa_root_vdev != NULL) { 1707 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 1708 vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; 1709 if (vc->vdev_mg != NULL) 1710 taskq_wait(vc->vdev_mg->mg_taskq); 1711 } 1712 } 1713 1714 if (spa->spa_mmp.mmp_thread) 1715 mmp_thread_stop(spa); 1716 1717 /* 1718 * Wait for any outstanding async I/O to complete. 1719 */ 1720 if (spa->spa_async_zio_root != NULL) { 1721 for (int i = 0; i < max_ncpus; i++) 1722 (void) zio_wait(spa->spa_async_zio_root[i]); 1723 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1724 spa->spa_async_zio_root = NULL; 1725 } 1726 1727 if (spa->spa_vdev_removal != NULL) { 1728 spa_vdev_removal_destroy(spa->spa_vdev_removal); 1729 spa->spa_vdev_removal = NULL; 1730 } 1731 1732 spa_destroy_aux_threads(spa); 1733 1734 spa_condense_fini(spa); 1735 1736 bpobj_close(&spa->spa_deferred_bpobj); 1737 1738 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1739 1740 /* 1741 * Close all vdevs. 1742 */ 1743 if (spa->spa_root_vdev) 1744 vdev_free(spa->spa_root_vdev); 1745 ASSERT(spa->spa_root_vdev == NULL); 1746 1747 /* 1748 * Close the dsl pool. 1749 */ 1750 if (spa->spa_dsl_pool) { 1751 dsl_pool_close(spa->spa_dsl_pool); 1752 spa->spa_dsl_pool = NULL; 1753 spa->spa_meta_objset = NULL; 1754 } 1755 1756 ddt_unload(spa); 1757 brt_unload(spa); 1758 spa_unload_log_sm_metadata(spa); 1759 1760 /* 1761 * Drop and purge level 2 cache 1762 */ 1763 spa_l2cache_drop(spa); 1764 1765 if (spa->spa_spares.sav_vdevs) { 1766 for (int i = 0; i < spa->spa_spares.sav_count; i++) 1767 vdev_free(spa->spa_spares.sav_vdevs[i]); 1768 kmem_free(spa->spa_spares.sav_vdevs, 1769 spa->spa_spares.sav_count * sizeof (void *)); 1770 spa->spa_spares.sav_vdevs = NULL; 1771 } 1772 if (spa->spa_spares.sav_config) { 1773 nvlist_free(spa->spa_spares.sav_config); 1774 spa->spa_spares.sav_config = NULL; 1775 } 1776 spa->spa_spares.sav_count = 0; 1777 1778 if (spa->spa_l2cache.sav_vdevs) { 1779 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 1780 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1781 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1782 } 1783 kmem_free(spa->spa_l2cache.sav_vdevs, 1784 spa->spa_l2cache.sav_count * sizeof (void *)); 1785 spa->spa_l2cache.sav_vdevs = NULL; 1786 } 1787 if (spa->spa_l2cache.sav_config) { 1788 nvlist_free(spa->spa_l2cache.sav_config); 1789 spa->spa_l2cache.sav_config = NULL; 1790 } 1791 spa->spa_l2cache.sav_count = 0; 1792 1793 spa->spa_async_suspended = 0; 1794 1795 spa->spa_indirect_vdevs_loaded = B_FALSE; 1796 1797 if (spa->spa_comment != NULL) { 1798 spa_strfree(spa->spa_comment); 1799 spa->spa_comment = NULL; 1800 } 1801 if (spa->spa_compatibility != NULL) { 1802 spa_strfree(spa->spa_compatibility); 1803 spa->spa_compatibility = NULL; 1804 } 1805 1806 spa_config_exit(spa, SCL_ALL, spa); 1807 } 1808 1809 /* 1810 * Load (or re-load) the current list of vdevs describing the active spares for 1811 * this pool. When this is called, we have some form of basic information in 1812 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1813 * then re-generate a more complete list including status information. 1814 */ 1815 void 1816 spa_load_spares(spa_t *spa) 1817 { 1818 nvlist_t **spares; 1819 uint_t nspares; 1820 int i; 1821 vdev_t *vd, *tvd; 1822 1823 #ifndef _KERNEL 1824 /* 1825 * zdb opens both the current state of the pool and the 1826 * checkpointed state (if present), with a different spa_t. 
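 * (zdb imports are read-only, which is what the spa_writeable() check below
 * keys off of.)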
1827 * 1828 * As spare vdevs are shared among open pools, we skip loading 1829 * them when we load the checkpointed state of the pool. 1830 */ 1831 if (!spa_writeable(spa)) 1832 return; 1833 #endif 1834 1835 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1836 1837 /* 1838 * First, close and free any existing spare vdevs. 1839 */ 1840 if (spa->spa_spares.sav_vdevs) { 1841 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1842 vd = spa->spa_spares.sav_vdevs[i]; 1843 1844 /* Undo the call to spa_activate() below */ 1845 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1846 B_FALSE)) != NULL && tvd->vdev_isspare) 1847 spa_spare_remove(tvd); 1848 vdev_close(vd); 1849 vdev_free(vd); 1850 } 1851 1852 kmem_free(spa->spa_spares.sav_vdevs, 1853 spa->spa_spares.sav_count * sizeof (void *)); 1854 } 1855 1856 if (spa->spa_spares.sav_config == NULL) 1857 nspares = 0; 1858 else 1859 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1860 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 1861 1862 spa->spa_spares.sav_count = (int)nspares; 1863 spa->spa_spares.sav_vdevs = NULL; 1864 1865 if (nspares == 0) 1866 return; 1867 1868 /* 1869 * Construct the array of vdevs, opening them to get status in the 1870 * process. For each spare, there is potentially two different vdev_t 1871 * structures associated with it: one in the list of spares (used only 1872 * for basic validation purposes) and one in the active vdev 1873 * configuration (if it's spared in). During this phase we open and 1874 * validate each vdev on the spare list. If the vdev also exists in the 1875 * active configuration, then we also mark this vdev as an active spare. 1876 */ 1877 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 1878 KM_SLEEP); 1879 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1880 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1881 VDEV_ALLOC_SPARE) == 0); 1882 ASSERT(vd != NULL); 1883 1884 spa->spa_spares.sav_vdevs[i] = vd; 1885 1886 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1887 B_FALSE)) != NULL) { 1888 if (!tvd->vdev_isspare) 1889 spa_spare_add(tvd); 1890 1891 /* 1892 * We only mark the spare active if we were successfully 1893 * able to load the vdev. Otherwise, importing a pool 1894 * with a bad active spare would result in strange 1895 * behavior, because multiple pool would think the spare 1896 * is actively in use. 1897 * 1898 * There is a vulnerability here to an equally bizarre 1899 * circumstance, where a dead active spare is later 1900 * brought back to life (onlined or otherwise). Given 1901 * the rarity of this scenario, and the extra complexity 1902 * it adds, we ignore the possibility. 1903 */ 1904 if (!vdev_is_dead(tvd)) 1905 spa_spare_activate(tvd); 1906 } 1907 1908 vd->vdev_top = vd; 1909 vd->vdev_aux = &spa->spa_spares; 1910 1911 if (vdev_open(vd) != 0) 1912 continue; 1913 1914 if (vdev_validate_aux(vd) == 0) 1915 spa_spare_add(vd); 1916 } 1917 1918 /* 1919 * Recompute the stashed list of spares, with status information 1920 * this time. 
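 * Each entry is regenerated via vdev_config_generate(..., VDEV_CONFIG_SPARE),
 * so the stored config now reflects the open/validate results from above
 * rather than just the basic information we started with.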
1921 */ 1922 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 1923 1924 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1925 KM_SLEEP); 1926 for (i = 0; i < spa->spa_spares.sav_count; i++) 1927 spares[i] = vdev_config_generate(spa, 1928 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1929 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 1930 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 1931 spa->spa_spares.sav_count); 1932 for (i = 0; i < spa->spa_spares.sav_count; i++) 1933 nvlist_free(spares[i]); 1934 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1935 } 1936 1937 /* 1938 * Load (or re-load) the current list of vdevs describing the active l2cache for 1939 * this pool. When this is called, we have some form of basic information in 1940 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1941 * then re-generate a more complete list including status information. 1942 * Devices which are already active have their details maintained, and are 1943 * not re-opened. 1944 */ 1945 void 1946 spa_load_l2cache(spa_t *spa) 1947 { 1948 nvlist_t **l2cache = NULL; 1949 uint_t nl2cache; 1950 int i, j, oldnvdevs; 1951 uint64_t guid; 1952 vdev_t *vd, **oldvdevs, **newvdevs; 1953 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1954 1955 #ifndef _KERNEL 1956 /* 1957 * zdb opens both the current state of the pool and the 1958 * checkpointed state (if present), with a different spa_t. 1959 * 1960 * As L2 caches are part of the ARC which is shared among open 1961 * pools, we skip loading them when we load the checkpointed 1962 * state of the pool. 1963 */ 1964 if (!spa_writeable(spa)) 1965 return; 1966 #endif 1967 1968 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1969 1970 oldvdevs = sav->sav_vdevs; 1971 oldnvdevs = sav->sav_count; 1972 sav->sav_vdevs = NULL; 1973 sav->sav_count = 0; 1974 1975 if (sav->sav_config == NULL) { 1976 nl2cache = 0; 1977 newvdevs = NULL; 1978 goto out; 1979 } 1980 1981 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 1982 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 1983 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1984 1985 /* 1986 * Process new nvlist of vdevs. 1987 */ 1988 for (i = 0; i < nl2cache; i++) { 1989 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 1990 1991 newvdevs[i] = NULL; 1992 for (j = 0; j < oldnvdevs; j++) { 1993 vd = oldvdevs[j]; 1994 if (vd != NULL && guid == vd->vdev_guid) { 1995 /* 1996 * Retain previous vdev for add/remove ops. 1997 */ 1998 newvdevs[i] = vd; 1999 oldvdevs[j] = NULL; 2000 break; 2001 } 2002 } 2003 2004 if (newvdevs[i] == NULL) { 2005 /* 2006 * Create new vdev 2007 */ 2008 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2009 VDEV_ALLOC_L2CACHE) == 0); 2010 ASSERT(vd != NULL); 2011 newvdevs[i] = vd; 2012 2013 /* 2014 * Commit this vdev as an l2cache device, 2015 * even if it fails to open. 2016 */ 2017 spa_l2cache_add(vd); 2018 2019 vd->vdev_top = vd; 2020 vd->vdev_aux = sav; 2021 2022 spa_l2cache_activate(vd); 2023 2024 if (vdev_open(vd) != 0) 2025 continue; 2026 2027 (void) vdev_validate_aux(vd); 2028 2029 if (!vdev_is_dead(vd)) 2030 l2arc_add_vdev(spa, vd); 2031 2032 /* 2033 * Upon cache device addition to a pool or pool 2034 * creation with a cache device or if the header 2035 * of the device is invalid we issue an async 2036 * TRIM command for the whole device which will 2037 * execute if l2arc_trim_ahead > 0. 
2038 */ 2039 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2040 } 2041 } 2042 2043 sav->sav_vdevs = newvdevs; 2044 sav->sav_count = (int)nl2cache; 2045 2046 /* 2047 * Recompute the stashed list of l2cache devices, with status 2048 * information this time. 2049 */ 2050 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2051 2052 if (sav->sav_count > 0) 2053 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2054 KM_SLEEP); 2055 for (i = 0; i < sav->sav_count; i++) 2056 l2cache[i] = vdev_config_generate(spa, 2057 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2058 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2059 (const nvlist_t * const *)l2cache, sav->sav_count); 2060 2061 out: 2062 /* 2063 * Purge vdevs that were dropped 2064 */ 2065 if (oldvdevs) { 2066 for (i = 0; i < oldnvdevs; i++) { 2067 uint64_t pool; 2068 2069 vd = oldvdevs[i]; 2070 if (vd != NULL) { 2071 ASSERT(vd->vdev_isl2cache); 2072 2073 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2074 pool != 0ULL && l2arc_vdev_present(vd)) 2075 l2arc_remove_vdev(vd); 2076 vdev_clear_stats(vd); 2077 vdev_free(vd); 2078 } 2079 } 2080 2081 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2082 } 2083 2084 for (i = 0; i < sav->sav_count; i++) 2085 nvlist_free(l2cache[i]); 2086 if (sav->sav_count) 2087 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2088 } 2089 2090 static int 2091 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2092 { 2093 dmu_buf_t *db; 2094 char *packed = NULL; 2095 size_t nvsize = 0; 2096 int error; 2097 *value = NULL; 2098 2099 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2100 if (error) 2101 return (error); 2102 2103 nvsize = *(uint64_t *)db->db_data; 2104 dmu_buf_rele(db, FTAG); 2105 2106 packed = vmem_alloc(nvsize, KM_SLEEP); 2107 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2108 DMU_READ_PREFETCH); 2109 if (error == 0) 2110 error = nvlist_unpack(packed, nvsize, value, 0); 2111 vmem_free(packed, nvsize); 2112 2113 return (error); 2114 } 2115 2116 /* 2117 * Concrete top-level vdevs that are not missing and are not logs. At every 2118 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2119 */ 2120 static uint64_t 2121 spa_healthy_core_tvds(spa_t *spa) 2122 { 2123 vdev_t *rvd = spa->spa_root_vdev; 2124 uint64_t tvds = 0; 2125 2126 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2127 vdev_t *vd = rvd->vdev_child[i]; 2128 if (vd->vdev_islog) 2129 continue; 2130 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2131 tvds++; 2132 } 2133 2134 return (tvds); 2135 } 2136 2137 /* 2138 * Checks to see if the given vdev could not be opened, in which case we post a 2139 * sysevent to notify the autoreplace code that the device has been removed. 2140 */ 2141 static void 2142 spa_check_removed(vdev_t *vd) 2143 { 2144 for (uint64_t c = 0; c < vd->vdev_children; c++) 2145 spa_check_removed(vd->vdev_child[c]); 2146 2147 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2148 vdev_is_concrete(vd)) { 2149 zfs_post_autoreplace(vd->vdev_spa, vd); 2150 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2151 } 2152 } 2153 2154 static int 2155 spa_check_for_missing_logs(spa_t *spa) 2156 { 2157 vdev_t *rvd = spa->spa_root_vdev; 2158 2159 /* 2160 * If we're doing a normal import, then build up any additional 2161 * diagnostic information about missing log devices. 2162 * We'll pass this up to the user for further processing. 
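 *
 * For illustration, the diagnostic assembled below lands in
 * spa_load_info roughly as:
 *
 *	ZPOOL_CONFIG_MISSING_DEVICES -> {
 *		ZPOOL_CONFIG_CHILDREN -> [ config of each missing log tvd ]
 *	}
 *
 * so userland (e.g. 'zpool import') can name the missing devices;
 * the load itself still fails below with ENXIO.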
2163 */ 2164 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2165 nvlist_t **child, *nv; 2166 uint64_t idx = 0; 2167 2168 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2169 KM_SLEEP); 2170 nv = fnvlist_alloc(); 2171 2172 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2173 vdev_t *tvd = rvd->vdev_child[c]; 2174 2175 /* 2176 * We consider a device as missing only if it failed 2177 * to open (i.e. offline or faulted is not considered 2178 * as missing). 2179 */ 2180 if (tvd->vdev_islog && 2181 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2182 child[idx++] = vdev_config_generate(spa, tvd, 2183 B_FALSE, VDEV_CONFIG_MISSING); 2184 } 2185 } 2186 2187 if (idx > 0) { 2188 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2189 (const nvlist_t * const *)child, idx); 2190 fnvlist_add_nvlist(spa->spa_load_info, 2191 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2192 2193 for (uint64_t i = 0; i < idx; i++) 2194 nvlist_free(child[i]); 2195 } 2196 nvlist_free(nv); 2197 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2198 2199 if (idx > 0) { 2200 spa_load_failed(spa, "some log devices are missing"); 2201 vdev_dbgmsg_print_tree(rvd, 2); 2202 return (SET_ERROR(ENXIO)); 2203 } 2204 } else { 2205 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2206 vdev_t *tvd = rvd->vdev_child[c]; 2207 2208 if (tvd->vdev_islog && 2209 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2210 spa_set_log_state(spa, SPA_LOG_CLEAR); 2211 spa_load_note(spa, "some log devices are " 2212 "missing, ZIL is dropped."); 2213 vdev_dbgmsg_print_tree(rvd, 2); 2214 break; 2215 } 2216 } 2217 } 2218 2219 return (0); 2220 } 2221 2222 /* 2223 * Check for missing log devices 2224 */ 2225 static boolean_t 2226 spa_check_logs(spa_t *spa) 2227 { 2228 boolean_t rv = B_FALSE; 2229 dsl_pool_t *dp = spa_get_dsl(spa); 2230 2231 switch (spa->spa_log_state) { 2232 default: 2233 break; 2234 case SPA_LOG_MISSING: 2235 /* need to recheck in case slog has been restored */ 2236 case SPA_LOG_UNKNOWN: 2237 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2238 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2239 if (rv) 2240 spa_set_log_state(spa, SPA_LOG_MISSING); 2241 break; 2242 } 2243 return (rv); 2244 } 2245 2246 /* 2247 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2248 */ 2249 static boolean_t 2250 spa_passivate_log(spa_t *spa) 2251 { 2252 vdev_t *rvd = spa->spa_root_vdev; 2253 boolean_t slog_found = B_FALSE; 2254 2255 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2256 2257 for (int c = 0; c < rvd->vdev_children; c++) { 2258 vdev_t *tvd = rvd->vdev_child[c]; 2259 2260 if (tvd->vdev_islog) { 2261 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2262 metaslab_group_passivate(tvd->vdev_mg); 2263 slog_found = B_TRUE; 2264 } 2265 } 2266 2267 return (slog_found); 2268 } 2269 2270 /* 2271 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
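 *
 * Illustrative pairing with spa_passivate_log() above (a hypothetical
 * caller sketch, not a list of actual call sites; "error" is a
 * placeholder):
 *
 *	if (spa_passivate_log(spa)) {
 *		... evacuate or remove the slog ...
 *		if (error != 0)
 *			spa_activate_log(spa);
 *	}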
2272 */ 2273 static void 2274 spa_activate_log(spa_t *spa) 2275 { 2276 vdev_t *rvd = spa->spa_root_vdev; 2277 2278 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2279 2280 for (int c = 0; c < rvd->vdev_children; c++) { 2281 vdev_t *tvd = rvd->vdev_child[c]; 2282 2283 if (tvd->vdev_islog) { 2284 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2285 metaslab_group_activate(tvd->vdev_mg); 2286 } 2287 } 2288 } 2289 2290 int 2291 spa_reset_logs(spa_t *spa) 2292 { 2293 int error; 2294 2295 error = dmu_objset_find(spa_name(spa), zil_reset, 2296 NULL, DS_FIND_CHILDREN); 2297 if (error == 0) { 2298 /* 2299 * We successfully offlined the log device, sync out the 2300 * current txg so that the "stubby" block can be removed 2301 * by zil_sync(). 2302 */ 2303 txg_wait_synced(spa->spa_dsl_pool, 0); 2304 } 2305 return (error); 2306 } 2307 2308 static void 2309 spa_aux_check_removed(spa_aux_vdev_t *sav) 2310 { 2311 for (int i = 0; i < sav->sav_count; i++) 2312 spa_check_removed(sav->sav_vdevs[i]); 2313 } 2314 2315 void 2316 spa_claim_notify(zio_t *zio) 2317 { 2318 spa_t *spa = zio->io_spa; 2319 2320 if (zio->io_error) 2321 return; 2322 2323 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2324 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2325 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2326 mutex_exit(&spa->spa_props_lock); 2327 } 2328 2329 typedef struct spa_load_error { 2330 boolean_t sle_verify_data; 2331 uint64_t sle_meta_count; 2332 uint64_t sle_data_count; 2333 } spa_load_error_t; 2334 2335 static void 2336 spa_load_verify_done(zio_t *zio) 2337 { 2338 blkptr_t *bp = zio->io_bp; 2339 spa_load_error_t *sle = zio->io_private; 2340 dmu_object_type_t type = BP_GET_TYPE(bp); 2341 int error = zio->io_error; 2342 spa_t *spa = zio->io_spa; 2343 2344 abd_free(zio->io_abd); 2345 if (error) { 2346 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2347 type != DMU_OT_INTENT_LOG) 2348 atomic_inc_64(&sle->sle_meta_count); 2349 else 2350 atomic_inc_64(&sle->sle_data_count); 2351 } 2352 2353 mutex_enter(&spa->spa_scrub_lock); 2354 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2355 cv_broadcast(&spa->spa_scrub_io_cv); 2356 mutex_exit(&spa->spa_scrub_lock); 2357 } 2358 2359 /* 2360 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2361 * By default, we set it to 1/16th of the arc. 2362 */ 2363 static uint_t spa_load_verify_shift = 4; 2364 static int spa_load_verify_metadata = B_TRUE; 2365 static int spa_load_verify_data = B_TRUE; 2366 2367 static int 2368 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2369 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2370 { 2371 zio_t *rio = arg; 2372 spa_load_error_t *sle = rio->io_private; 2373 2374 (void) zilog, (void) dnp; 2375 2376 /* 2377 * Note: normally this routine will not be called if 2378 * spa_load_verify_metadata is not set. However, it may be useful 2379 * to manually set the flag after the traversal has begun. 2380 */ 2381 if (!spa_load_verify_metadata) 2382 return (0); 2383 2384 /* 2385 * Sanity check the block pointer in order to detect obvious damage 2386 * before using the contents in subsequent checks or in zio_read(). 2387 * When damaged consider it to be a metadata error since we cannot 2388 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
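 *
 * A worked example of the inflight throttle computed a few lines
 * below: with the default spa_load_verify_shift of 4 and an ARC
 * target of, say, 16 GiB, at most arc_target_bytes() >> 4 = 1 GiB of
 * verification reads are kept outstanding; spa_load_verify_done()
 * subtracts each completed read from spa_load_verify_bytes and wakes
 * any waiters.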
2389 */ 2390 if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 2391 atomic_inc_64(&sle->sle_meta_count); 2392 return (0); 2393 } 2394 2395 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2396 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2397 return (0); 2398 2399 if (!BP_IS_METADATA(bp) && 2400 (!spa_load_verify_data || !sle->sle_verify_data)) 2401 return (0); 2402 2403 uint64_t maxinflight_bytes = 2404 arc_target_bytes() >> spa_load_verify_shift; 2405 size_t size = BP_GET_PSIZE(bp); 2406 2407 mutex_enter(&spa->spa_scrub_lock); 2408 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2409 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2410 spa->spa_load_verify_bytes += size; 2411 mutex_exit(&spa->spa_scrub_lock); 2412 2413 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2414 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2415 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2416 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2417 return (0); 2418 } 2419 2420 static int 2421 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2422 { 2423 (void) dp, (void) arg; 2424 2425 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2426 return (SET_ERROR(ENAMETOOLONG)); 2427 2428 return (0); 2429 } 2430 2431 static int 2432 spa_load_verify(spa_t *spa) 2433 { 2434 zio_t *rio; 2435 spa_load_error_t sle = { 0 }; 2436 zpool_load_policy_t policy; 2437 boolean_t verify_ok = B_FALSE; 2438 int error = 0; 2439 2440 zpool_get_load_policy(spa->spa_config, &policy); 2441 2442 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2443 policy.zlp_maxmeta == UINT64_MAX) 2444 return (0); 2445 2446 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2447 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2448 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2449 DS_FIND_CHILDREN); 2450 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2451 if (error != 0) 2452 return (error); 2453 2454 /* 2455 * Verify data only if we are rewinding or error limit was set. 2456 * Otherwise nothing except dbgmsg care about it to waste time. 2457 */ 2458 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2459 (policy.zlp_maxdata < UINT64_MAX); 2460 2461 rio = zio_root(spa, NULL, &sle, 2462 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2463 2464 if (spa_load_verify_metadata) { 2465 if (spa->spa_extreme_rewind) { 2466 spa_load_note(spa, "performing a complete scan of the " 2467 "pool since extreme rewind is on. 
This may take " 2468 "a very long time.\n (spa_load_verify_data=%u, " 2469 "spa_load_verify_metadata=%u)", 2470 spa_load_verify_data, spa_load_verify_metadata); 2471 } 2472 2473 error = traverse_pool(spa, spa->spa_verify_min_txg, 2474 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2475 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2476 } 2477 2478 (void) zio_wait(rio); 2479 ASSERT0(spa->spa_load_verify_bytes); 2480 2481 spa->spa_load_meta_errors = sle.sle_meta_count; 2482 spa->spa_load_data_errors = sle.sle_data_count; 2483 2484 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2485 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2486 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2487 (u_longlong_t)sle.sle_data_count); 2488 } 2489 2490 if (spa_load_verify_dryrun || 2491 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2492 sle.sle_data_count <= policy.zlp_maxdata)) { 2493 int64_t loss = 0; 2494 2495 verify_ok = B_TRUE; 2496 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2497 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2498 2499 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2500 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2501 spa->spa_load_txg_ts); 2502 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2503 loss); 2504 fnvlist_add_uint64(spa->spa_load_info, 2505 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2506 fnvlist_add_uint64(spa->spa_load_info, 2507 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2508 } else { 2509 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2510 } 2511 2512 if (spa_load_verify_dryrun) 2513 return (0); 2514 2515 if (error) { 2516 if (error != ENXIO && error != EIO) 2517 error = SET_ERROR(EIO); 2518 return (error); 2519 } 2520 2521 return (verify_ok ? 0 : EIO); 2522 } 2523 2524 /* 2525 * Find a value in the pool props object. 2526 */ 2527 static void 2528 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2529 { 2530 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2531 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2532 } 2533 2534 /* 2535 * Find a value in the pool directory object. 
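 *
 * Illustrative use, mirroring the later caller in
 * spa_ld_trusted_config() (the local "obj" is a placeholder):
 *
 *	uint64_t obj;
 *	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &obj, B_TRUE) != 0)
 *		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 *
 * ENOENT is only logged when log_enoent is B_TRUE, so optional
 * directory entries can be probed quietly by passing B_FALSE.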
2536 */ 2537 static int 2538 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2539 { 2540 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2541 name, sizeof (uint64_t), 1, val); 2542 2543 if (error != 0 && (error != ENOENT || log_enoent)) { 2544 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2545 "[error=%d]", name, error); 2546 } 2547 2548 return (error); 2549 } 2550 2551 static int 2552 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2553 { 2554 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2555 return (SET_ERROR(err)); 2556 } 2557 2558 boolean_t 2559 spa_livelist_delete_check(spa_t *spa) 2560 { 2561 return (spa->spa_livelists_to_delete != 0); 2562 } 2563 2564 static boolean_t 2565 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2566 { 2567 (void) z; 2568 spa_t *spa = arg; 2569 return (spa_livelist_delete_check(spa)); 2570 } 2571 2572 static int 2573 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2574 { 2575 spa_t *spa = arg; 2576 zio_free(spa, tx->tx_txg, bp); 2577 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2578 -bp_get_dsize_sync(spa, bp), 2579 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2580 return (0); 2581 } 2582 2583 static int 2584 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2585 { 2586 int err; 2587 zap_cursor_t zc; 2588 zap_attribute_t za; 2589 zap_cursor_init(&zc, os, zap_obj); 2590 err = zap_cursor_retrieve(&zc, &za); 2591 zap_cursor_fini(&zc); 2592 if (err == 0) 2593 *llp = za.za_first_integer; 2594 return (err); 2595 } 2596 2597 /* 2598 * Components of livelist deletion that must be performed in syncing 2599 * context: freeing block pointers and updating the pool-wide data 2600 * structures to indicate how much work is left to do 2601 */ 2602 typedef struct sublist_delete_arg { 2603 spa_t *spa; 2604 dsl_deadlist_t *ll; 2605 uint64_t key; 2606 bplist_t *to_free; 2607 } sublist_delete_arg_t; 2608 2609 static void 2610 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2611 { 2612 sublist_delete_arg_t *sda = arg; 2613 spa_t *spa = sda->spa; 2614 dsl_deadlist_t *ll = sda->ll; 2615 uint64_t key = sda->key; 2616 bplist_t *to_free = sda->to_free; 2617 2618 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2619 dsl_deadlist_remove_entry(ll, key, tx); 2620 } 2621 2622 typedef struct livelist_delete_arg { 2623 spa_t *spa; 2624 uint64_t ll_obj; 2625 uint64_t zap_obj; 2626 } livelist_delete_arg_t; 2627 2628 static void 2629 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2630 { 2631 livelist_delete_arg_t *lda = arg; 2632 spa_t *spa = lda->spa; 2633 uint64_t ll_obj = lda->ll_obj; 2634 uint64_t zap_obj = lda->zap_obj; 2635 objset_t *mos = spa->spa_meta_objset; 2636 uint64_t count; 2637 2638 /* free the livelist and decrement the feature count */ 2639 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2640 dsl_deadlist_free(mos, ll_obj, tx); 2641 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2642 VERIFY0(zap_count(mos, zap_obj, &count)); 2643 if (count == 0) { 2644 /* no more livelists to delete */ 2645 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2646 DMU_POOL_DELETED_CLONES, tx)); 2647 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2648 spa->spa_livelists_to_delete = 0; 2649 spa_notify_waiters(spa); 2650 } 2651 } 2652 2653 /* 2654 * Load in the value for the livelist to be removed and open it. Then, 2655 * load its first sublist and determine which block pointers should actually 2656 * be freed. 
Then, call a synctask which performs the actual frees and updates 2657 * the pool-wide livelist data. 2658 */ 2659 static void 2660 spa_livelist_delete_cb(void *arg, zthr_t *z) 2661 { 2662 spa_t *spa = arg; 2663 uint64_t ll_obj = 0, count; 2664 objset_t *mos = spa->spa_meta_objset; 2665 uint64_t zap_obj = spa->spa_livelists_to_delete; 2666 /* 2667 * Determine the next livelist to delete. This function should only 2668 * be called if there is at least one deleted clone. 2669 */ 2670 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 2671 VERIFY0(zap_count(mos, ll_obj, &count)); 2672 if (count > 0) { 2673 dsl_deadlist_t *ll; 2674 dsl_deadlist_entry_t *dle; 2675 bplist_t to_free; 2676 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 2677 dsl_deadlist_open(ll, mos, ll_obj); 2678 dle = dsl_deadlist_first(ll); 2679 ASSERT3P(dle, !=, NULL); 2680 bplist_create(&to_free); 2681 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 2682 z, NULL); 2683 if (err == 0) { 2684 sublist_delete_arg_t sync_arg = { 2685 .spa = spa, 2686 .ll = ll, 2687 .key = dle->dle_mintxg, 2688 .to_free = &to_free 2689 }; 2690 zfs_dbgmsg("deleting sublist (id %llu) from" 2691 " livelist %llu, %lld remaining", 2692 (u_longlong_t)dle->dle_bpobj.bpo_object, 2693 (u_longlong_t)ll_obj, (longlong_t)count - 1); 2694 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 2695 sublist_delete_sync, &sync_arg, 0, 2696 ZFS_SPACE_CHECK_DESTROY)); 2697 } else { 2698 VERIFY3U(err, ==, EINTR); 2699 } 2700 bplist_clear(&to_free); 2701 bplist_destroy(&to_free); 2702 dsl_deadlist_close(ll); 2703 kmem_free(ll, sizeof (dsl_deadlist_t)); 2704 } else { 2705 livelist_delete_arg_t sync_arg = { 2706 .spa = spa, 2707 .ll_obj = ll_obj, 2708 .zap_obj = zap_obj 2709 }; 2710 zfs_dbgmsg("deletion of livelist %llu completed", 2711 (u_longlong_t)ll_obj); 2712 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 2713 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 2714 } 2715 } 2716 2717 static void 2718 spa_start_livelist_destroy_thread(spa_t *spa) 2719 { 2720 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 2721 spa->spa_livelist_delete_zthr = 2722 zthr_create("z_livelist_destroy", 2723 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 2724 minclsyspri); 2725 } 2726 2727 typedef struct livelist_new_arg { 2728 bplist_t *allocs; 2729 bplist_t *frees; 2730 } livelist_new_arg_t; 2731 2732 static int 2733 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 2734 dmu_tx_t *tx) 2735 { 2736 ASSERT(tx == NULL); 2737 livelist_new_arg_t *lna = arg; 2738 if (bp_freed) { 2739 bplist_append(lna->frees, bp); 2740 } else { 2741 bplist_append(lna->allocs, bp); 2742 zfs_livelist_condense_new_alloc++; 2743 } 2744 return (0); 2745 } 2746 2747 typedef struct livelist_condense_arg { 2748 spa_t *spa; 2749 bplist_t to_keep; 2750 uint64_t first_size; 2751 uint64_t next_size; 2752 } livelist_condense_arg_t; 2753 2754 static void 2755 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 2756 { 2757 livelist_condense_arg_t *lca = arg; 2758 spa_t *spa = lca->spa; 2759 bplist_t new_frees; 2760 dsl_dataset_t *ds = spa->spa_to_condense.ds; 2761 2762 /* Have we been cancelled? 
*/ 2763 if (spa->spa_to_condense.cancelled) { 2764 zfs_livelist_condense_sync_cancel++; 2765 goto out; 2766 } 2767 2768 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2769 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2770 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 2771 2772 /* 2773 * It's possible that the livelist was changed while the zthr was 2774 * running. Therefore, we need to check for new blkptrs in the two 2775 * entries being condensed and continue to track them in the livelist. 2776 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 2777 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 2778 * we need to sort them into two different bplists. 2779 */ 2780 uint64_t first_obj = first->dle_bpobj.bpo_object; 2781 uint64_t next_obj = next->dle_bpobj.bpo_object; 2782 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2783 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2784 2785 bplist_create(&new_frees); 2786 livelist_new_arg_t new_bps = { 2787 .allocs = &lca->to_keep, 2788 .frees = &new_frees, 2789 }; 2790 2791 if (cur_first_size > lca->first_size) { 2792 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 2793 livelist_track_new_cb, &new_bps, lca->first_size)); 2794 } 2795 if (cur_next_size > lca->next_size) { 2796 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 2797 livelist_track_new_cb, &new_bps, lca->next_size)); 2798 } 2799 2800 dsl_deadlist_clear_entry(first, ll, tx); 2801 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 2802 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 2803 2804 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 2805 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 2806 bplist_destroy(&new_frees); 2807 2808 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 2809 dsl_dataset_name(ds, dsname); 2810 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 2811 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 2812 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 2813 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 2814 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 2815 (u_longlong_t)cur_next_size, 2816 (u_longlong_t)first->dle_bpobj.bpo_object, 2817 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 2818 out: 2819 dmu_buf_rele(ds->ds_dbuf, spa); 2820 spa->spa_to_condense.ds = NULL; 2821 bplist_clear(&lca->to_keep); 2822 bplist_destroy(&lca->to_keep); 2823 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2824 spa->spa_to_condense.syncing = B_FALSE; 2825 } 2826 2827 static void 2828 spa_livelist_condense_cb(void *arg, zthr_t *t) 2829 { 2830 while (zfs_livelist_condense_zthr_pause && 2831 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2832 delay(1); 2833 2834 spa_t *spa = arg; 2835 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2836 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2837 uint64_t first_size, next_size; 2838 2839 livelist_condense_arg_t *lca = 2840 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 2841 bplist_create(&lca->to_keep); 2842 2843 /* 2844 * Process the livelists (matching FREEs and ALLOCs) in open context 2845 * so we have minimal work in syncing context to condense. 2846 * 2847 * We save bpobj sizes (first_size and next_size) to use later in 2848 * syncing context to determine if entries were added to these sublists 2849 * while in open context. 
This is possible because the clone is still
2850 * active and open for normal writes and we want to make sure the new,
2851 * unprocessed blockpointers are inserted into the livelist normally.
2852 *
2853 * Note that dsl_process_sub_livelist() both stores the size (number of
2854 * blockpointers) and iterates over them while the bpobj's lock is held,
2855 * so the sizes returned to us are consistent with what was actually
2856 * processed.
2857 */
2858 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
2859 &first_size);
2860 if (err == 0)
2861 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
2862 t, &next_size);
2863
2864 if (err == 0) {
2865 while (zfs_livelist_condense_sync_pause &&
2866 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
2867 delay(1);
2868
2869 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
2870 dmu_tx_mark_netfree(tx);
2871 dmu_tx_hold_space(tx, 1);
2872 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
2873 if (err == 0) {
2874 /*
2875 * Prevent the condense zthr from restarting before
2876 * the synctask completes.
2877 */
2878 spa->spa_to_condense.syncing = B_TRUE;
2879 lca->spa = spa;
2880 lca->first_size = first_size;
2881 lca->next_size = next_size;
2882 dsl_sync_task_nowait(spa_get_dsl(spa),
2883 spa_livelist_condense_sync, lca, tx);
2884 dmu_tx_commit(tx);
2885 return;
2886 }
2887 }
2888 /*
2889 * Condensing cannot continue: either it was externally stopped or
2890 * we were unable to assign to a tx because the pool has run out of
2891 * space. In the second case, we'll just end up trying to condense
2892 * again in a later txg.
2893 */
2894 ASSERT(err != 0);
2895 bplist_clear(&lca->to_keep);
2896 bplist_destroy(&lca->to_keep);
2897 kmem_free(lca, sizeof (livelist_condense_arg_t));
2898 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
2899 spa->spa_to_condense.ds = NULL;
2900 if (err == EINTR)
2901 zfs_livelist_condense_zthr_cancel++;
2902 }
2903
2904 /*
2905 * Check that there is something to condense but that a condense is not
2906 * already in progress and that condensing has not been cancelled.
2907 */ 2908 static boolean_t 2909 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 2910 { 2911 (void) z; 2912 spa_t *spa = arg; 2913 if ((spa->spa_to_condense.ds != NULL) && 2914 (spa->spa_to_condense.syncing == B_FALSE) && 2915 (spa->spa_to_condense.cancelled == B_FALSE)) { 2916 return (B_TRUE); 2917 } 2918 return (B_FALSE); 2919 } 2920 2921 static void 2922 spa_start_livelist_condensing_thread(spa_t *spa) 2923 { 2924 spa->spa_to_condense.ds = NULL; 2925 spa->spa_to_condense.first = NULL; 2926 spa->spa_to_condense.next = NULL; 2927 spa->spa_to_condense.syncing = B_FALSE; 2928 spa->spa_to_condense.cancelled = B_FALSE; 2929 2930 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 2931 spa->spa_livelist_condense_zthr = 2932 zthr_create("z_livelist_condense", 2933 spa_livelist_condense_cb_check, 2934 spa_livelist_condense_cb, spa, minclsyspri); 2935 } 2936 2937 static void 2938 spa_spawn_aux_threads(spa_t *spa) 2939 { 2940 ASSERT(spa_writeable(spa)); 2941 2942 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2943 2944 spa_start_indirect_condensing_thread(spa); 2945 spa_start_livelist_destroy_thread(spa); 2946 spa_start_livelist_condensing_thread(spa); 2947 2948 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2949 spa->spa_checkpoint_discard_zthr = 2950 zthr_create("z_checkpoint_discard", 2951 spa_checkpoint_discard_thread_check, 2952 spa_checkpoint_discard_thread, spa, minclsyspri); 2953 } 2954 2955 /* 2956 * Fix up config after a partly-completed split. This is done with the 2957 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2958 * pool have that entry in their config, but only the splitting one contains 2959 * a list of all the guids of the vdevs that are being split off. 2960 * 2961 * This function determines what to do with that list: either rejoin 2962 * all the disks to the pool, or complete the splitting process. To attempt 2963 * the rejoin, each disk that is offlined is marked online again, and 2964 * we do a reopen() call. If the vdev label for every disk that was 2965 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2966 * then we call vdev_split() on each disk, and complete the split. 2967 * 2968 * Otherwise we leave the config alone, with all the vdevs in place in 2969 * the original pool. 2970 */ 2971 static void 2972 spa_try_repair(spa_t *spa, nvlist_t *config) 2973 { 2974 uint_t extracted; 2975 uint64_t *glist; 2976 uint_t i, gcount; 2977 nvlist_t *nvl; 2978 vdev_t **vd; 2979 boolean_t attempt_reopen; 2980 2981 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2982 return; 2983 2984 /* check that the config is complete */ 2985 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2986 &glist, &gcount) != 0) 2987 return; 2988 2989 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2990 2991 /* attempt to online all the vdevs & validate */ 2992 attempt_reopen = B_TRUE; 2993 for (i = 0; i < gcount; i++) { 2994 if (glist[i] == 0) /* vdev is hole */ 2995 continue; 2996 2997 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2998 if (vd[i] == NULL) { 2999 /* 3000 * Don't bother attempting to reopen the disks; 3001 * just do the split. 
3002 */ 3003 attempt_reopen = B_FALSE; 3004 } else { 3005 /* attempt to re-online it */ 3006 vd[i]->vdev_offline = B_FALSE; 3007 } 3008 } 3009 3010 if (attempt_reopen) { 3011 vdev_reopen(spa->spa_root_vdev); 3012 3013 /* check each device to see what state it's in */ 3014 for (extracted = 0, i = 0; i < gcount; i++) { 3015 if (vd[i] != NULL && 3016 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3017 break; 3018 ++extracted; 3019 } 3020 } 3021 3022 /* 3023 * If every disk has been moved to the new pool, or if we never 3024 * even attempted to look at them, then we split them off for 3025 * good. 3026 */ 3027 if (!attempt_reopen || gcount == extracted) { 3028 for (i = 0; i < gcount; i++) 3029 if (vd[i] != NULL) 3030 vdev_split(vd[i]); 3031 vdev_reopen(spa->spa_root_vdev); 3032 } 3033 3034 kmem_free(vd, gcount * sizeof (vdev_t *)); 3035 } 3036 3037 static int 3038 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3039 { 3040 const char *ereport = FM_EREPORT_ZFS_POOL; 3041 int error; 3042 3043 spa->spa_load_state = state; 3044 (void) spa_import_progress_set_state(spa_guid(spa), 3045 spa_load_state(spa)); 3046 3047 gethrestime(&spa->spa_loaded_ts); 3048 error = spa_load_impl(spa, type, &ereport); 3049 3050 /* 3051 * Don't count references from objsets that are already closed 3052 * and are making their way through the eviction process. 3053 */ 3054 spa_evicting_os_wait(spa); 3055 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3056 if (error) { 3057 if (error != EEXIST) { 3058 spa->spa_loaded_ts.tv_sec = 0; 3059 spa->spa_loaded_ts.tv_nsec = 0; 3060 } 3061 if (error != EBADF) { 3062 (void) zfs_ereport_post(ereport, spa, 3063 NULL, NULL, NULL, 0); 3064 } 3065 } 3066 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3067 spa->spa_ena = 0; 3068 3069 (void) spa_import_progress_set_state(spa_guid(spa), 3070 spa_load_state(spa)); 3071 3072 return (error); 3073 } 3074 3075 #ifdef ZFS_DEBUG 3076 /* 3077 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3078 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3079 * spa's per-vdev ZAP list. 3080 */ 3081 static uint64_t 3082 vdev_count_verify_zaps(vdev_t *vd) 3083 { 3084 spa_t *spa = vd->vdev_spa; 3085 uint64_t total = 0; 3086 3087 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3088 vd->vdev_root_zap != 0) { 3089 total++; 3090 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3091 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3092 } 3093 if (vd->vdev_top_zap != 0) { 3094 total++; 3095 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3096 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3097 } 3098 if (vd->vdev_leaf_zap != 0) { 3099 total++; 3100 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3101 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3102 } 3103 3104 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3105 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3106 } 3107 3108 return (total); 3109 } 3110 #else 3111 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3112 #endif 3113 3114 /* 3115 * Determine whether the activity check is required. 
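 *
 * An approximate restatement of the logic below (illustrative only;
 * the code is authoritative):
 *
 *	required =
 *	    !(spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) &&
 *	    !(ub_mmp_magic == MMP_MAGIC && ub_mmp_delay == 0) &&
 *	    uberblock does not match the earlier tryimport results &&
 *	    label hostid != spa_get_hostid(spa) &&
 *	    state == POOL_STATE_ACTIVE;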
3116 */ 3117 static boolean_t 3118 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3119 nvlist_t *config) 3120 { 3121 uint64_t state = 0; 3122 uint64_t hostid = 0; 3123 uint64_t tryconfig_txg = 0; 3124 uint64_t tryconfig_timestamp = 0; 3125 uint16_t tryconfig_mmp_seq = 0; 3126 nvlist_t *nvinfo; 3127 3128 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3129 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3130 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3131 &tryconfig_txg); 3132 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3133 &tryconfig_timestamp); 3134 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3135 &tryconfig_mmp_seq); 3136 } 3137 3138 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3139 3140 /* 3141 * Disable the MMP activity check - This is used by zdb which 3142 * is intended to be used on potentially active pools. 3143 */ 3144 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3145 return (B_FALSE); 3146 3147 /* 3148 * Skip the activity check when the MMP feature is disabled. 3149 */ 3150 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3151 return (B_FALSE); 3152 3153 /* 3154 * If the tryconfig_ values are nonzero, they are the results of an 3155 * earlier tryimport. If they all match the uberblock we just found, 3156 * then the pool has not changed and we return false so we do not test 3157 * a second time. 3158 */ 3159 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3160 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3161 tryconfig_mmp_seq && tryconfig_mmp_seq == 3162 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3163 return (B_FALSE); 3164 3165 /* 3166 * Allow the activity check to be skipped when importing the pool 3167 * on the same host which last imported it. Since the hostid from 3168 * configuration may be stale use the one read from the label. 3169 */ 3170 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3171 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3172 3173 if (hostid == spa_get_hostid(spa)) 3174 return (B_FALSE); 3175 3176 /* 3177 * Skip the activity test when the pool was cleanly exported. 3178 */ 3179 if (state != POOL_STATE_ACTIVE) 3180 return (B_FALSE); 3181 3182 return (B_TRUE); 3183 } 3184 3185 /* 3186 * Nanoseconds the activity check must watch for changes on-disk. 3187 */ 3188 static uint64_t 3189 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3190 { 3191 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3192 uint64_t multihost_interval = MSEC2NSEC( 3193 MMP_INTERVAL_OK(zfs_multihost_interval)); 3194 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3195 multihost_interval); 3196 3197 /* 3198 * Local tunables determine a minimum duration except for the case 3199 * where we know when the remote host will suspend the pool if MMP 3200 * writes do not land. 3201 * 3202 * See Big Theory comment at the top of mmp.c for the reasoning behind 3203 * these cases and times. 
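 *
 * Worked example with illustrative tunable values (not a statement of
 * the defaults): with zfs_multihost_interval = 1000ms and
 * zfs_multihost_import_intervals = 20, the baseline computed above is
 * MAX(1s, 20 * 1s) = 20s. If the uberblock advertises
 * MMP_FAIL_INT(ub) = 10 and MMP_INTERVAL(ub) = 1000ms, the first
 * branch below instead yields 10 * 1s * MMP_IMPORT_SAFETY_FACTOR / 100,
 * i.e. 20s for a safety factor of 200.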
3204 */ 3205 3206 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3207 3208 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3209 MMP_FAIL_INT(ub) > 0) { 3210 3211 /* MMP on remote host will suspend pool after failed writes */ 3212 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3213 MMP_IMPORT_SAFETY_FACTOR / 100; 3214 3215 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3216 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3217 "import_intervals=%llu", (u_longlong_t)import_delay, 3218 (u_longlong_t)MMP_FAIL_INT(ub), 3219 (u_longlong_t)MMP_INTERVAL(ub), 3220 (u_longlong_t)import_intervals); 3221 3222 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3223 MMP_FAIL_INT(ub) == 0) { 3224 3225 /* MMP on remote host will never suspend pool */ 3226 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3227 ub->ub_mmp_delay) * import_intervals); 3228 3229 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3230 "mmp_interval=%llu ub_mmp_delay=%llu " 3231 "import_intervals=%llu", (u_longlong_t)import_delay, 3232 (u_longlong_t)MMP_INTERVAL(ub), 3233 (u_longlong_t)ub->ub_mmp_delay, 3234 (u_longlong_t)import_intervals); 3235 3236 } else if (MMP_VALID(ub)) { 3237 /* 3238 * zfs-0.7 compatibility case 3239 */ 3240 3241 import_delay = MAX(import_delay, (multihost_interval + 3242 ub->ub_mmp_delay) * import_intervals); 3243 3244 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3245 "import_intervals=%llu leaves=%u", 3246 (u_longlong_t)import_delay, 3247 (u_longlong_t)ub->ub_mmp_delay, 3248 (u_longlong_t)import_intervals, 3249 vdev_count_leaves(spa)); 3250 } else { 3251 /* Using local tunings is the only reasonable option */ 3252 zfs_dbgmsg("pool last imported on non-MMP aware " 3253 "host using import_delay=%llu multihost_interval=%llu " 3254 "import_intervals=%llu", (u_longlong_t)import_delay, 3255 (u_longlong_t)multihost_interval, 3256 (u_longlong_t)import_intervals); 3257 } 3258 3259 return (import_delay); 3260 } 3261 3262 /* 3263 * Perform the import activity check. If the user canceled the import or 3264 * we detected activity then fail. 3265 */ 3266 static int 3267 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3268 { 3269 uint64_t txg = ub->ub_txg; 3270 uint64_t timestamp = ub->ub_timestamp; 3271 uint64_t mmp_config = ub->ub_mmp_config; 3272 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3273 uint64_t import_delay; 3274 hrtime_t import_expire; 3275 nvlist_t *mmp_label = NULL; 3276 vdev_t *rvd = spa->spa_root_vdev; 3277 kcondvar_t cv; 3278 kmutex_t mtx; 3279 int error = 0; 3280 3281 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3282 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3283 mutex_enter(&mtx); 3284 3285 /* 3286 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3287 * during the earlier tryimport. If the txg recorded there is 0 then 3288 * the pool is known to be active on another host. 3289 * 3290 * Otherwise, the pool might be in use on another host. Check for 3291 * changes in the uberblocks on disk if necessary. 
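 *
 * For example, with import_delay = 20s the random factor below adds
 * up to another 5s (0-25%), and the wait loop then re-reads the
 * uberblocks roughly once per second (cv_timedwait_sig() with a one
 * second timeout) until either the deadline passes or a change in
 * txg, timestamp or mmp_seq is observed, in which case the import
 * fails with EREMOTEIO.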
3292 */ 3293 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3294 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3295 ZPOOL_CONFIG_LOAD_INFO); 3296 3297 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3298 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3299 vdev_uberblock_load(rvd, ub, &mmp_label); 3300 error = SET_ERROR(EREMOTEIO); 3301 goto out; 3302 } 3303 } 3304 3305 import_delay = spa_activity_check_duration(spa, ub); 3306 3307 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3308 import_delay += import_delay * random_in_range(250) / 1000; 3309 3310 import_expire = gethrtime() + import_delay; 3311 3312 while (gethrtime() < import_expire) { 3313 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3314 NSEC2SEC(import_expire - gethrtime())); 3315 3316 vdev_uberblock_load(rvd, ub, &mmp_label); 3317 3318 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3319 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3320 zfs_dbgmsg("multihost activity detected " 3321 "txg %llu ub_txg %llu " 3322 "timestamp %llu ub_timestamp %llu " 3323 "mmp_config %#llx ub_mmp_config %#llx", 3324 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3325 (u_longlong_t)timestamp, 3326 (u_longlong_t)ub->ub_timestamp, 3327 (u_longlong_t)mmp_config, 3328 (u_longlong_t)ub->ub_mmp_config); 3329 3330 error = SET_ERROR(EREMOTEIO); 3331 break; 3332 } 3333 3334 if (mmp_label) { 3335 nvlist_free(mmp_label); 3336 mmp_label = NULL; 3337 } 3338 3339 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3340 if (error != -1) { 3341 error = SET_ERROR(EINTR); 3342 break; 3343 } 3344 error = 0; 3345 } 3346 3347 out: 3348 mutex_exit(&mtx); 3349 mutex_destroy(&mtx); 3350 cv_destroy(&cv); 3351 3352 /* 3353 * If the pool is determined to be active store the status in the 3354 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3355 * available from configuration read from disk store them as well. 3356 * This allows 'zpool import' to generate a more useful message. 
3357 * 3358 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3359 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3360 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3361 */ 3362 if (error == EREMOTEIO) { 3363 const char *hostname = "<unknown>"; 3364 uint64_t hostid = 0; 3365 3366 if (mmp_label) { 3367 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3368 hostname = fnvlist_lookup_string(mmp_label, 3369 ZPOOL_CONFIG_HOSTNAME); 3370 fnvlist_add_string(spa->spa_load_info, 3371 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3372 } 3373 3374 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3375 hostid = fnvlist_lookup_uint64(mmp_label, 3376 ZPOOL_CONFIG_HOSTID); 3377 fnvlist_add_uint64(spa->spa_load_info, 3378 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3379 } 3380 } 3381 3382 fnvlist_add_uint64(spa->spa_load_info, 3383 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3384 fnvlist_add_uint64(spa->spa_load_info, 3385 ZPOOL_CONFIG_MMP_TXG, 0); 3386 3387 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3388 } 3389 3390 if (mmp_label) 3391 nvlist_free(mmp_label); 3392 3393 return (error); 3394 } 3395 3396 static int 3397 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3398 { 3399 uint64_t hostid; 3400 const char *hostname; 3401 uint64_t myhostid = 0; 3402 3403 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3404 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3405 hostname = fnvlist_lookup_string(mos_config, 3406 ZPOOL_CONFIG_HOSTNAME); 3407 3408 myhostid = zone_get_hostid(NULL); 3409 3410 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3411 cmn_err(CE_WARN, "pool '%s' could not be " 3412 "loaded as it was last accessed by " 3413 "another system (host: %s hostid: 0x%llx). " 3414 "See: https://openzfs.github.io/openzfs-docs/msg/" 3415 "ZFS-8000-EY", 3416 spa_name(spa), hostname, (u_longlong_t)hostid); 3417 spa_load_failed(spa, "hostid verification failed: pool " 3418 "last accessed by host: %s (hostid: 0x%llx)", 3419 hostname, (u_longlong_t)hostid); 3420 return (SET_ERROR(EBADF)); 3421 } 3422 } 3423 3424 return (0); 3425 } 3426 3427 static int 3428 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3429 { 3430 int error = 0; 3431 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3432 int parse; 3433 vdev_t *rvd; 3434 uint64_t pool_guid; 3435 const char *comment; 3436 const char *compatibility; 3437 3438 /* 3439 * Versioning wasn't explicitly added to the label until later, so if 3440 * it's not present treat it as the initial version. 3441 */ 3442 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3443 &spa->spa_ubsync.ub_version) != 0) 3444 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3445 3446 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3447 spa_load_failed(spa, "invalid config provided: '%s' missing", 3448 ZPOOL_CONFIG_POOL_GUID); 3449 return (SET_ERROR(EINVAL)); 3450 } 3451 3452 /* 3453 * If we are doing an import, ensure that the pool is not already 3454 * imported by checking if its pool guid already exists in the 3455 * spa namespace. 3456 * 3457 * The only case that we allow an already imported pool to be 3458 * imported again, is when the pool is checkpointed and we want to 3459 * look at its checkpointed state from userland tools like zdb. 
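 *
 * For example, importing a pool whose guid is already present in the
 * spa namespace fails below with EEXIST; in userland builds the one
 * exception is when spa_importing_readonly_checkpoint() is true,
 * which is how zdb opens the checkpointed state of a pool that is
 * already imported.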
3460 */ 3461 #ifdef _KERNEL 3462 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3463 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3464 spa_guid_exists(pool_guid, 0)) { 3465 #else 3466 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3467 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3468 spa_guid_exists(pool_guid, 0) && 3469 !spa_importing_readonly_checkpoint(spa)) { 3470 #endif 3471 spa_load_failed(spa, "a pool with guid %llu is already open", 3472 (u_longlong_t)pool_guid); 3473 return (SET_ERROR(EEXIST)); 3474 } 3475 3476 spa->spa_config_guid = pool_guid; 3477 3478 nvlist_free(spa->spa_load_info); 3479 spa->spa_load_info = fnvlist_alloc(); 3480 3481 ASSERT(spa->spa_comment == NULL); 3482 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3483 spa->spa_comment = spa_strdup(comment); 3484 3485 ASSERT(spa->spa_compatibility == NULL); 3486 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3487 &compatibility) == 0) 3488 spa->spa_compatibility = spa_strdup(compatibility); 3489 3490 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3491 &spa->spa_config_txg); 3492 3493 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3494 spa->spa_config_splitting = fnvlist_dup(nvl); 3495 3496 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3497 spa_load_failed(spa, "invalid config provided: '%s' missing", 3498 ZPOOL_CONFIG_VDEV_TREE); 3499 return (SET_ERROR(EINVAL)); 3500 } 3501 3502 /* 3503 * Create "The Godfather" zio to hold all async IOs 3504 */ 3505 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3506 KM_SLEEP); 3507 for (int i = 0; i < max_ncpus; i++) { 3508 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3509 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3510 ZIO_FLAG_GODFATHER); 3511 } 3512 3513 /* 3514 * Parse the configuration into a vdev tree. We explicitly set the 3515 * value that will be returned by spa_version() since parsing the 3516 * configuration requires knowing the version number. 3517 */ 3518 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3519 parse = (type == SPA_IMPORT_EXISTING ? 3520 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3521 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3522 spa_config_exit(spa, SCL_ALL, FTAG); 3523 3524 if (error != 0) { 3525 spa_load_failed(spa, "unable to parse config [error=%d]", 3526 error); 3527 return (error); 3528 } 3529 3530 ASSERT(spa->spa_root_vdev == rvd); 3531 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3532 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3533 3534 if (type != SPA_IMPORT_ASSEMBLE) { 3535 ASSERT(spa_guid(spa) == pool_guid); 3536 } 3537 3538 return (0); 3539 } 3540 3541 /* 3542 * Recursively open all vdevs in the vdev tree. This function is called twice: 3543 * first with the untrusted config, then with the trusted config. 3544 */ 3545 static int 3546 spa_ld_open_vdevs(spa_t *spa) 3547 { 3548 int error = 0; 3549 3550 /* 3551 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3552 * missing/unopenable for the root vdev to be still considered openable. 
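 *
 * Illustration of which tunable applies below: a trusted config uses
 * zfs_max_missing_tvds, an untrusted config read from the cachefile
 * uses zfs_max_missing_tvds_cachefile, and a config obtained by
 * scanning devices uses zfs_max_missing_tvds_scan; the MAX() that
 * follows ensures the limit is never lower than zfs_max_missing_tvds
 * itself.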
3553 */ 3554 if (spa->spa_trust_config) { 3555 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3556 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3557 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3558 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3559 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3560 } else { 3561 spa->spa_missing_tvds_allowed = 0; 3562 } 3563 3564 spa->spa_missing_tvds_allowed = 3565 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3566 3567 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3568 error = vdev_open(spa->spa_root_vdev); 3569 spa_config_exit(spa, SCL_ALL, FTAG); 3570 3571 if (spa->spa_missing_tvds != 0) { 3572 spa_load_note(spa, "vdev tree has %lld missing top-level " 3573 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3574 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3575 /* 3576 * Although theoretically we could allow users to open 3577 * incomplete pools in RW mode, we'd need to add a lot 3578 * of extra logic (e.g. adjust pool space to account 3579 * for missing vdevs). 3580 * This limitation also prevents users from accidentally 3581 * opening the pool in RW mode during data recovery and 3582 * damaging it further. 3583 */ 3584 spa_load_note(spa, "pools with missing top-level " 3585 "vdevs can only be opened in read-only mode."); 3586 error = SET_ERROR(ENXIO); 3587 } else { 3588 spa_load_note(spa, "current settings allow for maximum " 3589 "%lld missing top-level vdevs at this stage.", 3590 (u_longlong_t)spa->spa_missing_tvds_allowed); 3591 } 3592 } 3593 if (error != 0) { 3594 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3595 error); 3596 } 3597 if (spa->spa_missing_tvds != 0 || error != 0) 3598 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3599 3600 return (error); 3601 } 3602 3603 /* 3604 * We need to validate the vdev labels against the configuration that 3605 * we have in hand. This function is called twice: first with an untrusted 3606 * config, then with a trusted config. The validation is more strict when the 3607 * config is trusted. 3608 */ 3609 static int 3610 spa_ld_validate_vdevs(spa_t *spa) 3611 { 3612 int error = 0; 3613 vdev_t *rvd = spa->spa_root_vdev; 3614 3615 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3616 error = vdev_validate(rvd); 3617 spa_config_exit(spa, SCL_ALL, FTAG); 3618 3619 if (error != 0) { 3620 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3621 return (error); 3622 } 3623 3624 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3625 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3626 "some vdevs"); 3627 vdev_dbgmsg_print_tree(rvd, 2); 3628 return (SET_ERROR(ENXIO)); 3629 } 3630 3631 return (0); 3632 } 3633 3634 static void 3635 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3636 { 3637 spa->spa_state = POOL_STATE_ACTIVE; 3638 spa->spa_ubsync = spa->spa_uberblock; 3639 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 3640 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 3641 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
3642 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 3643 spa->spa_claim_max_txg = spa->spa_first_txg; 3644 spa->spa_prev_software_version = ub->ub_software_version; 3645 } 3646 3647 static int 3648 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 3649 { 3650 vdev_t *rvd = spa->spa_root_vdev; 3651 nvlist_t *label; 3652 uberblock_t *ub = &spa->spa_uberblock; 3653 boolean_t activity_check = B_FALSE; 3654 3655 /* 3656 * If we are opening the checkpointed state of the pool by 3657 * rewinding to it, at this point we will have written the 3658 * checkpointed uberblock to the vdev labels, so searching 3659 * the labels will find the right uberblock. However, if 3660 * we are opening the checkpointed state read-only, we have 3661 * not modified the labels. Therefore, we must ignore the 3662 * labels and continue using the spa_uberblock that was set 3663 * by spa_ld_checkpoint_rewind. 3664 * 3665 * Note that it would be fine to ignore the labels when 3666 * rewinding (opening writeable) as well. However, if we 3667 * crash just after writing the labels, we will end up 3668 * searching the labels. Doing so in the common case means 3669 * that this code path gets exercised normally, rather than 3670 * just in the edge case. 3671 */ 3672 if (ub->ub_checkpoint_txg != 0 && 3673 spa_importing_readonly_checkpoint(spa)) { 3674 spa_ld_select_uberblock_done(spa, ub); 3675 return (0); 3676 } 3677 3678 /* 3679 * Find the best uberblock. 3680 */ 3681 vdev_uberblock_load(rvd, ub, &label); 3682 3683 /* 3684 * If we weren't able to find a single valid uberblock, return failure. 3685 */ 3686 if (ub->ub_txg == 0) { 3687 nvlist_free(label); 3688 spa_load_failed(spa, "no valid uberblock found"); 3689 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 3690 } 3691 3692 if (spa->spa_load_max_txg != UINT64_MAX) { 3693 (void) spa_import_progress_set_max_txg(spa_guid(spa), 3694 (u_longlong_t)spa->spa_load_max_txg); 3695 } 3696 spa_load_note(spa, "using uberblock with txg=%llu", 3697 (u_longlong_t)ub->ub_txg); 3698 3699 3700 /* 3701 * For pools which have the multihost property on determine if the 3702 * pool is truly inactive and can be safely imported. Prevent 3703 * hosts which don't have a hostid set from importing the pool. 3704 */ 3705 activity_check = spa_activity_check_required(spa, ub, label, 3706 spa->spa_config); 3707 if (activity_check) { 3708 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 3709 spa_get_hostid(spa) == 0) { 3710 nvlist_free(label); 3711 fnvlist_add_uint64(spa->spa_load_info, 3712 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 3713 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 3714 } 3715 3716 int error = spa_activity_check(spa, ub, spa->spa_config); 3717 if (error) { 3718 nvlist_free(label); 3719 return (error); 3720 } 3721 3722 fnvlist_add_uint64(spa->spa_load_info, 3723 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 3724 fnvlist_add_uint64(spa->spa_load_info, 3725 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 3726 fnvlist_add_uint16(spa->spa_load_info, 3727 ZPOOL_CONFIG_MMP_SEQ, 3728 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 3729 } 3730 3731 /* 3732 * If the pool has an unsupported version we can't open it. 
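 *
 * For example, an uberblock whose ub_version is newer than this code
 * supports fails below with VDEV_AUX_VERSION_NEWER and ENOTSUP, which
 * userland tooling can then surface as a "pool created by a newer
 * software version" condition (the exact wording comes from the CLI,
 * not from here).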
3733 */ 3734 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 3735 nvlist_free(label); 3736 spa_load_failed(spa, "version %llu is not supported", 3737 (u_longlong_t)ub->ub_version); 3738 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 3739 } 3740 3741 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3742 nvlist_t *features; 3743 3744 /* 3745 * If we weren't able to find what's necessary for reading the 3746 * MOS in the label, return failure. 3747 */ 3748 if (label == NULL) { 3749 spa_load_failed(spa, "label config unavailable"); 3750 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3751 ENXIO)); 3752 } 3753 3754 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 3755 &features) != 0) { 3756 nvlist_free(label); 3757 spa_load_failed(spa, "invalid label: '%s' missing", 3758 ZPOOL_CONFIG_FEATURES_FOR_READ); 3759 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3760 ENXIO)); 3761 } 3762 3763 /* 3764 * Update our in-core representation with the definitive values 3765 * from the label. 3766 */ 3767 nvlist_free(spa->spa_label_features); 3768 spa->spa_label_features = fnvlist_dup(features); 3769 } 3770 3771 nvlist_free(label); 3772 3773 /* 3774 * Look through entries in the label nvlist's features_for_read. If 3775 * there is a feature listed there which we don't understand then we 3776 * cannot open a pool. 3777 */ 3778 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3779 nvlist_t *unsup_feat; 3780 3781 unsup_feat = fnvlist_alloc(); 3782 3783 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 3784 NULL); nvp != NULL; 3785 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 3786 if (!zfeature_is_supported(nvpair_name(nvp))) { 3787 fnvlist_add_string(unsup_feat, 3788 nvpair_name(nvp), ""); 3789 } 3790 } 3791 3792 if (!nvlist_empty(unsup_feat)) { 3793 fnvlist_add_nvlist(spa->spa_load_info, 3794 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3795 nvlist_free(unsup_feat); 3796 spa_load_failed(spa, "some features are unsupported"); 3797 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3798 ENOTSUP)); 3799 } 3800 3801 nvlist_free(unsup_feat); 3802 } 3803 3804 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 3805 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3806 spa_try_repair(spa, spa->spa_config); 3807 spa_config_exit(spa, SCL_ALL, FTAG); 3808 nvlist_free(spa->spa_config_splitting); 3809 spa->spa_config_splitting = NULL; 3810 } 3811 3812 /* 3813 * Initialize internal SPA structures. 
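 * spa_ld_select_uberblock_done() records the chosen uberblock and
 * derives spa_first_txg, spa_claim_max_txg and spa_verify_min_txg
 * from it.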
3814 */ 3815 spa_ld_select_uberblock_done(spa, ub); 3816 3817 return (0); 3818 } 3819 3820 static int 3821 spa_ld_open_rootbp(spa_t *spa) 3822 { 3823 int error = 0; 3824 vdev_t *rvd = spa->spa_root_vdev; 3825 3826 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 3827 if (error != 0) { 3828 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 3829 "[error=%d]", error); 3830 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3831 } 3832 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 3833 3834 return (0); 3835 } 3836 3837 static int 3838 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 3839 boolean_t reloading) 3840 { 3841 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 3842 nvlist_t *nv, *mos_config, *policy; 3843 int error = 0, copy_error; 3844 uint64_t healthy_tvds, healthy_tvds_mos; 3845 uint64_t mos_config_txg; 3846 3847 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 3848 != 0) 3849 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3850 3851 /* 3852 * If we're assembling a pool from a split, the config provided is 3853 * already trusted so there is nothing to do. 3854 */ 3855 if (type == SPA_IMPORT_ASSEMBLE) 3856 return (0); 3857 3858 healthy_tvds = spa_healthy_core_tvds(spa); 3859 3860 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 3861 != 0) { 3862 spa_load_failed(spa, "unable to retrieve MOS config"); 3863 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3864 } 3865 3866 /* 3867 * If we are doing an open, pool owner wasn't verified yet, thus do 3868 * the verification here. 3869 */ 3870 if (spa->spa_load_state == SPA_LOAD_OPEN) { 3871 error = spa_verify_host(spa, mos_config); 3872 if (error != 0) { 3873 nvlist_free(mos_config); 3874 return (error); 3875 } 3876 } 3877 3878 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 3879 3880 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3881 3882 /* 3883 * Build a new vdev tree from the trusted config 3884 */ 3885 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 3886 if (error != 0) { 3887 nvlist_free(mos_config); 3888 spa_config_exit(spa, SCL_ALL, FTAG); 3889 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 3890 error); 3891 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3892 } 3893 3894 /* 3895 * Vdev paths in the MOS may be obsolete. If the untrusted config was 3896 * obtained by scanning /dev/dsk, then it will have the right vdev 3897 * paths. We update the trusted MOS config with this information. 3898 * We first try to copy the paths with vdev_copy_path_strict, which 3899 * succeeds only when both configs have exactly the same vdev tree. 3900 * If that fails, we fall back to a more flexible method that has a 3901 * best effort policy. 3902 */ 3903 copy_error = vdev_copy_path_strict(rvd, mrvd); 3904 if (copy_error != 0 || spa_load_print_vdev_tree) { 3905 spa_load_note(spa, "provided vdev tree:"); 3906 vdev_dbgmsg_print_tree(rvd, 2); 3907 spa_load_note(spa, "MOS vdev tree:"); 3908 vdev_dbgmsg_print_tree(mrvd, 2); 3909 } 3910 if (copy_error != 0) { 3911 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 3912 "back to vdev_copy_path_relaxed"); 3913 vdev_copy_path_relaxed(rvd, mrvd); 3914 } 3915 3916 vdev_close(rvd); 3917 vdev_free(rvd); 3918 spa->spa_root_vdev = mrvd; 3919 rvd = mrvd; 3920 spa_config_exit(spa, SCL_ALL, FTAG); 3921 3922 /* 3923 * We will use spa_config if we decide to reload the spa or if spa_load 3924 * fails and we rewind. 
We must thus regenerate the config using the 3925 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 3926 * pass settings on how to load the pool and is not stored in the MOS. 3927 * We copy it over to our new, trusted config. 3928 */ 3929 mos_config_txg = fnvlist_lookup_uint64(mos_config, 3930 ZPOOL_CONFIG_POOL_TXG); 3931 nvlist_free(mos_config); 3932 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 3933 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 3934 &policy) == 0) 3935 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 3936 spa_config_set(spa, mos_config); 3937 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 3938 3939 /* 3940 * Now that we got the config from the MOS, we should be more strict 3941 * in checking blkptrs and can make assumptions about the consistency 3942 * of the vdev tree. spa_trust_config must be set to true before opening 3943 * vdevs in order for them to be writeable. 3944 */ 3945 spa->spa_trust_config = B_TRUE; 3946 3947 /* 3948 * Open and validate the new vdev tree 3949 */ 3950 error = spa_ld_open_vdevs(spa); 3951 if (error != 0) 3952 return (error); 3953 3954 error = spa_ld_validate_vdevs(spa); 3955 if (error != 0) 3956 return (error); 3957 3958 if (copy_error != 0 || spa_load_print_vdev_tree) { 3959 spa_load_note(spa, "final vdev tree:"); 3960 vdev_dbgmsg_print_tree(rvd, 2); 3961 } 3962 3963 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 3964 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 3965 /* 3966 * Sanity check to make sure that we are indeed loading the 3967 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 3968 * in the config provided and they happened to be the only ones 3969 * to have the latest uberblock, we could involuntarily perform 3970 * an extreme rewind. 3971 */ 3972 healthy_tvds_mos = spa_healthy_core_tvds(spa); 3973 if (healthy_tvds_mos - healthy_tvds >= 3974 SPA_SYNC_MIN_VDEVS) { 3975 spa_load_note(spa, "config provided misses too many " 3976 "top-level vdevs compared to MOS (%lld vs %lld). ", 3977 (u_longlong_t)healthy_tvds, 3978 (u_longlong_t)healthy_tvds_mos); 3979 spa_load_note(spa, "vdev tree:"); 3980 vdev_dbgmsg_print_tree(rvd, 2); 3981 if (reloading) { 3982 spa_load_failed(spa, "config was already " 3983 "provided from MOS. Aborting."); 3984 return (spa_vdev_err(rvd, 3985 VDEV_AUX_CORRUPT_DATA, EIO)); 3986 } 3987 spa_load_note(spa, "spa must be reloaded using MOS " 3988 "config"); 3989 return (SET_ERROR(EAGAIN)); 3990 } 3991 } 3992 3993 error = spa_check_for_missing_logs(spa); 3994 if (error != 0) 3995 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 3996 3997 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 3998 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 3999 "guid sum (%llu != %llu)", 4000 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4001 (u_longlong_t)rvd->vdev_guid_sum); 4002 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4003 ENXIO)); 4004 } 4005 4006 return (0); 4007 } 4008 4009 static int 4010 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4011 { 4012 int error = 0; 4013 vdev_t *rvd = spa->spa_root_vdev; 4014 4015 /* 4016 * Everything that we read before spa_remove_init() must be stored 4017 * on concreted vdevs. Therefore we do this as early as possible. 
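 * (Until the mappings are loaded we cannot read from indirect vdevs
 * at all; see also the comment at the call site in spa_load_impl().)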
4018 */ 4019 error = spa_remove_init(spa); 4020 if (error != 0) { 4021 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4022 error); 4023 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4024 } 4025 4026 /* 4027 * Retrieve information needed to condense indirect vdev mappings. 4028 */ 4029 error = spa_condense_init(spa); 4030 if (error != 0) { 4031 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4032 error); 4033 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4034 } 4035 4036 return (0); 4037 } 4038 4039 static int 4040 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4041 { 4042 int error = 0; 4043 vdev_t *rvd = spa->spa_root_vdev; 4044 4045 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4046 boolean_t missing_feat_read = B_FALSE; 4047 nvlist_t *unsup_feat, *enabled_feat; 4048 4049 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4050 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4051 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4052 } 4053 4054 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4055 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4056 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4057 } 4058 4059 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4060 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4061 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4062 } 4063 4064 enabled_feat = fnvlist_alloc(); 4065 unsup_feat = fnvlist_alloc(); 4066 4067 if (!spa_features_check(spa, B_FALSE, 4068 unsup_feat, enabled_feat)) 4069 missing_feat_read = B_TRUE; 4070 4071 if (spa_writeable(spa) || 4072 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4073 if (!spa_features_check(spa, B_TRUE, 4074 unsup_feat, enabled_feat)) { 4075 *missing_feat_writep = B_TRUE; 4076 } 4077 } 4078 4079 fnvlist_add_nvlist(spa->spa_load_info, 4080 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4081 4082 if (!nvlist_empty(unsup_feat)) { 4083 fnvlist_add_nvlist(spa->spa_load_info, 4084 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4085 } 4086 4087 fnvlist_free(enabled_feat); 4088 fnvlist_free(unsup_feat); 4089 4090 if (!missing_feat_read) { 4091 fnvlist_add_boolean(spa->spa_load_info, 4092 ZPOOL_CONFIG_CAN_RDONLY); 4093 } 4094 4095 /* 4096 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4097 * twofold: to determine whether the pool is available for 4098 * import in read-write mode and (if it is not) whether the 4099 * pool is available for import in read-only mode. If the pool 4100 * is available for import in read-write mode, it is displayed 4101 * as available in userland; if it is not available for import 4102 * in read-only mode, it is displayed as unavailable in 4103 * userland. If the pool is available for import in read-only 4104 * mode but not read-write mode, it is displayed as unavailable 4105 * in userland with a special note that the pool is actually 4106 * available for open in read-only mode. 4107 * 4108 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4109 * missing a feature for write, we must first determine whether 4110 * the pool can be opened read-only before returning to 4111 * userland in order to know whether to display the 4112 * abovementioned note. 4113 */ 4114 if (missing_feat_read || (*missing_feat_writep && 4115 spa_writeable(spa))) { 4116 spa_load_failed(spa, "pool uses unsupported features"); 4117 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4118 ENOTSUP)); 4119 } 4120 4121 /* 4122 * Load refcounts for ZFS features from disk into an in-memory 4123 * cache during SPA initialization. 
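 * These cached counts are what spa_feature_stats_from_cache() later
 * reports, which avoids issuing I/O to a possibly suspended pool when
 * feature stats are requested.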
4124 */ 4125 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4126 uint64_t refcount; 4127 4128 error = feature_get_refcount_from_disk(spa, 4129 &spa_feature_table[i], &refcount); 4130 if (error == 0) { 4131 spa->spa_feat_refcount_cache[i] = refcount; 4132 } else if (error == ENOTSUP) { 4133 spa->spa_feat_refcount_cache[i] = 4134 SPA_FEATURE_DISABLED; 4135 } else { 4136 spa_load_failed(spa, "error getting refcount " 4137 "for feature %s [error=%d]", 4138 spa_feature_table[i].fi_guid, error); 4139 return (spa_vdev_err(rvd, 4140 VDEV_AUX_CORRUPT_DATA, EIO)); 4141 } 4142 } 4143 } 4144 4145 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4146 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4147 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4148 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4149 } 4150 4151 /* 4152 * Encryption was added before bookmark_v2, even though bookmark_v2 4153 * is now a dependency. If this pool has encryption enabled without 4154 * bookmark_v2, trigger an errata message. 4155 */ 4156 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4157 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4158 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4159 } 4160 4161 return (0); 4162 } 4163 4164 static int 4165 spa_ld_load_special_directories(spa_t *spa) 4166 { 4167 int error = 0; 4168 vdev_t *rvd = spa->spa_root_vdev; 4169 4170 spa->spa_is_initializing = B_TRUE; 4171 error = dsl_pool_open(spa->spa_dsl_pool); 4172 spa->spa_is_initializing = B_FALSE; 4173 if (error != 0) { 4174 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4175 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4176 } 4177 4178 return (0); 4179 } 4180 4181 static int 4182 spa_ld_get_props(spa_t *spa) 4183 { 4184 int error = 0; 4185 uint64_t obj; 4186 vdev_t *rvd = spa->spa_root_vdev; 4187 4188 /* Grab the checksum salt from the MOS. */ 4189 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4190 DMU_POOL_CHECKSUM_SALT, 1, 4191 sizeof (spa->spa_cksum_salt.zcs_bytes), 4192 spa->spa_cksum_salt.zcs_bytes); 4193 if (error == ENOENT) { 4194 /* Generate a new salt for subsequent use */ 4195 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4196 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4197 } else if (error != 0) { 4198 spa_load_failed(spa, "unable to retrieve checksum salt from " 4199 "MOS [error=%d]", error); 4200 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4201 } 4202 4203 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4204 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4205 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4206 if (error != 0) { 4207 spa_load_failed(spa, "error opening deferred-frees bpobj " 4208 "[error=%d]", error); 4209 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4210 } 4211 4212 /* 4213 * Load the bit that tells us to use the new accounting function 4214 * (raid-z deflation). If we have an older pool, this will not 4215 * be present. 4216 */ 4217 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4218 if (error != 0 && error != ENOENT) 4219 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4220 4221 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4222 &spa->spa_creation_version, B_FALSE); 4223 if (error != 0 && error != ENOENT) 4224 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4225 4226 /* 4227 * Load the persistent error log. If we have an older pool, this will 4228 * not be present. 
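 * (ENOENT from spa_dir_prop() is therefore tolerated below.)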
4229 */ 4230 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4231 B_FALSE); 4232 if (error != 0 && error != ENOENT) 4233 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4234 4235 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4236 &spa->spa_errlog_scrub, B_FALSE); 4237 if (error != 0 && error != ENOENT) 4238 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4239 4240 /* 4241 * Load the livelist deletion field. If a livelist is queued for 4242 * deletion, indicate that in the spa 4243 */ 4244 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4245 &spa->spa_livelists_to_delete, B_FALSE); 4246 if (error != 0 && error != ENOENT) 4247 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4248 4249 /* 4250 * Load the history object. If we have an older pool, this 4251 * will not be present. 4252 */ 4253 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4254 if (error != 0 && error != ENOENT) 4255 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4256 4257 /* 4258 * Load the per-vdev ZAP map. If we have an older pool, this will not 4259 * be present; in this case, defer its creation to a later time to 4260 * avoid dirtying the MOS this early / out of sync context. See 4261 * spa_sync_config_object. 4262 */ 4263 4264 /* The sentinel is only available in the MOS config. */ 4265 nvlist_t *mos_config; 4266 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4267 spa_load_failed(spa, "unable to retrieve MOS config"); 4268 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4269 } 4270 4271 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4272 &spa->spa_all_vdev_zaps, B_FALSE); 4273 4274 if (error == ENOENT) { 4275 VERIFY(!nvlist_exists(mos_config, 4276 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4277 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4278 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4279 } else if (error != 0) { 4280 nvlist_free(mos_config); 4281 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4282 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4283 /* 4284 * An older version of ZFS overwrote the sentinel value, so 4285 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4286 * destruction to later; see spa_sync_config_object. 4287 */ 4288 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4289 /* 4290 * We're assuming that no vdevs have had their ZAPs created 4291 * before this. Better be sure of it. 
4292 */ 4293 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4294 } 4295 nvlist_free(mos_config); 4296 4297 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4298 4299 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4300 B_FALSE); 4301 if (error && error != ENOENT) 4302 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4303 4304 if (error == 0) { 4305 uint64_t autoreplace = 0; 4306 4307 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4308 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4309 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4310 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4311 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4312 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4313 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4314 spa->spa_autoreplace = (autoreplace != 0); 4315 } 4316 4317 /* 4318 * If we are importing a pool with missing top-level vdevs, 4319 * we enforce that the pool doesn't panic or get suspended on 4320 * error since the likelihood of missing data is extremely high. 4321 */ 4322 if (spa->spa_missing_tvds > 0 && 4323 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4324 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4325 spa_load_note(spa, "forcing failmode to 'continue' " 4326 "as some top level vdevs are missing"); 4327 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4328 } 4329 4330 return (0); 4331 } 4332 4333 static int 4334 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4335 { 4336 int error = 0; 4337 vdev_t *rvd = spa->spa_root_vdev; 4338 4339 /* 4340 * If we're assembling the pool from the split-off vdevs of 4341 * an existing pool, we don't want to attach the spares & cache 4342 * devices. 4343 */ 4344 4345 /* 4346 * Load any hot spares for this pool. 4347 */ 4348 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4349 B_FALSE); 4350 if (error != 0 && error != ENOENT) 4351 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4352 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4353 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4354 if (load_nvlist(spa, spa->spa_spares.sav_object, 4355 &spa->spa_spares.sav_config) != 0) { 4356 spa_load_failed(spa, "error loading spares nvlist"); 4357 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4358 } 4359 4360 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4361 spa_load_spares(spa); 4362 spa_config_exit(spa, SCL_ALL, FTAG); 4363 } else if (error == 0) { 4364 spa->spa_spares.sav_sync = B_TRUE; 4365 } 4366 4367 /* 4368 * Load any level 2 ARC devices for this pool. 
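 * As with the spares above, these are not attached when assembling a
 * pool from split-off vdevs (SPA_IMPORT_ASSEMBLE).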
4369 */ 4370 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4371 &spa->spa_l2cache.sav_object, B_FALSE); 4372 if (error != 0 && error != ENOENT) 4373 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4374 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4375 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4376 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4377 &spa->spa_l2cache.sav_config) != 0) { 4378 spa_load_failed(spa, "error loading l2cache nvlist"); 4379 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4380 } 4381 4382 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4383 spa_load_l2cache(spa); 4384 spa_config_exit(spa, SCL_ALL, FTAG); 4385 } else if (error == 0) { 4386 spa->spa_l2cache.sav_sync = B_TRUE; 4387 } 4388 4389 return (0); 4390 } 4391 4392 static int 4393 spa_ld_load_vdev_metadata(spa_t *spa) 4394 { 4395 int error = 0; 4396 vdev_t *rvd = spa->spa_root_vdev; 4397 4398 /* 4399 * If the 'multihost' property is set, then never allow a pool to 4400 * be imported when the system hostid is zero. The exception to 4401 * this rule is zdb which is always allowed to access pools. 4402 */ 4403 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4404 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4405 fnvlist_add_uint64(spa->spa_load_info, 4406 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4407 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4408 } 4409 4410 /* 4411 * If the 'autoreplace' property is set, then post a resource notifying 4412 * the ZFS DE that it should not issue any faults for unopenable 4413 * devices. We also iterate over the vdevs, and post a sysevent for any 4414 * unopenable vdevs so that the normal autoreplace handler can take 4415 * over. 4416 */ 4417 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4418 spa_check_removed(spa->spa_root_vdev); 4419 /* 4420 * For the import case, this is done in spa_import(), because 4421 * at this point we're using the spare definitions from 4422 * the MOS config, not necessarily from the userland config. 4423 */ 4424 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4425 spa_aux_check_removed(&spa->spa_spares); 4426 spa_aux_check_removed(&spa->spa_l2cache); 4427 } 4428 } 4429 4430 /* 4431 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4432 */ 4433 error = vdev_load(rvd); 4434 if (error != 0) { 4435 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4436 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4437 } 4438 4439 error = spa_ld_log_spacemaps(spa); 4440 if (error != 0) { 4441 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4442 error); 4443 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4444 } 4445 4446 /* 4447 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
4448 */ 4449 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4450 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4451 spa_config_exit(spa, SCL_ALL, FTAG); 4452 4453 return (0); 4454 } 4455 4456 static int 4457 spa_ld_load_dedup_tables(spa_t *spa) 4458 { 4459 int error = 0; 4460 vdev_t *rvd = spa->spa_root_vdev; 4461 4462 error = ddt_load(spa); 4463 if (error != 0) { 4464 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4465 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4466 } 4467 4468 return (0); 4469 } 4470 4471 static int 4472 spa_ld_load_brt(spa_t *spa) 4473 { 4474 int error = 0; 4475 vdev_t *rvd = spa->spa_root_vdev; 4476 4477 error = brt_load(spa); 4478 if (error != 0) { 4479 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4480 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4481 } 4482 4483 return (0); 4484 } 4485 4486 static int 4487 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4488 { 4489 vdev_t *rvd = spa->spa_root_vdev; 4490 4491 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4492 boolean_t missing = spa_check_logs(spa); 4493 if (missing) { 4494 if (spa->spa_missing_tvds != 0) { 4495 spa_load_note(spa, "spa_check_logs failed " 4496 "so dropping the logs"); 4497 } else { 4498 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4499 spa_load_failed(spa, "spa_check_logs failed"); 4500 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4501 ENXIO)); 4502 } 4503 } 4504 } 4505 4506 return (0); 4507 } 4508 4509 static int 4510 spa_ld_verify_pool_data(spa_t *spa) 4511 { 4512 int error = 0; 4513 vdev_t *rvd = spa->spa_root_vdev; 4514 4515 /* 4516 * We've successfully opened the pool, verify that we're ready 4517 * to start pushing transactions. 4518 */ 4519 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4520 error = spa_load_verify(spa); 4521 if (error != 0) { 4522 spa_load_failed(spa, "spa_load_verify failed " 4523 "[error=%d]", error); 4524 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4525 error)); 4526 } 4527 } 4528 4529 return (0); 4530 } 4531 4532 static void 4533 spa_ld_claim_log_blocks(spa_t *spa) 4534 { 4535 dmu_tx_t *tx; 4536 dsl_pool_t *dp = spa_get_dsl(spa); 4537 4538 /* 4539 * Claim log blocks that haven't been committed yet. 4540 * This must all happen in a single txg. 4541 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4542 * invoked from zil_claim_log_block()'s i/o done callback. 4543 * Price of rollback is that we abandon the log. 4544 */ 4545 spa->spa_claiming = B_TRUE; 4546 4547 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4548 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4549 zil_claim, tx, DS_FIND_CHILDREN); 4550 dmu_tx_commit(tx); 4551 4552 spa->spa_claiming = B_FALSE; 4553 4554 spa_set_log_state(spa, SPA_LOG_GOOD); 4555 } 4556 4557 static void 4558 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4559 boolean_t update_config_cache) 4560 { 4561 vdev_t *rvd = spa->spa_root_vdev; 4562 int need_update = B_FALSE; 4563 4564 /* 4565 * If the config cache is stale, or we have uninitialized 4566 * metaslabs (see spa_vdev_add()), then update the config. 4567 * 4568 * If this is a verbatim import, trust the current 4569 * in-core spa_config and update the disk labels. 
4570 */ 4571 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4572 spa->spa_load_state == SPA_LOAD_IMPORT || 4573 spa->spa_load_state == SPA_LOAD_RECOVER || 4574 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4575 need_update = B_TRUE; 4576 4577 for (int c = 0; c < rvd->vdev_children; c++) 4578 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4579 need_update = B_TRUE; 4580 4581 /* 4582 * Update the config cache asynchronously in case we're the 4583 * root pool, in which case the config cache isn't writable yet. 4584 */ 4585 if (need_update) 4586 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4587 } 4588 4589 static void 4590 spa_ld_prepare_for_reload(spa_t *spa) 4591 { 4592 spa_mode_t mode = spa->spa_mode; 4593 int async_suspended = spa->spa_async_suspended; 4594 4595 spa_unload(spa); 4596 spa_deactivate(spa); 4597 spa_activate(spa, mode); 4598 4599 /* 4600 * We save the value of spa_async_suspended as it gets reset to 0 by 4601 * spa_unload(). We want to restore it back to the original value before 4602 * returning as we might be calling spa_async_resume() later. 4603 */ 4604 spa->spa_async_suspended = async_suspended; 4605 } 4606 4607 static int 4608 spa_ld_read_checkpoint_txg(spa_t *spa) 4609 { 4610 uberblock_t checkpoint; 4611 int error = 0; 4612 4613 ASSERT0(spa->spa_checkpoint_txg); 4614 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4615 4616 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4617 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4618 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4619 4620 if (error == ENOENT) 4621 return (0); 4622 4623 if (error != 0) 4624 return (error); 4625 4626 ASSERT3U(checkpoint.ub_txg, !=, 0); 4627 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 4628 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 4629 spa->spa_checkpoint_txg = checkpoint.ub_txg; 4630 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 4631 4632 return (0); 4633 } 4634 4635 static int 4636 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 4637 { 4638 int error = 0; 4639 4640 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4641 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4642 4643 /* 4644 * Never trust the config that is provided unless we are assembling 4645 * a pool following a split. 4646 * This means don't trust blkptrs and the vdev tree in general. This 4647 * also effectively puts the spa in read-only mode since 4648 * spa_writeable() checks for spa_trust_config to be true. 4649 * We will later load a trusted config from the MOS. 4650 */ 4651 if (type != SPA_IMPORT_ASSEMBLE) 4652 spa->spa_trust_config = B_FALSE; 4653 4654 /* 4655 * Parse the config provided to create a vdev tree. 4656 */ 4657 error = spa_ld_parse_config(spa, type); 4658 if (error != 0) 4659 return (error); 4660 4661 spa_import_progress_add(spa); 4662 4663 /* 4664 * Now that we have the vdev tree, try to open each vdev. This involves 4665 * opening the underlying physical device, retrieving its geometry and 4666 * probing the vdev with a dummy I/O. The state of each vdev will be set 4667 * based on the success of those operations. After this we'll be ready 4668 * to read from the vdevs. 4669 */ 4670 error = spa_ld_open_vdevs(spa); 4671 if (error != 0) 4672 return (error); 4673 4674 /* 4675 * Read the label of each vdev and make sure that the GUIDs stored 4676 * there match the GUIDs in the config provided. 
4677 * If we're assembling a new pool that's been split off from an 4678 * existing pool, the labels haven't yet been updated so we skip 4679 * validation for now. 4680 */ 4681 if (type != SPA_IMPORT_ASSEMBLE) { 4682 error = spa_ld_validate_vdevs(spa); 4683 if (error != 0) 4684 return (error); 4685 } 4686 4687 /* 4688 * Read all vdev labels to find the best uberblock (i.e. latest, 4689 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 4690 * get the list of features required to read blkptrs in the MOS from 4691 * the vdev label with the best uberblock and verify that our version 4692 * of zfs supports them all. 4693 */ 4694 error = spa_ld_select_uberblock(spa, type); 4695 if (error != 0) 4696 return (error); 4697 4698 /* 4699 * Pass that uberblock to the dsl_pool layer which will open the root 4700 * blkptr. This blkptr points to the latest version of the MOS and will 4701 * allow us to read its contents. 4702 */ 4703 error = spa_ld_open_rootbp(spa); 4704 if (error != 0) 4705 return (error); 4706 4707 return (0); 4708 } 4709 4710 static int 4711 spa_ld_checkpoint_rewind(spa_t *spa) 4712 { 4713 uberblock_t checkpoint; 4714 int error = 0; 4715 4716 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4717 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4718 4719 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4720 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4721 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4722 4723 if (error != 0) { 4724 spa_load_failed(spa, "unable to retrieve checkpointed " 4725 "uberblock from the MOS config [error=%d]", error); 4726 4727 if (error == ENOENT) 4728 error = ZFS_ERR_NO_CHECKPOINT; 4729 4730 return (error); 4731 } 4732 4733 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 4734 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 4735 4736 /* 4737 * We need to update the txg and timestamp of the checkpointed 4738 * uberblock to be higher than the latest one. This ensures that 4739 * the checkpointed uberblock is selected if we were to close and 4740 * reopen the pool right after we've written it in the vdev labels. 4741 * (also see block comment in vdev_uberblock_compare) 4742 */ 4743 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 4744 checkpoint.ub_timestamp = gethrestime_sec(); 4745 4746 /* 4747 * Set current uberblock to be the checkpointed uberblock. 4748 */ 4749 spa->spa_uberblock = checkpoint; 4750 4751 /* 4752 * If we are doing a normal rewind, then the pool is open for 4753 * writing and we sync the "updated" checkpointed uberblock to 4754 * disk. Once this is done, we've basically rewound the whole 4755 * pool and there is no way back. 4756 * 4757 * There are cases when we don't want to attempt and sync the 4758 * checkpointed uberblock to disk because we are opening a 4759 * pool as read-only. Specifically, verifying the checkpointed 4760 * state with zdb, and importing the checkpointed state to get 4761 * a "preview" of its content. 
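 * When we do sync it, the code below writes the updated uberblock to
 * the labels of at most SPA_SYNC_MIN_VDEVS top-level vdevs, starting
 * from a randomly chosen child and skipping log vdevs, non-concrete
 * vdevs and vdevs that have no metaslab array yet.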
4762 */ 4763 if (spa_writeable(spa)) { 4764 vdev_t *rvd = spa->spa_root_vdev; 4765 4766 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4767 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 4768 int svdcount = 0; 4769 int children = rvd->vdev_children; 4770 int c0 = random_in_range(children); 4771 4772 for (int c = 0; c < children; c++) { 4773 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 4774 4775 /* Stop when revisiting the first vdev */ 4776 if (c > 0 && svd[0] == vd) 4777 break; 4778 4779 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 4780 !vdev_is_concrete(vd)) 4781 continue; 4782 4783 svd[svdcount++] = vd; 4784 if (svdcount == SPA_SYNC_MIN_VDEVS) 4785 break; 4786 } 4787 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 4788 if (error == 0) 4789 spa->spa_last_synced_guid = rvd->vdev_guid; 4790 spa_config_exit(spa, SCL_ALL, FTAG); 4791 4792 if (error != 0) { 4793 spa_load_failed(spa, "failed to write checkpointed " 4794 "uberblock to the vdev labels [error=%d]", error); 4795 return (error); 4796 } 4797 } 4798 4799 return (0); 4800 } 4801 4802 static int 4803 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 4804 boolean_t *update_config_cache) 4805 { 4806 int error; 4807 4808 /* 4809 * Parse the config for pool, open and validate vdevs, 4810 * select an uberblock, and use that uberblock to open 4811 * the MOS. 4812 */ 4813 error = spa_ld_mos_init(spa, type); 4814 if (error != 0) 4815 return (error); 4816 4817 /* 4818 * Retrieve the trusted config stored in the MOS and use it to create 4819 * a new, exact version of the vdev tree, then reopen all vdevs. 4820 */ 4821 error = spa_ld_trusted_config(spa, type, B_FALSE); 4822 if (error == EAGAIN) { 4823 if (update_config_cache != NULL) 4824 *update_config_cache = B_TRUE; 4825 4826 /* 4827 * Redo the loading process with the trusted config if it is 4828 * too different from the untrusted config. 4829 */ 4830 spa_ld_prepare_for_reload(spa); 4831 spa_load_note(spa, "RELOADING"); 4832 error = spa_ld_mos_init(spa, type); 4833 if (error != 0) 4834 return (error); 4835 4836 error = spa_ld_trusted_config(spa, type, B_TRUE); 4837 if (error != 0) 4838 return (error); 4839 4840 } else if (error != 0) { 4841 return (error); 4842 } 4843 4844 return (0); 4845 } 4846 4847 /* 4848 * Load an existing storage pool, using the config provided. This config 4849 * describes which vdevs are part of the pool and is later validated against 4850 * partial configs present in each vdev's label and an entire copy of the 4851 * config stored in the MOS. 4852 */ 4853 static int 4854 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 4855 { 4856 int error = 0; 4857 boolean_t missing_feat_write = B_FALSE; 4858 boolean_t checkpoint_rewind = 4859 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4860 boolean_t update_config_cache = B_FALSE; 4861 4862 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4863 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4864 4865 spa_load_note(spa, "LOADING"); 4866 4867 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 4868 if (error != 0) 4869 return (error); 4870 4871 /* 4872 * If we are rewinding to the checkpoint then we need to repeat 4873 * everything we've done so far in this function but this time 4874 * selecting the checkpointed uberblock and using that to open 4875 * the MOS. 4876 */ 4877 if (checkpoint_rewind) { 4878 /* 4879 * If we are rewinding to the checkpoint update config cache 4880 * anyway. 
4881 */ 4882 update_config_cache = B_TRUE; 4883 4884 /* 4885 * Extract the checkpointed uberblock from the current MOS 4886 * and use this as the pool's uberblock from now on. If the 4887 * pool is imported as writeable we also write the checkpoint 4888 * uberblock to the labels, making the rewind permanent. 4889 */ 4890 error = spa_ld_checkpoint_rewind(spa); 4891 if (error != 0) 4892 return (error); 4893 4894 /* 4895 * Redo the loading process again with the 4896 * checkpointed uberblock. 4897 */ 4898 spa_ld_prepare_for_reload(spa); 4899 spa_load_note(spa, "LOADING checkpointed uberblock"); 4900 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 4901 if (error != 0) 4902 return (error); 4903 } 4904 4905 /* 4906 * Retrieve the checkpoint txg if the pool has a checkpoint. 4907 */ 4908 error = spa_ld_read_checkpoint_txg(spa); 4909 if (error != 0) 4910 return (error); 4911 4912 /* 4913 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 4914 * from the pool and their contents were re-mapped to other vdevs. Note 4915 * that everything that we read before this step must have been 4916 * rewritten on concrete vdevs after the last device removal was 4917 * initiated. Otherwise we could be reading from indirect vdevs before 4918 * we have loaded their mappings. 4919 */ 4920 error = spa_ld_open_indirect_vdev_metadata(spa); 4921 if (error != 0) 4922 return (error); 4923 4924 /* 4925 * Retrieve the full list of active features from the MOS and check if 4926 * they are all supported. 4927 */ 4928 error = spa_ld_check_features(spa, &missing_feat_write); 4929 if (error != 0) 4930 return (error); 4931 4932 /* 4933 * Load several special directories from the MOS needed by the dsl_pool 4934 * layer. 4935 */ 4936 error = spa_ld_load_special_directories(spa); 4937 if (error != 0) 4938 return (error); 4939 4940 /* 4941 * Retrieve pool properties from the MOS. 4942 */ 4943 error = spa_ld_get_props(spa); 4944 if (error != 0) 4945 return (error); 4946 4947 /* 4948 * Retrieve the list of auxiliary devices - cache devices and spares - 4949 * and open them. 4950 */ 4951 error = spa_ld_open_aux_vdevs(spa, type); 4952 if (error != 0) 4953 return (error); 4954 4955 /* 4956 * Load the metadata for all vdevs. Also check if unopenable devices 4957 * should be autoreplaced. 4958 */ 4959 error = spa_ld_load_vdev_metadata(spa); 4960 if (error != 0) 4961 return (error); 4962 4963 error = spa_ld_load_dedup_tables(spa); 4964 if (error != 0) 4965 return (error); 4966 4967 error = spa_ld_load_brt(spa); 4968 if (error != 0) 4969 return (error); 4970 4971 /* 4972 * Verify the logs now to make sure we don't have any unexpected errors 4973 * when we claim log blocks later. 4974 */ 4975 error = spa_ld_verify_logs(spa, type, ereport); 4976 if (error != 0) 4977 return (error); 4978 4979 if (missing_feat_write) { 4980 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 4981 4982 /* 4983 * At this point, we know that we can open the pool in 4984 * read-only mode but not read-write mode. We now have enough 4985 * information and can return to userland. 4986 */ 4987 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 4988 ENOTSUP)); 4989 } 4990 4991 /* 4992 * Traverse the last txgs to make sure the pool was left off in a safe 4993 * state. When performing an extreme rewind, we verify the whole pool, 4994 * which can take a very long time. 4995 */ 4996 error = spa_ld_verify_pool_data(spa); 4997 if (error != 0) 4998 return (error); 4999 5000 /* 5001 * Calculate the deflated space for the pool. 
This must be done before 5002 * we write anything to the pool because we'd need to update the space 5003 * accounting using the deflated sizes. 5004 */ 5005 spa_update_dspace(spa); 5006 5007 /* 5008 * We have now retrieved all the information we needed to open the 5009 * pool. If we are importing the pool in read-write mode, a few 5010 * additional steps must be performed to finish the import. 5011 */ 5012 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5013 spa->spa_load_max_txg == UINT64_MAX)) { 5014 uint64_t config_cache_txg = spa->spa_config_txg; 5015 5016 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5017 5018 /* 5019 * In case of a checkpoint rewind, log the original txg 5020 * of the checkpointed uberblock. 5021 */ 5022 if (checkpoint_rewind) { 5023 spa_history_log_internal(spa, "checkpoint rewind", 5024 NULL, "rewound state to txg=%llu", 5025 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5026 } 5027 5028 /* 5029 * Traverse the ZIL and claim all blocks. 5030 */ 5031 spa_ld_claim_log_blocks(spa); 5032 5033 /* 5034 * Kick-off the syncing thread. 5035 */ 5036 spa->spa_sync_on = B_TRUE; 5037 txg_sync_start(spa->spa_dsl_pool); 5038 mmp_thread_start(spa); 5039 5040 /* 5041 * Wait for all claims to sync. We sync up to the highest 5042 * claimed log block birth time so that claimed log blocks 5043 * don't appear to be from the future. spa_claim_max_txg 5044 * will have been set for us by ZIL traversal operations 5045 * performed above. 5046 */ 5047 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5048 5049 /* 5050 * Check if we need to request an update of the config. On the 5051 * next sync, we would update the config stored in vdev labels 5052 * and the cachefile (by default /etc/zfs/zpool.cache). 5053 */ 5054 spa_ld_check_for_config_update(spa, config_cache_txg, 5055 update_config_cache); 5056 5057 /* 5058 * Check if a rebuild was in progress and if so resume it. 5059 * Then check all DTLs to see if anything needs resilvering. 5060 * The resilver will be deferred if a rebuild was started. 5061 */ 5062 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5063 vdev_rebuild_restart(spa); 5064 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5065 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5066 spa_async_request(spa, SPA_ASYNC_RESILVER); 5067 } 5068 5069 /* 5070 * Log the fact that we booted up (so that we can detect if 5071 * we rebooted in the middle of an operation). 5072 */ 5073 spa_history_log_version(spa, "open", NULL); 5074 5075 spa_restart_removal(spa); 5076 spa_spawn_aux_threads(spa); 5077 5078 /* 5079 * Delete any inconsistent datasets. 5080 * 5081 * Note: 5082 * Since we may be issuing deletes for clones here, 5083 * we make sure to do so after we've spawned all the 5084 * auxiliary threads above (from which the livelist 5085 * deletion zthr is part of). 5086 */ 5087 (void) dmu_objset_find(spa_name(spa), 5088 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5089 5090 /* 5091 * Clean up any stale temporary dataset userrefs. 
5092 */ 5093 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5094 5095 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5096 vdev_initialize_restart(spa->spa_root_vdev); 5097 vdev_trim_restart(spa->spa_root_vdev); 5098 vdev_autotrim_restart(spa); 5099 spa_config_exit(spa, SCL_CONFIG, FTAG); 5100 } 5101 5102 spa_import_progress_remove(spa_guid(spa)); 5103 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5104 5105 spa_load_note(spa, "LOADED"); 5106 5107 return (0); 5108 } 5109 5110 static int 5111 spa_load_retry(spa_t *spa, spa_load_state_t state) 5112 { 5113 spa_mode_t mode = spa->spa_mode; 5114 5115 spa_unload(spa); 5116 spa_deactivate(spa); 5117 5118 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5119 5120 spa_activate(spa, mode); 5121 spa_async_suspend(spa); 5122 5123 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5124 (u_longlong_t)spa->spa_load_max_txg); 5125 5126 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5127 } 5128 5129 /* 5130 * If spa_load() fails this function will try loading prior txg's. If 5131 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5132 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5133 * function will not rewind the pool and will return the same error as 5134 * spa_load(). 5135 */ 5136 static int 5137 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5138 int rewind_flags) 5139 { 5140 nvlist_t *loadinfo = NULL; 5141 nvlist_t *config = NULL; 5142 int load_error, rewind_error; 5143 uint64_t safe_rewind_txg; 5144 uint64_t min_txg; 5145 5146 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5147 spa->spa_load_max_txg = spa->spa_load_txg; 5148 spa_set_log_state(spa, SPA_LOG_CLEAR); 5149 } else { 5150 spa->spa_load_max_txg = max_request; 5151 if (max_request != UINT64_MAX) 5152 spa->spa_extreme_rewind = B_TRUE; 5153 } 5154 5155 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5156 if (load_error == 0) 5157 return (0); 5158 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5159 /* 5160 * When attempting checkpoint-rewind on a pool with no 5161 * checkpoint, we should not attempt to load uberblocks 5162 * from previous txgs when spa_load fails. 5163 */ 5164 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5165 spa_import_progress_remove(spa_guid(spa)); 5166 return (load_error); 5167 } 5168 5169 if (spa->spa_root_vdev != NULL) 5170 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5171 5172 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5173 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5174 5175 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5176 nvlist_free(config); 5177 spa_import_progress_remove(spa_guid(spa)); 5178 return (load_error); 5179 } 5180 5181 if (state == SPA_LOAD_RECOVER) { 5182 /* Price of rolling back is discarding txgs, including log */ 5183 spa_set_log_state(spa, SPA_LOG_CLEAR); 5184 } else { 5185 /* 5186 * If we aren't rolling back save the load info from our first 5187 * import attempt so that we can restore it after attempting 5188 * to rewind. 5189 */ 5190 loadinfo = spa->spa_load_info; 5191 spa->spa_load_info = fnvlist_alloc(); 5192 } 5193 5194 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5195 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5196 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
5197 TXG_INITIAL : safe_rewind_txg; 5198 5199 /* 5200 * Continue as long as we're finding errors, we're still within 5201 * the acceptable rewind range, and we're still finding uberblocks 5202 */ 5203 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5204 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5205 if (spa->spa_load_max_txg < safe_rewind_txg) 5206 spa->spa_extreme_rewind = B_TRUE; 5207 rewind_error = spa_load_retry(spa, state); 5208 } 5209 5210 spa->spa_extreme_rewind = B_FALSE; 5211 spa->spa_load_max_txg = UINT64_MAX; 5212 5213 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5214 spa_config_set(spa, config); 5215 else 5216 nvlist_free(config); 5217 5218 if (state == SPA_LOAD_RECOVER) { 5219 ASSERT3P(loadinfo, ==, NULL); 5220 spa_import_progress_remove(spa_guid(spa)); 5221 return (rewind_error); 5222 } else { 5223 /* Store the rewind info as part of the initial load info */ 5224 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5225 spa->spa_load_info); 5226 5227 /* Restore the initial load info */ 5228 fnvlist_free(spa->spa_load_info); 5229 spa->spa_load_info = loadinfo; 5230 5231 spa_import_progress_remove(spa_guid(spa)); 5232 return (load_error); 5233 } 5234 } 5235 5236 /* 5237 * Pool Open/Import 5238 * 5239 * The import case is identical to an open except that the configuration is sent 5240 * down from userland, instead of grabbed from the configuration cache. For the 5241 * case of an open, the pool configuration will exist in the 5242 * POOL_STATE_UNINITIALIZED state. 5243 * 5244 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5245 * the same time open the pool, without having to keep around the spa_t in some 5246 * ambiguous state. 5247 */ 5248 static int 5249 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5250 nvlist_t *nvpolicy, nvlist_t **config) 5251 { 5252 spa_t *spa; 5253 spa_load_state_t state = SPA_LOAD_OPEN; 5254 int error; 5255 int locked = B_FALSE; 5256 int firstopen = B_FALSE; 5257 5258 *spapp = NULL; 5259 5260 /* 5261 * As disgusting as this is, we need to support recursive calls to this 5262 * function because dsl_dir_open() is called during spa_load(), and ends 5263 * up calling spa_open() again. The real fix is to figure out how to 5264 * avoid dsl_dir_open() calling this in the first place. 5265 */ 5266 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5267 mutex_enter(&spa_namespace_lock); 5268 locked = B_TRUE; 5269 } 5270 5271 if ((spa = spa_lookup(pool)) == NULL) { 5272 if (locked) 5273 mutex_exit(&spa_namespace_lock); 5274 return (SET_ERROR(ENOENT)); 5275 } 5276 5277 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5278 zpool_load_policy_t policy; 5279 5280 firstopen = B_TRUE; 5281 5282 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5283 &policy); 5284 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5285 state = SPA_LOAD_RECOVER; 5286 5287 spa_activate(spa, spa_mode_global); 5288 5289 if (state != SPA_LOAD_RECOVER) 5290 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5291 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5292 5293 zfs_dbgmsg("spa_open_common: opening %s", pool); 5294 error = spa_load_best(spa, state, policy.zlp_txg, 5295 policy.zlp_rewind); 5296 5297 if (error == EBADF) { 5298 /* 5299 * If vdev_validate() returns failure (indicated by 5300 * EBADF), it indicates that one of the vdevs indicates 5301 * that the pool has been exported or destroyed. 
If 5302 * this is the case, the config cache is out of sync and 5303 * we should remove the pool from the namespace. 5304 */ 5305 spa_unload(spa); 5306 spa_deactivate(spa); 5307 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5308 spa_remove(spa); 5309 if (locked) 5310 mutex_exit(&spa_namespace_lock); 5311 return (SET_ERROR(ENOENT)); 5312 } 5313 5314 if (error) { 5315 /* 5316 * We can't open the pool, but we still have useful 5317 * information: the state of each vdev after the 5318 * attempted vdev_open(). Return this to the user. 5319 */ 5320 if (config != NULL && spa->spa_config) { 5321 *config = fnvlist_dup(spa->spa_config); 5322 fnvlist_add_nvlist(*config, 5323 ZPOOL_CONFIG_LOAD_INFO, 5324 spa->spa_load_info); 5325 } 5326 spa_unload(spa); 5327 spa_deactivate(spa); 5328 spa->spa_last_open_failed = error; 5329 if (locked) 5330 mutex_exit(&spa_namespace_lock); 5331 *spapp = NULL; 5332 return (error); 5333 } 5334 } 5335 5336 spa_open_ref(spa, tag); 5337 5338 if (config != NULL) 5339 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5340 5341 /* 5342 * If we've recovered the pool, pass back any information we 5343 * gathered while doing the load. 5344 */ 5345 if (state == SPA_LOAD_RECOVER && config != NULL) { 5346 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5347 spa->spa_load_info); 5348 } 5349 5350 if (locked) { 5351 spa->spa_last_open_failed = 0; 5352 spa->spa_last_ubsync_txg = 0; 5353 spa->spa_load_txg = 0; 5354 mutex_exit(&spa_namespace_lock); 5355 } 5356 5357 if (firstopen) 5358 zvol_create_minors_recursive(spa_name(spa)); 5359 5360 *spapp = spa; 5361 5362 return (0); 5363 } 5364 5365 int 5366 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5367 nvlist_t *policy, nvlist_t **config) 5368 { 5369 return (spa_open_common(name, spapp, tag, policy, config)); 5370 } 5371 5372 int 5373 spa_open(const char *name, spa_t **spapp, const void *tag) 5374 { 5375 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5376 } 5377 5378 /* 5379 * Lookup the given spa_t, incrementing the inject count in the process, 5380 * preventing it from being exported or destroyed. 5381 */ 5382 spa_t * 5383 spa_inject_addref(char *name) 5384 { 5385 spa_t *spa; 5386 5387 mutex_enter(&spa_namespace_lock); 5388 if ((spa = spa_lookup(name)) == NULL) { 5389 mutex_exit(&spa_namespace_lock); 5390 return (NULL); 5391 } 5392 spa->spa_inject_ref++; 5393 mutex_exit(&spa_namespace_lock); 5394 5395 return (spa); 5396 } 5397 5398 void 5399 spa_inject_delref(spa_t *spa) 5400 { 5401 mutex_enter(&spa_namespace_lock); 5402 spa->spa_inject_ref--; 5403 mutex_exit(&spa_namespace_lock); 5404 } 5405 5406 /* 5407 * Add spares device information to the nvlist. 
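 * Spares that are currently in use as an active spare are reported
 * with vdev state VDEV_STATE_CANT_OPEN and aux state VDEV_AUX_SPARED.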
5408 */ 5409 static void 5410 spa_add_spares(spa_t *spa, nvlist_t *config) 5411 { 5412 nvlist_t **spares; 5413 uint_t i, nspares; 5414 nvlist_t *nvroot; 5415 uint64_t guid; 5416 vdev_stat_t *vs; 5417 uint_t vsc; 5418 uint64_t pool; 5419 5420 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5421 5422 if (spa->spa_spares.sav_count == 0) 5423 return; 5424 5425 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5426 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5427 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5428 if (nspares != 0) { 5429 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5430 (const nvlist_t * const *)spares, nspares); 5431 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5432 &spares, &nspares)); 5433 5434 /* 5435 * Go through and find any spares which have since been 5436 * repurposed as an active spare. If this is the case, update 5437 * their status appropriately. 5438 */ 5439 for (i = 0; i < nspares; i++) { 5440 guid = fnvlist_lookup_uint64(spares[i], 5441 ZPOOL_CONFIG_GUID); 5442 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5443 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5444 if (spa_spare_exists(guid, &pool, NULL) && 5445 pool != 0ULL) { 5446 vs->vs_state = VDEV_STATE_CANT_OPEN; 5447 vs->vs_aux = VDEV_AUX_SPARED; 5448 } else { 5449 vs->vs_state = 5450 spa->spa_spares.sav_vdevs[i]->vdev_state; 5451 } 5452 } 5453 } 5454 } 5455 5456 /* 5457 * Add l2cache device information to the nvlist, including vdev stats. 5458 */ 5459 static void 5460 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5461 { 5462 nvlist_t **l2cache; 5463 uint_t i, j, nl2cache; 5464 nvlist_t *nvroot; 5465 uint64_t guid; 5466 vdev_t *vd; 5467 vdev_stat_t *vs; 5468 uint_t vsc; 5469 5470 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5471 5472 if (spa->spa_l2cache.sav_count == 0) 5473 return; 5474 5475 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5476 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5477 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5478 if (nl2cache != 0) { 5479 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5480 (const nvlist_t * const *)l2cache, nl2cache); 5481 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5482 &l2cache, &nl2cache)); 5483 5484 /* 5485 * Update level 2 cache device stats. 
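 * Each entry is matched to its in-core vdev by GUID before the stats
 * are regenerated.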
5486 */ 5487 5488 for (i = 0; i < nl2cache; i++) { 5489 guid = fnvlist_lookup_uint64(l2cache[i], 5490 ZPOOL_CONFIG_GUID); 5491 5492 vd = NULL; 5493 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5494 if (guid == 5495 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5496 vd = spa->spa_l2cache.sav_vdevs[j]; 5497 break; 5498 } 5499 } 5500 ASSERT(vd != NULL); 5501 5502 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5503 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5504 vdev_get_stats(vd, vs); 5505 vdev_config_generate_stats(vd, l2cache[i]); 5506 5507 } 5508 } 5509 } 5510 5511 static void 5512 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5513 { 5514 zap_cursor_t zc; 5515 zap_attribute_t za; 5516 5517 if (spa->spa_feat_for_read_obj != 0) { 5518 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5519 spa->spa_feat_for_read_obj); 5520 zap_cursor_retrieve(&zc, &za) == 0; 5521 zap_cursor_advance(&zc)) { 5522 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5523 za.za_num_integers == 1); 5524 VERIFY0(nvlist_add_uint64(features, za.za_name, 5525 za.za_first_integer)); 5526 } 5527 zap_cursor_fini(&zc); 5528 } 5529 5530 if (spa->spa_feat_for_write_obj != 0) { 5531 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5532 spa->spa_feat_for_write_obj); 5533 zap_cursor_retrieve(&zc, &za) == 0; 5534 zap_cursor_advance(&zc)) { 5535 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5536 za.za_num_integers == 1); 5537 VERIFY0(nvlist_add_uint64(features, za.za_name, 5538 za.za_first_integer)); 5539 } 5540 zap_cursor_fini(&zc); 5541 } 5542 } 5543 5544 static void 5545 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5546 { 5547 int i; 5548 5549 for (i = 0; i < SPA_FEATURES; i++) { 5550 zfeature_info_t feature = spa_feature_table[i]; 5551 uint64_t refcount; 5552 5553 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5554 continue; 5555 5556 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5557 } 5558 } 5559 5560 /* 5561 * Store a list of pool features and their reference counts in the 5562 * config. 5563 * 5564 * The first time this is called on a spa, allocate a new nvlist, fetch 5565 * the pool features and reference counts from disk, then save the list 5566 * in the spa. In subsequent calls on the same spa use the saved nvlist 5567 * and refresh its values from the cached reference counts. This 5568 * ensures we don't block here on I/O on a suspended pool so 'zpool 5569 * clear' can resume the pool. 
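 * (The cached reference counts are populated from disk by
 * spa_ld_check_features() when the pool is loaded.)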
5570 */ 5571 static void 5572 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5573 { 5574 nvlist_t *features; 5575 5576 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5577 5578 mutex_enter(&spa->spa_feat_stats_lock); 5579 features = spa->spa_feat_stats; 5580 5581 if (features != NULL) { 5582 spa_feature_stats_from_cache(spa, features); 5583 } else { 5584 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 5585 spa->spa_feat_stats = features; 5586 spa_feature_stats_from_disk(spa, features); 5587 } 5588 5589 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 5590 features)); 5591 5592 mutex_exit(&spa->spa_feat_stats_lock); 5593 } 5594 5595 int 5596 spa_get_stats(const char *name, nvlist_t **config, 5597 char *altroot, size_t buflen) 5598 { 5599 int error; 5600 spa_t *spa; 5601 5602 *config = NULL; 5603 error = spa_open_common(name, &spa, FTAG, NULL, config); 5604 5605 if (spa != NULL) { 5606 /* 5607 * This still leaves a window of inconsistency where the spares 5608 * or l2cache devices could change and the config would be 5609 * self-inconsistent. 5610 */ 5611 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5612 5613 if (*config != NULL) { 5614 uint64_t loadtimes[2]; 5615 5616 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 5617 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 5618 fnvlist_add_uint64_array(*config, 5619 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 5620 5621 fnvlist_add_uint64(*config, 5622 ZPOOL_CONFIG_ERRCOUNT, 5623 spa_approx_errlog_size(spa)); 5624 5625 if (spa_suspended(spa)) { 5626 fnvlist_add_uint64(*config, 5627 ZPOOL_CONFIG_SUSPENDED, 5628 spa->spa_failmode); 5629 fnvlist_add_uint64(*config, 5630 ZPOOL_CONFIG_SUSPENDED_REASON, 5631 spa->spa_suspended); 5632 } 5633 5634 spa_add_spares(spa, *config); 5635 spa_add_l2cache(spa, *config); 5636 spa_add_feature_stats(spa, *config); 5637 } 5638 } 5639 5640 /* 5641 * We want to get the alternate root even for faulted pools, so we cheat 5642 * and call spa_lookup() directly. 5643 */ 5644 if (altroot) { 5645 if (spa == NULL) { 5646 mutex_enter(&spa_namespace_lock); 5647 spa = spa_lookup(name); 5648 if (spa) 5649 spa_altroot(spa, altroot, buflen); 5650 else 5651 altroot[0] = '\0'; 5652 spa = NULL; 5653 mutex_exit(&spa_namespace_lock); 5654 } else { 5655 spa_altroot(spa, altroot, buflen); 5656 } 5657 } 5658 5659 if (spa != NULL) { 5660 spa_config_exit(spa, SCL_CONFIG, FTAG); 5661 spa_close(spa, FTAG); 5662 } 5663 5664 return (error); 5665 } 5666 5667 /* 5668 * Validate that the auxiliary device array is well formed. We must have an 5669 * array of nvlists, each which describes a valid leaf vdev. If this is an 5670 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 5671 * specified, as long as they are well-formed. 5672 */ 5673 static int 5674 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 5675 spa_aux_vdev_t *sav, const char *config, uint64_t version, 5676 vdev_labeltype_t label) 5677 { 5678 nvlist_t **dev; 5679 uint_t i, ndev; 5680 vdev_t *vd; 5681 int error; 5682 5683 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5684 5685 /* 5686 * It's acceptable to have no devs specified. 5687 */ 5688 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 5689 return (0); 5690 5691 if (ndev == 0) 5692 return (SET_ERROR(EINVAL)); 5693 5694 /* 5695 * Make sure the pool is formatted with a version that supports this 5696 * device type. 
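 * For the two callers in spa_validate_aux() below this means hot spares
 * require at least SPA_VERSION_SPARES and L2ARC devices at least
 * SPA_VERSION_L2CACHE; pools formatted with an older version get ENOTSUP.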
5697 */ 5698 if (spa_version(spa) < version) 5699 return (SET_ERROR(ENOTSUP)); 5700 5701 /* 5702 * Set the pending device list so we correctly handle device in-use 5703 * checking. 5704 */ 5705 sav->sav_pending = dev; 5706 sav->sav_npending = ndev; 5707 5708 for (i = 0; i < ndev; i++) { 5709 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 5710 mode)) != 0) 5711 goto out; 5712 5713 if (!vd->vdev_ops->vdev_op_leaf) { 5714 vdev_free(vd); 5715 error = SET_ERROR(EINVAL); 5716 goto out; 5717 } 5718 5719 vd->vdev_top = vd; 5720 5721 if ((error = vdev_open(vd)) == 0 && 5722 (error = vdev_label_init(vd, crtxg, label)) == 0) { 5723 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 5724 vd->vdev_guid); 5725 } 5726 5727 vdev_free(vd); 5728 5729 if (error && 5730 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 5731 goto out; 5732 else 5733 error = 0; 5734 } 5735 5736 out: 5737 sav->sav_pending = NULL; 5738 sav->sav_npending = 0; 5739 return (error); 5740 } 5741 5742 static int 5743 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 5744 { 5745 int error; 5746 5747 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5748 5749 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5750 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 5751 VDEV_LABEL_SPARE)) != 0) { 5752 return (error); 5753 } 5754 5755 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5756 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 5757 VDEV_LABEL_L2CACHE)); 5758 } 5759 5760 static void 5761 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 5762 const char *config) 5763 { 5764 int i; 5765 5766 if (sav->sav_config != NULL) { 5767 nvlist_t **olddevs; 5768 uint_t oldndevs; 5769 nvlist_t **newdevs; 5770 5771 /* 5772 * Generate new dev list by concatenating with the 5773 * current dev list. 5774 */ 5775 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 5776 &olddevs, &oldndevs)); 5777 5778 newdevs = kmem_alloc(sizeof (void *) * 5779 (ndevs + oldndevs), KM_SLEEP); 5780 for (i = 0; i < oldndevs; i++) 5781 newdevs[i] = fnvlist_dup(olddevs[i]); 5782 for (i = 0; i < ndevs; i++) 5783 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 5784 5785 fnvlist_remove(sav->sav_config, config); 5786 5787 fnvlist_add_nvlist_array(sav->sav_config, config, 5788 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 5789 for (i = 0; i < oldndevs + ndevs; i++) 5790 nvlist_free(newdevs[i]); 5791 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 5792 } else { 5793 /* 5794 * Generate a new dev list. 5795 */ 5796 sav->sav_config = fnvlist_alloc(); 5797 fnvlist_add_nvlist_array(sav->sav_config, config, 5798 (const nvlist_t * const *)devs, ndevs); 5799 } 5800 } 5801 5802 /* 5803 * Stop and drop level 2 ARC devices 5804 */ 5805 void 5806 spa_l2cache_drop(spa_t *spa) 5807 { 5808 vdev_t *vd; 5809 int i; 5810 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5811 5812 for (i = 0; i < sav->sav_count; i++) { 5813 uint64_t pool; 5814 5815 vd = sav->sav_vdevs[i]; 5816 ASSERT(vd != NULL); 5817 5818 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 5819 pool != 0ULL && l2arc_vdev_present(vd)) 5820 l2arc_remove_vdev(vd); 5821 } 5822 } 5823 5824 /* 5825 * Verify encryption parameters for spa creation. If we are encrypting, we must 5826 * have the encryption feature flag enabled. 
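 * For example (illustrative values): a creation request with
 * dcp->cp_crypt == ZIO_CRYPT_AES_256_GCM while feature@encryption was not
 * enabled in the supplied props fails with ENOTSUP, whereas ZIO_CRYPT_OFF
 * and ZIO_CRYPT_INHERIT skip that check and fall through to
 * dmu_objset_create_crypt_check().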
5827 */ 5828 static int 5829 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 5830 boolean_t has_encryption) 5831 { 5832 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 5833 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 5834 !has_encryption) 5835 return (SET_ERROR(ENOTSUP)); 5836 5837 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 5838 } 5839 5840 /* 5841 * Pool Creation 5842 */ 5843 int 5844 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 5845 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 5846 { 5847 spa_t *spa; 5848 const char *altroot = NULL; 5849 vdev_t *rvd; 5850 dsl_pool_t *dp; 5851 dmu_tx_t *tx; 5852 int error = 0; 5853 uint64_t txg = TXG_INITIAL; 5854 nvlist_t **spares, **l2cache; 5855 uint_t nspares, nl2cache; 5856 uint64_t version, obj, ndraid = 0; 5857 boolean_t has_features; 5858 boolean_t has_encryption; 5859 boolean_t has_allocclass; 5860 spa_feature_t feat; 5861 const char *feat_name; 5862 const char *poolname; 5863 nvlist_t *nvl; 5864 5865 if (props == NULL || 5866 nvlist_lookup_string(props, "tname", &poolname) != 0) 5867 poolname = (char *)pool; 5868 5869 /* 5870 * If this pool already exists, return failure. 5871 */ 5872 mutex_enter(&spa_namespace_lock); 5873 if (spa_lookup(poolname) != NULL) { 5874 mutex_exit(&spa_namespace_lock); 5875 return (SET_ERROR(EEXIST)); 5876 } 5877 5878 /* 5879 * Allocate a new spa_t structure. 5880 */ 5881 nvl = fnvlist_alloc(); 5882 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 5883 (void) nvlist_lookup_string(props, 5884 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5885 spa = spa_add(poolname, nvl, altroot); 5886 fnvlist_free(nvl); 5887 spa_activate(spa, spa_mode_global); 5888 5889 if (props && (error = spa_prop_validate(spa, props))) { 5890 spa_deactivate(spa); 5891 spa_remove(spa); 5892 mutex_exit(&spa_namespace_lock); 5893 return (error); 5894 } 5895 5896 /* 5897 * Temporary pool names should never be written to disk. 
5898 */ 5899 if (poolname != pool) 5900 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 5901 5902 has_features = B_FALSE; 5903 has_encryption = B_FALSE; 5904 has_allocclass = B_FALSE; 5905 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 5906 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 5907 if (zpool_prop_feature(nvpair_name(elem))) { 5908 has_features = B_TRUE; 5909 5910 feat_name = strchr(nvpair_name(elem), '@') + 1; 5911 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 5912 if (feat == SPA_FEATURE_ENCRYPTION) 5913 has_encryption = B_TRUE; 5914 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 5915 has_allocclass = B_TRUE; 5916 } 5917 } 5918 5919 /* verify encryption params, if they were provided */ 5920 if (dcp != NULL) { 5921 error = spa_create_check_encryption_params(dcp, has_encryption); 5922 if (error != 0) { 5923 spa_deactivate(spa); 5924 spa_remove(spa); 5925 mutex_exit(&spa_namespace_lock); 5926 return (error); 5927 } 5928 } 5929 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 5930 spa_deactivate(spa); 5931 spa_remove(spa); 5932 mutex_exit(&spa_namespace_lock); 5933 return (ENOTSUP); 5934 } 5935 5936 if (has_features || nvlist_lookup_uint64(props, 5937 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 5938 version = SPA_VERSION; 5939 } 5940 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 5941 5942 spa->spa_first_txg = txg; 5943 spa->spa_uberblock.ub_txg = txg - 1; 5944 spa->spa_uberblock.ub_version = version; 5945 spa->spa_ubsync = spa->spa_uberblock; 5946 spa->spa_load_state = SPA_LOAD_CREATE; 5947 spa->spa_removing_phys.sr_state = DSS_NONE; 5948 spa->spa_removing_phys.sr_removing_vdev = -1; 5949 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 5950 spa->spa_indirect_vdevs_loaded = B_TRUE; 5951 5952 /* 5953 * Create "The Godfather" zio to hold all async IOs 5954 */ 5955 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 5956 KM_SLEEP); 5957 for (int i = 0; i < max_ncpus; i++) { 5958 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 5959 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 5960 ZIO_FLAG_GODFATHER); 5961 } 5962 5963 /* 5964 * Create the root vdev. 5965 */ 5966 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5967 5968 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 5969 5970 ASSERT(error != 0 || rvd != NULL); 5971 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 5972 5973 if (error == 0 && !zfs_allocatable_devs(nvroot)) 5974 error = SET_ERROR(EINVAL); 5975 5976 if (error == 0 && 5977 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 5978 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 5979 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 5980 /* 5981 * instantiate the metaslab groups (this will dirty the vdevs) 5982 * we can no longer error exit past this point 5983 */ 5984 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 5985 vdev_t *vd = rvd->vdev_child[c]; 5986 5987 vdev_metaslab_set_size(vd); 5988 vdev_expand(vd, txg); 5989 } 5990 } 5991 5992 spa_config_exit(spa, SCL_ALL, FTAG); 5993 5994 if (error != 0) { 5995 spa_unload(spa); 5996 spa_deactivate(spa); 5997 spa_remove(spa); 5998 mutex_exit(&spa_namespace_lock); 5999 return (error); 6000 } 6001 6002 /* 6003 * Get the list of spares, if specified. 
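 * The expected nvroot layout here is roughly the following (device paths
 * are placeholders):
 *
 *	type: "root"
 *	children: [ <data vdevs> ]
 *	spares: [ { type: "disk", path: "/dev/..." }, ... ]
 *	l2cache: [ { type: "disk", path: "/dev/..." }, ... ]
 *
 * where "spares" and "l2cache" are the ZPOOL_CONFIG_SPARES and
 * ZPOOL_CONFIG_L2CACHE keys looked up below.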
6004 */ 6005 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6006 &spares, &nspares) == 0) { 6007 spa->spa_spares.sav_config = fnvlist_alloc(); 6008 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6009 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6010 nspares); 6011 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6012 spa_load_spares(spa); 6013 spa_config_exit(spa, SCL_ALL, FTAG); 6014 spa->spa_spares.sav_sync = B_TRUE; 6015 } 6016 6017 /* 6018 * Get the list of level 2 cache devices, if specified. 6019 */ 6020 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6021 &l2cache, &nl2cache) == 0) { 6022 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6023 NV_UNIQUE_NAME, KM_SLEEP)); 6024 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6025 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6026 nl2cache); 6027 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6028 spa_load_l2cache(spa); 6029 spa_config_exit(spa, SCL_ALL, FTAG); 6030 spa->spa_l2cache.sav_sync = B_TRUE; 6031 } 6032 6033 spa->spa_is_initializing = B_TRUE; 6034 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6035 spa->spa_is_initializing = B_FALSE; 6036 6037 /* 6038 * Create DDTs (dedup tables). 6039 */ 6040 ddt_create(spa); 6041 /* 6042 * Create BRT table and BRT table object. 6043 */ 6044 brt_create(spa); 6045 6046 spa_update_dspace(spa); 6047 6048 tx = dmu_tx_create_assigned(dp, txg); 6049 6050 /* 6051 * Create the pool's history object. 6052 */ 6053 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6054 spa_history_create_obj(spa, tx); 6055 6056 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6057 spa_history_log_version(spa, "create", tx); 6058 6059 /* 6060 * Create the pool config object. 6061 */ 6062 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6063 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6064 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6065 6066 if (zap_add(spa->spa_meta_objset, 6067 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6068 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6069 cmn_err(CE_PANIC, "failed to add pool config"); 6070 } 6071 6072 if (zap_add(spa->spa_meta_objset, 6073 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6074 sizeof (uint64_t), 1, &version, tx) != 0) { 6075 cmn_err(CE_PANIC, "failed to add pool version"); 6076 } 6077 6078 /* Newly created pools with the right version are always deflated. */ 6079 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6080 spa->spa_deflate = TRUE; 6081 if (zap_add(spa->spa_meta_objset, 6082 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6083 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6084 cmn_err(CE_PANIC, "failed to add deflate"); 6085 } 6086 } 6087 6088 /* 6089 * Create the deferred-free bpobj. Turn off compression 6090 * because sync-to-convergence takes longer if the blocksize 6091 * keeps changing. 6092 */ 6093 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6094 dmu_object_set_compress(spa->spa_meta_objset, obj, 6095 ZIO_COMPRESS_OFF, tx); 6096 if (zap_add(spa->spa_meta_objset, 6097 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6098 sizeof (uint64_t), 1, &obj, tx) != 0) { 6099 cmn_err(CE_PANIC, "failed to add bpobj"); 6100 } 6101 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6102 spa->spa_meta_objset, obj)); 6103 6104 /* 6105 * Generate some random noise for salted checksums to operate on. 
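 * Salted checksums (e.g. edonr and skein) mix this per-pool secret into
 * every checksum they compute, so it is generated once at creation and
 * kept for the life of the pool.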
6106 */ 6107 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6108 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6109 6110 /* 6111 * Set pool properties. 6112 */ 6113 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6114 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6115 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6116 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6117 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6118 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6119 6120 if (props != NULL) { 6121 spa_configfile_set(spa, props, B_FALSE); 6122 spa_sync_props(props, tx); 6123 } 6124 6125 for (int i = 0; i < ndraid; i++) 6126 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6127 6128 dmu_tx_commit(tx); 6129 6130 spa->spa_sync_on = B_TRUE; 6131 txg_sync_start(dp); 6132 mmp_thread_start(spa); 6133 txg_wait_synced(dp, txg); 6134 6135 spa_spawn_aux_threads(spa); 6136 6137 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6138 6139 /* 6140 * Don't count references from objsets that are already closed 6141 * and are making their way through the eviction process. 6142 */ 6143 spa_evicting_os_wait(spa); 6144 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6145 spa->spa_load_state = SPA_LOAD_NONE; 6146 6147 spa_import_os(spa); 6148 6149 mutex_exit(&spa_namespace_lock); 6150 6151 return (0); 6152 } 6153 6154 /* 6155 * Import a non-root pool into the system. 6156 */ 6157 int 6158 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6159 { 6160 spa_t *spa; 6161 const char *altroot = NULL; 6162 spa_load_state_t state = SPA_LOAD_IMPORT; 6163 zpool_load_policy_t policy; 6164 spa_mode_t mode = spa_mode_global; 6165 uint64_t readonly = B_FALSE; 6166 int error; 6167 nvlist_t *nvroot; 6168 nvlist_t **spares, **l2cache; 6169 uint_t nspares, nl2cache; 6170 6171 /* 6172 * If a pool with this name exists, return failure. 6173 */ 6174 mutex_enter(&spa_namespace_lock); 6175 if (spa_lookup(pool) != NULL) { 6176 mutex_exit(&spa_namespace_lock); 6177 return (SET_ERROR(EEXIST)); 6178 } 6179 6180 /* 6181 * Create and initialize the spa structure. 6182 */ 6183 (void) nvlist_lookup_string(props, 6184 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6185 (void) nvlist_lookup_uint64(props, 6186 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6187 if (readonly) 6188 mode = SPA_MODE_READ; 6189 spa = spa_add(pool, config, altroot); 6190 spa->spa_import_flags = flags; 6191 6192 /* 6193 * Verbatim import - Take a pool and insert it into the namespace 6194 * as if it had been loaded at boot. 6195 */ 6196 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6197 if (props != NULL) 6198 spa_configfile_set(spa, props, B_FALSE); 6199 6200 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6201 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6202 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6203 mutex_exit(&spa_namespace_lock); 6204 return (0); 6205 } 6206 6207 spa_activate(spa, mode); 6208 6209 /* 6210 * Don't start async tasks until we know everything is healthy. 
6211 */ 6212 spa_async_suspend(spa); 6213 6214 zpool_get_load_policy(config, &policy); 6215 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6216 state = SPA_LOAD_RECOVER; 6217 6218 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6219 6220 if (state != SPA_LOAD_RECOVER) { 6221 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6222 zfs_dbgmsg("spa_import: importing %s", pool); 6223 } else { 6224 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6225 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6226 } 6227 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6228 6229 /* 6230 * Propagate anything learned while loading the pool and pass it 6231 * back to caller (i.e. rewind info, missing devices, etc). 6232 */ 6233 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6234 6235 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6236 /* 6237 * Toss any existing sparelist, as it doesn't have any validity 6238 * anymore, and conflicts with spa_has_spare(). 6239 */ 6240 if (spa->spa_spares.sav_config) { 6241 nvlist_free(spa->spa_spares.sav_config); 6242 spa->spa_spares.sav_config = NULL; 6243 spa_load_spares(spa); 6244 } 6245 if (spa->spa_l2cache.sav_config) { 6246 nvlist_free(spa->spa_l2cache.sav_config); 6247 spa->spa_l2cache.sav_config = NULL; 6248 spa_load_l2cache(spa); 6249 } 6250 6251 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6252 spa_config_exit(spa, SCL_ALL, FTAG); 6253 6254 if (props != NULL) 6255 spa_configfile_set(spa, props, B_FALSE); 6256 6257 if (error != 0 || (props && spa_writeable(spa) && 6258 (error = spa_prop_set(spa, props)))) { 6259 spa_unload(spa); 6260 spa_deactivate(spa); 6261 spa_remove(spa); 6262 mutex_exit(&spa_namespace_lock); 6263 return (error); 6264 } 6265 6266 spa_async_resume(spa); 6267 6268 /* 6269 * Override any spares and level 2 cache devices as specified by 6270 * the user, as these may have correct device names/devids, etc. 6271 */ 6272 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6273 &spares, &nspares) == 0) { 6274 if (spa->spa_spares.sav_config) 6275 fnvlist_remove(spa->spa_spares.sav_config, 6276 ZPOOL_CONFIG_SPARES); 6277 else 6278 spa->spa_spares.sav_config = fnvlist_alloc(); 6279 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6280 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6281 nspares); 6282 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6283 spa_load_spares(spa); 6284 spa_config_exit(spa, SCL_ALL, FTAG); 6285 spa->spa_spares.sav_sync = B_TRUE; 6286 } 6287 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6288 &l2cache, &nl2cache) == 0) { 6289 if (spa->spa_l2cache.sav_config) 6290 fnvlist_remove(spa->spa_l2cache.sav_config, 6291 ZPOOL_CONFIG_L2CACHE); 6292 else 6293 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6294 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6295 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6296 nl2cache); 6297 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6298 spa_load_l2cache(spa); 6299 spa_config_exit(spa, SCL_ALL, FTAG); 6300 spa->spa_l2cache.sav_sync = B_TRUE; 6301 } 6302 6303 /* 6304 * Check for any removed devices. 6305 */ 6306 if (spa->spa_autoreplace) { 6307 spa_aux_check_removed(&spa->spa_spares); 6308 spa_aux_check_removed(&spa->spa_l2cache); 6309 } 6310 6311 if (spa_writeable(spa)) { 6312 /* 6313 * Update the config cache to include the newly-imported pool. 
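 * spa_config_update() regenerates the pool config and rewrites the
 * cachefile (typically /etc/zfs/zpool.cache, unless cachefile=none was
 * requested) so the newly-imported pool is found again on the next boot.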
6314 	 */
6315 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6316 	}
6317 
6318 	/*
6319 	 * It's possible that the pool was expanded while it was exported.
6320 	 * We kick off an async task to handle this for us.
6321 	 */
6322 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
6323 
6324 	spa_history_log_version(spa, "import", NULL);
6325 
6326 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
6327 
6328 	mutex_exit(&spa_namespace_lock);
6329 
6330 	zvol_create_minors_recursive(pool);
6331 
6332 	spa_import_os(spa);
6333 
6334 	return (0);
6335 }
6336 
6337 nvlist_t *
6338 spa_tryimport(nvlist_t *tryconfig)
6339 {
6340 	nvlist_t *config = NULL;
6341 	const char *poolname, *cachefile;
6342 	spa_t *spa;
6343 	uint64_t state;
6344 	int error;
6345 	zpool_load_policy_t policy;
6346 
6347 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
6348 		return (NULL);
6349 
6350 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
6351 		return (NULL);
6352 
6353 	/*
6354 	 * Create and initialize the spa structure.
6355 	 */
6356 	mutex_enter(&spa_namespace_lock);
6357 	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
6358 	spa_activate(spa, SPA_MODE_READ);
6359 
6360 	/*
6361 	 * Rewind pool if a max txg was provided.
6362 	 */
6363 	zpool_get_load_policy(spa->spa_config, &policy);
6364 	if (policy.zlp_txg != UINT64_MAX) {
6365 		spa->spa_load_max_txg = policy.zlp_txg;
6366 		spa->spa_extreme_rewind = B_TRUE;
6367 		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
6368 		    poolname, (longlong_t)policy.zlp_txg);
6369 	} else {
6370 		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
6371 	}
6372 
6373 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
6374 	    == 0) {
6375 		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
6376 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
6377 	} else {
6378 		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
6379 	}
6380 
6381 	/*
6382 	 * spa_import() relies on the pool config fetched here by
6383 	 * spa_tryimport() to find spare and cache devices. Import flags are
6384 	 * not passed through, so a missing log device would otherwise make
6385 	 * spa_tryimport() return early, before the spare and cache device
6386 	 * configuration has been retrieved. Passing ZFS_IMPORT_MISSING_LOG
6387 	 * makes it fetch the complete configuration despite the missing log.
6388 	 */
6389 	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
6390 
6391 	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
6392 
6393 	/*
6394 	 * If 'tryconfig' was at least parsable, return the current config.
6395 	 */
6396 	if (spa->spa_root_vdev != NULL) {
6397 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
6398 		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
6399 		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
6400 		fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
6401 		    spa->spa_uberblock.ub_timestamp);
6402 		fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
6403 		    spa->spa_load_info);
6404 		fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
6405 		    spa->spa_errata);
6406 
6407 		/*
6408 		 * If the bootfs property exists on this pool then we
6409 		 * copy it out so that external consumers can tell which
6410 		 * pools are bootable.
6411 		 */
6412 		if ((!error || error == EEXIST) && spa->spa_bootfs) {
6413 			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
6414 
6415 			/*
6416 			 * We have to play games with the name since the
6417 			 * pool was opened as TRYIMPORT_NAME.
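			 * As an illustration (dataset names are
			 * hypothetical): if the real pool is "tank" and its
			 * bootfs resolves to, say, "$import/ROOT/default"
			 * under the temporary name, the code below rewrites
			 * it to "tank/ROOT/default" before adding
			 * ZPOOL_CONFIG_BOOTFS to the returned config.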
6418 */ 6419 if (dsl_dsobj_to_dsname(spa_name(spa), 6420 spa->spa_bootfs, tmpname) == 0) { 6421 char *cp; 6422 char *dsname; 6423 6424 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6425 6426 cp = strchr(tmpname, '/'); 6427 if (cp == NULL) { 6428 (void) strlcpy(dsname, tmpname, 6429 MAXPATHLEN); 6430 } else { 6431 (void) snprintf(dsname, MAXPATHLEN, 6432 "%s/%s", poolname, ++cp); 6433 } 6434 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6435 dsname); 6436 kmem_free(dsname, MAXPATHLEN); 6437 } 6438 kmem_free(tmpname, MAXPATHLEN); 6439 } 6440 6441 /* 6442 * Add the list of hot spares and level 2 cache devices. 6443 */ 6444 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6445 spa_add_spares(spa, config); 6446 spa_add_l2cache(spa, config); 6447 spa_config_exit(spa, SCL_CONFIG, FTAG); 6448 } 6449 6450 spa_unload(spa); 6451 spa_deactivate(spa); 6452 spa_remove(spa); 6453 mutex_exit(&spa_namespace_lock); 6454 6455 return (config); 6456 } 6457 6458 /* 6459 * Pool export/destroy 6460 * 6461 * The act of destroying or exporting a pool is very simple. We make sure there 6462 * is no more pending I/O and any references to the pool are gone. Then, we 6463 * update the pool state and sync all the labels to disk, removing the 6464 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6465 * we don't sync the labels or remove the configuration cache. 6466 */ 6467 static int 6468 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6469 boolean_t force, boolean_t hardforce) 6470 { 6471 int error; 6472 spa_t *spa; 6473 6474 if (oldconfig) 6475 *oldconfig = NULL; 6476 6477 if (!(spa_mode_global & SPA_MODE_WRITE)) 6478 return (SET_ERROR(EROFS)); 6479 6480 mutex_enter(&spa_namespace_lock); 6481 if ((spa = spa_lookup(pool)) == NULL) { 6482 mutex_exit(&spa_namespace_lock); 6483 return (SET_ERROR(ENOENT)); 6484 } 6485 6486 if (spa->spa_is_exporting) { 6487 /* the pool is being exported by another thread */ 6488 mutex_exit(&spa_namespace_lock); 6489 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6490 } 6491 spa->spa_is_exporting = B_TRUE; 6492 6493 /* 6494 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6495 * reacquire the namespace lock, and see if we can export. 6496 */ 6497 spa_open_ref(spa, FTAG); 6498 mutex_exit(&spa_namespace_lock); 6499 spa_async_suspend(spa); 6500 if (spa->spa_zvol_taskq) { 6501 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6502 taskq_wait(spa->spa_zvol_taskq); 6503 } 6504 mutex_enter(&spa_namespace_lock); 6505 spa_close(spa, FTAG); 6506 6507 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6508 goto export_spa; 6509 /* 6510 * The pool will be in core if it's openable, in which case we can 6511 * modify its state. Objsets may be open only because they're dirty, 6512 * so we have to force it to sync before checking spa_refcnt. 6513 */ 6514 if (spa->spa_sync_on) { 6515 txg_wait_synced(spa->spa_dsl_pool, 0); 6516 spa_evicting_os_wait(spa); 6517 } 6518 6519 /* 6520 * A pool cannot be exported or destroyed if there are active 6521 * references. If we are resetting a pool, allow references by 6522 * fault injection handlers. 6523 */ 6524 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6525 error = SET_ERROR(EBUSY); 6526 goto fail; 6527 } 6528 6529 if (spa->spa_sync_on) { 6530 vdev_t *rvd = spa->spa_root_vdev; 6531 /* 6532 * A pool cannot be exported if it has an active shared spare. 6533 * This is to prevent other pools stealing the active spare 6534 * from an exported pool. 
If the user insists, such a pool can
6535 		 * still be forcibly exported.
6536 		 */
6537 		if (!force && new_state == POOL_STATE_EXPORTED &&
6538 		    spa_has_active_shared_spare(spa)) {
6539 			error = SET_ERROR(EXDEV);
6540 			goto fail;
6541 		}
6542 
6543 		/*
6544 		 * We're about to export or destroy this pool. Make sure
6545 		 * we stop all initialization and trim activity here before
6546 		 * we set the spa_final_txg. This will ensure that all
6547 		 * dirty data resulting from the initialization is
6548 		 * committed to disk before we unload the pool.
6549 		 */
6550 		vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
6551 		vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
6552 		vdev_autotrim_stop_all(spa);
6553 		vdev_rebuild_stop_all(spa);
6554 
6555 		/*
6556 		 * We want this to be reflected on every label,
6557 		 * so mark them all dirty. spa_unload() will do the
6558 		 * final sync that pushes these changes out.
6559 		 */
6560 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
6561 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6562 			spa->spa_state = new_state;
6563 			vdev_config_dirty(rvd);
6564 			spa_config_exit(spa, SCL_ALL, FTAG);
6565 		}
6566 
6567 		/*
6568 		 * If the log space map feature is enabled and the pool is
6569 		 * getting exported (but not destroyed), we want to spend some
6570 		 * time flushing as many metaslabs as we can in an attempt to
6571 		 * destroy log space maps and save import time. This has to be
6572 		 * done before we set the spa_final_txg, otherwise
6573 		 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
6574 		 * spa_should_flush_logs_on_unload() should be called after
6575 		 * spa_state has been set to the new_state.
6576 		 */
6577 		if (spa_should_flush_logs_on_unload(spa))
6578 			spa_unload_log_sm_flush_all(spa);
6579 
6580 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
6581 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6582 			spa->spa_final_txg = spa_last_synced_txg(spa) +
6583 			    TXG_DEFER_SIZE + 1;
6584 			spa_config_exit(spa, SCL_ALL, FTAG);
6585 		}
6586 	}
6587 
6588 export_spa:
6589 	spa_export_os(spa);
6590 
6591 	if (new_state == POOL_STATE_DESTROYED)
6592 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
6593 	else if (new_state == POOL_STATE_EXPORTED)
6594 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
6595 
6596 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6597 		spa_unload(spa);
6598 		spa_deactivate(spa);
6599 	}
6600 
6601 	if (oldconfig && spa->spa_config)
6602 		*oldconfig = fnvlist_dup(spa->spa_config);
6603 
6604 	if (new_state != POOL_STATE_UNINITIALIZED) {
6605 		if (!hardforce)
6606 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
6607 		spa_remove(spa);
6608 	} else {
6609 		/*
6610 		 * If spa_remove() is not called for this spa_t and
6611 		 * there is any possibility that it can be reused,
6612 		 * we make sure to reset the exporting flag.
6613 		 */
6614 		spa->spa_is_exporting = B_FALSE;
6615 	}
6616 
6617 	mutex_exit(&spa_namespace_lock);
6618 	return (0);
6619 
6620 fail:
6621 	spa->spa_is_exporting = B_FALSE;
6622 	spa_async_resume(spa);
6623 	mutex_exit(&spa_namespace_lock);
6624 	return (error);
6625 }
6626 
6627 /*
6628  * Destroy a storage pool.
6629  */
6630 int
6631 spa_destroy(const char *pool)
6632 {
6633 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
6634 	    B_FALSE, B_FALSE));
6635 }
6636 
6637 /*
6638  * Export a storage pool.
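 * 'force' allows the export to proceed even when a hot spare shared with
 * another pool is still active, and 'hardforce' additionally skips the
 * final label sync and configuration cache update in spa_export_common().
 * A minimal caller sketch (the pool name is hypothetical):
 *
 *	nvlist_t *oldconfig = NULL;
 *	int err = spa_export("tank", &oldconfig, B_FALSE, B_FALSE);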
6639 */ 6640 int 6641 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 6642 boolean_t hardforce) 6643 { 6644 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 6645 force, hardforce)); 6646 } 6647 6648 /* 6649 * Similar to spa_export(), this unloads the spa_t without actually removing it 6650 * from the namespace in any way. 6651 */ 6652 int 6653 spa_reset(const char *pool) 6654 { 6655 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 6656 B_FALSE, B_FALSE)); 6657 } 6658 6659 /* 6660 * ========================================================================== 6661 * Device manipulation 6662 * ========================================================================== 6663 */ 6664 6665 /* 6666 * This is called as a synctask to increment the draid feature flag 6667 */ 6668 static void 6669 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 6670 { 6671 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6672 int draid = (int)(uintptr_t)arg; 6673 6674 for (int c = 0; c < draid; c++) 6675 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6676 } 6677 6678 /* 6679 * Add a device to a storage pool. 6680 */ 6681 int 6682 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 6683 { 6684 uint64_t txg, ndraid = 0; 6685 int error; 6686 vdev_t *rvd = spa->spa_root_vdev; 6687 vdev_t *vd, *tvd; 6688 nvlist_t **spares, **l2cache; 6689 uint_t nspares, nl2cache; 6690 6691 ASSERT(spa_writeable(spa)); 6692 6693 txg = spa_vdev_enter(spa); 6694 6695 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 6696 VDEV_ALLOC_ADD)) != 0) 6697 return (spa_vdev_exit(spa, NULL, txg, error)); 6698 6699 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 6700 6701 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 6702 &nspares) != 0) 6703 nspares = 0; 6704 6705 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 6706 &nl2cache) != 0) 6707 nl2cache = 0; 6708 6709 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 6710 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6711 6712 if (vd->vdev_children != 0 && 6713 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 6714 return (spa_vdev_exit(spa, vd, txg, error)); 6715 } 6716 6717 /* 6718 * The virtual dRAID spares must be added after vdev tree is created 6719 * and the vdev guids are generated. The guid of their associated 6720 * dRAID is stored in the config and used when opening the spare. 6721 */ 6722 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 6723 rvd->vdev_children)) == 0) { 6724 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 6725 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 6726 nspares = 0; 6727 } else { 6728 return (spa_vdev_exit(spa, vd, txg, error)); 6729 } 6730 6731 /* 6732 * We must validate the spares and l2cache devices after checking the 6733 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 6734 */ 6735 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 6736 return (spa_vdev_exit(spa, vd, txg, error)); 6737 6738 /* 6739 * If we are in the middle of a device removal, we can only add 6740 * devices which match the existing devices in the pool. 6741 * If we are in the middle of a removal, or have some indirect 6742 * vdevs, we can not add raidz or dRAID top levels. 
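 * Concretely, while a removal is active each new top-level vdev must have
 * an ashift equal to spa_max_ashift, and in either case (active removal or
 * existing indirect vdevs) any top-level with parity, i.e.
 * vdev_get_nparity() != 0 for raidz or dRAID, is rejected with EINVAL by
 * the checks below.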
6743 */ 6744 if (spa->spa_vdev_removal != NULL || 6745 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 6746 for (int c = 0; c < vd->vdev_children; c++) { 6747 tvd = vd->vdev_child[c]; 6748 if (spa->spa_vdev_removal != NULL && 6749 tvd->vdev_ashift != spa->spa_max_ashift) { 6750 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6751 } 6752 /* Fail if top level vdev is raidz or a dRAID */ 6753 if (vdev_get_nparity(tvd) != 0) 6754 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6755 6756 /* 6757 * Need the top level mirror to be 6758 * a mirror of leaf vdevs only 6759 */ 6760 if (tvd->vdev_ops == &vdev_mirror_ops) { 6761 for (uint64_t cid = 0; 6762 cid < tvd->vdev_children; cid++) { 6763 vdev_t *cvd = tvd->vdev_child[cid]; 6764 if (!cvd->vdev_ops->vdev_op_leaf) { 6765 return (spa_vdev_exit(spa, vd, 6766 txg, EINVAL)); 6767 } 6768 } 6769 } 6770 } 6771 } 6772 6773 for (int c = 0; c < vd->vdev_children; c++) { 6774 tvd = vd->vdev_child[c]; 6775 vdev_remove_child(vd, tvd); 6776 tvd->vdev_id = rvd->vdev_children; 6777 vdev_add_child(rvd, tvd); 6778 vdev_config_dirty(tvd); 6779 } 6780 6781 if (nspares != 0) { 6782 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 6783 ZPOOL_CONFIG_SPARES); 6784 spa_load_spares(spa); 6785 spa->spa_spares.sav_sync = B_TRUE; 6786 } 6787 6788 if (nl2cache != 0) { 6789 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 6790 ZPOOL_CONFIG_L2CACHE); 6791 spa_load_l2cache(spa); 6792 spa->spa_l2cache.sav_sync = B_TRUE; 6793 } 6794 6795 /* 6796 * We can't increment a feature while holding spa_vdev so we 6797 * have to do it in a synctask. 6798 */ 6799 if (ndraid != 0) { 6800 dmu_tx_t *tx; 6801 6802 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 6803 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 6804 (void *)(uintptr_t)ndraid, tx); 6805 dmu_tx_commit(tx); 6806 } 6807 6808 /* 6809 * We have to be careful when adding new vdevs to an existing pool. 6810 * If other threads start allocating from these vdevs before we 6811 * sync the config cache, and we lose power, then upon reboot we may 6812 * fail to open the pool because there are DVAs that the config cache 6813 * can't translate. Therefore, we first add the vdevs without 6814 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 6815 * and then let spa_config_update() initialize the new metaslabs. 6816 * 6817 * spa_load() checks for added-but-not-initialized vdevs, so that 6818 * if we lose power at any point in this sequence, the remaining 6819 * steps will be completed the next time we load the pool. 6820 */ 6821 (void) spa_vdev_exit(spa, vd, txg, 0); 6822 6823 mutex_enter(&spa_namespace_lock); 6824 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6825 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 6826 mutex_exit(&spa_namespace_lock); 6827 6828 return (0); 6829 } 6830 6831 /* 6832 * Attach a device to a mirror. The arguments are the path to any device 6833 * in the mirror, and the nvroot for the new device. If the path specifies 6834 * a device that is not mirrored, we automatically insert the mirror vdev. 
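 * For example, attaching a new disk to a plain (non-redundant) disk
 * top-level converts that top-level into a two-way mirror of the old and
 * new device; this is the path taken by 'zpool attach', while
 * 'zpool replace' uses the 'replacing' variant described below.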
6835 * 6836 * If 'replacing' is specified, the new device is intended to replace the 6837 * existing device; in this case the two devices are made into their own 6838 * mirror using the 'replacing' vdev, which is functionally identical to 6839 * the mirror vdev (it actually reuses all the same ops) but has a few 6840 * extra rules: you can't attach to it after it's been created, and upon 6841 * completion of resilvering, the first disk (the one being replaced) 6842 * is automatically detached. 6843 * 6844 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 6845 * should be performed instead of traditional healing reconstruction. From 6846 * an administrators perspective these are both resilver operations. 6847 */ 6848 int 6849 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 6850 int rebuild) 6851 { 6852 uint64_t txg, dtl_max_txg; 6853 vdev_t *rvd = spa->spa_root_vdev; 6854 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 6855 vdev_ops_t *pvops; 6856 char *oldvdpath, *newvdpath; 6857 int newvd_isspare; 6858 int error; 6859 6860 ASSERT(spa_writeable(spa)); 6861 6862 txg = spa_vdev_enter(spa); 6863 6864 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 6865 6866 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6867 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6868 error = (spa_has_checkpoint(spa)) ? 6869 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6870 return (spa_vdev_exit(spa, NULL, txg, error)); 6871 } 6872 6873 if (rebuild) { 6874 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 6875 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6876 6877 if (dsl_scan_resilvering(spa_get_dsl(spa))) 6878 return (spa_vdev_exit(spa, NULL, txg, 6879 ZFS_ERR_RESILVER_IN_PROGRESS)); 6880 } else { 6881 if (vdev_rebuild_active(rvd)) 6882 return (spa_vdev_exit(spa, NULL, txg, 6883 ZFS_ERR_REBUILD_IN_PROGRESS)); 6884 } 6885 6886 if (spa->spa_vdev_removal != NULL) 6887 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6888 6889 if (oldvd == NULL) 6890 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6891 6892 if (!oldvd->vdev_ops->vdev_op_leaf) 6893 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6894 6895 pvd = oldvd->vdev_parent; 6896 6897 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 6898 VDEV_ALLOC_ATTACH) != 0) 6899 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6900 6901 if (newrootvd->vdev_children != 1) 6902 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6903 6904 newvd = newrootvd->vdev_child[0]; 6905 6906 if (!newvd->vdev_ops->vdev_op_leaf) 6907 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6908 6909 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 6910 return (spa_vdev_exit(spa, newrootvd, txg, error)); 6911 6912 /* 6913 * log, dedup and special vdevs should not be replaced by spares. 6914 */ 6915 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 6916 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 6917 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6918 } 6919 6920 /* 6921 * A dRAID spare can only replace a child of its parent dRAID vdev. 6922 */ 6923 if (newvd->vdev_ops == &vdev_draid_spare_ops && 6924 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 6925 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6926 } 6927 6928 if (rebuild) { 6929 /* 6930 * For rebuilds, the top vdev must support reconstruction 6931 * using only space maps. This means the only allowable 6932 * vdevs types are the root vdev, a mirror, or dRAID. 
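 * In practice this means a sequential rebuild request (e.g. via the -s
 * option of 'zpool attach' or 'zpool replace') against a raidz top-level
 * is rejected with ENOTSUP by the check that follows.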
6933 */ 6934 tvd = pvd; 6935 if (pvd->vdev_top != NULL) 6936 tvd = pvd->vdev_top; 6937 6938 if (tvd->vdev_ops != &vdev_mirror_ops && 6939 tvd->vdev_ops != &vdev_root_ops && 6940 tvd->vdev_ops != &vdev_draid_ops) { 6941 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6942 } 6943 } 6944 6945 if (!replacing) { 6946 /* 6947 * For attach, the only allowable parent is a mirror or the root 6948 * vdev. 6949 */ 6950 if (pvd->vdev_ops != &vdev_mirror_ops && 6951 pvd->vdev_ops != &vdev_root_ops) 6952 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6953 6954 pvops = &vdev_mirror_ops; 6955 } else { 6956 /* 6957 * Active hot spares can only be replaced by inactive hot 6958 * spares. 6959 */ 6960 if (pvd->vdev_ops == &vdev_spare_ops && 6961 oldvd->vdev_isspare && 6962 !spa_has_spare(spa, newvd->vdev_guid)) 6963 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6964 6965 /* 6966 * If the source is a hot spare, and the parent isn't already a 6967 * spare, then we want to create a new hot spare. Otherwise, we 6968 * want to create a replacing vdev. The user is not allowed to 6969 * attach to a spared vdev child unless the 'isspare' state is 6970 * the same (spare replaces spare, non-spare replaces 6971 * non-spare). 6972 */ 6973 if (pvd->vdev_ops == &vdev_replacing_ops && 6974 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6975 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6976 } else if (pvd->vdev_ops == &vdev_spare_ops && 6977 newvd->vdev_isspare != oldvd->vdev_isspare) { 6978 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6979 } 6980 6981 if (newvd->vdev_isspare) 6982 pvops = &vdev_spare_ops; 6983 else 6984 pvops = &vdev_replacing_ops; 6985 } 6986 6987 /* 6988 * Make sure the new device is big enough. 6989 */ 6990 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6991 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6992 6993 /* 6994 * The new device cannot have a higher alignment requirement 6995 * than the top-level vdev. 6996 */ 6997 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6998 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6999 7000 /* 7001 * If this is an in-place replacement, update oldvd's path and devid 7002 * to make it distinguishable from newvd, and unopenable from now on. 7003 */ 7004 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 7005 spa_strfree(oldvd->vdev_path); 7006 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 7007 KM_SLEEP); 7008 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, 7009 "%s/%s", newvd->vdev_path, "old"); 7010 if (oldvd->vdev_devid != NULL) { 7011 spa_strfree(oldvd->vdev_devid); 7012 oldvd->vdev_devid = NULL; 7013 } 7014 } 7015 7016 /* 7017 * If the parent is not a mirror, or if we're replacing, insert the new 7018 * mirror/replacing/spare vdev above oldvd. 7019 */ 7020 if (pvd->vdev_ops != pvops) 7021 pvd = vdev_add_parent(oldvd, pvops); 7022 7023 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7024 ASSERT(pvd->vdev_ops == pvops); 7025 ASSERT(oldvd->vdev_parent == pvd); 7026 7027 /* 7028 * Extract the new device from its root and add it to pvd. 7029 */ 7030 vdev_remove_child(newrootvd, newvd); 7031 newvd->vdev_id = pvd->vdev_children; 7032 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7033 vdev_add_child(pvd, newvd); 7034 7035 /* 7036 * Reevaluate the parent vdev state. 
7037 */ 7038 vdev_propagate_state(pvd); 7039 7040 tvd = newvd->vdev_top; 7041 ASSERT(pvd->vdev_top == tvd); 7042 ASSERT(tvd->vdev_parent == rvd); 7043 7044 vdev_config_dirty(tvd); 7045 7046 /* 7047 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7048 * for any dmu_sync-ed blocks. It will propagate upward when 7049 * spa_vdev_exit() calls vdev_dtl_reassess(). 7050 */ 7051 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7052 7053 vdev_dtl_dirty(newvd, DTL_MISSING, 7054 TXG_INITIAL, dtl_max_txg - TXG_INITIAL); 7055 7056 if (newvd->vdev_isspare) { 7057 spa_spare_activate(newvd); 7058 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7059 } 7060 7061 oldvdpath = spa_strdup(oldvd->vdev_path); 7062 newvdpath = spa_strdup(newvd->vdev_path); 7063 newvd_isspare = newvd->vdev_isspare; 7064 7065 /* 7066 * Mark newvd's DTL dirty in this txg. 7067 */ 7068 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7069 7070 /* 7071 * Schedule the resilver or rebuild to restart in the future. We do 7072 * this to ensure that dmu_sync-ed blocks have been stitched into the 7073 * respective datasets. 7074 */ 7075 if (rebuild) { 7076 newvd->vdev_rebuild_txg = txg; 7077 7078 vdev_rebuild(tvd); 7079 } else { 7080 newvd->vdev_resilver_txg = txg; 7081 7082 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 7083 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { 7084 vdev_defer_resilver(newvd); 7085 } else { 7086 dsl_scan_restart_resilver(spa->spa_dsl_pool, 7087 dtl_max_txg); 7088 } 7089 } 7090 7091 if (spa->spa_bootfs) 7092 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 7093 7094 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 7095 7096 /* 7097 * Commit the config 7098 */ 7099 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 7100 7101 spa_history_log_internal(spa, "vdev attach", NULL, 7102 "%s vdev=%s %s vdev=%s", 7103 replacing && newvd_isspare ? "spare in" : 7104 replacing ? "replace" : "attach", newvdpath, 7105 replacing ? "for" : "to", oldvdpath); 7106 7107 spa_strfree(oldvdpath); 7108 spa_strfree(newvdpath); 7109 7110 return (0); 7111 } 7112 7113 /* 7114 * Detach a device from a mirror or replacing vdev. 7115 * 7116 * If 'replace_done' is specified, only detach if the parent 7117 * is a replacing or a spare vdev. 7118 */ 7119 int 7120 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7121 { 7122 uint64_t txg; 7123 int error; 7124 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7125 vdev_t *vd, *pvd, *cvd, *tvd; 7126 boolean_t unspare = B_FALSE; 7127 uint64_t unspare_guid = 0; 7128 char *vdpath; 7129 7130 ASSERT(spa_writeable(spa)); 7131 7132 txg = spa_vdev_detach_enter(spa, guid); 7133 7134 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7135 7136 /* 7137 * Besides being called directly from the userland through the 7138 * ioctl interface, spa_vdev_detach() can be potentially called 7139 * at the end of spa_vdev_resilver_done(). 7140 * 7141 * In the regular case, when we have a checkpoint this shouldn't 7142 * happen as we never empty the DTLs of a vdev during the scrub 7143 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 7144 * should never get here when we have a checkpoint. 7145 * 7146 * That said, even in a case when we checkpoint the pool exactly 7147 * as spa_vdev_resilver_done() calls this function everything 7148 * should be fine as the resilver will return right away. 
7149 */ 7150 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7151 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7152 error = (spa_has_checkpoint(spa)) ? 7153 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7154 return (spa_vdev_exit(spa, NULL, txg, error)); 7155 } 7156 7157 if (vd == NULL) 7158 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7159 7160 if (!vd->vdev_ops->vdev_op_leaf) 7161 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7162 7163 pvd = vd->vdev_parent; 7164 7165 /* 7166 * If the parent/child relationship is not as expected, don't do it. 7167 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7168 * vdev that's replacing B with C. The user's intent in replacing 7169 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7170 * the replace by detaching C, the expected behavior is to end up 7171 * M(A,B). But suppose that right after deciding to detach C, 7172 * the replacement of B completes. We would have M(A,C), and then 7173 * ask to detach C, which would leave us with just A -- not what 7174 * the user wanted. To prevent this, we make sure that the 7175 * parent/child relationship hasn't changed -- in this example, 7176 * that C's parent is still the replacing vdev R. 7177 */ 7178 if (pvd->vdev_guid != pguid && pguid != 0) 7179 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7180 7181 /* 7182 * Only 'replacing' or 'spare' vdevs can be replaced. 7183 */ 7184 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7185 pvd->vdev_ops != &vdev_spare_ops) 7186 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7187 7188 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7189 spa_version(spa) >= SPA_VERSION_SPARES); 7190 7191 /* 7192 * Only mirror, replacing, and spare vdevs support detach. 7193 */ 7194 if (pvd->vdev_ops != &vdev_replacing_ops && 7195 pvd->vdev_ops != &vdev_mirror_ops && 7196 pvd->vdev_ops != &vdev_spare_ops) 7197 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7198 7199 /* 7200 * If this device has the only valid copy of some data, 7201 * we cannot safely detach it. 7202 */ 7203 if (vdev_dtl_required(vd)) 7204 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7205 7206 ASSERT(pvd->vdev_children >= 2); 7207 7208 /* 7209 * If we are detaching the second disk from a replacing vdev, then 7210 * check to see if we changed the original vdev's path to have "/old" 7211 * at the end in spa_vdev_attach(). If so, undo that change now. 7212 */ 7213 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7214 vd->vdev_path != NULL) { 7215 size_t len = strlen(vd->vdev_path); 7216 7217 for (int c = 0; c < pvd->vdev_children; c++) { 7218 cvd = pvd->vdev_child[c]; 7219 7220 if (cvd == vd || cvd->vdev_path == NULL) 7221 continue; 7222 7223 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7224 strcmp(cvd->vdev_path + len, "/old") == 0) { 7225 spa_strfree(cvd->vdev_path); 7226 cvd->vdev_path = spa_strdup(vd->vdev_path); 7227 break; 7228 } 7229 } 7230 } 7231 7232 /* 7233 * If we are detaching the original disk from a normal spare, then it 7234 * implies that the spare should become a real disk, and be removed 7235 * from the active spare list for the pool. dRAID spares on the 7236 * other hand are coupled to the pool and thus should never be removed 7237 * from the spares list. 
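 * That policy is enforced below: 'unspare' is only set when the last
 * remaining child is a hot spare whose ops are not vdev_draid_spare_ops,
 * so distributed spares always stay attached to their dRAID pool.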
7238 */ 7239 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7240 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7241 7242 if (last_cvd->vdev_isspare && 7243 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7244 unspare = B_TRUE; 7245 } 7246 } 7247 7248 /* 7249 * Erase the disk labels so the disk can be used for other things. 7250 * This must be done after all other error cases are handled, 7251 * but before we disembowel vd (so we can still do I/O to it). 7252 * But if we can't do it, don't treat the error as fatal -- 7253 * it may be that the unwritability of the disk is the reason 7254 * it's being detached! 7255 */ 7256 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7257 7258 /* 7259 * Remove vd from its parent and compact the parent's children. 7260 */ 7261 vdev_remove_child(pvd, vd); 7262 vdev_compact_children(pvd); 7263 7264 /* 7265 * Remember one of the remaining children so we can get tvd below. 7266 */ 7267 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7268 7269 /* 7270 * If we need to remove the remaining child from the list of hot spares, 7271 * do it now, marking the vdev as no longer a spare in the process. 7272 * We must do this before vdev_remove_parent(), because that can 7273 * change the GUID if it creates a new toplevel GUID. For a similar 7274 * reason, we must remove the spare now, in the same txg as the detach; 7275 * otherwise someone could attach a new sibling, change the GUID, and 7276 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7277 */ 7278 if (unspare) { 7279 ASSERT(cvd->vdev_isspare); 7280 spa_spare_remove(cvd); 7281 unspare_guid = cvd->vdev_guid; 7282 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7283 cvd->vdev_unspare = B_TRUE; 7284 } 7285 7286 /* 7287 * If the parent mirror/replacing vdev only has one child, 7288 * the parent is no longer needed. Remove it from the tree. 7289 */ 7290 if (pvd->vdev_children == 1) { 7291 if (pvd->vdev_ops == &vdev_spare_ops) 7292 cvd->vdev_unspare = B_FALSE; 7293 vdev_remove_parent(cvd); 7294 } 7295 7296 /* 7297 * We don't set tvd until now because the parent we just removed 7298 * may have been the previous top-level vdev. 7299 */ 7300 tvd = cvd->vdev_top; 7301 ASSERT(tvd->vdev_parent == rvd); 7302 7303 /* 7304 * Reevaluate the parent vdev state. 7305 */ 7306 vdev_propagate_state(cvd); 7307 7308 /* 7309 * If the 'autoexpand' property is set on the pool then automatically 7310 * try to expand the size of the pool. For example if the device we 7311 * just detached was smaller than the others, it may be possible to 7312 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7313 * first so that we can obtain the updated sizes of the leaf vdevs. 7314 */ 7315 if (spa->spa_autoexpand) { 7316 vdev_reopen(tvd); 7317 vdev_expand(tvd, txg); 7318 } 7319 7320 vdev_config_dirty(tvd); 7321 7322 /* 7323 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7324 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7325 * But first make sure we're not on any *other* txg's DTL list, to 7326 * prevent vd from being accessed after it's freed. 7327 */ 7328 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 7329 for (int t = 0; t < TXG_SIZE; t++) 7330 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7331 vd->vdev_detached = B_TRUE; 7332 vdev_dirty(tvd, VDD_DTL, vd, txg); 7333 7334 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7335 spa_notify_waiters(spa); 7336 7337 /* hang on to the spa before we release the lock */ 7338 spa_open_ref(spa, FTAG); 7339 7340 error = spa_vdev_exit(spa, vd, txg, 0); 7341 7342 spa_history_log_internal(spa, "detach", NULL, 7343 "vdev=%s", vdpath); 7344 spa_strfree(vdpath); 7345 7346 /* 7347 * If this was the removal of the original device in a hot spare vdev, 7348 * then we want to go through and remove the device from the hot spare 7349 * list of every other pool. 7350 */ 7351 if (unspare) { 7352 spa_t *altspa = NULL; 7353 7354 mutex_enter(&spa_namespace_lock); 7355 while ((altspa = spa_next(altspa)) != NULL) { 7356 if (altspa->spa_state != POOL_STATE_ACTIVE || 7357 altspa == spa) 7358 continue; 7359 7360 spa_open_ref(altspa, FTAG); 7361 mutex_exit(&spa_namespace_lock); 7362 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7363 mutex_enter(&spa_namespace_lock); 7364 spa_close(altspa, FTAG); 7365 } 7366 mutex_exit(&spa_namespace_lock); 7367 7368 /* search the rest of the vdevs for spares to remove */ 7369 spa_vdev_resilver_done(spa); 7370 } 7371 7372 /* all done with the spa; OK to release */ 7373 mutex_enter(&spa_namespace_lock); 7374 spa_close(spa, FTAG); 7375 mutex_exit(&spa_namespace_lock); 7376 7377 return (error); 7378 } 7379 7380 static int 7381 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7382 list_t *vd_list) 7383 { 7384 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7385 7386 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7387 7388 /* Look up vdev and ensure it's a leaf. */ 7389 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7390 if (vd == NULL || vd->vdev_detached) { 7391 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7392 return (SET_ERROR(ENODEV)); 7393 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7394 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7395 return (SET_ERROR(EINVAL)); 7396 } else if (!vdev_writeable(vd)) { 7397 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7398 return (SET_ERROR(EROFS)); 7399 } 7400 mutex_enter(&vd->vdev_initialize_lock); 7401 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7402 7403 /* 7404 * When we activate an initialize action we check to see 7405 * if the vdev_initialize_thread is NULL. We do this instead 7406 * of using the vdev_initialize_state since there might be 7407 * a previous initialization process which has completed but 7408 * the thread is not exited. 
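 * As a rough guide to the outcomes below: POOL_INITIALIZE_START returns
 * EBUSY while an old thread is still winding down or the top-level vdev
 * is being removed; POOL_INITIALIZE_CANCEL returns ESRCH unless an
 * initialization is active or suspended; and POOL_INITIALIZE_SUSPEND
 * returns ESRCH unless one is currently active.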
7409 */ 7410 if (cmd_type == POOL_INITIALIZE_START && 7411 (vd->vdev_initialize_thread != NULL || 7412 vd->vdev_top->vdev_removing)) { 7413 mutex_exit(&vd->vdev_initialize_lock); 7414 return (SET_ERROR(EBUSY)); 7415 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7416 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7417 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7418 mutex_exit(&vd->vdev_initialize_lock); 7419 return (SET_ERROR(ESRCH)); 7420 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7421 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7422 mutex_exit(&vd->vdev_initialize_lock); 7423 return (SET_ERROR(ESRCH)); 7424 } 7425 7426 switch (cmd_type) { 7427 case POOL_INITIALIZE_START: 7428 vdev_initialize(vd); 7429 break; 7430 case POOL_INITIALIZE_CANCEL: 7431 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7432 break; 7433 case POOL_INITIALIZE_SUSPEND: 7434 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7435 break; 7436 default: 7437 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7438 } 7439 mutex_exit(&vd->vdev_initialize_lock); 7440 7441 return (0); 7442 } 7443 7444 int 7445 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7446 nvlist_t *vdev_errlist) 7447 { 7448 int total_errors = 0; 7449 list_t vd_list; 7450 7451 list_create(&vd_list, sizeof (vdev_t), 7452 offsetof(vdev_t, vdev_initialize_node)); 7453 7454 /* 7455 * We hold the namespace lock through the whole function 7456 * to prevent any changes to the pool while we're starting or 7457 * stopping initialization. The config and state locks are held so that 7458 * we can properly assess the vdev state before we commit to 7459 * the initializing operation. 7460 */ 7461 mutex_enter(&spa_namespace_lock); 7462 7463 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7464 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7465 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7466 7467 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7468 &vd_list); 7469 if (error != 0) { 7470 char guid_as_str[MAXNAMELEN]; 7471 7472 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7473 "%llu", (unsigned long long)vdev_guid); 7474 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7475 total_errors++; 7476 } 7477 } 7478 7479 /* Wait for all initialize threads to stop. */ 7480 vdev_initialize_stop_wait(spa, &vd_list); 7481 7482 /* Sync out the initializing state */ 7483 txg_wait_synced(spa->spa_dsl_pool, 0); 7484 mutex_exit(&spa_namespace_lock); 7485 7486 list_destroy(&vd_list); 7487 7488 return (total_errors); 7489 } 7490 7491 static int 7492 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7493 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 7494 { 7495 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7496 7497 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7498 7499 /* Look up vdev and ensure it's a leaf. 
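 * As with initialization, TRIM only applies to writeable, concrete
 * leaf vdevs; the device must also support TRIM (and secure TRIM
 * when a secure request is made).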
*/ 7500 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7501 if (vd == NULL || vd->vdev_detached) { 7502 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7503 return (SET_ERROR(ENODEV)); 7504 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7505 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7506 return (SET_ERROR(EINVAL)); 7507 } else if (!vdev_writeable(vd)) { 7508 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7509 return (SET_ERROR(EROFS)); 7510 } else if (!vd->vdev_has_trim) { 7511 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7512 return (SET_ERROR(EOPNOTSUPP)); 7513 } else if (secure && !vd->vdev_has_securetrim) { 7514 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7515 return (SET_ERROR(EOPNOTSUPP)); 7516 } 7517 mutex_enter(&vd->vdev_trim_lock); 7518 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7519 7520 /* 7521 * When we activate a TRIM action we check to see if the 7522 * vdev_trim_thread is NULL. We do this instead of using the 7523 * vdev_trim_state since there might be a previous TRIM process 7524 * which has completed but the thread is not exited. 7525 */ 7526 if (cmd_type == POOL_TRIM_START && 7527 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { 7528 mutex_exit(&vd->vdev_trim_lock); 7529 return (SET_ERROR(EBUSY)); 7530 } else if (cmd_type == POOL_TRIM_CANCEL && 7531 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 7532 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 7533 mutex_exit(&vd->vdev_trim_lock); 7534 return (SET_ERROR(ESRCH)); 7535 } else if (cmd_type == POOL_TRIM_SUSPEND && 7536 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 7537 mutex_exit(&vd->vdev_trim_lock); 7538 return (SET_ERROR(ESRCH)); 7539 } 7540 7541 switch (cmd_type) { 7542 case POOL_TRIM_START: 7543 vdev_trim(vd, rate, partial, secure); 7544 break; 7545 case POOL_TRIM_CANCEL: 7546 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 7547 break; 7548 case POOL_TRIM_SUSPEND: 7549 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 7550 break; 7551 default: 7552 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7553 } 7554 mutex_exit(&vd->vdev_trim_lock); 7555 7556 return (0); 7557 } 7558 7559 /* 7560 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 7561 * TRIM threads for each child vdev. These threads pass over all of the free 7562 * space in the vdev's metaslabs and issues TRIM commands for that space. 7563 */ 7564 int 7565 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 7566 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 7567 { 7568 int total_errors = 0; 7569 list_t vd_list; 7570 7571 list_create(&vd_list, sizeof (vdev_t), 7572 offsetof(vdev_t, vdev_trim_node)); 7573 7574 /* 7575 * We hold the namespace lock through the whole function 7576 * to prevent any changes to the pool while we're starting or 7577 * stopping TRIM. The config and state locks are held so that 7578 * we can properly assess the vdev state before we commit to 7579 * the TRIM operation. 
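 * Per-vdev failures are reported back through vdev_errlist, keyed by
 * GUID; the return value is the number of vdevs the request could
 * not be applied to.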
7580 */ 7581 mutex_enter(&spa_namespace_lock); 7582 7583 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7584 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7585 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7586 7587 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 7588 rate, partial, secure, &vd_list); 7589 if (error != 0) { 7590 char guid_as_str[MAXNAMELEN]; 7591 7592 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7593 "%llu", (unsigned long long)vdev_guid); 7594 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7595 total_errors++; 7596 } 7597 } 7598 7599 /* Wait for all TRIM threads to stop. */ 7600 vdev_trim_stop_wait(spa, &vd_list); 7601 7602 /* Sync out the TRIM state */ 7603 txg_wait_synced(spa->spa_dsl_pool, 0); 7604 mutex_exit(&spa_namespace_lock); 7605 7606 list_destroy(&vd_list); 7607 7608 return (total_errors); 7609 } 7610 7611 /* 7612 * Split a set of devices from their mirrors, and create a new pool from them. 7613 */ 7614 int 7615 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 7616 nvlist_t *props, boolean_t exp) 7617 { 7618 int error = 0; 7619 uint64_t txg, *glist; 7620 spa_t *newspa; 7621 uint_t c, children, lastlog; 7622 nvlist_t **child, *nvl, *tmp; 7623 dmu_tx_t *tx; 7624 const char *altroot = NULL; 7625 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 7626 boolean_t activate_slog; 7627 7628 ASSERT(spa_writeable(spa)); 7629 7630 txg = spa_vdev_enter(spa); 7631 7632 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7633 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7634 error = (spa_has_checkpoint(spa)) ? 7635 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7636 return (spa_vdev_exit(spa, NULL, txg, error)); 7637 } 7638 7639 /* clear the log and flush everything up to now */ 7640 activate_slog = spa_passivate_log(spa); 7641 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7642 error = spa_reset_logs(spa); 7643 txg = spa_vdev_config_enter(spa); 7644 7645 if (activate_slog) 7646 spa_activate_log(spa); 7647 7648 if (error != 0) 7649 return (spa_vdev_exit(spa, NULL, txg, error)); 7650 7651 /* check new spa name before going any further */ 7652 if (spa_lookup(newname) != NULL) 7653 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 7654 7655 /* 7656 * scan through all the children to ensure they're all mirrors 7657 */ 7658 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 7659 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 7660 &children) != 0) 7661 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7662 7663 /* first, check to ensure we've got the right child count */ 7664 rvd = spa->spa_root_vdev; 7665 lastlog = 0; 7666 for (c = 0; c < rvd->vdev_children; c++) { 7667 vdev_t *vd = rvd->vdev_child[c]; 7668 7669 /* don't count the holes & logs as children */ 7670 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 7671 !vdev_is_concrete(vd))) { 7672 if (lastlog == 0) 7673 lastlog = c; 7674 continue; 7675 } 7676 7677 lastlog = 0; 7678 } 7679 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 7680 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7681 7682 /* next, ensure no spare or cache devices are part of the split */ 7683 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 7684 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 7685 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7686 7687 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 7688 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 7689 7690 /* then, loop over each vdev and validate it */ 7691 for (c = 0; c < children; c++) { 7692 uint64_t is_hole = 0; 7693 7694 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 7695 &is_hole); 7696 7697 if (is_hole != 0) { 7698 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 7699 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 7700 continue; 7701 } else { 7702 error = SET_ERROR(EINVAL); 7703 break; 7704 } 7705 } 7706 7707 /* deal with indirect vdevs */ 7708 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 7709 &vdev_indirect_ops) 7710 continue; 7711 7712 /* which disk is going to be split? */ 7713 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 7714 &glist[c]) != 0) { 7715 error = SET_ERROR(EINVAL); 7716 break; 7717 } 7718 7719 /* look it up in the spa */ 7720 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 7721 if (vml[c] == NULL) { 7722 error = SET_ERROR(ENODEV); 7723 break; 7724 } 7725 7726 /* make sure there's nothing stopping the split */ 7727 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 7728 vml[c]->vdev_islog || 7729 !vdev_is_concrete(vml[c]) || 7730 vml[c]->vdev_isspare || 7731 vml[c]->vdev_isl2cache || 7732 !vdev_writeable(vml[c]) || 7733 vml[c]->vdev_children != 0 || 7734 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 7735 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 7736 error = SET_ERROR(EINVAL); 7737 break; 7738 } 7739 7740 if (vdev_dtl_required(vml[c]) || 7741 vdev_resilver_needed(vml[c], NULL, NULL)) { 7742 error = SET_ERROR(EBUSY); 7743 break; 7744 } 7745 7746 /* we need certain info from the top level */ 7747 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 7748 vml[c]->vdev_top->vdev_ms_array); 7749 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 7750 vml[c]->vdev_top->vdev_ms_shift); 7751 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 7752 vml[c]->vdev_top->vdev_asize); 7753 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 7754 vml[c]->vdev_top->vdev_ashift); 7755 7756 /* transfer per-vdev ZAPs */ 7757 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 7758 VERIFY0(nvlist_add_uint64(child[c], 7759 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 7760 7761 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 7762 VERIFY0(nvlist_add_uint64(child[c], 7763 ZPOOL_CONFIG_VDEV_TOP_ZAP, 7764 vml[c]->vdev_parent->vdev_top_zap)); 7765 } 7766 7767 if (error != 0) { 7768 kmem_free(vml, children * sizeof (vdev_t *)); 7769 kmem_free(glist, children * sizeof (uint64_t)); 7770 return (spa_vdev_exit(spa, NULL, txg, error)); 7771 } 7772 7773 /* stop writers from using the disks */ 7774 for (c = 0; c < children; c++) { 7775 if (vml[c] != NULL) 7776 vml[c]->vdev_offline = B_TRUE; 7777 } 7778 vdev_reopen(spa->spa_root_vdev); 7779 7780 /* 7781 * Temporarily record the splitting vdevs in the spa config. This 7782 * will disappear once the config is regenerated. 
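 * The GUID list is wrapped in an nvlist stored under
 * ZPOOL_CONFIG_SPLIT in the pool config, and the root vdev is
 * dirtied so the record is written out.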
7783 */ 7784 nvl = fnvlist_alloc(); 7785 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 7786 kmem_free(glist, children * sizeof (uint64_t)); 7787 7788 mutex_enter(&spa->spa_props_lock); 7789 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 7790 mutex_exit(&spa->spa_props_lock); 7791 spa->spa_config_splitting = nvl; 7792 vdev_config_dirty(spa->spa_root_vdev); 7793 7794 /* configure and create the new pool */ 7795 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 7796 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 7797 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 7798 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 7799 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 7800 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 7801 spa_generate_guid(NULL)); 7802 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 7803 (void) nvlist_lookup_string(props, 7804 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 7805 7806 /* add the new pool to the namespace */ 7807 newspa = spa_add(newname, config, altroot); 7808 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 7809 newspa->spa_config_txg = spa->spa_config_txg; 7810 spa_set_log_state(newspa, SPA_LOG_CLEAR); 7811 7812 /* release the spa config lock, retaining the namespace lock */ 7813 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7814 7815 if (zio_injection_enabled) 7816 zio_handle_panic_injection(spa, FTAG, 1); 7817 7818 spa_activate(newspa, spa_mode_global); 7819 spa_async_suspend(newspa); 7820 7821 /* 7822 * Temporarily stop the initializing and TRIM activity. We set the 7823 * state to ACTIVE so that we know to resume initializing or TRIM 7824 * once the split has completed. 7825 */ 7826 list_t vd_initialize_list; 7827 list_create(&vd_initialize_list, sizeof (vdev_t), 7828 offsetof(vdev_t, vdev_initialize_node)); 7829 7830 list_t vd_trim_list; 7831 list_create(&vd_trim_list, sizeof (vdev_t), 7832 offsetof(vdev_t, vdev_trim_node)); 7833 7834 for (c = 0; c < children; c++) { 7835 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7836 mutex_enter(&vml[c]->vdev_initialize_lock); 7837 vdev_initialize_stop(vml[c], 7838 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 7839 mutex_exit(&vml[c]->vdev_initialize_lock); 7840 7841 mutex_enter(&vml[c]->vdev_trim_lock); 7842 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 7843 mutex_exit(&vml[c]->vdev_trim_lock); 7844 } 7845 } 7846 7847 vdev_initialize_stop_wait(spa, &vd_initialize_list); 7848 vdev_trim_stop_wait(spa, &vd_trim_list); 7849 7850 list_destroy(&vd_initialize_list); 7851 list_destroy(&vd_trim_list); 7852 7853 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 7854 newspa->spa_is_splitting = B_TRUE; 7855 7856 /* create the new pool from the disks of the original pool */ 7857 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 7858 if (error) 7859 goto out; 7860 7861 /* if that worked, generate a real config for the new pool */ 7862 if (newspa->spa_root_vdev != NULL) { 7863 newspa->spa_config_splitting = fnvlist_alloc(); 7864 fnvlist_add_uint64(newspa->spa_config_splitting, 7865 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 7866 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 7867 B_TRUE)); 7868 } 7869 7870 /* set the props */ 7871 if (props != NULL) { 7872 spa_configfile_set(newspa, props, B_FALSE); 7873 error = spa_prop_set(newspa, props); 7874 if (error) 7875 goto out; 7876 } 7877 7878 /* flush everything */ 7879 txg = 
spa_vdev_config_enter(newspa); 7880 vdev_config_dirty(newspa->spa_root_vdev); 7881 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 7882 7883 if (zio_injection_enabled) 7884 zio_handle_panic_injection(spa, FTAG, 2); 7885 7886 spa_async_resume(newspa); 7887 7888 /* finally, update the original pool's config */ 7889 txg = spa_vdev_config_enter(spa); 7890 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 7891 error = dmu_tx_assign(tx, TXG_WAIT); 7892 if (error != 0) 7893 dmu_tx_abort(tx); 7894 for (c = 0; c < children; c++) { 7895 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7896 vdev_t *tvd = vml[c]->vdev_top; 7897 7898 /* 7899 * Need to be sure the detachable VDEV is not 7900 * on any *other* txg's DTL list to prevent it 7901 * from being accessed after it's freed. 7902 */ 7903 for (int t = 0; t < TXG_SIZE; t++) { 7904 (void) txg_list_remove_this( 7905 &tvd->vdev_dtl_list, vml[c], t); 7906 } 7907 7908 vdev_split(vml[c]); 7909 if (error == 0) 7910 spa_history_log_internal(spa, "detach", tx, 7911 "vdev=%s", vml[c]->vdev_path); 7912 7913 vdev_free(vml[c]); 7914 } 7915 } 7916 spa->spa_avz_action = AVZ_ACTION_REBUILD; 7917 vdev_config_dirty(spa->spa_root_vdev); 7918 spa->spa_config_splitting = NULL; 7919 nvlist_free(nvl); 7920 if (error == 0) 7921 dmu_tx_commit(tx); 7922 (void) spa_vdev_exit(spa, NULL, txg, 0); 7923 7924 if (zio_injection_enabled) 7925 zio_handle_panic_injection(spa, FTAG, 3); 7926 7927 /* split is complete; log a history record */ 7928 spa_history_log_internal(newspa, "split", NULL, 7929 "from pool %s", spa_name(spa)); 7930 7931 newspa->spa_is_splitting = B_FALSE; 7932 kmem_free(vml, children * sizeof (vdev_t *)); 7933 7934 /* if we're not going to mount the filesystems in userland, export */ 7935 if (exp) 7936 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 7937 B_FALSE, B_FALSE); 7938 7939 return (error); 7940 7941 out: 7942 spa_unload(newspa); 7943 spa_deactivate(newspa); 7944 spa_remove(newspa); 7945 7946 txg = spa_vdev_config_enter(spa); 7947 7948 /* re-online all offlined disks */ 7949 for (c = 0; c < children; c++) { 7950 if (vml[c] != NULL) 7951 vml[c]->vdev_offline = B_FALSE; 7952 } 7953 7954 /* restart initializing or trimming disks as necessary */ 7955 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 7956 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 7957 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 7958 7959 vdev_reopen(spa->spa_root_vdev); 7960 7961 nvlist_free(spa->spa_config_splitting); 7962 spa->spa_config_splitting = NULL; 7963 (void) spa_vdev_exit(spa, NULL, txg, error); 7964 7965 kmem_free(vml, children * sizeof (vdev_t *)); 7966 return (error); 7967 } 7968 7969 /* 7970 * Find any device that's done replacing, or a vdev marked 'unspare' that's 7971 * currently spared, so we can detach it. 7972 */ 7973 static vdev_t * 7974 spa_vdev_resilver_done_hunt(vdev_t *vd) 7975 { 7976 vdev_t *newvd, *oldvd; 7977 7978 for (int c = 0; c < vd->vdev_children; c++) { 7979 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 7980 if (oldvd != NULL) 7981 return (oldvd); 7982 } 7983 7984 /* 7985 * Check for a completed replacement. We always consider the first 7986 * vdev in the list to be the oldest vdev, and the last one to be 7987 * the newest (see spa_vdev_attach() for how that works). In 7988 * the case where the newest vdev is faulted, we will not automatically 7989 * remove it after a resilver completes. This is OK as it will require 7990 * user intervention to determine which disk the admin wishes to keep. 
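 * Concretely, the old vdev is only offered for detach once the new
 * vdev has empty DTL_MISSING and DTL_OUTAGE trees and the old vdev
 * is no longer required.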
7991 */ 7992 if (vd->vdev_ops == &vdev_replacing_ops) { 7993 ASSERT(vd->vdev_children > 1); 7994 7995 newvd = vd->vdev_child[vd->vdev_children - 1]; 7996 oldvd = vd->vdev_child[0]; 7997 7998 if (vdev_dtl_empty(newvd, DTL_MISSING) && 7999 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8000 !vdev_dtl_required(oldvd)) 8001 return (oldvd); 8002 } 8003 8004 /* 8005 * Check for a completed resilver with the 'unspare' flag set. 8006 * Also potentially update faulted state. 8007 */ 8008 if (vd->vdev_ops == &vdev_spare_ops) { 8009 vdev_t *first = vd->vdev_child[0]; 8010 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8011 8012 if (last->vdev_unspare) { 8013 oldvd = first; 8014 newvd = last; 8015 } else if (first->vdev_unspare) { 8016 oldvd = last; 8017 newvd = first; 8018 } else { 8019 oldvd = NULL; 8020 } 8021 8022 if (oldvd != NULL && 8023 vdev_dtl_empty(newvd, DTL_MISSING) && 8024 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8025 !vdev_dtl_required(oldvd)) 8026 return (oldvd); 8027 8028 vdev_propagate_state(vd); 8029 8030 /* 8031 * If there are more than two spares attached to a disk, 8032 * and those spares are not required, then we want to 8033 * attempt to free them up now so that they can be used 8034 * by other pools. Once we're back down to a single 8035 * disk+spare, we stop removing them. 8036 */ 8037 if (vd->vdev_children > 2) { 8038 newvd = vd->vdev_child[1]; 8039 8040 if (newvd->vdev_isspare && last->vdev_isspare && 8041 vdev_dtl_empty(last, DTL_MISSING) && 8042 vdev_dtl_empty(last, DTL_OUTAGE) && 8043 !vdev_dtl_required(newvd)) 8044 return (newvd); 8045 } 8046 } 8047 8048 return (NULL); 8049 } 8050 8051 static void 8052 spa_vdev_resilver_done(spa_t *spa) 8053 { 8054 vdev_t *vd, *pvd, *ppvd; 8055 uint64_t guid, sguid, pguid, ppguid; 8056 8057 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8058 8059 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8060 pvd = vd->vdev_parent; 8061 ppvd = pvd->vdev_parent; 8062 guid = vd->vdev_guid; 8063 pguid = pvd->vdev_guid; 8064 ppguid = ppvd->vdev_guid; 8065 sguid = 0; 8066 /* 8067 * If we have just finished replacing a hot spared device, then 8068 * we need to detach the parent's first child (the original hot 8069 * spare) as well. 8070 */ 8071 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8072 ppvd->vdev_children == 2) { 8073 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8074 sguid = ppvd->vdev_child[1]->vdev_guid; 8075 } 8076 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8077 8078 spa_config_exit(spa, SCL_ALL, FTAG); 8079 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8080 return; 8081 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8082 return; 8083 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8084 } 8085 8086 spa_config_exit(spa, SCL_ALL, FTAG); 8087 8088 /* 8089 * If a detach was not performed above replace waiters will not have 8090 * been notified. In which case we must do so now. 8091 */ 8092 spa_notify_waiters(spa); 8093 } 8094 8095 /* 8096 * Update the stored path or FRU for this vdev. 
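 * The vdev is only queued for a config sync when the value actually
 * changes; setting an identical path or FRU is treated as a no-op.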
8097 */ 8098 static int 8099 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8100 boolean_t ispath) 8101 { 8102 vdev_t *vd; 8103 boolean_t sync = B_FALSE; 8104 8105 ASSERT(spa_writeable(spa)); 8106 8107 spa_vdev_state_enter(spa, SCL_ALL); 8108 8109 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8110 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8111 8112 if (!vd->vdev_ops->vdev_op_leaf) 8113 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8114 8115 if (ispath) { 8116 if (strcmp(value, vd->vdev_path) != 0) { 8117 spa_strfree(vd->vdev_path); 8118 vd->vdev_path = spa_strdup(value); 8119 sync = B_TRUE; 8120 } 8121 } else { 8122 if (vd->vdev_fru == NULL) { 8123 vd->vdev_fru = spa_strdup(value); 8124 sync = B_TRUE; 8125 } else if (strcmp(value, vd->vdev_fru) != 0) { 8126 spa_strfree(vd->vdev_fru); 8127 vd->vdev_fru = spa_strdup(value); 8128 sync = B_TRUE; 8129 } 8130 } 8131 8132 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8133 } 8134 8135 int 8136 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8137 { 8138 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8139 } 8140 8141 int 8142 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8143 { 8144 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8145 } 8146 8147 /* 8148 * ========================================================================== 8149 * SPA Scanning 8150 * ========================================================================== 8151 */ 8152 int 8153 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8154 { 8155 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8156 8157 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8158 return (SET_ERROR(EBUSY)); 8159 8160 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8161 } 8162 8163 int 8164 spa_scan_stop(spa_t *spa) 8165 { 8166 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8167 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8168 return (SET_ERROR(EBUSY)); 8169 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8170 } 8171 8172 int 8173 spa_scan(spa_t *spa, pool_scan_func_t func) 8174 { 8175 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8176 8177 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8178 return (SET_ERROR(ENOTSUP)); 8179 8180 if (func == POOL_SCAN_RESILVER && 8181 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8182 return (SET_ERROR(ENOTSUP)); 8183 8184 /* 8185 * If a resilver was requested, but there is no DTL on a 8186 * writeable leaf device, we have nothing to do. 8187 */ 8188 if (func == POOL_SCAN_RESILVER && 8189 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8190 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8191 return (0); 8192 } 8193 8194 return (dsl_scan(spa->spa_dsl_pool, func)); 8195 } 8196 8197 /* 8198 * ========================================================================== 8199 * SPA async task processing 8200 * ========================================================================== 8201 */ 8202 8203 static void 8204 spa_async_remove(spa_t *spa, vdev_t *vd) 8205 { 8206 if (vd->vdev_remove_wanted) { 8207 vd->vdev_remove_wanted = B_FALSE; 8208 vd->vdev_delayed_close = B_FALSE; 8209 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8210 8211 /* 8212 * We want to clear the stats, but we don't want to do a full 8213 * vdev_clear() as that will cause us to throw away 8214 * degraded/faulted state as well as attempt to reopen the 8215 * device, all of which is a waste. 
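 * Instead, just clear the read, write, and checksum error counters
 * directly and mark the top-level vdev state dirty.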
8216 */ 8217 vd->vdev_stat.vs_read_errors = 0; 8218 vd->vdev_stat.vs_write_errors = 0; 8219 vd->vdev_stat.vs_checksum_errors = 0; 8220 8221 vdev_state_dirty(vd->vdev_top); 8222 8223 /* Tell userspace that the vdev is gone. */ 8224 zfs_post_remove(spa, vd); 8225 } 8226 8227 for (int c = 0; c < vd->vdev_children; c++) 8228 spa_async_remove(spa, vd->vdev_child[c]); 8229 } 8230 8231 static void 8232 spa_async_probe(spa_t *spa, vdev_t *vd) 8233 { 8234 if (vd->vdev_probe_wanted) { 8235 vd->vdev_probe_wanted = B_FALSE; 8236 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8237 } 8238 8239 for (int c = 0; c < vd->vdev_children; c++) 8240 spa_async_probe(spa, vd->vdev_child[c]); 8241 } 8242 8243 static void 8244 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8245 { 8246 if (!spa->spa_autoexpand) 8247 return; 8248 8249 for (int c = 0; c < vd->vdev_children; c++) { 8250 vdev_t *cvd = vd->vdev_child[c]; 8251 spa_async_autoexpand(spa, cvd); 8252 } 8253 8254 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8255 return; 8256 8257 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8258 } 8259 8260 static __attribute__((noreturn)) void 8261 spa_async_thread(void *arg) 8262 { 8263 spa_t *spa = (spa_t *)arg; 8264 dsl_pool_t *dp = spa->spa_dsl_pool; 8265 int tasks; 8266 8267 ASSERT(spa->spa_sync_on); 8268 8269 mutex_enter(&spa->spa_async_lock); 8270 tasks = spa->spa_async_tasks; 8271 spa->spa_async_tasks = 0; 8272 mutex_exit(&spa->spa_async_lock); 8273 8274 /* 8275 * See if the config needs to be updated. 8276 */ 8277 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8278 uint64_t old_space, new_space; 8279 8280 mutex_enter(&spa_namespace_lock); 8281 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8282 old_space += metaslab_class_get_space(spa_special_class(spa)); 8283 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8284 old_space += metaslab_class_get_space( 8285 spa_embedded_log_class(spa)); 8286 8287 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8288 8289 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8290 new_space += metaslab_class_get_space(spa_special_class(spa)); 8291 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8292 new_space += metaslab_class_get_space( 8293 spa_embedded_log_class(spa)); 8294 mutex_exit(&spa_namespace_lock); 8295 8296 /* 8297 * If the pool grew as a result of the config update, 8298 * then log an internal history event. 8299 */ 8300 if (new_space != old_space) { 8301 spa_history_log_internal(spa, "vdev online", NULL, 8302 "pool '%s' size: %llu(+%llu)", 8303 spa_name(spa), (u_longlong_t)new_space, 8304 (u_longlong_t)(new_space - old_space)); 8305 } 8306 } 8307 8308 /* 8309 * See if any devices need to be marked REMOVED. 8310 */ 8311 if (tasks & SPA_ASYNC_REMOVE) { 8312 spa_vdev_state_enter(spa, SCL_NONE); 8313 spa_async_remove(spa, spa->spa_root_vdev); 8314 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8315 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8316 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8317 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8318 (void) spa_vdev_state_exit(spa, NULL, 0); 8319 } 8320 8321 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8322 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8323 spa_async_autoexpand(spa, spa->spa_root_vdev); 8324 spa_config_exit(spa, SCL_CONFIG, FTAG); 8325 } 8326 8327 /* 8328 * See if any devices need to be probed. 
8329 */ 8330 if (tasks & SPA_ASYNC_PROBE) { 8331 spa_vdev_state_enter(spa, SCL_NONE); 8332 spa_async_probe(spa, spa->spa_root_vdev); 8333 (void) spa_vdev_state_exit(spa, NULL, 0); 8334 } 8335 8336 /* 8337 * If any devices are done replacing, detach them. 8338 */ 8339 if (tasks & SPA_ASYNC_RESILVER_DONE || 8340 tasks & SPA_ASYNC_REBUILD_DONE || 8341 tasks & SPA_ASYNC_DETACH_SPARE) { 8342 spa_vdev_resilver_done(spa); 8343 } 8344 8345 /* 8346 * Kick off a resilver. 8347 */ 8348 if (tasks & SPA_ASYNC_RESILVER && 8349 !vdev_rebuild_active(spa->spa_root_vdev) && 8350 (!dsl_scan_resilvering(dp) || 8351 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8352 dsl_scan_restart_resilver(dp, 0); 8353 8354 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8355 mutex_enter(&spa_namespace_lock); 8356 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8357 vdev_initialize_restart(spa->spa_root_vdev); 8358 spa_config_exit(spa, SCL_CONFIG, FTAG); 8359 mutex_exit(&spa_namespace_lock); 8360 } 8361 8362 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8363 mutex_enter(&spa_namespace_lock); 8364 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8365 vdev_trim_restart(spa->spa_root_vdev); 8366 spa_config_exit(spa, SCL_CONFIG, FTAG); 8367 mutex_exit(&spa_namespace_lock); 8368 } 8369 8370 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8371 mutex_enter(&spa_namespace_lock); 8372 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8373 vdev_autotrim_restart(spa); 8374 spa_config_exit(spa, SCL_CONFIG, FTAG); 8375 mutex_exit(&spa_namespace_lock); 8376 } 8377 8378 /* 8379 * Kick off L2 cache whole device TRIM. 8380 */ 8381 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8382 mutex_enter(&spa_namespace_lock); 8383 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8384 vdev_trim_l2arc(spa); 8385 spa_config_exit(spa, SCL_CONFIG, FTAG); 8386 mutex_exit(&spa_namespace_lock); 8387 } 8388 8389 /* 8390 * Kick off L2 cache rebuilding. 8391 */ 8392 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8393 mutex_enter(&spa_namespace_lock); 8394 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8395 l2arc_spa_rebuild_start(spa); 8396 spa_config_exit(spa, SCL_L2ARC, FTAG); 8397 mutex_exit(&spa_namespace_lock); 8398 } 8399 8400 /* 8401 * Let the world know that we're done. 
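 * Clearing spa_async_thread and broadcasting spa_async_cv is what
 * spa_async_suspend() waits on before it can return.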
8402 */ 8403 mutex_enter(&spa->spa_async_lock); 8404 spa->spa_async_thread = NULL; 8405 cv_broadcast(&spa->spa_async_cv); 8406 mutex_exit(&spa->spa_async_lock); 8407 thread_exit(); 8408 } 8409 8410 void 8411 spa_async_suspend(spa_t *spa) 8412 { 8413 mutex_enter(&spa->spa_async_lock); 8414 spa->spa_async_suspended++; 8415 while (spa->spa_async_thread != NULL) 8416 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8417 mutex_exit(&spa->spa_async_lock); 8418 8419 spa_vdev_remove_suspend(spa); 8420 8421 zthr_t *condense_thread = spa->spa_condense_zthr; 8422 if (condense_thread != NULL) 8423 zthr_cancel(condense_thread); 8424 8425 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8426 if (discard_thread != NULL) 8427 zthr_cancel(discard_thread); 8428 8429 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8430 if (ll_delete_thread != NULL) 8431 zthr_cancel(ll_delete_thread); 8432 8433 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8434 if (ll_condense_thread != NULL) 8435 zthr_cancel(ll_condense_thread); 8436 } 8437 8438 void 8439 spa_async_resume(spa_t *spa) 8440 { 8441 mutex_enter(&spa->spa_async_lock); 8442 ASSERT(spa->spa_async_suspended != 0); 8443 spa->spa_async_suspended--; 8444 mutex_exit(&spa->spa_async_lock); 8445 spa_restart_removal(spa); 8446 8447 zthr_t *condense_thread = spa->spa_condense_zthr; 8448 if (condense_thread != NULL) 8449 zthr_resume(condense_thread); 8450 8451 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8452 if (discard_thread != NULL) 8453 zthr_resume(discard_thread); 8454 8455 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8456 if (ll_delete_thread != NULL) 8457 zthr_resume(ll_delete_thread); 8458 8459 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8460 if (ll_condense_thread != NULL) 8461 zthr_resume(ll_condense_thread); 8462 } 8463 8464 static boolean_t 8465 spa_async_tasks_pending(spa_t *spa) 8466 { 8467 uint_t non_config_tasks; 8468 uint_t config_task; 8469 boolean_t config_task_suspended; 8470 8471 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 8472 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 8473 if (spa->spa_ccw_fail_time == 0) { 8474 config_task_suspended = B_FALSE; 8475 } else { 8476 config_task_suspended = 8477 (gethrtime() - spa->spa_ccw_fail_time) < 8478 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 8479 } 8480 8481 return (non_config_tasks || (config_task && !config_task_suspended)); 8482 } 8483 8484 static void 8485 spa_async_dispatch(spa_t *spa) 8486 { 8487 mutex_enter(&spa->spa_async_lock); 8488 if (spa_async_tasks_pending(spa) && 8489 !spa->spa_async_suspended && 8490 spa->spa_async_thread == NULL) 8491 spa->spa_async_thread = thread_create(NULL, 0, 8492 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 8493 mutex_exit(&spa->spa_async_lock); 8494 } 8495 8496 void 8497 spa_async_request(spa_t *spa, int task) 8498 { 8499 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 8500 mutex_enter(&spa->spa_async_lock); 8501 spa->spa_async_tasks |= task; 8502 mutex_exit(&spa->spa_async_lock); 8503 } 8504 8505 int 8506 spa_async_tasks(spa_t *spa) 8507 { 8508 return (spa->spa_async_tasks); 8509 } 8510 8511 /* 8512 * ========================================================================== 8513 * SPA syncing routines 8514 * ========================================================================== 8515 */ 8516 8517 8518 static int 8519 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8520 dmu_tx_t *tx) 8521 { 8522 
bpobj_t *bpo = arg; 8523 bpobj_enqueue(bpo, bp, bp_freed, tx); 8524 return (0); 8525 } 8526 8527 int 8528 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8529 { 8530 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 8531 } 8532 8533 int 8534 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8535 { 8536 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 8537 } 8538 8539 static int 8540 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8541 { 8542 zio_t *pio = arg; 8543 8544 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 8545 pio->io_flags)); 8546 return (0); 8547 } 8548 8549 static int 8550 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8551 dmu_tx_t *tx) 8552 { 8553 ASSERT(!bp_freed); 8554 return (spa_free_sync_cb(arg, bp, tx)); 8555 } 8556 8557 /* 8558 * Note: this simple function is not inlined to make it easier to dtrace the 8559 * amount of time spent syncing frees. 8560 */ 8561 static void 8562 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 8563 { 8564 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8565 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 8566 VERIFY(zio_wait(zio) == 0); 8567 } 8568 8569 /* 8570 * Note: this simple function is not inlined to make it easier to dtrace the 8571 * amount of time spent syncing deferred frees. 8572 */ 8573 static void 8574 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 8575 { 8576 if (spa_sync_pass(spa) != 1) 8577 return; 8578 8579 /* 8580 * Note: 8581 * If the log space map feature is active, we stop deferring 8582 * frees to the next TXG and therefore running this function 8583 * would be considered a no-op as spa_deferred_bpobj should 8584 * not have any entries. 8585 * 8586 * That said we run this function anyway (instead of returning 8587 * immediately) for the edge-case scenario where we just 8588 * activated the log space map feature in this TXG but we have 8589 * deferred frees from the previous TXG. 8590 */ 8591 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8592 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 8593 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 8594 VERIFY0(zio_wait(zio)); 8595 } 8596 8597 static void 8598 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 8599 { 8600 char *packed = NULL; 8601 size_t bufsize; 8602 size_t nvsize = 0; 8603 dmu_buf_t *db; 8604 8605 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 8606 8607 /* 8608 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 8609 * information. This avoids the dmu_buf_will_dirty() path and 8610 * saves us a pre-read to get data we don't actually care about. 8611 */ 8612 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 8613 packed = vmem_alloc(bufsize, KM_SLEEP); 8614 8615 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 8616 KM_SLEEP) == 0); 8617 memset(packed + nvsize, 0, bufsize - nvsize); 8618 8619 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 8620 8621 vmem_free(packed, bufsize); 8622 8623 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 8624 dmu_buf_will_dirty(db, tx); 8625 *(uint64_t *)db->db_data = nvsize; 8626 dmu_buf_rele(db, FTAG); 8627 } 8628 8629 static void 8630 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 8631 const char *config, const char *entry) 8632 { 8633 nvlist_t *nvroot; 8634 nvlist_t **list; 8635 int i; 8636 8637 if (!sav->sav_sync) 8638 return; 8639 8640 /* 8641 * Update the MOS nvlist describing the list of available devices. 
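 * These are the hot spares and L2ARC cache devices.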
8642 * spa_validate_aux() will have already made sure this nvlist is 8643 * valid and the vdevs are labeled appropriately. 8644 */ 8645 if (sav->sav_object == 0) { 8646 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 8647 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 8648 sizeof (uint64_t), tx); 8649 VERIFY(zap_update(spa->spa_meta_objset, 8650 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 8651 &sav->sav_object, tx) == 0); 8652 } 8653 8654 nvroot = fnvlist_alloc(); 8655 if (sav->sav_count == 0) { 8656 fnvlist_add_nvlist_array(nvroot, config, 8657 (const nvlist_t * const *)NULL, 0); 8658 } else { 8659 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 8660 for (i = 0; i < sav->sav_count; i++) 8661 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 8662 B_FALSE, VDEV_CONFIG_L2CACHE); 8663 fnvlist_add_nvlist_array(nvroot, config, 8664 (const nvlist_t * const *)list, sav->sav_count); 8665 for (i = 0; i < sav->sav_count; i++) 8666 nvlist_free(list[i]); 8667 kmem_free(list, sav->sav_count * sizeof (void *)); 8668 } 8669 8670 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 8671 nvlist_free(nvroot); 8672 8673 sav->sav_sync = B_FALSE; 8674 } 8675 8676 /* 8677 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 8678 * The all-vdev ZAP must be empty. 8679 */ 8680 static void 8681 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 8682 { 8683 spa_t *spa = vd->vdev_spa; 8684 8685 if (vd->vdev_root_zap != 0 && 8686 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 8687 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8688 vd->vdev_root_zap, tx)); 8689 } 8690 if (vd->vdev_top_zap != 0) { 8691 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8692 vd->vdev_top_zap, tx)); 8693 } 8694 if (vd->vdev_leaf_zap != 0) { 8695 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8696 vd->vdev_leaf_zap, tx)); 8697 } 8698 for (uint64_t i = 0; i < vd->vdev_children; i++) { 8699 spa_avz_build(vd->vdev_child[i], avz, tx); 8700 } 8701 } 8702 8703 static void 8704 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 8705 { 8706 nvlist_t *config; 8707 8708 /* 8709 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 8710 * its config may not be dirty but we still need to build per-vdev ZAPs. 8711 * Similarly, if the pool is being assembled (e.g. after a split), we 8712 * need to rebuild the AVZ although the config may not be dirty. 
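 * The early return below therefore only happens when the config is
 * clean and no AVZ action is pending.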
8713 */ 8714 if (list_is_empty(&spa->spa_config_dirty_list) && 8715 spa->spa_avz_action == AVZ_ACTION_NONE) 8716 return; 8717 8718 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8719 8720 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 8721 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 8722 spa->spa_all_vdev_zaps != 0); 8723 8724 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 8725 /* Make and build the new AVZ */ 8726 uint64_t new_avz = zap_create(spa->spa_meta_objset, 8727 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 8728 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 8729 8730 /* Diff old AVZ with new one */ 8731 zap_cursor_t zc; 8732 zap_attribute_t za; 8733 8734 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8735 spa->spa_all_vdev_zaps); 8736 zap_cursor_retrieve(&zc, &za) == 0; 8737 zap_cursor_advance(&zc)) { 8738 uint64_t vdzap = za.za_first_integer; 8739 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 8740 vdzap) == ENOENT) { 8741 /* 8742 * ZAP is listed in old AVZ but not in new one; 8743 * destroy it 8744 */ 8745 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 8746 tx)); 8747 } 8748 } 8749 8750 zap_cursor_fini(&zc); 8751 8752 /* Destroy the old AVZ */ 8753 VERIFY0(zap_destroy(spa->spa_meta_objset, 8754 spa->spa_all_vdev_zaps, tx)); 8755 8756 /* Replace the old AVZ in the dir obj with the new one */ 8757 VERIFY0(zap_update(spa->spa_meta_objset, 8758 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 8759 sizeof (new_avz), 1, &new_avz, tx)); 8760 8761 spa->spa_all_vdev_zaps = new_avz; 8762 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 8763 zap_cursor_t zc; 8764 zap_attribute_t za; 8765 8766 /* Walk through the AVZ and destroy all listed ZAPs */ 8767 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8768 spa->spa_all_vdev_zaps); 8769 zap_cursor_retrieve(&zc, &za) == 0; 8770 zap_cursor_advance(&zc)) { 8771 uint64_t zap = za.za_first_integer; 8772 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 8773 } 8774 8775 zap_cursor_fini(&zc); 8776 8777 /* Destroy and unlink the AVZ itself */ 8778 VERIFY0(zap_destroy(spa->spa_meta_objset, 8779 spa->spa_all_vdev_zaps, tx)); 8780 VERIFY0(zap_remove(spa->spa_meta_objset, 8781 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 8782 spa->spa_all_vdev_zaps = 0; 8783 } 8784 8785 if (spa->spa_all_vdev_zaps == 0) { 8786 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 8787 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 8788 DMU_POOL_VDEV_ZAP_MAP, tx); 8789 } 8790 spa->spa_avz_action = AVZ_ACTION_NONE; 8791 8792 /* Create ZAPs for vdevs that don't have them. */ 8793 vdev_construct_zaps(spa->spa_root_vdev, tx); 8794 8795 config = spa_config_generate(spa, spa->spa_root_vdev, 8796 dmu_tx_get_txg(tx), B_FALSE); 8797 8798 /* 8799 * If we're upgrading the spa version then make sure that 8800 * the config object gets updated with the correct version. 8801 */ 8802 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 8803 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 8804 spa->spa_uberblock.ub_version); 8805 8806 spa_config_exit(spa, SCL_STATE, FTAG); 8807 8808 nvlist_free(spa->spa_config_syncing); 8809 spa->spa_config_syncing = config; 8810 8811 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 8812 } 8813 8814 static void 8815 spa_sync_version(void *arg, dmu_tx_t *tx) 8816 { 8817 uint64_t *versionp = arg; 8818 uint64_t version = *versionp; 8819 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8820 8821 /* 8822 * Setting the version is special cased when first creating the pool. 
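 * This sync task therefore only runs when upgrading an existing
 * pool; the requested version must be supported and can never move
 * backwards.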
8823 */ 8824 ASSERT(tx->tx_txg != TXG_INITIAL); 8825 8826 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 8827 ASSERT(version >= spa_version(spa)); 8828 8829 spa->spa_uberblock.ub_version = version; 8830 vdev_config_dirty(spa->spa_root_vdev); 8831 spa_history_log_internal(spa, "set", tx, "version=%lld", 8832 (longlong_t)version); 8833 } 8834 8835 /* 8836 * Set zpool properties. 8837 */ 8838 static void 8839 spa_sync_props(void *arg, dmu_tx_t *tx) 8840 { 8841 nvlist_t *nvp = arg; 8842 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8843 objset_t *mos = spa->spa_meta_objset; 8844 nvpair_t *elem = NULL; 8845 8846 mutex_enter(&spa->spa_props_lock); 8847 8848 while ((elem = nvlist_next_nvpair(nvp, elem))) { 8849 uint64_t intval; 8850 const char *strval, *fname; 8851 zpool_prop_t prop; 8852 const char *propname; 8853 const char *elemname = nvpair_name(elem); 8854 zprop_type_t proptype; 8855 spa_feature_t fid; 8856 8857 switch (prop = zpool_name_to_prop(elemname)) { 8858 case ZPOOL_PROP_VERSION: 8859 intval = fnvpair_value_uint64(elem); 8860 /* 8861 * The version is synced separately before other 8862 * properties and should be correct by now. 8863 */ 8864 ASSERT3U(spa_version(spa), >=, intval); 8865 break; 8866 8867 case ZPOOL_PROP_ALTROOT: 8868 /* 8869 * 'altroot' is a non-persistent property. It should 8870 * have been set temporarily at creation or import time. 8871 */ 8872 ASSERT(spa->spa_root != NULL); 8873 break; 8874 8875 case ZPOOL_PROP_READONLY: 8876 case ZPOOL_PROP_CACHEFILE: 8877 /* 8878 * 'readonly' and 'cachefile' are also non-persistent 8879 * properties. 8880 */ 8881 break; 8882 case ZPOOL_PROP_COMMENT: 8883 strval = fnvpair_value_string(elem); 8884 if (spa->spa_comment != NULL) 8885 spa_strfree(spa->spa_comment); 8886 spa->spa_comment = spa_strdup(strval); 8887 /* 8888 * We need to dirty the configuration on all the vdevs 8889 * so that their labels get updated. We also need to 8890 * update the cache file to keep it in sync with the 8891 * MOS version. It's unnecessary to do this for pool 8892 * creation since the vdev's configuration has already 8893 * been dirtied. 8894 */ 8895 if (tx->tx_txg != TXG_INITIAL) { 8896 vdev_config_dirty(spa->spa_root_vdev); 8897 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8898 } 8899 spa_history_log_internal(spa, "set", tx, 8900 "%s=%s", elemname, strval); 8901 break; 8902 case ZPOOL_PROP_COMPATIBILITY: 8903 strval = fnvpair_value_string(elem); 8904 if (spa->spa_compatibility != NULL) 8905 spa_strfree(spa->spa_compatibility); 8906 spa->spa_compatibility = spa_strdup(strval); 8907 /* 8908 * Dirty the configuration on vdevs as above. 8909 */ 8910 if (tx->tx_txg != TXG_INITIAL) { 8911 vdev_config_dirty(spa->spa_root_vdev); 8912 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8913 } 8914 8915 spa_history_log_internal(spa, "set", tx, 8916 "%s=%s", nvpair_name(elem), strval); 8917 break; 8918 8919 case ZPOOL_PROP_INVAL: 8920 if (zpool_prop_feature(elemname)) { 8921 fname = strchr(elemname, '@') + 1; 8922 VERIFY0(zfeature_lookup_name(fname, &fid)); 8923 8924 spa_feature_enable(spa, fid, tx); 8925 spa_history_log_internal(spa, "set", tx, 8926 "%s=enabled", elemname); 8927 break; 8928 } else if (!zfs_prop_user(elemname)) { 8929 ASSERT(zpool_prop_feature(elemname)); 8930 break; 8931 } 8932 zfs_fallthrough; 8933 default: 8934 /* 8935 * Set pool property values in the poolprops mos object. 
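 * The props ZAP is created lazily on first use and linked from the
 * pool directory object under DMU_POOL_PROPS.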
8936 */ 8937 if (spa->spa_pool_props_object == 0) { 8938 spa->spa_pool_props_object = 8939 zap_create_link(mos, DMU_OT_POOL_PROPS, 8940 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 8941 tx); 8942 } 8943 8944 /* normalize the property name */ 8945 propname = zpool_prop_to_name(prop); 8946 proptype = zpool_prop_get_type(prop); 8947 if (prop == ZPOOL_PROP_INVAL && 8948 zfs_prop_user(elemname)) { 8949 propname = elemname; 8950 proptype = PROP_TYPE_STRING; 8951 } 8952 8953 if (nvpair_type(elem) == DATA_TYPE_STRING) { 8954 ASSERT(proptype == PROP_TYPE_STRING); 8955 strval = fnvpair_value_string(elem); 8956 VERIFY0(zap_update(mos, 8957 spa->spa_pool_props_object, propname, 8958 1, strlen(strval) + 1, strval, tx)); 8959 spa_history_log_internal(spa, "set", tx, 8960 "%s=%s", elemname, strval); 8961 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 8962 intval = fnvpair_value_uint64(elem); 8963 8964 if (proptype == PROP_TYPE_INDEX) { 8965 const char *unused; 8966 VERIFY0(zpool_prop_index_to_string( 8967 prop, intval, &unused)); 8968 } 8969 VERIFY0(zap_update(mos, 8970 spa->spa_pool_props_object, propname, 8971 8, 1, &intval, tx)); 8972 spa_history_log_internal(spa, "set", tx, 8973 "%s=%lld", elemname, 8974 (longlong_t)intval); 8975 8976 switch (prop) { 8977 case ZPOOL_PROP_DELEGATION: 8978 spa->spa_delegation = intval; 8979 break; 8980 case ZPOOL_PROP_BOOTFS: 8981 spa->spa_bootfs = intval; 8982 break; 8983 case ZPOOL_PROP_FAILUREMODE: 8984 spa->spa_failmode = intval; 8985 break; 8986 case ZPOOL_PROP_AUTOTRIM: 8987 spa->spa_autotrim = intval; 8988 spa_async_request(spa, 8989 SPA_ASYNC_AUTOTRIM_RESTART); 8990 break; 8991 case ZPOOL_PROP_AUTOEXPAND: 8992 spa->spa_autoexpand = intval; 8993 if (tx->tx_txg != TXG_INITIAL) 8994 spa_async_request(spa, 8995 SPA_ASYNC_AUTOEXPAND); 8996 break; 8997 case ZPOOL_PROP_MULTIHOST: 8998 spa->spa_multihost = intval; 8999 break; 9000 default: 9001 break; 9002 } 9003 } else { 9004 ASSERT(0); /* not allowed */ 9005 } 9006 } 9007 9008 } 9009 9010 mutex_exit(&spa->spa_props_lock); 9011 } 9012 9013 /* 9014 * Perform one-time upgrade on-disk changes. spa_version() does not 9015 * reflect the new version this txg, so there must be no changes this 9016 * txg to anything that the upgrade code depends on after it executes. 9017 * Therefore this must be called after dsl_pool_sync() does the sync 9018 * tasks. 
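 * Each upgrade step compares the last synced version (spa_ubsync)
 * with the version being synced (spa_uberblock) and performs the
 * matching one-time on-disk change.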
9019 */ 9020 static void 9021 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9022 { 9023 if (spa_sync_pass(spa) != 1) 9024 return; 9025 9026 dsl_pool_t *dp = spa->spa_dsl_pool; 9027 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9028 9029 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9030 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9031 dsl_pool_create_origin(dp, tx); 9032 9033 /* Keeping the origin open increases spa_minref */ 9034 spa->spa_minref += 3; 9035 } 9036 9037 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9038 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9039 dsl_pool_upgrade_clones(dp, tx); 9040 } 9041 9042 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9043 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9044 dsl_pool_upgrade_dir_clones(dp, tx); 9045 9046 /* Keeping the freedir open increases spa_minref */ 9047 spa->spa_minref += 3; 9048 } 9049 9050 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9051 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9052 spa_feature_create_zap_objects(spa, tx); 9053 } 9054 9055 /* 9056 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9057 * when possibility to use lz4 compression for metadata was added 9058 * Old pools that have this feature enabled must be upgraded to have 9059 * this feature active 9060 */ 9061 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9062 boolean_t lz4_en = spa_feature_is_enabled(spa, 9063 SPA_FEATURE_LZ4_COMPRESS); 9064 boolean_t lz4_ac = spa_feature_is_active(spa, 9065 SPA_FEATURE_LZ4_COMPRESS); 9066 9067 if (lz4_en && !lz4_ac) 9068 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9069 } 9070 9071 /* 9072 * If we haven't written the salt, do so now. Note that the 9073 * feature may not be activated yet, but that's fine since 9074 * the presence of this ZAP entry is backwards compatible. 9075 */ 9076 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9077 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9078 VERIFY0(zap_add(spa->spa_meta_objset, 9079 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9080 sizeof (spa->spa_cksum_salt.zcs_bytes), 9081 spa->spa_cksum_salt.zcs_bytes, tx)); 9082 } 9083 9084 rrw_exit(&dp->dp_config_rwlock, FTAG); 9085 } 9086 9087 static void 9088 vdev_indirect_state_sync_verify(vdev_t *vd) 9089 { 9090 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9091 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9092 9093 if (vd->vdev_ops == &vdev_indirect_ops) { 9094 ASSERT(vim != NULL); 9095 ASSERT(vib != NULL); 9096 } 9097 9098 uint64_t obsolete_sm_object = 0; 9099 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9100 if (obsolete_sm_object != 0) { 9101 ASSERT(vd->vdev_obsolete_sm != NULL); 9102 ASSERT(vd->vdev_removing || 9103 vd->vdev_ops == &vdev_indirect_ops); 9104 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9105 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9106 ASSERT3U(obsolete_sm_object, ==, 9107 space_map_object(vd->vdev_obsolete_sm)); 9108 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9109 space_map_allocated(vd->vdev_obsolete_sm)); 9110 } 9111 ASSERT(vd->vdev_obsolete_segments != NULL); 9112 9113 /* 9114 * Since frees / remaps to an indirect vdev can only 9115 * happen in syncing context, the obsolete segments 9116 * tree must be empty when we start syncing. 
9117 */ 9118 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9119 } 9120 9121 /* 9122 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9123 * async write queue depth in case it changed. The max queue depth will 9124 * not change in the middle of syncing out this txg. 9125 */ 9126 static void 9127 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9128 { 9129 ASSERT(spa_writeable(spa)); 9130 9131 vdev_t *rvd = spa->spa_root_vdev; 9132 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9133 zfs_vdev_queue_depth_pct / 100; 9134 metaslab_class_t *normal = spa_normal_class(spa); 9135 metaslab_class_t *special = spa_special_class(spa); 9136 metaslab_class_t *dedup = spa_dedup_class(spa); 9137 9138 uint64_t slots_per_allocator = 0; 9139 for (int c = 0; c < rvd->vdev_children; c++) { 9140 vdev_t *tvd = rvd->vdev_child[c]; 9141 9142 metaslab_group_t *mg = tvd->vdev_mg; 9143 if (mg == NULL || !metaslab_group_initialized(mg)) 9144 continue; 9145 9146 metaslab_class_t *mc = mg->mg_class; 9147 if (mc != normal && mc != special && mc != dedup) 9148 continue; 9149 9150 /* 9151 * It is safe to do a lock-free check here because only async 9152 * allocations look at mg_max_alloc_queue_depth, and async 9153 * allocations all happen from spa_sync(). 9154 */ 9155 for (int i = 0; i < mg->mg_allocators; i++) { 9156 ASSERT0(zfs_refcount_count( 9157 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9158 } 9159 mg->mg_max_alloc_queue_depth = max_queue_depth; 9160 9161 for (int i = 0; i < mg->mg_allocators; i++) { 9162 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9163 zfs_vdev_def_queue_depth; 9164 } 9165 slots_per_allocator += zfs_vdev_def_queue_depth; 9166 } 9167 9168 for (int i = 0; i < spa->spa_alloc_count; i++) { 9169 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9170 mca_alloc_slots)); 9171 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9172 mca_alloc_slots)); 9173 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
9174 mca_alloc_slots)); 9175 normal->mc_allocator[i].mca_alloc_max_slots = 9176 slots_per_allocator; 9177 special->mc_allocator[i].mca_alloc_max_slots = 9178 slots_per_allocator; 9179 dedup->mc_allocator[i].mca_alloc_max_slots = 9180 slots_per_allocator; 9181 } 9182 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9183 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9184 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9185 } 9186 9187 static void 9188 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9189 { 9190 ASSERT(spa_writeable(spa)); 9191 9192 vdev_t *rvd = spa->spa_root_vdev; 9193 for (int c = 0; c < rvd->vdev_children; c++) { 9194 vdev_t *vd = rvd->vdev_child[c]; 9195 vdev_indirect_state_sync_verify(vd); 9196 9197 if (vdev_indirect_should_condense(vd)) { 9198 spa_condense_indirect_start_sync(vd, tx); 9199 break; 9200 } 9201 } 9202 } 9203 9204 static void 9205 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9206 { 9207 objset_t *mos = spa->spa_meta_objset; 9208 dsl_pool_t *dp = spa->spa_dsl_pool; 9209 uint64_t txg = tx->tx_txg; 9210 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9211 9212 do { 9213 int pass = ++spa->spa_sync_pass; 9214 9215 spa_sync_config_object(spa, tx); 9216 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9217 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9218 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9219 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9220 spa_errlog_sync(spa, txg); 9221 dsl_pool_sync(dp, txg); 9222 9223 if (pass < zfs_sync_pass_deferred_free || 9224 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9225 /* 9226 * If the log space map feature is active we don't 9227 * care about deferred frees and the deferred bpobj 9228 * as the log space map should effectively have the 9229 * same results (i.e. appending only to one object). 9230 */ 9231 spa_sync_frees(spa, free_bpl, tx); 9232 } else { 9233 /* 9234 * We can not defer frees in pass 1, because 9235 * we sync the deferred frees later in pass 1. 9236 */ 9237 ASSERT3U(pass, >, 1); 9238 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9239 &spa->spa_deferred_bpobj, tx); 9240 } 9241 9242 brt_sync(spa, txg); 9243 ddt_sync(spa, txg); 9244 dsl_scan_sync(dp, tx); 9245 svr_sync(spa, tx); 9246 spa_sync_upgrades(spa, tx); 9247 9248 spa_flush_metaslabs(spa, tx); 9249 9250 vdev_t *vd = NULL; 9251 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9252 != NULL) 9253 vdev_sync(vd, txg); 9254 9255 /* 9256 * Note: We need to check if the MOS is dirty because we could 9257 * have marked the MOS dirty without updating the uberblock 9258 * (e.g. if we have sync tasks but no dirty user data). We need 9259 * to check the uberblock's rootbp because it is updated if we 9260 * have synced out dirty data (though in this case the MOS will 9261 * most likely also be dirty due to second order effects, we 9262 * don't want to rely on that here). 9263 */ 9264 if (pass == 1 && 9265 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9266 !dmu_objset_is_dirty(mos, txg)) { 9267 /* 9268 * Nothing changed on the first pass, therefore this 9269 * TXG is a no-op. Avoid syncing deferred frees, so 9270 * that we can keep this TXG as a no-op. 
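 * The assertions below verify that no dirty datasets, dirty dirs,
 * or sync tasks exist for this txg.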
9271 */ 9272 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9273 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9274 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9275 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9276 break; 9277 } 9278 9279 spa_sync_deferred_frees(spa, tx); 9280 } while (dmu_objset_is_dirty(mos, txg)); 9281 } 9282 9283 /* 9284 * Rewrite the vdev configuration (which includes the uberblock) to 9285 * commit the transaction group. 9286 * 9287 * If there are no dirty vdevs, we sync the uberblock to a few random 9288 * top-level vdevs that are known to be visible in the config cache 9289 * (see spa_vdev_add() for a complete description). If there *are* dirty 9290 * vdevs, sync the uberblock to all vdevs. 9291 */ 9292 static void 9293 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9294 { 9295 vdev_t *rvd = spa->spa_root_vdev; 9296 uint64_t txg = tx->tx_txg; 9297 9298 for (;;) { 9299 int error = 0; 9300 9301 /* 9302 * We hold SCL_STATE to prevent vdev open/close/etc. 9303 * while we're attempting to write the vdev labels. 9304 */ 9305 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9306 9307 if (list_is_empty(&spa->spa_config_dirty_list)) { 9308 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9309 int svdcount = 0; 9310 int children = rvd->vdev_children; 9311 int c0 = random_in_range(children); 9312 9313 for (int c = 0; c < children; c++) { 9314 vdev_t *vd = 9315 rvd->vdev_child[(c0 + c) % children]; 9316 9317 /* Stop when revisiting the first vdev */ 9318 if (c > 0 && svd[0] == vd) 9319 break; 9320 9321 if (vd->vdev_ms_array == 0 || 9322 vd->vdev_islog || 9323 !vdev_is_concrete(vd)) 9324 continue; 9325 9326 svd[svdcount++] = vd; 9327 if (svdcount == SPA_SYNC_MIN_VDEVS) 9328 break; 9329 } 9330 error = vdev_config_sync(svd, svdcount, txg); 9331 } else { 9332 error = vdev_config_sync(rvd->vdev_child, 9333 rvd->vdev_children, txg); 9334 } 9335 9336 if (error == 0) 9337 spa->spa_last_synced_guid = rvd->vdev_guid; 9338 9339 spa_config_exit(spa, SCL_STATE, FTAG); 9340 9341 if (error == 0) 9342 break; 9343 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9344 zio_resume_wait(spa); 9345 } 9346 } 9347 9348 /* 9349 * Sync the specified transaction group. New blocks may be dirtied as 9350 * part of the process, so we iterate until it converges. 9351 */ 9352 void 9353 spa_sync(spa_t *spa, uint64_t txg) 9354 { 9355 vdev_t *vd = NULL; 9356 9357 VERIFY(spa_writeable(spa)); 9358 9359 /* 9360 * Wait for i/os issued in open context that need to complete 9361 * before this txg syncs. 9362 */ 9363 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9364 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9365 ZIO_FLAG_CANFAIL); 9366 9367 /* 9368 * Now that there can be no more cloning in this transaction group, 9369 * but we are still before issuing frees, we can process pending BRT 9370 * updates. 9371 */ 9372 brt_pending_apply(spa, txg); 9373 9374 /* 9375 * Lock out configuration changes. 9376 */ 9377 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9378 9379 spa->spa_syncing_txg = txg; 9380 spa->spa_sync_pass = 0; 9381 9382 for (int i = 0; i < spa->spa_alloc_count; i++) { 9383 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9384 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9385 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9386 } 9387 9388 /* 9389 * If there are any pending vdev state changes, convert them 9390 * into config changes that go out with this transaction group. 
9391 */ 9392 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9393 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9394 /* Avoid holding the write lock unless actually necessary */ 9395 if (vd->vdev_aux == NULL) { 9396 vdev_state_clean(vd); 9397 vdev_config_dirty(vd); 9398 continue; 9399 } 9400 /* 9401 * We need the write lock here because, for aux vdevs, 9402 * calling vdev_config_dirty() modifies sav_config. 9403 * This is ugly and will become unnecessary when we 9404 * eliminate the aux vdev wart by integrating all vdevs 9405 * into the root vdev tree. 9406 */ 9407 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9408 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9409 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9410 vdev_state_clean(vd); 9411 vdev_config_dirty(vd); 9412 } 9413 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9414 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9415 } 9416 spa_config_exit(spa, SCL_STATE, FTAG); 9417 9418 dsl_pool_t *dp = spa->spa_dsl_pool; 9419 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9420 9421 spa->spa_sync_starttime = gethrtime(); 9422 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9423 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9424 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9425 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9426 9427 /* 9428 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9429 * set spa_deflate if we have no raid-z vdevs. 9430 */ 9431 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9432 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9433 vdev_t *rvd = spa->spa_root_vdev; 9434 9435 int i; 9436 for (i = 0; i < rvd->vdev_children; i++) { 9437 vd = rvd->vdev_child[i]; 9438 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9439 break; 9440 } 9441 if (i == rvd->vdev_children) { 9442 spa->spa_deflate = TRUE; 9443 VERIFY0(zap_add(spa->spa_meta_objset, 9444 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 9445 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 9446 } 9447 } 9448 9449 spa_sync_adjust_vdev_max_queue_depth(spa); 9450 9451 spa_sync_condense_indirect(spa, tx); 9452 9453 spa_sync_iterate_to_convergence(spa, tx); 9454 9455 #ifdef ZFS_DEBUG 9456 if (!list_is_empty(&spa->spa_config_dirty_list)) { 9457 /* 9458 * Make sure that the number of ZAPs for all the vdevs matches 9459 * the number of ZAPs in the per-vdev ZAP list. This only gets 9460 * called if the config is dirty; otherwise there may be 9461 * outstanding AVZ operations that weren't completed in 9462 * spa_sync_config_object. 9463 */ 9464 uint64_t all_vdev_zap_entry_count; 9465 ASSERT0(zap_count(spa->spa_meta_objset, 9466 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 9467 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 9468 all_vdev_zap_entry_count); 9469 } 9470 #endif 9471 9472 if (spa->spa_vdev_removal != NULL) { 9473 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 9474 } 9475 9476 spa_sync_rewrite_vdev_config(spa, tx); 9477 dmu_tx_commit(tx); 9478 9479 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9480 spa->spa_deadman_tqid = 0; 9481 9482 /* 9483 * Clear the dirty config list. 9484 */ 9485 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 9486 vdev_config_clean(vd); 9487 9488 /* 9489 * Now that the new config has synced transactionally, 9490 * let it become visible to the config cache. 
9491 */ 9492 if (spa->spa_config_syncing != NULL) { 9493 spa_config_set(spa, spa->spa_config_syncing); 9494 spa->spa_config_txg = txg; 9495 spa->spa_config_syncing = NULL; 9496 } 9497 9498 dsl_pool_sync_done(dp, txg); 9499 9500 for (int i = 0; i < spa->spa_alloc_count; i++) { 9501 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9502 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9503 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9504 } 9505 9506 /* 9507 * Update usable space statistics. 9508 */ 9509 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 9510 != NULL) 9511 vdev_sync_done(vd, txg); 9512 9513 metaslab_class_evict_old(spa->spa_normal_class, txg); 9514 metaslab_class_evict_old(spa->spa_log_class, txg); 9515 9516 spa_sync_close_syncing_log_sm(spa); 9517 9518 spa_update_dspace(spa); 9519 9520 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 9521 vdev_autotrim_kick(spa); 9522 9523 /* 9524 * It had better be the case that we didn't dirty anything 9525 * since vdev_config_sync(). 9526 */ 9527 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9528 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9529 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 9530 9531 while (zfs_pause_spa_sync) 9532 delay(1); 9533 9534 spa->spa_sync_pass = 0; 9535 9536 /* 9537 * Update the last synced uberblock here. We want to do this at 9538 * the end of spa_sync() so that consumers of spa_last_synced_txg() 9539 * will be guaranteed that all the processing associated with 9540 * that txg has been completed. 9541 */ 9542 spa->spa_ubsync = spa->spa_uberblock; 9543 spa_config_exit(spa, SCL_CONFIG, FTAG); 9544 9545 spa_handle_ignored_writes(spa); 9546 9547 /* 9548 * If any async tasks have been requested, kick them off. 9549 */ 9550 spa_async_dispatch(spa); 9551 } 9552 9553 /* 9554 * Sync all pools. We don't want to hold the namespace lock across these 9555 * operations, so we take a reference on the spa_t and drop the lock during the 9556 * sync. 9557 */ 9558 void 9559 spa_sync_allpools(void) 9560 { 9561 spa_t *spa = NULL; 9562 mutex_enter(&spa_namespace_lock); 9563 while ((spa = spa_next(spa)) != NULL) { 9564 if (spa_state(spa) != POOL_STATE_ACTIVE || 9565 !spa_writeable(spa) || spa_suspended(spa)) 9566 continue; 9567 spa_open_ref(spa, FTAG); 9568 mutex_exit(&spa_namespace_lock); 9569 txg_wait_synced(spa_get_dsl(spa), 0); 9570 mutex_enter(&spa_namespace_lock); 9571 spa_close(spa, FTAG); 9572 } 9573 mutex_exit(&spa_namespace_lock); 9574 } 9575 9576 /* 9577 * ========================================================================== 9578 * Miscellaneous routines 9579 * ========================================================================== 9580 */ 9581 9582 /* 9583 * Remove all pools in the system. 9584 */ 9585 void 9586 spa_evict_all(void) 9587 { 9588 spa_t *spa; 9589 9590 /* 9591 * Remove all cached state. All pools should be closed now, 9592 * so every spa in the AVL tree should be unreferenced. 9593 */ 9594 mutex_enter(&spa_namespace_lock); 9595 while ((spa = spa_next(NULL)) != NULL) { 9596 /* 9597 * Stop async tasks. The async thread may need to detach 9598 * a device that's been replaced, which requires grabbing 9599 * spa_namespace_lock, so we must drop it here. 
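 *
 * (The spa_open_ref()/spa_close() pair below keeps a reference on the
 * spa_t so that it cannot go away while spa_namespace_lock is dropped.)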
9600 */ 9601 spa_open_ref(spa, FTAG); 9602 mutex_exit(&spa_namespace_lock); 9603 spa_async_suspend(spa); 9604 mutex_enter(&spa_namespace_lock); 9605 spa_close(spa, FTAG); 9606 9607 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 9608 spa_unload(spa); 9609 spa_deactivate(spa); 9610 } 9611 spa_remove(spa); 9612 } 9613 mutex_exit(&spa_namespace_lock); 9614 } 9615 9616 vdev_t * 9617 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 9618 { 9619 vdev_t *vd; 9620 int i; 9621 9622 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 9623 return (vd); 9624 9625 if (aux) { 9626 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 9627 vd = spa->spa_l2cache.sav_vdevs[i]; 9628 if (vd->vdev_guid == guid) 9629 return (vd); 9630 } 9631 9632 for (i = 0; i < spa->spa_spares.sav_count; i++) { 9633 vd = spa->spa_spares.sav_vdevs[i]; 9634 if (vd->vdev_guid == guid) 9635 return (vd); 9636 } 9637 } 9638 9639 return (NULL); 9640 } 9641 9642 void 9643 spa_upgrade(spa_t *spa, uint64_t version) 9644 { 9645 ASSERT(spa_writeable(spa)); 9646 9647 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9648 9649 /* 9650 * This should only be called for a non-faulted pool, and since a 9651 * future version would result in an unopenable pool, this shouldn't be 9652 * possible. 9653 */ 9654 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 9655 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 9656 9657 spa->spa_uberblock.ub_version = version; 9658 vdev_config_dirty(spa->spa_root_vdev); 9659 9660 spa_config_exit(spa, SCL_ALL, FTAG); 9661 9662 txg_wait_synced(spa_get_dsl(spa), 0); 9663 } 9664 9665 static boolean_t 9666 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 9667 { 9668 (void) spa; 9669 int i; 9670 uint64_t vdev_guid; 9671 9672 for (i = 0; i < sav->sav_count; i++) 9673 if (sav->sav_vdevs[i]->vdev_guid == guid) 9674 return (B_TRUE); 9675 9676 for (i = 0; i < sav->sav_npending; i++) { 9677 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 9678 &vdev_guid) == 0 && vdev_guid == guid) 9679 return (B_TRUE); 9680 } 9681 9682 return (B_FALSE); 9683 } 9684 9685 boolean_t 9686 spa_has_l2cache(spa_t *spa, uint64_t guid) 9687 { 9688 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 9689 } 9690 9691 boolean_t 9692 spa_has_spare(spa_t *spa, uint64_t guid) 9693 { 9694 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 9695 } 9696 9697 /* 9698 * Check if a pool has an active shared spare device. 9699 * Note: reference count of an active spare is 2, as a spare and as a replace 9700 */ 9701 static boolean_t 9702 spa_has_active_shared_spare(spa_t *spa) 9703 { 9704 int i, refcnt; 9705 uint64_t pool; 9706 spa_aux_vdev_t *sav = &spa->spa_spares; 9707 9708 for (i = 0; i < sav->sav_count; i++) { 9709 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 9710 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 9711 refcnt > 2) 9712 return (B_TRUE); 9713 } 9714 9715 return (B_FALSE); 9716 } 9717 9718 uint64_t 9719 spa_total_metaslabs(spa_t *spa) 9720 { 9721 vdev_t *rvd = spa->spa_root_vdev; 9722 9723 uint64_t m = 0; 9724 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 9725 vdev_t *vd = rvd->vdev_child[c]; 9726 if (!vdev_is_concrete(vd)) 9727 continue; 9728 m += vd->vdev_ms_count; 9729 } 9730 return (m); 9731 } 9732 9733 /* 9734 * Notify any waiting threads that some activity has switched from being in- 9735 * progress to not-in-progress so that the thread can wake up and determine 9736 * whether it is finished waiting. 
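 *
 * For illustration only (a hypothetical sketch, not part of the original
 * comment), a completing thread typically does something like:
 *
 *	mutex_enter(&activity_lock);
 *	... mark the activity as no longer in progress ...
 *	mutex_exit(&activity_lock);
 *	spa_notify_waiters(spa);
 *
 * where "activity_lock" stands in for whatever activity-specific lock
 * protects that state (see the "Locking for waiting threads" comment below).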
9737 */ 9738 void 9739 spa_notify_waiters(spa_t *spa) 9740 { 9741 /* 9742 * Acquiring spa_activities_lock here prevents the cv_broadcast from 9743 * happening between the waiting thread's check and cv_wait. 9744 */ 9745 mutex_enter(&spa->spa_activities_lock); 9746 cv_broadcast(&spa->spa_activities_cv); 9747 mutex_exit(&spa->spa_activities_lock); 9748 } 9749 9750 /* 9751 * Notify any waiting threads that the pool is exporting, and then block until 9752 * they are finished using the spa_t. 9753 */ 9754 void 9755 spa_wake_waiters(spa_t *spa) 9756 { 9757 mutex_enter(&spa->spa_activities_lock); 9758 spa->spa_waiters_cancel = B_TRUE; 9759 cv_broadcast(&spa->spa_activities_cv); 9760 while (spa->spa_waiters != 0) 9761 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 9762 spa->spa_waiters_cancel = B_FALSE; 9763 mutex_exit(&spa->spa_activities_lock); 9764 } 9765 9766 /* Whether the vdev or any of its descendants are being initialized/trimmed. */ 9767 static boolean_t 9768 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 9769 { 9770 spa_t *spa = vd->vdev_spa; 9771 9772 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 9773 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9774 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 9775 activity == ZPOOL_WAIT_TRIM); 9776 9777 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 9778 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 9779 9780 mutex_exit(&spa->spa_activities_lock); 9781 mutex_enter(lock); 9782 mutex_enter(&spa->spa_activities_lock); 9783 9784 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 9785 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 9786 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 9787 mutex_exit(lock); 9788 9789 if (in_progress) 9790 return (B_TRUE); 9791 9792 for (int i = 0; i < vd->vdev_children; i++) { 9793 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 9794 activity)) 9795 return (B_TRUE); 9796 } 9797 9798 return (B_FALSE); 9799 } 9800 9801 /* 9802 * If use_guid is true, this checks whether the vdev specified by guid is 9803 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 9804 * is being initialized/trimmed. The caller must hold the config lock and 9805 * spa_activities_lock. 9806 */ 9807 static int 9808 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 9809 zpool_wait_activity_t activity, boolean_t *in_progress) 9810 { 9811 mutex_exit(&spa->spa_activities_lock); 9812 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9813 mutex_enter(&spa->spa_activities_lock); 9814 9815 vdev_t *vd; 9816 if (use_guid) { 9817 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 9818 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 9819 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9820 return (EINVAL); 9821 } 9822 } else { 9823 vd = spa->spa_root_vdev; 9824 } 9825 9826 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 9827 9828 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9829 return (0); 9830 } 9831 9832 /* 9833 * Locking for waiting threads 9834 * --------------------------- 9835 * 9836 * Waiting threads need a way to check whether a given activity is in progress, 9837 * and then, if it is, wait for it to complete. Each activity will have some 9838 * in-memory representation of the relevant on-disk state which can be used to 9839 * determine whether or not the activity is in progress. 
The in-memory state and 9840 * the locking used to protect it will be different for each activity, and may 9841 * not be suitable for use with a cvar (e.g., some state is protected by the 9842 * config lock). To allow waiting threads to wait without any races, another 9843 * lock, spa_activities_lock, is used. 9844 * 9845 * When the state is checked, both the activity-specific lock (if there is one) 9846 * and spa_activities_lock are held. In some cases, the activity-specific lock 9847 * is acquired explicitly (e.g. the config lock). In others, the locking is 9848 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 9849 * thread releases the activity-specific lock and, if the activity is in 9850 * progress, then cv_waits using spa_activities_lock. 9851 * 9852 * The waiting thread is woken when another thread, one completing some 9853 * activity, updates the state of the activity and then calls 9854 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 9855 * needs to hold its activity-specific lock when updating the state, and this 9856 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 9857 * 9858 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 9859 * and because it is held when the waiting thread checks the state of the 9860 * activity, it can never be the case that the completing thread both updates 9861 * the activity state and cv_broadcasts in between the waiting thread's check 9862 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 9863 * 9864 * In order to prevent deadlock, when the waiting thread does its check, in some 9865 * cases it will temporarily drop spa_activities_lock in order to acquire the 9866 * activity-specific lock. The order in which spa_activities_lock and the 9867 * activity specific lock are acquired in the waiting thread is determined by 9868 * the order in which they are acquired in the completing thread; if the 9869 * completing thread calls spa_notify_waiters with the activity-specific lock 9870 * held, then the waiting thread must also acquire the activity-specific lock 9871 * first. 
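 *
 * For illustration only (a simplified sketch, not part of the original
 * comment; the per-activity check and error handling are elided), a waiting
 * thread following this protocol looks roughly like spa_wait_common() below:
 *
 *	mutex_enter(&spa->spa_activities_lock);
 *	while (activity is still in progress)
 *		cv_wait_sig(&spa->spa_activities_cv,
 *		    &spa->spa_activities_lock);
 *	mutex_exit(&spa->spa_activities_lock);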
9872 */ 9873 9874 static int 9875 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 9876 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 9877 { 9878 int error = 0; 9879 9880 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9881 9882 switch (activity) { 9883 case ZPOOL_WAIT_CKPT_DISCARD: 9884 *in_progress = 9885 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 9886 zap_contains(spa_meta_objset(spa), 9887 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 9888 ENOENT); 9889 break; 9890 case ZPOOL_WAIT_FREE: 9891 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 9892 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 9893 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 9894 spa_livelist_delete_check(spa)); 9895 break; 9896 case ZPOOL_WAIT_INITIALIZE: 9897 case ZPOOL_WAIT_TRIM: 9898 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 9899 activity, in_progress); 9900 break; 9901 case ZPOOL_WAIT_REPLACE: 9902 mutex_exit(&spa->spa_activities_lock); 9903 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9904 mutex_enter(&spa->spa_activities_lock); 9905 9906 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 9907 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9908 break; 9909 case ZPOOL_WAIT_REMOVE: 9910 *in_progress = (spa->spa_removing_phys.sr_state == 9911 DSS_SCANNING); 9912 break; 9913 case ZPOOL_WAIT_RESILVER: 9914 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) 9915 break; 9916 zfs_fallthrough; 9917 case ZPOOL_WAIT_SCRUB: 9918 { 9919 boolean_t scanning, paused, is_scrub; 9920 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 9921 9922 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 9923 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 9924 paused = dsl_scan_is_paused_scrub(scn); 9925 *in_progress = (scanning && !paused && 9926 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 9927 break; 9928 } 9929 default: 9930 panic("unrecognized value for activity %d", activity); 9931 } 9932 9933 return (error); 9934 } 9935 9936 static int 9937 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 9938 boolean_t use_tag, uint64_t tag, boolean_t *waited) 9939 { 9940 /* 9941 * The tag is used to distinguish between instances of an activity. 9942 * 'initialize' and 'trim' are the only activities that we use this for. 9943 * The other activities can only have a single instance in progress in a 9944 * pool at one time, making the tag unnecessary. 9945 * 9946 * There can be multiple devices being replaced at once, but since they 9947 * all finish once resilvering finishes, we don't bother keeping track 9948 * of them individually, we just wait for them all to finish. 9949 */ 9950 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 9951 activity != ZPOOL_WAIT_TRIM) 9952 return (EINVAL); 9953 9954 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 9955 return (EINVAL); 9956 9957 spa_t *spa; 9958 int error = spa_open(pool, &spa, FTAG); 9959 if (error != 0) 9960 return (error); 9961 9962 /* 9963 * Increment the spa's waiter count so that we can call spa_close and 9964 * still ensure that the spa_t doesn't get freed before this thread is 9965 * finished with it when the pool is exported. We want to call spa_close 9966 * before we start waiting because otherwise the additional ref would 9967 * prevent the pool from being exported or destroyed throughout the 9968 * potentially long wait. 
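 *
 * (spa_wake_waiters() above is the export-side half of this handshake: it
 * sets spa_waiters_cancel, broadcasts spa_activities_cv, and then blocks on
 * spa_waiters_cv until spa_waiters drops back to zero.)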
9969 */ 9970 mutex_enter(&spa->spa_activities_lock); 9971 spa->spa_waiters++; 9972 spa_close(spa, FTAG); 9973 9974 *waited = B_FALSE; 9975 for (;;) { 9976 boolean_t in_progress; 9977 error = spa_activity_in_progress(spa, activity, use_tag, tag, 9978 &in_progress); 9979 9980 if (error || !in_progress || spa->spa_waiters_cancel) 9981 break; 9982 9983 *waited = B_TRUE; 9984 9985 if (cv_wait_sig(&spa->spa_activities_cv, 9986 &spa->spa_activities_lock) == 0) { 9987 error = EINTR; 9988 break; 9989 } 9990 } 9991 9992 spa->spa_waiters--; 9993 cv_signal(&spa->spa_waiters_cv); 9994 mutex_exit(&spa->spa_activities_lock); 9995 9996 return (error); 9997 } 9998 9999 /* 10000 * Wait for a particular instance of the specified activity to complete, where 10001 * the instance is identified by 'tag'. 10002 */ 10003 int 10004 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 10005 boolean_t *waited) 10006 { 10007 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 10008 } 10009 10010 /* 10011 * Wait for all instances of the specified activity to complete 10012 */ 10013 int 10014 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 10015 { 10016 10017 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 10018 } 10019 10020 sysevent_t * 10021 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10022 { 10023 sysevent_t *ev = NULL; 10024 #ifdef _KERNEL 10025 nvlist_t *resource; 10026 10027 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 10028 if (resource) { 10029 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 10030 ev->resource = resource; 10031 } 10032 #else 10033 (void) spa, (void) vd, (void) hist_nvl, (void) name; 10034 #endif 10035 return (ev); 10036 } 10037 10038 void 10039 spa_event_post(sysevent_t *ev) 10040 { 10041 #ifdef _KERNEL 10042 if (ev) { 10043 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 10044 kmem_free(ev, sizeof (*ev)); 10045 } 10046 #else 10047 (void) ev; 10048 #endif 10049 } 10050 10051 /* 10052 * Post a zevent corresponding to the given sysevent. The 'name' must be one 10053 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 10054 * filled in from the spa and (optionally) the vdev. This doesn't do anything 10055 * in the userland libzpool, as we don't want consumers to misinterpret ztest 10056 * or zdb as real changes.
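 *
 * For illustration only (hypothetical call sites, not part of the original
 * comment), callers post events such as:
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 *	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ATTACH);
 *
 * using the ESC_ZFS_* class names from sys/sysevent/eventdefs.h.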
10057 */ 10058 void 10059 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10060 { 10061 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 10062 } 10063 10064 /* state manipulation functions */ 10065 EXPORT_SYMBOL(spa_open); 10066 EXPORT_SYMBOL(spa_open_rewind); 10067 EXPORT_SYMBOL(spa_get_stats); 10068 EXPORT_SYMBOL(spa_create); 10069 EXPORT_SYMBOL(spa_import); 10070 EXPORT_SYMBOL(spa_tryimport); 10071 EXPORT_SYMBOL(spa_destroy); 10072 EXPORT_SYMBOL(spa_export); 10073 EXPORT_SYMBOL(spa_reset); 10074 EXPORT_SYMBOL(spa_async_request); 10075 EXPORT_SYMBOL(spa_async_suspend); 10076 EXPORT_SYMBOL(spa_async_resume); 10077 EXPORT_SYMBOL(spa_inject_addref); 10078 EXPORT_SYMBOL(spa_inject_delref); 10079 EXPORT_SYMBOL(spa_scan_stat_init); 10080 EXPORT_SYMBOL(spa_scan_get_stats); 10081 10082 /* device manipulation */ 10083 EXPORT_SYMBOL(spa_vdev_add); 10084 EXPORT_SYMBOL(spa_vdev_attach); 10085 EXPORT_SYMBOL(spa_vdev_detach); 10086 EXPORT_SYMBOL(spa_vdev_setpath); 10087 EXPORT_SYMBOL(spa_vdev_setfru); 10088 EXPORT_SYMBOL(spa_vdev_split_mirror); 10089 10090 /* spare state (which is global across all pools) */ 10091 EXPORT_SYMBOL(spa_spare_add); 10092 EXPORT_SYMBOL(spa_spare_remove); 10093 EXPORT_SYMBOL(spa_spare_exists); 10094 EXPORT_SYMBOL(spa_spare_activate); 10095 10096 /* L2ARC state (which is global across all pools) */ 10097 EXPORT_SYMBOL(spa_l2cache_add); 10098 EXPORT_SYMBOL(spa_l2cache_remove); 10099 EXPORT_SYMBOL(spa_l2cache_exists); 10100 EXPORT_SYMBOL(spa_l2cache_activate); 10101 EXPORT_SYMBOL(spa_l2cache_drop); 10102 10103 /* scanning */ 10104 EXPORT_SYMBOL(spa_scan); 10105 EXPORT_SYMBOL(spa_scan_stop); 10106 10107 /* spa syncing */ 10108 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 10109 EXPORT_SYMBOL(spa_sync_allpools); 10110 10111 /* properties */ 10112 EXPORT_SYMBOL(spa_prop_set); 10113 EXPORT_SYMBOL(spa_prop_get); 10114 EXPORT_SYMBOL(spa_prop_clear_bootfs); 10115 10116 /* asynchronous event notification */ 10117 EXPORT_SYMBOL(spa_event_notify); 10118 10119 /* BEGIN CSTYLED */ 10120 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, 10121 "log2 fraction of arc that can be used by inflight I/Os when " 10122 "verifying pool during import"); 10123 /* END CSTYLED */ 10124 10125 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 10126 "Set to traverse metadata on pool import"); 10127 10128 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 10129 "Set to traverse data on pool import"); 10130 10131 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 10132 "Print vdev tree to zfs_dbgmsg during pool import"); 10133 10134 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, 10135 "Percentage of CPUs to run an IO worker thread"); 10136 10137 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, 10138 "Number of threads per IO worker taskqueue"); 10139 10140 /* BEGIN CSTYLED */ 10141 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, 10142 "Allow importing pool with up to this number of missing top-level " 10143 "vdevs (in read-only mode)"); 10144 /* END CSTYLED */ 10145 10146 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 10147 ZMOD_RW, "Set the livelist condense zthr to pause"); 10148 10149 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 10150 ZMOD_RW, "Set the livelist condense synctask to pause"); 10151 10152 /* BEGIN CSTYLED */ 10153 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_,
sync_cancel, 10154 INT, ZMOD_RW, 10155 "Whether livelist condensing was canceled in the synctask"); 10156 10157 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 10158 INT, ZMOD_RW, 10159 "Whether livelist condensing was canceled in the zthr function"); 10160 10161 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 10162 ZMOD_RW, 10163 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 10164 "was being condensed"); 10165 /* END CSTYLED */ 10166