1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 */ 37 38 /* 39 * SPA: Storage Pool Allocator 40 * 41 * This file contains all the routines used when modifying on-disk SPA state. 42 * This includes opening, importing, destroying, exporting a pool, and syncing a 43 * pool. 44 */ 45 46 #include <sys/zfs_context.h> 47 #include <sys/fm/fs/zfs.h> 48 #include <sys/spa_impl.h> 49 #include <sys/zio.h> 50 #include <sys/zio_checksum.h> 51 #include <sys/dmu.h> 52 #include <sys/dmu_tx.h> 53 #include <sys/zap.h> 54 #include <sys/zil.h> 55 #include <sys/brt.h> 56 #include <sys/ddt.h> 57 #include <sys/vdev_impl.h> 58 #include <sys/vdev_removal.h> 59 #include <sys/vdev_indirect_mapping.h> 60 #include <sys/vdev_indirect_births.h> 61 #include <sys/vdev_initialize.h> 62 #include <sys/vdev_rebuild.h> 63 #include <sys/vdev_trim.h> 64 #include <sys/vdev_disk.h> 65 #include <sys/vdev_draid.h> 66 #include <sys/metaslab.h> 67 #include <sys/metaslab_impl.h> 68 #include <sys/mmp.h> 69 #include <sys/uberblock_impl.h> 70 #include <sys/txg.h> 71 #include <sys/avl.h> 72 #include <sys/bpobj.h> 73 #include <sys/dmu_traverse.h> 74 #include <sys/dmu_objset.h> 75 #include <sys/unique.h> 76 #include <sys/dsl_pool.h> 77 #include <sys/dsl_dataset.h> 78 #include <sys/dsl_dir.h> 79 #include <sys/dsl_prop.h> 80 #include <sys/dsl_synctask.h> 81 #include <sys/fs/zfs.h> 82 #include <sys/arc.h> 83 #include <sys/callb.h> 84 #include <sys/systeminfo.h> 85 #include <sys/zfs_ioctl.h> 86 #include <sys/dsl_scan.h> 87 #include <sys/zfeature.h> 88 #include <sys/dsl_destroy.h> 89 #include <sys/zvol.h> 90 91 #ifdef _KERNEL 92 #include <sys/fm/protocol.h> 93 #include <sys/fm/util.h> 94 #include <sys/callb.h> 95 #include <sys/zone.h> 96 #include <sys/vmsystm.h> 97 #endif /* _KERNEL */ 98 99 #include "zfs_prop.h" 100 #include "zfs_comutil.h" 101 102 /* 103 * The interval, in seconds, at which failed configuration cache file writes 104 * should be retried. 
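 *
 * This is normally surfaced as a runtime-tunable module parameter (on
 * Linux, /sys/module/zfs/parameters/zfs_ccw_retry_interval), assuming the
 * usual ZFS_MODULE_PARAM registration elsewhere in this file; with the
 * default of 300, a failed cachefile write is retried every five minutes.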
105 */ 106 int zfs_ccw_retry_interval = 300; 107 108 typedef enum zti_modes { 109 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 110 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 111 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ 112 ZTI_MODE_NULL, /* don't create a taskq */ 113 ZTI_NMODES 114 } zti_modes_t; 115 116 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 117 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 118 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 119 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 120 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 121 122 #define ZTI_N(n) ZTI_P(n, 1) 123 #define ZTI_ONE ZTI_N(1) 124 125 typedef struct zio_taskq_info { 126 zti_modes_t zti_mode; 127 uint_t zti_value; 128 uint_t zti_count; 129 } zio_taskq_info_t; 130 131 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 132 "iss", "iss_h", "int", "int_h" 133 }; 134 135 /* 136 * This table defines the taskq settings for each ZFS I/O type. When 137 * initializing a pool, we use this table to create an appropriately sized 138 * taskq. Some operations are low volume and therefore have a small, static 139 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 140 * macros. Other operations process a large amount of data; the ZTI_BATCH 141 * macro causes us to create a taskq oriented for throughput. Some operations 142 * are so high frequency and short-lived that the taskq itself can become a 143 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 144 * additional degree of parallelism specified by the number of threads per- 145 * taskq and the number of taskqs; when dispatching an event in this case, the 146 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, 147 * but with number of taskqs also scaling with number of CPUs. 148 * 149 * The different taskq priorities are to handle the different contexts (issue 150 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 151 * need to be handled with minimum delay. 152 */ 153 static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 154 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 155 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 156 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 157 { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 158 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 159 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 160 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 161 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 162 }; 163 164 static void spa_sync_version(void *arg, dmu_tx_t *tx); 165 static void spa_sync_props(void *arg, dmu_tx_t *tx); 166 static boolean_t spa_has_active_shared_spare(spa_t *spa); 167 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 168 const char **ereport); 169 static void spa_vdev_resilver_done(spa_t *spa); 170 171 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 172 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 173 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 174 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 175 176 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 177 178 /* 179 * Report any spa_load_verify errors found, but do not fail spa_load. 180 * This is used by zdb to analyze non-idle pools. 
181 */ 182 boolean_t spa_load_verify_dryrun = B_FALSE; 183 184 /* 185 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 186 * This is used by zdb for spacemaps verification. 187 */ 188 boolean_t spa_mode_readable_spacemaps = B_FALSE; 189 190 /* 191 * This (illegal) pool name is used when temporarily importing a spa_t in order 192 * to get the vdev stats associated with the imported devices. 193 */ 194 #define TRYIMPORT_NAME "$import" 195 196 /* 197 * For debugging purposes: print out vdev tree during pool import. 198 */ 199 static int spa_load_print_vdev_tree = B_FALSE; 200 201 /* 202 * A non-zero value for zfs_max_missing_tvds means that we allow importing 203 * pools with missing top-level vdevs. This is strictly intended for advanced 204 * pool recovery cases since missing data is almost inevitable. Pools with 205 * missing devices can only be imported read-only for safety reasons, and their 206 * fail-mode will be automatically set to "continue". 207 * 208 * With 1 missing vdev we should be able to import the pool and mount all 209 * datasets. User data that was not modified after the missing device has been 210 * added should be recoverable. This means that snapshots created prior to the 211 * addition of that device should be completely intact. 212 * 213 * With 2 missing vdevs, some datasets may fail to mount since there are 214 * dataset statistics that are stored as regular metadata. Some data might be 215 * recoverable if those vdevs were added recently. 216 * 217 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 218 * may be missing entirely. Chances of data recovery are very low. Note that 219 * there are also risks of performing an inadvertent rewind as we might be 220 * missing all the vdevs with the latest uberblocks. 221 */ 222 uint64_t zfs_max_missing_tvds = 0; 223 224 /* 225 * The parameters below are similar to zfs_max_missing_tvds but are only 226 * intended for a preliminary open of the pool with an untrusted config which 227 * might be incomplete or out-dated. 228 * 229 * We are more tolerant for pools opened from a cachefile since we could have 230 * an out-dated cachefile where a device removal was not registered. 231 * We could have set the limit arbitrarily high but in the case where devices 232 * are really missing we would want to return the proper error codes; we chose 233 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 234 * and we get a chance to retrieve the trusted config. 235 */ 236 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 237 238 /* 239 * In the case where config was assembled by scanning device paths (/dev/dsks 240 * by default) we are less tolerant since all the existing devices should have 241 * been detected and we want spa_load to return the right error codes. 242 */ 243 uint64_t zfs_max_missing_tvds_scan = 0; 244 245 /* 246 * Debugging aid that pauses spa_sync() towards the end. 247 */ 248 static const boolean_t zfs_pause_spa_sync = B_FALSE; 249 250 /* 251 * Variables to indicate the livelist condense zthr func should wait at certain 252 * points for the livelist to be removed - used to test condense/destroy races 253 */ 254 static int zfs_livelist_condense_zthr_pause = 0; 255 static int zfs_livelist_condense_sync_pause = 0; 256 257 /* 258 * Variables to track whether or not condense cancellation has been 259 * triggered in testing. 
260 */ 261 static int zfs_livelist_condense_sync_cancel = 0; 262 static int zfs_livelist_condense_zthr_cancel = 0; 263 264 /* 265 * Variable to track whether or not extra ALLOC blkptrs were added to a 266 * livelist entry while it was being condensed (caused by the way we track 267 * remapped blkptrs in dbuf_remap_impl) 268 */ 269 static int zfs_livelist_condense_new_alloc = 0; 270 271 /* 272 * ========================================================================== 273 * SPA properties routines 274 * ========================================================================== 275 */ 276 277 /* 278 * Add a (source=src, propname=propval) list to an nvlist. 279 */ 280 static void 281 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 282 uint64_t intval, zprop_source_t src) 283 { 284 const char *propname = zpool_prop_to_name(prop); 285 nvlist_t *propval; 286 287 propval = fnvlist_alloc(); 288 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 289 290 if (strval != NULL) 291 fnvlist_add_string(propval, ZPROP_VALUE, strval); 292 else 293 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 294 295 fnvlist_add_nvlist(nvl, propname, propval); 296 nvlist_free(propval); 297 } 298 299 /* 300 * Add a user property (source=src, propname=propval) to an nvlist. 301 */ 302 static void 303 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 304 zprop_source_t src) 305 { 306 nvlist_t *propval; 307 308 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 309 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 310 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 311 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 312 nvlist_free(propval); 313 } 314 315 /* 316 * Get property values from the spa configuration. 317 */ 318 static void 319 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 320 { 321 vdev_t *rvd = spa->spa_root_vdev; 322 dsl_pool_t *pool = spa->spa_dsl_pool; 323 uint64_t size, alloc, cap, version; 324 const zprop_source_t src = ZPROP_SRC_NONE; 325 spa_config_dirent_t *dp; 326 metaslab_class_t *mc = spa_normal_class(spa); 327 328 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 329 330 if (rvd != NULL) { 331 alloc = metaslab_class_get_alloc(mc); 332 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 333 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 334 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 335 336 size = metaslab_class_get_space(mc); 337 size += metaslab_class_get_space(spa_special_class(spa)); 338 size += metaslab_class_get_space(spa_dedup_class(spa)); 339 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 340 341 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 342 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 343 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 344 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 345 size - alloc, src); 346 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 347 spa->spa_checkpoint_info.sci_dspace, src); 348 349 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 350 metaslab_class_fragmentation(mc), src); 351 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 352 metaslab_class_expandable_space(mc), src); 353 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 354 (spa_mode(spa) == SPA_MODE_READ), src); 355 356 cap = (size == 0) ? 
0 : (alloc * 100 / size); 357 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 358 359 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 360 ddt_get_pool_dedup_ratio(spa), src); 361 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, 362 brt_get_used(spa), src); 363 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, 364 brt_get_saved(spa), src); 365 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, 366 brt_get_ratio(spa), src); 367 368 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 369 rvd->vdev_state, src); 370 371 version = spa_version(spa); 372 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 373 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 374 version, ZPROP_SRC_DEFAULT); 375 } else { 376 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 377 version, ZPROP_SRC_LOCAL); 378 } 379 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 380 NULL, spa_load_guid(spa), src); 381 } 382 383 if (pool != NULL) { 384 /* 385 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 386 * when opening pools before this version freedir will be NULL. 387 */ 388 if (pool->dp_free_dir != NULL) { 389 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 390 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 391 src); 392 } else { 393 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 394 NULL, 0, src); 395 } 396 397 if (pool->dp_leak_dir != NULL) { 398 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 399 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 400 src); 401 } else { 402 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 403 NULL, 0, src); 404 } 405 } 406 407 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 408 409 if (spa->spa_comment != NULL) { 410 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 411 0, ZPROP_SRC_LOCAL); 412 } 413 414 if (spa->spa_compatibility != NULL) { 415 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 416 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 417 } 418 419 if (spa->spa_root != NULL) 420 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 421 0, ZPROP_SRC_LOCAL); 422 423 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 424 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 425 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 426 } else { 427 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 428 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 429 } 430 431 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 432 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 433 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 434 } else { 435 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 436 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 437 } 438 439 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 440 if (dp->scd_path == NULL) { 441 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 442 "none", 0, ZPROP_SRC_LOCAL); 443 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 444 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 445 dp->scd_path, 0, ZPROP_SRC_LOCAL); 446 } 447 } 448 } 449 450 /* 451 * Get zpool property values. 452 */ 453 int 454 spa_prop_get(spa_t *spa, nvlist_t **nvp) 455 { 456 objset_t *mos = spa->spa_meta_objset; 457 zap_cursor_t zc; 458 zap_attribute_t za; 459 dsl_pool_t *dp; 460 int err; 461 462 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 463 if (err) 464 return (err); 465 466 dp = spa_get_dsl(spa); 467 dsl_pool_config_enter(dp, FTAG); 468 mutex_enter(&spa->spa_props_lock); 469 470 /* 471 * Get properties from the spa config. 
472 */ 473 spa_prop_get_config(spa, nvp); 474 475 /* If no pool property object, no more prop to get. */ 476 if (mos == NULL || spa->spa_pool_props_object == 0) 477 goto out; 478 479 /* 480 * Get properties from the MOS pool property object. 481 */ 482 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 483 (err = zap_cursor_retrieve(&zc, &za)) == 0; 484 zap_cursor_advance(&zc)) { 485 uint64_t intval = 0; 486 char *strval = NULL; 487 zprop_source_t src = ZPROP_SRC_DEFAULT; 488 zpool_prop_t prop; 489 490 if ((prop = zpool_name_to_prop(za.za_name)) == 491 ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) 492 continue; 493 494 switch (za.za_integer_length) { 495 case 8: 496 /* integer property */ 497 if (za.za_first_integer != 498 zpool_prop_default_numeric(prop)) 499 src = ZPROP_SRC_LOCAL; 500 501 if (prop == ZPOOL_PROP_BOOTFS) { 502 dsl_dataset_t *ds = NULL; 503 504 err = dsl_dataset_hold_obj(dp, 505 za.za_first_integer, FTAG, &ds); 506 if (err != 0) 507 break; 508 509 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 510 KM_SLEEP); 511 dsl_dataset_name(ds, strval); 512 dsl_dataset_rele(ds, FTAG); 513 } else { 514 strval = NULL; 515 intval = za.za_first_integer; 516 } 517 518 spa_prop_add_list(*nvp, prop, strval, intval, src); 519 520 if (strval != NULL) 521 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 522 523 break; 524 525 case 1: 526 /* string property */ 527 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 528 err = zap_lookup(mos, spa->spa_pool_props_object, 529 za.za_name, 1, za.za_num_integers, strval); 530 if (err) { 531 kmem_free(strval, za.za_num_integers); 532 break; 533 } 534 if (prop != ZPOOL_PROP_INVAL) { 535 spa_prop_add_list(*nvp, prop, strval, 0, src); 536 } else { 537 src = ZPROP_SRC_LOCAL; 538 spa_prop_add_user(*nvp, za.za_name, strval, 539 src); 540 } 541 kmem_free(strval, za.za_num_integers); 542 break; 543 544 default: 545 break; 546 } 547 } 548 zap_cursor_fini(&zc); 549 out: 550 mutex_exit(&spa->spa_props_lock); 551 dsl_pool_config_exit(dp, FTAG); 552 if (err && err != ENOENT) { 553 nvlist_free(*nvp); 554 *nvp = NULL; 555 return (err); 556 } 557 558 return (0); 559 } 560 561 /* 562 * Validate the given pool properties nvlist and modify the list 563 * for the property values to be set. 564 */ 565 static int 566 spa_prop_validate(spa_t *spa, nvlist_t *props) 567 { 568 nvpair_t *elem; 569 int error = 0, reset_bootfs = 0; 570 uint64_t objnum = 0; 571 boolean_t has_feature = B_FALSE; 572 573 elem = NULL; 574 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 575 uint64_t intval; 576 const char *strval, *slash, *check, *fname; 577 const char *propname = nvpair_name(elem); 578 zpool_prop_t prop = zpool_name_to_prop(propname); 579 580 switch (prop) { 581 case ZPOOL_PROP_INVAL: 582 /* 583 * Sanitize the input. 
584 */ 585 if (zfs_prop_user(propname)) { 586 if (strlen(propname) >= ZAP_MAXNAMELEN) { 587 error = SET_ERROR(ENAMETOOLONG); 588 break; 589 } 590 591 if (strlen(fnvpair_value_string(elem)) >= 592 ZAP_MAXVALUELEN) { 593 error = SET_ERROR(E2BIG); 594 break; 595 } 596 } else if (zpool_prop_feature(propname)) { 597 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 598 error = SET_ERROR(EINVAL); 599 break; 600 } 601 602 if (nvpair_value_uint64(elem, &intval) != 0) { 603 error = SET_ERROR(EINVAL); 604 break; 605 } 606 607 if (intval != 0) { 608 error = SET_ERROR(EINVAL); 609 break; 610 } 611 612 fname = strchr(propname, '@') + 1; 613 if (zfeature_lookup_name(fname, NULL) != 0) { 614 error = SET_ERROR(EINVAL); 615 break; 616 } 617 618 has_feature = B_TRUE; 619 } else { 620 error = SET_ERROR(EINVAL); 621 break; 622 } 623 break; 624 625 case ZPOOL_PROP_VERSION: 626 error = nvpair_value_uint64(elem, &intval); 627 if (!error && 628 (intval < spa_version(spa) || 629 intval > SPA_VERSION_BEFORE_FEATURES || 630 has_feature)) 631 error = SET_ERROR(EINVAL); 632 break; 633 634 case ZPOOL_PROP_DELEGATION: 635 case ZPOOL_PROP_AUTOREPLACE: 636 case ZPOOL_PROP_LISTSNAPS: 637 case ZPOOL_PROP_AUTOEXPAND: 638 case ZPOOL_PROP_AUTOTRIM: 639 error = nvpair_value_uint64(elem, &intval); 640 if (!error && intval > 1) 641 error = SET_ERROR(EINVAL); 642 break; 643 644 case ZPOOL_PROP_MULTIHOST: 645 error = nvpair_value_uint64(elem, &intval); 646 if (!error && intval > 1) 647 error = SET_ERROR(EINVAL); 648 649 if (!error) { 650 uint32_t hostid = zone_get_hostid(NULL); 651 if (hostid) 652 spa->spa_hostid = hostid; 653 else 654 error = SET_ERROR(ENOTSUP); 655 } 656 657 break; 658 659 case ZPOOL_PROP_BOOTFS: 660 /* 661 * If the pool version is less than SPA_VERSION_BOOTFS, 662 * or the pool is still being created (version == 0), 663 * the bootfs property cannot be set. 664 */ 665 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 666 error = SET_ERROR(ENOTSUP); 667 break; 668 } 669 670 /* 671 * Make sure the vdev config is bootable 672 */ 673 if (!vdev_is_bootable(spa->spa_root_vdev)) { 674 error = SET_ERROR(ENOTSUP); 675 break; 676 } 677 678 reset_bootfs = 1; 679 680 error = nvpair_value_string(elem, &strval); 681 682 if (!error) { 683 objset_t *os; 684 685 if (strval == NULL || strval[0] == '\0') { 686 objnum = zpool_prop_default_numeric( 687 ZPOOL_PROP_BOOTFS); 688 break; 689 } 690 691 error = dmu_objset_hold(strval, FTAG, &os); 692 if (error != 0) 693 break; 694 695 /* Must be ZPL. */ 696 if (dmu_objset_type(os) != DMU_OST_ZFS) { 697 error = SET_ERROR(ENOTSUP); 698 } else { 699 objnum = dmu_objset_id(os); 700 } 701 dmu_objset_rele(os, FTAG); 702 } 703 break; 704 705 case ZPOOL_PROP_FAILUREMODE: 706 error = nvpair_value_uint64(elem, &intval); 707 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 708 error = SET_ERROR(EINVAL); 709 710 /* 711 * This is a special case which only occurs when 712 * the pool has completely failed. This allows 713 * the user to change the in-core failmode property 714 * without syncing it out to disk (I/Os might 715 * currently be blocked). We do this by returning 716 * EIO to the caller (spa_prop_set) to trick it 717 * into thinking we encountered a property validation 718 * error. 
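 * The net effect is that spa_prop_set() returns the EIO from
 * spa_prop_validate() to its caller and never reaches the
 * dsl_sync_task() dispatch, while the new failmode value has
 * already taken effect in-core for the suspended pool.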
719 */ 720 if (!error && spa_suspended(spa)) { 721 spa->spa_failmode = intval; 722 error = SET_ERROR(EIO); 723 } 724 break; 725 726 case ZPOOL_PROP_CACHEFILE: 727 if ((error = nvpair_value_string(elem, &strval)) != 0) 728 break; 729 730 if (strval[0] == '\0') 731 break; 732 733 if (strcmp(strval, "none") == 0) 734 break; 735 736 if (strval[0] != '/') { 737 error = SET_ERROR(EINVAL); 738 break; 739 } 740 741 slash = strrchr(strval, '/'); 742 ASSERT(slash != NULL); 743 744 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 745 strcmp(slash, "/..") == 0) 746 error = SET_ERROR(EINVAL); 747 break; 748 749 case ZPOOL_PROP_COMMENT: 750 if ((error = nvpair_value_string(elem, &strval)) != 0) 751 break; 752 for (check = strval; *check != '\0'; check++) { 753 if (!isprint(*check)) { 754 error = SET_ERROR(EINVAL); 755 break; 756 } 757 } 758 if (strlen(strval) > ZPROP_MAX_COMMENT) 759 error = SET_ERROR(E2BIG); 760 break; 761 762 default: 763 break; 764 } 765 766 if (error) 767 break; 768 } 769 770 (void) nvlist_remove_all(props, 771 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 772 773 if (!error && reset_bootfs) { 774 error = nvlist_remove(props, 775 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 776 777 if (!error) { 778 error = nvlist_add_uint64(props, 779 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 780 } 781 } 782 783 return (error); 784 } 785 786 void 787 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 788 { 789 const char *cachefile; 790 spa_config_dirent_t *dp; 791 792 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 793 &cachefile) != 0) 794 return; 795 796 dp = kmem_alloc(sizeof (spa_config_dirent_t), 797 KM_SLEEP); 798 799 if (cachefile[0] == '\0') 800 dp->scd_path = spa_strdup(spa_config_path); 801 else if (strcmp(cachefile, "none") == 0) 802 dp->scd_path = NULL; 803 else 804 dp->scd_path = spa_strdup(cachefile); 805 806 list_insert_head(&spa->spa_config_list, dp); 807 if (need_sync) 808 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 809 } 810 811 int 812 spa_prop_set(spa_t *spa, nvlist_t *nvp) 813 { 814 int error; 815 nvpair_t *elem = NULL; 816 boolean_t need_sync = B_FALSE; 817 818 if ((error = spa_prop_validate(spa, nvp)) != 0) 819 return (error); 820 821 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 822 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 823 824 if (prop == ZPOOL_PROP_CACHEFILE || 825 prop == ZPOOL_PROP_ALTROOT || 826 prop == ZPOOL_PROP_READONLY) 827 continue; 828 829 if (prop == ZPOOL_PROP_INVAL && 830 zfs_prop_user(nvpair_name(elem))) { 831 need_sync = B_TRUE; 832 break; 833 } 834 835 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 836 uint64_t ver = 0; 837 838 if (prop == ZPOOL_PROP_VERSION) { 839 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 840 } else { 841 ASSERT(zpool_prop_feature(nvpair_name(elem))); 842 ver = SPA_VERSION_FEATURES; 843 need_sync = B_TRUE; 844 } 845 846 /* Save time if the version is already set. */ 847 if (ver == spa_version(spa)) 848 continue; 849 850 /* 851 * In addition to the pool directory object, we might 852 * create the pool properties object, the features for 853 * read object, the features for write object, or the 854 * feature descriptions object. 
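 * (This is also why the dsl_sync_task() call below passes 6 as
 * its blocks-modified estimate: a conservative allowance for the
 * objects spa_sync_version() might create or dirty.)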
855 */ 856 error = dsl_sync_task(spa->spa_name, NULL, 857 spa_sync_version, &ver, 858 6, ZFS_SPACE_CHECK_RESERVED); 859 if (error) 860 return (error); 861 continue; 862 } 863 864 need_sync = B_TRUE; 865 break; 866 } 867 868 if (need_sync) { 869 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 870 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 871 } 872 873 return (0); 874 } 875 876 /* 877 * If the bootfs property value is dsobj, clear it. 878 */ 879 void 880 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 881 { 882 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 883 VERIFY(zap_remove(spa->spa_meta_objset, 884 spa->spa_pool_props_object, 885 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 886 spa->spa_bootfs = 0; 887 } 888 } 889 890 static int 891 spa_change_guid_check(void *arg, dmu_tx_t *tx) 892 { 893 uint64_t *newguid __maybe_unused = arg; 894 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 895 vdev_t *rvd = spa->spa_root_vdev; 896 uint64_t vdev_state; 897 898 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 899 int error = (spa_has_checkpoint(spa)) ? 900 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 901 return (SET_ERROR(error)); 902 } 903 904 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 905 vdev_state = rvd->vdev_state; 906 spa_config_exit(spa, SCL_STATE, FTAG); 907 908 if (vdev_state != VDEV_STATE_HEALTHY) 909 return (SET_ERROR(ENXIO)); 910 911 ASSERT3U(spa_guid(spa), !=, *newguid); 912 913 return (0); 914 } 915 916 static void 917 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 918 { 919 uint64_t *newguid = arg; 920 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 921 uint64_t oldguid; 922 vdev_t *rvd = spa->spa_root_vdev; 923 924 oldguid = spa_guid(spa); 925 926 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 927 rvd->vdev_guid = *newguid; 928 rvd->vdev_guid_sum += (*newguid - oldguid); 929 vdev_config_dirty(rvd); 930 spa_config_exit(spa, SCL_STATE, FTAG); 931 932 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 933 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 934 } 935 936 /* 937 * Change the GUID for the pool. This is done so that we can later 938 * re-import a pool built from a clone of our own vdevs. We will modify 939 * the root vdev's guid, our own pool guid, and then mark all of our 940 * vdevs dirty. Note that we must make sure that all our vdevs are 941 * online when we do this, or else any vdevs that weren't present 942 * would be orphaned from our pool. We are also going to issue a 943 * sysevent to update any watchers. 944 */ 945 int 946 spa_change_guid(spa_t *spa) 947 { 948 int error; 949 uint64_t guid; 950 951 mutex_enter(&spa->spa_vdev_top_lock); 952 mutex_enter(&spa_namespace_lock); 953 guid = spa_generate_guid(NULL); 954 955 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 956 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 957 958 if (error == 0) { 959 /* 960 * Clear the kobj flag from all the vdevs to allow 961 * vdev_cache_process_kobj_evt() to post events to all the 962 * vdevs since GUID is updated. 
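 * Once the sync task succeeds, the new GUID is persisted by
 * rewriting the cachefile and announced to userspace via the
 * ESC_ZFS_POOL_REGUID sysevent (see the calls below).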
963 */ 964 vdev_clear_kobj_evt(spa->spa_root_vdev); 965 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 966 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 967 968 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 969 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 970 } 971 972 mutex_exit(&spa_namespace_lock); 973 mutex_exit(&spa->spa_vdev_top_lock); 974 975 return (error); 976 } 977 978 /* 979 * ========================================================================== 980 * SPA state manipulation (open/create/destroy/import/export) 981 * ========================================================================== 982 */ 983 984 static int 985 spa_error_entry_compare(const void *a, const void *b) 986 { 987 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 988 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 989 int ret; 990 991 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 992 sizeof (zbookmark_phys_t)); 993 994 return (TREE_ISIGN(ret)); 995 } 996 997 /* 998 * Utility function which retrieves copies of the current logs and 999 * re-initializes them in the process. 1000 */ 1001 void 1002 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1003 { 1004 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1005 1006 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1007 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1008 1009 avl_create(&spa->spa_errlist_scrub, 1010 spa_error_entry_compare, sizeof (spa_error_entry_t), 1011 offsetof(spa_error_entry_t, se_avl)); 1012 avl_create(&spa->spa_errlist_last, 1013 spa_error_entry_compare, sizeof (spa_error_entry_t), 1014 offsetof(spa_error_entry_t, se_avl)); 1015 } 1016 1017 static void 1018 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1019 { 1020 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1021 enum zti_modes mode = ztip->zti_mode; 1022 uint_t value = ztip->zti_value; 1023 uint_t count = ztip->zti_count; 1024 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1025 uint_t cpus, flags = TASKQ_DYNAMIC; 1026 boolean_t batch = B_FALSE; 1027 1028 switch (mode) { 1029 case ZTI_MODE_FIXED: 1030 ASSERT3U(value, >, 0); 1031 break; 1032 1033 case ZTI_MODE_BATCH: 1034 batch = B_TRUE; 1035 flags |= TASKQ_THREADS_CPU_PCT; 1036 value = MIN(zio_taskq_batch_pct, 100); 1037 break; 1038 1039 case ZTI_MODE_SCALE: 1040 flags |= TASKQ_THREADS_CPU_PCT; 1041 /* 1042 * We want more taskqs to reduce lock contention, but we want 1043 * less for better request ordering and CPU utilization. 1044 */ 1045 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1046 if (zio_taskq_batch_tpq > 0) { 1047 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 1048 zio_taskq_batch_tpq); 1049 } else { 1050 /* 1051 * Prefer 6 threads per taskq, but no more taskqs 1052 * than threads in them on large systems. For 80%: 1053 * 1054 * taskq taskq total 1055 * cpus taskqs percent threads threads 1056 * ------- ------- ------- ------- ------- 1057 * 1 1 80% 1 1 1058 * 2 1 80% 1 1 1059 * 4 1 80% 3 3 1060 * 8 2 40% 3 6 1061 * 16 3 27% 4 12 1062 * 32 5 16% 5 25 1063 * 64 7 11% 7 49 1064 * 128 10 8% 10 100 1065 * 256 14 6% 15 210 1066 */ 1067 count = 1 + cpus / 6; 1068 while (count * count > cpus) 1069 count--; 1070 } 1071 /* Limit each taskq within 100% to not trigger assertion. 
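 * (TASKQ_THREADS_CPU_PCT values are percentages and may not exceed
 * 100, so keep count >= ceil(zio_taskq_batch_pct / 100) before
 * dividing the total percentage across the taskqs.)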
*/ 1072 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1073 value = (zio_taskq_batch_pct + count / 2) / count; 1074 break; 1075 1076 case ZTI_MODE_NULL: 1077 tqs->stqs_count = 0; 1078 tqs->stqs_taskq = NULL; 1079 return; 1080 1081 default: 1082 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1083 "spa_activate()", 1084 zio_type_name[t], zio_taskq_types[q], mode, value); 1085 break; 1086 } 1087 1088 ASSERT3U(count, >, 0); 1089 tqs->stqs_count = count; 1090 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1091 1092 for (uint_t i = 0; i < count; i++) { 1093 taskq_t *tq; 1094 char name[32]; 1095 1096 if (count > 1) 1097 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1098 zio_type_name[t], zio_taskq_types[q], i); 1099 else 1100 (void) snprintf(name, sizeof (name), "%s_%s", 1101 zio_type_name[t], zio_taskq_types[q]); 1102 1103 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1104 if (batch) 1105 flags |= TASKQ_DC_BATCH; 1106 1107 (void) zio_taskq_basedc; 1108 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1109 spa->spa_proc, zio_taskq_basedc, flags); 1110 } else { 1111 pri_t pri = maxclsyspri; 1112 /* 1113 * The write issue taskq can be extremely CPU 1114 * intensive. Run it at slightly less important 1115 * priority than the other taskqs. 1116 * 1117 * Under Linux and FreeBSD this means incrementing 1118 * the priority value as opposed to platforms like 1119 * illumos where it should be decremented. 1120 * 1121 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1122 * are equal then a difference between them is 1123 * insignificant. 1124 */ 1125 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1126 #if defined(__linux__) 1127 pri++; 1128 #elif defined(__FreeBSD__) 1129 pri += 4; 1130 #else 1131 #error "unknown OS" 1132 #endif 1133 } 1134 tq = taskq_create_proc(name, value, pri, 50, 1135 INT_MAX, spa->spa_proc, flags); 1136 } 1137 1138 tqs->stqs_taskq[i] = tq; 1139 } 1140 } 1141 1142 static void 1143 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1144 { 1145 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1146 1147 if (tqs->stqs_taskq == NULL) { 1148 ASSERT3U(tqs->stqs_count, ==, 0); 1149 return; 1150 } 1151 1152 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1153 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1154 taskq_destroy(tqs->stqs_taskq[i]); 1155 } 1156 1157 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1158 tqs->stqs_taskq = NULL; 1159 } 1160 1161 /* 1162 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1163 * Note that a type may have multiple discrete taskqs to avoid lock contention 1164 * on the taskq itself. In that case we choose which taskq at random by using 1165 * the low bits of gethrtime(). 1166 */ 1167 void 1168 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1169 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 1170 { 1171 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1172 taskq_t *tq; 1173 1174 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1175 ASSERT3U(tqs->stqs_count, !=, 0); 1176 1177 if (tqs->stqs_count == 1) { 1178 tq = tqs->stqs_taskq[0]; 1179 } else { 1180 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1181 } 1182 1183 taskq_dispatch_ent(tq, func, arg, flags, ent); 1184 } 1185 1186 /* 1187 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 
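 *
 * Illustrative usage (my_func and my_arg are hypothetical and not
 * defined in this file):
 *
 *	spa_taskq_dispatch_sync(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
 *	    my_func, my_arg, TQ_SLEEP);
 *
 * does not return until my_func(my_arg) has finished on one of the
 * FREE/ISSUE taskqs (chosen at random when there is more than one).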
1188 */ 1189 void 1190 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1191 task_func_t *func, void *arg, uint_t flags) 1192 { 1193 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1194 taskq_t *tq; 1195 taskqid_t id; 1196 1197 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1198 ASSERT3U(tqs->stqs_count, !=, 0); 1199 1200 if (tqs->stqs_count == 1) { 1201 tq = tqs->stqs_taskq[0]; 1202 } else { 1203 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1204 } 1205 1206 id = taskq_dispatch(tq, func, arg, flags); 1207 if (id) 1208 taskq_wait_id(tq, id); 1209 } 1210 1211 static void 1212 spa_create_zio_taskqs(spa_t *spa) 1213 { 1214 for (int t = 0; t < ZIO_TYPES; t++) { 1215 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1216 spa_taskqs_init(spa, t, q); 1217 } 1218 } 1219 } 1220 1221 /* 1222 * Disabled until spa_thread() can be adapted for Linux. 1223 */ 1224 #undef HAVE_SPA_THREAD 1225 1226 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1227 static void 1228 spa_thread(void *arg) 1229 { 1230 psetid_t zio_taskq_psrset_bind = PS_NONE; 1231 callb_cpr_t cprinfo; 1232 1233 spa_t *spa = arg; 1234 user_t *pu = PTOU(curproc); 1235 1236 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1237 spa->spa_name); 1238 1239 ASSERT(curproc != &p0); 1240 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1241 "zpool-%s", spa->spa_name); 1242 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1243 1244 /* bind this thread to the requested psrset */ 1245 if (zio_taskq_psrset_bind != PS_NONE) { 1246 pool_lock(); 1247 mutex_enter(&cpu_lock); 1248 mutex_enter(&pidlock); 1249 mutex_enter(&curproc->p_lock); 1250 1251 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1252 0, NULL, NULL) == 0) { 1253 curthread->t_bind_pset = zio_taskq_psrset_bind; 1254 } else { 1255 cmn_err(CE_WARN, 1256 "Couldn't bind process for zfs pool \"%s\" to " 1257 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1258 } 1259 1260 mutex_exit(&curproc->p_lock); 1261 mutex_exit(&pidlock); 1262 mutex_exit(&cpu_lock); 1263 pool_unlock(); 1264 } 1265 1266 if (zio_taskq_sysdc) { 1267 sysdc_thread_enter(curthread, 100, 0); 1268 } 1269 1270 spa->spa_proc = curproc; 1271 spa->spa_did = curthread->t_did; 1272 1273 spa_create_zio_taskqs(spa); 1274 1275 mutex_enter(&spa->spa_proc_lock); 1276 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1277 1278 spa->spa_proc_state = SPA_PROC_ACTIVE; 1279 cv_broadcast(&spa->spa_proc_cv); 1280 1281 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1282 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1283 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1284 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1285 1286 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1287 spa->spa_proc_state = SPA_PROC_GONE; 1288 spa->spa_proc = &p0; 1289 cv_broadcast(&spa->spa_proc_cv); 1290 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1291 1292 mutex_enter(&curproc->p_lock); 1293 lwp_exit(); 1294 } 1295 #endif 1296 1297 /* 1298 * Activate an uninitialized pool. 
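 * This sets up the in-core machinery the pool needs before it can be
 * used: the metaslab classes, the (optional) covering process, the
 * per-I/O-type zio taskqs, the dirty lists and error trees, and the
 * auxiliary zvol/prefetch/upgrade taskqs created below. It is undone
 * by spa_deactivate().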
1299 */ 1300 static void 1301 spa_activate(spa_t *spa, spa_mode_t mode) 1302 { 1303 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1304 1305 spa->spa_state = POOL_STATE_ACTIVE; 1306 spa->spa_mode = mode; 1307 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1308 1309 spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1310 spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1311 spa->spa_embedded_log_class = 1312 metaslab_class_create(spa, &zfs_metaslab_ops); 1313 spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1314 spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1315 1316 /* Try to create a covering process */ 1317 mutex_enter(&spa->spa_proc_lock); 1318 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1319 ASSERT(spa->spa_proc == &p0); 1320 spa->spa_did = 0; 1321 1322 (void) spa_create_process; 1323 #ifdef HAVE_SPA_THREAD 1324 /* Only create a process if we're going to be around a while. */ 1325 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1326 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1327 NULL, 0) == 0) { 1328 spa->spa_proc_state = SPA_PROC_CREATED; 1329 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1330 cv_wait(&spa->spa_proc_cv, 1331 &spa->spa_proc_lock); 1332 } 1333 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1334 ASSERT(spa->spa_proc != &p0); 1335 ASSERT(spa->spa_did != 0); 1336 } else { 1337 #ifdef _KERNEL 1338 cmn_err(CE_WARN, 1339 "Couldn't create process for zfs pool \"%s\"\n", 1340 spa->spa_name); 1341 #endif 1342 } 1343 } 1344 #endif /* HAVE_SPA_THREAD */ 1345 mutex_exit(&spa->spa_proc_lock); 1346 1347 /* If we didn't create a process, we need to create our taskqs. */ 1348 if (spa->spa_proc == &p0) { 1349 spa_create_zio_taskqs(spa); 1350 } 1351 1352 for (size_t i = 0; i < TXG_SIZE; i++) { 1353 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1354 ZIO_FLAG_CANFAIL); 1355 } 1356 1357 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1358 offsetof(vdev_t, vdev_config_dirty_node)); 1359 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1360 offsetof(objset_t, os_evicting_node)); 1361 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1362 offsetof(vdev_t, vdev_state_dirty_node)); 1363 1364 txg_list_create(&spa->spa_vdev_txg_list, spa, 1365 offsetof(struct vdev, vdev_txg_node)); 1366 1367 avl_create(&spa->spa_errlist_scrub, 1368 spa_error_entry_compare, sizeof (spa_error_entry_t), 1369 offsetof(spa_error_entry_t, se_avl)); 1370 avl_create(&spa->spa_errlist_last, 1371 spa_error_entry_compare, sizeof (spa_error_entry_t), 1372 offsetof(spa_error_entry_t, se_avl)); 1373 avl_create(&spa->spa_errlist_healed, 1374 spa_error_entry_compare, sizeof (spa_error_entry_t), 1375 offsetof(spa_error_entry_t, se_avl)); 1376 1377 spa_activate_os(spa); 1378 1379 spa_keystore_init(&spa->spa_keystore); 1380 1381 /* 1382 * This taskq is used to perform zvol-minor-related tasks 1383 * asynchronously. This has several advantages, including easy 1384 * resolution of various deadlocks. 1385 * 1386 * The taskq must be single threaded to ensure tasks are always 1387 * processed in the order in which they were dispatched. 1388 * 1389 * A taskq per pool allows one to keep the pools independent. 1390 * This way if one pool is suspended, it will not impact another. 1391 * 1392 * The preferred location to dispatch a zvol minor task is a sync 1393 * task. 
In this context, there is easy access to the spa_t and minimal 1394 * error handling is required because the sync task must succeed. 1395 */ 1396 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1397 1, INT_MAX, 0); 1398 1399 /* 1400 * Taskq dedicated to prefetcher threads: this is used to prevent the 1401 * pool traverse code from monopolizing the global (and limited) 1402 * system_taskq by inappropriately scheduling long running tasks on it. 1403 */ 1404 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1405 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1406 1407 /* 1408 * The taskq to upgrade datasets in this pool. Currently used by 1409 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1410 */ 1411 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1412 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1413 } 1414 1415 /* 1416 * Opposite of spa_activate(). 1417 */ 1418 static void 1419 spa_deactivate(spa_t *spa) 1420 { 1421 ASSERT(spa->spa_sync_on == B_FALSE); 1422 ASSERT(spa->spa_dsl_pool == NULL); 1423 ASSERT(spa->spa_root_vdev == NULL); 1424 ASSERT(spa->spa_async_zio_root == NULL); 1425 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1426 1427 spa_evicting_os_wait(spa); 1428 1429 if (spa->spa_zvol_taskq) { 1430 taskq_destroy(spa->spa_zvol_taskq); 1431 spa->spa_zvol_taskq = NULL; 1432 } 1433 1434 if (spa->spa_prefetch_taskq) { 1435 taskq_destroy(spa->spa_prefetch_taskq); 1436 spa->spa_prefetch_taskq = NULL; 1437 } 1438 1439 if (spa->spa_upgrade_taskq) { 1440 taskq_destroy(spa->spa_upgrade_taskq); 1441 spa->spa_upgrade_taskq = NULL; 1442 } 1443 1444 txg_list_destroy(&spa->spa_vdev_txg_list); 1445 1446 list_destroy(&spa->spa_config_dirty_list); 1447 list_destroy(&spa->spa_evicting_os_list); 1448 list_destroy(&spa->spa_state_dirty_list); 1449 1450 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1451 1452 for (int t = 0; t < ZIO_TYPES; t++) { 1453 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1454 spa_taskqs_fini(spa, t, q); 1455 } 1456 } 1457 1458 for (size_t i = 0; i < TXG_SIZE; i++) { 1459 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1460 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1461 spa->spa_txg_zio[i] = NULL; 1462 } 1463 1464 metaslab_class_destroy(spa->spa_normal_class); 1465 spa->spa_normal_class = NULL; 1466 1467 metaslab_class_destroy(spa->spa_log_class); 1468 spa->spa_log_class = NULL; 1469 1470 metaslab_class_destroy(spa->spa_embedded_log_class); 1471 spa->spa_embedded_log_class = NULL; 1472 1473 metaslab_class_destroy(spa->spa_special_class); 1474 spa->spa_special_class = NULL; 1475 1476 metaslab_class_destroy(spa->spa_dedup_class); 1477 spa->spa_dedup_class = NULL; 1478 1479 /* 1480 * If this was part of an import or the open otherwise failed, we may 1481 * still have errors left in the queues. Empty them just in case. 
1482 */ 1483 spa_errlog_drain(spa); 1484 avl_destroy(&spa->spa_errlist_scrub); 1485 avl_destroy(&spa->spa_errlist_last); 1486 avl_destroy(&spa->spa_errlist_healed); 1487 1488 spa_keystore_fini(&spa->spa_keystore); 1489 1490 spa->spa_state = POOL_STATE_UNINITIALIZED; 1491 1492 mutex_enter(&spa->spa_proc_lock); 1493 if (spa->spa_proc_state != SPA_PROC_NONE) { 1494 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1495 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1496 cv_broadcast(&spa->spa_proc_cv); 1497 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1498 ASSERT(spa->spa_proc != &p0); 1499 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1500 } 1501 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1502 spa->spa_proc_state = SPA_PROC_NONE; 1503 } 1504 ASSERT(spa->spa_proc == &p0); 1505 mutex_exit(&spa->spa_proc_lock); 1506 1507 /* 1508 * We want to make sure spa_thread() has actually exited the ZFS 1509 * module, so that the module can't be unloaded out from underneath 1510 * it. 1511 */ 1512 if (spa->spa_did != 0) { 1513 thread_join(spa->spa_did); 1514 spa->spa_did = 0; 1515 } 1516 1517 spa_deactivate_os(spa); 1518 1519 } 1520 1521 /* 1522 * Verify a pool configuration, and construct the vdev tree appropriately. This 1523 * will create all the necessary vdevs in the appropriate layout, with each vdev 1524 * in the CLOSED state. This will prep the pool before open/creation/import. 1525 * All vdev validation is done by the vdev_alloc() routine. 1526 */ 1527 int 1528 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1529 uint_t id, int atype) 1530 { 1531 nvlist_t **child; 1532 uint_t children; 1533 int error; 1534 1535 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1536 return (error); 1537 1538 if ((*vdp)->vdev_ops->vdev_op_leaf) 1539 return (0); 1540 1541 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1542 &child, &children); 1543 1544 if (error == ENOENT) 1545 return (0); 1546 1547 if (error) { 1548 vdev_free(*vdp); 1549 *vdp = NULL; 1550 return (SET_ERROR(EINVAL)); 1551 } 1552 1553 for (int c = 0; c < children; c++) { 1554 vdev_t *vd; 1555 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1556 atype)) != 0) { 1557 vdev_free(*vdp); 1558 *vdp = NULL; 1559 return (error); 1560 } 1561 } 1562 1563 ASSERT(*vdp != NULL); 1564 1565 return (0); 1566 } 1567 1568 static boolean_t 1569 spa_should_flush_logs_on_unload(spa_t *spa) 1570 { 1571 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1572 return (B_FALSE); 1573 1574 if (!spa_writeable(spa)) 1575 return (B_FALSE); 1576 1577 if (!spa->spa_sync_on) 1578 return (B_FALSE); 1579 1580 if (spa_state(spa) != POOL_STATE_EXPORTED) 1581 return (B_FALSE); 1582 1583 if (zfs_keep_log_spacemaps_at_export) 1584 return (B_FALSE); 1585 1586 return (B_TRUE); 1587 } 1588 1589 /* 1590 * Opens a transaction that will set the flag that will instruct 1591 * spa_sync to attempt to flush all the metaslabs for that txg. 
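 * The function then blocks in txg_wait_synced() until that txg has
 * synced, so it is only called for a writeable pool whose sync thread
 * is still running (see spa_should_flush_logs_on_unload()).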
1592 */ 1593 static void 1594 spa_unload_log_sm_flush_all(spa_t *spa) 1595 { 1596 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1597 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1598 1599 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1600 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1601 1602 dmu_tx_commit(tx); 1603 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1604 } 1605 1606 static void 1607 spa_unload_log_sm_metadata(spa_t *spa) 1608 { 1609 void *cookie = NULL; 1610 spa_log_sm_t *sls; 1611 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1612 &cookie)) != NULL) { 1613 VERIFY0(sls->sls_mscount); 1614 kmem_free(sls, sizeof (spa_log_sm_t)); 1615 } 1616 1617 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); 1618 e != NULL; e = list_head(&spa->spa_log_summary)) { 1619 VERIFY0(e->lse_mscount); 1620 list_remove(&spa->spa_log_summary, e); 1621 kmem_free(e, sizeof (log_summary_entry_t)); 1622 } 1623 1624 spa->spa_unflushed_stats.sus_nblocks = 0; 1625 spa->spa_unflushed_stats.sus_memused = 0; 1626 spa->spa_unflushed_stats.sus_blocklimit = 0; 1627 } 1628 1629 static void 1630 spa_destroy_aux_threads(spa_t *spa) 1631 { 1632 if (spa->spa_condense_zthr != NULL) { 1633 zthr_destroy(spa->spa_condense_zthr); 1634 spa->spa_condense_zthr = NULL; 1635 } 1636 if (spa->spa_checkpoint_discard_zthr != NULL) { 1637 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1638 spa->spa_checkpoint_discard_zthr = NULL; 1639 } 1640 if (spa->spa_livelist_delete_zthr != NULL) { 1641 zthr_destroy(spa->spa_livelist_delete_zthr); 1642 spa->spa_livelist_delete_zthr = NULL; 1643 } 1644 if (spa->spa_livelist_condense_zthr != NULL) { 1645 zthr_destroy(spa->spa_livelist_condense_zthr); 1646 spa->spa_livelist_condense_zthr = NULL; 1647 } 1648 } 1649 1650 /* 1651 * Opposite of spa_load(). 1652 */ 1653 static void 1654 spa_unload(spa_t *spa) 1655 { 1656 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1657 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 1658 1659 spa_import_progress_remove(spa_guid(spa)); 1660 spa_load_note(spa, "UNLOADING"); 1661 1662 spa_wake_waiters(spa); 1663 1664 /* 1665 * If we have set the spa_final_txg, we have already performed the 1666 * tasks below in spa_export_common(). We should not redo it here since 1667 * we delay the final TXGs beyond what spa_final_txg is set at. 1668 */ 1669 if (spa->spa_final_txg == UINT64_MAX) { 1670 /* 1671 * If the log space map feature is enabled and the pool is 1672 * getting exported (but not destroyed), we want to spend some 1673 * time flushing as many metaslabs as we can in an attempt to 1674 * destroy log space maps and save import time. 1675 */ 1676 if (spa_should_flush_logs_on_unload(spa)) 1677 spa_unload_log_sm_flush_all(spa); 1678 1679 /* 1680 * Stop async tasks. 1681 */ 1682 spa_async_suspend(spa); 1683 1684 if (spa->spa_root_vdev) { 1685 vdev_t *root_vdev = spa->spa_root_vdev; 1686 vdev_initialize_stop_all(root_vdev, 1687 VDEV_INITIALIZE_ACTIVE); 1688 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 1689 vdev_autotrim_stop_all(spa); 1690 vdev_rebuild_stop_all(spa); 1691 } 1692 } 1693 1694 /* 1695 * Stop syncing. 1696 */ 1697 if (spa->spa_sync_on) { 1698 txg_sync_stop(spa->spa_dsl_pool); 1699 spa->spa_sync_on = B_FALSE; 1700 } 1701 1702 /* 1703 * This ensures that there is no async metaslab prefetching 1704 * while we attempt to unload the spa. 
1705 */ 1706 if (spa->spa_root_vdev != NULL) { 1707 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 1708 vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; 1709 if (vc->vdev_mg != NULL) 1710 taskq_wait(vc->vdev_mg->mg_taskq); 1711 } 1712 } 1713 1714 if (spa->spa_mmp.mmp_thread) 1715 mmp_thread_stop(spa); 1716 1717 /* 1718 * Wait for any outstanding async I/O to complete. 1719 */ 1720 if (spa->spa_async_zio_root != NULL) { 1721 for (int i = 0; i < max_ncpus; i++) 1722 (void) zio_wait(spa->spa_async_zio_root[i]); 1723 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1724 spa->spa_async_zio_root = NULL; 1725 } 1726 1727 if (spa->spa_vdev_removal != NULL) { 1728 spa_vdev_removal_destroy(spa->spa_vdev_removal); 1729 spa->spa_vdev_removal = NULL; 1730 } 1731 1732 spa_destroy_aux_threads(spa); 1733 1734 spa_condense_fini(spa); 1735 1736 bpobj_close(&spa->spa_deferred_bpobj); 1737 1738 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1739 1740 /* 1741 * Close all vdevs. 1742 */ 1743 if (spa->spa_root_vdev) 1744 vdev_free(spa->spa_root_vdev); 1745 ASSERT(spa->spa_root_vdev == NULL); 1746 1747 /* 1748 * Close the dsl pool. 1749 */ 1750 if (spa->spa_dsl_pool) { 1751 dsl_pool_close(spa->spa_dsl_pool); 1752 spa->spa_dsl_pool = NULL; 1753 spa->spa_meta_objset = NULL; 1754 } 1755 1756 ddt_unload(spa); 1757 brt_unload(spa); 1758 spa_unload_log_sm_metadata(spa); 1759 1760 /* 1761 * Drop and purge level 2 cache 1762 */ 1763 spa_l2cache_drop(spa); 1764 1765 if (spa->spa_spares.sav_vdevs) { 1766 for (int i = 0; i < spa->spa_spares.sav_count; i++) 1767 vdev_free(spa->spa_spares.sav_vdevs[i]); 1768 kmem_free(spa->spa_spares.sav_vdevs, 1769 spa->spa_spares.sav_count * sizeof (void *)); 1770 spa->spa_spares.sav_vdevs = NULL; 1771 } 1772 if (spa->spa_spares.sav_config) { 1773 nvlist_free(spa->spa_spares.sav_config); 1774 spa->spa_spares.sav_config = NULL; 1775 } 1776 spa->spa_spares.sav_count = 0; 1777 1778 if (spa->spa_l2cache.sav_vdevs) { 1779 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 1780 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1781 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1782 } 1783 kmem_free(spa->spa_l2cache.sav_vdevs, 1784 spa->spa_l2cache.sav_count * sizeof (void *)); 1785 spa->spa_l2cache.sav_vdevs = NULL; 1786 } 1787 if (spa->spa_l2cache.sav_config) { 1788 nvlist_free(spa->spa_l2cache.sav_config); 1789 spa->spa_l2cache.sav_config = NULL; 1790 } 1791 spa->spa_l2cache.sav_count = 0; 1792 1793 spa->spa_async_suspended = 0; 1794 1795 spa->spa_indirect_vdevs_loaded = B_FALSE; 1796 1797 if (spa->spa_comment != NULL) { 1798 spa_strfree(spa->spa_comment); 1799 spa->spa_comment = NULL; 1800 } 1801 if (spa->spa_compatibility != NULL) { 1802 spa_strfree(spa->spa_compatibility); 1803 spa->spa_compatibility = NULL; 1804 } 1805 1806 spa_config_exit(spa, SCL_ALL, spa); 1807 } 1808 1809 /* 1810 * Load (or re-load) the current list of vdevs describing the active spares for 1811 * this pool. When this is called, we have some form of basic information in 1812 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1813 * then re-generate a more complete list including status information. 1814 */ 1815 void 1816 spa_load_spares(spa_t *spa) 1817 { 1818 nvlist_t **spares; 1819 uint_t nspares; 1820 int i; 1821 vdev_t *vd, *tvd; 1822 1823 #ifndef _KERNEL 1824 /* 1825 * zdb opens both the current state of the pool and the 1826 * checkpointed state (if present), with a different spa_t. 
* 1828 * As spare vdevs are shared among open pools, we skip loading 1829 * them when we load the checkpointed state of the pool. 1830 */ 1831 if (!spa_writeable(spa)) 1832 return; 1833 #endif 1834 1835 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1836 1837 /* 1838 * First, close and free any existing spare vdevs. 1839 */ 1840 if (spa->spa_spares.sav_vdevs) { 1841 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1842 vd = spa->spa_spares.sav_vdevs[i]; 1843 1844 /* Undo the call to spa_activate() below */ 1845 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1846 B_FALSE)) != NULL && tvd->vdev_isspare) 1847 spa_spare_remove(tvd); 1848 vdev_close(vd); 1849 vdev_free(vd); 1850 } 1851 1852 kmem_free(spa->spa_spares.sav_vdevs, 1853 spa->spa_spares.sav_count * sizeof (void *)); 1854 } 1855 1856 if (spa->spa_spares.sav_config == NULL) 1857 nspares = 0; 1858 else 1859 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1860 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 1861 1862 spa->spa_spares.sav_count = (int)nspares; 1863 spa->spa_spares.sav_vdevs = NULL; 1864 1865 if (nspares == 0) 1866 return; 1867 1868 /* 1869 * Construct the array of vdevs, opening them to get status in the 1870 * process. For each spare, there are potentially two different vdev_t 1871 * structures associated with it: one in the list of spares (used only 1872 * for basic validation purposes) and one in the active vdev 1873 * configuration (if it's spared in). During this phase we open and 1874 * validate each vdev on the spare list. If the vdev also exists in the 1875 * active configuration, then we also mark this vdev as an active spare. 1876 */ 1877 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 1878 KM_SLEEP); 1879 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1880 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1881 VDEV_ALLOC_SPARE) == 0); 1882 ASSERT(vd != NULL); 1883 1884 spa->spa_spares.sav_vdevs[i] = vd; 1885 1886 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1887 B_FALSE)) != NULL) { 1888 if (!tvd->vdev_isspare) 1889 spa_spare_add(tvd); 1890 1891 /* 1892 * We only mark the spare active if we were successfully 1893 * able to load the vdev. Otherwise, importing a pool 1894 * with a bad active spare would result in strange 1895 * behavior, because multiple pools would think the spare 1896 * is actively in use. 1897 * 1898 * There is a vulnerability here to an equally bizarre 1899 * circumstance, where a dead active spare is later 1900 * brought back to life (onlined or otherwise). Given 1901 * the rarity of this scenario, and the extra complexity 1902 * it adds, we ignore the possibility. 1903 */ 1904 if (!vdev_is_dead(tvd)) 1905 spa_spare_activate(tvd); 1906 } 1907 1908 vd->vdev_top = vd; 1909 vd->vdev_aux = &spa->spa_spares; 1910 1911 if (vdev_open(vd) != 0) 1912 continue; 1913 1914 if (vdev_validate_aux(vd) == 0) 1915 spa_spare_add(vd); 1916 } 1917 1918 /* 1919 * Recompute the stashed list of spares, with status information 1920 * this time.
1921 */ 1922 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 1923 1924 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1925 KM_SLEEP); 1926 for (i = 0; i < spa->spa_spares.sav_count; i++) 1927 spares[i] = vdev_config_generate(spa, 1928 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1929 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 1930 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 1931 spa->spa_spares.sav_count); 1932 for (i = 0; i < spa->spa_spares.sav_count; i++) 1933 nvlist_free(spares[i]); 1934 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1935 } 1936 1937 /* 1938 * Load (or re-load) the current list of vdevs describing the active l2cache for 1939 * this pool. When this is called, we have some form of basic information in 1940 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1941 * then re-generate a more complete list including status information. 1942 * Devices which are already active have their details maintained, and are 1943 * not re-opened. 1944 */ 1945 void 1946 spa_load_l2cache(spa_t *spa) 1947 { 1948 nvlist_t **l2cache = NULL; 1949 uint_t nl2cache; 1950 int i, j, oldnvdevs; 1951 uint64_t guid; 1952 vdev_t *vd, **oldvdevs, **newvdevs; 1953 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1954 1955 #ifndef _KERNEL 1956 /* 1957 * zdb opens both the current state of the pool and the 1958 * checkpointed state (if present), with a different spa_t. 1959 * 1960 * As L2 caches are part of the ARC which is shared among open 1961 * pools, we skip loading them when we load the checkpointed 1962 * state of the pool. 1963 */ 1964 if (!spa_writeable(spa)) 1965 return; 1966 #endif 1967 1968 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1969 1970 oldvdevs = sav->sav_vdevs; 1971 oldnvdevs = sav->sav_count; 1972 sav->sav_vdevs = NULL; 1973 sav->sav_count = 0; 1974 1975 if (sav->sav_config == NULL) { 1976 nl2cache = 0; 1977 newvdevs = NULL; 1978 goto out; 1979 } 1980 1981 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 1982 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 1983 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1984 1985 /* 1986 * Process new nvlist of vdevs. 1987 */ 1988 for (i = 0; i < nl2cache; i++) { 1989 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 1990 1991 newvdevs[i] = NULL; 1992 for (j = 0; j < oldnvdevs; j++) { 1993 vd = oldvdevs[j]; 1994 if (vd != NULL && guid == vd->vdev_guid) { 1995 /* 1996 * Retain previous vdev for add/remove ops. 1997 */ 1998 newvdevs[i] = vd; 1999 oldvdevs[j] = NULL; 2000 break; 2001 } 2002 } 2003 2004 if (newvdevs[i] == NULL) { 2005 /* 2006 * Create new vdev 2007 */ 2008 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2009 VDEV_ALLOC_L2CACHE) == 0); 2010 ASSERT(vd != NULL); 2011 newvdevs[i] = vd; 2012 2013 /* 2014 * Commit this vdev as an l2cache device, 2015 * even if it fails to open. 2016 */ 2017 spa_l2cache_add(vd); 2018 2019 vd->vdev_top = vd; 2020 vd->vdev_aux = sav; 2021 2022 spa_l2cache_activate(vd); 2023 2024 if (vdev_open(vd) != 0) 2025 continue; 2026 2027 (void) vdev_validate_aux(vd); 2028 2029 if (!vdev_is_dead(vd)) 2030 l2arc_add_vdev(spa, vd); 2031 2032 /* 2033 * Upon cache device addition to a pool or pool 2034 * creation with a cache device or if the header 2035 * of the device is invalid we issue an async 2036 * TRIM command for the whole device which will 2037 * execute if l2arc_trim_ahead > 0. 
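 *
 * For example (assuming the usual Linux module-parameter interface
 * for the l2arc_trim_ahead tunable referenced above), an
 * administrator could allow this TRIM to run with something like:
 *
 *	echo 100 > /sys/module/zfs/parameters/l2arc_trim_ahead
 *
 * whereas a value of 0 leaves the cache device untrimmed.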
2038 */ 2039 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2040 } 2041 } 2042 2043 sav->sav_vdevs = newvdevs; 2044 sav->sav_count = (int)nl2cache; 2045 2046 /* 2047 * Recompute the stashed list of l2cache devices, with status 2048 * information this time. 2049 */ 2050 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2051 2052 if (sav->sav_count > 0) 2053 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2054 KM_SLEEP); 2055 for (i = 0; i < sav->sav_count; i++) 2056 l2cache[i] = vdev_config_generate(spa, 2057 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2058 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2059 (const nvlist_t * const *)l2cache, sav->sav_count); 2060 2061 out: 2062 /* 2063 * Purge vdevs that were dropped 2064 */ 2065 if (oldvdevs) { 2066 for (i = 0; i < oldnvdevs; i++) { 2067 uint64_t pool; 2068 2069 vd = oldvdevs[i]; 2070 if (vd != NULL) { 2071 ASSERT(vd->vdev_isl2cache); 2072 2073 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2074 pool != 0ULL && l2arc_vdev_present(vd)) 2075 l2arc_remove_vdev(vd); 2076 vdev_clear_stats(vd); 2077 vdev_free(vd); 2078 } 2079 } 2080 2081 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2082 } 2083 2084 for (i = 0; i < sav->sav_count; i++) 2085 nvlist_free(l2cache[i]); 2086 if (sav->sav_count) 2087 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2088 } 2089 2090 static int 2091 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2092 { 2093 dmu_buf_t *db; 2094 char *packed = NULL; 2095 size_t nvsize = 0; 2096 int error; 2097 *value = NULL; 2098 2099 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2100 if (error) 2101 return (error); 2102 2103 nvsize = *(uint64_t *)db->db_data; 2104 dmu_buf_rele(db, FTAG); 2105 2106 packed = vmem_alloc(nvsize, KM_SLEEP); 2107 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2108 DMU_READ_PREFETCH); 2109 if (error == 0) 2110 error = nvlist_unpack(packed, nvsize, value, 0); 2111 vmem_free(packed, nvsize); 2112 2113 return (error); 2114 } 2115 2116 /* 2117 * Concrete top-level vdevs that are not missing and are not logs. At every 2118 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2119 */ 2120 static uint64_t 2121 spa_healthy_core_tvds(spa_t *spa) 2122 { 2123 vdev_t *rvd = spa->spa_root_vdev; 2124 uint64_t tvds = 0; 2125 2126 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2127 vdev_t *vd = rvd->vdev_child[i]; 2128 if (vd->vdev_islog) 2129 continue; 2130 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2131 tvds++; 2132 } 2133 2134 return (tvds); 2135 } 2136 2137 /* 2138 * Checks to see if the given vdev could not be opened, in which case we post a 2139 * sysevent to notify the autoreplace code that the device has been removed. 2140 */ 2141 static void 2142 spa_check_removed(vdev_t *vd) 2143 { 2144 for (uint64_t c = 0; c < vd->vdev_children; c++) 2145 spa_check_removed(vd->vdev_child[c]); 2146 2147 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2148 vdev_is_concrete(vd)) { 2149 zfs_post_autoreplace(vd->vdev_spa, vd); 2150 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2151 } 2152 } 2153 2154 static int 2155 spa_check_for_missing_logs(spa_t *spa) 2156 { 2157 vdev_t *rvd = spa->spa_root_vdev; 2158 2159 /* 2160 * If we're doing a normal import, then build up any additional 2161 * diagnostic information about missing log devices. 2162 * We'll pass this up to the user for further processing. 
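 *
 * Sketch of the result (illustrative only): each missing log
 * top-level vdev is encoded with vdev_config_generate() and the
 * resulting list is attached to spa_load_info under
 * ZPOOL_CONFIG_MISSING_DEVICES, where userland tools such as
 * 'zpool import' can report it back to the user.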
2163 */ 2164 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2165 nvlist_t **child, *nv; 2166 uint64_t idx = 0; 2167 2168 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2169 KM_SLEEP); 2170 nv = fnvlist_alloc(); 2171 2172 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2173 vdev_t *tvd = rvd->vdev_child[c]; 2174 2175 /* 2176 * We consider a device as missing only if it failed 2177 * to open (i.e. offline or faulted is not considered 2178 * as missing). 2179 */ 2180 if (tvd->vdev_islog && 2181 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2182 child[idx++] = vdev_config_generate(spa, tvd, 2183 B_FALSE, VDEV_CONFIG_MISSING); 2184 } 2185 } 2186 2187 if (idx > 0) { 2188 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2189 (const nvlist_t * const *)child, idx); 2190 fnvlist_add_nvlist(spa->spa_load_info, 2191 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2192 2193 for (uint64_t i = 0; i < idx; i++) 2194 nvlist_free(child[i]); 2195 } 2196 nvlist_free(nv); 2197 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2198 2199 if (idx > 0) { 2200 spa_load_failed(spa, "some log devices are missing"); 2201 vdev_dbgmsg_print_tree(rvd, 2); 2202 return (SET_ERROR(ENXIO)); 2203 } 2204 } else { 2205 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2206 vdev_t *tvd = rvd->vdev_child[c]; 2207 2208 if (tvd->vdev_islog && 2209 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2210 spa_set_log_state(spa, SPA_LOG_CLEAR); 2211 spa_load_note(spa, "some log devices are " 2212 "missing, ZIL is dropped."); 2213 vdev_dbgmsg_print_tree(rvd, 2); 2214 break; 2215 } 2216 } 2217 } 2218 2219 return (0); 2220 } 2221 2222 /* 2223 * Check for missing log devices 2224 */ 2225 static boolean_t 2226 spa_check_logs(spa_t *spa) 2227 { 2228 boolean_t rv = B_FALSE; 2229 dsl_pool_t *dp = spa_get_dsl(spa); 2230 2231 switch (spa->spa_log_state) { 2232 default: 2233 break; 2234 case SPA_LOG_MISSING: 2235 /* need to recheck in case slog has been restored */ 2236 case SPA_LOG_UNKNOWN: 2237 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2238 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2239 if (rv) 2240 spa_set_log_state(spa, SPA_LOG_MISSING); 2241 break; 2242 } 2243 return (rv); 2244 } 2245 2246 /* 2247 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2248 */ 2249 static boolean_t 2250 spa_passivate_log(spa_t *spa) 2251 { 2252 vdev_t *rvd = spa->spa_root_vdev; 2253 boolean_t slog_found = B_FALSE; 2254 2255 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2256 2257 for (int c = 0; c < rvd->vdev_children; c++) { 2258 vdev_t *tvd = rvd->vdev_child[c]; 2259 2260 if (tvd->vdev_islog) { 2261 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2262 metaslab_group_passivate(tvd->vdev_mg); 2263 slog_found = B_TRUE; 2264 } 2265 } 2266 2267 return (slog_found); 2268 } 2269 2270 /* 2271 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
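 *
 * This is the inverse of spa_passivate_log() above: the metaslab
 * group of every top-level log vdev is re-activated so that log
 * allocations may resume.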
2272 */ 2273 static void 2274 spa_activate_log(spa_t *spa) 2275 { 2276 vdev_t *rvd = spa->spa_root_vdev; 2277 2278 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2279 2280 for (int c = 0; c < rvd->vdev_children; c++) { 2281 vdev_t *tvd = rvd->vdev_child[c]; 2282 2283 if (tvd->vdev_islog) { 2284 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2285 metaslab_group_activate(tvd->vdev_mg); 2286 } 2287 } 2288 } 2289 2290 int 2291 spa_reset_logs(spa_t *spa) 2292 { 2293 int error; 2294 2295 error = dmu_objset_find(spa_name(spa), zil_reset, 2296 NULL, DS_FIND_CHILDREN); 2297 if (error == 0) { 2298 /* 2299 * We successfully offlined the log device, sync out the 2300 * current txg so that the "stubby" block can be removed 2301 * by zil_sync(). 2302 */ 2303 txg_wait_synced(spa->spa_dsl_pool, 0); 2304 } 2305 return (error); 2306 } 2307 2308 static void 2309 spa_aux_check_removed(spa_aux_vdev_t *sav) 2310 { 2311 for (int i = 0; i < sav->sav_count; i++) 2312 spa_check_removed(sav->sav_vdevs[i]); 2313 } 2314 2315 void 2316 spa_claim_notify(zio_t *zio) 2317 { 2318 spa_t *spa = zio->io_spa; 2319 2320 if (zio->io_error) 2321 return; 2322 2323 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2324 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2325 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2326 mutex_exit(&spa->spa_props_lock); 2327 } 2328 2329 typedef struct spa_load_error { 2330 boolean_t sle_verify_data; 2331 uint64_t sle_meta_count; 2332 uint64_t sle_data_count; 2333 } spa_load_error_t; 2334 2335 static void 2336 spa_load_verify_done(zio_t *zio) 2337 { 2338 blkptr_t *bp = zio->io_bp; 2339 spa_load_error_t *sle = zio->io_private; 2340 dmu_object_type_t type = BP_GET_TYPE(bp); 2341 int error = zio->io_error; 2342 spa_t *spa = zio->io_spa; 2343 2344 abd_free(zio->io_abd); 2345 if (error) { 2346 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2347 type != DMU_OT_INTENT_LOG) 2348 atomic_inc_64(&sle->sle_meta_count); 2349 else 2350 atomic_inc_64(&sle->sle_data_count); 2351 } 2352 2353 mutex_enter(&spa->spa_scrub_lock); 2354 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2355 cv_broadcast(&spa->spa_scrub_io_cv); 2356 mutex_exit(&spa->spa_scrub_lock); 2357 } 2358 2359 /* 2360 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2361 * By default, we set it to 1/16th of the arc. 2362 */ 2363 static uint_t spa_load_verify_shift = 4; 2364 static int spa_load_verify_metadata = B_TRUE; 2365 static int spa_load_verify_data = B_TRUE; 2366 2367 static int 2368 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2369 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2370 { 2371 zio_t *rio = arg; 2372 spa_load_error_t *sle = rio->io_private; 2373 2374 (void) zilog, (void) dnp; 2375 2376 /* 2377 * Note: normally this routine will not be called if 2378 * spa_load_verify_metadata is not set. However, it may be useful 2379 * to manually set the flag after the traversal has begun. 2380 */ 2381 if (!spa_load_verify_metadata) 2382 return (0); 2383 2384 /* 2385 * Sanity check the block pointer in order to detect obvious damage 2386 * before using the contents in subsequent checks or in zio_read(). 2387 * When damaged consider it to be a metadata error since we cannot 2388 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
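 *
 * As a rough worked example of the in-flight throttle applied further
 * below (illustrative numbers, assuming the default
 * spa_load_verify_shift of 4): with an ARC target of 16 GiB, at most
 * about 16 GiB >> 4 = 1 GiB of verification reads are kept
 * outstanding at any one time.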
2389 */
2390 if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
2391 atomic_inc_64(&sle->sle_meta_count);
2392 return (0);
2393 }
2394
2395 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
2396 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
2397 return (0);
2398
2399 if (!BP_IS_METADATA(bp) &&
2400 (!spa_load_verify_data || !sle->sle_verify_data))
2401 return (0);
2402
2403 uint64_t maxinflight_bytes =
2404 arc_target_bytes() >> spa_load_verify_shift;
2405 size_t size = BP_GET_PSIZE(bp);
2406
2407 mutex_enter(&spa->spa_scrub_lock);
2408 while (spa->spa_load_verify_bytes >= maxinflight_bytes)
2409 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2410 spa->spa_load_verify_bytes += size;
2411 mutex_exit(&spa->spa_scrub_lock);
2412
2413 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2414 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2415 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2416 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2417 return (0);
2418 }
2419
2420 static int
2421 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2422 {
2423 (void) dp, (void) arg;
2424
2425 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2426 return (SET_ERROR(ENAMETOOLONG));
2427
2428 return (0);
2429 }
2430
2431 static int
2432 spa_load_verify(spa_t *spa)
2433 {
2434 zio_t *rio;
2435 spa_load_error_t sle = { 0 };
2436 zpool_load_policy_t policy;
2437 boolean_t verify_ok = B_FALSE;
2438 int error = 0;
2439
2440 zpool_get_load_policy(spa->spa_config, &policy);
2441
2442 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
2443 policy.zlp_maxmeta == UINT64_MAX)
2444 return (0);
2445
2446 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2447 error = dmu_objset_find_dp(spa->spa_dsl_pool,
2448 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2449 DS_FIND_CHILDREN);
2450 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2451 if (error != 0)
2452 return (error);
2453
2454 /*
2455 * Verify data only if we are rewinding or an error limit was set.
2456 * Otherwise nothing except dbgmsg cares about it, so don't waste the time.
2457 */
2458 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
2459 (policy.zlp_maxdata < UINT64_MAX);
2460
2461 rio = zio_root(spa, NULL, &sle,
2462 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2463
2464 if (spa_load_verify_metadata) {
2465 if (spa->spa_extreme_rewind) {
2466 spa_load_note(spa, "performing a complete scan of the "
2467 "pool since extreme rewind is on.
This may take " 2468 "a very long time.\n (spa_load_verify_data=%u, " 2469 "spa_load_verify_metadata=%u)", 2470 spa_load_verify_data, spa_load_verify_metadata); 2471 } 2472 2473 error = traverse_pool(spa, spa->spa_verify_min_txg, 2474 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2475 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2476 } 2477 2478 (void) zio_wait(rio); 2479 ASSERT0(spa->spa_load_verify_bytes); 2480 2481 spa->spa_load_meta_errors = sle.sle_meta_count; 2482 spa->spa_load_data_errors = sle.sle_data_count; 2483 2484 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2485 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2486 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2487 (u_longlong_t)sle.sle_data_count); 2488 } 2489 2490 if (spa_load_verify_dryrun || 2491 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2492 sle.sle_data_count <= policy.zlp_maxdata)) { 2493 int64_t loss = 0; 2494 2495 verify_ok = B_TRUE; 2496 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2497 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2498 2499 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2500 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2501 spa->spa_load_txg_ts); 2502 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2503 loss); 2504 fnvlist_add_uint64(spa->spa_load_info, 2505 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2506 fnvlist_add_uint64(spa->spa_load_info, 2507 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2508 } else { 2509 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2510 } 2511 2512 if (spa_load_verify_dryrun) 2513 return (0); 2514 2515 if (error) { 2516 if (error != ENXIO && error != EIO) 2517 error = SET_ERROR(EIO); 2518 return (error); 2519 } 2520 2521 return (verify_ok ? 0 : EIO); 2522 } 2523 2524 /* 2525 * Find a value in the pool props object. 2526 */ 2527 static void 2528 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2529 { 2530 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2531 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2532 } 2533 2534 /* 2535 * Find a value in the pool directory object. 
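 *
 * For example, spa_ld_trusted_config() below uses
 * spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
 * to locate the object that holds the trusted MOS config.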
2536 */ 2537 static int 2538 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2539 { 2540 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2541 name, sizeof (uint64_t), 1, val); 2542 2543 if (error != 0 && (error != ENOENT || log_enoent)) { 2544 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2545 "[error=%d]", name, error); 2546 } 2547 2548 return (error); 2549 } 2550 2551 static int 2552 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2553 { 2554 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2555 return (SET_ERROR(err)); 2556 } 2557 2558 boolean_t 2559 spa_livelist_delete_check(spa_t *spa) 2560 { 2561 return (spa->spa_livelists_to_delete != 0); 2562 } 2563 2564 static boolean_t 2565 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2566 { 2567 (void) z; 2568 spa_t *spa = arg; 2569 return (spa_livelist_delete_check(spa)); 2570 } 2571 2572 static int 2573 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2574 { 2575 spa_t *spa = arg; 2576 zio_free(spa, tx->tx_txg, bp); 2577 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2578 -bp_get_dsize_sync(spa, bp), 2579 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2580 return (0); 2581 } 2582 2583 static int 2584 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2585 { 2586 int err; 2587 zap_cursor_t zc; 2588 zap_attribute_t za; 2589 zap_cursor_init(&zc, os, zap_obj); 2590 err = zap_cursor_retrieve(&zc, &za); 2591 zap_cursor_fini(&zc); 2592 if (err == 0) 2593 *llp = za.za_first_integer; 2594 return (err); 2595 } 2596 2597 /* 2598 * Components of livelist deletion that must be performed in syncing 2599 * context: freeing block pointers and updating the pool-wide data 2600 * structures to indicate how much work is left to do 2601 */ 2602 typedef struct sublist_delete_arg { 2603 spa_t *spa; 2604 dsl_deadlist_t *ll; 2605 uint64_t key; 2606 bplist_t *to_free; 2607 } sublist_delete_arg_t; 2608 2609 static void 2610 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2611 { 2612 sublist_delete_arg_t *sda = arg; 2613 spa_t *spa = sda->spa; 2614 dsl_deadlist_t *ll = sda->ll; 2615 uint64_t key = sda->key; 2616 bplist_t *to_free = sda->to_free; 2617 2618 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2619 dsl_deadlist_remove_entry(ll, key, tx); 2620 } 2621 2622 typedef struct livelist_delete_arg { 2623 spa_t *spa; 2624 uint64_t ll_obj; 2625 uint64_t zap_obj; 2626 } livelist_delete_arg_t; 2627 2628 static void 2629 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2630 { 2631 livelist_delete_arg_t *lda = arg; 2632 spa_t *spa = lda->spa; 2633 uint64_t ll_obj = lda->ll_obj; 2634 uint64_t zap_obj = lda->zap_obj; 2635 objset_t *mos = spa->spa_meta_objset; 2636 uint64_t count; 2637 2638 /* free the livelist and decrement the feature count */ 2639 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2640 dsl_deadlist_free(mos, ll_obj, tx); 2641 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2642 VERIFY0(zap_count(mos, zap_obj, &count)); 2643 if (count == 0) { 2644 /* no more livelists to delete */ 2645 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2646 DMU_POOL_DELETED_CLONES, tx)); 2647 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2648 spa->spa_livelists_to_delete = 0; 2649 spa_notify_waiters(spa); 2650 } 2651 } 2652 2653 /* 2654 * Load in the value for the livelist to be removed and open it. Then, 2655 * load its first sublist and determine which block pointers should actually 2656 * be freed. 
Then, call a synctask which performs the actual frees and updates 2657 * the pool-wide livelist data. 2658 */ 2659 static void 2660 spa_livelist_delete_cb(void *arg, zthr_t *z) 2661 { 2662 spa_t *spa = arg; 2663 uint64_t ll_obj = 0, count; 2664 objset_t *mos = spa->spa_meta_objset; 2665 uint64_t zap_obj = spa->spa_livelists_to_delete; 2666 /* 2667 * Determine the next livelist to delete. This function should only 2668 * be called if there is at least one deleted clone. 2669 */ 2670 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 2671 VERIFY0(zap_count(mos, ll_obj, &count)); 2672 if (count > 0) { 2673 dsl_deadlist_t *ll; 2674 dsl_deadlist_entry_t *dle; 2675 bplist_t to_free; 2676 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 2677 dsl_deadlist_open(ll, mos, ll_obj); 2678 dle = dsl_deadlist_first(ll); 2679 ASSERT3P(dle, !=, NULL); 2680 bplist_create(&to_free); 2681 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 2682 z, NULL); 2683 if (err == 0) { 2684 sublist_delete_arg_t sync_arg = { 2685 .spa = spa, 2686 .ll = ll, 2687 .key = dle->dle_mintxg, 2688 .to_free = &to_free 2689 }; 2690 zfs_dbgmsg("deleting sublist (id %llu) from" 2691 " livelist %llu, %lld remaining", 2692 (u_longlong_t)dle->dle_bpobj.bpo_object, 2693 (u_longlong_t)ll_obj, (longlong_t)count - 1); 2694 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 2695 sublist_delete_sync, &sync_arg, 0, 2696 ZFS_SPACE_CHECK_DESTROY)); 2697 } else { 2698 VERIFY3U(err, ==, EINTR); 2699 } 2700 bplist_clear(&to_free); 2701 bplist_destroy(&to_free); 2702 dsl_deadlist_close(ll); 2703 kmem_free(ll, sizeof (dsl_deadlist_t)); 2704 } else { 2705 livelist_delete_arg_t sync_arg = { 2706 .spa = spa, 2707 .ll_obj = ll_obj, 2708 .zap_obj = zap_obj 2709 }; 2710 zfs_dbgmsg("deletion of livelist %llu completed", 2711 (u_longlong_t)ll_obj); 2712 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 2713 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 2714 } 2715 } 2716 2717 static void 2718 spa_start_livelist_destroy_thread(spa_t *spa) 2719 { 2720 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 2721 spa->spa_livelist_delete_zthr = 2722 zthr_create("z_livelist_destroy", 2723 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 2724 minclsyspri); 2725 } 2726 2727 typedef struct livelist_new_arg { 2728 bplist_t *allocs; 2729 bplist_t *frees; 2730 } livelist_new_arg_t; 2731 2732 static int 2733 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 2734 dmu_tx_t *tx) 2735 { 2736 ASSERT(tx == NULL); 2737 livelist_new_arg_t *lna = arg; 2738 if (bp_freed) { 2739 bplist_append(lna->frees, bp); 2740 } else { 2741 bplist_append(lna->allocs, bp); 2742 zfs_livelist_condense_new_alloc++; 2743 } 2744 return (0); 2745 } 2746 2747 typedef struct livelist_condense_arg { 2748 spa_t *spa; 2749 bplist_t to_keep; 2750 uint64_t first_size; 2751 uint64_t next_size; 2752 } livelist_condense_arg_t; 2753 2754 static void 2755 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 2756 { 2757 livelist_condense_arg_t *lca = arg; 2758 spa_t *spa = lca->spa; 2759 bplist_t new_frees; 2760 dsl_dataset_t *ds = spa->spa_to_condense.ds; 2761 2762 /* Have we been cancelled? 
*/ 2763 if (spa->spa_to_condense.cancelled) { 2764 zfs_livelist_condense_sync_cancel++; 2765 goto out; 2766 } 2767 2768 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2769 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2770 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 2771 2772 /* 2773 * It's possible that the livelist was changed while the zthr was 2774 * running. Therefore, we need to check for new blkptrs in the two 2775 * entries being condensed and continue to track them in the livelist. 2776 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 2777 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 2778 * we need to sort them into two different bplists. 2779 */ 2780 uint64_t first_obj = first->dle_bpobj.bpo_object; 2781 uint64_t next_obj = next->dle_bpobj.bpo_object; 2782 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2783 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2784 2785 bplist_create(&new_frees); 2786 livelist_new_arg_t new_bps = { 2787 .allocs = &lca->to_keep, 2788 .frees = &new_frees, 2789 }; 2790 2791 if (cur_first_size > lca->first_size) { 2792 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 2793 livelist_track_new_cb, &new_bps, lca->first_size)); 2794 } 2795 if (cur_next_size > lca->next_size) { 2796 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 2797 livelist_track_new_cb, &new_bps, lca->next_size)); 2798 } 2799 2800 dsl_deadlist_clear_entry(first, ll, tx); 2801 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 2802 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 2803 2804 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 2805 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 2806 bplist_destroy(&new_frees); 2807 2808 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 2809 dsl_dataset_name(ds, dsname); 2810 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 2811 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 2812 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 2813 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 2814 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 2815 (u_longlong_t)cur_next_size, 2816 (u_longlong_t)first->dle_bpobj.bpo_object, 2817 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 2818 out: 2819 dmu_buf_rele(ds->ds_dbuf, spa); 2820 spa->spa_to_condense.ds = NULL; 2821 bplist_clear(&lca->to_keep); 2822 bplist_destroy(&lca->to_keep); 2823 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2824 spa->spa_to_condense.syncing = B_FALSE; 2825 } 2826 2827 static void 2828 spa_livelist_condense_cb(void *arg, zthr_t *t) 2829 { 2830 while (zfs_livelist_condense_zthr_pause && 2831 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2832 delay(1); 2833 2834 spa_t *spa = arg; 2835 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2836 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2837 uint64_t first_size, next_size; 2838 2839 livelist_condense_arg_t *lca = 2840 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 2841 bplist_create(&lca->to_keep); 2842 2843 /* 2844 * Process the livelists (matching FREEs and ALLOCs) in open context 2845 * so we have minimal work in syncing context to condense. 2846 * 2847 * We save bpobj sizes (first_size and next_size) to use later in 2848 * syncing context to determine if entries were added to these sublists 2849 * while in open context. 
This is possible because the clone is still
2850 * active and open for normal writes and we want to make sure the new,
2851 * unprocessed blockpointers are inserted into the livelist normally.
2852 *
2853 * Note that dsl_process_sub_livelist() both records the number of
2854 * blockpointers and iterates over them while the bpobj's lock is held, so
2855 * the sizes returned to us are consistent with what was actually
2856 * processed.
2857 */
2858 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
2859 &first_size);
2860 if (err == 0)
2861 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
2862 t, &next_size);
2863
2864 if (err == 0) {
2865 while (zfs_livelist_condense_sync_pause &&
2866 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
2867 delay(1);
2868
2869 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
2870 dmu_tx_mark_netfree(tx);
2871 dmu_tx_hold_space(tx, 1);
2872 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
2873 if (err == 0) {
2874 /*
2875 * Prevent the condense zthr from restarting before
2876 * the synctask completes.
2877 */
2878 spa->spa_to_condense.syncing = B_TRUE;
2879 lca->spa = spa;
2880 lca->first_size = first_size;
2881 lca->next_size = next_size;
2882 dsl_sync_task_nowait(spa_get_dsl(spa),
2883 spa_livelist_condense_sync, lca, tx);
2884 dmu_tx_commit(tx);
2885 return;
2886 }
2887 }
2888 /*
2889 * Condensing cannot continue: either it was externally stopped or
2890 * we were unable to assign to a tx because the pool has run out of
2891 * space. In the second case, we'll just end up trying to condense
2892 * again in a later txg.
2893 */
2894 ASSERT(err != 0);
2895 bplist_clear(&lca->to_keep);
2896 bplist_destroy(&lca->to_keep);
2897 kmem_free(lca, sizeof (livelist_condense_arg_t));
2898 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
2899 spa->spa_to_condense.ds = NULL;
2900 if (err == EINTR)
2901 zfs_livelist_condense_zthr_cancel++;
2902 }
2903
2904 /*
2905 * Check that there is something to condense but that a condense is not
2906 * already in progress and that condensing has not been cancelled.
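 *
 * The three conditions map onto spa->spa_to_condense: 'ds' names the
 * clone whose livelist is to be condensed (set elsewhere when a
 * condense is requested), 'syncing' is set by the callback above just
 * before it dispatches the synctask and cleared when
 * spa_livelist_condense_sync() finishes, and 'cancelled' indicates
 * the request was abandoned.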
2907 */ 2908 static boolean_t 2909 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 2910 { 2911 (void) z; 2912 spa_t *spa = arg; 2913 if ((spa->spa_to_condense.ds != NULL) && 2914 (spa->spa_to_condense.syncing == B_FALSE) && 2915 (spa->spa_to_condense.cancelled == B_FALSE)) { 2916 return (B_TRUE); 2917 } 2918 return (B_FALSE); 2919 } 2920 2921 static void 2922 spa_start_livelist_condensing_thread(spa_t *spa) 2923 { 2924 spa->spa_to_condense.ds = NULL; 2925 spa->spa_to_condense.first = NULL; 2926 spa->spa_to_condense.next = NULL; 2927 spa->spa_to_condense.syncing = B_FALSE; 2928 spa->spa_to_condense.cancelled = B_FALSE; 2929 2930 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 2931 spa->spa_livelist_condense_zthr = 2932 zthr_create("z_livelist_condense", 2933 spa_livelist_condense_cb_check, 2934 spa_livelist_condense_cb, spa, minclsyspri); 2935 } 2936 2937 static void 2938 spa_spawn_aux_threads(spa_t *spa) 2939 { 2940 ASSERT(spa_writeable(spa)); 2941 2942 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2943 2944 spa_start_indirect_condensing_thread(spa); 2945 spa_start_livelist_destroy_thread(spa); 2946 spa_start_livelist_condensing_thread(spa); 2947 2948 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2949 spa->spa_checkpoint_discard_zthr = 2950 zthr_create("z_checkpoint_discard", 2951 spa_checkpoint_discard_thread_check, 2952 spa_checkpoint_discard_thread, spa, minclsyspri); 2953 } 2954 2955 /* 2956 * Fix up config after a partly-completed split. This is done with the 2957 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2958 * pool have that entry in their config, but only the splitting one contains 2959 * a list of all the guids of the vdevs that are being split off. 2960 * 2961 * This function determines what to do with that list: either rejoin 2962 * all the disks to the pool, or complete the splitting process. To attempt 2963 * the rejoin, each disk that is offlined is marked online again, and 2964 * we do a reopen() call. If the vdev label for every disk that was 2965 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2966 * then we call vdev_split() on each disk, and complete the split. 2967 * 2968 * Otherwise we leave the config alone, with all the vdevs in place in 2969 * the original pool. 2970 */ 2971 static void 2972 spa_try_repair(spa_t *spa, nvlist_t *config) 2973 { 2974 uint_t extracted; 2975 uint64_t *glist; 2976 uint_t i, gcount; 2977 nvlist_t *nvl; 2978 vdev_t **vd; 2979 boolean_t attempt_reopen; 2980 2981 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2982 return; 2983 2984 /* check that the config is complete */ 2985 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2986 &glist, &gcount) != 0) 2987 return; 2988 2989 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2990 2991 /* attempt to online all the vdevs & validate */ 2992 attempt_reopen = B_TRUE; 2993 for (i = 0; i < gcount; i++) { 2994 if (glist[i] == 0) /* vdev is hole */ 2995 continue; 2996 2997 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2998 if (vd[i] == NULL) { 2999 /* 3000 * Don't bother attempting to reopen the disks; 3001 * just do the split. 
3002 */ 3003 attempt_reopen = B_FALSE; 3004 } else { 3005 /* attempt to re-online it */ 3006 vd[i]->vdev_offline = B_FALSE; 3007 } 3008 } 3009 3010 if (attempt_reopen) { 3011 vdev_reopen(spa->spa_root_vdev); 3012 3013 /* check each device to see what state it's in */ 3014 for (extracted = 0, i = 0; i < gcount; i++) { 3015 if (vd[i] != NULL && 3016 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3017 break; 3018 ++extracted; 3019 } 3020 } 3021 3022 /* 3023 * If every disk has been moved to the new pool, or if we never 3024 * even attempted to look at them, then we split them off for 3025 * good. 3026 */ 3027 if (!attempt_reopen || gcount == extracted) { 3028 for (i = 0; i < gcount; i++) 3029 if (vd[i] != NULL) 3030 vdev_split(vd[i]); 3031 vdev_reopen(spa->spa_root_vdev); 3032 } 3033 3034 kmem_free(vd, gcount * sizeof (vdev_t *)); 3035 } 3036 3037 static int 3038 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3039 { 3040 const char *ereport = FM_EREPORT_ZFS_POOL; 3041 int error; 3042 3043 spa->spa_load_state = state; 3044 (void) spa_import_progress_set_state(spa_guid(spa), 3045 spa_load_state(spa)); 3046 3047 gethrestime(&spa->spa_loaded_ts); 3048 error = spa_load_impl(spa, type, &ereport); 3049 3050 /* 3051 * Don't count references from objsets that are already closed 3052 * and are making their way through the eviction process. 3053 */ 3054 spa_evicting_os_wait(spa); 3055 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3056 if (error) { 3057 if (error != EEXIST) { 3058 spa->spa_loaded_ts.tv_sec = 0; 3059 spa->spa_loaded_ts.tv_nsec = 0; 3060 } 3061 if (error != EBADF) { 3062 (void) zfs_ereport_post(ereport, spa, 3063 NULL, NULL, NULL, 0); 3064 } 3065 } 3066 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3067 spa->spa_ena = 0; 3068 3069 (void) spa_import_progress_set_state(spa_guid(spa), 3070 spa_load_state(spa)); 3071 3072 return (error); 3073 } 3074 3075 #ifdef ZFS_DEBUG 3076 /* 3077 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3078 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3079 * spa's per-vdev ZAP list. 3080 */ 3081 static uint64_t 3082 vdev_count_verify_zaps(vdev_t *vd) 3083 { 3084 spa_t *spa = vd->vdev_spa; 3085 uint64_t total = 0; 3086 3087 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3088 vd->vdev_root_zap != 0) { 3089 total++; 3090 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3091 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3092 } 3093 if (vd->vdev_top_zap != 0) { 3094 total++; 3095 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3096 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3097 } 3098 if (vd->vdev_leaf_zap != 0) { 3099 total++; 3100 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3101 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3102 } 3103 3104 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3105 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3106 } 3107 3108 return (total); 3109 } 3110 #else 3111 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3112 #endif 3113 3114 /* 3115 * Determine whether the activity check is required. 
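 *
 * In outline (see the checks below), the activity check is skipped
 * when ZFS_IMPORT_SKIP_MMP is set (zdb), when the MMP feature is not
 * in use, when the uberblock matches the earlier tryimport results,
 * when the label hostid matches this host, or when the pool was
 * cleanly exported; otherwise it is required.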
3116 */ 3117 static boolean_t 3118 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3119 nvlist_t *config) 3120 { 3121 uint64_t state = 0; 3122 uint64_t hostid = 0; 3123 uint64_t tryconfig_txg = 0; 3124 uint64_t tryconfig_timestamp = 0; 3125 uint16_t tryconfig_mmp_seq = 0; 3126 nvlist_t *nvinfo; 3127 3128 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3129 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3130 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3131 &tryconfig_txg); 3132 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3133 &tryconfig_timestamp); 3134 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3135 &tryconfig_mmp_seq); 3136 } 3137 3138 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3139 3140 /* 3141 * Disable the MMP activity check - This is used by zdb which 3142 * is intended to be used on potentially active pools. 3143 */ 3144 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3145 return (B_FALSE); 3146 3147 /* 3148 * Skip the activity check when the MMP feature is disabled. 3149 */ 3150 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3151 return (B_FALSE); 3152 3153 /* 3154 * If the tryconfig_ values are nonzero, they are the results of an 3155 * earlier tryimport. If they all match the uberblock we just found, 3156 * then the pool has not changed and we return false so we do not test 3157 * a second time. 3158 */ 3159 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3160 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3161 tryconfig_mmp_seq && tryconfig_mmp_seq == 3162 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3163 return (B_FALSE); 3164 3165 /* 3166 * Allow the activity check to be skipped when importing the pool 3167 * on the same host which last imported it. Since the hostid from 3168 * configuration may be stale use the one read from the label. 3169 */ 3170 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3171 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3172 3173 if (hostid == spa_get_hostid(spa)) 3174 return (B_FALSE); 3175 3176 /* 3177 * Skip the activity test when the pool was cleanly exported. 3178 */ 3179 if (state != POOL_STATE_ACTIVE) 3180 return (B_FALSE); 3181 3182 return (B_TRUE); 3183 } 3184 3185 /* 3186 * Nanoseconds the activity check must watch for changes on-disk. 3187 */ 3188 static uint64_t 3189 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3190 { 3191 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3192 uint64_t multihost_interval = MSEC2NSEC( 3193 MMP_INTERVAL_OK(zfs_multihost_interval)); 3194 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3195 multihost_interval); 3196 3197 /* 3198 * Local tunables determine a minimum duration except for the case 3199 * where we know when the remote host will suspend the pool if MMP 3200 * writes do not land. 3201 * 3202 * See Big Theory comment at the top of mmp.c for the reasoning behind 3203 * these cases and times. 
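 *
 * Rough example for the zfs-0.7 compatibility case below, with
 * illustrative numbers only (assuming a 1000 ms multihost interval,
 * an observed ub_mmp_delay of ~300 ms, and 20 import intervals):
 * the wait works out to roughly (1.0 s + 0.3 s) * 20 = 26 seconds.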
3204 */ 3205 3206 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3207 3208 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3209 MMP_FAIL_INT(ub) > 0) { 3210 3211 /* MMP on remote host will suspend pool after failed writes */ 3212 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3213 MMP_IMPORT_SAFETY_FACTOR / 100; 3214 3215 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3216 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3217 "import_intervals=%llu", (u_longlong_t)import_delay, 3218 (u_longlong_t)MMP_FAIL_INT(ub), 3219 (u_longlong_t)MMP_INTERVAL(ub), 3220 (u_longlong_t)import_intervals); 3221 3222 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3223 MMP_FAIL_INT(ub) == 0) { 3224 3225 /* MMP on remote host will never suspend pool */ 3226 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3227 ub->ub_mmp_delay) * import_intervals); 3228 3229 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3230 "mmp_interval=%llu ub_mmp_delay=%llu " 3231 "import_intervals=%llu", (u_longlong_t)import_delay, 3232 (u_longlong_t)MMP_INTERVAL(ub), 3233 (u_longlong_t)ub->ub_mmp_delay, 3234 (u_longlong_t)import_intervals); 3235 3236 } else if (MMP_VALID(ub)) { 3237 /* 3238 * zfs-0.7 compatibility case 3239 */ 3240 3241 import_delay = MAX(import_delay, (multihost_interval + 3242 ub->ub_mmp_delay) * import_intervals); 3243 3244 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3245 "import_intervals=%llu leaves=%u", 3246 (u_longlong_t)import_delay, 3247 (u_longlong_t)ub->ub_mmp_delay, 3248 (u_longlong_t)import_intervals, 3249 vdev_count_leaves(spa)); 3250 } else { 3251 /* Using local tunings is the only reasonable option */ 3252 zfs_dbgmsg("pool last imported on non-MMP aware " 3253 "host using import_delay=%llu multihost_interval=%llu " 3254 "import_intervals=%llu", (u_longlong_t)import_delay, 3255 (u_longlong_t)multihost_interval, 3256 (u_longlong_t)import_intervals); 3257 } 3258 3259 return (import_delay); 3260 } 3261 3262 /* 3263 * Perform the import activity check. If the user canceled the import or 3264 * we detected activity then fail. 3265 */ 3266 static int 3267 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3268 { 3269 uint64_t txg = ub->ub_txg; 3270 uint64_t timestamp = ub->ub_timestamp; 3271 uint64_t mmp_config = ub->ub_mmp_config; 3272 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3273 uint64_t import_delay; 3274 hrtime_t import_expire; 3275 nvlist_t *mmp_label = NULL; 3276 vdev_t *rvd = spa->spa_root_vdev; 3277 kcondvar_t cv; 3278 kmutex_t mtx; 3279 int error = 0; 3280 3281 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3282 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3283 mutex_enter(&mtx); 3284 3285 /* 3286 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3287 * during the earlier tryimport. If the txg recorded there is 0 then 3288 * the pool is known to be active on another host. 3289 * 3290 * Otherwise, the pool might be in use on another host. Check for 3291 * changes in the uberblocks on disk if necessary. 
3292 */ 3293 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3294 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3295 ZPOOL_CONFIG_LOAD_INFO); 3296 3297 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3298 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3299 vdev_uberblock_load(rvd, ub, &mmp_label); 3300 error = SET_ERROR(EREMOTEIO); 3301 goto out; 3302 } 3303 } 3304 3305 import_delay = spa_activity_check_duration(spa, ub); 3306 3307 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3308 import_delay += import_delay * random_in_range(250) / 1000; 3309 3310 import_expire = gethrtime() + import_delay; 3311 3312 while (gethrtime() < import_expire) { 3313 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3314 NSEC2SEC(import_expire - gethrtime())); 3315 3316 vdev_uberblock_load(rvd, ub, &mmp_label); 3317 3318 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3319 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3320 zfs_dbgmsg("multihost activity detected " 3321 "txg %llu ub_txg %llu " 3322 "timestamp %llu ub_timestamp %llu " 3323 "mmp_config %#llx ub_mmp_config %#llx", 3324 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3325 (u_longlong_t)timestamp, 3326 (u_longlong_t)ub->ub_timestamp, 3327 (u_longlong_t)mmp_config, 3328 (u_longlong_t)ub->ub_mmp_config); 3329 3330 error = SET_ERROR(EREMOTEIO); 3331 break; 3332 } 3333 3334 if (mmp_label) { 3335 nvlist_free(mmp_label); 3336 mmp_label = NULL; 3337 } 3338 3339 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3340 if (error != -1) { 3341 error = SET_ERROR(EINTR); 3342 break; 3343 } 3344 error = 0; 3345 } 3346 3347 out: 3348 mutex_exit(&mtx); 3349 mutex_destroy(&mtx); 3350 cv_destroy(&cv); 3351 3352 /* 3353 * If the pool is determined to be active store the status in the 3354 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3355 * available from configuration read from disk store them as well. 3356 * This allows 'zpool import' to generate a more useful message. 
3357 * 3358 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3359 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3360 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3361 */ 3362 if (error == EREMOTEIO) { 3363 const char *hostname = "<unknown>"; 3364 uint64_t hostid = 0; 3365 3366 if (mmp_label) { 3367 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3368 hostname = fnvlist_lookup_string(mmp_label, 3369 ZPOOL_CONFIG_HOSTNAME); 3370 fnvlist_add_string(spa->spa_load_info, 3371 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3372 } 3373 3374 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3375 hostid = fnvlist_lookup_uint64(mmp_label, 3376 ZPOOL_CONFIG_HOSTID); 3377 fnvlist_add_uint64(spa->spa_load_info, 3378 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3379 } 3380 } 3381 3382 fnvlist_add_uint64(spa->spa_load_info, 3383 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3384 fnvlist_add_uint64(spa->spa_load_info, 3385 ZPOOL_CONFIG_MMP_TXG, 0); 3386 3387 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3388 } 3389 3390 if (mmp_label) 3391 nvlist_free(mmp_label); 3392 3393 return (error); 3394 } 3395 3396 static int 3397 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3398 { 3399 uint64_t hostid; 3400 const char *hostname; 3401 uint64_t myhostid = 0; 3402 3403 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3404 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3405 hostname = fnvlist_lookup_string(mos_config, 3406 ZPOOL_CONFIG_HOSTNAME); 3407 3408 myhostid = zone_get_hostid(NULL); 3409 3410 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3411 cmn_err(CE_WARN, "pool '%s' could not be " 3412 "loaded as it was last accessed by " 3413 "another system (host: %s hostid: 0x%llx). " 3414 "See: https://openzfs.github.io/openzfs-docs/msg/" 3415 "ZFS-8000-EY", 3416 spa_name(spa), hostname, (u_longlong_t)hostid); 3417 spa_load_failed(spa, "hostid verification failed: pool " 3418 "last accessed by host: %s (hostid: 0x%llx)", 3419 hostname, (u_longlong_t)hostid); 3420 return (SET_ERROR(EBADF)); 3421 } 3422 } 3423 3424 return (0); 3425 } 3426 3427 static int 3428 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3429 { 3430 int error = 0; 3431 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3432 int parse; 3433 vdev_t *rvd; 3434 uint64_t pool_guid; 3435 const char *comment; 3436 const char *compatibility; 3437 3438 /* 3439 * Versioning wasn't explicitly added to the label until later, so if 3440 * it's not present treat it as the initial version. 3441 */ 3442 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3443 &spa->spa_ubsync.ub_version) != 0) 3444 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3445 3446 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3447 spa_load_failed(spa, "invalid config provided: '%s' missing", 3448 ZPOOL_CONFIG_POOL_GUID); 3449 return (SET_ERROR(EINVAL)); 3450 } 3451 3452 /* 3453 * If we are doing an import, ensure that the pool is not already 3454 * imported by checking if its pool guid already exists in the 3455 * spa namespace. 3456 * 3457 * The only case that we allow an already imported pool to be 3458 * imported again, is when the pool is checkpointed and we want to 3459 * look at its checkpointed state from userland tools like zdb. 
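 *
 * That is why the check below differs between builds: in the kernel a
 * duplicate pool guid always fails with EEXIST, while the userland
 * build additionally tolerates the case where
 * spa_importing_readonly_checkpoint() is true, so that zdb can open
 * the checkpointed state alongside the live pool.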
3460 */ 3461 #ifdef _KERNEL 3462 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3463 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3464 spa_guid_exists(pool_guid, 0)) { 3465 #else 3466 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3467 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3468 spa_guid_exists(pool_guid, 0) && 3469 !spa_importing_readonly_checkpoint(spa)) { 3470 #endif 3471 spa_load_failed(spa, "a pool with guid %llu is already open", 3472 (u_longlong_t)pool_guid); 3473 return (SET_ERROR(EEXIST)); 3474 } 3475 3476 spa->spa_config_guid = pool_guid; 3477 3478 nvlist_free(spa->spa_load_info); 3479 spa->spa_load_info = fnvlist_alloc(); 3480 3481 ASSERT(spa->spa_comment == NULL); 3482 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3483 spa->spa_comment = spa_strdup(comment); 3484 3485 ASSERT(spa->spa_compatibility == NULL); 3486 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3487 &compatibility) == 0) 3488 spa->spa_compatibility = spa_strdup(compatibility); 3489 3490 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3491 &spa->spa_config_txg); 3492 3493 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3494 spa->spa_config_splitting = fnvlist_dup(nvl); 3495 3496 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3497 spa_load_failed(spa, "invalid config provided: '%s' missing", 3498 ZPOOL_CONFIG_VDEV_TREE); 3499 return (SET_ERROR(EINVAL)); 3500 } 3501 3502 /* 3503 * Create "The Godfather" zio to hold all async IOs 3504 */ 3505 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3506 KM_SLEEP); 3507 for (int i = 0; i < max_ncpus; i++) { 3508 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3509 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3510 ZIO_FLAG_GODFATHER); 3511 } 3512 3513 /* 3514 * Parse the configuration into a vdev tree. We explicitly set the 3515 * value that will be returned by spa_version() since parsing the 3516 * configuration requires knowing the version number. 3517 */ 3518 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3519 parse = (type == SPA_IMPORT_EXISTING ? 3520 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3521 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3522 spa_config_exit(spa, SCL_ALL, FTAG); 3523 3524 if (error != 0) { 3525 spa_load_failed(spa, "unable to parse config [error=%d]", 3526 error); 3527 return (error); 3528 } 3529 3530 ASSERT(spa->spa_root_vdev == rvd); 3531 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3532 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3533 3534 if (type != SPA_IMPORT_ASSEMBLE) { 3535 ASSERT(spa_guid(spa) == pool_guid); 3536 } 3537 3538 return (0); 3539 } 3540 3541 /* 3542 * Recursively open all vdevs in the vdev tree. This function is called twice: 3543 * first with the untrusted config, then with the trusted config. 3544 */ 3545 static int 3546 spa_ld_open_vdevs(spa_t *spa) 3547 { 3548 int error = 0; 3549 3550 /* 3551 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3552 * missing/unopenable for the root vdev to be still considered openable. 
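 *
 * For example (assuming the usual Linux module-parameter interface
 * for the zfs_max_missing_tvds tunable used below), an administrator
 * attempting data recovery might allow one missing top-level vdev:
 *
 *	echo 1 > /sys/module/zfs/parameters/zfs_max_missing_tvds
 *
 * and then import the pool read-only, as enforced further down.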
3553 */ 3554 if (spa->spa_trust_config) { 3555 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3556 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3557 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3558 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3559 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3560 } else { 3561 spa->spa_missing_tvds_allowed = 0; 3562 } 3563 3564 spa->spa_missing_tvds_allowed = 3565 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3566 3567 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3568 error = vdev_open(spa->spa_root_vdev); 3569 spa_config_exit(spa, SCL_ALL, FTAG); 3570 3571 if (spa->spa_missing_tvds != 0) { 3572 spa_load_note(spa, "vdev tree has %lld missing top-level " 3573 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3574 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3575 /* 3576 * Although theoretically we could allow users to open 3577 * incomplete pools in RW mode, we'd need to add a lot 3578 * of extra logic (e.g. adjust pool space to account 3579 * for missing vdevs). 3580 * This limitation also prevents users from accidentally 3581 * opening the pool in RW mode during data recovery and 3582 * damaging it further. 3583 */ 3584 spa_load_note(spa, "pools with missing top-level " 3585 "vdevs can only be opened in read-only mode."); 3586 error = SET_ERROR(ENXIO); 3587 } else { 3588 spa_load_note(spa, "current settings allow for maximum " 3589 "%lld missing top-level vdevs at this stage.", 3590 (u_longlong_t)spa->spa_missing_tvds_allowed); 3591 } 3592 } 3593 if (error != 0) { 3594 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3595 error); 3596 } 3597 if (spa->spa_missing_tvds != 0 || error != 0) 3598 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3599 3600 return (error); 3601 } 3602 3603 /* 3604 * We need to validate the vdev labels against the configuration that 3605 * we have in hand. This function is called twice: first with an untrusted 3606 * config, then with a trusted config. The validation is more strict when the 3607 * config is trusted. 3608 */ 3609 static int 3610 spa_ld_validate_vdevs(spa_t *spa) 3611 { 3612 int error = 0; 3613 vdev_t *rvd = spa->spa_root_vdev; 3614 3615 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3616 error = vdev_validate(rvd); 3617 spa_config_exit(spa, SCL_ALL, FTAG); 3618 3619 if (error != 0) { 3620 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3621 return (error); 3622 } 3623 3624 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3625 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3626 "some vdevs"); 3627 vdev_dbgmsg_print_tree(rvd, 2); 3628 return (SET_ERROR(ENXIO)); 3629 } 3630 3631 return (0); 3632 } 3633 3634 static void 3635 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3636 { 3637 spa->spa_state = POOL_STATE_ACTIVE; 3638 spa->spa_ubsync = spa->spa_uberblock; 3639 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 3640 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 3641 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
3642 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 3643 spa->spa_claim_max_txg = spa->spa_first_txg; 3644 spa->spa_prev_software_version = ub->ub_software_version; 3645 } 3646 3647 static int 3648 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 3649 { 3650 vdev_t *rvd = spa->spa_root_vdev; 3651 nvlist_t *label; 3652 uberblock_t *ub = &spa->spa_uberblock; 3653 boolean_t activity_check = B_FALSE; 3654 3655 /* 3656 * If we are opening the checkpointed state of the pool by 3657 * rewinding to it, at this point we will have written the 3658 * checkpointed uberblock to the vdev labels, so searching 3659 * the labels will find the right uberblock. However, if 3660 * we are opening the checkpointed state read-only, we have 3661 * not modified the labels. Therefore, we must ignore the 3662 * labels and continue using the spa_uberblock that was set 3663 * by spa_ld_checkpoint_rewind. 3664 * 3665 * Note that it would be fine to ignore the labels when 3666 * rewinding (opening writeable) as well. However, if we 3667 * crash just after writing the labels, we will end up 3668 * searching the labels. Doing so in the common case means 3669 * that this code path gets exercised normally, rather than 3670 * just in the edge case. 3671 */ 3672 if (ub->ub_checkpoint_txg != 0 && 3673 spa_importing_readonly_checkpoint(spa)) { 3674 spa_ld_select_uberblock_done(spa, ub); 3675 return (0); 3676 } 3677 3678 /* 3679 * Find the best uberblock. 3680 */ 3681 vdev_uberblock_load(rvd, ub, &label); 3682 3683 /* 3684 * If we weren't able to find a single valid uberblock, return failure. 3685 */ 3686 if (ub->ub_txg == 0) { 3687 nvlist_free(label); 3688 spa_load_failed(spa, "no valid uberblock found"); 3689 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 3690 } 3691 3692 if (spa->spa_load_max_txg != UINT64_MAX) { 3693 (void) spa_import_progress_set_max_txg(spa_guid(spa), 3694 (u_longlong_t)spa->spa_load_max_txg); 3695 } 3696 spa_load_note(spa, "using uberblock with txg=%llu", 3697 (u_longlong_t)ub->ub_txg); 3698 3699 3700 /* 3701 * For pools which have the multihost property on determine if the 3702 * pool is truly inactive and can be safely imported. Prevent 3703 * hosts which don't have a hostid set from importing the pool. 3704 */ 3705 activity_check = spa_activity_check_required(spa, ub, label, 3706 spa->spa_config); 3707 if (activity_check) { 3708 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 3709 spa_get_hostid(spa) == 0) { 3710 nvlist_free(label); 3711 fnvlist_add_uint64(spa->spa_load_info, 3712 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 3713 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 3714 } 3715 3716 int error = spa_activity_check(spa, ub, spa->spa_config); 3717 if (error) { 3718 nvlist_free(label); 3719 return (error); 3720 } 3721 3722 fnvlist_add_uint64(spa->spa_load_info, 3723 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 3724 fnvlist_add_uint64(spa->spa_load_info, 3725 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 3726 fnvlist_add_uint16(spa->spa_load_info, 3727 ZPOOL_CONFIG_MMP_SEQ, 3728 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 3729 } 3730 3731 /* 3732 * If the pool has an unsupported version we can't open it. 
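 *
 * Both this check and the feature-flag check further below fail the
 * load through spa_vdev_err(); the feature case additionally records
 * the offending feature names in spa_load_info under
 * ZPOOL_CONFIG_UNSUP_FEAT so userland can explain what is missing.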
3733 */ 3734 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 3735 nvlist_free(label); 3736 spa_load_failed(spa, "version %llu is not supported", 3737 (u_longlong_t)ub->ub_version); 3738 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 3739 } 3740 3741 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3742 nvlist_t *features; 3743 3744 /* 3745 * If we weren't able to find what's necessary for reading the 3746 * MOS in the label, return failure. 3747 */ 3748 if (label == NULL) { 3749 spa_load_failed(spa, "label config unavailable"); 3750 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3751 ENXIO)); 3752 } 3753 3754 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 3755 &features) != 0) { 3756 nvlist_free(label); 3757 spa_load_failed(spa, "invalid label: '%s' missing", 3758 ZPOOL_CONFIG_FEATURES_FOR_READ); 3759 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3760 ENXIO)); 3761 } 3762 3763 /* 3764 * Update our in-core representation with the definitive values 3765 * from the label. 3766 */ 3767 nvlist_free(spa->spa_label_features); 3768 spa->spa_label_features = fnvlist_dup(features); 3769 } 3770 3771 nvlist_free(label); 3772 3773 /* 3774 * Look through entries in the label nvlist's features_for_read. If 3775 * there is a feature listed there which we don't understand then we 3776 * cannot open a pool. 3777 */ 3778 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3779 nvlist_t *unsup_feat; 3780 3781 unsup_feat = fnvlist_alloc(); 3782 3783 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 3784 NULL); nvp != NULL; 3785 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 3786 if (!zfeature_is_supported(nvpair_name(nvp))) { 3787 fnvlist_add_string(unsup_feat, 3788 nvpair_name(nvp), ""); 3789 } 3790 } 3791 3792 if (!nvlist_empty(unsup_feat)) { 3793 fnvlist_add_nvlist(spa->spa_load_info, 3794 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3795 nvlist_free(unsup_feat); 3796 spa_load_failed(spa, "some features are unsupported"); 3797 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3798 ENOTSUP)); 3799 } 3800 3801 nvlist_free(unsup_feat); 3802 } 3803 3804 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 3805 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3806 spa_try_repair(spa, spa->spa_config); 3807 spa_config_exit(spa, SCL_ALL, FTAG); 3808 nvlist_free(spa->spa_config_splitting); 3809 spa->spa_config_splitting = NULL; 3810 } 3811 3812 /* 3813 * Initialize internal SPA structures. 
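 *
 * spa_ld_select_uberblock_done() copies the chosen uberblock into
 * spa_ubsync and derives spa_first_txg and spa_claim_max_txg from it;
 * later stages (dsl_pool_init() in spa_ld_open_rootbp() and the ZIL
 * claim in spa_ld_claim_log_blocks()) rely on those values.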
3814 */ 3815 spa_ld_select_uberblock_done(spa, ub); 3816 3817 return (0); 3818 } 3819 3820 static int 3821 spa_ld_open_rootbp(spa_t *spa) 3822 { 3823 int error = 0; 3824 vdev_t *rvd = spa->spa_root_vdev; 3825 3826 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 3827 if (error != 0) { 3828 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 3829 "[error=%d]", error); 3830 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3831 } 3832 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 3833 3834 return (0); 3835 } 3836 3837 static int 3838 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 3839 boolean_t reloading) 3840 { 3841 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 3842 nvlist_t *nv, *mos_config, *policy; 3843 int error = 0, copy_error; 3844 uint64_t healthy_tvds, healthy_tvds_mos; 3845 uint64_t mos_config_txg; 3846 3847 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 3848 != 0) 3849 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3850 3851 /* 3852 * If we're assembling a pool from a split, the config provided is 3853 * already trusted so there is nothing to do. 3854 */ 3855 if (type == SPA_IMPORT_ASSEMBLE) 3856 return (0); 3857 3858 healthy_tvds = spa_healthy_core_tvds(spa); 3859 3860 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 3861 != 0) { 3862 spa_load_failed(spa, "unable to retrieve MOS config"); 3863 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3864 } 3865 3866 /* 3867 * If we are doing an open, pool owner wasn't verified yet, thus do 3868 * the verification here. 3869 */ 3870 if (spa->spa_load_state == SPA_LOAD_OPEN) { 3871 error = spa_verify_host(spa, mos_config); 3872 if (error != 0) { 3873 nvlist_free(mos_config); 3874 return (error); 3875 } 3876 } 3877 3878 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 3879 3880 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3881 3882 /* 3883 * Build a new vdev tree from the trusted config 3884 */ 3885 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 3886 if (error != 0) { 3887 nvlist_free(mos_config); 3888 spa_config_exit(spa, SCL_ALL, FTAG); 3889 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 3890 error); 3891 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3892 } 3893 3894 /* 3895 * Vdev paths in the MOS may be obsolete. If the untrusted config was 3896 * obtained by scanning /dev/dsk, then it will have the right vdev 3897 * paths. We update the trusted MOS config with this information. 3898 * We first try to copy the paths with vdev_copy_path_strict, which 3899 * succeeds only when both configs have exactly the same vdev tree. 3900 * If that fails, we fall back to a more flexible method that has a 3901 * best effort policy. 3902 */ 3903 copy_error = vdev_copy_path_strict(rvd, mrvd); 3904 if (copy_error != 0 || spa_load_print_vdev_tree) { 3905 spa_load_note(spa, "provided vdev tree:"); 3906 vdev_dbgmsg_print_tree(rvd, 2); 3907 spa_load_note(spa, "MOS vdev tree:"); 3908 vdev_dbgmsg_print_tree(mrvd, 2); 3909 } 3910 if (copy_error != 0) { 3911 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 3912 "back to vdev_copy_path_relaxed"); 3913 vdev_copy_path_relaxed(rvd, mrvd); 3914 } 3915 3916 vdev_close(rvd); 3917 vdev_free(rvd); 3918 spa->spa_root_vdev = mrvd; 3919 rvd = mrvd; 3920 spa_config_exit(spa, SCL_ALL, FTAG); 3921 3922 /* 3923 * We will use spa_config if we decide to reload the spa or if spa_load 3924 * fails and we rewind. 
We must thus regenerate the config using the 3925 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 3926 * pass settings on how to load the pool and is not stored in the MOS. 3927 * We copy it over to our new, trusted config. 3928 */ 3929 mos_config_txg = fnvlist_lookup_uint64(mos_config, 3930 ZPOOL_CONFIG_POOL_TXG); 3931 nvlist_free(mos_config); 3932 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 3933 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 3934 &policy) == 0) 3935 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 3936 spa_config_set(spa, mos_config); 3937 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 3938 3939 /* 3940 * Now that we got the config from the MOS, we should be more strict 3941 * in checking blkptrs and can make assumptions about the consistency 3942 * of the vdev tree. spa_trust_config must be set to true before opening 3943 * vdevs in order for them to be writeable. 3944 */ 3945 spa->spa_trust_config = B_TRUE; 3946 3947 /* 3948 * Open and validate the new vdev tree 3949 */ 3950 error = spa_ld_open_vdevs(spa); 3951 if (error != 0) 3952 return (error); 3953 3954 error = spa_ld_validate_vdevs(spa); 3955 if (error != 0) 3956 return (error); 3957 3958 if (copy_error != 0 || spa_load_print_vdev_tree) { 3959 spa_load_note(spa, "final vdev tree:"); 3960 vdev_dbgmsg_print_tree(rvd, 2); 3961 } 3962 3963 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 3964 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 3965 /* 3966 * Sanity check to make sure that we are indeed loading the 3967 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 3968 * in the config provided and they happened to be the only ones 3969 * to have the latest uberblock, we could involuntarily perform 3970 * an extreme rewind. 3971 */ 3972 healthy_tvds_mos = spa_healthy_core_tvds(spa); 3973 if (healthy_tvds_mos - healthy_tvds >= 3974 SPA_SYNC_MIN_VDEVS) { 3975 spa_load_note(spa, "config provided misses too many " 3976 "top-level vdevs compared to MOS (%lld vs %lld). ", 3977 (u_longlong_t)healthy_tvds, 3978 (u_longlong_t)healthy_tvds_mos); 3979 spa_load_note(spa, "vdev tree:"); 3980 vdev_dbgmsg_print_tree(rvd, 2); 3981 if (reloading) { 3982 spa_load_failed(spa, "config was already " 3983 "provided from MOS. Aborting."); 3984 return (spa_vdev_err(rvd, 3985 VDEV_AUX_CORRUPT_DATA, EIO)); 3986 } 3987 spa_load_note(spa, "spa must be reloaded using MOS " 3988 "config"); 3989 return (SET_ERROR(EAGAIN)); 3990 } 3991 } 3992 3993 error = spa_check_for_missing_logs(spa); 3994 if (error != 0) 3995 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 3996 3997 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 3998 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 3999 "guid sum (%llu != %llu)", 4000 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4001 (u_longlong_t)rvd->vdev_guid_sum); 4002 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4003 ENXIO)); 4004 } 4005 4006 return (0); 4007 } 4008 4009 static int 4010 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4011 { 4012 int error = 0; 4013 vdev_t *rvd = spa->spa_root_vdev; 4014 4015 /* 4016 * Everything that we read before spa_remove_init() must be stored 4017 * on concreted vdevs. Therefore we do this as early as possible. 
4018 */ 4019 error = spa_remove_init(spa); 4020 if (error != 0) { 4021 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4022 error); 4023 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4024 } 4025 4026 /* 4027 * Retrieve information needed to condense indirect vdev mappings. 4028 */ 4029 error = spa_condense_init(spa); 4030 if (error != 0) { 4031 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4032 error); 4033 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4034 } 4035 4036 return (0); 4037 } 4038 4039 static int 4040 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4041 { 4042 int error = 0; 4043 vdev_t *rvd = spa->spa_root_vdev; 4044 4045 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4046 boolean_t missing_feat_read = B_FALSE; 4047 nvlist_t *unsup_feat, *enabled_feat; 4048 4049 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4050 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4051 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4052 } 4053 4054 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4055 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4056 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4057 } 4058 4059 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4060 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4061 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4062 } 4063 4064 enabled_feat = fnvlist_alloc(); 4065 unsup_feat = fnvlist_alloc(); 4066 4067 if (!spa_features_check(spa, B_FALSE, 4068 unsup_feat, enabled_feat)) 4069 missing_feat_read = B_TRUE; 4070 4071 if (spa_writeable(spa) || 4072 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4073 if (!spa_features_check(spa, B_TRUE, 4074 unsup_feat, enabled_feat)) { 4075 *missing_feat_writep = B_TRUE; 4076 } 4077 } 4078 4079 fnvlist_add_nvlist(spa->spa_load_info, 4080 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4081 4082 if (!nvlist_empty(unsup_feat)) { 4083 fnvlist_add_nvlist(spa->spa_load_info, 4084 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4085 } 4086 4087 fnvlist_free(enabled_feat); 4088 fnvlist_free(unsup_feat); 4089 4090 if (!missing_feat_read) { 4091 fnvlist_add_boolean(spa->spa_load_info, 4092 ZPOOL_CONFIG_CAN_RDONLY); 4093 } 4094 4095 /* 4096 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4097 * twofold: to determine whether the pool is available for 4098 * import in read-write mode and (if it is not) whether the 4099 * pool is available for import in read-only mode. If the pool 4100 * is available for import in read-write mode, it is displayed 4101 * as available in userland; if it is not available for import 4102 * in read-only mode, it is displayed as unavailable in 4103 * userland. If the pool is available for import in read-only 4104 * mode but not read-write mode, it is displayed as unavailable 4105 * in userland with a special note that the pool is actually 4106 * available for open in read-only mode. 4107 * 4108 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4109 * missing a feature for write, we must first determine whether 4110 * the pool can be opened read-only before returning to 4111 * userland in order to know whether to display the 4112 * abovementioned note. 4113 */ 4114 if (missing_feat_read || (*missing_feat_writep && 4115 spa_writeable(spa))) { 4116 spa_load_failed(spa, "pool uses unsupported features"); 4117 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4118 ENOTSUP)); 4119 } 4120 4121 /* 4122 * Load refcounts for ZFS features from disk into an in-memory 4123 * cache during SPA initialization. 
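 *
 * A feature with no refcount entry on disk (the lookup below returns
 * ENOTSUP) is cached as SPA_FEATURE_DISABLED; any other lookup error
 * is treated as corrupt pool metadata.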
4124 */ 4125 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4126 uint64_t refcount; 4127 4128 error = feature_get_refcount_from_disk(spa, 4129 &spa_feature_table[i], &refcount); 4130 if (error == 0) { 4131 spa->spa_feat_refcount_cache[i] = refcount; 4132 } else if (error == ENOTSUP) { 4133 spa->spa_feat_refcount_cache[i] = 4134 SPA_FEATURE_DISABLED; 4135 } else { 4136 spa_load_failed(spa, "error getting refcount " 4137 "for feature %s [error=%d]", 4138 spa_feature_table[i].fi_guid, error); 4139 return (spa_vdev_err(rvd, 4140 VDEV_AUX_CORRUPT_DATA, EIO)); 4141 } 4142 } 4143 } 4144 4145 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4146 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4147 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4148 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4149 } 4150 4151 /* 4152 * Encryption was added before bookmark_v2, even though bookmark_v2 4153 * is now a dependency. If this pool has encryption enabled without 4154 * bookmark_v2, trigger an errata message. 4155 */ 4156 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4157 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4158 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4159 } 4160 4161 return (0); 4162 } 4163 4164 static int 4165 spa_ld_load_special_directories(spa_t *spa) 4166 { 4167 int error = 0; 4168 vdev_t *rvd = spa->spa_root_vdev; 4169 4170 spa->spa_is_initializing = B_TRUE; 4171 error = dsl_pool_open(spa->spa_dsl_pool); 4172 spa->spa_is_initializing = B_FALSE; 4173 if (error != 0) { 4174 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4175 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4176 } 4177 4178 return (0); 4179 } 4180 4181 static int 4182 spa_ld_get_props(spa_t *spa) 4183 { 4184 int error = 0; 4185 uint64_t obj; 4186 vdev_t *rvd = spa->spa_root_vdev; 4187 4188 /* Grab the checksum salt from the MOS. */ 4189 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4190 DMU_POOL_CHECKSUM_SALT, 1, 4191 sizeof (spa->spa_cksum_salt.zcs_bytes), 4192 spa->spa_cksum_salt.zcs_bytes); 4193 if (error == ENOENT) { 4194 /* Generate a new salt for subsequent use */ 4195 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4196 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4197 } else if (error != 0) { 4198 spa_load_failed(spa, "unable to retrieve checksum salt from " 4199 "MOS [error=%d]", error); 4200 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4201 } 4202 4203 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4204 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4205 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4206 if (error != 0) { 4207 spa_load_failed(spa, "error opening deferred-frees bpobj " 4208 "[error=%d]", error); 4209 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4210 } 4211 4212 /* 4213 * Load the bit that tells us to use the new accounting function 4214 * (raid-z deflation). If we have an older pool, this will not 4215 * be present. 4216 */ 4217 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4218 if (error != 0 && error != ENOENT) 4219 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4220 4221 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4222 &spa->spa_creation_version, B_FALSE); 4223 if (error != 0 && error != ENOENT) 4224 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4225 4226 /* 4227 * Load the persistent error log. If we have an older pool, this will 4228 * not be present. 
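 *
 * As with the other optional MOS directory entries loaded in this
 * function, ENOENT simply means the object was never created; any
 * other error is treated as corrupt pool metadata.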
4229 */ 4230 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4231 B_FALSE); 4232 if (error != 0 && error != ENOENT) 4233 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4234 4235 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4236 &spa->spa_errlog_scrub, B_FALSE); 4237 if (error != 0 && error != ENOENT) 4238 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4239 4240 /* 4241 * Load the livelist deletion field. If a livelist is queued for 4242 * deletion, indicate that in the spa 4243 */ 4244 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4245 &spa->spa_livelists_to_delete, B_FALSE); 4246 if (error != 0 && error != ENOENT) 4247 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4248 4249 /* 4250 * Load the history object. If we have an older pool, this 4251 * will not be present. 4252 */ 4253 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4254 if (error != 0 && error != ENOENT) 4255 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4256 4257 /* 4258 * Load the per-vdev ZAP map. If we have an older pool, this will not 4259 * be present; in this case, defer its creation to a later time to 4260 * avoid dirtying the MOS this early / out of sync context. See 4261 * spa_sync_config_object. 4262 */ 4263 4264 /* The sentinel is only available in the MOS config. */ 4265 nvlist_t *mos_config; 4266 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4267 spa_load_failed(spa, "unable to retrieve MOS config"); 4268 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4269 } 4270 4271 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4272 &spa->spa_all_vdev_zaps, B_FALSE); 4273 4274 if (error == ENOENT) { 4275 VERIFY(!nvlist_exists(mos_config, 4276 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4277 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4278 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4279 } else if (error != 0) { 4280 nvlist_free(mos_config); 4281 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4282 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4283 /* 4284 * An older version of ZFS overwrote the sentinel value, so 4285 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4286 * destruction to later; see spa_sync_config_object. 4287 */ 4288 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4289 /* 4290 * We're assuming that no vdevs have had their ZAPs created 4291 * before this. Better be sure of it. 
4292 */ 4293 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4294 } 4295 nvlist_free(mos_config); 4296 4297 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4298 4299 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4300 B_FALSE); 4301 if (error && error != ENOENT) 4302 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4303 4304 if (error == 0) { 4305 uint64_t autoreplace = 0; 4306 4307 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4308 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4309 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4310 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4311 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4312 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4313 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4314 spa->spa_autoreplace = (autoreplace != 0); 4315 } 4316 4317 /* 4318 * If we are importing a pool with missing top-level vdevs, 4319 * we enforce that the pool doesn't panic or get suspended on 4320 * error since the likelihood of missing data is extremely high. 4321 */ 4322 if (spa->spa_missing_tvds > 0 && 4323 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4324 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4325 spa_load_note(spa, "forcing failmode to 'continue' " 4326 "as some top level vdevs are missing"); 4327 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4328 } 4329 4330 return (0); 4331 } 4332 4333 static int 4334 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4335 { 4336 int error = 0; 4337 vdev_t *rvd = spa->spa_root_vdev; 4338 4339 /* 4340 * If we're assembling the pool from the split-off vdevs of 4341 * an existing pool, we don't want to attach the spares & cache 4342 * devices. 4343 */ 4344 4345 /* 4346 * Load any hot spares for this pool. 4347 */ 4348 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4349 B_FALSE); 4350 if (error != 0 && error != ENOENT) 4351 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4352 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4353 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4354 if (load_nvlist(spa, spa->spa_spares.sav_object, 4355 &spa->spa_spares.sav_config) != 0) { 4356 spa_load_failed(spa, "error loading spares nvlist"); 4357 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4358 } 4359 4360 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4361 spa_load_spares(spa); 4362 spa_config_exit(spa, SCL_ALL, FTAG); 4363 } else if (error == 0) { 4364 spa->spa_spares.sav_sync = B_TRUE; 4365 } 4366 4367 /* 4368 * Load any level 2 ARC devices for this pool. 
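 *
 * The handling below mirrors the hot spare case above: the nvlist is
 * read from the MOS and the cache vdevs are instantiated under
 * SCL_ALL, except for a split assembly, where the devices are not
 * attached and only sav_sync is set.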
4369 */ 4370 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4371 &spa->spa_l2cache.sav_object, B_FALSE); 4372 if (error != 0 && error != ENOENT) 4373 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4374 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4375 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4376 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4377 &spa->spa_l2cache.sav_config) != 0) { 4378 spa_load_failed(spa, "error loading l2cache nvlist"); 4379 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4380 } 4381 4382 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4383 spa_load_l2cache(spa); 4384 spa_config_exit(spa, SCL_ALL, FTAG); 4385 } else if (error == 0) { 4386 spa->spa_l2cache.sav_sync = B_TRUE; 4387 } 4388 4389 return (0); 4390 } 4391 4392 static int 4393 spa_ld_load_vdev_metadata(spa_t *spa) 4394 { 4395 int error = 0; 4396 vdev_t *rvd = spa->spa_root_vdev; 4397 4398 /* 4399 * If the 'multihost' property is set, then never allow a pool to 4400 * be imported when the system hostid is zero. The exception to 4401 * this rule is zdb which is always allowed to access pools. 4402 */ 4403 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4404 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4405 fnvlist_add_uint64(spa->spa_load_info, 4406 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4407 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4408 } 4409 4410 /* 4411 * If the 'autoreplace' property is set, then post a resource notifying 4412 * the ZFS DE that it should not issue any faults for unopenable 4413 * devices. We also iterate over the vdevs, and post a sysevent for any 4414 * unopenable vdevs so that the normal autoreplace handler can take 4415 * over. 4416 */ 4417 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4418 spa_check_removed(spa->spa_root_vdev); 4419 /* 4420 * For the import case, this is done in spa_import(), because 4421 * at this point we're using the spare definitions from 4422 * the MOS config, not necessarily from the userland config. 4423 */ 4424 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4425 spa_aux_check_removed(&spa->spa_spares); 4426 spa_aux_check_removed(&spa->spa_l2cache); 4427 } 4428 } 4429 4430 /* 4431 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4432 */ 4433 error = vdev_load(rvd); 4434 if (error != 0) { 4435 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4436 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4437 } 4438 4439 error = spa_ld_log_spacemaps(spa); 4440 if (error != 0) { 4441 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4442 error); 4443 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4444 } 4445 4446 /* 4447 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
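 *
 * (A txg of 0 with both completion flags false below means this is a
 * pure re-evaluation; nothing is excised on behalf of a finished
 * scrub or rebuild, the DTLs are simply recomputed from the leaves
 * upward now that vdev_load() has filled them in.)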
4448 */ 4449 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4450 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4451 spa_config_exit(spa, SCL_ALL, FTAG); 4452 4453 return (0); 4454 } 4455 4456 static int 4457 spa_ld_load_dedup_tables(spa_t *spa) 4458 { 4459 int error = 0; 4460 vdev_t *rvd = spa->spa_root_vdev; 4461 4462 error = ddt_load(spa); 4463 if (error != 0) { 4464 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4465 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4466 } 4467 4468 return (0); 4469 } 4470 4471 static int 4472 spa_ld_load_brt(spa_t *spa) 4473 { 4474 int error = 0; 4475 vdev_t *rvd = spa->spa_root_vdev; 4476 4477 error = brt_load(spa); 4478 if (error != 0) { 4479 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4480 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4481 } 4482 4483 return (0); 4484 } 4485 4486 static int 4487 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4488 { 4489 vdev_t *rvd = spa->spa_root_vdev; 4490 4491 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4492 boolean_t missing = spa_check_logs(spa); 4493 if (missing) { 4494 if (spa->spa_missing_tvds != 0) { 4495 spa_load_note(spa, "spa_check_logs failed " 4496 "so dropping the logs"); 4497 } else { 4498 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4499 spa_load_failed(spa, "spa_check_logs failed"); 4500 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4501 ENXIO)); 4502 } 4503 } 4504 } 4505 4506 return (0); 4507 } 4508 4509 static int 4510 spa_ld_verify_pool_data(spa_t *spa) 4511 { 4512 int error = 0; 4513 vdev_t *rvd = spa->spa_root_vdev; 4514 4515 /* 4516 * We've successfully opened the pool, verify that we're ready 4517 * to start pushing transactions. 4518 */ 4519 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4520 error = spa_load_verify(spa); 4521 if (error != 0) { 4522 spa_load_failed(spa, "spa_load_verify failed " 4523 "[error=%d]", error); 4524 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4525 error)); 4526 } 4527 } 4528 4529 return (0); 4530 } 4531 4532 static void 4533 spa_ld_claim_log_blocks(spa_t *spa) 4534 { 4535 dmu_tx_t *tx; 4536 dsl_pool_t *dp = spa_get_dsl(spa); 4537 4538 /* 4539 * Claim log blocks that haven't been committed yet. 4540 * This must all happen in a single txg. 4541 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4542 * invoked from zil_claim_log_block()'s i/o done callback. 4543 * Price of rollback is that we abandon the log. 4544 */ 4545 spa->spa_claiming = B_TRUE; 4546 4547 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4548 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4549 zil_claim, tx, DS_FIND_CHILDREN); 4550 dmu_tx_commit(tx); 4551 4552 spa->spa_claiming = B_FALSE; 4553 4554 spa_set_log_state(spa, SPA_LOG_GOOD); 4555 } 4556 4557 static void 4558 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4559 boolean_t update_config_cache) 4560 { 4561 vdev_t *rvd = spa->spa_root_vdev; 4562 int need_update = B_FALSE; 4563 4564 /* 4565 * If the config cache is stale, or we have uninitialized 4566 * metaslabs (see spa_vdev_add()), then update the config. 4567 * 4568 * If this is a verbatim import, trust the current 4569 * in-core spa_config and update the disk labels. 
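 *
 * Concretely, the checks below force an update when the caller asked
 * for one, when the cached txg is stale, for any import or recovery,
 * for a verbatim import, or when a top-level vdev has no metaslab
 * array yet.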
4570 */ 4571 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4572 spa->spa_load_state == SPA_LOAD_IMPORT || 4573 spa->spa_load_state == SPA_LOAD_RECOVER || 4574 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4575 need_update = B_TRUE; 4576 4577 for (int c = 0; c < rvd->vdev_children; c++) 4578 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4579 need_update = B_TRUE; 4580 4581 /* 4582 * Update the config cache asynchronously in case we're the 4583 * root pool, in which case the config cache isn't writable yet. 4584 */ 4585 if (need_update) 4586 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4587 } 4588 4589 static void 4590 spa_ld_prepare_for_reload(spa_t *spa) 4591 { 4592 spa_mode_t mode = spa->spa_mode; 4593 int async_suspended = spa->spa_async_suspended; 4594 4595 spa_unload(spa); 4596 spa_deactivate(spa); 4597 spa_activate(spa, mode); 4598 4599 /* 4600 * We save the value of spa_async_suspended as it gets reset to 0 by 4601 * spa_unload(). We want to restore it back to the original value before 4602 * returning as we might be calling spa_async_resume() later. 4603 */ 4604 spa->spa_async_suspended = async_suspended; 4605 } 4606 4607 static int 4608 spa_ld_read_checkpoint_txg(spa_t *spa) 4609 { 4610 uberblock_t checkpoint; 4611 int error = 0; 4612 4613 ASSERT0(spa->spa_checkpoint_txg); 4614 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4615 4616 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4617 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4618 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4619 4620 if (error == ENOENT) 4621 return (0); 4622 4623 if (error != 0) 4624 return (error); 4625 4626 ASSERT3U(checkpoint.ub_txg, !=, 0); 4627 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 4628 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 4629 spa->spa_checkpoint_txg = checkpoint.ub_txg; 4630 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 4631 4632 return (0); 4633 } 4634 4635 static int 4636 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 4637 { 4638 int error = 0; 4639 4640 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4641 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4642 4643 /* 4644 * Never trust the config that is provided unless we are assembling 4645 * a pool following a split. 4646 * This means don't trust blkptrs and the vdev tree in general. This 4647 * also effectively puts the spa in read-only mode since 4648 * spa_writeable() checks for spa_trust_config to be true. 4649 * We will later load a trusted config from the MOS. 4650 */ 4651 if (type != SPA_IMPORT_ASSEMBLE) 4652 spa->spa_trust_config = B_FALSE; 4653 4654 /* 4655 * Parse the config provided to create a vdev tree. 4656 */ 4657 error = spa_ld_parse_config(spa, type); 4658 if (error != 0) 4659 return (error); 4660 4661 spa_import_progress_add(spa); 4662 4663 /* 4664 * Now that we have the vdev tree, try to open each vdev. This involves 4665 * opening the underlying physical device, retrieving its geometry and 4666 * probing the vdev with a dummy I/O. The state of each vdev will be set 4667 * based on the success of those operations. After this we'll be ready 4668 * to read from the vdevs. 4669 */ 4670 error = spa_ld_open_vdevs(spa); 4671 if (error != 0) 4672 return (error); 4673 4674 /* 4675 * Read the label of each vdev and make sure that the GUIDs stored 4676 * there match the GUIDs in the config provided. 
4677 * If we're assembling a new pool that's been split off from an 4678 * existing pool, the labels haven't yet been updated so we skip 4679 * validation for now. 4680 */ 4681 if (type != SPA_IMPORT_ASSEMBLE) { 4682 error = spa_ld_validate_vdevs(spa); 4683 if (error != 0) 4684 return (error); 4685 } 4686 4687 /* 4688 * Read all vdev labels to find the best uberblock (i.e. latest, 4689 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 4690 * get the list of features required to read blkptrs in the MOS from 4691 * the vdev label with the best uberblock and verify that our version 4692 * of zfs supports them all. 4693 */ 4694 error = spa_ld_select_uberblock(spa, type); 4695 if (error != 0) 4696 return (error); 4697 4698 /* 4699 * Pass that uberblock to the dsl_pool layer which will open the root 4700 * blkptr. This blkptr points to the latest version of the MOS and will 4701 * allow us to read its contents. 4702 */ 4703 error = spa_ld_open_rootbp(spa); 4704 if (error != 0) 4705 return (error); 4706 4707 return (0); 4708 } 4709 4710 static int 4711 spa_ld_checkpoint_rewind(spa_t *spa) 4712 { 4713 uberblock_t checkpoint; 4714 int error = 0; 4715 4716 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4717 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4718 4719 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4720 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4721 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4722 4723 if (error != 0) { 4724 spa_load_failed(spa, "unable to retrieve checkpointed " 4725 "uberblock from the MOS config [error=%d]", error); 4726 4727 if (error == ENOENT) 4728 error = ZFS_ERR_NO_CHECKPOINT; 4729 4730 return (error); 4731 } 4732 4733 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 4734 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 4735 4736 /* 4737 * We need to update the txg and timestamp of the checkpointed 4738 * uberblock to be higher than the latest one. This ensures that 4739 * the checkpointed uberblock is selected if we were to close and 4740 * reopen the pool right after we've written it in the vdev labels. 4741 * (also see block comment in vdev_uberblock_compare) 4742 */ 4743 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 4744 checkpoint.ub_timestamp = gethrestime_sec(); 4745 4746 /* 4747 * Set current uberblock to be the checkpointed uberblock. 4748 */ 4749 spa->spa_uberblock = checkpoint; 4750 4751 /* 4752 * If we are doing a normal rewind, then the pool is open for 4753 * writing and we sync the "updated" checkpointed uberblock to 4754 * disk. Once this is done, we've basically rewound the whole 4755 * pool and there is no way back. 4756 * 4757 * There are cases when we don't want to attempt and sync the 4758 * checkpointed uberblock to disk because we are opening a 4759 * pool as read-only. Specifically, verifying the checkpointed 4760 * state with zdb, and importing the checkpointed state to get 4761 * a "preview" of its content. 
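 *
 * The spa_writeable() check below captures that distinction:
 * read-only callers keep the bumped uberblock in core only, while a
 * writable import pushes it out to up to SPA_SYNC_MIN_VDEVS concrete,
 * non-log top-level vdevs via vdev_config_sync(), making the rewind
 * permanent.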
4762 */ 4763 if (spa_writeable(spa)) { 4764 vdev_t *rvd = spa->spa_root_vdev; 4765 4766 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4767 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 4768 int svdcount = 0; 4769 int children = rvd->vdev_children; 4770 int c0 = random_in_range(children); 4771 4772 for (int c = 0; c < children; c++) { 4773 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 4774 4775 /* Stop when revisiting the first vdev */ 4776 if (c > 0 && svd[0] == vd) 4777 break; 4778 4779 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 4780 !vdev_is_concrete(vd)) 4781 continue; 4782 4783 svd[svdcount++] = vd; 4784 if (svdcount == SPA_SYNC_MIN_VDEVS) 4785 break; 4786 } 4787 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 4788 if (error == 0) 4789 spa->spa_last_synced_guid = rvd->vdev_guid; 4790 spa_config_exit(spa, SCL_ALL, FTAG); 4791 4792 if (error != 0) { 4793 spa_load_failed(spa, "failed to write checkpointed " 4794 "uberblock to the vdev labels [error=%d]", error); 4795 return (error); 4796 } 4797 } 4798 4799 return (0); 4800 } 4801 4802 static int 4803 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 4804 boolean_t *update_config_cache) 4805 { 4806 int error; 4807 4808 /* 4809 * Parse the config for pool, open and validate vdevs, 4810 * select an uberblock, and use that uberblock to open 4811 * the MOS. 4812 */ 4813 error = spa_ld_mos_init(spa, type); 4814 if (error != 0) 4815 return (error); 4816 4817 /* 4818 * Retrieve the trusted config stored in the MOS and use it to create 4819 * a new, exact version of the vdev tree, then reopen all vdevs. 4820 */ 4821 error = spa_ld_trusted_config(spa, type, B_FALSE); 4822 if (error == EAGAIN) { 4823 if (update_config_cache != NULL) 4824 *update_config_cache = B_TRUE; 4825 4826 /* 4827 * Redo the loading process with the trusted config if it is 4828 * too different from the untrusted config. 4829 */ 4830 spa_ld_prepare_for_reload(spa); 4831 spa_load_note(spa, "RELOADING"); 4832 error = spa_ld_mos_init(spa, type); 4833 if (error != 0) 4834 return (error); 4835 4836 error = spa_ld_trusted_config(spa, type, B_TRUE); 4837 if (error != 0) 4838 return (error); 4839 4840 } else if (error != 0) { 4841 return (error); 4842 } 4843 4844 return (0); 4845 } 4846 4847 /* 4848 * Load an existing storage pool, using the config provided. This config 4849 * describes which vdevs are part of the pool and is later validated against 4850 * partial configs present in each vdev's label and an entire copy of the 4851 * config stored in the MOS. 4852 */ 4853 static int 4854 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 4855 { 4856 int error = 0; 4857 boolean_t missing_feat_write = B_FALSE; 4858 boolean_t checkpoint_rewind = 4859 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4860 boolean_t update_config_cache = B_FALSE; 4861 4862 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4863 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4864 4865 spa_load_note(spa, "LOADING"); 4866 4867 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 4868 if (error != 0) 4869 return (error); 4870 4871 /* 4872 * If we are rewinding to the checkpoint then we need to repeat 4873 * everything we've done so far in this function but this time 4874 * selecting the checkpointed uberblock and using that to open 4875 * the MOS. 4876 */ 4877 if (checkpoint_rewind) { 4878 /* 4879 * If we are rewinding to the checkpoint update config cache 4880 * anyway. 
4881 */ 4882 update_config_cache = B_TRUE; 4883 4884 /* 4885 * Extract the checkpointed uberblock from the current MOS 4886 * and use this as the pool's uberblock from now on. If the 4887 * pool is imported as writeable we also write the checkpoint 4888 * uberblock to the labels, making the rewind permanent. 4889 */ 4890 error = spa_ld_checkpoint_rewind(spa); 4891 if (error != 0) 4892 return (error); 4893 4894 /* 4895 * Redo the loading process again with the 4896 * checkpointed uberblock. 4897 */ 4898 spa_ld_prepare_for_reload(spa); 4899 spa_load_note(spa, "LOADING checkpointed uberblock"); 4900 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 4901 if (error != 0) 4902 return (error); 4903 } 4904 4905 /* 4906 * Retrieve the checkpoint txg if the pool has a checkpoint. 4907 */ 4908 error = spa_ld_read_checkpoint_txg(spa); 4909 if (error != 0) 4910 return (error); 4911 4912 /* 4913 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 4914 * from the pool and their contents were re-mapped to other vdevs. Note 4915 * that everything that we read before this step must have been 4916 * rewritten on concrete vdevs after the last device removal was 4917 * initiated. Otherwise we could be reading from indirect vdevs before 4918 * we have loaded their mappings. 4919 */ 4920 error = spa_ld_open_indirect_vdev_metadata(spa); 4921 if (error != 0) 4922 return (error); 4923 4924 /* 4925 * Retrieve the full list of active features from the MOS and check if 4926 * they are all supported. 4927 */ 4928 error = spa_ld_check_features(spa, &missing_feat_write); 4929 if (error != 0) 4930 return (error); 4931 4932 /* 4933 * Load several special directories from the MOS needed by the dsl_pool 4934 * layer. 4935 */ 4936 error = spa_ld_load_special_directories(spa); 4937 if (error != 0) 4938 return (error); 4939 4940 /* 4941 * Retrieve pool properties from the MOS. 4942 */ 4943 error = spa_ld_get_props(spa); 4944 if (error != 0) 4945 return (error); 4946 4947 /* 4948 * Retrieve the list of auxiliary devices - cache devices and spares - 4949 * and open them. 4950 */ 4951 error = spa_ld_open_aux_vdevs(spa, type); 4952 if (error != 0) 4953 return (error); 4954 4955 /* 4956 * Load the metadata for all vdevs. Also check if unopenable devices 4957 * should be autoreplaced. 4958 */ 4959 error = spa_ld_load_vdev_metadata(spa); 4960 if (error != 0) 4961 return (error); 4962 4963 error = spa_ld_load_dedup_tables(spa); 4964 if (error != 0) 4965 return (error); 4966 4967 error = spa_ld_load_brt(spa); 4968 if (error != 0) 4969 return (error); 4970 4971 /* 4972 * Verify the logs now to make sure we don't have any unexpected errors 4973 * when we claim log blocks later. 4974 */ 4975 error = spa_ld_verify_logs(spa, type, ereport); 4976 if (error != 0) 4977 return (error); 4978 4979 if (missing_feat_write) { 4980 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 4981 4982 /* 4983 * At this point, we know that we can open the pool in 4984 * read-only mode but not read-write mode. We now have enough 4985 * information and can return to userland. 4986 */ 4987 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 4988 ENOTSUP)); 4989 } 4990 4991 /* 4992 * Traverse the last txgs to make sure the pool was left off in a safe 4993 * state. When performing an extreme rewind, we verify the whole pool, 4994 * which can take a very long time. 4995 */ 4996 error = spa_ld_verify_pool_data(spa); 4997 if (error != 0) 4998 return (error); 4999 5000 /* 5001 * Calculate the deflated space for the pool. 
This must be done before 5002 * we write anything to the pool because we'd need to update the space 5003 * accounting using the deflated sizes. 5004 */ 5005 spa_update_dspace(spa); 5006 5007 /* 5008 * We have now retrieved all the information we needed to open the 5009 * pool. If we are importing the pool in read-write mode, a few 5010 * additional steps must be performed to finish the import. 5011 */ 5012 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5013 spa->spa_load_max_txg == UINT64_MAX)) { 5014 uint64_t config_cache_txg = spa->spa_config_txg; 5015 5016 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5017 5018 /* 5019 * In case of a checkpoint rewind, log the original txg 5020 * of the checkpointed uberblock. 5021 */ 5022 if (checkpoint_rewind) { 5023 spa_history_log_internal(spa, "checkpoint rewind", 5024 NULL, "rewound state to txg=%llu", 5025 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5026 } 5027 5028 /* 5029 * Traverse the ZIL and claim all blocks. 5030 */ 5031 spa_ld_claim_log_blocks(spa); 5032 5033 /* 5034 * Kick-off the syncing thread. 5035 */ 5036 spa->spa_sync_on = B_TRUE; 5037 txg_sync_start(spa->spa_dsl_pool); 5038 mmp_thread_start(spa); 5039 5040 /* 5041 * Wait for all claims to sync. We sync up to the highest 5042 * claimed log block birth time so that claimed log blocks 5043 * don't appear to be from the future. spa_claim_max_txg 5044 * will have been set for us by ZIL traversal operations 5045 * performed above. 5046 */ 5047 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5048 5049 /* 5050 * Check if we need to request an update of the config. On the 5051 * next sync, we would update the config stored in vdev labels 5052 * and the cachefile (by default /etc/zfs/zpool.cache). 5053 */ 5054 spa_ld_check_for_config_update(spa, config_cache_txg, 5055 update_config_cache); 5056 5057 /* 5058 * Check if a rebuild was in progress and if so resume it. 5059 * Then check all DTLs to see if anything needs resilvering. 5060 * The resilver will be deferred if a rebuild was started. 5061 */ 5062 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5063 vdev_rebuild_restart(spa); 5064 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5065 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5066 spa_async_request(spa, SPA_ASYNC_RESILVER); 5067 } 5068 5069 /* 5070 * Log the fact that we booted up (so that we can detect if 5071 * we rebooted in the middle of an operation). 5072 */ 5073 spa_history_log_version(spa, "open", NULL); 5074 5075 spa_restart_removal(spa); 5076 spa_spawn_aux_threads(spa); 5077 5078 /* 5079 * Delete any inconsistent datasets. 5080 * 5081 * Note: 5082 * Since we may be issuing deletes for clones here, 5083 * we make sure to do so after we've spawned all the 5084 * auxiliary threads above (from which the livelist 5085 * deletion zthr is part of). 5086 */ 5087 (void) dmu_objset_find(spa_name(spa), 5088 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5089 5090 /* 5091 * Clean up any stale temporary dataset userrefs. 
5092 */ 5093 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5094 5095 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5096 vdev_initialize_restart(spa->spa_root_vdev); 5097 vdev_trim_restart(spa->spa_root_vdev); 5098 vdev_autotrim_restart(spa); 5099 spa_config_exit(spa, SCL_CONFIG, FTAG); 5100 } 5101 5102 spa_import_progress_remove(spa_guid(spa)); 5103 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5104 5105 spa_load_note(spa, "LOADED"); 5106 5107 return (0); 5108 } 5109 5110 static int 5111 spa_load_retry(spa_t *spa, spa_load_state_t state) 5112 { 5113 spa_mode_t mode = spa->spa_mode; 5114 5115 spa_unload(spa); 5116 spa_deactivate(spa); 5117 5118 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5119 5120 spa_activate(spa, mode); 5121 spa_async_suspend(spa); 5122 5123 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5124 (u_longlong_t)spa->spa_load_max_txg); 5125 5126 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5127 } 5128 5129 /* 5130 * If spa_load() fails this function will try loading prior txg's. If 5131 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5132 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5133 * function will not rewind the pool and will return the same error as 5134 * spa_load(). 5135 */ 5136 static int 5137 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5138 int rewind_flags) 5139 { 5140 nvlist_t *loadinfo = NULL; 5141 nvlist_t *config = NULL; 5142 int load_error, rewind_error; 5143 uint64_t safe_rewind_txg; 5144 uint64_t min_txg; 5145 5146 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5147 spa->spa_load_max_txg = spa->spa_load_txg; 5148 spa_set_log_state(spa, SPA_LOG_CLEAR); 5149 } else { 5150 spa->spa_load_max_txg = max_request; 5151 if (max_request != UINT64_MAX) 5152 spa->spa_extreme_rewind = B_TRUE; 5153 } 5154 5155 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5156 if (load_error == 0) 5157 return (0); 5158 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5159 /* 5160 * When attempting checkpoint-rewind on a pool with no 5161 * checkpoint, we should not attempt to load uberblocks 5162 * from previous txgs when spa_load fails. 5163 */ 5164 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5165 spa_import_progress_remove(spa_guid(spa)); 5166 return (load_error); 5167 } 5168 5169 if (spa->spa_root_vdev != NULL) 5170 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5171 5172 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5173 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5174 5175 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5176 nvlist_free(config); 5177 spa_import_progress_remove(spa_guid(spa)); 5178 return (load_error); 5179 } 5180 5181 if (state == SPA_LOAD_RECOVER) { 5182 /* Price of rolling back is discarding txgs, including log */ 5183 spa_set_log_state(spa, SPA_LOG_CLEAR); 5184 } else { 5185 /* 5186 * If we aren't rolling back save the load info from our first 5187 * import attempt so that we can restore it after attempting 5188 * to rewind. 5189 */ 5190 loadinfo = spa->spa_load_info; 5191 spa->spa_load_info = fnvlist_alloc(); 5192 } 5193 5194 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5195 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5196 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
5197 TXG_INITIAL : safe_rewind_txg; 5198 5199 /* 5200 * Continue as long as we're finding errors, we're still within 5201 * the acceptable rewind range, and we're still finding uberblocks 5202 */ 5203 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5204 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5205 if (spa->spa_load_max_txg < safe_rewind_txg) 5206 spa->spa_extreme_rewind = B_TRUE; 5207 rewind_error = spa_load_retry(spa, state); 5208 } 5209 5210 spa->spa_extreme_rewind = B_FALSE; 5211 spa->spa_load_max_txg = UINT64_MAX; 5212 5213 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5214 spa_config_set(spa, config); 5215 else 5216 nvlist_free(config); 5217 5218 if (state == SPA_LOAD_RECOVER) { 5219 ASSERT3P(loadinfo, ==, NULL); 5220 spa_import_progress_remove(spa_guid(spa)); 5221 return (rewind_error); 5222 } else { 5223 /* Store the rewind info as part of the initial load info */ 5224 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5225 spa->spa_load_info); 5226 5227 /* Restore the initial load info */ 5228 fnvlist_free(spa->spa_load_info); 5229 spa->spa_load_info = loadinfo; 5230 5231 spa_import_progress_remove(spa_guid(spa)); 5232 return (load_error); 5233 } 5234 } 5235 5236 /* 5237 * Pool Open/Import 5238 * 5239 * The import case is identical to an open except that the configuration is sent 5240 * down from userland, instead of grabbed from the configuration cache. For the 5241 * case of an open, the pool configuration will exist in the 5242 * POOL_STATE_UNINITIALIZED state. 5243 * 5244 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5245 * the same time open the pool, without having to keep around the spa_t in some 5246 * ambiguous state. 5247 */ 5248 static int 5249 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5250 nvlist_t *nvpolicy, nvlist_t **config) 5251 { 5252 spa_t *spa; 5253 spa_load_state_t state = SPA_LOAD_OPEN; 5254 int error; 5255 int locked = B_FALSE; 5256 int firstopen = B_FALSE; 5257 5258 *spapp = NULL; 5259 5260 /* 5261 * As disgusting as this is, we need to support recursive calls to this 5262 * function because dsl_dir_open() is called during spa_load(), and ends 5263 * up calling spa_open() again. The real fix is to figure out how to 5264 * avoid dsl_dir_open() calling this in the first place. 5265 */ 5266 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5267 mutex_enter(&spa_namespace_lock); 5268 locked = B_TRUE; 5269 } 5270 5271 if ((spa = spa_lookup(pool)) == NULL) { 5272 if (locked) 5273 mutex_exit(&spa_namespace_lock); 5274 return (SET_ERROR(ENOENT)); 5275 } 5276 5277 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5278 zpool_load_policy_t policy; 5279 5280 firstopen = B_TRUE; 5281 5282 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5283 &policy); 5284 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5285 state = SPA_LOAD_RECOVER; 5286 5287 spa_activate(spa, spa_mode_global); 5288 5289 if (state != SPA_LOAD_RECOVER) 5290 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5291 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5292 5293 zfs_dbgmsg("spa_open_common: opening %s", pool); 5294 error = spa_load_best(spa, state, policy.zlp_txg, 5295 policy.zlp_rewind); 5296 5297 if (error == EBADF) { 5298 /* 5299 * If vdev_validate() returns failure (indicated by 5300 * EBADF), it indicates that one of the vdevs indicates 5301 * that the pool has been exported or destroyed. 
If 5302 * this is the case, the config cache is out of sync and 5303 * we should remove the pool from the namespace. 5304 */ 5305 spa_unload(spa); 5306 spa_deactivate(spa); 5307 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5308 spa_remove(spa); 5309 if (locked) 5310 mutex_exit(&spa_namespace_lock); 5311 return (SET_ERROR(ENOENT)); 5312 } 5313 5314 if (error) { 5315 /* 5316 * We can't open the pool, but we still have useful 5317 * information: the state of each vdev after the 5318 * attempted vdev_open(). Return this to the user. 5319 */ 5320 if (config != NULL && spa->spa_config) { 5321 *config = fnvlist_dup(spa->spa_config); 5322 fnvlist_add_nvlist(*config, 5323 ZPOOL_CONFIG_LOAD_INFO, 5324 spa->spa_load_info); 5325 } 5326 spa_unload(spa); 5327 spa_deactivate(spa); 5328 spa->spa_last_open_failed = error; 5329 if (locked) 5330 mutex_exit(&spa_namespace_lock); 5331 *spapp = NULL; 5332 return (error); 5333 } 5334 } 5335 5336 spa_open_ref(spa, tag); 5337 5338 if (config != NULL) 5339 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5340 5341 /* 5342 * If we've recovered the pool, pass back any information we 5343 * gathered while doing the load. 5344 */ 5345 if (state == SPA_LOAD_RECOVER && config != NULL) { 5346 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5347 spa->spa_load_info); 5348 } 5349 5350 if (locked) { 5351 spa->spa_last_open_failed = 0; 5352 spa->spa_last_ubsync_txg = 0; 5353 spa->spa_load_txg = 0; 5354 mutex_exit(&spa_namespace_lock); 5355 } 5356 5357 if (firstopen) 5358 zvol_create_minors_recursive(spa_name(spa)); 5359 5360 *spapp = spa; 5361 5362 return (0); 5363 } 5364 5365 int 5366 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5367 nvlist_t *policy, nvlist_t **config) 5368 { 5369 return (spa_open_common(name, spapp, tag, policy, config)); 5370 } 5371 5372 int 5373 spa_open(const char *name, spa_t **spapp, const void *tag) 5374 { 5375 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5376 } 5377 5378 /* 5379 * Lookup the given spa_t, incrementing the inject count in the process, 5380 * preventing it from being exported or destroyed. 5381 */ 5382 spa_t * 5383 spa_inject_addref(char *name) 5384 { 5385 spa_t *spa; 5386 5387 mutex_enter(&spa_namespace_lock); 5388 if ((spa = spa_lookup(name)) == NULL) { 5389 mutex_exit(&spa_namespace_lock); 5390 return (NULL); 5391 } 5392 spa->spa_inject_ref++; 5393 mutex_exit(&spa_namespace_lock); 5394 5395 return (spa); 5396 } 5397 5398 void 5399 spa_inject_delref(spa_t *spa) 5400 { 5401 mutex_enter(&spa_namespace_lock); 5402 spa->spa_inject_ref--; 5403 mutex_exit(&spa_namespace_lock); 5404 } 5405 5406 /* 5407 * Add spares device information to the nvlist. 
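 *
 * Spares that spa_spare_exists() reports as actively in use
 * (pool guid != 0) are shown as VDEV_STATE_CANT_OPEN with
 * VDEV_AUX_SPARED, so userland can tell an activated spare from an
 * idle one.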
5408 */ 5409 static void 5410 spa_add_spares(spa_t *spa, nvlist_t *config) 5411 { 5412 nvlist_t **spares; 5413 uint_t i, nspares; 5414 nvlist_t *nvroot; 5415 uint64_t guid; 5416 vdev_stat_t *vs; 5417 uint_t vsc; 5418 uint64_t pool; 5419 5420 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5421 5422 if (spa->spa_spares.sav_count == 0) 5423 return; 5424 5425 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5426 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5427 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5428 if (nspares != 0) { 5429 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5430 (const nvlist_t * const *)spares, nspares); 5431 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5432 &spares, &nspares)); 5433 5434 /* 5435 * Go through and find any spares which have since been 5436 * repurposed as an active spare. If this is the case, update 5437 * their status appropriately. 5438 */ 5439 for (i = 0; i < nspares; i++) { 5440 guid = fnvlist_lookup_uint64(spares[i], 5441 ZPOOL_CONFIG_GUID); 5442 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5443 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5444 if (spa_spare_exists(guid, &pool, NULL) && 5445 pool != 0ULL) { 5446 vs->vs_state = VDEV_STATE_CANT_OPEN; 5447 vs->vs_aux = VDEV_AUX_SPARED; 5448 } else { 5449 vs->vs_state = 5450 spa->spa_spares.sav_vdevs[i]->vdev_state; 5451 } 5452 } 5453 } 5454 } 5455 5456 /* 5457 * Add l2cache device information to the nvlist, including vdev stats. 5458 */ 5459 static void 5460 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5461 { 5462 nvlist_t **l2cache; 5463 uint_t i, j, nl2cache; 5464 nvlist_t *nvroot; 5465 uint64_t guid; 5466 vdev_t *vd; 5467 vdev_stat_t *vs; 5468 uint_t vsc; 5469 5470 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5471 5472 if (spa->spa_l2cache.sav_count == 0) 5473 return; 5474 5475 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5476 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5477 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5478 if (nl2cache != 0) { 5479 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5480 (const nvlist_t * const *)l2cache, nl2cache); 5481 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5482 &l2cache, &nl2cache)); 5483 5484 /* 5485 * Update level 2 cache device stats. 
5486 */ 5487 5488 for (i = 0; i < nl2cache; i++) { 5489 guid = fnvlist_lookup_uint64(l2cache[i], 5490 ZPOOL_CONFIG_GUID); 5491 5492 vd = NULL; 5493 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5494 if (guid == 5495 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5496 vd = spa->spa_l2cache.sav_vdevs[j]; 5497 break; 5498 } 5499 } 5500 ASSERT(vd != NULL); 5501 5502 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5503 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5504 vdev_get_stats(vd, vs); 5505 vdev_config_generate_stats(vd, l2cache[i]); 5506 5507 } 5508 } 5509 } 5510 5511 static void 5512 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5513 { 5514 zap_cursor_t zc; 5515 zap_attribute_t za; 5516 5517 if (spa->spa_feat_for_read_obj != 0) { 5518 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5519 spa->spa_feat_for_read_obj); 5520 zap_cursor_retrieve(&zc, &za) == 0; 5521 zap_cursor_advance(&zc)) { 5522 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5523 za.za_num_integers == 1); 5524 VERIFY0(nvlist_add_uint64(features, za.za_name, 5525 za.za_first_integer)); 5526 } 5527 zap_cursor_fini(&zc); 5528 } 5529 5530 if (spa->spa_feat_for_write_obj != 0) { 5531 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5532 spa->spa_feat_for_write_obj); 5533 zap_cursor_retrieve(&zc, &za) == 0; 5534 zap_cursor_advance(&zc)) { 5535 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5536 za.za_num_integers == 1); 5537 VERIFY0(nvlist_add_uint64(features, za.za_name, 5538 za.za_first_integer)); 5539 } 5540 zap_cursor_fini(&zc); 5541 } 5542 } 5543 5544 static void 5545 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5546 { 5547 int i; 5548 5549 for (i = 0; i < SPA_FEATURES; i++) { 5550 zfeature_info_t feature = spa_feature_table[i]; 5551 uint64_t refcount; 5552 5553 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5554 continue; 5555 5556 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5557 } 5558 } 5559 5560 /* 5561 * Store a list of pool features and their reference counts in the 5562 * config. 5563 * 5564 * The first time this is called on a spa, allocate a new nvlist, fetch 5565 * the pool features and reference counts from disk, then save the list 5566 * in the spa. In subsequent calls on the same spa use the saved nvlist 5567 * and refresh its values from the cached reference counts. This 5568 * ensures we don't block here on I/O on a suspended pool so 'zpool 5569 * clear' can resume the pool. 
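 *
 * That is, spa_feature_stats_from_disk() walks the MOS feature ZAPs
 * only on that first call; spa_feature_stats_from_cache() refreshes
 * the values from the in-core refcount cache populated at load time
 * (see spa_ld_check_features()), so no I/O is required.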
5570 */ 5571 static void 5572 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5573 { 5574 nvlist_t *features; 5575 5576 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5577 5578 mutex_enter(&spa->spa_feat_stats_lock); 5579 features = spa->spa_feat_stats; 5580 5581 if (features != NULL) { 5582 spa_feature_stats_from_cache(spa, features); 5583 } else { 5584 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 5585 spa->spa_feat_stats = features; 5586 spa_feature_stats_from_disk(spa, features); 5587 } 5588 5589 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 5590 features)); 5591 5592 mutex_exit(&spa->spa_feat_stats_lock); 5593 } 5594 5595 int 5596 spa_get_stats(const char *name, nvlist_t **config, 5597 char *altroot, size_t buflen) 5598 { 5599 int error; 5600 spa_t *spa; 5601 5602 *config = NULL; 5603 error = spa_open_common(name, &spa, FTAG, NULL, config); 5604 5605 if (spa != NULL) { 5606 /* 5607 * This still leaves a window of inconsistency where the spares 5608 * or l2cache devices could change and the config would be 5609 * self-inconsistent. 5610 */ 5611 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5612 5613 if (*config != NULL) { 5614 uint64_t loadtimes[2]; 5615 5616 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 5617 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 5618 fnvlist_add_uint64_array(*config, 5619 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 5620 5621 fnvlist_add_uint64(*config, 5622 ZPOOL_CONFIG_ERRCOUNT, 5623 spa_approx_errlog_size(spa)); 5624 5625 if (spa_suspended(spa)) { 5626 fnvlist_add_uint64(*config, 5627 ZPOOL_CONFIG_SUSPENDED, 5628 spa->spa_failmode); 5629 fnvlist_add_uint64(*config, 5630 ZPOOL_CONFIG_SUSPENDED_REASON, 5631 spa->spa_suspended); 5632 } 5633 5634 spa_add_spares(spa, *config); 5635 spa_add_l2cache(spa, *config); 5636 spa_add_feature_stats(spa, *config); 5637 } 5638 } 5639 5640 /* 5641 * We want to get the alternate root even for faulted pools, so we cheat 5642 * and call spa_lookup() directly. 5643 */ 5644 if (altroot) { 5645 if (spa == NULL) { 5646 mutex_enter(&spa_namespace_lock); 5647 spa = spa_lookup(name); 5648 if (spa) 5649 spa_altroot(spa, altroot, buflen); 5650 else 5651 altroot[0] = '\0'; 5652 spa = NULL; 5653 mutex_exit(&spa_namespace_lock); 5654 } else { 5655 spa_altroot(spa, altroot, buflen); 5656 } 5657 } 5658 5659 if (spa != NULL) { 5660 spa_config_exit(spa, SCL_CONFIG, FTAG); 5661 spa_close(spa, FTAG); 5662 } 5663 5664 return (error); 5665 } 5666 5667 /* 5668 * Validate that the auxiliary device array is well formed. We must have an 5669 * array of nvlists, each which describes a valid leaf vdev. If this is an 5670 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 5671 * specified, as long as they are well-formed. 5672 */ 5673 static int 5674 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 5675 spa_aux_vdev_t *sav, const char *config, uint64_t version, 5676 vdev_labeltype_t label) 5677 { 5678 nvlist_t **dev; 5679 uint_t i, ndev; 5680 vdev_t *vd; 5681 int error; 5682 5683 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5684 5685 /* 5686 * It's acceptable to have no devs specified. 5687 */ 5688 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 5689 return (0); 5690 5691 if (ndev == 0) 5692 return (SET_ERROR(EINVAL)); 5693 5694 /* 5695 * Make sure the pool is formatted with a version that supports this 5696 * device type. 
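 * For hot spares this means at least SPA_VERSION_SPARES, and for L2ARC
 * devices at least SPA_VERSION_L2CACHE, as passed in by spa_validate_aux()
 * below; pools below the required version fail with ENOTSUP.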
5697 */ 5698 if (spa_version(spa) < version) 5699 return (SET_ERROR(ENOTSUP)); 5700 5701 /* 5702 * Set the pending device list so we correctly handle device in-use 5703 * checking. 5704 */ 5705 sav->sav_pending = dev; 5706 sav->sav_npending = ndev; 5707 5708 for (i = 0; i < ndev; i++) { 5709 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 5710 mode)) != 0) 5711 goto out; 5712 5713 if (!vd->vdev_ops->vdev_op_leaf) { 5714 vdev_free(vd); 5715 error = SET_ERROR(EINVAL); 5716 goto out; 5717 } 5718 5719 vd->vdev_top = vd; 5720 5721 if ((error = vdev_open(vd)) == 0 && 5722 (error = vdev_label_init(vd, crtxg, label)) == 0) { 5723 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 5724 vd->vdev_guid); 5725 } 5726 5727 vdev_free(vd); 5728 5729 if (error && 5730 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 5731 goto out; 5732 else 5733 error = 0; 5734 } 5735 5736 out: 5737 sav->sav_pending = NULL; 5738 sav->sav_npending = 0; 5739 return (error); 5740 } 5741 5742 static int 5743 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 5744 { 5745 int error; 5746 5747 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5748 5749 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5750 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 5751 VDEV_LABEL_SPARE)) != 0) { 5752 return (error); 5753 } 5754 5755 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5756 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 5757 VDEV_LABEL_L2CACHE)); 5758 } 5759 5760 static void 5761 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 5762 const char *config) 5763 { 5764 int i; 5765 5766 if (sav->sav_config != NULL) { 5767 nvlist_t **olddevs; 5768 uint_t oldndevs; 5769 nvlist_t **newdevs; 5770 5771 /* 5772 * Generate new dev list by concatenating with the 5773 * current dev list. 5774 */ 5775 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 5776 &olddevs, &oldndevs)); 5777 5778 newdevs = kmem_alloc(sizeof (void *) * 5779 (ndevs + oldndevs), KM_SLEEP); 5780 for (i = 0; i < oldndevs; i++) 5781 newdevs[i] = fnvlist_dup(olddevs[i]); 5782 for (i = 0; i < ndevs; i++) 5783 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 5784 5785 fnvlist_remove(sav->sav_config, config); 5786 5787 fnvlist_add_nvlist_array(sav->sav_config, config, 5788 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 5789 for (i = 0; i < oldndevs + ndevs; i++) 5790 nvlist_free(newdevs[i]); 5791 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 5792 } else { 5793 /* 5794 * Generate a new dev list. 5795 */ 5796 sav->sav_config = fnvlist_alloc(); 5797 fnvlist_add_nvlist_array(sav->sav_config, config, 5798 (const nvlist_t * const *)devs, ndevs); 5799 } 5800 } 5801 5802 /* 5803 * Stop and drop level 2 ARC devices 5804 */ 5805 void 5806 spa_l2cache_drop(spa_t *spa) 5807 { 5808 vdev_t *vd; 5809 int i; 5810 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5811 5812 for (i = 0; i < sav->sav_count; i++) { 5813 uint64_t pool; 5814 5815 vd = sav->sav_vdevs[i]; 5816 ASSERT(vd != NULL); 5817 5818 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 5819 pool != 0ULL && l2arc_vdev_present(vd)) 5820 l2arc_remove_vdev(vd); 5821 } 5822 } 5823 5824 /* 5825 * Verify encryption parameters for spa creation. If we are encrypting, we must 5826 * have the encryption feature flag enabled. 
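 * (Illustrative case: a create request whose cp_crypt is neither
 * ZIO_CRYPT_OFF nor ZIO_CRYPT_INHERIT, while the encryption feature was not
 * among the requested feature@ properties, is expected to fail here with
 * ENOTSUP.)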
5827 */ 5828 static int 5829 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 5830 boolean_t has_encryption) 5831 { 5832 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 5833 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 5834 !has_encryption) 5835 return (SET_ERROR(ENOTSUP)); 5836 5837 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 5838 } 5839 5840 /* 5841 * Pool Creation 5842 */ 5843 int 5844 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 5845 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 5846 { 5847 spa_t *spa; 5848 const char *altroot = NULL; 5849 vdev_t *rvd; 5850 dsl_pool_t *dp; 5851 dmu_tx_t *tx; 5852 int error = 0; 5853 uint64_t txg = TXG_INITIAL; 5854 nvlist_t **spares, **l2cache; 5855 uint_t nspares, nl2cache; 5856 uint64_t version, obj, ndraid = 0; 5857 boolean_t has_features; 5858 boolean_t has_encryption; 5859 boolean_t has_allocclass; 5860 spa_feature_t feat; 5861 const char *feat_name; 5862 const char *poolname; 5863 nvlist_t *nvl; 5864 5865 if (props == NULL || 5866 nvlist_lookup_string(props, "tname", &poolname) != 0) 5867 poolname = (char *)pool; 5868 5869 /* 5870 * If this pool already exists, return failure. 5871 */ 5872 mutex_enter(&spa_namespace_lock); 5873 if (spa_lookup(poolname) != NULL) { 5874 mutex_exit(&spa_namespace_lock); 5875 return (SET_ERROR(EEXIST)); 5876 } 5877 5878 /* 5879 * Allocate a new spa_t structure. 5880 */ 5881 nvl = fnvlist_alloc(); 5882 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 5883 (void) nvlist_lookup_string(props, 5884 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5885 spa = spa_add(poolname, nvl, altroot); 5886 fnvlist_free(nvl); 5887 spa_activate(spa, spa_mode_global); 5888 5889 if (props && (error = spa_prop_validate(spa, props))) { 5890 spa_deactivate(spa); 5891 spa_remove(spa); 5892 mutex_exit(&spa_namespace_lock); 5893 return (error); 5894 } 5895 5896 /* 5897 * Temporary pool names should never be written to disk. 
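 * The temporary name, if any, was supplied through the "tname" entry looked
 * up from props above; the ZFS_IMPORT_TEMP_NAME flag set below records that
 * the in-core name differs from the on-disk pool name.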
5898 */ 5899 if (poolname != pool) 5900 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 5901 5902 has_features = B_FALSE; 5903 has_encryption = B_FALSE; 5904 has_allocclass = B_FALSE; 5905 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 5906 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 5907 if (zpool_prop_feature(nvpair_name(elem))) { 5908 has_features = B_TRUE; 5909 5910 feat_name = strchr(nvpair_name(elem), '@') + 1; 5911 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 5912 if (feat == SPA_FEATURE_ENCRYPTION) 5913 has_encryption = B_TRUE; 5914 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 5915 has_allocclass = B_TRUE; 5916 } 5917 } 5918 5919 /* verify encryption params, if they were provided */ 5920 if (dcp != NULL) { 5921 error = spa_create_check_encryption_params(dcp, has_encryption); 5922 if (error != 0) { 5923 spa_deactivate(spa); 5924 spa_remove(spa); 5925 mutex_exit(&spa_namespace_lock); 5926 return (error); 5927 } 5928 } 5929 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 5930 spa_deactivate(spa); 5931 spa_remove(spa); 5932 mutex_exit(&spa_namespace_lock); 5933 return (ENOTSUP); 5934 } 5935 5936 if (has_features || nvlist_lookup_uint64(props, 5937 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 5938 version = SPA_VERSION; 5939 } 5940 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 5941 5942 spa->spa_first_txg = txg; 5943 spa->spa_uberblock.ub_txg = txg - 1; 5944 spa->spa_uberblock.ub_version = version; 5945 spa->spa_ubsync = spa->spa_uberblock; 5946 spa->spa_load_state = SPA_LOAD_CREATE; 5947 spa->spa_removing_phys.sr_state = DSS_NONE; 5948 spa->spa_removing_phys.sr_removing_vdev = -1; 5949 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 5950 spa->spa_indirect_vdevs_loaded = B_TRUE; 5951 5952 /* 5953 * Create "The Godfather" zio to hold all async IOs 5954 */ 5955 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 5956 KM_SLEEP); 5957 for (int i = 0; i < max_ncpus; i++) { 5958 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 5959 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 5960 ZIO_FLAG_GODFATHER); 5961 } 5962 5963 /* 5964 * Create the root vdev. 5965 */ 5966 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5967 5968 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 5969 5970 ASSERT(error != 0 || rvd != NULL); 5971 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 5972 5973 if (error == 0 && !zfs_allocatable_devs(nvroot)) 5974 error = SET_ERROR(EINVAL); 5975 5976 if (error == 0 && 5977 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 5978 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 5979 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 5980 /* 5981 * instantiate the metaslab groups (this will dirty the vdevs) 5982 * we can no longer error exit past this point 5983 */ 5984 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 5985 vdev_t *vd = rvd->vdev_child[c]; 5986 5987 vdev_metaslab_set_size(vd); 5988 vdev_expand(vd, txg); 5989 } 5990 } 5991 5992 spa_config_exit(spa, SCL_ALL, FTAG); 5993 5994 if (error != 0) { 5995 spa_unload(spa); 5996 spa_deactivate(spa); 5997 spa_remove(spa); 5998 mutex_exit(&spa_namespace_lock); 5999 return (error); 6000 } 6001 6002 /* 6003 * Get the list of spares, if specified. 
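 * A rough, abbreviated sketch of the nvroot layout consumed here (purely
 * illustrative, not a complete schema):
 *
 *	nvroot
 *	    "children" = [ <top-level vdev nvlists> ]
 *	    "spares"   = [ { "type"="disk", "path"=..., ... }, ... ]
 *	    "l2cache"  = [ { "type"="disk", "path"=..., ... }, ... ]
 *
 * The block below consumes only the "spares" array; "l2cache" is picked up
 * by the next block.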
6004 */ 6005 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6006 &spares, &nspares) == 0) { 6007 spa->spa_spares.sav_config = fnvlist_alloc(); 6008 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6009 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6010 nspares); 6011 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6012 spa_load_spares(spa); 6013 spa_config_exit(spa, SCL_ALL, FTAG); 6014 spa->spa_spares.sav_sync = B_TRUE; 6015 } 6016 6017 /* 6018 * Get the list of level 2 cache devices, if specified. 6019 */ 6020 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6021 &l2cache, &nl2cache) == 0) { 6022 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6023 NV_UNIQUE_NAME, KM_SLEEP)); 6024 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6025 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6026 nl2cache); 6027 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6028 spa_load_l2cache(spa); 6029 spa_config_exit(spa, SCL_ALL, FTAG); 6030 spa->spa_l2cache.sav_sync = B_TRUE; 6031 } 6032 6033 spa->spa_is_initializing = B_TRUE; 6034 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6035 spa->spa_is_initializing = B_FALSE; 6036 6037 /* 6038 * Create DDTs (dedup tables). 6039 */ 6040 ddt_create(spa); 6041 /* 6042 * Create BRT table and BRT table object. 6043 */ 6044 brt_create(spa); 6045 6046 spa_update_dspace(spa); 6047 6048 tx = dmu_tx_create_assigned(dp, txg); 6049 6050 /* 6051 * Create the pool's history object. 6052 */ 6053 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6054 spa_history_create_obj(spa, tx); 6055 6056 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6057 spa_history_log_version(spa, "create", tx); 6058 6059 /* 6060 * Create the pool config object. 6061 */ 6062 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6063 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6064 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6065 6066 if (zap_add(spa->spa_meta_objset, 6067 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6068 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6069 cmn_err(CE_PANIC, "failed to add pool config"); 6070 } 6071 6072 if (zap_add(spa->spa_meta_objset, 6073 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6074 sizeof (uint64_t), 1, &version, tx) != 0) { 6075 cmn_err(CE_PANIC, "failed to add pool version"); 6076 } 6077 6078 /* Newly created pools with the right version are always deflated. */ 6079 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6080 spa->spa_deflate = TRUE; 6081 if (zap_add(spa->spa_meta_objset, 6082 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6083 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6084 cmn_err(CE_PANIC, "failed to add deflate"); 6085 } 6086 } 6087 6088 /* 6089 * Create the deferred-free bpobj. Turn off compression 6090 * because sync-to-convergence takes longer if the blocksize 6091 * keeps changing. 6092 */ 6093 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6094 dmu_object_set_compress(spa->spa_meta_objset, obj, 6095 ZIO_COMPRESS_OFF, tx); 6096 if (zap_add(spa->spa_meta_objset, 6097 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6098 sizeof (uint64_t), 1, &obj, tx) != 0) { 6099 cmn_err(CE_PANIC, "failed to add bpobj"); 6100 } 6101 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6102 spa->spa_meta_objset, obj)); 6103 6104 /* 6105 * Generate some random noise for salted checksums to operate on. 
6106 */ 6107 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6108 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6109 6110 /* 6111 * Set pool properties. 6112 */ 6113 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6114 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6115 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6116 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6117 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6118 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6119 6120 if (props != NULL) { 6121 spa_configfile_set(spa, props, B_FALSE); 6122 spa_sync_props(props, tx); 6123 } 6124 6125 for (int i = 0; i < ndraid; i++) 6126 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6127 6128 dmu_tx_commit(tx); 6129 6130 spa->spa_sync_on = B_TRUE; 6131 txg_sync_start(dp); 6132 mmp_thread_start(spa); 6133 txg_wait_synced(dp, txg); 6134 6135 spa_spawn_aux_threads(spa); 6136 6137 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6138 6139 /* 6140 * Don't count references from objsets that are already closed 6141 * and are making their way through the eviction process. 6142 */ 6143 spa_evicting_os_wait(spa); 6144 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6145 spa->spa_load_state = SPA_LOAD_NONE; 6146 6147 spa_import_os(spa); 6148 6149 mutex_exit(&spa_namespace_lock); 6150 6151 return (0); 6152 } 6153 6154 /* 6155 * Import a non-root pool into the system. 6156 */ 6157 int 6158 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6159 { 6160 spa_t *spa; 6161 const char *altroot = NULL; 6162 spa_load_state_t state = SPA_LOAD_IMPORT; 6163 zpool_load_policy_t policy; 6164 spa_mode_t mode = spa_mode_global; 6165 uint64_t readonly = B_FALSE; 6166 int error; 6167 nvlist_t *nvroot; 6168 nvlist_t **spares, **l2cache; 6169 uint_t nspares, nl2cache; 6170 6171 /* 6172 * If a pool with this name exists, return failure. 6173 */ 6174 mutex_enter(&spa_namespace_lock); 6175 if (spa_lookup(pool) != NULL) { 6176 mutex_exit(&spa_namespace_lock); 6177 return (SET_ERROR(EEXIST)); 6178 } 6179 6180 /* 6181 * Create and initialize the spa structure. 6182 */ 6183 (void) nvlist_lookup_string(props, 6184 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6185 (void) nvlist_lookup_uint64(props, 6186 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6187 if (readonly) 6188 mode = SPA_MODE_READ; 6189 spa = spa_add(pool, config, altroot); 6190 spa->spa_import_flags = flags; 6191 6192 /* 6193 * Verbatim import - Take a pool and insert it into the namespace 6194 * as if it had been loaded at boot. 6195 */ 6196 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6197 if (props != NULL) 6198 spa_configfile_set(spa, props, B_FALSE); 6199 6200 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6201 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6202 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6203 mutex_exit(&spa_namespace_lock); 6204 return (0); 6205 } 6206 6207 spa_activate(spa, mode); 6208 6209 /* 6210 * Don't start async tasks until we know everything is healthy. 
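 * The matching spa_async_resume() call is made further below, once
 * spa_load_best() has succeeded and any user-supplied properties have been
 * applied.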
6211 */ 6212 spa_async_suspend(spa); 6213 6214 zpool_get_load_policy(config, &policy); 6215 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6216 state = SPA_LOAD_RECOVER; 6217 6218 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6219 6220 if (state != SPA_LOAD_RECOVER) { 6221 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6222 zfs_dbgmsg("spa_import: importing %s", pool); 6223 } else { 6224 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6225 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6226 } 6227 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6228 6229 /* 6230 * Propagate anything learned while loading the pool and pass it 6231 * back to caller (i.e. rewind info, missing devices, etc). 6232 */ 6233 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6234 6235 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6236 /* 6237 * Toss any existing sparelist, as it doesn't have any validity 6238 * anymore, and conflicts with spa_has_spare(). 6239 */ 6240 if (spa->spa_spares.sav_config) { 6241 nvlist_free(spa->spa_spares.sav_config); 6242 spa->spa_spares.sav_config = NULL; 6243 spa_load_spares(spa); 6244 } 6245 if (spa->spa_l2cache.sav_config) { 6246 nvlist_free(spa->spa_l2cache.sav_config); 6247 spa->spa_l2cache.sav_config = NULL; 6248 spa_load_l2cache(spa); 6249 } 6250 6251 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6252 spa_config_exit(spa, SCL_ALL, FTAG); 6253 6254 if (props != NULL) 6255 spa_configfile_set(spa, props, B_FALSE); 6256 6257 if (error != 0 || (props && spa_writeable(spa) && 6258 (error = spa_prop_set(spa, props)))) { 6259 spa_unload(spa); 6260 spa_deactivate(spa); 6261 spa_remove(spa); 6262 mutex_exit(&spa_namespace_lock); 6263 return (error); 6264 } 6265 6266 spa_async_resume(spa); 6267 6268 /* 6269 * Override any spares and level 2 cache devices as specified by 6270 * the user, as these may have correct device names/devids, etc. 6271 */ 6272 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6273 &spares, &nspares) == 0) { 6274 if (spa->spa_spares.sav_config) 6275 fnvlist_remove(spa->spa_spares.sav_config, 6276 ZPOOL_CONFIG_SPARES); 6277 else 6278 spa->spa_spares.sav_config = fnvlist_alloc(); 6279 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6280 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6281 nspares); 6282 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6283 spa_load_spares(spa); 6284 spa_config_exit(spa, SCL_ALL, FTAG); 6285 spa->spa_spares.sav_sync = B_TRUE; 6286 } 6287 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6288 &l2cache, &nl2cache) == 0) { 6289 if (spa->spa_l2cache.sav_config) 6290 fnvlist_remove(spa->spa_l2cache.sav_config, 6291 ZPOOL_CONFIG_L2CACHE); 6292 else 6293 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6294 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6295 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6296 nl2cache); 6297 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6298 spa_load_l2cache(spa); 6299 spa_config_exit(spa, SCL_ALL, FTAG); 6300 spa->spa_l2cache.sav_sync = B_TRUE; 6301 } 6302 6303 /* 6304 * Check for any removed devices. 6305 */ 6306 if (spa->spa_autoreplace) { 6307 spa_aux_check_removed(&spa->spa_spares); 6308 spa_aux_check_removed(&spa->spa_l2cache); 6309 } 6310 6311 if (spa_writeable(spa)) { 6312 /* 6313 * Update the config cache to include the newly-imported pool. 
6314 */ 6315 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6316 } 6317 6318 /* 6319 * It's possible that the pool was expanded while it was exported. 6320 * We kick off an async task to handle this for us. 6321 */ 6322 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6323 6324 spa_history_log_version(spa, "import", NULL); 6325 6326 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6327 6328 mutex_exit(&spa_namespace_lock); 6329 6330 zvol_create_minors_recursive(pool); 6331 6332 spa_import_os(spa); 6333 6334 return (0); 6335 } 6336 6337 nvlist_t * 6338 spa_tryimport(nvlist_t *tryconfig) 6339 { 6340 nvlist_t *config = NULL; 6341 const char *poolname, *cachefile; 6342 spa_t *spa; 6343 uint64_t state; 6344 int error; 6345 zpool_load_policy_t policy; 6346 6347 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6348 return (NULL); 6349 6350 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6351 return (NULL); 6352 6353 /* 6354 * Create and initialize the spa structure. 6355 */ 6356 mutex_enter(&spa_namespace_lock); 6357 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 6358 spa_activate(spa, SPA_MODE_READ); 6359 6360 /* 6361 * Rewind pool if a max txg was provided. 6362 */ 6363 zpool_get_load_policy(spa->spa_config, &policy); 6364 if (policy.zlp_txg != UINT64_MAX) { 6365 spa->spa_load_max_txg = policy.zlp_txg; 6366 spa->spa_extreme_rewind = B_TRUE; 6367 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6368 poolname, (longlong_t)policy.zlp_txg); 6369 } else { 6370 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6371 } 6372 6373 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6374 == 0) { 6375 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6376 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6377 } else { 6378 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6379 } 6380 6381 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6382 6383 /* 6384 * If 'tryconfig' was at least parsable, return the current config. 6385 */ 6386 if (spa->spa_root_vdev != NULL) { 6387 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6388 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6389 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6390 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6391 spa->spa_uberblock.ub_timestamp); 6392 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6393 spa->spa_load_info); 6394 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6395 spa->spa_errata); 6396 6397 /* 6398 * If the bootfs property exists on this pool then we 6399 * copy it out so that external consumers can tell which 6400 * pools are bootable. 6401 */ 6402 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6403 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6404 6405 /* 6406 * We have to play games with the name since the 6407 * pool was opened as TRYIMPORT_NAME. 6408 */ 6409 if (dsl_dsobj_to_dsname(spa_name(spa), 6410 spa->spa_bootfs, tmpname) == 0) { 6411 char *cp; 6412 char *dsname; 6413 6414 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6415 6416 cp = strchr(tmpname, '/'); 6417 if (cp == NULL) { 6418 (void) strlcpy(dsname, tmpname, 6419 MAXPATHLEN); 6420 } else { 6421 (void) snprintf(dsname, MAXPATHLEN, 6422 "%s/%s", poolname, ++cp); 6423 } 6424 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6425 dsname); 6426 kmem_free(dsname, MAXPATHLEN); 6427 } 6428 kmem_free(tmpname, MAXPATHLEN); 6429 } 6430 6431 /* 6432 * Add the list of hot spares and level 2 cache devices. 
6433 */ 6434 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6435 spa_add_spares(spa, config); 6436 spa_add_l2cache(spa, config); 6437 spa_config_exit(spa, SCL_CONFIG, FTAG); 6438 } 6439 6440 spa_unload(spa); 6441 spa_deactivate(spa); 6442 spa_remove(spa); 6443 mutex_exit(&spa_namespace_lock); 6444 6445 return (config); 6446 } 6447 6448 /* 6449 * Pool export/destroy 6450 * 6451 * The act of destroying or exporting a pool is very simple. We make sure there 6452 * is no more pending I/O and any references to the pool are gone. Then, we 6453 * update the pool state and sync all the labels to disk, removing the 6454 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6455 * we don't sync the labels or remove the configuration cache. 6456 */ 6457 static int 6458 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6459 boolean_t force, boolean_t hardforce) 6460 { 6461 int error; 6462 spa_t *spa; 6463 6464 if (oldconfig) 6465 *oldconfig = NULL; 6466 6467 if (!(spa_mode_global & SPA_MODE_WRITE)) 6468 return (SET_ERROR(EROFS)); 6469 6470 mutex_enter(&spa_namespace_lock); 6471 if ((spa = spa_lookup(pool)) == NULL) { 6472 mutex_exit(&spa_namespace_lock); 6473 return (SET_ERROR(ENOENT)); 6474 } 6475 6476 if (spa->spa_is_exporting) { 6477 /* the pool is being exported by another thread */ 6478 mutex_exit(&spa_namespace_lock); 6479 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6480 } 6481 spa->spa_is_exporting = B_TRUE; 6482 6483 /* 6484 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6485 * reacquire the namespace lock, and see if we can export. 6486 */ 6487 spa_open_ref(spa, FTAG); 6488 mutex_exit(&spa_namespace_lock); 6489 spa_async_suspend(spa); 6490 if (spa->spa_zvol_taskq) { 6491 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6492 taskq_wait(spa->spa_zvol_taskq); 6493 } 6494 mutex_enter(&spa_namespace_lock); 6495 spa_close(spa, FTAG); 6496 6497 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6498 goto export_spa; 6499 /* 6500 * The pool will be in core if it's openable, in which case we can 6501 * modify its state. Objsets may be open only because they're dirty, 6502 * so we have to force it to sync before checking spa_refcnt. 6503 */ 6504 if (spa->spa_sync_on) { 6505 txg_wait_synced(spa->spa_dsl_pool, 0); 6506 spa_evicting_os_wait(spa); 6507 } 6508 6509 /* 6510 * A pool cannot be exported or destroyed if there are active 6511 * references. If we are resetting a pool, allow references by 6512 * fault injection handlers. 6513 */ 6514 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6515 error = SET_ERROR(EBUSY); 6516 goto fail; 6517 } 6518 6519 if (spa->spa_sync_on) { 6520 vdev_t *rvd = spa->spa_root_vdev; 6521 /* 6522 * A pool cannot be exported if it has an active shared spare. 6523 * This is to prevent other pools stealing the active spare 6524 * from an exported pool. At user's own will, such pool can 6525 * be forcedly exported. 6526 */ 6527 if (!force && new_state == POOL_STATE_EXPORTED && 6528 spa_has_active_shared_spare(spa)) { 6529 error = SET_ERROR(EXDEV); 6530 goto fail; 6531 } 6532 6533 /* 6534 * We're about to export or destroy this pool. Make sure 6535 * we stop all initialization and trim activity here before 6536 * we set the spa_final_txg. This will ensure that all 6537 * dirty data resulting from the initialization is 6538 * committed to disk before we unload the pool. 
6539 */ 6540 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 6541 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 6542 vdev_autotrim_stop_all(spa); 6543 vdev_rebuild_stop_all(spa); 6544 6545 /* 6546 * We want this to be reflected on every label, 6547 * so mark them all dirty. spa_unload() will do the 6548 * final sync that pushes these changes out. 6549 */ 6550 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6551 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6552 spa->spa_state = new_state; 6553 vdev_config_dirty(rvd); 6554 spa_config_exit(spa, SCL_ALL, FTAG); 6555 } 6556 6557 /* 6558 * If the log space map feature is enabled and the pool is 6559 * getting exported (but not destroyed), we want to spend some 6560 * time flushing as many metaslabs as we can in an attempt to 6561 * destroy log space maps and save import time. This has to be 6562 * done before we set the spa_final_txg, otherwise 6563 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 6564 * spa_should_flush_logs_on_unload() should be called after 6565 * spa_state has been set to the new_state. 6566 */ 6567 if (spa_should_flush_logs_on_unload(spa)) 6568 spa_unload_log_sm_flush_all(spa); 6569 6570 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6571 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6572 spa->spa_final_txg = spa_last_synced_txg(spa) + 6573 TXG_DEFER_SIZE + 1; 6574 spa_config_exit(spa, SCL_ALL, FTAG); 6575 } 6576 } 6577 6578 export_spa: 6579 spa_export_os(spa); 6580 6581 if (new_state == POOL_STATE_DESTROYED) 6582 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 6583 else if (new_state == POOL_STATE_EXPORTED) 6584 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 6585 6586 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6587 spa_unload(spa); 6588 spa_deactivate(spa); 6589 } 6590 6591 if (oldconfig && spa->spa_config) 6592 *oldconfig = fnvlist_dup(spa->spa_config); 6593 6594 if (new_state != POOL_STATE_UNINITIALIZED) { 6595 if (!hardforce) 6596 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 6597 spa_remove(spa); 6598 } else { 6599 /* 6600 * If spa_remove() is not called for this spa_t and 6601 * there is any possibility that it can be reused, 6602 * we make sure to reset the exporting flag. 6603 */ 6604 spa->spa_is_exporting = B_FALSE; 6605 } 6606 6607 mutex_exit(&spa_namespace_lock); 6608 return (0); 6609 6610 fail: 6611 spa->spa_is_exporting = B_FALSE; 6612 spa_async_resume(spa); 6613 mutex_exit(&spa_namespace_lock); 6614 return (error); 6615 } 6616 6617 /* 6618 * Destroy a storage pool. 6619 */ 6620 int 6621 spa_destroy(const char *pool) 6622 { 6623 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 6624 B_FALSE, B_FALSE)); 6625 } 6626 6627 /* 6628 * Export a storage pool. 6629 */ 6630 int 6631 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 6632 boolean_t hardforce) 6633 { 6634 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 6635 force, hardforce)); 6636 } 6637 6638 /* 6639 * Similar to spa_export(), this unloads the spa_t without actually removing it 6640 * from the namespace in any way. 
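 * Internally this goes through spa_export_common() with
 * POOL_STATE_UNINITIALIZED, which skips the label sync, the cachefile
 * update and the spa_remove() call, and simply clears the exporting flag
 * before returning.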
6641 */ 6642 int 6643 spa_reset(const char *pool) 6644 { 6645 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 6646 B_FALSE, B_FALSE)); 6647 } 6648 6649 /* 6650 * ========================================================================== 6651 * Device manipulation 6652 * ========================================================================== 6653 */ 6654 6655 /* 6656 * This is called as a synctask to increment the draid feature flag 6657 */ 6658 static void 6659 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 6660 { 6661 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6662 int draid = (int)(uintptr_t)arg; 6663 6664 for (int c = 0; c < draid; c++) 6665 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6666 } 6667 6668 /* 6669 * Add a device to a storage pool. 6670 */ 6671 int 6672 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 6673 { 6674 uint64_t txg, ndraid = 0; 6675 int error; 6676 vdev_t *rvd = spa->spa_root_vdev; 6677 vdev_t *vd, *tvd; 6678 nvlist_t **spares, **l2cache; 6679 uint_t nspares, nl2cache; 6680 6681 ASSERT(spa_writeable(spa)); 6682 6683 txg = spa_vdev_enter(spa); 6684 6685 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 6686 VDEV_ALLOC_ADD)) != 0) 6687 return (spa_vdev_exit(spa, NULL, txg, error)); 6688 6689 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 6690 6691 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 6692 &nspares) != 0) 6693 nspares = 0; 6694 6695 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 6696 &nl2cache) != 0) 6697 nl2cache = 0; 6698 6699 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 6700 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6701 6702 if (vd->vdev_children != 0 && 6703 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 6704 return (spa_vdev_exit(spa, vd, txg, error)); 6705 } 6706 6707 /* 6708 * The virtual dRAID spares must be added after vdev tree is created 6709 * and the vdev guids are generated. The guid of their associated 6710 * dRAID is stored in the config and used when opening the spare. 6711 */ 6712 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 6713 rvd->vdev_children)) == 0) { 6714 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 6715 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 6716 nspares = 0; 6717 } else { 6718 return (spa_vdev_exit(spa, vd, txg, error)); 6719 } 6720 6721 /* 6722 * We must validate the spares and l2cache devices after checking the 6723 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 6724 */ 6725 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 6726 return (spa_vdev_exit(spa, vd, txg, error)); 6727 6728 /* 6729 * If we are in the middle of a device removal, we can only add 6730 * devices which match the existing devices in the pool. 6731 * If we are in the middle of a removal, or have some indirect 6732 * vdevs, we can not add raidz or dRAID top levels. 
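 * Concretely, the checks below return EINVAL if a removal is active and a
 * new top-level vdev's ashift differs from spa_max_ashift, if
 * vdev_get_nparity() reports a raidz/dRAID parity level, or if a new
 * top-level mirror contains non-leaf children.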
6733 */ 6734 if (spa->spa_vdev_removal != NULL || 6735 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 6736 for (int c = 0; c < vd->vdev_children; c++) { 6737 tvd = vd->vdev_child[c]; 6738 if (spa->spa_vdev_removal != NULL && 6739 tvd->vdev_ashift != spa->spa_max_ashift) { 6740 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6741 } 6742 /* Fail if top level vdev is raidz or a dRAID */ 6743 if (vdev_get_nparity(tvd) != 0) 6744 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6745 6746 /* 6747 * Need the top level mirror to be 6748 * a mirror of leaf vdevs only 6749 */ 6750 if (tvd->vdev_ops == &vdev_mirror_ops) { 6751 for (uint64_t cid = 0; 6752 cid < tvd->vdev_children; cid++) { 6753 vdev_t *cvd = tvd->vdev_child[cid]; 6754 if (!cvd->vdev_ops->vdev_op_leaf) { 6755 return (spa_vdev_exit(spa, vd, 6756 txg, EINVAL)); 6757 } 6758 } 6759 } 6760 } 6761 } 6762 6763 for (int c = 0; c < vd->vdev_children; c++) { 6764 tvd = vd->vdev_child[c]; 6765 vdev_remove_child(vd, tvd); 6766 tvd->vdev_id = rvd->vdev_children; 6767 vdev_add_child(rvd, tvd); 6768 vdev_config_dirty(tvd); 6769 } 6770 6771 if (nspares != 0) { 6772 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 6773 ZPOOL_CONFIG_SPARES); 6774 spa_load_spares(spa); 6775 spa->spa_spares.sav_sync = B_TRUE; 6776 } 6777 6778 if (nl2cache != 0) { 6779 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 6780 ZPOOL_CONFIG_L2CACHE); 6781 spa_load_l2cache(spa); 6782 spa->spa_l2cache.sav_sync = B_TRUE; 6783 } 6784 6785 /* 6786 * We can't increment a feature while holding spa_vdev so we 6787 * have to do it in a synctask. 6788 */ 6789 if (ndraid != 0) { 6790 dmu_tx_t *tx; 6791 6792 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 6793 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 6794 (void *)(uintptr_t)ndraid, tx); 6795 dmu_tx_commit(tx); 6796 } 6797 6798 /* 6799 * We have to be careful when adding new vdevs to an existing pool. 6800 * If other threads start allocating from these vdevs before we 6801 * sync the config cache, and we lose power, then upon reboot we may 6802 * fail to open the pool because there are DVAs that the config cache 6803 * can't translate. Therefore, we first add the vdevs without 6804 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 6805 * and then let spa_config_update() initialize the new metaslabs. 6806 * 6807 * spa_load() checks for added-but-not-initialized vdevs, so that 6808 * if we lose power at any point in this sequence, the remaining 6809 * steps will be completed the next time we load the pool. 6810 */ 6811 (void) spa_vdev_exit(spa, vd, txg, 0); 6812 6813 mutex_enter(&spa_namespace_lock); 6814 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6815 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 6816 mutex_exit(&spa_namespace_lock); 6817 6818 return (0); 6819 } 6820 6821 /* 6822 * Attach a device to a mirror. The arguments are the path to any device 6823 * in the mirror, and the nvroot for the new device. If the path specifies 6824 * a device that is not mirrored, we automatically insert the mirror vdev. 
6825 * 6826 * If 'replacing' is specified, the new device is intended to replace the 6827 * existing device; in this case the two devices are made into their own 6828 * mirror using the 'replacing' vdev, which is functionally identical to 6829 * the mirror vdev (it actually reuses all the same ops) but has a few 6830 * extra rules: you can't attach to it after it's been created, and upon 6831 * completion of resilvering, the first disk (the one being replaced) 6832 * is automatically detached. 6833 * 6834 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 6835 * should be performed instead of traditional healing reconstruction. From 6836 * an administrators perspective these are both resilver operations. 6837 */ 6838 int 6839 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 6840 int rebuild) 6841 { 6842 uint64_t txg, dtl_max_txg; 6843 vdev_t *rvd = spa->spa_root_vdev; 6844 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 6845 vdev_ops_t *pvops; 6846 char *oldvdpath, *newvdpath; 6847 int newvd_isspare; 6848 int error; 6849 6850 ASSERT(spa_writeable(spa)); 6851 6852 txg = spa_vdev_enter(spa); 6853 6854 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 6855 6856 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6857 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6858 error = (spa_has_checkpoint(spa)) ? 6859 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6860 return (spa_vdev_exit(spa, NULL, txg, error)); 6861 } 6862 6863 if (rebuild) { 6864 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 6865 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6866 6867 if (dsl_scan_resilvering(spa_get_dsl(spa))) 6868 return (spa_vdev_exit(spa, NULL, txg, 6869 ZFS_ERR_RESILVER_IN_PROGRESS)); 6870 } else { 6871 if (vdev_rebuild_active(rvd)) 6872 return (spa_vdev_exit(spa, NULL, txg, 6873 ZFS_ERR_REBUILD_IN_PROGRESS)); 6874 } 6875 6876 if (spa->spa_vdev_removal != NULL) 6877 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6878 6879 if (oldvd == NULL) 6880 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6881 6882 if (!oldvd->vdev_ops->vdev_op_leaf) 6883 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6884 6885 pvd = oldvd->vdev_parent; 6886 6887 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 6888 VDEV_ALLOC_ATTACH) != 0) 6889 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6890 6891 if (newrootvd->vdev_children != 1) 6892 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6893 6894 newvd = newrootvd->vdev_child[0]; 6895 6896 if (!newvd->vdev_ops->vdev_op_leaf) 6897 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6898 6899 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 6900 return (spa_vdev_exit(spa, newrootvd, txg, error)); 6901 6902 /* 6903 * log, dedup and special vdevs should not be replaced by spares. 6904 */ 6905 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 6906 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 6907 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6908 } 6909 6910 /* 6911 * A dRAID spare can only replace a child of its parent dRAID vdev. 6912 */ 6913 if (newvd->vdev_ops == &vdev_draid_spare_ops && 6914 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 6915 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6916 } 6917 6918 if (rebuild) { 6919 /* 6920 * For rebuilds, the top vdev must support reconstruction 6921 * using only space maps. This means the only allowable 6922 * vdevs types are the root vdev, a mirror, or dRAID. 
6923 */ 6924 tvd = pvd; 6925 if (pvd->vdev_top != NULL) 6926 tvd = pvd->vdev_top; 6927 6928 if (tvd->vdev_ops != &vdev_mirror_ops && 6929 tvd->vdev_ops != &vdev_root_ops && 6930 tvd->vdev_ops != &vdev_draid_ops) { 6931 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6932 } 6933 } 6934 6935 if (!replacing) { 6936 /* 6937 * For attach, the only allowable parent is a mirror or the root 6938 * vdev. 6939 */ 6940 if (pvd->vdev_ops != &vdev_mirror_ops && 6941 pvd->vdev_ops != &vdev_root_ops) 6942 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6943 6944 pvops = &vdev_mirror_ops; 6945 } else { 6946 /* 6947 * Active hot spares can only be replaced by inactive hot 6948 * spares. 6949 */ 6950 if (pvd->vdev_ops == &vdev_spare_ops && 6951 oldvd->vdev_isspare && 6952 !spa_has_spare(spa, newvd->vdev_guid)) 6953 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6954 6955 /* 6956 * If the source is a hot spare, and the parent isn't already a 6957 * spare, then we want to create a new hot spare. Otherwise, we 6958 * want to create a replacing vdev. The user is not allowed to 6959 * attach to a spared vdev child unless the 'isspare' state is 6960 * the same (spare replaces spare, non-spare replaces 6961 * non-spare). 6962 */ 6963 if (pvd->vdev_ops == &vdev_replacing_ops && 6964 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6965 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6966 } else if (pvd->vdev_ops == &vdev_spare_ops && 6967 newvd->vdev_isspare != oldvd->vdev_isspare) { 6968 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6969 } 6970 6971 if (newvd->vdev_isspare) 6972 pvops = &vdev_spare_ops; 6973 else 6974 pvops = &vdev_replacing_ops; 6975 } 6976 6977 /* 6978 * Make sure the new device is big enough. 6979 */ 6980 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6981 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6982 6983 /* 6984 * The new device cannot have a higher alignment requirement 6985 * than the top-level vdev. 6986 */ 6987 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6988 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6989 6990 /* 6991 * If this is an in-place replacement, update oldvd's path and devid 6992 * to make it distinguishable from newvd, and unopenable from now on. 6993 */ 6994 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 6995 spa_strfree(oldvd->vdev_path); 6996 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 6997 KM_SLEEP); 6998 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, 6999 "%s/%s", newvd->vdev_path, "old"); 7000 if (oldvd->vdev_devid != NULL) { 7001 spa_strfree(oldvd->vdev_devid); 7002 oldvd->vdev_devid = NULL; 7003 } 7004 } 7005 7006 /* 7007 * If the parent is not a mirror, or if we're replacing, insert the new 7008 * mirror/replacing/spare vdev above oldvd. 7009 */ 7010 if (pvd->vdev_ops != pvops) 7011 pvd = vdev_add_parent(oldvd, pvops); 7012 7013 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7014 ASSERT(pvd->vdev_ops == pvops); 7015 ASSERT(oldvd->vdev_parent == pvd); 7016 7017 /* 7018 * Extract the new device from its root and add it to pvd. 7019 */ 7020 vdev_remove_child(newrootvd, newvd); 7021 newvd->vdev_id = pvd->vdev_children; 7022 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7023 vdev_add_child(pvd, newvd); 7024 7025 /* 7026 * Reevaluate the parent vdev state. 
7027 */ 7028 vdev_propagate_state(pvd); 7029 7030 tvd = newvd->vdev_top; 7031 ASSERT(pvd->vdev_top == tvd); 7032 ASSERT(tvd->vdev_parent == rvd); 7033 7034 vdev_config_dirty(tvd); 7035 7036 /* 7037 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7038 * for any dmu_sync-ed blocks. It will propagate upward when 7039 * spa_vdev_exit() calls vdev_dtl_reassess(). 7040 */ 7041 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7042 7043 vdev_dtl_dirty(newvd, DTL_MISSING, 7044 TXG_INITIAL, dtl_max_txg - TXG_INITIAL); 7045 7046 if (newvd->vdev_isspare) { 7047 spa_spare_activate(newvd); 7048 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7049 } 7050 7051 oldvdpath = spa_strdup(oldvd->vdev_path); 7052 newvdpath = spa_strdup(newvd->vdev_path); 7053 newvd_isspare = newvd->vdev_isspare; 7054 7055 /* 7056 * Mark newvd's DTL dirty in this txg. 7057 */ 7058 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7059 7060 /* 7061 * Schedule the resilver or rebuild to restart in the future. We do 7062 * this to ensure that dmu_sync-ed blocks have been stitched into the 7063 * respective datasets. 7064 */ 7065 if (rebuild) { 7066 newvd->vdev_rebuild_txg = txg; 7067 7068 vdev_rebuild(tvd); 7069 } else { 7070 newvd->vdev_resilver_txg = txg; 7071 7072 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 7073 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { 7074 vdev_defer_resilver(newvd); 7075 } else { 7076 dsl_scan_restart_resilver(spa->spa_dsl_pool, 7077 dtl_max_txg); 7078 } 7079 } 7080 7081 if (spa->spa_bootfs) 7082 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 7083 7084 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 7085 7086 /* 7087 * Commit the config 7088 */ 7089 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 7090 7091 spa_history_log_internal(spa, "vdev attach", NULL, 7092 "%s vdev=%s %s vdev=%s", 7093 replacing && newvd_isspare ? "spare in" : 7094 replacing ? "replace" : "attach", newvdpath, 7095 replacing ? "for" : "to", oldvdpath); 7096 7097 spa_strfree(oldvdpath); 7098 spa_strfree(newvdpath); 7099 7100 return (0); 7101 } 7102 7103 /* 7104 * Detach a device from a mirror or replacing vdev. 7105 * 7106 * If 'replace_done' is specified, only detach if the parent 7107 * is a replacing or a spare vdev. 7108 */ 7109 int 7110 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7111 { 7112 uint64_t txg; 7113 int error; 7114 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7115 vdev_t *vd, *pvd, *cvd, *tvd; 7116 boolean_t unspare = B_FALSE; 7117 uint64_t unspare_guid = 0; 7118 char *vdpath; 7119 7120 ASSERT(spa_writeable(spa)); 7121 7122 txg = spa_vdev_detach_enter(spa, guid); 7123 7124 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7125 7126 /* 7127 * Besides being called directly from the userland through the 7128 * ioctl interface, spa_vdev_detach() can be potentially called 7129 * at the end of spa_vdev_resilver_done(). 7130 * 7131 * In the regular case, when we have a checkpoint this shouldn't 7132 * happen as we never empty the DTLs of a vdev during the scrub 7133 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 7134 * should never get here when we have a checkpoint. 7135 * 7136 * That said, even in a case when we checkpoint the pool exactly 7137 * as spa_vdev_resilver_done() calls this function everything 7138 * should be fine as the resilver will return right away. 
7139 */ 7140 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7141 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7142 error = (spa_has_checkpoint(spa)) ? 7143 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7144 return (spa_vdev_exit(spa, NULL, txg, error)); 7145 } 7146 7147 if (vd == NULL) 7148 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7149 7150 if (!vd->vdev_ops->vdev_op_leaf) 7151 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7152 7153 pvd = vd->vdev_parent; 7154 7155 /* 7156 * If the parent/child relationship is not as expected, don't do it. 7157 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7158 * vdev that's replacing B with C. The user's intent in replacing 7159 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7160 * the replace by detaching C, the expected behavior is to end up 7161 * M(A,B). But suppose that right after deciding to detach C, 7162 * the replacement of B completes. We would have M(A,C), and then 7163 * ask to detach C, which would leave us with just A -- not what 7164 * the user wanted. To prevent this, we make sure that the 7165 * parent/child relationship hasn't changed -- in this example, 7166 * that C's parent is still the replacing vdev R. 7167 */ 7168 if (pvd->vdev_guid != pguid && pguid != 0) 7169 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7170 7171 /* 7172 * Only 'replacing' or 'spare' vdevs can be replaced. 7173 */ 7174 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7175 pvd->vdev_ops != &vdev_spare_ops) 7176 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7177 7178 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7179 spa_version(spa) >= SPA_VERSION_SPARES); 7180 7181 /* 7182 * Only mirror, replacing, and spare vdevs support detach. 7183 */ 7184 if (pvd->vdev_ops != &vdev_replacing_ops && 7185 pvd->vdev_ops != &vdev_mirror_ops && 7186 pvd->vdev_ops != &vdev_spare_ops) 7187 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7188 7189 /* 7190 * If this device has the only valid copy of some data, 7191 * we cannot safely detach it. 7192 */ 7193 if (vdev_dtl_required(vd)) 7194 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7195 7196 ASSERT(pvd->vdev_children >= 2); 7197 7198 /* 7199 * If we are detaching the second disk from a replacing vdev, then 7200 * check to see if we changed the original vdev's path to have "/old" 7201 * at the end in spa_vdev_attach(). If so, undo that change now. 7202 */ 7203 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7204 vd->vdev_path != NULL) { 7205 size_t len = strlen(vd->vdev_path); 7206 7207 for (int c = 0; c < pvd->vdev_children; c++) { 7208 cvd = pvd->vdev_child[c]; 7209 7210 if (cvd == vd || cvd->vdev_path == NULL) 7211 continue; 7212 7213 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7214 strcmp(cvd->vdev_path + len, "/old") == 0) { 7215 spa_strfree(cvd->vdev_path); 7216 cvd->vdev_path = spa_strdup(vd->vdev_path); 7217 break; 7218 } 7219 } 7220 } 7221 7222 /* 7223 * If we are detaching the original disk from a normal spare, then it 7224 * implies that the spare should become a real disk, and be removed 7225 * from the active spare list for the pool. dRAID spares on the 7226 * other hand are coupled to the pool and thus should never be removed 7227 * from the spares list. 
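 * The check below implements this by skipping the unspare path whenever
 * the remaining child's ops are vdev_draid_spare_ops.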
7228 */ 7229 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7230 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7231 7232 if (last_cvd->vdev_isspare && 7233 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7234 unspare = B_TRUE; 7235 } 7236 } 7237 7238 /* 7239 * Erase the disk labels so the disk can be used for other things. 7240 * This must be done after all other error cases are handled, 7241 * but before we disembowel vd (so we can still do I/O to it). 7242 * But if we can't do it, don't treat the error as fatal -- 7243 * it may be that the unwritability of the disk is the reason 7244 * it's being detached! 7245 */ 7246 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7247 7248 /* 7249 * Remove vd from its parent and compact the parent's children. 7250 */ 7251 vdev_remove_child(pvd, vd); 7252 vdev_compact_children(pvd); 7253 7254 /* 7255 * Remember one of the remaining children so we can get tvd below. 7256 */ 7257 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7258 7259 /* 7260 * If we need to remove the remaining child from the list of hot spares, 7261 * do it now, marking the vdev as no longer a spare in the process. 7262 * We must do this before vdev_remove_parent(), because that can 7263 * change the GUID if it creates a new toplevel GUID. For a similar 7264 * reason, we must remove the spare now, in the same txg as the detach; 7265 * otherwise someone could attach a new sibling, change the GUID, and 7266 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7267 */ 7268 if (unspare) { 7269 ASSERT(cvd->vdev_isspare); 7270 spa_spare_remove(cvd); 7271 unspare_guid = cvd->vdev_guid; 7272 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7273 cvd->vdev_unspare = B_TRUE; 7274 } 7275 7276 /* 7277 * If the parent mirror/replacing vdev only has one child, 7278 * the parent is no longer needed. Remove it from the tree. 7279 */ 7280 if (pvd->vdev_children == 1) { 7281 if (pvd->vdev_ops == &vdev_spare_ops) 7282 cvd->vdev_unspare = B_FALSE; 7283 vdev_remove_parent(cvd); 7284 } 7285 7286 /* 7287 * We don't set tvd until now because the parent we just removed 7288 * may have been the previous top-level vdev. 7289 */ 7290 tvd = cvd->vdev_top; 7291 ASSERT(tvd->vdev_parent == rvd); 7292 7293 /* 7294 * Reevaluate the parent vdev state. 7295 */ 7296 vdev_propagate_state(cvd); 7297 7298 /* 7299 * If the 'autoexpand' property is set on the pool then automatically 7300 * try to expand the size of the pool. For example if the device we 7301 * just detached was smaller than the others, it may be possible to 7302 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7303 * first so that we can obtain the updated sizes of the leaf vdevs. 7304 */ 7305 if (spa->spa_autoexpand) { 7306 vdev_reopen(tvd); 7307 vdev_expand(tvd, txg); 7308 } 7309 7310 vdev_config_dirty(tvd); 7311 7312 /* 7313 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7314 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7315 * But first make sure we're not on any *other* txg's DTL list, to 7316 * prevent vd from being accessed after it's freed. 7317 */ 7318 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 7319 for (int t = 0; t < TXG_SIZE; t++) 7320 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7321 vd->vdev_detached = B_TRUE; 7322 vdev_dirty(tvd, VDD_DTL, vd, txg); 7323 7324 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7325 spa_notify_waiters(spa); 7326 7327 /* hang on to the spa before we release the lock */ 7328 spa_open_ref(spa, FTAG); 7329 7330 error = spa_vdev_exit(spa, vd, txg, 0); 7331 7332 spa_history_log_internal(spa, "detach", NULL, 7333 "vdev=%s", vdpath); 7334 spa_strfree(vdpath); 7335 7336 /* 7337 * If this was the removal of the original device in a hot spare vdev, 7338 * then we want to go through and remove the device from the hot spare 7339 * list of every other pool. 7340 */ 7341 if (unspare) { 7342 spa_t *altspa = NULL; 7343 7344 mutex_enter(&spa_namespace_lock); 7345 while ((altspa = spa_next(altspa)) != NULL) { 7346 if (altspa->spa_state != POOL_STATE_ACTIVE || 7347 altspa == spa) 7348 continue; 7349 7350 spa_open_ref(altspa, FTAG); 7351 mutex_exit(&spa_namespace_lock); 7352 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7353 mutex_enter(&spa_namespace_lock); 7354 spa_close(altspa, FTAG); 7355 } 7356 mutex_exit(&spa_namespace_lock); 7357 7358 /* search the rest of the vdevs for spares to remove */ 7359 spa_vdev_resilver_done(spa); 7360 } 7361 7362 /* all done with the spa; OK to release */ 7363 mutex_enter(&spa_namespace_lock); 7364 spa_close(spa, FTAG); 7365 mutex_exit(&spa_namespace_lock); 7366 7367 return (error); 7368 } 7369 7370 static int 7371 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7372 list_t *vd_list) 7373 { 7374 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7375 7376 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7377 7378 /* Look up vdev and ensure it's a leaf. */ 7379 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7380 if (vd == NULL || vd->vdev_detached) { 7381 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7382 return (SET_ERROR(ENODEV)); 7383 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7384 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7385 return (SET_ERROR(EINVAL)); 7386 } else if (!vdev_writeable(vd)) { 7387 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7388 return (SET_ERROR(EROFS)); 7389 } 7390 mutex_enter(&vd->vdev_initialize_lock); 7391 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7392 7393 /* 7394 * When we activate an initialize action we check to see 7395 * if the vdev_initialize_thread is NULL. We do this instead 7396 * of using the vdev_initialize_state since there might be 7397 * a previous initialization process which has completed but 7398 * the thread is not exited. 
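 * A rough summary of the state checks that follow (sketch only):
 *
 *	POOL_INITIALIZE_START    -> EBUSY if a thread is already running or
 *	                            the top-level vdev is being removed
 *	POOL_INITIALIZE_CANCEL   -> ESRCH unless initialization is active or
 *	                            suspended
 *	POOL_INITIALIZE_SUSPEND  -> ESRCH unless initialization is active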
7399 */ 7400 if (cmd_type == POOL_INITIALIZE_START && 7401 (vd->vdev_initialize_thread != NULL || 7402 vd->vdev_top->vdev_removing)) { 7403 mutex_exit(&vd->vdev_initialize_lock); 7404 return (SET_ERROR(EBUSY)); 7405 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7406 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7407 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7408 mutex_exit(&vd->vdev_initialize_lock); 7409 return (SET_ERROR(ESRCH)); 7410 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7411 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7412 mutex_exit(&vd->vdev_initialize_lock); 7413 return (SET_ERROR(ESRCH)); 7414 } 7415 7416 switch (cmd_type) { 7417 case POOL_INITIALIZE_START: 7418 vdev_initialize(vd); 7419 break; 7420 case POOL_INITIALIZE_CANCEL: 7421 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7422 break; 7423 case POOL_INITIALIZE_SUSPEND: 7424 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7425 break; 7426 default: 7427 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7428 } 7429 mutex_exit(&vd->vdev_initialize_lock); 7430 7431 return (0); 7432 } 7433 7434 int 7435 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7436 nvlist_t *vdev_errlist) 7437 { 7438 int total_errors = 0; 7439 list_t vd_list; 7440 7441 list_create(&vd_list, sizeof (vdev_t), 7442 offsetof(vdev_t, vdev_initialize_node)); 7443 7444 /* 7445 * We hold the namespace lock through the whole function 7446 * to prevent any changes to the pool while we're starting or 7447 * stopping initialization. The config and state locks are held so that 7448 * we can properly assess the vdev state before we commit to 7449 * the initializing operation. 7450 */ 7451 mutex_enter(&spa_namespace_lock); 7452 7453 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7454 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7455 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7456 7457 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7458 &vd_list); 7459 if (error != 0) { 7460 char guid_as_str[MAXNAMELEN]; 7461 7462 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7463 "%llu", (unsigned long long)vdev_guid); 7464 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7465 total_errors++; 7466 } 7467 } 7468 7469 /* Wait for all initialize threads to stop. */ 7470 vdev_initialize_stop_wait(spa, &vd_list); 7471 7472 /* Sync out the initializing state */ 7473 txg_wait_synced(spa->spa_dsl_pool, 0); 7474 mutex_exit(&spa_namespace_lock); 7475 7476 list_destroy(&vd_list); 7477 7478 return (total_errors); 7479 } 7480 7481 static int 7482 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7483 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 7484 { 7485 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7486 7487 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7488 7489 /* Look up vdev and ensure it's a leaf. 
*/ 7490 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7491 if (vd == NULL || vd->vdev_detached) { 7492 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7493 return (SET_ERROR(ENODEV)); 7494 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7495 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7496 return (SET_ERROR(EINVAL)); 7497 } else if (!vdev_writeable(vd)) { 7498 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7499 return (SET_ERROR(EROFS)); 7500 } else if (!vd->vdev_has_trim) { 7501 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7502 return (SET_ERROR(EOPNOTSUPP)); 7503 } else if (secure && !vd->vdev_has_securetrim) { 7504 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7505 return (SET_ERROR(EOPNOTSUPP)); 7506 } 7507 mutex_enter(&vd->vdev_trim_lock); 7508 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7509 7510 /* 7511 * When we activate a TRIM action we check to see if the 7512 * vdev_trim_thread is NULL. We do this instead of using the 7513 * vdev_trim_state since there might be a previous TRIM process 7514 * which has completed but whose thread has not yet exited. 7515 */ 7516 if (cmd_type == POOL_TRIM_START && 7517 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { 7518 mutex_exit(&vd->vdev_trim_lock); 7519 return (SET_ERROR(EBUSY)); 7520 } else if (cmd_type == POOL_TRIM_CANCEL && 7521 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 7522 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 7523 mutex_exit(&vd->vdev_trim_lock); 7524 return (SET_ERROR(ESRCH)); 7525 } else if (cmd_type == POOL_TRIM_SUSPEND && 7526 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 7527 mutex_exit(&vd->vdev_trim_lock); 7528 return (SET_ERROR(ESRCH)); 7529 } 7530 7531 switch (cmd_type) { 7532 case POOL_TRIM_START: 7533 vdev_trim(vd, rate, partial, secure); 7534 break; 7535 case POOL_TRIM_CANCEL: 7536 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 7537 break; 7538 case POOL_TRIM_SUSPEND: 7539 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 7540 break; 7541 default: 7542 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7543 } 7544 mutex_exit(&vd->vdev_trim_lock); 7545 7546 return (0); 7547 } 7548 7549 /* 7550 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 7551 * TRIM threads for each child vdev. These threads pass over all of the free 7552 * space in the vdev's metaslabs and issue TRIM commands for that space. 7553 */ 7554 int 7555 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 7556 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 7557 { 7558 int total_errors = 0; 7559 list_t vd_list; 7560 7561 list_create(&vd_list, sizeof (vdev_t), 7562 offsetof(vdev_t, vdev_trim_node)); 7563 7564 /* 7565 * We hold the namespace lock through the whole function 7566 * to prevent any changes to the pool while we're starting or 7567 * stopping TRIM. The config and state locks are held so that 7568 * we can properly assess the vdev state before we commit to 7569 * the TRIM operation.
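 *
 * Failures for individual vdevs do not abort the loop below; each
 * failing GUID is recorded in vdev_errlist (key: the GUID rendered as
 * a decimal string, value: the errno) and the number of failures is
 * returned as the result.
 *
 * Purely for illustration (this sketch is not compiled and the calling
 * context is assumed): a caller that already holds a reference on the
 * pool and knows a leaf vdev's GUID could start a manual TRIM with
 *
 *	nvlist_t *nv = fnvlist_alloc();
 *	nvlist_t *errlist = fnvlist_alloc();
 *	fnvlist_add_uint64(nv, "vdev", leaf_guid);
 *	int error = spa_vdev_trim(spa, nv, POOL_TRIM_START, 0, B_FALSE,
 *	    B_FALSE, errlist);
 *	fnvlist_free(errlist);
 *	fnvlist_free(nv);
 *
 * where the nvpair name is ignored (only the uint64 GUID value is
 * used) and a rate of 0 is assumed to request an unthrottled TRIM.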
7570 */ 7571 mutex_enter(&spa_namespace_lock); 7572 7573 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7574 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7575 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7576 7577 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 7578 rate, partial, secure, &vd_list); 7579 if (error != 0) { 7580 char guid_as_str[MAXNAMELEN]; 7581 7582 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7583 "%llu", (unsigned long long)vdev_guid); 7584 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7585 total_errors++; 7586 } 7587 } 7588 7589 /* Wait for all TRIM threads to stop. */ 7590 vdev_trim_stop_wait(spa, &vd_list); 7591 7592 /* Sync out the TRIM state */ 7593 txg_wait_synced(spa->spa_dsl_pool, 0); 7594 mutex_exit(&spa_namespace_lock); 7595 7596 list_destroy(&vd_list); 7597 7598 return (total_errors); 7599 } 7600 7601 /* 7602 * Split a set of devices from their mirrors, and create a new pool from them. 7603 */ 7604 int 7605 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 7606 nvlist_t *props, boolean_t exp) 7607 { 7608 int error = 0; 7609 uint64_t txg, *glist; 7610 spa_t *newspa; 7611 uint_t c, children, lastlog; 7612 nvlist_t **child, *nvl, *tmp; 7613 dmu_tx_t *tx; 7614 const char *altroot = NULL; 7615 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 7616 boolean_t activate_slog; 7617 7618 ASSERT(spa_writeable(spa)); 7619 7620 txg = spa_vdev_enter(spa); 7621 7622 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7623 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7624 error = (spa_has_checkpoint(spa)) ? 7625 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7626 return (spa_vdev_exit(spa, NULL, txg, error)); 7627 } 7628 7629 /* clear the log and flush everything up to now */ 7630 activate_slog = spa_passivate_log(spa); 7631 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7632 error = spa_reset_logs(spa); 7633 txg = spa_vdev_config_enter(spa); 7634 7635 if (activate_slog) 7636 spa_activate_log(spa); 7637 7638 if (error != 0) 7639 return (spa_vdev_exit(spa, NULL, txg, error)); 7640 7641 /* check new spa name before going any further */ 7642 if (spa_lookup(newname) != NULL) 7643 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 7644 7645 /* 7646 * scan through all the children to ensure they're all mirrors 7647 */ 7648 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 7649 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 7650 &children) != 0) 7651 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7652 7653 /* first, check to ensure we've got the right child count */ 7654 rvd = spa->spa_root_vdev; 7655 lastlog = 0; 7656 for (c = 0; c < rvd->vdev_children; c++) { 7657 vdev_t *vd = rvd->vdev_child[c]; 7658 7659 /* don't count the holes & logs as children */ 7660 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 7661 !vdev_is_concrete(vd))) { 7662 if (lastlog == 0) 7663 lastlog = c; 7664 continue; 7665 } 7666 7667 lastlog = 0; 7668 } 7669 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 7670 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7671 7672 /* next, ensure no spare or cache devices are part of the split */ 7673 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 7674 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 7675 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7676 7677 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 7678 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 7679 7680 /* then, loop over each vdev and validate it */ 7681 for (c = 0; c < children; c++) { 7682 uint64_t is_hole = 0; 7683 7684 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 7685 &is_hole); 7686 7687 if (is_hole != 0) { 7688 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 7689 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 7690 continue; 7691 } else { 7692 error = SET_ERROR(EINVAL); 7693 break; 7694 } 7695 } 7696 7697 /* deal with indirect vdevs */ 7698 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 7699 &vdev_indirect_ops) 7700 continue; 7701 7702 /* which disk is going to be split? */ 7703 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 7704 &glist[c]) != 0) { 7705 error = SET_ERROR(EINVAL); 7706 break; 7707 } 7708 7709 /* look it up in the spa */ 7710 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 7711 if (vml[c] == NULL) { 7712 error = SET_ERROR(ENODEV); 7713 break; 7714 } 7715 7716 /* make sure there's nothing stopping the split */ 7717 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 7718 vml[c]->vdev_islog || 7719 !vdev_is_concrete(vml[c]) || 7720 vml[c]->vdev_isspare || 7721 vml[c]->vdev_isl2cache || 7722 !vdev_writeable(vml[c]) || 7723 vml[c]->vdev_children != 0 || 7724 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 7725 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 7726 error = SET_ERROR(EINVAL); 7727 break; 7728 } 7729 7730 if (vdev_dtl_required(vml[c]) || 7731 vdev_resilver_needed(vml[c], NULL, NULL)) { 7732 error = SET_ERROR(EBUSY); 7733 break; 7734 } 7735 7736 /* we need certain info from the top level */ 7737 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 7738 vml[c]->vdev_top->vdev_ms_array); 7739 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 7740 vml[c]->vdev_top->vdev_ms_shift); 7741 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 7742 vml[c]->vdev_top->vdev_asize); 7743 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 7744 vml[c]->vdev_top->vdev_ashift); 7745 7746 /* transfer per-vdev ZAPs */ 7747 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 7748 VERIFY0(nvlist_add_uint64(child[c], 7749 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 7750 7751 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 7752 VERIFY0(nvlist_add_uint64(child[c], 7753 ZPOOL_CONFIG_VDEV_TOP_ZAP, 7754 vml[c]->vdev_parent->vdev_top_zap)); 7755 } 7756 7757 if (error != 0) { 7758 kmem_free(vml, children * sizeof (vdev_t *)); 7759 kmem_free(glist, children * sizeof (uint64_t)); 7760 return (spa_vdev_exit(spa, NULL, txg, error)); 7761 } 7762 7763 /* stop writers from using the disks */ 7764 for (c = 0; c < children; c++) { 7765 if (vml[c] != NULL) 7766 vml[c]->vdev_offline = B_TRUE; 7767 } 7768 vdev_reopen(spa->spa_root_vdev); 7769 7770 /* 7771 * Temporarily record the splitting vdevs in the spa config. This 7772 * will disappear once the config is regenerated. 
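 *
 * The transient entry built below has the shape (a sketch of what the
 * code that follows produces):
 *
 *	ZPOOL_CONFIG_SPLIT (nvlist)
 *	    ZPOOL_CONFIG_SPLIT_LIST (uint64 array) =
 *	        GUIDs of the leaves being split, indexed by
 *	        top-level vdev id
 *
 * It is attached to spa_config under the props lock and also kept in
 * spa_config_splitting so that both the success and error paths can
 * find and free it.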
7773 */ 7774 nvl = fnvlist_alloc(); 7775 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 7776 kmem_free(glist, children * sizeof (uint64_t)); 7777 7778 mutex_enter(&spa->spa_props_lock); 7779 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 7780 mutex_exit(&spa->spa_props_lock); 7781 spa->spa_config_splitting = nvl; 7782 vdev_config_dirty(spa->spa_root_vdev); 7783 7784 /* configure and create the new pool */ 7785 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 7786 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 7787 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 7788 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 7789 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 7790 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 7791 spa_generate_guid(NULL)); 7792 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 7793 (void) nvlist_lookup_string(props, 7794 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 7795 7796 /* add the new pool to the namespace */ 7797 newspa = spa_add(newname, config, altroot); 7798 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 7799 newspa->spa_config_txg = spa->spa_config_txg; 7800 spa_set_log_state(newspa, SPA_LOG_CLEAR); 7801 7802 /* release the spa config lock, retaining the namespace lock */ 7803 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7804 7805 if (zio_injection_enabled) 7806 zio_handle_panic_injection(spa, FTAG, 1); 7807 7808 spa_activate(newspa, spa_mode_global); 7809 spa_async_suspend(newspa); 7810 7811 /* 7812 * Temporarily stop the initializing and TRIM activity. We set the 7813 * state to ACTIVE so that we know to resume initializing or TRIM 7814 * once the split has completed. 7815 */ 7816 list_t vd_initialize_list; 7817 list_create(&vd_initialize_list, sizeof (vdev_t), 7818 offsetof(vdev_t, vdev_initialize_node)); 7819 7820 list_t vd_trim_list; 7821 list_create(&vd_trim_list, sizeof (vdev_t), 7822 offsetof(vdev_t, vdev_trim_node)); 7823 7824 for (c = 0; c < children; c++) { 7825 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7826 mutex_enter(&vml[c]->vdev_initialize_lock); 7827 vdev_initialize_stop(vml[c], 7828 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 7829 mutex_exit(&vml[c]->vdev_initialize_lock); 7830 7831 mutex_enter(&vml[c]->vdev_trim_lock); 7832 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 7833 mutex_exit(&vml[c]->vdev_trim_lock); 7834 } 7835 } 7836 7837 vdev_initialize_stop_wait(spa, &vd_initialize_list); 7838 vdev_trim_stop_wait(spa, &vd_trim_list); 7839 7840 list_destroy(&vd_initialize_list); 7841 list_destroy(&vd_trim_list); 7842 7843 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 7844 newspa->spa_is_splitting = B_TRUE; 7845 7846 /* create the new pool from the disks of the original pool */ 7847 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 7848 if (error) 7849 goto out; 7850 7851 /* if that worked, generate a real config for the new pool */ 7852 if (newspa->spa_root_vdev != NULL) { 7853 newspa->spa_config_splitting = fnvlist_alloc(); 7854 fnvlist_add_uint64(newspa->spa_config_splitting, 7855 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 7856 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 7857 B_TRUE)); 7858 } 7859 7860 /* set the props */ 7861 if (props != NULL) { 7862 spa_configfile_set(newspa, props, B_FALSE); 7863 error = spa_prop_set(newspa, props); 7864 if (error) 7865 goto out; 7866 } 7867 7868 /* flush everything */ 7869 txg = 
spa_vdev_config_enter(newspa); 7870 vdev_config_dirty(newspa->spa_root_vdev); 7871 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 7872 7873 if (zio_injection_enabled) 7874 zio_handle_panic_injection(spa, FTAG, 2); 7875 7876 spa_async_resume(newspa); 7877 7878 /* finally, update the original pool's config */ 7879 txg = spa_vdev_config_enter(spa); 7880 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 7881 error = dmu_tx_assign(tx, TXG_WAIT); 7882 if (error != 0) 7883 dmu_tx_abort(tx); 7884 for (c = 0; c < children; c++) { 7885 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7886 vdev_t *tvd = vml[c]->vdev_top; 7887 7888 /* 7889 * Need to be sure the detachable VDEV is not 7890 * on any *other* txg's DTL list to prevent it 7891 * from being accessed after it's freed. 7892 */ 7893 for (int t = 0; t < TXG_SIZE; t++) { 7894 (void) txg_list_remove_this( 7895 &tvd->vdev_dtl_list, vml[c], t); 7896 } 7897 7898 vdev_split(vml[c]); 7899 if (error == 0) 7900 spa_history_log_internal(spa, "detach", tx, 7901 "vdev=%s", vml[c]->vdev_path); 7902 7903 vdev_free(vml[c]); 7904 } 7905 } 7906 spa->spa_avz_action = AVZ_ACTION_REBUILD; 7907 vdev_config_dirty(spa->spa_root_vdev); 7908 spa->spa_config_splitting = NULL; 7909 nvlist_free(nvl); 7910 if (error == 0) 7911 dmu_tx_commit(tx); 7912 (void) spa_vdev_exit(spa, NULL, txg, 0); 7913 7914 if (zio_injection_enabled) 7915 zio_handle_panic_injection(spa, FTAG, 3); 7916 7917 /* split is complete; log a history record */ 7918 spa_history_log_internal(newspa, "split", NULL, 7919 "from pool %s", spa_name(spa)); 7920 7921 newspa->spa_is_splitting = B_FALSE; 7922 kmem_free(vml, children * sizeof (vdev_t *)); 7923 7924 /* if we're not going to mount the filesystems in userland, export */ 7925 if (exp) 7926 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 7927 B_FALSE, B_FALSE); 7928 7929 return (error); 7930 7931 out: 7932 spa_unload(newspa); 7933 spa_deactivate(newspa); 7934 spa_remove(newspa); 7935 7936 txg = spa_vdev_config_enter(spa); 7937 7938 /* re-online all offlined disks */ 7939 for (c = 0; c < children; c++) { 7940 if (vml[c] != NULL) 7941 vml[c]->vdev_offline = B_FALSE; 7942 } 7943 7944 /* restart initializing or trimming disks as necessary */ 7945 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 7946 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 7947 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 7948 7949 vdev_reopen(spa->spa_root_vdev); 7950 7951 nvlist_free(spa->spa_config_splitting); 7952 spa->spa_config_splitting = NULL; 7953 (void) spa_vdev_exit(spa, NULL, txg, error); 7954 7955 kmem_free(vml, children * sizeof (vdev_t *)); 7956 return (error); 7957 } 7958 7959 /* 7960 * Find any device that's done replacing, or a vdev marked 'unspare' that's 7961 * currently spared, so we can detach it. 7962 */ 7963 static vdev_t * 7964 spa_vdev_resilver_done_hunt(vdev_t *vd) 7965 { 7966 vdev_t *newvd, *oldvd; 7967 7968 for (int c = 0; c < vd->vdev_children; c++) { 7969 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 7970 if (oldvd != NULL) 7971 return (oldvd); 7972 } 7973 7974 /* 7975 * Check for a completed replacement. We always consider the first 7976 * vdev in the list to be the oldest vdev, and the last one to be 7977 * the newest (see spa_vdev_attach() for how that works). In 7978 * the case where the newest vdev is faulted, we will not automatically 7979 * remove it after a resilver completes. This is OK as it will require 7980 * user intervention to determine which disk the admin wishes to keep. 
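 *
 * Concretely, for a replacing vdev the check below treats child[0] as
 * the original device and the last child as its replacement, and
 * returns the original for detach only once the replacement has no
 * DTL_MISSING or DTL_OUTAGE ranges and the original is not itself
 * required (vdev_dtl_required()).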
7981 */ 7982 if (vd->vdev_ops == &vdev_replacing_ops) { 7983 ASSERT(vd->vdev_children > 1); 7984 7985 newvd = vd->vdev_child[vd->vdev_children - 1]; 7986 oldvd = vd->vdev_child[0]; 7987 7988 if (vdev_dtl_empty(newvd, DTL_MISSING) && 7989 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7990 !vdev_dtl_required(oldvd)) 7991 return (oldvd); 7992 } 7993 7994 /* 7995 * Check for a completed resilver with the 'unspare' flag set. 7996 * Also potentially update faulted state. 7997 */ 7998 if (vd->vdev_ops == &vdev_spare_ops) { 7999 vdev_t *first = vd->vdev_child[0]; 8000 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8001 8002 if (last->vdev_unspare) { 8003 oldvd = first; 8004 newvd = last; 8005 } else if (first->vdev_unspare) { 8006 oldvd = last; 8007 newvd = first; 8008 } else { 8009 oldvd = NULL; 8010 } 8011 8012 if (oldvd != NULL && 8013 vdev_dtl_empty(newvd, DTL_MISSING) && 8014 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8015 !vdev_dtl_required(oldvd)) 8016 return (oldvd); 8017 8018 vdev_propagate_state(vd); 8019 8020 /* 8021 * If there are more than two spares attached to a disk, 8022 * and those spares are not required, then we want to 8023 * attempt to free them up now so that they can be used 8024 * by other pools. Once we're back down to a single 8025 * disk+spare, we stop removing them. 8026 */ 8027 if (vd->vdev_children > 2) { 8028 newvd = vd->vdev_child[1]; 8029 8030 if (newvd->vdev_isspare && last->vdev_isspare && 8031 vdev_dtl_empty(last, DTL_MISSING) && 8032 vdev_dtl_empty(last, DTL_OUTAGE) && 8033 !vdev_dtl_required(newvd)) 8034 return (newvd); 8035 } 8036 } 8037 8038 return (NULL); 8039 } 8040 8041 static void 8042 spa_vdev_resilver_done(spa_t *spa) 8043 { 8044 vdev_t *vd, *pvd, *ppvd; 8045 uint64_t guid, sguid, pguid, ppguid; 8046 8047 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8048 8049 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8050 pvd = vd->vdev_parent; 8051 ppvd = pvd->vdev_parent; 8052 guid = vd->vdev_guid; 8053 pguid = pvd->vdev_guid; 8054 ppguid = ppvd->vdev_guid; 8055 sguid = 0; 8056 /* 8057 * If we have just finished replacing a hot spared device, then 8058 * we need to detach the parent's first child (the original hot 8059 * spare) as well. 8060 */ 8061 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8062 ppvd->vdev_children == 2) { 8063 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8064 sguid = ppvd->vdev_child[1]->vdev_guid; 8065 } 8066 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8067 8068 spa_config_exit(spa, SCL_ALL, FTAG); 8069 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8070 return; 8071 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8072 return; 8073 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8074 } 8075 8076 spa_config_exit(spa, SCL_ALL, FTAG); 8077 8078 /* 8079 * If a detach was not performed above replace waiters will not have 8080 * been notified. In which case we must do so now. 8081 */ 8082 spa_notify_waiters(spa); 8083 } 8084 8085 /* 8086 * Update the stored path or FRU for this vdev. 
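 *
 * spa_vdev_setpath() and spa_vdev_setfru() below are thin wrappers
 * around this helper. The vdev state is entered under SCL_ALL and a
 * config sync is only requested when the stored value actually
 * changes.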
8087 */ 8088 static int 8089 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8090 boolean_t ispath) 8091 { 8092 vdev_t *vd; 8093 boolean_t sync = B_FALSE; 8094 8095 ASSERT(spa_writeable(spa)); 8096 8097 spa_vdev_state_enter(spa, SCL_ALL); 8098 8099 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8100 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8101 8102 if (!vd->vdev_ops->vdev_op_leaf) 8103 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8104 8105 if (ispath) { 8106 if (strcmp(value, vd->vdev_path) != 0) { 8107 spa_strfree(vd->vdev_path); 8108 vd->vdev_path = spa_strdup(value); 8109 sync = B_TRUE; 8110 } 8111 } else { 8112 if (vd->vdev_fru == NULL) { 8113 vd->vdev_fru = spa_strdup(value); 8114 sync = B_TRUE; 8115 } else if (strcmp(value, vd->vdev_fru) != 0) { 8116 spa_strfree(vd->vdev_fru); 8117 vd->vdev_fru = spa_strdup(value); 8118 sync = B_TRUE; 8119 } 8120 } 8121 8122 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8123 } 8124 8125 int 8126 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8127 { 8128 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8129 } 8130 8131 int 8132 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8133 { 8134 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8135 } 8136 8137 /* 8138 * ========================================================================== 8139 * SPA Scanning 8140 * ========================================================================== 8141 */ 8142 int 8143 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8144 { 8145 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8146 8147 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8148 return (SET_ERROR(EBUSY)); 8149 8150 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8151 } 8152 8153 int 8154 spa_scan_stop(spa_t *spa) 8155 { 8156 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8157 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8158 return (SET_ERROR(EBUSY)); 8159 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8160 } 8161 8162 int 8163 spa_scan(spa_t *spa, pool_scan_func_t func) 8164 { 8165 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8166 8167 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8168 return (SET_ERROR(ENOTSUP)); 8169 8170 if (func == POOL_SCAN_RESILVER && 8171 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8172 return (SET_ERROR(ENOTSUP)); 8173 8174 /* 8175 * If a resilver was requested, but there is no DTL on a 8176 * writeable leaf device, we have nothing to do. 8177 */ 8178 if (func == POOL_SCAN_RESILVER && 8179 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8180 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8181 return (0); 8182 } 8183 8184 return (dsl_scan(spa->spa_dsl_pool, func)); 8185 } 8186 8187 /* 8188 * ========================================================================== 8189 * SPA async task processing 8190 * ========================================================================== 8191 */ 8192 8193 static void 8194 spa_async_remove(spa_t *spa, vdev_t *vd) 8195 { 8196 if (vd->vdev_remove_wanted) { 8197 vd->vdev_remove_wanted = B_FALSE; 8198 vd->vdev_delayed_close = B_FALSE; 8199 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8200 8201 /* 8202 * We want to clear the stats, but we don't want to do a full 8203 * vdev_clear() as that will cause us to throw away 8204 * degraded/faulted state as well as attempt to reopen the 8205 * device, all of which is a waste. 
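 *
 * Only the error counters are reset here (read, write and checksum);
 * the rest of the vdev's stats are left intact.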
8206 */ 8207 vd->vdev_stat.vs_read_errors = 0; 8208 vd->vdev_stat.vs_write_errors = 0; 8209 vd->vdev_stat.vs_checksum_errors = 0; 8210 8211 vdev_state_dirty(vd->vdev_top); 8212 8213 /* Tell userspace that the vdev is gone. */ 8214 zfs_post_remove(spa, vd); 8215 } 8216 8217 for (int c = 0; c < vd->vdev_children; c++) 8218 spa_async_remove(spa, vd->vdev_child[c]); 8219 } 8220 8221 static void 8222 spa_async_probe(spa_t *spa, vdev_t *vd) 8223 { 8224 if (vd->vdev_probe_wanted) { 8225 vd->vdev_probe_wanted = B_FALSE; 8226 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8227 } 8228 8229 for (int c = 0; c < vd->vdev_children; c++) 8230 spa_async_probe(spa, vd->vdev_child[c]); 8231 } 8232 8233 static void 8234 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8235 { 8236 if (!spa->spa_autoexpand) 8237 return; 8238 8239 for (int c = 0; c < vd->vdev_children; c++) { 8240 vdev_t *cvd = vd->vdev_child[c]; 8241 spa_async_autoexpand(spa, cvd); 8242 } 8243 8244 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8245 return; 8246 8247 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8248 } 8249 8250 static __attribute__((noreturn)) void 8251 spa_async_thread(void *arg) 8252 { 8253 spa_t *spa = (spa_t *)arg; 8254 dsl_pool_t *dp = spa->spa_dsl_pool; 8255 int tasks; 8256 8257 ASSERT(spa->spa_sync_on); 8258 8259 mutex_enter(&spa->spa_async_lock); 8260 tasks = spa->spa_async_tasks; 8261 spa->spa_async_tasks = 0; 8262 mutex_exit(&spa->spa_async_lock); 8263 8264 /* 8265 * See if the config needs to be updated. 8266 */ 8267 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8268 uint64_t old_space, new_space; 8269 8270 mutex_enter(&spa_namespace_lock); 8271 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8272 old_space += metaslab_class_get_space(spa_special_class(spa)); 8273 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8274 old_space += metaslab_class_get_space( 8275 spa_embedded_log_class(spa)); 8276 8277 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8278 8279 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8280 new_space += metaslab_class_get_space(spa_special_class(spa)); 8281 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8282 new_space += metaslab_class_get_space( 8283 spa_embedded_log_class(spa)); 8284 mutex_exit(&spa_namespace_lock); 8285 8286 /* 8287 * If the pool grew as a result of the config update, 8288 * then log an internal history event. 8289 */ 8290 if (new_space != old_space) { 8291 spa_history_log_internal(spa, "vdev online", NULL, 8292 "pool '%s' size: %llu(+%llu)", 8293 spa_name(spa), (u_longlong_t)new_space, 8294 (u_longlong_t)(new_space - old_space)); 8295 } 8296 } 8297 8298 /* 8299 * See if any devices need to be marked REMOVED. 8300 */ 8301 if (tasks & SPA_ASYNC_REMOVE) { 8302 spa_vdev_state_enter(spa, SCL_NONE); 8303 spa_async_remove(spa, spa->spa_root_vdev); 8304 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8305 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8306 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8307 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8308 (void) spa_vdev_state_exit(spa, NULL, 0); 8309 } 8310 8311 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8312 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8313 spa_async_autoexpand(spa, spa->spa_root_vdev); 8314 spa_config_exit(spa, SCL_CONFIG, FTAG); 8315 } 8316 8317 /* 8318 * See if any devices need to be probed. 
8319 */ 8320 if (tasks & SPA_ASYNC_PROBE) { 8321 spa_vdev_state_enter(spa, SCL_NONE); 8322 spa_async_probe(spa, spa->spa_root_vdev); 8323 (void) spa_vdev_state_exit(spa, NULL, 0); 8324 } 8325 8326 /* 8327 * If any devices are done replacing, detach them. 8328 */ 8329 if (tasks & SPA_ASYNC_RESILVER_DONE || 8330 tasks & SPA_ASYNC_REBUILD_DONE || 8331 tasks & SPA_ASYNC_DETACH_SPARE) { 8332 spa_vdev_resilver_done(spa); 8333 } 8334 8335 /* 8336 * Kick off a resilver. 8337 */ 8338 if (tasks & SPA_ASYNC_RESILVER && 8339 !vdev_rebuild_active(spa->spa_root_vdev) && 8340 (!dsl_scan_resilvering(dp) || 8341 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8342 dsl_scan_restart_resilver(dp, 0); 8343 8344 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8345 mutex_enter(&spa_namespace_lock); 8346 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8347 vdev_initialize_restart(spa->spa_root_vdev); 8348 spa_config_exit(spa, SCL_CONFIG, FTAG); 8349 mutex_exit(&spa_namespace_lock); 8350 } 8351 8352 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8353 mutex_enter(&spa_namespace_lock); 8354 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8355 vdev_trim_restart(spa->spa_root_vdev); 8356 spa_config_exit(spa, SCL_CONFIG, FTAG); 8357 mutex_exit(&spa_namespace_lock); 8358 } 8359 8360 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8361 mutex_enter(&spa_namespace_lock); 8362 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8363 vdev_autotrim_restart(spa); 8364 spa_config_exit(spa, SCL_CONFIG, FTAG); 8365 mutex_exit(&spa_namespace_lock); 8366 } 8367 8368 /* 8369 * Kick off L2 cache whole device TRIM. 8370 */ 8371 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8372 mutex_enter(&spa_namespace_lock); 8373 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8374 vdev_trim_l2arc(spa); 8375 spa_config_exit(spa, SCL_CONFIG, FTAG); 8376 mutex_exit(&spa_namespace_lock); 8377 } 8378 8379 /* 8380 * Kick off L2 cache rebuilding. 8381 */ 8382 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8383 mutex_enter(&spa_namespace_lock); 8384 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8385 l2arc_spa_rebuild_start(spa); 8386 spa_config_exit(spa, SCL_L2ARC, FTAG); 8387 mutex_exit(&spa_namespace_lock); 8388 } 8389 8390 /* 8391 * Let the world know that we're done. 
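 *
 * spa_async_suspend() sleeps on spa_async_cv until spa_async_thread
 * is NULL, so clearing the pointer and broadcasting below is what
 * allows a pending suspend to make progress.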
8392 */ 8393 mutex_enter(&spa->spa_async_lock); 8394 spa->spa_async_thread = NULL; 8395 cv_broadcast(&spa->spa_async_cv); 8396 mutex_exit(&spa->spa_async_lock); 8397 thread_exit(); 8398 } 8399 8400 void 8401 spa_async_suspend(spa_t *spa) 8402 { 8403 mutex_enter(&spa->spa_async_lock); 8404 spa->spa_async_suspended++; 8405 while (spa->spa_async_thread != NULL) 8406 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8407 mutex_exit(&spa->spa_async_lock); 8408 8409 spa_vdev_remove_suspend(spa); 8410 8411 zthr_t *condense_thread = spa->spa_condense_zthr; 8412 if (condense_thread != NULL) 8413 zthr_cancel(condense_thread); 8414 8415 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8416 if (discard_thread != NULL) 8417 zthr_cancel(discard_thread); 8418 8419 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8420 if (ll_delete_thread != NULL) 8421 zthr_cancel(ll_delete_thread); 8422 8423 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8424 if (ll_condense_thread != NULL) 8425 zthr_cancel(ll_condense_thread); 8426 } 8427 8428 void 8429 spa_async_resume(spa_t *spa) 8430 { 8431 mutex_enter(&spa->spa_async_lock); 8432 ASSERT(spa->spa_async_suspended != 0); 8433 spa->spa_async_suspended--; 8434 mutex_exit(&spa->spa_async_lock); 8435 spa_restart_removal(spa); 8436 8437 zthr_t *condense_thread = spa->spa_condense_zthr; 8438 if (condense_thread != NULL) 8439 zthr_resume(condense_thread); 8440 8441 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8442 if (discard_thread != NULL) 8443 zthr_resume(discard_thread); 8444 8445 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8446 if (ll_delete_thread != NULL) 8447 zthr_resume(ll_delete_thread); 8448 8449 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8450 if (ll_condense_thread != NULL) 8451 zthr_resume(ll_condense_thread); 8452 } 8453 8454 static boolean_t 8455 spa_async_tasks_pending(spa_t *spa) 8456 { 8457 uint_t non_config_tasks; 8458 uint_t config_task; 8459 boolean_t config_task_suspended; 8460 8461 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 8462 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 8463 if (spa->spa_ccw_fail_time == 0) { 8464 config_task_suspended = B_FALSE; 8465 } else { 8466 config_task_suspended = 8467 (gethrtime() - spa->spa_ccw_fail_time) < 8468 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 8469 } 8470 8471 return (non_config_tasks || (config_task && !config_task_suspended)); 8472 } 8473 8474 static void 8475 spa_async_dispatch(spa_t *spa) 8476 { 8477 mutex_enter(&spa->spa_async_lock); 8478 if (spa_async_tasks_pending(spa) && 8479 !spa->spa_async_suspended && 8480 spa->spa_async_thread == NULL) 8481 spa->spa_async_thread = thread_create(NULL, 0, 8482 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 8483 mutex_exit(&spa->spa_async_lock); 8484 } 8485 8486 void 8487 spa_async_request(spa_t *spa, int task) 8488 { 8489 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 8490 mutex_enter(&spa->spa_async_lock); 8491 spa->spa_async_tasks |= task; 8492 mutex_exit(&spa->spa_async_lock); 8493 } 8494 8495 int 8496 spa_async_tasks(spa_t *spa) 8497 { 8498 return (spa->spa_async_tasks); 8499 } 8500 8501 /* 8502 * ========================================================================== 8503 * SPA syncing routines 8504 * ========================================================================== 8505 */ 8506 8507 8508 static int 8509 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8510 dmu_tx_t *tx) 8511 { 8512 
bpobj_t *bpo = arg; 8513 bpobj_enqueue(bpo, bp, bp_freed, tx); 8514 return (0); 8515 } 8516 8517 int 8518 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8519 { 8520 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 8521 } 8522 8523 int 8524 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8525 { 8526 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 8527 } 8528 8529 static int 8530 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8531 { 8532 zio_t *pio = arg; 8533 8534 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 8535 pio->io_flags)); 8536 return (0); 8537 } 8538 8539 static int 8540 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8541 dmu_tx_t *tx) 8542 { 8543 ASSERT(!bp_freed); 8544 return (spa_free_sync_cb(arg, bp, tx)); 8545 } 8546 8547 /* 8548 * Note: this simple function is not inlined to make it easier to dtrace the 8549 * amount of time spent syncing frees. 8550 */ 8551 static void 8552 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 8553 { 8554 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8555 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 8556 VERIFY(zio_wait(zio) == 0); 8557 } 8558 8559 /* 8560 * Note: this simple function is not inlined to make it easier to dtrace the 8561 * amount of time spent syncing deferred frees. 8562 */ 8563 static void 8564 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 8565 { 8566 if (spa_sync_pass(spa) != 1) 8567 return; 8568 8569 /* 8570 * Note: 8571 * If the log space map feature is active, we stop deferring 8572 * frees to the next TXG and therefore running this function 8573 * would be considered a no-op as spa_deferred_bpobj should 8574 * not have any entries. 8575 * 8576 * That said we run this function anyway (instead of returning 8577 * immediately) for the edge-case scenario where we just 8578 * activated the log space map feature in this TXG but we have 8579 * deferred frees from the previous TXG. 8580 */ 8581 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8582 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 8583 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 8584 VERIFY0(zio_wait(zio)); 8585 } 8586 8587 static void 8588 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 8589 { 8590 char *packed = NULL; 8591 size_t bufsize; 8592 size_t nvsize = 0; 8593 dmu_buf_t *db; 8594 8595 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 8596 8597 /* 8598 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 8599 * information. This avoids the dmu_buf_will_dirty() path and 8600 * saves us a pre-read to get data we don't actually care about. 8601 */ 8602 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 8603 packed = vmem_alloc(bufsize, KM_SLEEP); 8604 8605 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 8606 KM_SLEEP) == 0); 8607 memset(packed + nvsize, 0, bufsize - nvsize); 8608 8609 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 8610 8611 vmem_free(packed, bufsize); 8612 8613 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 8614 dmu_buf_will_dirty(db, tx); 8615 *(uint64_t *)db->db_data = nvsize; 8616 dmu_buf_rele(db, FTAG); 8617 } 8618 8619 static void 8620 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 8621 const char *config, const char *entry) 8622 { 8623 nvlist_t *nvroot; 8624 nvlist_t **list; 8625 int i; 8626 8627 if (!sav->sav_sync) 8628 return; 8629 8630 /* 8631 * Update the MOS nvlist describing the list of available devices. 
8632 * spa_validate_aux() will have already made sure this nvlist is 8633 * valid and the vdevs are labeled appropriately. 8634 */ 8635 if (sav->sav_object == 0) { 8636 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 8637 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 8638 sizeof (uint64_t), tx); 8639 VERIFY(zap_update(spa->spa_meta_objset, 8640 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 8641 &sav->sav_object, tx) == 0); 8642 } 8643 8644 nvroot = fnvlist_alloc(); 8645 if (sav->sav_count == 0) { 8646 fnvlist_add_nvlist_array(nvroot, config, 8647 (const nvlist_t * const *)NULL, 0); 8648 } else { 8649 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 8650 for (i = 0; i < sav->sav_count; i++) 8651 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 8652 B_FALSE, VDEV_CONFIG_L2CACHE); 8653 fnvlist_add_nvlist_array(nvroot, config, 8654 (const nvlist_t * const *)list, sav->sav_count); 8655 for (i = 0; i < sav->sav_count; i++) 8656 nvlist_free(list[i]); 8657 kmem_free(list, sav->sav_count * sizeof (void *)); 8658 } 8659 8660 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 8661 nvlist_free(nvroot); 8662 8663 sav->sav_sync = B_FALSE; 8664 } 8665 8666 /* 8667 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 8668 * The all-vdev ZAP must be empty. 8669 */ 8670 static void 8671 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 8672 { 8673 spa_t *spa = vd->vdev_spa; 8674 8675 if (vd->vdev_root_zap != 0 && 8676 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 8677 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8678 vd->vdev_root_zap, tx)); 8679 } 8680 if (vd->vdev_top_zap != 0) { 8681 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8682 vd->vdev_top_zap, tx)); 8683 } 8684 if (vd->vdev_leaf_zap != 0) { 8685 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8686 vd->vdev_leaf_zap, tx)); 8687 } 8688 for (uint64_t i = 0; i < vd->vdev_children; i++) { 8689 spa_avz_build(vd->vdev_child[i], avz, tx); 8690 } 8691 } 8692 8693 static void 8694 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 8695 { 8696 nvlist_t *config; 8697 8698 /* 8699 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 8700 * its config may not be dirty but we still need to build per-vdev ZAPs. 8701 * Similarly, if the pool is being assembled (e.g. after a split), we 8702 * need to rebuild the AVZ although the config may not be dirty. 
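 *
 * The avz_action chosen at import or split time drives the code
 * below: AVZ_ACTION_REBUILD constructs a fresh all-vdev ZAP and
 * destroys any per-vdev ZAPs that are no longer referenced,
 * AVZ_ACTION_DESTROY removes every listed ZAP along with the AVZ
 * itself, and in the remaining cases an empty AVZ is created on
 * demand before new per-vdev ZAPs are constructed.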
8703 */ 8704 if (list_is_empty(&spa->spa_config_dirty_list) && 8705 spa->spa_avz_action == AVZ_ACTION_NONE) 8706 return; 8707 8708 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8709 8710 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 8711 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 8712 spa->spa_all_vdev_zaps != 0); 8713 8714 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 8715 /* Make and build the new AVZ */ 8716 uint64_t new_avz = zap_create(spa->spa_meta_objset, 8717 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 8718 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 8719 8720 /* Diff old AVZ with new one */ 8721 zap_cursor_t zc; 8722 zap_attribute_t za; 8723 8724 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8725 spa->spa_all_vdev_zaps); 8726 zap_cursor_retrieve(&zc, &za) == 0; 8727 zap_cursor_advance(&zc)) { 8728 uint64_t vdzap = za.za_first_integer; 8729 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 8730 vdzap) == ENOENT) { 8731 /* 8732 * ZAP is listed in old AVZ but not in new one; 8733 * destroy it 8734 */ 8735 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 8736 tx)); 8737 } 8738 } 8739 8740 zap_cursor_fini(&zc); 8741 8742 /* Destroy the old AVZ */ 8743 VERIFY0(zap_destroy(spa->spa_meta_objset, 8744 spa->spa_all_vdev_zaps, tx)); 8745 8746 /* Replace the old AVZ in the dir obj with the new one */ 8747 VERIFY0(zap_update(spa->spa_meta_objset, 8748 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 8749 sizeof (new_avz), 1, &new_avz, tx)); 8750 8751 spa->spa_all_vdev_zaps = new_avz; 8752 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 8753 zap_cursor_t zc; 8754 zap_attribute_t za; 8755 8756 /* Walk through the AVZ and destroy all listed ZAPs */ 8757 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8758 spa->spa_all_vdev_zaps); 8759 zap_cursor_retrieve(&zc, &za) == 0; 8760 zap_cursor_advance(&zc)) { 8761 uint64_t zap = za.za_first_integer; 8762 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 8763 } 8764 8765 zap_cursor_fini(&zc); 8766 8767 /* Destroy and unlink the AVZ itself */ 8768 VERIFY0(zap_destroy(spa->spa_meta_objset, 8769 spa->spa_all_vdev_zaps, tx)); 8770 VERIFY0(zap_remove(spa->spa_meta_objset, 8771 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 8772 spa->spa_all_vdev_zaps = 0; 8773 } 8774 8775 if (spa->spa_all_vdev_zaps == 0) { 8776 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 8777 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 8778 DMU_POOL_VDEV_ZAP_MAP, tx); 8779 } 8780 spa->spa_avz_action = AVZ_ACTION_NONE; 8781 8782 /* Create ZAPs for vdevs that don't have them. */ 8783 vdev_construct_zaps(spa->spa_root_vdev, tx); 8784 8785 config = spa_config_generate(spa, spa->spa_root_vdev, 8786 dmu_tx_get_txg(tx), B_FALSE); 8787 8788 /* 8789 * If we're upgrading the spa version then make sure that 8790 * the config object gets updated with the correct version. 8791 */ 8792 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 8793 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 8794 spa->spa_uberblock.ub_version); 8795 8796 spa_config_exit(spa, SCL_STATE, FTAG); 8797 8798 nvlist_free(spa->spa_config_syncing); 8799 spa->spa_config_syncing = config; 8800 8801 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 8802 } 8803 8804 static void 8805 spa_sync_version(void *arg, dmu_tx_t *tx) 8806 { 8807 uint64_t *versionp = arg; 8808 uint64_t version = *versionp; 8809 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8810 8811 /* 8812 * Setting the version is special cased when first creating the pool. 
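 *
 * This sync task is never used at pool creation (TXG_INITIAL is
 * asserted against below). When it does run, the version is synced
 * before the other properties, which is why the ZPOOL_PROP_VERSION
 * case in spa_sync_props() only needs to verify the value.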
8813 */ 8814 ASSERT(tx->tx_txg != TXG_INITIAL); 8815 8816 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 8817 ASSERT(version >= spa_version(spa)); 8818 8819 spa->spa_uberblock.ub_version = version; 8820 vdev_config_dirty(spa->spa_root_vdev); 8821 spa_history_log_internal(spa, "set", tx, "version=%lld", 8822 (longlong_t)version); 8823 } 8824 8825 /* 8826 * Set zpool properties. 8827 */ 8828 static void 8829 spa_sync_props(void *arg, dmu_tx_t *tx) 8830 { 8831 nvlist_t *nvp = arg; 8832 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8833 objset_t *mos = spa->spa_meta_objset; 8834 nvpair_t *elem = NULL; 8835 8836 mutex_enter(&spa->spa_props_lock); 8837 8838 while ((elem = nvlist_next_nvpair(nvp, elem))) { 8839 uint64_t intval; 8840 const char *strval, *fname; 8841 zpool_prop_t prop; 8842 const char *propname; 8843 const char *elemname = nvpair_name(elem); 8844 zprop_type_t proptype; 8845 spa_feature_t fid; 8846 8847 switch (prop = zpool_name_to_prop(elemname)) { 8848 case ZPOOL_PROP_VERSION: 8849 intval = fnvpair_value_uint64(elem); 8850 /* 8851 * The version is synced separately before other 8852 * properties and should be correct by now. 8853 */ 8854 ASSERT3U(spa_version(spa), >=, intval); 8855 break; 8856 8857 case ZPOOL_PROP_ALTROOT: 8858 /* 8859 * 'altroot' is a non-persistent property. It should 8860 * have been set temporarily at creation or import time. 8861 */ 8862 ASSERT(spa->spa_root != NULL); 8863 break; 8864 8865 case ZPOOL_PROP_READONLY: 8866 case ZPOOL_PROP_CACHEFILE: 8867 /* 8868 * 'readonly' and 'cachefile' are also non-persistent 8869 * properties. 8870 */ 8871 break; 8872 case ZPOOL_PROP_COMMENT: 8873 strval = fnvpair_value_string(elem); 8874 if (spa->spa_comment != NULL) 8875 spa_strfree(spa->spa_comment); 8876 spa->spa_comment = spa_strdup(strval); 8877 /* 8878 * We need to dirty the configuration on all the vdevs 8879 * so that their labels get updated. We also need to 8880 * update the cache file to keep it in sync with the 8881 * MOS version. It's unnecessary to do this for pool 8882 * creation since the vdev's configuration has already 8883 * been dirtied. 8884 */ 8885 if (tx->tx_txg != TXG_INITIAL) { 8886 vdev_config_dirty(spa->spa_root_vdev); 8887 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8888 } 8889 spa_history_log_internal(spa, "set", tx, 8890 "%s=%s", elemname, strval); 8891 break; 8892 case ZPOOL_PROP_COMPATIBILITY: 8893 strval = fnvpair_value_string(elem); 8894 if (spa->spa_compatibility != NULL) 8895 spa_strfree(spa->spa_compatibility); 8896 spa->spa_compatibility = spa_strdup(strval); 8897 /* 8898 * Dirty the configuration on vdevs as above. 8899 */ 8900 if (tx->tx_txg != TXG_INITIAL) { 8901 vdev_config_dirty(spa->spa_root_vdev); 8902 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8903 } 8904 8905 spa_history_log_internal(spa, "set", tx, 8906 "%s=%s", nvpair_name(elem), strval); 8907 break; 8908 8909 case ZPOOL_PROP_INVAL: 8910 if (zpool_prop_feature(elemname)) { 8911 fname = strchr(elemname, '@') + 1; 8912 VERIFY0(zfeature_lookup_name(fname, &fid)); 8913 8914 spa_feature_enable(spa, fid, tx); 8915 spa_history_log_internal(spa, "set", tx, 8916 "%s=enabled", elemname); 8917 break; 8918 } else if (!zfs_prop_user(elemname)) { 8919 ASSERT(zpool_prop_feature(elemname)); 8920 break; 8921 } 8922 zfs_fallthrough; 8923 default: 8924 /* 8925 * Set pool property values in the poolprops mos object. 
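 *
 * Encoding in that ZAP (matching the code below): string values are
 * stored with an integer size of 1 and a length of strlen() + 1,
 * numeric values as a single uint64. Feature "enabled" requests are
 * handled above and never reach this point, and user properties keep
 * their original name rather than the zpool_prop_to_name() form.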
8926 */ 8927 if (spa->spa_pool_props_object == 0) { 8928 spa->spa_pool_props_object = 8929 zap_create_link(mos, DMU_OT_POOL_PROPS, 8930 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 8931 tx); 8932 } 8933 8934 /* normalize the property name */ 8935 propname = zpool_prop_to_name(prop); 8936 proptype = zpool_prop_get_type(prop); 8937 if (prop == ZPOOL_PROP_INVAL && 8938 zfs_prop_user(elemname)) { 8939 propname = elemname; 8940 proptype = PROP_TYPE_STRING; 8941 } 8942 8943 if (nvpair_type(elem) == DATA_TYPE_STRING) { 8944 ASSERT(proptype == PROP_TYPE_STRING); 8945 strval = fnvpair_value_string(elem); 8946 VERIFY0(zap_update(mos, 8947 spa->spa_pool_props_object, propname, 8948 1, strlen(strval) + 1, strval, tx)); 8949 spa_history_log_internal(spa, "set", tx, 8950 "%s=%s", elemname, strval); 8951 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 8952 intval = fnvpair_value_uint64(elem); 8953 8954 if (proptype == PROP_TYPE_INDEX) { 8955 const char *unused; 8956 VERIFY0(zpool_prop_index_to_string( 8957 prop, intval, &unused)); 8958 } 8959 VERIFY0(zap_update(mos, 8960 spa->spa_pool_props_object, propname, 8961 8, 1, &intval, tx)); 8962 spa_history_log_internal(spa, "set", tx, 8963 "%s=%lld", elemname, 8964 (longlong_t)intval); 8965 8966 switch (prop) { 8967 case ZPOOL_PROP_DELEGATION: 8968 spa->spa_delegation = intval; 8969 break; 8970 case ZPOOL_PROP_BOOTFS: 8971 spa->spa_bootfs = intval; 8972 break; 8973 case ZPOOL_PROP_FAILUREMODE: 8974 spa->spa_failmode = intval; 8975 break; 8976 case ZPOOL_PROP_AUTOTRIM: 8977 spa->spa_autotrim = intval; 8978 spa_async_request(spa, 8979 SPA_ASYNC_AUTOTRIM_RESTART); 8980 break; 8981 case ZPOOL_PROP_AUTOEXPAND: 8982 spa->spa_autoexpand = intval; 8983 if (tx->tx_txg != TXG_INITIAL) 8984 spa_async_request(spa, 8985 SPA_ASYNC_AUTOEXPAND); 8986 break; 8987 case ZPOOL_PROP_MULTIHOST: 8988 spa->spa_multihost = intval; 8989 break; 8990 default: 8991 break; 8992 } 8993 } else { 8994 ASSERT(0); /* not allowed */ 8995 } 8996 } 8997 8998 } 8999 9000 mutex_exit(&spa->spa_props_lock); 9001 } 9002 9003 /* 9004 * Perform one-time upgrade on-disk changes. spa_version() does not 9005 * reflect the new version this txg, so there must be no changes this 9006 * txg to anything that the upgrade code depends on after it executes. 9007 * Therefore this must be called after dsl_pool_sync() does the sync 9008 * tasks. 
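 *
 * This only runs in the first sync pass and holds dp_config_rwlock as
 * writer for the duration. Each upgrade step below fires in the txg
 * where spa_uberblock.ub_version first reaches the relevant
 * SPA_VERSION_* threshold while spa_ubsync still carries the previous
 * version, so every step executes at most once per pool.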
9009 */ 9010 static void 9011 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9012 { 9013 if (spa_sync_pass(spa) != 1) 9014 return; 9015 9016 dsl_pool_t *dp = spa->spa_dsl_pool; 9017 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9018 9019 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9020 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9021 dsl_pool_create_origin(dp, tx); 9022 9023 /* Keeping the origin open increases spa_minref */ 9024 spa->spa_minref += 3; 9025 } 9026 9027 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9028 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9029 dsl_pool_upgrade_clones(dp, tx); 9030 } 9031 9032 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9033 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9034 dsl_pool_upgrade_dir_clones(dp, tx); 9035 9036 /* Keeping the freedir open increases spa_minref */ 9037 spa->spa_minref += 3; 9038 } 9039 9040 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9041 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9042 spa_feature_create_zap_objects(spa, tx); 9043 } 9044 9045 /* 9046 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9047 * when possibility to use lz4 compression for metadata was added 9048 * Old pools that have this feature enabled must be upgraded to have 9049 * this feature active 9050 */ 9051 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9052 boolean_t lz4_en = spa_feature_is_enabled(spa, 9053 SPA_FEATURE_LZ4_COMPRESS); 9054 boolean_t lz4_ac = spa_feature_is_active(spa, 9055 SPA_FEATURE_LZ4_COMPRESS); 9056 9057 if (lz4_en && !lz4_ac) 9058 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9059 } 9060 9061 /* 9062 * If we haven't written the salt, do so now. Note that the 9063 * feature may not be activated yet, but that's fine since 9064 * the presence of this ZAP entry is backwards compatible. 9065 */ 9066 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9067 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9068 VERIFY0(zap_add(spa->spa_meta_objset, 9069 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9070 sizeof (spa->spa_cksum_salt.zcs_bytes), 9071 spa->spa_cksum_salt.zcs_bytes, tx)); 9072 } 9073 9074 rrw_exit(&dp->dp_config_rwlock, FTAG); 9075 } 9076 9077 static void 9078 vdev_indirect_state_sync_verify(vdev_t *vd) 9079 { 9080 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9081 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9082 9083 if (vd->vdev_ops == &vdev_indirect_ops) { 9084 ASSERT(vim != NULL); 9085 ASSERT(vib != NULL); 9086 } 9087 9088 uint64_t obsolete_sm_object = 0; 9089 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9090 if (obsolete_sm_object != 0) { 9091 ASSERT(vd->vdev_obsolete_sm != NULL); 9092 ASSERT(vd->vdev_removing || 9093 vd->vdev_ops == &vdev_indirect_ops); 9094 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9095 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9096 ASSERT3U(obsolete_sm_object, ==, 9097 space_map_object(vd->vdev_obsolete_sm)); 9098 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9099 space_map_allocated(vd->vdev_obsolete_sm)); 9100 } 9101 ASSERT(vd->vdev_obsolete_segments != NULL); 9102 9103 /* 9104 * Since frees / remaps to an indirect vdev can only 9105 * happen in syncing context, the obsolete segments 9106 * tree must be empty when we start syncing. 
9107 */ 9108 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9109 } 9110 9111 /* 9112 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9113 * async write queue depth in case it changed. The max queue depth will 9114 * not change in the middle of syncing out this txg. 9115 */ 9116 static void 9117 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9118 { 9119 ASSERT(spa_writeable(spa)); 9120 9121 vdev_t *rvd = spa->spa_root_vdev; 9122 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9123 zfs_vdev_queue_depth_pct / 100; 9124 metaslab_class_t *normal = spa_normal_class(spa); 9125 metaslab_class_t *special = spa_special_class(spa); 9126 metaslab_class_t *dedup = spa_dedup_class(spa); 9127 9128 uint64_t slots_per_allocator = 0; 9129 for (int c = 0; c < rvd->vdev_children; c++) { 9130 vdev_t *tvd = rvd->vdev_child[c]; 9131 9132 metaslab_group_t *mg = tvd->vdev_mg; 9133 if (mg == NULL || !metaslab_group_initialized(mg)) 9134 continue; 9135 9136 metaslab_class_t *mc = mg->mg_class; 9137 if (mc != normal && mc != special && mc != dedup) 9138 continue; 9139 9140 /* 9141 * It is safe to do a lock-free check here because only async 9142 * allocations look at mg_max_alloc_queue_depth, and async 9143 * allocations all happen from spa_sync(). 9144 */ 9145 for (int i = 0; i < mg->mg_allocators; i++) { 9146 ASSERT0(zfs_refcount_count( 9147 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9148 } 9149 mg->mg_max_alloc_queue_depth = max_queue_depth; 9150 9151 for (int i = 0; i < mg->mg_allocators; i++) { 9152 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9153 zfs_vdev_def_queue_depth; 9154 } 9155 slots_per_allocator += zfs_vdev_def_queue_depth; 9156 } 9157 9158 for (int i = 0; i < spa->spa_alloc_count; i++) { 9159 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9160 mca_alloc_slots)); 9161 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9162 mca_alloc_slots)); 9163 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
9164 mca_alloc_slots)); 9165 normal->mc_allocator[i].mca_alloc_max_slots = 9166 slots_per_allocator; 9167 special->mc_allocator[i].mca_alloc_max_slots = 9168 slots_per_allocator; 9169 dedup->mc_allocator[i].mca_alloc_max_slots = 9170 slots_per_allocator; 9171 } 9172 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9173 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9174 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9175 } 9176 9177 static void 9178 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9179 { 9180 ASSERT(spa_writeable(spa)); 9181 9182 vdev_t *rvd = spa->spa_root_vdev; 9183 for (int c = 0; c < rvd->vdev_children; c++) { 9184 vdev_t *vd = rvd->vdev_child[c]; 9185 vdev_indirect_state_sync_verify(vd); 9186 9187 if (vdev_indirect_should_condense(vd)) { 9188 spa_condense_indirect_start_sync(vd, tx); 9189 break; 9190 } 9191 } 9192 } 9193 9194 static void 9195 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9196 { 9197 objset_t *mos = spa->spa_meta_objset; 9198 dsl_pool_t *dp = spa->spa_dsl_pool; 9199 uint64_t txg = tx->tx_txg; 9200 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9201 9202 do { 9203 int pass = ++spa->spa_sync_pass; 9204 9205 spa_sync_config_object(spa, tx); 9206 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9207 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9208 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9209 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9210 spa_errlog_sync(spa, txg); 9211 dsl_pool_sync(dp, txg); 9212 9213 if (pass < zfs_sync_pass_deferred_free || 9214 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9215 /* 9216 * If the log space map feature is active we don't 9217 * care about deferred frees and the deferred bpobj 9218 * as the log space map should effectively have the 9219 * same results (i.e. appending only to one object). 9220 */ 9221 spa_sync_frees(spa, free_bpl, tx); 9222 } else { 9223 /* 9224 * We can not defer frees in pass 1, because 9225 * we sync the deferred frees later in pass 1. 9226 */ 9227 ASSERT3U(pass, >, 1); 9228 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9229 &spa->spa_deferred_bpobj, tx); 9230 } 9231 9232 brt_sync(spa, txg); 9233 ddt_sync(spa, txg); 9234 dsl_scan_sync(dp, tx); 9235 svr_sync(spa, tx); 9236 spa_sync_upgrades(spa, tx); 9237 9238 spa_flush_metaslabs(spa, tx); 9239 9240 vdev_t *vd = NULL; 9241 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9242 != NULL) 9243 vdev_sync(vd, txg); 9244 9245 /* 9246 * Note: We need to check if the MOS is dirty because we could 9247 * have marked the MOS dirty without updating the uberblock 9248 * (e.g. if we have sync tasks but no dirty user data). We need 9249 * to check the uberblock's rootbp because it is updated if we 9250 * have synced out dirty data (though in this case the MOS will 9251 * most likely also be dirty due to second order effects, we 9252 * don't want to rely on that here). 9253 */ 9254 if (pass == 1 && 9255 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9256 !dmu_objset_is_dirty(mos, txg)) { 9257 /* 9258 * Nothing changed on the first pass, therefore this 9259 * TXG is a no-op. Avoid syncing deferred frees, so 9260 * that we can keep this TXG as a no-op. 
9261 */ 9262 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9263 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9264 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9265 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9266 break; 9267 } 9268 9269 spa_sync_deferred_frees(spa, tx); 9270 } while (dmu_objset_is_dirty(mos, txg)); 9271 } 9272 9273 /* 9274 * Rewrite the vdev configuration (which includes the uberblock) to 9275 * commit the transaction group. 9276 * 9277 * If there are no dirty vdevs, we sync the uberblock to a few random 9278 * top-level vdevs that are known to be visible in the config cache 9279 * (see spa_vdev_add() for a complete description). If there *are* dirty 9280 * vdevs, sync the uberblock to all vdevs. 9281 */ 9282 static void 9283 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9284 { 9285 vdev_t *rvd = spa->spa_root_vdev; 9286 uint64_t txg = tx->tx_txg; 9287 9288 for (;;) { 9289 int error = 0; 9290 9291 /* 9292 * We hold SCL_STATE to prevent vdev open/close/etc. 9293 * while we're attempting to write the vdev labels. 9294 */ 9295 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9296 9297 if (list_is_empty(&spa->spa_config_dirty_list)) { 9298 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9299 int svdcount = 0; 9300 int children = rvd->vdev_children; 9301 int c0 = random_in_range(children); 9302 9303 for (int c = 0; c < children; c++) { 9304 vdev_t *vd = 9305 rvd->vdev_child[(c0 + c) % children]; 9306 9307 /* Stop when revisiting the first vdev */ 9308 if (c > 0 && svd[0] == vd) 9309 break; 9310 9311 if (vd->vdev_ms_array == 0 || 9312 vd->vdev_islog || 9313 !vdev_is_concrete(vd)) 9314 continue; 9315 9316 svd[svdcount++] = vd; 9317 if (svdcount == SPA_SYNC_MIN_VDEVS) 9318 break; 9319 } 9320 error = vdev_config_sync(svd, svdcount, txg); 9321 } else { 9322 error = vdev_config_sync(rvd->vdev_child, 9323 rvd->vdev_children, txg); 9324 } 9325 9326 if (error == 0) 9327 spa->spa_last_synced_guid = rvd->vdev_guid; 9328 9329 spa_config_exit(spa, SCL_STATE, FTAG); 9330 9331 if (error == 0) 9332 break; 9333 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9334 zio_resume_wait(spa); 9335 } 9336 } 9337 9338 /* 9339 * Sync the specified transaction group. New blocks may be dirtied as 9340 * part of the process, so we iterate until it converges. 9341 */ 9342 void 9343 spa_sync(spa_t *spa, uint64_t txg) 9344 { 9345 vdev_t *vd = NULL; 9346 9347 VERIFY(spa_writeable(spa)); 9348 9349 /* 9350 * Wait for i/os issued in open context that need to complete 9351 * before this txg syncs. 9352 */ 9353 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9354 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9355 ZIO_FLAG_CANFAIL); 9356 9357 /* 9358 * Now that there can be no more cloning in this transaction group, 9359 * but we are still before issuing frees, we can process pending BRT 9360 * updates. 9361 */ 9362 brt_pending_apply(spa, txg); 9363 9364 /* 9365 * Lock out configuration changes. 9366 */ 9367 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9368 9369 spa->spa_syncing_txg = txg; 9370 spa->spa_sync_pass = 0; 9371 9372 for (int i = 0; i < spa->spa_alloc_count; i++) { 9373 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9374 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9375 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9376 } 9377 9378 /* 9379 * If there are any pending vdev state changes, convert them 9380 * into config changes that go out with this transaction group. 
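	 * Vdevs land on spa_state_dirty_list via vdev_state_dirty(); below,
	 * each one is promoted to the config-dirty list with
	 * vdev_config_dirty() so the updated state is written out along with
	 * this txg's labels.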
9381 */ 9382 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9383 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9384 /* Avoid holding the write lock unless actually necessary */ 9385 if (vd->vdev_aux == NULL) { 9386 vdev_state_clean(vd); 9387 vdev_config_dirty(vd); 9388 continue; 9389 } 9390 /* 9391 * We need the write lock here because, for aux vdevs, 9392 * calling vdev_config_dirty() modifies sav_config. 9393 * This is ugly and will become unnecessary when we 9394 * eliminate the aux vdev wart by integrating all vdevs 9395 * into the root vdev tree. 9396 */ 9397 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9398 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9399 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9400 vdev_state_clean(vd); 9401 vdev_config_dirty(vd); 9402 } 9403 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9404 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9405 } 9406 spa_config_exit(spa, SCL_STATE, FTAG); 9407 9408 dsl_pool_t *dp = spa->spa_dsl_pool; 9409 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9410 9411 spa->spa_sync_starttime = gethrtime(); 9412 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9413 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9414 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9415 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9416 9417 /* 9418 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9419 * set spa_deflate if we have no raid-z vdevs. 9420 */ 9421 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9422 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9423 vdev_t *rvd = spa->spa_root_vdev; 9424 9425 int i; 9426 for (i = 0; i < rvd->vdev_children; i++) { 9427 vd = rvd->vdev_child[i]; 9428 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9429 break; 9430 } 9431 if (i == rvd->vdev_children) { 9432 spa->spa_deflate = TRUE; 9433 VERIFY0(zap_add(spa->spa_meta_objset, 9434 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 9435 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 9436 } 9437 } 9438 9439 spa_sync_adjust_vdev_max_queue_depth(spa); 9440 9441 spa_sync_condense_indirect(spa, tx); 9442 9443 spa_sync_iterate_to_convergence(spa, tx); 9444 9445 #ifdef ZFS_DEBUG 9446 if (!list_is_empty(&spa->spa_config_dirty_list)) { 9447 /* 9448 * Make sure that the number of ZAPs for all the vdevs matches 9449 * the number of ZAPs in the per-vdev ZAP list. This only gets 9450 * called if the config is dirty; otherwise there may be 9451 * outstanding AVZ operations that weren't completed in 9452 * spa_sync_config_object. 9453 */ 9454 uint64_t all_vdev_zap_entry_count; 9455 ASSERT0(zap_count(spa->spa_meta_objset, 9456 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 9457 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 9458 all_vdev_zap_entry_count); 9459 } 9460 #endif 9461 9462 if (spa->spa_vdev_removal != NULL) { 9463 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 9464 } 9465 9466 spa_sync_rewrite_vdev_config(spa, tx); 9467 dmu_tx_commit(tx); 9468 9469 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9470 spa->spa_deadman_tqid = 0; 9471 9472 /* 9473 * Clear the dirty config list. 9474 */ 9475 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 9476 vdev_config_clean(vd); 9477 9478 /* 9479 * Now that the new config has synced transactionally, 9480 * let it become visible to the config cache. 
9481 */ 9482 if (spa->spa_config_syncing != NULL) { 9483 spa_config_set(spa, spa->spa_config_syncing); 9484 spa->spa_config_txg = txg; 9485 spa->spa_config_syncing = NULL; 9486 } 9487 9488 dsl_pool_sync_done(dp, txg); 9489 9490 for (int i = 0; i < spa->spa_alloc_count; i++) { 9491 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9492 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9493 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9494 } 9495 9496 /* 9497 * Update usable space statistics. 9498 */ 9499 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 9500 != NULL) 9501 vdev_sync_done(vd, txg); 9502 9503 metaslab_class_evict_old(spa->spa_normal_class, txg); 9504 metaslab_class_evict_old(spa->spa_log_class, txg); 9505 9506 spa_sync_close_syncing_log_sm(spa); 9507 9508 spa_update_dspace(spa); 9509 9510 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 9511 vdev_autotrim_kick(spa); 9512 9513 /* 9514 * It had better be the case that we didn't dirty anything 9515 * since vdev_config_sync(). 9516 */ 9517 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9518 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9519 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 9520 9521 while (zfs_pause_spa_sync) 9522 delay(1); 9523 9524 spa->spa_sync_pass = 0; 9525 9526 /* 9527 * Update the last synced uberblock here. We want to do this at 9528 * the end of spa_sync() so that consumers of spa_last_synced_txg() 9529 * will be guaranteed that all the processing associated with 9530 * that txg has been completed. 9531 */ 9532 spa->spa_ubsync = spa->spa_uberblock; 9533 spa_config_exit(spa, SCL_CONFIG, FTAG); 9534 9535 spa_handle_ignored_writes(spa); 9536 9537 /* 9538 * If any async tasks have been requested, kick them off. 9539 */ 9540 spa_async_dispatch(spa); 9541 } 9542 9543 /* 9544 * Sync all pools. We don't want to hold the namespace lock across these 9545 * operations, so we take a reference on the spa_t and drop the lock during the 9546 * sync. 9547 */ 9548 void 9549 spa_sync_allpools(void) 9550 { 9551 spa_t *spa = NULL; 9552 mutex_enter(&spa_namespace_lock); 9553 while ((spa = spa_next(spa)) != NULL) { 9554 if (spa_state(spa) != POOL_STATE_ACTIVE || 9555 !spa_writeable(spa) || spa_suspended(spa)) 9556 continue; 9557 spa_open_ref(spa, FTAG); 9558 mutex_exit(&spa_namespace_lock); 9559 txg_wait_synced(spa_get_dsl(spa), 0); 9560 mutex_enter(&spa_namespace_lock); 9561 spa_close(spa, FTAG); 9562 } 9563 mutex_exit(&spa_namespace_lock); 9564 } 9565 9566 /* 9567 * ========================================================================== 9568 * Miscellaneous routines 9569 * ========================================================================== 9570 */ 9571 9572 /* 9573 * Remove all pools in the system. 9574 */ 9575 void 9576 spa_evict_all(void) 9577 { 9578 spa_t *spa; 9579 9580 /* 9581 * Remove all cached state. All pools should be closed now, 9582 * so every spa in the AVL tree should be unreferenced. 9583 */ 9584 mutex_enter(&spa_namespace_lock); 9585 while ((spa = spa_next(NULL)) != NULL) { 9586 /* 9587 * Stop async tasks. The async thread may need to detach 9588 * a device that's been replaced, which requires grabbing 9589 * spa_namespace_lock, so we must drop it here. 
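		 * The spa_open_ref()/spa_close() pair below keeps the spa_t
		 * from being freed while spa_namespace_lock is dropped.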
9590 */ 9591 spa_open_ref(spa, FTAG); 9592 mutex_exit(&spa_namespace_lock); 9593 spa_async_suspend(spa); 9594 mutex_enter(&spa_namespace_lock); 9595 spa_close(spa, FTAG); 9596 9597 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 9598 spa_unload(spa); 9599 spa_deactivate(spa); 9600 } 9601 spa_remove(spa); 9602 } 9603 mutex_exit(&spa_namespace_lock); 9604 } 9605 9606 vdev_t * 9607 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 9608 { 9609 vdev_t *vd; 9610 int i; 9611 9612 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 9613 return (vd); 9614 9615 if (aux) { 9616 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 9617 vd = spa->spa_l2cache.sav_vdevs[i]; 9618 if (vd->vdev_guid == guid) 9619 return (vd); 9620 } 9621 9622 for (i = 0; i < spa->spa_spares.sav_count; i++) { 9623 vd = spa->spa_spares.sav_vdevs[i]; 9624 if (vd->vdev_guid == guid) 9625 return (vd); 9626 } 9627 } 9628 9629 return (NULL); 9630 } 9631 9632 void 9633 spa_upgrade(spa_t *spa, uint64_t version) 9634 { 9635 ASSERT(spa_writeable(spa)); 9636 9637 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9638 9639 /* 9640 * This should only be called for a non-faulted pool, and since a 9641 * future version would result in an unopenable pool, this shouldn't be 9642 * possible. 9643 */ 9644 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 9645 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 9646 9647 spa->spa_uberblock.ub_version = version; 9648 vdev_config_dirty(spa->spa_root_vdev); 9649 9650 spa_config_exit(spa, SCL_ALL, FTAG); 9651 9652 txg_wait_synced(spa_get_dsl(spa), 0); 9653 } 9654 9655 static boolean_t 9656 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 9657 { 9658 (void) spa; 9659 int i; 9660 uint64_t vdev_guid; 9661 9662 for (i = 0; i < sav->sav_count; i++) 9663 if (sav->sav_vdevs[i]->vdev_guid == guid) 9664 return (B_TRUE); 9665 9666 for (i = 0; i < sav->sav_npending; i++) { 9667 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 9668 &vdev_guid) == 0 && vdev_guid == guid) 9669 return (B_TRUE); 9670 } 9671 9672 return (B_FALSE); 9673 } 9674 9675 boolean_t 9676 spa_has_l2cache(spa_t *spa, uint64_t guid) 9677 { 9678 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 9679 } 9680 9681 boolean_t 9682 spa_has_spare(spa_t *spa, uint64_t guid) 9683 { 9684 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 9685 } 9686 9687 /* 9688 * Check if a pool has an active shared spare device. 9689 * Note: reference count of an active spare is 2, as a spare and as a replace 9690 */ 9691 static boolean_t 9692 spa_has_active_shared_spare(spa_t *spa) 9693 { 9694 int i, refcnt; 9695 uint64_t pool; 9696 spa_aux_vdev_t *sav = &spa->spa_spares; 9697 9698 for (i = 0; i < sav->sav_count; i++) { 9699 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 9700 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 9701 refcnt > 2) 9702 return (B_TRUE); 9703 } 9704 9705 return (B_FALSE); 9706 } 9707 9708 uint64_t 9709 spa_total_metaslabs(spa_t *spa) 9710 { 9711 vdev_t *rvd = spa->spa_root_vdev; 9712 9713 uint64_t m = 0; 9714 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 9715 vdev_t *vd = rvd->vdev_child[c]; 9716 if (!vdev_is_concrete(vd)) 9717 continue; 9718 m += vd->vdev_ms_count; 9719 } 9720 return (m); 9721 } 9722 9723 /* 9724 * Notify any waiting threads that some activity has switched from being in- 9725 * progress to not-in-progress so that the thread can wake up and determine 9726 * whether it is finished waiting. 
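 *
 * A completing thread is expected to update the in-memory state for its
 * activity (under that activity's own lock) and then call
 * spa_notify_waiters(); see the "Locking for waiting threads" comment
 * below for the full rules.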
9727 */ 9728 void 9729 spa_notify_waiters(spa_t *spa) 9730 { 9731 /* 9732 * Acquiring spa_activities_lock here prevents the cv_broadcast from 9733 * happening between the waiting thread's check and cv_wait. 9734 */ 9735 mutex_enter(&spa->spa_activities_lock); 9736 cv_broadcast(&spa->spa_activities_cv); 9737 mutex_exit(&spa->spa_activities_lock); 9738 } 9739 9740 /* 9741 * Notify any waiting threads that the pool is exporting, and then block until 9742 * they are finished using the spa_t. 9743 */ 9744 void 9745 spa_wake_waiters(spa_t *spa) 9746 { 9747 mutex_enter(&spa->spa_activities_lock); 9748 spa->spa_waiters_cancel = B_TRUE; 9749 cv_broadcast(&spa->spa_activities_cv); 9750 while (spa->spa_waiters != 0) 9751 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 9752 spa->spa_waiters_cancel = B_FALSE; 9753 mutex_exit(&spa->spa_activities_lock); 9754 } 9755 9756 /* Whether the vdev or any of its descendants are being initialized/trimmed. */ 9757 static boolean_t 9758 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 9759 { 9760 spa_t *spa = vd->vdev_spa; 9761 9762 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 9763 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9764 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 9765 activity == ZPOOL_WAIT_TRIM); 9766 9767 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 9768 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 9769 9770 mutex_exit(&spa->spa_activities_lock); 9771 mutex_enter(lock); 9772 mutex_enter(&spa->spa_activities_lock); 9773 9774 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 9775 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 9776 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 9777 mutex_exit(lock); 9778 9779 if (in_progress) 9780 return (B_TRUE); 9781 9782 for (int i = 0; i < vd->vdev_children; i++) { 9783 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 9784 activity)) 9785 return (B_TRUE); 9786 } 9787 9788 return (B_FALSE); 9789 } 9790 9791 /* 9792 * If use_guid is true, this checks whether the vdev specified by guid is 9793 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 9794 * is being initialized/trimmed. The caller must hold the config lock and 9795 * spa_activities_lock. 9796 */ 9797 static int 9798 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 9799 zpool_wait_activity_t activity, boolean_t *in_progress) 9800 { 9801 mutex_exit(&spa->spa_activities_lock); 9802 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9803 mutex_enter(&spa->spa_activities_lock); 9804 9805 vdev_t *vd; 9806 if (use_guid) { 9807 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 9808 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 9809 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9810 return (EINVAL); 9811 } 9812 } else { 9813 vd = spa->spa_root_vdev; 9814 } 9815 9816 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 9817 9818 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9819 return (0); 9820 } 9821 9822 /* 9823 * Locking for waiting threads 9824 * --------------------------- 9825 * 9826 * Waiting threads need a way to check whether a given activity is in progress, 9827 * and then, if it is, wait for it to complete. Each activity will have some 9828 * in-memory representation of the relevant on-disk state which can be used to 9829 * determine whether or not the activity is in progress. 
The in-memory state and 9830 * the locking used to protect it will be different for each activity, and may 9831 * not be suitable for use with a cvar (e.g., some state is protected by the 9832 * config lock). To allow waiting threads to wait without any races, another 9833 * lock, spa_activities_lock, is used. 9834 * 9835 * When the state is checked, both the activity-specific lock (if there is one) 9836 * and spa_activities_lock are held. In some cases, the activity-specific lock 9837 * is acquired explicitly (e.g. the config lock). In others, the locking is 9838 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 9839 * thread releases the activity-specific lock and, if the activity is in 9840 * progress, then cv_waits using spa_activities_lock. 9841 * 9842 * The waiting thread is woken when another thread, one completing some 9843 * activity, updates the state of the activity and then calls 9844 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 9845 * needs to hold its activity-specific lock when updating the state, and this 9846 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 9847 * 9848 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 9849 * and because it is held when the waiting thread checks the state of the 9850 * activity, it can never be the case that the completing thread both updates 9851 * the activity state and cv_broadcasts in between the waiting thread's check 9852 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 9853 * 9854 * In order to prevent deadlock, when the waiting thread does its check, in some 9855 * cases it will temporarily drop spa_activities_lock in order to acquire the 9856 * activity-specific lock. The order in which spa_activities_lock and the 9857 * activity specific lock are acquired in the waiting thread is determined by 9858 * the order in which they are acquired in the completing thread; if the 9859 * completing thread calls spa_notify_waiters with the activity-specific lock 9860 * held, then the waiting thread must also acquire the activity-specific lock 9861 * first. 
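 *
 * A minimal sketch of the waiting side (simplified; the real logic,
 * including the per-activity checks and error handling, lives in
 * spa_activity_in_progress() and spa_wait_common() below):
 *
 *	mutex_enter(&spa->spa_activities_lock);
 *	while (<activity in progress, checked under both locks>)
 *		cv_wait_sig(&spa->spa_activities_cv,
 *		    &spa->spa_activities_lock);
 *	mutex_exit(&spa->spa_activities_lock);
 *
 * while the completing thread updates its activity's state and then calls
 * spa_notify_waiters(), which cv_broadcasts under the same lock.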
9862 */ 9863 9864 static int 9865 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 9866 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 9867 { 9868 int error = 0; 9869 9870 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9871 9872 switch (activity) { 9873 case ZPOOL_WAIT_CKPT_DISCARD: 9874 *in_progress = 9875 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 9876 zap_contains(spa_meta_objset(spa), 9877 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 9878 ENOENT); 9879 break; 9880 case ZPOOL_WAIT_FREE: 9881 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 9882 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 9883 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 9884 spa_livelist_delete_check(spa)); 9885 break; 9886 case ZPOOL_WAIT_INITIALIZE: 9887 case ZPOOL_WAIT_TRIM: 9888 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 9889 activity, in_progress); 9890 break; 9891 case ZPOOL_WAIT_REPLACE: 9892 mutex_exit(&spa->spa_activities_lock); 9893 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9894 mutex_enter(&spa->spa_activities_lock); 9895 9896 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 9897 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9898 break; 9899 case ZPOOL_WAIT_REMOVE: 9900 *in_progress = (spa->spa_removing_phys.sr_state == 9901 DSS_SCANNING); 9902 break; 9903 case ZPOOL_WAIT_RESILVER: 9904 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) 9905 break; 9906 zfs_fallthrough; 9907 case ZPOOL_WAIT_SCRUB: 9908 { 9909 boolean_t scanning, paused, is_scrub; 9910 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 9911 9912 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 9913 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 9914 paused = dsl_scan_is_paused_scrub(scn); 9915 *in_progress = (scanning && !paused && 9916 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 9917 break; 9918 } 9919 default: 9920 panic("unrecognized value for activity %d", activity); 9921 } 9922 9923 return (error); 9924 } 9925 9926 static int 9927 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 9928 boolean_t use_tag, uint64_t tag, boolean_t *waited) 9929 { 9930 /* 9931 * The tag is used to distinguish between instances of an activity. 9932 * 'initialize' and 'trim' are the only activities that we use this for. 9933 * The other activities can only have a single instance in progress in a 9934 * pool at one time, making the tag unnecessary. 9935 * 9936 * There can be multiple devices being replaced at once, but since they 9937 * all finish once resilvering finishes, we don't bother keeping track 9938 * of them individually, we just wait for them all to finish. 9939 */ 9940 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 9941 activity != ZPOOL_WAIT_TRIM) 9942 return (EINVAL); 9943 9944 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 9945 return (EINVAL); 9946 9947 spa_t *spa; 9948 int error = spa_open(pool, &spa, FTAG); 9949 if (error != 0) 9950 return (error); 9951 9952 /* 9953 * Increment the spa's waiter count so that we can call spa_close and 9954 * still ensure that the spa_t doesn't get freed before this thread is 9955 * finished with it when the pool is exported. We want to call spa_close 9956 * before we start waiting because otherwise the additional ref would 9957 * prevent the pool from being exported or destroyed throughout the 9958 * potentially long wait. 
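	 * The counterpart is spa_wake_waiters() above, which sets
	 * spa_waiters_cancel and then blocks on spa_waiters_cv until
	 * spa_waiters drops back to zero.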
9959  */
9960 	mutex_enter(&spa->spa_activities_lock);
9961 	spa->spa_waiters++;
9962 	spa_close(spa, FTAG);
9963 
9964 	*waited = B_FALSE;
9965 	for (;;) {
9966 		boolean_t in_progress;
9967 		error = spa_activity_in_progress(spa, activity, use_tag, tag,
9968 		    &in_progress);
9969 
9970 		if (error || !in_progress || spa->spa_waiters_cancel)
9971 			break;
9972 
9973 		*waited = B_TRUE;
9974 
9975 		if (cv_wait_sig(&spa->spa_activities_cv,
9976 		    &spa->spa_activities_lock) == 0) {
9977 			error = EINTR;
9978 			break;
9979 		}
9980 	}
9981 
9982 	spa->spa_waiters--;
9983 	cv_signal(&spa->spa_waiters_cv);
9984 	mutex_exit(&spa->spa_activities_lock);
9985 
9986 	return (error);
9987 }
9988 
9989 /*
9990  * Wait for a particular instance of the specified activity to complete, where
9991  * the instance is identified by 'tag'
9992  */
9993 int
9994 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
9995     boolean_t *waited)
9996 {
9997 	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
9998 }
9999 
10000 /*
10001  * Wait for all instances of the specified activity to complete
10002  */
10003 int
10004 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
10005 {
10006 
10007 	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
10008 }
10009 
10010 sysevent_t *
10011 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
10012 {
10013 	sysevent_t *ev = NULL;
10014 #ifdef _KERNEL
10015 	nvlist_t *resource;
10016 
10017 	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
10018 	if (resource) {
10019 		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
10020 		ev->resource = resource;
10021 	}
10022 #else
10023 	(void) spa, (void) vd, (void) hist_nvl, (void) name;
10024 #endif
10025 	return (ev);
10026 }
10027 
10028 void
10029 spa_event_post(sysevent_t *ev)
10030 {
10031 #ifdef _KERNEL
10032 	if (ev) {
10033 		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
10034 		kmem_free(ev, sizeof (*ev));
10035 	}
10036 #else
10037 	(void) ev;
10038 #endif
10039 }
10040 
10041 /*
10042  * Post a zevent corresponding to the given sysevent. The 'name' must be one
10043  * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
10044  * filled in from the spa and (optionally) the vdev. This doesn't do anything
10045  * in the userland libzpool, as we don't want consumers to misinterpret ztest
10046  * or zdb as real changes.
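 *
 * Typical usage (the event name comes from the ESC_ZFS_* definitions; the
 * exact events posted vary by call site), e.g.:
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);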
10047  */
10048 void
10049 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
10050 {
10051 	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
10052 }
10053 
10054 /* state manipulation functions */
10055 EXPORT_SYMBOL(spa_open);
10056 EXPORT_SYMBOL(spa_open_rewind);
10057 EXPORT_SYMBOL(spa_get_stats);
10058 EXPORT_SYMBOL(spa_create);
10059 EXPORT_SYMBOL(spa_import);
10060 EXPORT_SYMBOL(spa_tryimport);
10061 EXPORT_SYMBOL(spa_destroy);
10062 EXPORT_SYMBOL(spa_export);
10063 EXPORT_SYMBOL(spa_reset);
10064 EXPORT_SYMBOL(spa_async_request);
10065 EXPORT_SYMBOL(spa_async_suspend);
10066 EXPORT_SYMBOL(spa_async_resume);
10067 EXPORT_SYMBOL(spa_inject_addref);
10068 EXPORT_SYMBOL(spa_inject_delref);
10069 EXPORT_SYMBOL(spa_scan_stat_init);
10070 EXPORT_SYMBOL(spa_scan_get_stats);
10071 
10072 /* device manipulation */
10073 EXPORT_SYMBOL(spa_vdev_add);
10074 EXPORT_SYMBOL(spa_vdev_attach);
10075 EXPORT_SYMBOL(spa_vdev_detach);
10076 EXPORT_SYMBOL(spa_vdev_setpath);
10077 EXPORT_SYMBOL(spa_vdev_setfru);
10078 EXPORT_SYMBOL(spa_vdev_split_mirror);
10079 
10080 /* spare state (which is global across all pools) */
10081 EXPORT_SYMBOL(spa_spare_add);
10082 EXPORT_SYMBOL(spa_spare_remove);
10083 EXPORT_SYMBOL(spa_spare_exists);
10084 EXPORT_SYMBOL(spa_spare_activate);
10085 
10086 /* L2ARC state (which is global across all pools) */
10087 EXPORT_SYMBOL(spa_l2cache_add);
10088 EXPORT_SYMBOL(spa_l2cache_remove);
10089 EXPORT_SYMBOL(spa_l2cache_exists);
10090 EXPORT_SYMBOL(spa_l2cache_activate);
10091 EXPORT_SYMBOL(spa_l2cache_drop);
10092 
10093 /* scanning */
10094 EXPORT_SYMBOL(spa_scan);
10095 EXPORT_SYMBOL(spa_scan_stop);
10096 
10097 /* spa syncing */
10098 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
10099 EXPORT_SYMBOL(spa_sync_allpools);
10100 
10101 /* properties */
10102 EXPORT_SYMBOL(spa_prop_set);
10103 EXPORT_SYMBOL(spa_prop_get);
10104 EXPORT_SYMBOL(spa_prop_clear_bootfs);
10105 
10106 /* asynchronous event notification */
10107 EXPORT_SYMBOL(spa_event_notify);
10108 
10109 /* BEGIN CSTYLED */
10110 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
10111 	"log2 fraction of arc that can be used by inflight I/Os when "
10112 	"verifying pool during import");
10113 /* END CSTYLED */
10114 
10115 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
10116 	"Set to traverse metadata on pool import");
10117 
10118 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
10119 	"Set to traverse data on pool import");
10120 
10121 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
10122 	"Print vdev tree to zfs_dbgmsg during pool import");
10123 
10124 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
10125 	"Percentage of CPUs to run an IO worker thread");
10126 
10127 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
10128 	"Number of threads per IO worker taskqueue");
10129 
10130 /* BEGIN CSTYLED */
10131 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
10132 	"Allow importing pool with up to this number of missing top-level "
10133 	"vdevs (in read-only mode)");
10134 /* END CSTYLED */
10135 
10136 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
10137 	ZMOD_RW, "Set the livelist condense zthr to pause");
10138 
10139 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
10140 	ZMOD_RW, "Set the livelist condense synctask to pause");
10141 
10142 /* BEGIN CSTYLED */
10143 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_,
sync_cancel, 10144 INT, ZMOD_RW, 10145 "Whether livelist condensing was canceled in the synctask"); 10146 10147 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 10148 INT, ZMOD_RW, 10149 "Whether livelist condensing was canceled in the zthr function"); 10150 10151 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 10152 ZMOD_RW, 10153 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 10154 "was being condensed"); 10155 /* END CSTYLED */ 10156
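
/*
 * The ZFS_MODULE_PARAM() tunables above are normally adjusted through the
 * platform's module-parameter mechanism rather than by editing this file;
 * on Linux they typically appear under /sys/module/zfs/parameters/
 * (e.g. zfs_livelist_condense_zthr_pause), and on FreeBSD as vfs.zfs.*
 * sysctls. See the zfs(4) man page for the authoritative descriptions.
 */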