/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/mmp.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/bpobj.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
#include <sys/zvol.h>

#ifdef _KERNEL
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/callb.h>
#include <sys/zone.h>
#include <sys/vmsystm.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"iss", "iss_h", "int", "int_h"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
 * but with number of taskqs also scaling with number of CPUs.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, spa_import_type_t type,
    const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

static uint_t	zio_taskq_batch_pct = 80;	  /* 1 thread per cpu in pset */
static uint_t	zio_taskq_batch_tpq;		  /* threads per taskq */
static const boolean_t	zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */

static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */

/*
 * Report any spa_load_verify errors found, but do not fail spa_load.
 * This is used by zdb to analyze non-idle pools.
 */
boolean_t spa_load_verify_dryrun = B_FALSE;

/*
 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ).
 * This is used by zdb for spacemaps verification.
 */
boolean_t spa_mode_readable_spacemaps = B_FALSE;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * For debugging purposes: print out vdev tree during pool import.
 */
static int		spa_load_print_vdev_tree = B_FALSE;

/*
 * A non-zero value for zfs_max_missing_tvds means that we allow importing
 * pools with missing top-level vdevs. This is strictly intended for advanced
 * pool recovery cases since missing data is almost inevitable. Pools with
 * missing devices can only be imported read-only for safety reasons, and their
 * fail-mode will be automatically set to "continue".
 *
 * With 1 missing vdev we should be able to import the pool and mount all
 * datasets. User data that was not modified after the missing device has been
 * added should be recoverable. This means that snapshots created prior to the
 * addition of that device should be completely intact.
 *
 * With 2 missing vdevs, some datasets may fail to mount since there are
 * dataset statistics that are stored as regular metadata. Some data might be
 * recoverable if those vdevs were added recently.
 *
 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
 * may be missing entirely. Chances of data recovery are very low. Note that
 * there are also risks of performing an inadvertent rewind as we might be
 * missing all the vdevs with the latest uberblocks.
 */
uint64_t	zfs_max_missing_tvds = 0;

/*
 * The parameters below are similar to zfs_max_missing_tvds but are only
 * intended for a preliminary open of the pool with an untrusted config which
 * might be incomplete or out-dated.
 *
 * We are more tolerant for pools opened from a cachefile since we could have
 * an out-dated cachefile where a device removal was not registered.
 * We could have set the limit arbitrarily high but in the case where devices
 * are really missing we would want to return the proper error codes; we chose
 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
 * and we get a chance to retrieve the trusted config.
 */
uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;

/*
 * In the case where config was assembled by scanning device paths (/dev/dsks
 * by default) we are less tolerant since all the existing devices should have
 * been detected and we want spa_load to return the right error codes.
 */
uint64_t	zfs_max_missing_tvds_scan = 0;

/*
 * Debugging aid that pauses spa_sync() towards the end.
 */
static const boolean_t	zfs_pause_spa_sync = B_FALSE;

/*
 * Variables to indicate the livelist condense zthr func should wait at certain
 * points for the livelist to be removed - used to test condense/destroy races
 */
static int zfs_livelist_condense_zthr_pause = 0;
static int zfs_livelist_condense_sync_pause = 0;

/*
 * Variables to track whether or not condense cancellation has been
 * triggered in testing.
 */
static int zfs_livelist_condense_sync_cancel = 0;
static int zfs_livelist_condense_zthr_cancel = 0;

/*
 * Variable to track whether or not extra ALLOC blkptrs were added to a
 * livelist entry while it was being condensed (caused by the way we track
 * remapped blkptrs in dbuf_remap_impl)
 */
static int zfs_livelist_condense_new_alloc = 0;

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	propval = fnvlist_alloc();
	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);

	if (strval != NULL)
		fnvlist_add_string(propval, ZPROP_VALUE, strval);
	else
		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);

	fnvlist_add_nvlist(nvl, propname, propval);
	nvlist_free(propval);
}

/*
 * Add a user property (source=src, propname=propval) to an nvlist.
 */
static void
spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
    zprop_source_t src)
{
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
	VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	const zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(mc);
		alloc += metaslab_class_get_alloc(spa_special_class(spa));
		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
		alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));

		size = metaslab_class_get_space(mc);
		size += metaslab_class_get_space(spa_special_class(spa));
		size += metaslab_class_get_space(spa_dedup_class(spa));
		size += metaslab_class_get_space(spa_embedded_log_class(spa));

		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
		    spa->spa_checkpoint_info.sci_dspace, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == SPA_MODE_READ), src);

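		/*
		 * Capacity is the percentage of total space currently
		 * allocated, computed with integer math (rounded down to a
		 * whole percent).
		 */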
		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
		    brt_get_used(spa), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
		    brt_get_saved(spa), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
		    brt_get_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
			    version, ZPROP_SRC_DEFAULT);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
			    version, ZPROP_SRC_LOCAL);
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
		    NULL, spa_load_guid(spa), src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools before this version freedir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_compatibility != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	dsl_pool_t *dp;
	int err;

	err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
	if (err)
		return (err);

	dp = spa_get_dsl(spa);
	dsl_pool_config_enter(dp, FTAG);
	mutex_enter(&spa->spa_props_lock);

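	/*
	 * Note the lock ordering: the DSL pool config lock is taken before
	 * spa_props_lock and both are held across the property collection
	 * below, since resolving the bootfs object to a dataset name requires
	 * the pool config lock.
	 */
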
	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0)
		goto out;

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) ==
		    ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name))
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_dataset_t *ds = NULL;

				err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds);
				if (err != 0)
					break;

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			if (prop != ZPOOL_PROP_INVAL) {
				spa_prop_add_list(*nvp, prop, strval, 0, src);
			} else {
				src = ZPROP_SRC_LOCAL;
				spa_prop_add_user(*nvp, za.za_name, strval,
				    src);
			}
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
out:
	mutex_exit(&spa->spa_props_lock);
	dsl_pool_config_exit(dp, FTAG);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		const char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPOOL_PROP_INVAL:
			/*
			 * Sanitize the input.
			 */
			if (zfs_prop_user(propname)) {
				if (strlen(propname) >= ZAP_MAXNAMELEN) {
					error = SET_ERROR(ENAMETOOLONG);
					break;
				}

				if (strlen(fnvpair_value_string(elem)) >=
				    ZAP_MAXVALUELEN) {
					error = SET_ERROR(E2BIG);
					break;
				}
			} else if (zpool_prop_feature(propname)) {
				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (nvpair_value_uint64(elem, &intval) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (intval != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				fname = strchr(propname, '@') + 1;
				if (zfeature_lookup_name(fname, NULL) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				has_feature = B_TRUE;
			} else {
				error = SET_ERROR(EINVAL);
				break;
			}
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
		case ZPOOL_PROP_AUTOTRIM:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_MULTIHOST:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);

			if (!error) {
				uint32_t hostid = zone_get_hostid(NULL);
				if (hostid)
					spa->spa_hostid = hostid;
				else
					error = SET_ERROR(ENOTSUP);
			}

			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				error = dmu_objset_hold(strval, FTAG, &os);
				if (error != 0)
					break;

				/* Must be ZPL. */
				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				if (!isprint(*check)) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = SET_ERROR(E2BIG);
			break;

		default:
			break;
		}

		if (error)
			break;
	}

	(void) nvlist_remove_all(props,
	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	const char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_INVAL &&
		    zfs_prop_user(nvpair_name(elem))) {
			need_sync = B_TRUE;
			break;
		}

		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
			uint64_t ver = 0;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid __maybe_unused = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		int error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (SET_ERROR(error));
	}

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    (u_longlong_t)oldguid, (u_longlong_t)*newguid);
}

/*
 * Change the GUID for the pool. This is done so that we can later
 * re-import a pool built from a clone of our own vdevs. We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty. Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool. We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		/*
		 * Clear the kobj flag from all the vdevs to allow
		 * vdev_cache_process_kobj_evt() to post events to all the
		 * vdevs since GUID is updated.
		 */
		vdev_clear_kobj_evt(spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);

		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
	int ret;

	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	return (TREE_ISIGN(ret));
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
	memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	uint_t cpus, flags = TASKQ_DYNAMIC;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >, 0);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = MIN(zio_taskq_batch_pct, 100);
		break;

	case ZTI_MODE_SCALE:
		flags |= TASKQ_THREADS_CPU_PCT;
		/*
		 * We want more taskqs to reduce lock contention, but we want
		 * less for better request ordering and CPU utilization.
		 */
		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		if (zio_taskq_batch_tpq > 0) {
			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
			    zio_taskq_batch_tpq);
		} else {
			/*
			 * Prefer 6 threads per taskq, but no more taskqs
			 * than threads in them on large systems. For 80%:
			 *
			 *                    taskq     taskq    total
			 *    cpus    taskqs  percent   threads  threads
			 *  -------  -------  -------   -------  -------
			 *     1        1       80%        1        1
			 *     2        1       80%        1        1
			 *     4        1       80%        3        3
			 *     8        2       40%        3        6
			 *    16        3       27%        4       12
			 *    32        5       16%        5       25
			 *    64        7       11%        7       49
			 *   128       10        8%       10      100
			 *   256       14        6%       15      210
			 */
			count = 1 + cpus / 6;
			while (count * count > cpus)
				count--;
		}
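		/*
		 * Worked example (illustrative, a CPU count not shown in the
		 * table above): with the default zio_taskq_batch_pct of 80 on
		 * a 24-CPU system, cpus = 19 and count = 1 + 19 / 6 = 4, so
		 * the code below computes value = (80 + 4 / 2) / 4 = 20,
		 * i.e. four taskqs, each sized to roughly 20% of the CPUs.
		 */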
		/* Limit each taskq within 100% to not trigger assertion. */
		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
		value = (zio_taskq_batch_pct + count / 2) / count;
		break;

	case ZTI_MODE_NULL:
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	ASSERT3U(count, >, 0);
	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;
		char name[32];

		if (count > 1)
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		else
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);

		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			(void) zio_taskq_basedc;
			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive. Run it at slightly less important
			 * priority than the other taskqs.
			 *
			 * Under Linux and FreeBSD this means incrementing
			 * the priority value as opposed to platforms like
			 * illumos where it should be decremented.
			 *
			 * On FreeBSD, if priorities divided by four (RQ_PPQ)
			 * are equal then a difference between them is
			 * insignificant.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
#if defined(__linux__)
				pri++;
#elif defined(__FreeBSD__)
				pri += 4;
#else
#error "unknown OS"
#endif
			}
			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
		}

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT3U(tqs->stqs_count, ==, 0);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

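/*
 * Illustrative sketch (hypothetical call site, not part of this file): a
 * dispatcher such as the zio pipeline might hand a read-interrupt task to
 * the matching taskq with something along the lines of
 *
 *	spa_taskq_dispatch_ent(spa, ZIO_TYPE_READ, ZIO_TASKQ_INTERRUPT,
 *	    (task_func_t *)zio_execute, zio, 0, &zio->io_tqent);
 *
 * where the taskq_ent_t is embedded in the dispatched object so no
 * allocation is needed at dispatch time.
 */
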
/*
 * Same as spa_taskq_dispatch_ent() but block on the task until completion.
 */
void
spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;
	taskqid_t id;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
	}

	id = taskq_dispatch(tq, func, arg, flags);
	if (id)
		taskq_wait_id(tq, id);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

/*
 * Disabled until spa_thread() can be adapted for Linux.
 */
#undef HAVE_SPA_THREAD

#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
static void
spa_thread(void *arg)
{
	psetid_t zio_taskq_psrset_bind = PS_NONE;
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, spa_mode_t mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;
	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;

	spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops);
	spa->spa_embedded_log_class =
	    metaslab_class_create(spa, &zfs_metaslab_ops);
	spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops);
	spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	(void) spa_create_process;
#ifdef HAVE_SPA_THREAD
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif /* HAVE_SPA_THREAD */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_healed,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));

	spa_activate_os(spa);

	spa_keystore_init(&spa->spa_keystore);

	/*
	 * This taskq is used to perform zvol-minor-related tasks
	 * asynchronously. This has several advantages, including easy
	 * resolution of various deadlocks.
	 *
	 * The taskq must be single threaded to ensure tasks are always
	 * processed in the order in which they were dispatched.
	 *
	 * A taskq per pool allows one to keep the pools independent.
	 * This way if one pool is suspended, it will not impact another.
	 *
	 * The preferred location to dispatch a zvol minor task is a sync
	 * task. In this context, there is easy access to the spa_t and minimal
	 * error handling is required because the sync task must succeed.
	 */
	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
	    1, INT_MAX, 0);

	/*
	 * Taskq dedicated to prefetcher threads: this is used to prevent the
	 * pool traverse code from monopolizing the global (and limited)
	 * system_taskq by inappropriately scheduling long running tasks on it.
	 */
	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	/*
	 * The taskq to upgrade datasets in this pool. Currently used by
	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
	 */
	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	if (spa->spa_zvol_taskq) {
		taskq_destroy(spa->spa_zvol_taskq);
		spa->spa_zvol_taskq = NULL;
	}

	if (spa->spa_prefetch_taskq) {
		taskq_destroy(spa->spa_prefetch_taskq);
		spa->spa_prefetch_taskq = NULL;
	}

	if (spa->spa_upgrade_taskq) {
		taskq_destroy(spa->spa_upgrade_taskq);
		spa->spa_upgrade_taskq = NULL;
	}

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	metaslab_class_destroy(spa->spa_embedded_log_class);
	spa->spa_embedded_log_class = NULL;

	metaslab_class_destroy(spa->spa_special_class);
	spa->spa_special_class = NULL;

	metaslab_class_destroy(spa->spa_dedup_class);
	spa->spa_dedup_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);
	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);
	avl_destroy(&spa->spa_errlist_healed);

	spa_keystore_fini(&spa->spa_keystore);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}

	spa_deactivate_os(spa);

}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

static boolean_t
spa_should_flush_logs_on_unload(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return (B_FALSE);

	if (!spa_writeable(spa))
		return (B_FALSE);

	if (!spa->spa_sync_on)
		return (B_FALSE);

	if (spa_state(spa) != POOL_STATE_EXPORTED)
		return (B_FALSE);

	if (zfs_keep_log_spacemaps_at_export)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Opens a transaction that will set the flag that will instruct
 * spa_sync to attempt to flush all the metaslabs for that txg.
 */
static void
spa_unload_log_sm_flush_all(spa_t *spa)
{
	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));

	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);

	dmu_tx_commit(tx);
	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
}

static void
spa_unload_log_sm_metadata(spa_t *spa)
{
	void *cookie = NULL;
	spa_log_sm_t *sls;
	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
	    &cookie)) != NULL) {
		VERIFY0(sls->sls_mscount);
		kmem_free(sls, sizeof (spa_log_sm_t));
	}

	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
	    e != NULL; e = list_head(&spa->spa_log_summary)) {
		VERIFY0(e->lse_mscount);
		list_remove(&spa->spa_log_summary, e);
		kmem_free(e, sizeof (log_summary_entry_t));
	}

	spa->spa_unflushed_stats.sus_nblocks = 0;
	spa->spa_unflushed_stats.sus_memused = 0;
	spa->spa_unflushed_stats.sus_blocklimit = 0;
}

static void
spa_destroy_aux_threads(spa_t *spa)
{
	if (spa->spa_condense_zthr != NULL) {
		zthr_destroy(spa->spa_condense_zthr);
		spa->spa_condense_zthr = NULL;
	}
	if (spa->spa_checkpoint_discard_zthr != NULL) {
		zthr_destroy(spa->spa_checkpoint_discard_zthr);
		spa->spa_checkpoint_discard_zthr = NULL;
	}
	if (spa->spa_livelist_delete_zthr != NULL) {
		zthr_destroy(spa->spa_livelist_delete_zthr);
		spa->spa_livelist_delete_zthr = NULL;
	}
	if (spa->spa_livelist_condense_zthr != NULL) {
		zthr_destroy(spa->spa_livelist_condense_zthr);
		spa->spa_livelist_condense_zthr = NULL;
	}
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);

	spa_import_progress_remove(spa_guid(spa));
	spa_load_note(spa, "UNLOADING");

	spa_wake_waiters(spa);

	/*
	 * If we have set the spa_final_txg, we have already performed the
	 * tasks below in spa_export_common(). We should not redo it here since
	 * we delay the final TXGs beyond what spa_final_txg is set at.
	 */
	if (spa->spa_final_txg == UINT64_MAX) {
		/*
		 * If the log space map feature is enabled and the pool is
		 * getting exported (but not destroyed), we want to spend some
		 * time flushing as many metaslabs as we can in an attempt to
		 * destroy log space maps and save import time.
		 */
		if (spa_should_flush_logs_on_unload(spa))
			spa_unload_log_sm_flush_all(spa);

		/*
		 * Stop async tasks.
		 */
		spa_async_suspend(spa);

		if (spa->spa_root_vdev) {
			vdev_t *root_vdev = spa->spa_root_vdev;
			vdev_initialize_stop_all(root_vdev,
			    VDEV_INITIALIZE_ACTIVE);
			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
			vdev_autotrim_stop_all(spa);
			vdev_rebuild_stop_all(spa);
		}
	}

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * This ensures that there is no async metaslab prefetching
	 * while we attempt to unload the spa.
	 */
	if (spa->spa_root_vdev != NULL) {
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
			vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
			if (vc->vdev_mg != NULL)
				taskq_wait(vc->vdev_mg->mg_taskq);
		}
	}

	if (spa->spa_mmp.mmp_thread)
		mmp_thread_stop(spa);

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_destroy_aux_threads(spa);

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);
	brt_unload(spa);
	spa_unload_log_sm_metadata(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	if (spa->spa_spares.sav_vdevs) {
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			vdev_free(spa->spa_spares.sav_vdevs[i]);
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	if (spa->spa_l2cache.sav_vdevs) {
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
		}
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa->spa_indirect_vdevs_loaded = B_FALSE;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}
	if (spa->spa_compatibility != NULL) {
		spa_strfree(spa->spa_compatibility);
		spa->spa_compatibility = NULL;
	}

	spa_config_exit(spa, SCL_ALL, spa);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

#ifndef _KERNEL
	/*
	 * zdb opens both the current state of the pool and the
	 * checkpointed state (if present), with a different spa_t.
1827 * 1828 * As spare vdevs are shared among open pools, we skip loading 1829 * them when we load the checkpointed state of the pool. 1830 */ 1831 if (!spa_writeable(spa)) 1832 return; 1833 #endif 1834 1835 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1836 1837 /* 1838 * First, close and free any existing spare vdevs. 1839 */ 1840 if (spa->spa_spares.sav_vdevs) { 1841 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1842 vd = spa->spa_spares.sav_vdevs[i]; 1843 1844 /* Undo the call to spa_activate() below */ 1845 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1846 B_FALSE)) != NULL && tvd->vdev_isspare) 1847 spa_spare_remove(tvd); 1848 vdev_close(vd); 1849 vdev_free(vd); 1850 } 1851 1852 kmem_free(spa->spa_spares.sav_vdevs, 1853 spa->spa_spares.sav_count * sizeof (void *)); 1854 } 1855 1856 if (spa->spa_spares.sav_config == NULL) 1857 nspares = 0; 1858 else 1859 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1860 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 1861 1862 spa->spa_spares.sav_count = (int)nspares; 1863 spa->spa_spares.sav_vdevs = NULL; 1864 1865 if (nspares == 0) 1866 return; 1867 1868 /* 1869 * Construct the array of vdevs, opening them to get status in the 1870 * process. For each spare, there is potentially two different vdev_t 1871 * structures associated with it: one in the list of spares (used only 1872 * for basic validation purposes) and one in the active vdev 1873 * configuration (if it's spared in). During this phase we open and 1874 * validate each vdev on the spare list. If the vdev also exists in the 1875 * active configuration, then we also mark this vdev as an active spare. 1876 */ 1877 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 1878 KM_SLEEP); 1879 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1880 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1881 VDEV_ALLOC_SPARE) == 0); 1882 ASSERT(vd != NULL); 1883 1884 spa->spa_spares.sav_vdevs[i] = vd; 1885 1886 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1887 B_FALSE)) != NULL) { 1888 if (!tvd->vdev_isspare) 1889 spa_spare_add(tvd); 1890 1891 /* 1892 * We only mark the spare active if we were successfully 1893 * able to load the vdev. Otherwise, importing a pool 1894 * with a bad active spare would result in strange 1895 * behavior, because multiple pool would think the spare 1896 * is actively in use. 1897 * 1898 * There is a vulnerability here to an equally bizarre 1899 * circumstance, where a dead active spare is later 1900 * brought back to life (onlined or otherwise). Given 1901 * the rarity of this scenario, and the extra complexity 1902 * it adds, we ignore the possibility. 1903 */ 1904 if (!vdev_is_dead(tvd)) 1905 spa_spare_activate(tvd); 1906 } 1907 1908 vd->vdev_top = vd; 1909 vd->vdev_aux = &spa->spa_spares; 1910 1911 if (vdev_open(vd) != 0) 1912 continue; 1913 1914 if (vdev_validate_aux(vd) == 0) 1915 spa_spare_add(vd); 1916 } 1917 1918 /* 1919 * Recompute the stashed list of spares, with status information 1920 * this time. 
1921 */ 1922 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 1923 1924 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1925 KM_SLEEP); 1926 for (i = 0; i < spa->spa_spares.sav_count; i++) 1927 spares[i] = vdev_config_generate(spa, 1928 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1929 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 1930 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 1931 spa->spa_spares.sav_count); 1932 for (i = 0; i < spa->spa_spares.sav_count; i++) 1933 nvlist_free(spares[i]); 1934 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1935 } 1936 1937 /* 1938 * Load (or re-load) the current list of vdevs describing the active l2cache for 1939 * this pool. When this is called, we have some form of basic information in 1940 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1941 * then re-generate a more complete list including status information. 1942 * Devices which are already active have their details maintained, and are 1943 * not re-opened. 1944 */ 1945 void 1946 spa_load_l2cache(spa_t *spa) 1947 { 1948 nvlist_t **l2cache = NULL; 1949 uint_t nl2cache; 1950 int i, j, oldnvdevs; 1951 uint64_t guid; 1952 vdev_t *vd, **oldvdevs, **newvdevs; 1953 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1954 1955 #ifndef _KERNEL 1956 /* 1957 * zdb opens both the current state of the pool and the 1958 * checkpointed state (if present), with a different spa_t. 1959 * 1960 * As L2 caches are part of the ARC which is shared among open 1961 * pools, we skip loading them when we load the checkpointed 1962 * state of the pool. 1963 */ 1964 if (!spa_writeable(spa)) 1965 return; 1966 #endif 1967 1968 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1969 1970 oldvdevs = sav->sav_vdevs; 1971 oldnvdevs = sav->sav_count; 1972 sav->sav_vdevs = NULL; 1973 sav->sav_count = 0; 1974 1975 if (sav->sav_config == NULL) { 1976 nl2cache = 0; 1977 newvdevs = NULL; 1978 goto out; 1979 } 1980 1981 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 1982 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 1983 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1984 1985 /* 1986 * Process new nvlist of vdevs. 1987 */ 1988 for (i = 0; i < nl2cache; i++) { 1989 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 1990 1991 newvdevs[i] = NULL; 1992 for (j = 0; j < oldnvdevs; j++) { 1993 vd = oldvdevs[j]; 1994 if (vd != NULL && guid == vd->vdev_guid) { 1995 /* 1996 * Retain previous vdev for add/remove ops. 1997 */ 1998 newvdevs[i] = vd; 1999 oldvdevs[j] = NULL; 2000 break; 2001 } 2002 } 2003 2004 if (newvdevs[i] == NULL) { 2005 /* 2006 * Create new vdev 2007 */ 2008 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2009 VDEV_ALLOC_L2CACHE) == 0); 2010 ASSERT(vd != NULL); 2011 newvdevs[i] = vd; 2012 2013 /* 2014 * Commit this vdev as an l2cache device, 2015 * even if it fails to open. 2016 */ 2017 spa_l2cache_add(vd); 2018 2019 vd->vdev_top = vd; 2020 vd->vdev_aux = sav; 2021 2022 spa_l2cache_activate(vd); 2023 2024 if (vdev_open(vd) != 0) 2025 continue; 2026 2027 (void) vdev_validate_aux(vd); 2028 2029 if (!vdev_is_dead(vd)) 2030 l2arc_add_vdev(spa, vd); 2031 2032 /* 2033 * Upon cache device addition to a pool or pool 2034 * creation with a cache device or if the header 2035 * of the device is invalid we issue an async 2036 * TRIM command for the whole device which will 2037 * execute if l2arc_trim_ahead > 0. 
2038 */ 2039 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2040 } 2041 } 2042 2043 sav->sav_vdevs = newvdevs; 2044 sav->sav_count = (int)nl2cache; 2045 2046 /* 2047 * Recompute the stashed list of l2cache devices, with status 2048 * information this time. 2049 */ 2050 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2051 2052 if (sav->sav_count > 0) 2053 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2054 KM_SLEEP); 2055 for (i = 0; i < sav->sav_count; i++) 2056 l2cache[i] = vdev_config_generate(spa, 2057 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2058 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2059 (const nvlist_t * const *)l2cache, sav->sav_count); 2060 2061 out: 2062 /* 2063 * Purge vdevs that were dropped 2064 */ 2065 if (oldvdevs) { 2066 for (i = 0; i < oldnvdevs; i++) { 2067 uint64_t pool; 2068 2069 vd = oldvdevs[i]; 2070 if (vd != NULL) { 2071 ASSERT(vd->vdev_isl2cache); 2072 2073 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2074 pool != 0ULL && l2arc_vdev_present(vd)) 2075 l2arc_remove_vdev(vd); 2076 vdev_clear_stats(vd); 2077 vdev_free(vd); 2078 } 2079 } 2080 2081 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2082 } 2083 2084 for (i = 0; i < sav->sav_count; i++) 2085 nvlist_free(l2cache[i]); 2086 if (sav->sav_count) 2087 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2088 } 2089 2090 static int 2091 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2092 { 2093 dmu_buf_t *db; 2094 char *packed = NULL; 2095 size_t nvsize = 0; 2096 int error; 2097 *value = NULL; 2098 2099 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2100 if (error) 2101 return (error); 2102 2103 nvsize = *(uint64_t *)db->db_data; 2104 dmu_buf_rele(db, FTAG); 2105 2106 packed = vmem_alloc(nvsize, KM_SLEEP); 2107 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2108 DMU_READ_PREFETCH); 2109 if (error == 0) 2110 error = nvlist_unpack(packed, nvsize, value, 0); 2111 vmem_free(packed, nvsize); 2112 2113 return (error); 2114 } 2115 2116 /* 2117 * Concrete top-level vdevs that are not missing and are not logs. At every 2118 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2119 */ 2120 static uint64_t 2121 spa_healthy_core_tvds(spa_t *spa) 2122 { 2123 vdev_t *rvd = spa->spa_root_vdev; 2124 uint64_t tvds = 0; 2125 2126 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2127 vdev_t *vd = rvd->vdev_child[i]; 2128 if (vd->vdev_islog) 2129 continue; 2130 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2131 tvds++; 2132 } 2133 2134 return (tvds); 2135 } 2136 2137 /* 2138 * Checks to see if the given vdev could not be opened, in which case we post a 2139 * sysevent to notify the autoreplace code that the device has been removed. 2140 */ 2141 static void 2142 spa_check_removed(vdev_t *vd) 2143 { 2144 for (uint64_t c = 0; c < vd->vdev_children; c++) 2145 spa_check_removed(vd->vdev_child[c]); 2146 2147 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2148 vdev_is_concrete(vd)) { 2149 zfs_post_autoreplace(vd->vdev_spa, vd); 2150 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2151 } 2152 } 2153 2154 static int 2155 spa_check_for_missing_logs(spa_t *spa) 2156 { 2157 vdev_t *rvd = spa->spa_root_vdev; 2158 2159 /* 2160 * If we're doing a normal import, then build up any additional 2161 * diagnostic information about missing log devices. 2162 * We'll pass this up to the user for further processing. 
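 *
 * Concretely, as the code below shows, each unopenable log top-level
 * vdev is rendered with vdev_config_generate() and the result is
 * attached to spa_load_info as
 *
 *	ZPOOL_CONFIG_MISSING_DEVICES -> { ZPOOL_CONFIG_CHILDREN: [...] }
 *
 * so userland can report exactly which log devices are absent.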
2163 */ 2164 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2165 nvlist_t **child, *nv; 2166 uint64_t idx = 0; 2167 2168 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2169 KM_SLEEP); 2170 nv = fnvlist_alloc(); 2171 2172 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2173 vdev_t *tvd = rvd->vdev_child[c]; 2174 2175 /* 2176 * We consider a device as missing only if it failed 2177 * to open (i.e. offline or faulted is not considered 2178 * as missing). 2179 */ 2180 if (tvd->vdev_islog && 2181 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2182 child[idx++] = vdev_config_generate(spa, tvd, 2183 B_FALSE, VDEV_CONFIG_MISSING); 2184 } 2185 } 2186 2187 if (idx > 0) { 2188 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2189 (const nvlist_t * const *)child, idx); 2190 fnvlist_add_nvlist(spa->spa_load_info, 2191 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2192 2193 for (uint64_t i = 0; i < idx; i++) 2194 nvlist_free(child[i]); 2195 } 2196 nvlist_free(nv); 2197 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2198 2199 if (idx > 0) { 2200 spa_load_failed(spa, "some log devices are missing"); 2201 vdev_dbgmsg_print_tree(rvd, 2); 2202 return (SET_ERROR(ENXIO)); 2203 } 2204 } else { 2205 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2206 vdev_t *tvd = rvd->vdev_child[c]; 2207 2208 if (tvd->vdev_islog && 2209 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2210 spa_set_log_state(spa, SPA_LOG_CLEAR); 2211 spa_load_note(spa, "some log devices are " 2212 "missing, ZIL is dropped."); 2213 vdev_dbgmsg_print_tree(rvd, 2); 2214 break; 2215 } 2216 } 2217 } 2218 2219 return (0); 2220 } 2221 2222 /* 2223 * Check for missing log devices 2224 */ 2225 static boolean_t 2226 spa_check_logs(spa_t *spa) 2227 { 2228 boolean_t rv = B_FALSE; 2229 dsl_pool_t *dp = spa_get_dsl(spa); 2230 2231 switch (spa->spa_log_state) { 2232 default: 2233 break; 2234 case SPA_LOG_MISSING: 2235 /* need to recheck in case slog has been restored */ 2236 case SPA_LOG_UNKNOWN: 2237 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2238 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2239 if (rv) 2240 spa_set_log_state(spa, SPA_LOG_MISSING); 2241 break; 2242 } 2243 return (rv); 2244 } 2245 2246 /* 2247 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2248 */ 2249 static boolean_t 2250 spa_passivate_log(spa_t *spa) 2251 { 2252 vdev_t *rvd = spa->spa_root_vdev; 2253 boolean_t slog_found = B_FALSE; 2254 2255 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2256 2257 for (int c = 0; c < rvd->vdev_children; c++) { 2258 vdev_t *tvd = rvd->vdev_child[c]; 2259 2260 if (tvd->vdev_islog) { 2261 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2262 metaslab_group_passivate(tvd->vdev_mg); 2263 slog_found = B_TRUE; 2264 } 2265 } 2266 2267 return (slog_found); 2268 } 2269 2270 /* 2271 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
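 *
 * A minimal usage sketch (illustrative only, not copied from a real
 * caller; SCL_ALLOC must be held as writer for both calls):
 *
 *	if (spa_passivate_log(spa)) {
 *		... work that must not allocate from log vdevs ...
 *	}
 *	spa_activate_log(spa);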
2272 */ 2273 static void 2274 spa_activate_log(spa_t *spa) 2275 { 2276 vdev_t *rvd = spa->spa_root_vdev; 2277 2278 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2279 2280 for (int c = 0; c < rvd->vdev_children; c++) { 2281 vdev_t *tvd = rvd->vdev_child[c]; 2282 2283 if (tvd->vdev_islog) { 2284 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2285 metaslab_group_activate(tvd->vdev_mg); 2286 } 2287 } 2288 } 2289 2290 int 2291 spa_reset_logs(spa_t *spa) 2292 { 2293 int error; 2294 2295 error = dmu_objset_find(spa_name(spa), zil_reset, 2296 NULL, DS_FIND_CHILDREN); 2297 if (error == 0) { 2298 /* 2299 * We successfully offlined the log device, sync out the 2300 * current txg so that the "stubby" block can be removed 2301 * by zil_sync(). 2302 */ 2303 txg_wait_synced(spa->spa_dsl_pool, 0); 2304 } 2305 return (error); 2306 } 2307 2308 static void 2309 spa_aux_check_removed(spa_aux_vdev_t *sav) 2310 { 2311 for (int i = 0; i < sav->sav_count; i++) 2312 spa_check_removed(sav->sav_vdevs[i]); 2313 } 2314 2315 void 2316 spa_claim_notify(zio_t *zio) 2317 { 2318 spa_t *spa = zio->io_spa; 2319 2320 if (zio->io_error) 2321 return; 2322 2323 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2324 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2325 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2326 mutex_exit(&spa->spa_props_lock); 2327 } 2328 2329 typedef struct spa_load_error { 2330 boolean_t sle_verify_data; 2331 uint64_t sle_meta_count; 2332 uint64_t sle_data_count; 2333 } spa_load_error_t; 2334 2335 static void 2336 spa_load_verify_done(zio_t *zio) 2337 { 2338 blkptr_t *bp = zio->io_bp; 2339 spa_load_error_t *sle = zio->io_private; 2340 dmu_object_type_t type = BP_GET_TYPE(bp); 2341 int error = zio->io_error; 2342 spa_t *spa = zio->io_spa; 2343 2344 abd_free(zio->io_abd); 2345 if (error) { 2346 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2347 type != DMU_OT_INTENT_LOG) 2348 atomic_inc_64(&sle->sle_meta_count); 2349 else 2350 atomic_inc_64(&sle->sle_data_count); 2351 } 2352 2353 mutex_enter(&spa->spa_scrub_lock); 2354 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2355 cv_broadcast(&spa->spa_scrub_io_cv); 2356 mutex_exit(&spa->spa_scrub_lock); 2357 } 2358 2359 /* 2360 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2361 * By default, we set it to 1/16th of the arc. 2362 */ 2363 static uint_t spa_load_verify_shift = 4; 2364 static int spa_load_verify_metadata = B_TRUE; 2365 static int spa_load_verify_data = B_TRUE; 2366 2367 static int 2368 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2369 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2370 { 2371 zio_t *rio = arg; 2372 spa_load_error_t *sle = rio->io_private; 2373 2374 (void) zilog, (void) dnp; 2375 2376 /* 2377 * Note: normally this routine will not be called if 2378 * spa_load_verify_metadata is not set. However, it may be useful 2379 * to manually set the flag after the traversal has begun. 2380 */ 2381 if (!spa_load_verify_metadata) 2382 return (0); 2383 2384 /* 2385 * Sanity check the block pointer in order to detect obvious damage 2386 * before using the contents in subsequent checks or in zio_read(). 2387 * When damaged consider it to be a metadata error since we cannot 2388 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
2389 */ 2390 if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { 2391 atomic_inc_64(&sle->sle_meta_count); 2392 return (0); 2393 } 2394 2395 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2396 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2397 return (0); 2398 2399 if (!BP_IS_METADATA(bp) && 2400 (!spa_load_verify_data || !sle->sle_verify_data)) 2401 return (0); 2402 2403 uint64_t maxinflight_bytes = 2404 arc_target_bytes() >> spa_load_verify_shift; 2405 size_t size = BP_GET_PSIZE(bp); 2406 2407 mutex_enter(&spa->spa_scrub_lock); 2408 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2409 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2410 spa->spa_load_verify_bytes += size; 2411 mutex_exit(&spa->spa_scrub_lock); 2412 2413 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2414 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2415 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2416 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2417 return (0); 2418 } 2419 2420 static int 2421 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2422 { 2423 (void) dp, (void) arg; 2424 2425 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2426 return (SET_ERROR(ENAMETOOLONG)); 2427 2428 return (0); 2429 } 2430 2431 static int 2432 spa_load_verify(spa_t *spa) 2433 { 2434 zio_t *rio; 2435 spa_load_error_t sle = { 0 }; 2436 zpool_load_policy_t policy; 2437 boolean_t verify_ok = B_FALSE; 2438 int error = 0; 2439 2440 zpool_get_load_policy(spa->spa_config, &policy); 2441 2442 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2443 policy.zlp_maxmeta == UINT64_MAX) 2444 return (0); 2445 2446 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2447 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2448 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2449 DS_FIND_CHILDREN); 2450 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2451 if (error != 0) 2452 return (error); 2453 2454 /* 2455 * Verify data only if we are rewinding or an error limit was set. 2456 * Otherwise nothing except dbgmsg cares about it, so don't waste the time. 2457 */ 2458 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2459 (policy.zlp_maxdata < UINT64_MAX); 2460 2461 rio = zio_root(spa, NULL, &sle, 2462 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2463 2464 if (spa_load_verify_metadata) { 2465 if (spa->spa_extreme_rewind) { 2466 spa_load_note(spa, "performing a complete scan of the " 2467 "pool since extreme rewind is on. 
This may take " 2468 "a very long time.\n (spa_load_verify_data=%u, " 2469 "spa_load_verify_metadata=%u)", 2470 spa_load_verify_data, spa_load_verify_metadata); 2471 } 2472 2473 error = traverse_pool(spa, spa->spa_verify_min_txg, 2474 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2475 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2476 } 2477 2478 (void) zio_wait(rio); 2479 ASSERT0(spa->spa_load_verify_bytes); 2480 2481 spa->spa_load_meta_errors = sle.sle_meta_count; 2482 spa->spa_load_data_errors = sle.sle_data_count; 2483 2484 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2485 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2486 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2487 (u_longlong_t)sle.sle_data_count); 2488 } 2489 2490 if (spa_load_verify_dryrun || 2491 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2492 sle.sle_data_count <= policy.zlp_maxdata)) { 2493 int64_t loss = 0; 2494 2495 verify_ok = B_TRUE; 2496 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2497 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2498 2499 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2500 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2501 spa->spa_load_txg_ts); 2502 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2503 loss); 2504 fnvlist_add_uint64(spa->spa_load_info, 2505 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2506 fnvlist_add_uint64(spa->spa_load_info, 2507 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2508 } else { 2509 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2510 } 2511 2512 if (spa_load_verify_dryrun) 2513 return (0); 2514 2515 if (error) { 2516 if (error != ENXIO && error != EIO) 2517 error = SET_ERROR(EIO); 2518 return (error); 2519 } 2520 2521 return (verify_ok ? 0 : EIO); 2522 } 2523 2524 /* 2525 * Find a value in the pool props object. 2526 */ 2527 static void 2528 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2529 { 2530 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2531 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2532 } 2533 2534 /* 2535 * Find a value in the pool directory object. 
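 *
 * Typical usage, as seen later in this file (illustrative):
 *
 *	if (spa_dir_prop(spa, DMU_POOL_CONFIG,
 *	    &spa->spa_config_object, B_TRUE) != 0)
 *		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));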
2536 */ 2537 static int 2538 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2539 { 2540 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2541 name, sizeof (uint64_t), 1, val); 2542 2543 if (error != 0 && (error != ENOENT || log_enoent)) { 2544 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2545 "[error=%d]", name, error); 2546 } 2547 2548 return (error); 2549 } 2550 2551 static int 2552 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2553 { 2554 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2555 return (SET_ERROR(err)); 2556 } 2557 2558 boolean_t 2559 spa_livelist_delete_check(spa_t *spa) 2560 { 2561 return (spa->spa_livelists_to_delete != 0); 2562 } 2563 2564 static boolean_t 2565 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2566 { 2567 (void) z; 2568 spa_t *spa = arg; 2569 return (spa_livelist_delete_check(spa)); 2570 } 2571 2572 static int 2573 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2574 { 2575 spa_t *spa = arg; 2576 zio_free(spa, tx->tx_txg, bp); 2577 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2578 -bp_get_dsize_sync(spa, bp), 2579 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2580 return (0); 2581 } 2582 2583 static int 2584 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2585 { 2586 int err; 2587 zap_cursor_t zc; 2588 zap_attribute_t za; 2589 zap_cursor_init(&zc, os, zap_obj); 2590 err = zap_cursor_retrieve(&zc, &za); 2591 zap_cursor_fini(&zc); 2592 if (err == 0) 2593 *llp = za.za_first_integer; 2594 return (err); 2595 } 2596 2597 /* 2598 * Components of livelist deletion that must be performed in syncing 2599 * context: freeing block pointers and updating the pool-wide data 2600 * structures to indicate how much work is left to do 2601 */ 2602 typedef struct sublist_delete_arg { 2603 spa_t *spa; 2604 dsl_deadlist_t *ll; 2605 uint64_t key; 2606 bplist_t *to_free; 2607 } sublist_delete_arg_t; 2608 2609 static void 2610 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2611 { 2612 sublist_delete_arg_t *sda = arg; 2613 spa_t *spa = sda->spa; 2614 dsl_deadlist_t *ll = sda->ll; 2615 uint64_t key = sda->key; 2616 bplist_t *to_free = sda->to_free; 2617 2618 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2619 dsl_deadlist_remove_entry(ll, key, tx); 2620 } 2621 2622 typedef struct livelist_delete_arg { 2623 spa_t *spa; 2624 uint64_t ll_obj; 2625 uint64_t zap_obj; 2626 } livelist_delete_arg_t; 2627 2628 static void 2629 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2630 { 2631 livelist_delete_arg_t *lda = arg; 2632 spa_t *spa = lda->spa; 2633 uint64_t ll_obj = lda->ll_obj; 2634 uint64_t zap_obj = lda->zap_obj; 2635 objset_t *mos = spa->spa_meta_objset; 2636 uint64_t count; 2637 2638 /* free the livelist and decrement the feature count */ 2639 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2640 dsl_deadlist_free(mos, ll_obj, tx); 2641 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2642 VERIFY0(zap_count(mos, zap_obj, &count)); 2643 if (count == 0) { 2644 /* no more livelists to delete */ 2645 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2646 DMU_POOL_DELETED_CLONES, tx)); 2647 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2648 spa->spa_livelists_to_delete = 0; 2649 spa_notify_waiters(spa); 2650 } 2651 } 2652 2653 /* 2654 * Load in the value for the livelist to be removed and open it. Then, 2655 * load its first sublist and determine which block pointers should actually 2656 * be freed. 
Then, call a synctask which performs the actual frees and updates 2657 * the pool-wide livelist data. 2658 */ 2659 static void 2660 spa_livelist_delete_cb(void *arg, zthr_t *z) 2661 { 2662 spa_t *spa = arg; 2663 uint64_t ll_obj = 0, count; 2664 objset_t *mos = spa->spa_meta_objset; 2665 uint64_t zap_obj = spa->spa_livelists_to_delete; 2666 /* 2667 * Determine the next livelist to delete. This function should only 2668 * be called if there is at least one deleted clone. 2669 */ 2670 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 2671 VERIFY0(zap_count(mos, ll_obj, &count)); 2672 if (count > 0) { 2673 dsl_deadlist_t *ll; 2674 dsl_deadlist_entry_t *dle; 2675 bplist_t to_free; 2676 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 2677 dsl_deadlist_open(ll, mos, ll_obj); 2678 dle = dsl_deadlist_first(ll); 2679 ASSERT3P(dle, !=, NULL); 2680 bplist_create(&to_free); 2681 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 2682 z, NULL); 2683 if (err == 0) { 2684 sublist_delete_arg_t sync_arg = { 2685 .spa = spa, 2686 .ll = ll, 2687 .key = dle->dle_mintxg, 2688 .to_free = &to_free 2689 }; 2690 zfs_dbgmsg("deleting sublist (id %llu) from" 2691 " livelist %llu, %lld remaining", 2692 (u_longlong_t)dle->dle_bpobj.bpo_object, 2693 (u_longlong_t)ll_obj, (longlong_t)count - 1); 2694 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 2695 sublist_delete_sync, &sync_arg, 0, 2696 ZFS_SPACE_CHECK_DESTROY)); 2697 } else { 2698 VERIFY3U(err, ==, EINTR); 2699 } 2700 bplist_clear(&to_free); 2701 bplist_destroy(&to_free); 2702 dsl_deadlist_close(ll); 2703 kmem_free(ll, sizeof (dsl_deadlist_t)); 2704 } else { 2705 livelist_delete_arg_t sync_arg = { 2706 .spa = spa, 2707 .ll_obj = ll_obj, 2708 .zap_obj = zap_obj 2709 }; 2710 zfs_dbgmsg("deletion of livelist %llu completed", 2711 (u_longlong_t)ll_obj); 2712 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 2713 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 2714 } 2715 } 2716 2717 static void 2718 spa_start_livelist_destroy_thread(spa_t *spa) 2719 { 2720 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 2721 spa->spa_livelist_delete_zthr = 2722 zthr_create("z_livelist_destroy", 2723 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 2724 minclsyspri); 2725 } 2726 2727 typedef struct livelist_new_arg { 2728 bplist_t *allocs; 2729 bplist_t *frees; 2730 } livelist_new_arg_t; 2731 2732 static int 2733 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 2734 dmu_tx_t *tx) 2735 { 2736 ASSERT(tx == NULL); 2737 livelist_new_arg_t *lna = arg; 2738 if (bp_freed) { 2739 bplist_append(lna->frees, bp); 2740 } else { 2741 bplist_append(lna->allocs, bp); 2742 zfs_livelist_condense_new_alloc++; 2743 } 2744 return (0); 2745 } 2746 2747 typedef struct livelist_condense_arg { 2748 spa_t *spa; 2749 bplist_t to_keep; 2750 uint64_t first_size; 2751 uint64_t next_size; 2752 } livelist_condense_arg_t; 2753 2754 static void 2755 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 2756 { 2757 livelist_condense_arg_t *lca = arg; 2758 spa_t *spa = lca->spa; 2759 bplist_t new_frees; 2760 dsl_dataset_t *ds = spa->spa_to_condense.ds; 2761 2762 /* Have we been cancelled? 
*/ 2763 if (spa->spa_to_condense.cancelled) { 2764 zfs_livelist_condense_sync_cancel++; 2765 goto out; 2766 } 2767 2768 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2769 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2770 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 2771 2772 /* 2773 * It's possible that the livelist was changed while the zthr was 2774 * running. Therefore, we need to check for new blkptrs in the two 2775 * entries being condensed and continue to track them in the livelist. 2776 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 2777 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 2778 * we need to sort them into two different bplists. 2779 */ 2780 uint64_t first_obj = first->dle_bpobj.bpo_object; 2781 uint64_t next_obj = next->dle_bpobj.bpo_object; 2782 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2783 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2784 2785 bplist_create(&new_frees); 2786 livelist_new_arg_t new_bps = { 2787 .allocs = &lca->to_keep, 2788 .frees = &new_frees, 2789 }; 2790 2791 if (cur_first_size > lca->first_size) { 2792 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 2793 livelist_track_new_cb, &new_bps, lca->first_size)); 2794 } 2795 if (cur_next_size > lca->next_size) { 2796 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 2797 livelist_track_new_cb, &new_bps, lca->next_size)); 2798 } 2799 2800 dsl_deadlist_clear_entry(first, ll, tx); 2801 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 2802 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 2803 2804 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 2805 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 2806 bplist_destroy(&new_frees); 2807 2808 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 2809 dsl_dataset_name(ds, dsname); 2810 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 2811 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 2812 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 2813 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 2814 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 2815 (u_longlong_t)cur_next_size, 2816 (u_longlong_t)first->dle_bpobj.bpo_object, 2817 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 2818 out: 2819 dmu_buf_rele(ds->ds_dbuf, spa); 2820 spa->spa_to_condense.ds = NULL; 2821 bplist_clear(&lca->to_keep); 2822 bplist_destroy(&lca->to_keep); 2823 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2824 spa->spa_to_condense.syncing = B_FALSE; 2825 } 2826 2827 static void 2828 spa_livelist_condense_cb(void *arg, zthr_t *t) 2829 { 2830 while (zfs_livelist_condense_zthr_pause && 2831 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2832 delay(1); 2833 2834 spa_t *spa = arg; 2835 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2836 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2837 uint64_t first_size, next_size; 2838 2839 livelist_condense_arg_t *lca = 2840 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 2841 bplist_create(&lca->to_keep); 2842 2843 /* 2844 * Process the livelists (matching FREEs and ALLOCs) in open context 2845 * so we have minimal work in syncing context to condense. 2846 * 2847 * We save bpobj sizes (first_size and next_size) to use later in 2848 * syncing context to determine if entries were added to these sublists 2849 * while in open context. 
This is possible because the clone is still 2850 * active and open for normal writes and we want to make sure the new, 2851 * unprocessed block pointers are inserted into the livelist normally. 2852 * 2853 * Note that dsl_process_sub_livelist() both stores the number of 2854 * block pointers and iterates over them while the bpobj's lock is held, so 2855 * the sizes returned to us are consistent with what was actually 2856 * processed. 2857 */ 2858 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 2859 &first_size); 2860 if (err == 0) 2861 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 2862 t, &next_size); 2863 2864 if (err == 0) { 2865 while (zfs_livelist_condense_sync_pause && 2866 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2867 delay(1); 2868 2869 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2870 dmu_tx_mark_netfree(tx); 2871 dmu_tx_hold_space(tx, 1); 2872 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 2873 if (err == 0) { 2874 /* 2875 * Prevent the condense zthr from restarting before 2876 * the synctask completes. 2877 */ 2878 spa->spa_to_condense.syncing = B_TRUE; 2879 lca->spa = spa; 2880 lca->first_size = first_size; 2881 lca->next_size = next_size; 2882 dsl_sync_task_nowait(spa_get_dsl(spa), 2883 spa_livelist_condense_sync, lca, tx); 2884 dmu_tx_commit(tx); 2885 return; 2886 } 2887 } 2888 /* 2889 * Condensing cannot continue: either it was externally stopped or 2890 * we were unable to assign to a tx because the pool has run out of 2891 * space. In the second case, we'll just end up trying to condense 2892 * again in a later txg. 2893 */ 2894 ASSERT(err != 0); 2895 bplist_clear(&lca->to_keep); 2896 bplist_destroy(&lca->to_keep); 2897 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2898 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 2899 spa->spa_to_condense.ds = NULL; 2900 if (err == EINTR) 2901 zfs_livelist_condense_zthr_cancel++; 2902 } 2903 2904 /* 2905 * Check that there is something to condense but that a condense is not 2906 * already in progress and that condensing has not been cancelled. 
2907 */ 2908 static boolean_t 2909 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 2910 { 2911 (void) z; 2912 spa_t *spa = arg; 2913 if ((spa->spa_to_condense.ds != NULL) && 2914 (spa->spa_to_condense.syncing == B_FALSE) && 2915 (spa->spa_to_condense.cancelled == B_FALSE)) { 2916 return (B_TRUE); 2917 } 2918 return (B_FALSE); 2919 } 2920 2921 static void 2922 spa_start_livelist_condensing_thread(spa_t *spa) 2923 { 2924 spa->spa_to_condense.ds = NULL; 2925 spa->spa_to_condense.first = NULL; 2926 spa->spa_to_condense.next = NULL; 2927 spa->spa_to_condense.syncing = B_FALSE; 2928 spa->spa_to_condense.cancelled = B_FALSE; 2929 2930 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 2931 spa->spa_livelist_condense_zthr = 2932 zthr_create("z_livelist_condense", 2933 spa_livelist_condense_cb_check, 2934 spa_livelist_condense_cb, spa, minclsyspri); 2935 } 2936 2937 static void 2938 spa_spawn_aux_threads(spa_t *spa) 2939 { 2940 ASSERT(spa_writeable(spa)); 2941 2942 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2943 2944 spa_start_indirect_condensing_thread(spa); 2945 spa_start_livelist_destroy_thread(spa); 2946 spa_start_livelist_condensing_thread(spa); 2947 2948 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2949 spa->spa_checkpoint_discard_zthr = 2950 zthr_create("z_checkpoint_discard", 2951 spa_checkpoint_discard_thread_check, 2952 spa_checkpoint_discard_thread, spa, minclsyspri); 2953 } 2954 2955 /* 2956 * Fix up config after a partly-completed split. This is done with the 2957 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2958 * pool have that entry in their config, but only the splitting one contains 2959 * a list of all the guids of the vdevs that are being split off. 2960 * 2961 * This function determines what to do with that list: either rejoin 2962 * all the disks to the pool, or complete the splitting process. To attempt 2963 * the rejoin, each disk that is offlined is marked online again, and 2964 * we do a reopen() call. If the vdev label for every disk that was 2965 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2966 * then we call vdev_split() on each disk, and complete the split. 2967 * 2968 * Otherwise we leave the config alone, with all the vdevs in place in 2969 * the original pool. 2970 */ 2971 static void 2972 spa_try_repair(spa_t *spa, nvlist_t *config) 2973 { 2974 uint_t extracted; 2975 uint64_t *glist; 2976 uint_t i, gcount; 2977 nvlist_t *nvl; 2978 vdev_t **vd; 2979 boolean_t attempt_reopen; 2980 2981 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2982 return; 2983 2984 /* check that the config is complete */ 2985 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2986 &glist, &gcount) != 0) 2987 return; 2988 2989 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2990 2991 /* attempt to online all the vdevs & validate */ 2992 attempt_reopen = B_TRUE; 2993 for (i = 0; i < gcount; i++) { 2994 if (glist[i] == 0) /* vdev is hole */ 2995 continue; 2996 2997 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2998 if (vd[i] == NULL) { 2999 /* 3000 * Don't bother attempting to reopen the disks; 3001 * just do the split. 
3002 */ 3003 attempt_reopen = B_FALSE; 3004 } else { 3005 /* attempt to re-online it */ 3006 vd[i]->vdev_offline = B_FALSE; 3007 } 3008 } 3009 3010 if (attempt_reopen) { 3011 vdev_reopen(spa->spa_root_vdev); 3012 3013 /* check each device to see what state it's in */ 3014 for (extracted = 0, i = 0; i < gcount; i++) { 3015 if (vd[i] != NULL && 3016 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3017 break; 3018 ++extracted; 3019 } 3020 } 3021 3022 /* 3023 * If every disk has been moved to the new pool, or if we never 3024 * even attempted to look at them, then we split them off for 3025 * good. 3026 */ 3027 if (!attempt_reopen || gcount == extracted) { 3028 for (i = 0; i < gcount; i++) 3029 if (vd[i] != NULL) 3030 vdev_split(vd[i]); 3031 vdev_reopen(spa->spa_root_vdev); 3032 } 3033 3034 kmem_free(vd, gcount * sizeof (vdev_t *)); 3035 } 3036 3037 static int 3038 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3039 { 3040 const char *ereport = FM_EREPORT_ZFS_POOL; 3041 int error; 3042 3043 spa->spa_load_state = state; 3044 (void) spa_import_progress_set_state(spa_guid(spa), 3045 spa_load_state(spa)); 3046 3047 gethrestime(&spa->spa_loaded_ts); 3048 error = spa_load_impl(spa, type, &ereport); 3049 3050 /* 3051 * Don't count references from objsets that are already closed 3052 * and are making their way through the eviction process. 3053 */ 3054 spa_evicting_os_wait(spa); 3055 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3056 if (error) { 3057 if (error != EEXIST) { 3058 spa->spa_loaded_ts.tv_sec = 0; 3059 spa->spa_loaded_ts.tv_nsec = 0; 3060 } 3061 if (error != EBADF) { 3062 (void) zfs_ereport_post(ereport, spa, 3063 NULL, NULL, NULL, 0); 3064 } 3065 } 3066 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3067 spa->spa_ena = 0; 3068 3069 (void) spa_import_progress_set_state(spa_guid(spa), 3070 spa_load_state(spa)); 3071 3072 return (error); 3073 } 3074 3075 #ifdef ZFS_DEBUG 3076 /* 3077 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3078 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3079 * spa's per-vdev ZAP list. 3080 */ 3081 static uint64_t 3082 vdev_count_verify_zaps(vdev_t *vd) 3083 { 3084 spa_t *spa = vd->vdev_spa; 3085 uint64_t total = 0; 3086 3087 if (vd->vdev_top_zap != 0) { 3088 total++; 3089 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3090 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3091 } 3092 if (vd->vdev_leaf_zap != 0) { 3093 total++; 3094 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3095 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3096 } 3097 3098 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3099 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3100 } 3101 3102 return (total); 3103 } 3104 #else 3105 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3106 #endif 3107 3108 /* 3109 * Determine whether the activity check is required. 
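 *
 * In outline, mirroring the checks below, the activity check is
 * skipped when:
 *	- ZFS_IMPORT_SKIP_MMP is set (e.g. by zdb),
 *	- the uberblock shows MMP is not in use (ub_mmp_delay == 0),
 *	- an earlier tryimport already examined this same uberblock,
 *	- the hostid in the label matches this host, or
 *	- the pool state is not ACTIVE (it was cleanly exported).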
3110 */ 3111 static boolean_t 3112 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3113 nvlist_t *config) 3114 { 3115 uint64_t state = 0; 3116 uint64_t hostid = 0; 3117 uint64_t tryconfig_txg = 0; 3118 uint64_t tryconfig_timestamp = 0; 3119 uint16_t tryconfig_mmp_seq = 0; 3120 nvlist_t *nvinfo; 3121 3122 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3123 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3124 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3125 &tryconfig_txg); 3126 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3127 &tryconfig_timestamp); 3128 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3129 &tryconfig_mmp_seq); 3130 } 3131 3132 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3133 3134 /* 3135 * Disable the MMP activity check - This is used by zdb which 3136 * is intended to be used on potentially active pools. 3137 */ 3138 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3139 return (B_FALSE); 3140 3141 /* 3142 * Skip the activity check when the MMP feature is disabled. 3143 */ 3144 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3145 return (B_FALSE); 3146 3147 /* 3148 * If the tryconfig_ values are nonzero, they are the results of an 3149 * earlier tryimport. If they all match the uberblock we just found, 3150 * then the pool has not changed and we return false so we do not test 3151 * a second time. 3152 */ 3153 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3154 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3155 tryconfig_mmp_seq && tryconfig_mmp_seq == 3156 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3157 return (B_FALSE); 3158 3159 /* 3160 * Allow the activity check to be skipped when importing the pool 3161 * on the same host which last imported it. Since the hostid from 3162 * configuration may be stale use the one read from the label. 3163 */ 3164 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3165 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3166 3167 if (hostid == spa_get_hostid(spa)) 3168 return (B_FALSE); 3169 3170 /* 3171 * Skip the activity test when the pool was cleanly exported. 3172 */ 3173 if (state != POOL_STATE_ACTIVE) 3174 return (B_FALSE); 3175 3176 return (B_TRUE); 3177 } 3178 3179 /* 3180 * Nanoseconds the activity check must watch for changes on-disk. 3181 */ 3182 static uint64_t 3183 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3184 { 3185 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3186 uint64_t multihost_interval = MSEC2NSEC( 3187 MMP_INTERVAL_OK(zfs_multihost_interval)); 3188 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3189 multihost_interval); 3190 3191 /* 3192 * Local tunables determine a minimum duration except for the case 3193 * where we know when the remote host will suspend the pool if MMP 3194 * writes do not land. 3195 * 3196 * See Big Theory comment at the top of mmp.c for the reasoning behind 3197 * these cases and times. 
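 *
 * Worked example (the values here are inputs for illustration, not
 * asserted defaults): with zfs_multihost_import_intervals = 20 and
 * zfs_multihost_interval = 1000 ms, the baseline computed above is
 * MAX(1 s, 20 * 1 s) = 20 s, before the MMP_FAIL_INT/MMP_INTERVAL
 * adjustments below and the 0-25% random factor added by the caller.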
3198 */ 3199 3200 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3201 3202 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3203 MMP_FAIL_INT(ub) > 0) { 3204 3205 /* MMP on remote host will suspend pool after failed writes */ 3206 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3207 MMP_IMPORT_SAFETY_FACTOR / 100; 3208 3209 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3210 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3211 "import_intervals=%llu", (u_longlong_t)import_delay, 3212 (u_longlong_t)MMP_FAIL_INT(ub), 3213 (u_longlong_t)MMP_INTERVAL(ub), 3214 (u_longlong_t)import_intervals); 3215 3216 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3217 MMP_FAIL_INT(ub) == 0) { 3218 3219 /* MMP on remote host will never suspend pool */ 3220 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3221 ub->ub_mmp_delay) * import_intervals); 3222 3223 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3224 "mmp_interval=%llu ub_mmp_delay=%llu " 3225 "import_intervals=%llu", (u_longlong_t)import_delay, 3226 (u_longlong_t)MMP_INTERVAL(ub), 3227 (u_longlong_t)ub->ub_mmp_delay, 3228 (u_longlong_t)import_intervals); 3229 3230 } else if (MMP_VALID(ub)) { 3231 /* 3232 * zfs-0.7 compatibility case 3233 */ 3234 3235 import_delay = MAX(import_delay, (multihost_interval + 3236 ub->ub_mmp_delay) * import_intervals); 3237 3238 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3239 "import_intervals=%llu leaves=%u", 3240 (u_longlong_t)import_delay, 3241 (u_longlong_t)ub->ub_mmp_delay, 3242 (u_longlong_t)import_intervals, 3243 vdev_count_leaves(spa)); 3244 } else { 3245 /* Using local tunings is the only reasonable option */ 3246 zfs_dbgmsg("pool last imported on non-MMP aware " 3247 "host using import_delay=%llu multihost_interval=%llu " 3248 "import_intervals=%llu", (u_longlong_t)import_delay, 3249 (u_longlong_t)multihost_interval, 3250 (u_longlong_t)import_intervals); 3251 } 3252 3253 return (import_delay); 3254 } 3255 3256 /* 3257 * Perform the import activity check. If the user canceled the import or 3258 * we detected activity then fail. 3259 */ 3260 static int 3261 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3262 { 3263 uint64_t txg = ub->ub_txg; 3264 uint64_t timestamp = ub->ub_timestamp; 3265 uint64_t mmp_config = ub->ub_mmp_config; 3266 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3267 uint64_t import_delay; 3268 hrtime_t import_expire; 3269 nvlist_t *mmp_label = NULL; 3270 vdev_t *rvd = spa->spa_root_vdev; 3271 kcondvar_t cv; 3272 kmutex_t mtx; 3273 int error = 0; 3274 3275 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3276 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3277 mutex_enter(&mtx); 3278 3279 /* 3280 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3281 * during the earlier tryimport. If the txg recorded there is 0 then 3282 * the pool is known to be active on another host. 3283 * 3284 * Otherwise, the pool might be in use on another host. Check for 3285 * changes in the uberblocks on disk if necessary. 
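 *
 * In the latter case, the loop below re-reads the best uberblock
 * roughly once per second (cv_timedwait_sig() with a one-second
 * timeout) until import_expire, and fails with EREMOTEIO as soon as
 * the txg, timestamp or MMP sequence number changes.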
3286 */ 3287 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3288 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3289 ZPOOL_CONFIG_LOAD_INFO); 3290 3291 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3292 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3293 vdev_uberblock_load(rvd, ub, &mmp_label); 3294 error = SET_ERROR(EREMOTEIO); 3295 goto out; 3296 } 3297 } 3298 3299 import_delay = spa_activity_check_duration(spa, ub); 3300 3301 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3302 import_delay += import_delay * random_in_range(250) / 1000; 3303 3304 import_expire = gethrtime() + import_delay; 3305 3306 while (gethrtime() < import_expire) { 3307 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3308 NSEC2SEC(import_expire - gethrtime())); 3309 3310 vdev_uberblock_load(rvd, ub, &mmp_label); 3311 3312 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3313 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3314 zfs_dbgmsg("multihost activity detected " 3315 "txg %llu ub_txg %llu " 3316 "timestamp %llu ub_timestamp %llu " 3317 "mmp_config %#llx ub_mmp_config %#llx", 3318 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3319 (u_longlong_t)timestamp, 3320 (u_longlong_t)ub->ub_timestamp, 3321 (u_longlong_t)mmp_config, 3322 (u_longlong_t)ub->ub_mmp_config); 3323 3324 error = SET_ERROR(EREMOTEIO); 3325 break; 3326 } 3327 3328 if (mmp_label) { 3329 nvlist_free(mmp_label); 3330 mmp_label = NULL; 3331 } 3332 3333 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3334 if (error != -1) { 3335 error = SET_ERROR(EINTR); 3336 break; 3337 } 3338 error = 0; 3339 } 3340 3341 out: 3342 mutex_exit(&mtx); 3343 mutex_destroy(&mtx); 3344 cv_destroy(&cv); 3345 3346 /* 3347 * If the pool is determined to be active store the status in the 3348 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3349 * available from configuration read from disk store them as well. 3350 * This allows 'zpool import' to generate a more useful message. 
3351 * 3352 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3353 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3354 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3355 */ 3356 if (error == EREMOTEIO) { 3357 const char *hostname = "<unknown>"; 3358 uint64_t hostid = 0; 3359 3360 if (mmp_label) { 3361 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3362 hostname = fnvlist_lookup_string(mmp_label, 3363 ZPOOL_CONFIG_HOSTNAME); 3364 fnvlist_add_string(spa->spa_load_info, 3365 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3366 } 3367 3368 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3369 hostid = fnvlist_lookup_uint64(mmp_label, 3370 ZPOOL_CONFIG_HOSTID); 3371 fnvlist_add_uint64(spa->spa_load_info, 3372 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3373 } 3374 } 3375 3376 fnvlist_add_uint64(spa->spa_load_info, 3377 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3378 fnvlist_add_uint64(spa->spa_load_info, 3379 ZPOOL_CONFIG_MMP_TXG, 0); 3380 3381 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3382 } 3383 3384 if (mmp_label) 3385 nvlist_free(mmp_label); 3386 3387 return (error); 3388 } 3389 3390 static int 3391 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3392 { 3393 uint64_t hostid; 3394 const char *hostname; 3395 uint64_t myhostid = 0; 3396 3397 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3398 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3399 hostname = fnvlist_lookup_string(mos_config, 3400 ZPOOL_CONFIG_HOSTNAME); 3401 3402 myhostid = zone_get_hostid(NULL); 3403 3404 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3405 cmn_err(CE_WARN, "pool '%s' could not be " 3406 "loaded as it was last accessed by " 3407 "another system (host: %s hostid: 0x%llx). " 3408 "See: https://openzfs.github.io/openzfs-docs/msg/" 3409 "ZFS-8000-EY", 3410 spa_name(spa), hostname, (u_longlong_t)hostid); 3411 spa_load_failed(spa, "hostid verification failed: pool " 3412 "last accessed by host: %s (hostid: 0x%llx)", 3413 hostname, (u_longlong_t)hostid); 3414 return (SET_ERROR(EBADF)); 3415 } 3416 } 3417 3418 return (0); 3419 } 3420 3421 static int 3422 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3423 { 3424 int error = 0; 3425 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3426 int parse; 3427 vdev_t *rvd; 3428 uint64_t pool_guid; 3429 const char *comment; 3430 const char *compatibility; 3431 3432 /* 3433 * Versioning wasn't explicitly added to the label until later, so if 3434 * it's not present treat it as the initial version. 3435 */ 3436 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3437 &spa->spa_ubsync.ub_version) != 0) 3438 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3439 3440 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3441 spa_load_failed(spa, "invalid config provided: '%s' missing", 3442 ZPOOL_CONFIG_POOL_GUID); 3443 return (SET_ERROR(EINVAL)); 3444 } 3445 3446 /* 3447 * If we are doing an import, ensure that the pool is not already 3448 * imported by checking if its pool guid already exists in the 3449 * spa namespace. 3450 * 3451 * The only case that we allow an already imported pool to be 3452 * imported again, is when the pool is checkpointed and we want to 3453 * look at its checkpointed state from userland tools like zdb. 
3454 */ 3455 #ifdef _KERNEL 3456 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3457 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3458 spa_guid_exists(pool_guid, 0)) { 3459 #else 3460 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3461 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3462 spa_guid_exists(pool_guid, 0) && 3463 !spa_importing_readonly_checkpoint(spa)) { 3464 #endif 3465 spa_load_failed(spa, "a pool with guid %llu is already open", 3466 (u_longlong_t)pool_guid); 3467 return (SET_ERROR(EEXIST)); 3468 } 3469 3470 spa->spa_config_guid = pool_guid; 3471 3472 nvlist_free(spa->spa_load_info); 3473 spa->spa_load_info = fnvlist_alloc(); 3474 3475 ASSERT(spa->spa_comment == NULL); 3476 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3477 spa->spa_comment = spa_strdup(comment); 3478 3479 ASSERT(spa->spa_compatibility == NULL); 3480 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3481 &compatibility) == 0) 3482 spa->spa_compatibility = spa_strdup(compatibility); 3483 3484 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3485 &spa->spa_config_txg); 3486 3487 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3488 spa->spa_config_splitting = fnvlist_dup(nvl); 3489 3490 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3491 spa_load_failed(spa, "invalid config provided: '%s' missing", 3492 ZPOOL_CONFIG_VDEV_TREE); 3493 return (SET_ERROR(EINVAL)); 3494 } 3495 3496 /* 3497 * Create "The Godfather" zio to hold all async IOs 3498 */ 3499 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3500 KM_SLEEP); 3501 for (int i = 0; i < max_ncpus; i++) { 3502 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3503 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3504 ZIO_FLAG_GODFATHER); 3505 } 3506 3507 /* 3508 * Parse the configuration into a vdev tree. We explicitly set the 3509 * value that will be returned by spa_version() since parsing the 3510 * configuration requires knowing the version number. 3511 */ 3512 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3513 parse = (type == SPA_IMPORT_EXISTING ? 3514 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3515 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3516 spa_config_exit(spa, SCL_ALL, FTAG); 3517 3518 if (error != 0) { 3519 spa_load_failed(spa, "unable to parse config [error=%d]", 3520 error); 3521 return (error); 3522 } 3523 3524 ASSERT(spa->spa_root_vdev == rvd); 3525 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3526 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3527 3528 if (type != SPA_IMPORT_ASSEMBLE) { 3529 ASSERT(spa_guid(spa) == pool_guid); 3530 } 3531 3532 return (0); 3533 } 3534 3535 /* 3536 * Recursively open all vdevs in the vdev tree. This function is called twice: 3537 * first with the untrusted config, then with the trusted config. 3538 */ 3539 static int 3540 spa_ld_open_vdevs(spa_t *spa) 3541 { 3542 int error = 0; 3543 3544 /* 3545 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3546 * missing/unopenable for the root vdev to be still considered openable. 
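 *
 * The limit chosen below depends on how much we trust the config:
 * a trusted config uses zfs_max_missing_tvds, a cachefile-sourced
 * config uses zfs_max_missing_tvds_cachefile, a scan-sourced config
 * uses zfs_max_missing_tvds_scan, and anything else allows none.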
3547 */ 3548 if (spa->spa_trust_config) { 3549 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3550 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3551 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3552 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3553 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3554 } else { 3555 spa->spa_missing_tvds_allowed = 0; 3556 } 3557 3558 spa->spa_missing_tvds_allowed = 3559 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3560 3561 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3562 error = vdev_open(spa->spa_root_vdev); 3563 spa_config_exit(spa, SCL_ALL, FTAG); 3564 3565 if (spa->spa_missing_tvds != 0) { 3566 spa_load_note(spa, "vdev tree has %lld missing top-level " 3567 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3568 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3569 /* 3570 * Although theoretically we could allow users to open 3571 * incomplete pools in RW mode, we'd need to add a lot 3572 * of extra logic (e.g. adjust pool space to account 3573 * for missing vdevs). 3574 * This limitation also prevents users from accidentally 3575 * opening the pool in RW mode during data recovery and 3576 * damaging it further. 3577 */ 3578 spa_load_note(spa, "pools with missing top-level " 3579 "vdevs can only be opened in read-only mode."); 3580 error = SET_ERROR(ENXIO); 3581 } else { 3582 spa_load_note(spa, "current settings allow for maximum " 3583 "%lld missing top-level vdevs at this stage.", 3584 (u_longlong_t)spa->spa_missing_tvds_allowed); 3585 } 3586 } 3587 if (error != 0) { 3588 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3589 error); 3590 } 3591 if (spa->spa_missing_tvds != 0 || error != 0) 3592 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3593 3594 return (error); 3595 } 3596 3597 /* 3598 * We need to validate the vdev labels against the configuration that 3599 * we have in hand. This function is called twice: first with an untrusted 3600 * config, then with a trusted config. The validation is more strict when the 3601 * config is trusted. 3602 */ 3603 static int 3604 spa_ld_validate_vdevs(spa_t *spa) 3605 { 3606 int error = 0; 3607 vdev_t *rvd = spa->spa_root_vdev; 3608 3609 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3610 error = vdev_validate(rvd); 3611 spa_config_exit(spa, SCL_ALL, FTAG); 3612 3613 if (error != 0) { 3614 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3615 return (error); 3616 } 3617 3618 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3619 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3620 "some vdevs"); 3621 vdev_dbgmsg_print_tree(rvd, 2); 3622 return (SET_ERROR(ENXIO)); 3623 } 3624 3625 return (0); 3626 } 3627 3628 static void 3629 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3630 { 3631 spa->spa_state = POOL_STATE_ACTIVE; 3632 spa->spa_ubsync = spa->spa_uberblock; 3633 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 3634 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 3635 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
3636 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 3637 spa->spa_claim_max_txg = spa->spa_first_txg; 3638 spa->spa_prev_software_version = ub->ub_software_version; 3639 } 3640 3641 static int 3642 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 3643 { 3644 vdev_t *rvd = spa->spa_root_vdev; 3645 nvlist_t *label; 3646 uberblock_t *ub = &spa->spa_uberblock; 3647 boolean_t activity_check = B_FALSE; 3648 3649 /* 3650 * If we are opening the checkpointed state of the pool by 3651 * rewinding to it, at this point we will have written the 3652 * checkpointed uberblock to the vdev labels, so searching 3653 * the labels will find the right uberblock. However, if 3654 * we are opening the checkpointed state read-only, we have 3655 * not modified the labels. Therefore, we must ignore the 3656 * labels and continue using the spa_uberblock that was set 3657 * by spa_ld_checkpoint_rewind. 3658 * 3659 * Note that it would be fine to ignore the labels when 3660 * rewinding (opening writeable) as well. However, if we 3661 * crash just after writing the labels, we will end up 3662 * searching the labels. Doing so in the common case means 3663 * that this code path gets exercised normally, rather than 3664 * just in the edge case. 3665 */ 3666 if (ub->ub_checkpoint_txg != 0 && 3667 spa_importing_readonly_checkpoint(spa)) { 3668 spa_ld_select_uberblock_done(spa, ub); 3669 return (0); 3670 } 3671 3672 /* 3673 * Find the best uberblock. 3674 */ 3675 vdev_uberblock_load(rvd, ub, &label); 3676 3677 /* 3678 * If we weren't able to find a single valid uberblock, return failure. 3679 */ 3680 if (ub->ub_txg == 0) { 3681 nvlist_free(label); 3682 spa_load_failed(spa, "no valid uberblock found"); 3683 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 3684 } 3685 3686 if (spa->spa_load_max_txg != UINT64_MAX) { 3687 (void) spa_import_progress_set_max_txg(spa_guid(spa), 3688 (u_longlong_t)spa->spa_load_max_txg); 3689 } 3690 spa_load_note(spa, "using uberblock with txg=%llu", 3691 (u_longlong_t)ub->ub_txg); 3692 3693 3694 /* 3695 * For pools which have the multihost property on determine if the 3696 * pool is truly inactive and can be safely imported. Prevent 3697 * hosts which don't have a hostid set from importing the pool. 3698 */ 3699 activity_check = spa_activity_check_required(spa, ub, label, 3700 spa->spa_config); 3701 if (activity_check) { 3702 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 3703 spa_get_hostid(spa) == 0) { 3704 nvlist_free(label); 3705 fnvlist_add_uint64(spa->spa_load_info, 3706 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 3707 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 3708 } 3709 3710 int error = spa_activity_check(spa, ub, spa->spa_config); 3711 if (error) { 3712 nvlist_free(label); 3713 return (error); 3714 } 3715 3716 fnvlist_add_uint64(spa->spa_load_info, 3717 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 3718 fnvlist_add_uint64(spa->spa_load_info, 3719 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 3720 fnvlist_add_uint16(spa->spa_load_info, 3721 ZPOOL_CONFIG_MMP_SEQ, 3722 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 3723 } 3724 3725 /* 3726 * If the pool has an unsupported version we can't open it. 
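 *
 * The same idea applies to feature flags a bit further down: if the
 * label's features_for_read names a feature zfeature_is_supported()
 * does not recognize, the open fails with ENOTSUP and the offending
 * feature names are reported via ZPOOL_CONFIG_UNSUP_FEAT.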
3727 */ 3728 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 3729 nvlist_free(label); 3730 spa_load_failed(spa, "version %llu is not supported", 3731 (u_longlong_t)ub->ub_version); 3732 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 3733 } 3734 3735 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3736 nvlist_t *features; 3737 3738 /* 3739 * If we weren't able to find what's necessary for reading the 3740 * MOS in the label, return failure. 3741 */ 3742 if (label == NULL) { 3743 spa_load_failed(spa, "label config unavailable"); 3744 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3745 ENXIO)); 3746 } 3747 3748 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 3749 &features) != 0) { 3750 nvlist_free(label); 3751 spa_load_failed(spa, "invalid label: '%s' missing", 3752 ZPOOL_CONFIG_FEATURES_FOR_READ); 3753 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3754 ENXIO)); 3755 } 3756 3757 /* 3758 * Update our in-core representation with the definitive values 3759 * from the label. 3760 */ 3761 nvlist_free(spa->spa_label_features); 3762 spa->spa_label_features = fnvlist_dup(features); 3763 } 3764 3765 nvlist_free(label); 3766 3767 /* 3768 * Look through entries in the label nvlist's features_for_read. If 3769 * there is a feature listed there which we don't understand then we 3770 * cannot open a pool. 3771 */ 3772 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3773 nvlist_t *unsup_feat; 3774 3775 unsup_feat = fnvlist_alloc(); 3776 3777 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 3778 NULL); nvp != NULL; 3779 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 3780 if (!zfeature_is_supported(nvpair_name(nvp))) { 3781 fnvlist_add_string(unsup_feat, 3782 nvpair_name(nvp), ""); 3783 } 3784 } 3785 3786 if (!nvlist_empty(unsup_feat)) { 3787 fnvlist_add_nvlist(spa->spa_load_info, 3788 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3789 nvlist_free(unsup_feat); 3790 spa_load_failed(spa, "some features are unsupported"); 3791 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3792 ENOTSUP)); 3793 } 3794 3795 nvlist_free(unsup_feat); 3796 } 3797 3798 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 3799 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3800 spa_try_repair(spa, spa->spa_config); 3801 spa_config_exit(spa, SCL_ALL, FTAG); 3802 nvlist_free(spa->spa_config_splitting); 3803 spa->spa_config_splitting = NULL; 3804 } 3805 3806 /* 3807 * Initialize internal SPA structures. 
3808 */ 3809 spa_ld_select_uberblock_done(spa, ub); 3810 3811 return (0); 3812 } 3813 3814 static int 3815 spa_ld_open_rootbp(spa_t *spa) 3816 { 3817 int error = 0; 3818 vdev_t *rvd = spa->spa_root_vdev; 3819 3820 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 3821 if (error != 0) { 3822 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 3823 "[error=%d]", error); 3824 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3825 } 3826 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 3827 3828 return (0); 3829 } 3830 3831 static int 3832 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 3833 boolean_t reloading) 3834 { 3835 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 3836 nvlist_t *nv, *mos_config, *policy; 3837 int error = 0, copy_error; 3838 uint64_t healthy_tvds, healthy_tvds_mos; 3839 uint64_t mos_config_txg; 3840 3841 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 3842 != 0) 3843 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3844 3845 /* 3846 * If we're assembling a pool from a split, the config provided is 3847 * already trusted so there is nothing to do. 3848 */ 3849 if (type == SPA_IMPORT_ASSEMBLE) 3850 return (0); 3851 3852 healthy_tvds = spa_healthy_core_tvds(spa); 3853 3854 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 3855 != 0) { 3856 spa_load_failed(spa, "unable to retrieve MOS config"); 3857 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3858 } 3859 3860 /* 3861 * If we are doing an open, pool owner wasn't verified yet, thus do 3862 * the verification here. 3863 */ 3864 if (spa->spa_load_state == SPA_LOAD_OPEN) { 3865 error = spa_verify_host(spa, mos_config); 3866 if (error != 0) { 3867 nvlist_free(mos_config); 3868 return (error); 3869 } 3870 } 3871 3872 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 3873 3874 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3875 3876 /* 3877 * Build a new vdev tree from the trusted config 3878 */ 3879 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 3880 if (error != 0) { 3881 nvlist_free(mos_config); 3882 spa_config_exit(spa, SCL_ALL, FTAG); 3883 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 3884 error); 3885 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3886 } 3887 3888 /* 3889 * Vdev paths in the MOS may be obsolete. If the untrusted config was 3890 * obtained by scanning /dev/dsk, then it will have the right vdev 3891 * paths. We update the trusted MOS config with this information. 3892 * We first try to copy the paths with vdev_copy_path_strict, which 3893 * succeeds only when both configs have exactly the same vdev tree. 3894 * If that fails, we fall back to a more flexible method that has a 3895 * best effort policy. 3896 */ 3897 copy_error = vdev_copy_path_strict(rvd, mrvd); 3898 if (copy_error != 0 || spa_load_print_vdev_tree) { 3899 spa_load_note(spa, "provided vdev tree:"); 3900 vdev_dbgmsg_print_tree(rvd, 2); 3901 spa_load_note(spa, "MOS vdev tree:"); 3902 vdev_dbgmsg_print_tree(mrvd, 2); 3903 } 3904 if (copy_error != 0) { 3905 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 3906 "back to vdev_copy_path_relaxed"); 3907 vdev_copy_path_relaxed(rvd, mrvd); 3908 } 3909 3910 vdev_close(rvd); 3911 vdev_free(rvd); 3912 spa->spa_root_vdev = mrvd; 3913 rvd = mrvd; 3914 spa_config_exit(spa, SCL_ALL, FTAG); 3915 3916 /* 3917 * We will use spa_config if we decide to reload the spa or if spa_load 3918 * fails and we rewind. 
We must thus regenerate the config using the 3919 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 3920 * pass settings on how to load the pool and is not stored in the MOS. 3921 * We copy it over to our new, trusted config. 3922 */ 3923 mos_config_txg = fnvlist_lookup_uint64(mos_config, 3924 ZPOOL_CONFIG_POOL_TXG); 3925 nvlist_free(mos_config); 3926 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 3927 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 3928 &policy) == 0) 3929 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 3930 spa_config_set(spa, mos_config); 3931 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 3932 3933 /* 3934 * Now that we got the config from the MOS, we should be more strict 3935 * in checking blkptrs and can make assumptions about the consistency 3936 * of the vdev tree. spa_trust_config must be set to true before opening 3937 * vdevs in order for them to be writeable. 3938 */ 3939 spa->spa_trust_config = B_TRUE; 3940 3941 /* 3942 * Open and validate the new vdev tree 3943 */ 3944 error = spa_ld_open_vdevs(spa); 3945 if (error != 0) 3946 return (error); 3947 3948 error = spa_ld_validate_vdevs(spa); 3949 if (error != 0) 3950 return (error); 3951 3952 if (copy_error != 0 || spa_load_print_vdev_tree) { 3953 spa_load_note(spa, "final vdev tree:"); 3954 vdev_dbgmsg_print_tree(rvd, 2); 3955 } 3956 3957 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 3958 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 3959 /* 3960 * Sanity check to make sure that we are indeed loading the 3961 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 3962 * in the config provided and they happened to be the only ones 3963 * to have the latest uberblock, we could involuntarily perform 3964 * an extreme rewind. 3965 */ 3966 healthy_tvds_mos = spa_healthy_core_tvds(spa); 3967 if (healthy_tvds_mos - healthy_tvds >= 3968 SPA_SYNC_MIN_VDEVS) { 3969 spa_load_note(spa, "config provided misses too many " 3970 "top-level vdevs compared to MOS (%lld vs %lld). ", 3971 (u_longlong_t)healthy_tvds, 3972 (u_longlong_t)healthy_tvds_mos); 3973 spa_load_note(spa, "vdev tree:"); 3974 vdev_dbgmsg_print_tree(rvd, 2); 3975 if (reloading) { 3976 spa_load_failed(spa, "config was already " 3977 "provided from MOS. Aborting."); 3978 return (spa_vdev_err(rvd, 3979 VDEV_AUX_CORRUPT_DATA, EIO)); 3980 } 3981 spa_load_note(spa, "spa must be reloaded using MOS " 3982 "config"); 3983 return (SET_ERROR(EAGAIN)); 3984 } 3985 } 3986 3987 error = spa_check_for_missing_logs(spa); 3988 if (error != 0) 3989 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 3990 3991 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 3992 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 3993 "guid sum (%llu != %llu)", 3994 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 3995 (u_longlong_t)rvd->vdev_guid_sum); 3996 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 3997 ENXIO)); 3998 } 3999 4000 return (0); 4001 } 4002 4003 static int 4004 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4005 { 4006 int error = 0; 4007 vdev_t *rvd = spa->spa_root_vdev; 4008 4009 /* 4010 * Everything that we read before spa_remove_init() must be stored 4011 * on concreted vdevs. Therefore we do this as early as possible. 
4012 */ 4013 error = spa_remove_init(spa); 4014 if (error != 0) { 4015 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4016 error); 4017 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4018 } 4019 4020 /* 4021 * Retrieve information needed to condense indirect vdev mappings. 4022 */ 4023 error = spa_condense_init(spa); 4024 if (error != 0) { 4025 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4026 error); 4027 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4028 } 4029 4030 return (0); 4031 } 4032 4033 static int 4034 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4035 { 4036 int error = 0; 4037 vdev_t *rvd = spa->spa_root_vdev; 4038 4039 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4040 boolean_t missing_feat_read = B_FALSE; 4041 nvlist_t *unsup_feat, *enabled_feat; 4042 4043 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4044 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4045 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4046 } 4047 4048 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4049 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4050 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4051 } 4052 4053 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4054 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4055 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4056 } 4057 4058 enabled_feat = fnvlist_alloc(); 4059 unsup_feat = fnvlist_alloc(); 4060 4061 if (!spa_features_check(spa, B_FALSE, 4062 unsup_feat, enabled_feat)) 4063 missing_feat_read = B_TRUE; 4064 4065 if (spa_writeable(spa) || 4066 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4067 if (!spa_features_check(spa, B_TRUE, 4068 unsup_feat, enabled_feat)) { 4069 *missing_feat_writep = B_TRUE; 4070 } 4071 } 4072 4073 fnvlist_add_nvlist(spa->spa_load_info, 4074 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4075 4076 if (!nvlist_empty(unsup_feat)) { 4077 fnvlist_add_nvlist(spa->spa_load_info, 4078 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4079 } 4080 4081 fnvlist_free(enabled_feat); 4082 fnvlist_free(unsup_feat); 4083 4084 if (!missing_feat_read) { 4085 fnvlist_add_boolean(spa->spa_load_info, 4086 ZPOOL_CONFIG_CAN_RDONLY); 4087 } 4088 4089 /* 4090 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4091 * twofold: to determine whether the pool is available for 4092 * import in read-write mode and (if it is not) whether the 4093 * pool is available for import in read-only mode. If the pool 4094 * is available for import in read-write mode, it is displayed 4095 * as available in userland; if it is not available for import 4096 * in read-only mode, it is displayed as unavailable in 4097 * userland. If the pool is available for import in read-only 4098 * mode but not read-write mode, it is displayed as unavailable 4099 * in userland with a special note that the pool is actually 4100 * available for open in read-only mode. 4101 * 4102 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4103 * missing a feature for write, we must first determine whether 4104 * the pool can be opened read-only before returning to 4105 * userland in order to know whether to display the 4106 * abovementioned note. 4107 */ 4108 if (missing_feat_read || (*missing_feat_writep && 4109 spa_writeable(spa))) { 4110 spa_load_failed(spa, "pool uses unsupported features"); 4111 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4112 ENOTSUP)); 4113 } 4114 4115 /* 4116 * Load refcounts for ZFS features from disk into an in-memory 4117 * cache during SPA initialization. 
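 *
 * (Editorial note, not upstream spa.c text.)  A hedged sketch of the
 * caching pattern used by the loop below: each feature's refcount is read
 * from disk; ENOTSUP means the feature has never been enabled and is
 * recorded with a "disabled" sentinel; any other error is fatal.  The
 * names read_refcount(), N_FEATURES and DISABLED are stand-ins, not real
 * ZFS interfaces:
 *
 *	#include <errno.h>
 *
 *	#define	N_FEATURES	16
 *	#define	DISABLED	((unsigned long long)-1)
 *
 *	static int
 *	fill_refcount_cache(unsigned long long cache[N_FEATURES],
 *	    int (*read_refcount)(int idx, unsigned long long *out))
 *	{
 *		for (int i = 0; i < N_FEATURES; i++) {
 *			unsigned long long rc;
 *			int err = read_refcount(i, &rc);
 *
 *			if (err == 0)
 *				cache[i] = rc;		// present on disk
 *			else if (err == ENOTSUP)
 *				cache[i] = DISABLED;	// never enabled
 *			else
 *				return (err);		// treat as corruption
 *		}
 *		return (0);
 *	}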
4118 */ 4119 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4120 uint64_t refcount; 4121 4122 error = feature_get_refcount_from_disk(spa, 4123 &spa_feature_table[i], &refcount); 4124 if (error == 0) { 4125 spa->spa_feat_refcount_cache[i] = refcount; 4126 } else if (error == ENOTSUP) { 4127 spa->spa_feat_refcount_cache[i] = 4128 SPA_FEATURE_DISABLED; 4129 } else { 4130 spa_load_failed(spa, "error getting refcount " 4131 "for feature %s [error=%d]", 4132 spa_feature_table[i].fi_guid, error); 4133 return (spa_vdev_err(rvd, 4134 VDEV_AUX_CORRUPT_DATA, EIO)); 4135 } 4136 } 4137 } 4138 4139 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4140 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4141 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4142 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4143 } 4144 4145 /* 4146 * Encryption was added before bookmark_v2, even though bookmark_v2 4147 * is now a dependency. If this pool has encryption enabled without 4148 * bookmark_v2, trigger an errata message. 4149 */ 4150 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4151 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4152 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4153 } 4154 4155 return (0); 4156 } 4157 4158 static int 4159 spa_ld_load_special_directories(spa_t *spa) 4160 { 4161 int error = 0; 4162 vdev_t *rvd = spa->spa_root_vdev; 4163 4164 spa->spa_is_initializing = B_TRUE; 4165 error = dsl_pool_open(spa->spa_dsl_pool); 4166 spa->spa_is_initializing = B_FALSE; 4167 if (error != 0) { 4168 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4169 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4170 } 4171 4172 return (0); 4173 } 4174 4175 static int 4176 spa_ld_get_props(spa_t *spa) 4177 { 4178 int error = 0; 4179 uint64_t obj; 4180 vdev_t *rvd = spa->spa_root_vdev; 4181 4182 /* Grab the checksum salt from the MOS. */ 4183 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4184 DMU_POOL_CHECKSUM_SALT, 1, 4185 sizeof (spa->spa_cksum_salt.zcs_bytes), 4186 spa->spa_cksum_salt.zcs_bytes); 4187 if (error == ENOENT) { 4188 /* Generate a new salt for subsequent use */ 4189 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4190 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4191 } else if (error != 0) { 4192 spa_load_failed(spa, "unable to retrieve checksum salt from " 4193 "MOS [error=%d]", error); 4194 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4195 } 4196 4197 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4198 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4199 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4200 if (error != 0) { 4201 spa_load_failed(spa, "error opening deferred-frees bpobj " 4202 "[error=%d]", error); 4203 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4204 } 4205 4206 /* 4207 * Load the bit that tells us to use the new accounting function 4208 * (raid-z deflation). If we have an older pool, this will not 4209 * be present. 4210 */ 4211 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4212 if (error != 0 && error != ENOENT) 4213 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4214 4215 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4216 &spa->spa_creation_version, B_FALSE); 4217 if (error != 0 && error != ENOENT) 4218 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4219 4220 /* 4221 * Load the persistent error log. If we have an older pool, this will 4222 * not be present. 
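 *
 * (Editorial note, not upstream spa.c text.)  The spa_dir_prop() calls in
 * this function tolerate ENOENT because older pools simply lack some MOS
 * directory entries; any other error is treated as corruption.  A hedged
 * sketch of that optional-entry pattern, with a hypothetical lookup()
 * callback standing in for the real interface:
 *
 *	#include <errno.h>
 *
 *	static int
 *	load_optional(int (*lookup)(const char *name, unsigned long long *obj),
 *	    const char *name, unsigned long long *objp)
 *	{
 *		int err = lookup(name, objp);
 *
 *		if (err == ENOENT) {
 *			*objp = 0;	// absent on older pools: not an error
 *			return (0);
 *		}
 *		return (err);		// 0 on success, fatal otherwise
 *	}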
4223 */ 4224 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4225 B_FALSE); 4226 if (error != 0 && error != ENOENT) 4227 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4228 4229 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4230 &spa->spa_errlog_scrub, B_FALSE); 4231 if (error != 0 && error != ENOENT) 4232 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4233 4234 /* 4235 * Load the livelist deletion field. If a livelist is queued for 4236 * deletion, indicate that in the spa 4237 */ 4238 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4239 &spa->spa_livelists_to_delete, B_FALSE); 4240 if (error != 0 && error != ENOENT) 4241 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4242 4243 /* 4244 * Load the history object. If we have an older pool, this 4245 * will not be present. 4246 */ 4247 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4248 if (error != 0 && error != ENOENT) 4249 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4250 4251 /* 4252 * Load the per-vdev ZAP map. If we have an older pool, this will not 4253 * be present; in this case, defer its creation to a later time to 4254 * avoid dirtying the MOS this early / out of sync context. See 4255 * spa_sync_config_object. 4256 */ 4257 4258 /* The sentinel is only available in the MOS config. */ 4259 nvlist_t *mos_config; 4260 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4261 spa_load_failed(spa, "unable to retrieve MOS config"); 4262 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4263 } 4264 4265 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4266 &spa->spa_all_vdev_zaps, B_FALSE); 4267 4268 if (error == ENOENT) { 4269 VERIFY(!nvlist_exists(mos_config, 4270 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4271 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4272 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4273 } else if (error != 0) { 4274 nvlist_free(mos_config); 4275 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4276 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4277 /* 4278 * An older version of ZFS overwrote the sentinel value, so 4279 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4280 * destruction to later; see spa_sync_config_object. 4281 */ 4282 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4283 /* 4284 * We're assuming that no vdevs have had their ZAPs created 4285 * before this. Better be sure of it. 
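 *
 * (Editorial note, not upstream spa.c text.)  A hedged recap of the
 * three-way outcome handled just above, written as a sketch; the enum and
 * parameter names are illustrative, not real spa interfaces:
 *
 *	#include <errno.h>
 *
 *	enum avz_action { AVZ_KEEP, AVZ_INITIALIZE, AVZ_DESTROY };
 *
 *	static int
 *	choose_avz_action(int lookup_err, int sentinel_present,
 *	    enum avz_action *action)
 *	{
 *		if (lookup_err == ENOENT) {
 *			*action = AVZ_INITIALIZE;	// no map yet; create in sync
 *			return (0);
 *		}
 *		if (lookup_err != 0)
 *			return (lookup_err);		// corruption, bail out
 *		if (!sentinel_present) {
 *			*action = AVZ_DESTROY;		// orphaned ZAPs, old ZFS
 *			return (0);
 *		}
 *		*action = AVZ_KEEP;			// map and sentinel valid
 *		return (0);
 *	}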
4286 */ 4287 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4288 } 4289 nvlist_free(mos_config); 4290 4291 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4292 4293 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4294 B_FALSE); 4295 if (error && error != ENOENT) 4296 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4297 4298 if (error == 0) { 4299 uint64_t autoreplace = 0; 4300 4301 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4302 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4303 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4304 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4305 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4306 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4307 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4308 spa->spa_autoreplace = (autoreplace != 0); 4309 } 4310 4311 /* 4312 * If we are importing a pool with missing top-level vdevs, 4313 * we enforce that the pool doesn't panic or get suspended on 4314 * error since the likelihood of missing data is extremely high. 4315 */ 4316 if (spa->spa_missing_tvds > 0 && 4317 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4318 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4319 spa_load_note(spa, "forcing failmode to 'continue' " 4320 "as some top level vdevs are missing"); 4321 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4322 } 4323 4324 return (0); 4325 } 4326 4327 static int 4328 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4329 { 4330 int error = 0; 4331 vdev_t *rvd = spa->spa_root_vdev; 4332 4333 /* 4334 * If we're assembling the pool from the split-off vdevs of 4335 * an existing pool, we don't want to attach the spares & cache 4336 * devices. 4337 */ 4338 4339 /* 4340 * Load any hot spares for this pool. 4341 */ 4342 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4343 B_FALSE); 4344 if (error != 0 && error != ENOENT) 4345 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4346 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4347 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4348 if (load_nvlist(spa, spa->spa_spares.sav_object, 4349 &spa->spa_spares.sav_config) != 0) { 4350 spa_load_failed(spa, "error loading spares nvlist"); 4351 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4352 } 4353 4354 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4355 spa_load_spares(spa); 4356 spa_config_exit(spa, SCL_ALL, FTAG); 4357 } else if (error == 0) { 4358 spa->spa_spares.sav_sync = B_TRUE; 4359 } 4360 4361 /* 4362 * Load any level 2 ARC devices for this pool. 
4363 */ 4364 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4365 &spa->spa_l2cache.sav_object, B_FALSE); 4366 if (error != 0 && error != ENOENT) 4367 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4368 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4369 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4370 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4371 &spa->spa_l2cache.sav_config) != 0) { 4372 spa_load_failed(spa, "error loading l2cache nvlist"); 4373 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4374 } 4375 4376 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4377 spa_load_l2cache(spa); 4378 spa_config_exit(spa, SCL_ALL, FTAG); 4379 } else if (error == 0) { 4380 spa->spa_l2cache.sav_sync = B_TRUE; 4381 } 4382 4383 return (0); 4384 } 4385 4386 static int 4387 spa_ld_load_vdev_metadata(spa_t *spa) 4388 { 4389 int error = 0; 4390 vdev_t *rvd = spa->spa_root_vdev; 4391 4392 /* 4393 * If the 'multihost' property is set, then never allow a pool to 4394 * be imported when the system hostid is zero. The exception to 4395 * this rule is zdb which is always allowed to access pools. 4396 */ 4397 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4398 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4399 fnvlist_add_uint64(spa->spa_load_info, 4400 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4401 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4402 } 4403 4404 /* 4405 * If the 'autoreplace' property is set, then post a resource notifying 4406 * the ZFS DE that it should not issue any faults for unopenable 4407 * devices. We also iterate over the vdevs, and post a sysevent for any 4408 * unopenable vdevs so that the normal autoreplace handler can take 4409 * over. 4410 */ 4411 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4412 spa_check_removed(spa->spa_root_vdev); 4413 /* 4414 * For the import case, this is done in spa_import(), because 4415 * at this point we're using the spare definitions from 4416 * the MOS config, not necessarily from the userland config. 4417 */ 4418 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4419 spa_aux_check_removed(&spa->spa_spares); 4420 spa_aux_check_removed(&spa->spa_l2cache); 4421 } 4422 } 4423 4424 /* 4425 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4426 */ 4427 error = vdev_load(rvd); 4428 if (error != 0) { 4429 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4430 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4431 } 4432 4433 error = spa_ld_log_spacemaps(spa); 4434 if (error != 0) { 4435 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4436 error); 4437 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4438 } 4439 4440 /* 4441 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
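 *
 * (Editorial note, not upstream spa.c text.)  The heavy lifting is done by
 * vdev_dtl_reassess() below; this toy recursion only illustrates the
 * bottom-up traversal the comment describes, over an assumed node type
 * with a single summary value instead of real DTL range trees:
 *
 *	typedef struct node {
 *		struct node **children;
 *		int nchildren;
 *		unsigned long long summary;	// stands in for a DTL summary
 *	} node_t;
 *
 *	static void
 *	reassess(node_t *n)
 *	{
 *		for (int c = 0; c < n->nchildren; c++) {
 *			reassess(n->children[c]);		// children first
 *			n->summary |= n->children[c]->summary;	// fold upward
 *		}
 *	}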
4442 */ 4443 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4444 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4445 spa_config_exit(spa, SCL_ALL, FTAG); 4446 4447 return (0); 4448 } 4449 4450 static int 4451 spa_ld_load_dedup_tables(spa_t *spa) 4452 { 4453 int error = 0; 4454 vdev_t *rvd = spa->spa_root_vdev; 4455 4456 error = ddt_load(spa); 4457 if (error != 0) { 4458 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4459 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4460 } 4461 4462 return (0); 4463 } 4464 4465 static int 4466 spa_ld_load_brt(spa_t *spa) 4467 { 4468 int error = 0; 4469 vdev_t *rvd = spa->spa_root_vdev; 4470 4471 error = brt_load(spa); 4472 if (error != 0) { 4473 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4474 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4475 } 4476 4477 return (0); 4478 } 4479 4480 static int 4481 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4482 { 4483 vdev_t *rvd = spa->spa_root_vdev; 4484 4485 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4486 boolean_t missing = spa_check_logs(spa); 4487 if (missing) { 4488 if (spa->spa_missing_tvds != 0) { 4489 spa_load_note(spa, "spa_check_logs failed " 4490 "so dropping the logs"); 4491 } else { 4492 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4493 spa_load_failed(spa, "spa_check_logs failed"); 4494 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4495 ENXIO)); 4496 } 4497 } 4498 } 4499 4500 return (0); 4501 } 4502 4503 static int 4504 spa_ld_verify_pool_data(spa_t *spa) 4505 { 4506 int error = 0; 4507 vdev_t *rvd = spa->spa_root_vdev; 4508 4509 /* 4510 * We've successfully opened the pool, verify that we're ready 4511 * to start pushing transactions. 4512 */ 4513 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4514 error = spa_load_verify(spa); 4515 if (error != 0) { 4516 spa_load_failed(spa, "spa_load_verify failed " 4517 "[error=%d]", error); 4518 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4519 error)); 4520 } 4521 } 4522 4523 return (0); 4524 } 4525 4526 static void 4527 spa_ld_claim_log_blocks(spa_t *spa) 4528 { 4529 dmu_tx_t *tx; 4530 dsl_pool_t *dp = spa_get_dsl(spa); 4531 4532 /* 4533 * Claim log blocks that haven't been committed yet. 4534 * This must all happen in a single txg. 4535 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4536 * invoked from zil_claim_log_block()'s i/o done callback. 4537 * Price of rollback is that we abandon the log. 4538 */ 4539 spa->spa_claiming = B_TRUE; 4540 4541 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4542 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4543 zil_claim, tx, DS_FIND_CHILDREN); 4544 dmu_tx_commit(tx); 4545 4546 spa->spa_claiming = B_FALSE; 4547 4548 spa_set_log_state(spa, SPA_LOG_GOOD); 4549 } 4550 4551 static void 4552 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4553 boolean_t update_config_cache) 4554 { 4555 vdev_t *rvd = spa->spa_root_vdev; 4556 int need_update = B_FALSE; 4557 4558 /* 4559 * If the config cache is stale, or we have uninitialized 4560 * metaslabs (see spa_vdev_add()), then update the config. 4561 * 4562 * If this is a verbatim import, trust the current 4563 * in-core spa_config and update the disk labels. 
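 *
 * (Editorial note, not upstream spa.c text.)  The decision below can be
 * read as a single predicate: update when the cached txg is stale, when
 * this is an import/recover or verbatim import, or when any top-level
 * vdev still lacks a metaslab array.  A hedged restatement with
 * illustrative field names:
 *
 *	typedef struct pool_view {
 *		unsigned long long cached_txg, current_txg;
 *		int importing, recovering, verbatim;
 *		int n_top;
 *		unsigned long long *ms_array;	// 0 == uninitialized metaslabs
 *	} pool_view_t;
 *
 *	static int
 *	config_needs_update(const pool_view_t *p, int forced)
 *	{
 *		if (forced || p->cached_txg != p->current_txg ||
 *		    p->importing || p->recovering || p->verbatim)
 *			return (1);
 *		for (int c = 0; c < p->n_top; c++)
 *			if (p->ms_array[c] == 0)
 *				return (1);
 *		return (0);
 *	}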
4564 */ 4565 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4566 spa->spa_load_state == SPA_LOAD_IMPORT || 4567 spa->spa_load_state == SPA_LOAD_RECOVER || 4568 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4569 need_update = B_TRUE; 4570 4571 for (int c = 0; c < rvd->vdev_children; c++) 4572 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4573 need_update = B_TRUE; 4574 4575 /* 4576 * Update the config cache asynchronously in case we're the 4577 * root pool, in which case the config cache isn't writable yet. 4578 */ 4579 if (need_update) 4580 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4581 } 4582 4583 static void 4584 spa_ld_prepare_for_reload(spa_t *spa) 4585 { 4586 spa_mode_t mode = spa->spa_mode; 4587 int async_suspended = spa->spa_async_suspended; 4588 4589 spa_unload(spa); 4590 spa_deactivate(spa); 4591 spa_activate(spa, mode); 4592 4593 /* 4594 * We save the value of spa_async_suspended as it gets reset to 0 by 4595 * spa_unload(). We want to restore it back to the original value before 4596 * returning as we might be calling spa_async_resume() later. 4597 */ 4598 spa->spa_async_suspended = async_suspended; 4599 } 4600 4601 static int 4602 spa_ld_read_checkpoint_txg(spa_t *spa) 4603 { 4604 uberblock_t checkpoint; 4605 int error = 0; 4606 4607 ASSERT0(spa->spa_checkpoint_txg); 4608 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4609 4610 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4611 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4612 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4613 4614 if (error == ENOENT) 4615 return (0); 4616 4617 if (error != 0) 4618 return (error); 4619 4620 ASSERT3U(checkpoint.ub_txg, !=, 0); 4621 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 4622 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 4623 spa->spa_checkpoint_txg = checkpoint.ub_txg; 4624 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 4625 4626 return (0); 4627 } 4628 4629 static int 4630 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 4631 { 4632 int error = 0; 4633 4634 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4635 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4636 4637 /* 4638 * Never trust the config that is provided unless we are assembling 4639 * a pool following a split. 4640 * This means don't trust blkptrs and the vdev tree in general. This 4641 * also effectively puts the spa in read-only mode since 4642 * spa_writeable() checks for spa_trust_config to be true. 4643 * We will later load a trusted config from the MOS. 4644 */ 4645 if (type != SPA_IMPORT_ASSEMBLE) 4646 spa->spa_trust_config = B_FALSE; 4647 4648 /* 4649 * Parse the config provided to create a vdev tree. 4650 */ 4651 error = spa_ld_parse_config(spa, type); 4652 if (error != 0) 4653 return (error); 4654 4655 spa_import_progress_add(spa); 4656 4657 /* 4658 * Now that we have the vdev tree, try to open each vdev. This involves 4659 * opening the underlying physical device, retrieving its geometry and 4660 * probing the vdev with a dummy I/O. The state of each vdev will be set 4661 * based on the success of those operations. After this we'll be ready 4662 * to read from the vdevs. 4663 */ 4664 error = spa_ld_open_vdevs(spa); 4665 if (error != 0) 4666 return (error); 4667 4668 /* 4669 * Read the label of each vdev and make sure that the GUIDs stored 4670 * there match the GUIDs in the config provided. 
4671 * If we're assembling a new pool that's been split off from an 4672 * existing pool, the labels haven't yet been updated so we skip 4673 * validation for now. 4674 */ 4675 if (type != SPA_IMPORT_ASSEMBLE) { 4676 error = spa_ld_validate_vdevs(spa); 4677 if (error != 0) 4678 return (error); 4679 } 4680 4681 /* 4682 * Read all vdev labels to find the best uberblock (i.e. latest, 4683 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 4684 * get the list of features required to read blkptrs in the MOS from 4685 * the vdev label with the best uberblock and verify that our version 4686 * of zfs supports them all. 4687 */ 4688 error = spa_ld_select_uberblock(spa, type); 4689 if (error != 0) 4690 return (error); 4691 4692 /* 4693 * Pass that uberblock to the dsl_pool layer which will open the root 4694 * blkptr. This blkptr points to the latest version of the MOS and will 4695 * allow us to read its contents. 4696 */ 4697 error = spa_ld_open_rootbp(spa); 4698 if (error != 0) 4699 return (error); 4700 4701 return (0); 4702 } 4703 4704 static int 4705 spa_ld_checkpoint_rewind(spa_t *spa) 4706 { 4707 uberblock_t checkpoint; 4708 int error = 0; 4709 4710 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4711 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4712 4713 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4714 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4715 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4716 4717 if (error != 0) { 4718 spa_load_failed(spa, "unable to retrieve checkpointed " 4719 "uberblock from the MOS config [error=%d]", error); 4720 4721 if (error == ENOENT) 4722 error = ZFS_ERR_NO_CHECKPOINT; 4723 4724 return (error); 4725 } 4726 4727 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 4728 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 4729 4730 /* 4731 * We need to update the txg and timestamp of the checkpointed 4732 * uberblock to be higher than the latest one. This ensures that 4733 * the checkpointed uberblock is selected if we were to close and 4734 * reopen the pool right after we've written it in the vdev labels. 4735 * (also see block comment in vdev_uberblock_compare) 4736 */ 4737 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 4738 checkpoint.ub_timestamp = gethrestime_sec(); 4739 4740 /* 4741 * Set current uberblock to be the checkpointed uberblock. 4742 */ 4743 spa->spa_uberblock = checkpoint; 4744 4745 /* 4746 * If we are doing a normal rewind, then the pool is open for 4747 * writing and we sync the "updated" checkpointed uberblock to 4748 * disk. Once this is done, we've basically rewound the whole 4749 * pool and there is no way back. 4750 * 4751 * There are cases when we don't want to attempt and sync the 4752 * checkpointed uberblock to disk because we are opening a 4753 * pool as read-only. Specifically, verifying the checkpointed 4754 * state with zdb, and importing the checkpointed state to get 4755 * a "preview" of its content. 
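 *
 * (Editorial note, not upstream spa.c text.)  The writeable branch below
 * picks up to SPA_SYNC_MIN_VDEVS eligible top-level vdevs, starting from a
 * random child and wrapping around, skipping logs, indirect vdevs and
 * vdevs without a metaslab array.  A toy sketch of that selection pattern,
 * with an assumed eligible() callback:
 *
 *	static int
 *	pick_children(int children, int (*eligible)(int idx), int *out,
 *	    int maxout, int start)
 *	{
 *		int picked = 0;
 *
 *		if (children <= 0)
 *			return (0);
 *		for (int c = 0; c < children && picked < maxout; c++) {
 *			int idx = (start + c) % children;
 *			if (eligible(idx))
 *				out[picked++] = idx;	// collect eligible vdev
 *		}
 *		return (picked);
 *	}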
4756 */ 4757 if (spa_writeable(spa)) { 4758 vdev_t *rvd = spa->spa_root_vdev; 4759 4760 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4761 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 4762 int svdcount = 0; 4763 int children = rvd->vdev_children; 4764 int c0 = random_in_range(children); 4765 4766 for (int c = 0; c < children; c++) { 4767 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 4768 4769 /* Stop when revisiting the first vdev */ 4770 if (c > 0 && svd[0] == vd) 4771 break; 4772 4773 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 4774 !vdev_is_concrete(vd)) 4775 continue; 4776 4777 svd[svdcount++] = vd; 4778 if (svdcount == SPA_SYNC_MIN_VDEVS) 4779 break; 4780 } 4781 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 4782 if (error == 0) 4783 spa->spa_last_synced_guid = rvd->vdev_guid; 4784 spa_config_exit(spa, SCL_ALL, FTAG); 4785 4786 if (error != 0) { 4787 spa_load_failed(spa, "failed to write checkpointed " 4788 "uberblock to the vdev labels [error=%d]", error); 4789 return (error); 4790 } 4791 } 4792 4793 return (0); 4794 } 4795 4796 static int 4797 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 4798 boolean_t *update_config_cache) 4799 { 4800 int error; 4801 4802 /* 4803 * Parse the config for pool, open and validate vdevs, 4804 * select an uberblock, and use that uberblock to open 4805 * the MOS. 4806 */ 4807 error = spa_ld_mos_init(spa, type); 4808 if (error != 0) 4809 return (error); 4810 4811 /* 4812 * Retrieve the trusted config stored in the MOS and use it to create 4813 * a new, exact version of the vdev tree, then reopen all vdevs. 4814 */ 4815 error = spa_ld_trusted_config(spa, type, B_FALSE); 4816 if (error == EAGAIN) { 4817 if (update_config_cache != NULL) 4818 *update_config_cache = B_TRUE; 4819 4820 /* 4821 * Redo the loading process with the trusted config if it is 4822 * too different from the untrusted config. 4823 */ 4824 spa_ld_prepare_for_reload(spa); 4825 spa_load_note(spa, "RELOADING"); 4826 error = spa_ld_mos_init(spa, type); 4827 if (error != 0) 4828 return (error); 4829 4830 error = spa_ld_trusted_config(spa, type, B_TRUE); 4831 if (error != 0) 4832 return (error); 4833 4834 } else if (error != 0) { 4835 return (error); 4836 } 4837 4838 return (0); 4839 } 4840 4841 /* 4842 * Load an existing storage pool, using the config provided. This config 4843 * describes which vdevs are part of the pool and is later validated against 4844 * partial configs present in each vdev's label and an entire copy of the 4845 * config stored in the MOS. 4846 */ 4847 static int 4848 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 4849 { 4850 int error = 0; 4851 boolean_t missing_feat_write = B_FALSE; 4852 boolean_t checkpoint_rewind = 4853 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4854 boolean_t update_config_cache = B_FALSE; 4855 4856 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4857 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4858 4859 spa_load_note(spa, "LOADING"); 4860 4861 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 4862 if (error != 0) 4863 return (error); 4864 4865 /* 4866 * If we are rewinding to the checkpoint then we need to repeat 4867 * everything we've done so far in this function but this time 4868 * selecting the checkpointed uberblock and using that to open 4869 * the MOS. 4870 */ 4871 if (checkpoint_rewind) { 4872 /* 4873 * If we are rewinding to the checkpoint update config cache 4874 * anyway. 
4875 */ 4876 update_config_cache = B_TRUE; 4877 4878 /* 4879 * Extract the checkpointed uberblock from the current MOS 4880 * and use this as the pool's uberblock from now on. If the 4881 * pool is imported as writeable we also write the checkpoint 4882 * uberblock to the labels, making the rewind permanent. 4883 */ 4884 error = spa_ld_checkpoint_rewind(spa); 4885 if (error != 0) 4886 return (error); 4887 4888 /* 4889 * Redo the loading process again with the 4890 * checkpointed uberblock. 4891 */ 4892 spa_ld_prepare_for_reload(spa); 4893 spa_load_note(spa, "LOADING checkpointed uberblock"); 4894 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 4895 if (error != 0) 4896 return (error); 4897 } 4898 4899 /* 4900 * Retrieve the checkpoint txg if the pool has a checkpoint. 4901 */ 4902 error = spa_ld_read_checkpoint_txg(spa); 4903 if (error != 0) 4904 return (error); 4905 4906 /* 4907 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 4908 * from the pool and their contents were re-mapped to other vdevs. Note 4909 * that everything that we read before this step must have been 4910 * rewritten on concrete vdevs after the last device removal was 4911 * initiated. Otherwise we could be reading from indirect vdevs before 4912 * we have loaded their mappings. 4913 */ 4914 error = spa_ld_open_indirect_vdev_metadata(spa); 4915 if (error != 0) 4916 return (error); 4917 4918 /* 4919 * Retrieve the full list of active features from the MOS and check if 4920 * they are all supported. 4921 */ 4922 error = spa_ld_check_features(spa, &missing_feat_write); 4923 if (error != 0) 4924 return (error); 4925 4926 /* 4927 * Load several special directories from the MOS needed by the dsl_pool 4928 * layer. 4929 */ 4930 error = spa_ld_load_special_directories(spa); 4931 if (error != 0) 4932 return (error); 4933 4934 /* 4935 * Retrieve pool properties from the MOS. 4936 */ 4937 error = spa_ld_get_props(spa); 4938 if (error != 0) 4939 return (error); 4940 4941 /* 4942 * Retrieve the list of auxiliary devices - cache devices and spares - 4943 * and open them. 4944 */ 4945 error = spa_ld_open_aux_vdevs(spa, type); 4946 if (error != 0) 4947 return (error); 4948 4949 /* 4950 * Load the metadata for all vdevs. Also check if unopenable devices 4951 * should be autoreplaced. 4952 */ 4953 error = spa_ld_load_vdev_metadata(spa); 4954 if (error != 0) 4955 return (error); 4956 4957 error = spa_ld_load_dedup_tables(spa); 4958 if (error != 0) 4959 return (error); 4960 4961 error = spa_ld_load_brt(spa); 4962 if (error != 0) 4963 return (error); 4964 4965 /* 4966 * Verify the logs now to make sure we don't have any unexpected errors 4967 * when we claim log blocks later. 4968 */ 4969 error = spa_ld_verify_logs(spa, type, ereport); 4970 if (error != 0) 4971 return (error); 4972 4973 if (missing_feat_write) { 4974 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 4975 4976 /* 4977 * At this point, we know that we can open the pool in 4978 * read-only mode but not read-write mode. We now have enough 4979 * information and can return to userland. 4980 */ 4981 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 4982 ENOTSUP)); 4983 } 4984 4985 /* 4986 * Traverse the last txgs to make sure the pool was left off in a safe 4987 * state. When performing an extreme rewind, we verify the whole pool, 4988 * which can take a very long time. 4989 */ 4990 error = spa_ld_verify_pool_data(spa); 4991 if (error != 0) 4992 return (error); 4993 4994 /* 4995 * Calculate the deflated space for the pool. 
This must be done before 4996 * we write anything to the pool because we'd need to update the space 4997 * accounting using the deflated sizes. 4998 */ 4999 spa_update_dspace(spa); 5000 5001 /* 5002 * We have now retrieved all the information we needed to open the 5003 * pool. If we are importing the pool in read-write mode, a few 5004 * additional steps must be performed to finish the import. 5005 */ 5006 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5007 spa->spa_load_max_txg == UINT64_MAX)) { 5008 uint64_t config_cache_txg = spa->spa_config_txg; 5009 5010 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5011 5012 /* 5013 * In case of a checkpoint rewind, log the original txg 5014 * of the checkpointed uberblock. 5015 */ 5016 if (checkpoint_rewind) { 5017 spa_history_log_internal(spa, "checkpoint rewind", 5018 NULL, "rewound state to txg=%llu", 5019 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5020 } 5021 5022 /* 5023 * Traverse the ZIL and claim all blocks. 5024 */ 5025 spa_ld_claim_log_blocks(spa); 5026 5027 /* 5028 * Kick-off the syncing thread. 5029 */ 5030 spa->spa_sync_on = B_TRUE; 5031 txg_sync_start(spa->spa_dsl_pool); 5032 mmp_thread_start(spa); 5033 5034 /* 5035 * Wait for all claims to sync. We sync up to the highest 5036 * claimed log block birth time so that claimed log blocks 5037 * don't appear to be from the future. spa_claim_max_txg 5038 * will have been set for us by ZIL traversal operations 5039 * performed above. 5040 */ 5041 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5042 5043 /* 5044 * Check if we need to request an update of the config. On the 5045 * next sync, we would update the config stored in vdev labels 5046 * and the cachefile (by default /etc/zfs/zpool.cache). 5047 */ 5048 spa_ld_check_for_config_update(spa, config_cache_txg, 5049 update_config_cache); 5050 5051 /* 5052 * Check if a rebuild was in progress and if so resume it. 5053 * Then check all DTLs to see if anything needs resilvering. 5054 * The resilver will be deferred if a rebuild was started. 5055 */ 5056 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5057 vdev_rebuild_restart(spa); 5058 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5059 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5060 spa_async_request(spa, SPA_ASYNC_RESILVER); 5061 } 5062 5063 /* 5064 * Log the fact that we booted up (so that we can detect if 5065 * we rebooted in the middle of an operation). 5066 */ 5067 spa_history_log_version(spa, "open", NULL); 5068 5069 spa_restart_removal(spa); 5070 spa_spawn_aux_threads(spa); 5071 5072 /* 5073 * Delete any inconsistent datasets. 5074 * 5075 * Note: 5076 * Since we may be issuing deletes for clones here, 5077 * we make sure to do so after we've spawned all the 5078 * auxiliary threads above (from which the livelist 5079 * deletion zthr is part of). 5080 */ 5081 (void) dmu_objset_find(spa_name(spa), 5082 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5083 5084 /* 5085 * Clean up any stale temporary dataset userrefs. 
5086 */ 5087 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5088 5089 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5090 vdev_initialize_restart(spa->spa_root_vdev); 5091 vdev_trim_restart(spa->spa_root_vdev); 5092 vdev_autotrim_restart(spa); 5093 spa_config_exit(spa, SCL_CONFIG, FTAG); 5094 } 5095 5096 spa_import_progress_remove(spa_guid(spa)); 5097 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5098 5099 spa_load_note(spa, "LOADED"); 5100 5101 return (0); 5102 } 5103 5104 static int 5105 spa_load_retry(spa_t *spa, spa_load_state_t state) 5106 { 5107 spa_mode_t mode = spa->spa_mode; 5108 5109 spa_unload(spa); 5110 spa_deactivate(spa); 5111 5112 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5113 5114 spa_activate(spa, mode); 5115 spa_async_suspend(spa); 5116 5117 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5118 (u_longlong_t)spa->spa_load_max_txg); 5119 5120 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5121 } 5122 5123 /* 5124 * If spa_load() fails this function will try loading prior txg's. If 5125 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5126 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5127 * function will not rewind the pool and will return the same error as 5128 * spa_load(). 5129 */ 5130 static int 5131 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5132 int rewind_flags) 5133 { 5134 nvlist_t *loadinfo = NULL; 5135 nvlist_t *config = NULL; 5136 int load_error, rewind_error; 5137 uint64_t safe_rewind_txg; 5138 uint64_t min_txg; 5139 5140 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5141 spa->spa_load_max_txg = spa->spa_load_txg; 5142 spa_set_log_state(spa, SPA_LOG_CLEAR); 5143 } else { 5144 spa->spa_load_max_txg = max_request; 5145 if (max_request != UINT64_MAX) 5146 spa->spa_extreme_rewind = B_TRUE; 5147 } 5148 5149 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5150 if (load_error == 0) 5151 return (0); 5152 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5153 /* 5154 * When attempting checkpoint-rewind on a pool with no 5155 * checkpoint, we should not attempt to load uberblocks 5156 * from previous txgs when spa_load fails. 5157 */ 5158 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5159 spa_import_progress_remove(spa_guid(spa)); 5160 return (load_error); 5161 } 5162 5163 if (spa->spa_root_vdev != NULL) 5164 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5165 5166 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5167 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5168 5169 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5170 nvlist_free(config); 5171 spa_import_progress_remove(spa_guid(spa)); 5172 return (load_error); 5173 } 5174 5175 if (state == SPA_LOAD_RECOVER) { 5176 /* Price of rolling back is discarding txgs, including log */ 5177 spa_set_log_state(spa, SPA_LOG_CLEAR); 5178 } else { 5179 /* 5180 * If we aren't rolling back save the load info from our first 5181 * import attempt so that we can restore it after attempting 5182 * to rewind. 5183 */ 5184 loadinfo = spa->spa_load_info; 5185 spa->spa_load_info = fnvlist_alloc(); 5186 } 5187 5188 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5189 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5190 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
5191 TXG_INITIAL : safe_rewind_txg; 5192 5193 /* 5194 * Continue as long as we're finding errors, we're still within 5195 * the acceptable rewind range, and we're still finding uberblocks 5196 */ 5197 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5198 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5199 if (spa->spa_load_max_txg < safe_rewind_txg) 5200 spa->spa_extreme_rewind = B_TRUE; 5201 rewind_error = spa_load_retry(spa, state); 5202 } 5203 5204 spa->spa_extreme_rewind = B_FALSE; 5205 spa->spa_load_max_txg = UINT64_MAX; 5206 5207 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5208 spa_config_set(spa, config); 5209 else 5210 nvlist_free(config); 5211 5212 if (state == SPA_LOAD_RECOVER) { 5213 ASSERT3P(loadinfo, ==, NULL); 5214 spa_import_progress_remove(spa_guid(spa)); 5215 return (rewind_error); 5216 } else { 5217 /* Store the rewind info as part of the initial load info */ 5218 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5219 spa->spa_load_info); 5220 5221 /* Restore the initial load info */ 5222 fnvlist_free(spa->spa_load_info); 5223 spa->spa_load_info = loadinfo; 5224 5225 spa_import_progress_remove(spa_guid(spa)); 5226 return (load_error); 5227 } 5228 } 5229 5230 /* 5231 * Pool Open/Import 5232 * 5233 * The import case is identical to an open except that the configuration is sent 5234 * down from userland, instead of grabbed from the configuration cache. For the 5235 * case of an open, the pool configuration will exist in the 5236 * POOL_STATE_UNINITIALIZED state. 5237 * 5238 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5239 * the same time open the pool, without having to keep around the spa_t in some 5240 * ambiguous state. 5241 */ 5242 static int 5243 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5244 nvlist_t *nvpolicy, nvlist_t **config) 5245 { 5246 spa_t *spa; 5247 spa_load_state_t state = SPA_LOAD_OPEN; 5248 int error; 5249 int locked = B_FALSE; 5250 int firstopen = B_FALSE; 5251 5252 *spapp = NULL; 5253 5254 /* 5255 * As disgusting as this is, we need to support recursive calls to this 5256 * function because dsl_dir_open() is called during spa_load(), and ends 5257 * up calling spa_open() again. The real fix is to figure out how to 5258 * avoid dsl_dir_open() calling this in the first place. 5259 */ 5260 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5261 mutex_enter(&spa_namespace_lock); 5262 locked = B_TRUE; 5263 } 5264 5265 if ((spa = spa_lookup(pool)) == NULL) { 5266 if (locked) 5267 mutex_exit(&spa_namespace_lock); 5268 return (SET_ERROR(ENOENT)); 5269 } 5270 5271 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5272 zpool_load_policy_t policy; 5273 5274 firstopen = B_TRUE; 5275 5276 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5277 &policy); 5278 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5279 state = SPA_LOAD_RECOVER; 5280 5281 spa_activate(spa, spa_mode_global); 5282 5283 if (state != SPA_LOAD_RECOVER) 5284 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5285 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5286 5287 zfs_dbgmsg("spa_open_common: opening %s", pool); 5288 error = spa_load_best(spa, state, policy.zlp_txg, 5289 policy.zlp_rewind); 5290 5291 if (error == EBADF) { 5292 /* 5293 * If vdev_validate() returns failure (indicated by 5294 * EBADF), it indicates that one of the vdevs indicates 5295 * that the pool has been exported or destroyed. 
If 5296 * this is the case, the config cache is out of sync and 5297 * we should remove the pool from the namespace. 5298 */ 5299 spa_unload(spa); 5300 spa_deactivate(spa); 5301 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5302 spa_remove(spa); 5303 if (locked) 5304 mutex_exit(&spa_namespace_lock); 5305 return (SET_ERROR(ENOENT)); 5306 } 5307 5308 if (error) { 5309 /* 5310 * We can't open the pool, but we still have useful 5311 * information: the state of each vdev after the 5312 * attempted vdev_open(). Return this to the user. 5313 */ 5314 if (config != NULL && spa->spa_config) { 5315 *config = fnvlist_dup(spa->spa_config); 5316 fnvlist_add_nvlist(*config, 5317 ZPOOL_CONFIG_LOAD_INFO, 5318 spa->spa_load_info); 5319 } 5320 spa_unload(spa); 5321 spa_deactivate(spa); 5322 spa->spa_last_open_failed = error; 5323 if (locked) 5324 mutex_exit(&spa_namespace_lock); 5325 *spapp = NULL; 5326 return (error); 5327 } 5328 } 5329 5330 spa_open_ref(spa, tag); 5331 5332 if (config != NULL) 5333 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5334 5335 /* 5336 * If we've recovered the pool, pass back any information we 5337 * gathered while doing the load. 5338 */ 5339 if (state == SPA_LOAD_RECOVER && config != NULL) { 5340 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5341 spa->spa_load_info); 5342 } 5343 5344 if (locked) { 5345 spa->spa_last_open_failed = 0; 5346 spa->spa_last_ubsync_txg = 0; 5347 spa->spa_load_txg = 0; 5348 mutex_exit(&spa_namespace_lock); 5349 } 5350 5351 if (firstopen) 5352 zvol_create_minors_recursive(spa_name(spa)); 5353 5354 *spapp = spa; 5355 5356 return (0); 5357 } 5358 5359 int 5360 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5361 nvlist_t *policy, nvlist_t **config) 5362 { 5363 return (spa_open_common(name, spapp, tag, policy, config)); 5364 } 5365 5366 int 5367 spa_open(const char *name, spa_t **spapp, const void *tag) 5368 { 5369 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5370 } 5371 5372 /* 5373 * Lookup the given spa_t, incrementing the inject count in the process, 5374 * preventing it from being exported or destroyed. 5375 */ 5376 spa_t * 5377 spa_inject_addref(char *name) 5378 { 5379 spa_t *spa; 5380 5381 mutex_enter(&spa_namespace_lock); 5382 if ((spa = spa_lookup(name)) == NULL) { 5383 mutex_exit(&spa_namespace_lock); 5384 return (NULL); 5385 } 5386 spa->spa_inject_ref++; 5387 mutex_exit(&spa_namespace_lock); 5388 5389 return (spa); 5390 } 5391 5392 void 5393 spa_inject_delref(spa_t *spa) 5394 { 5395 mutex_enter(&spa_namespace_lock); 5396 spa->spa_inject_ref--; 5397 mutex_exit(&spa_namespace_lock); 5398 } 5399 5400 /* 5401 * Add spares device information to the nvlist. 
5402 */ 5403 static void 5404 spa_add_spares(spa_t *spa, nvlist_t *config) 5405 { 5406 nvlist_t **spares; 5407 uint_t i, nspares; 5408 nvlist_t *nvroot; 5409 uint64_t guid; 5410 vdev_stat_t *vs; 5411 uint_t vsc; 5412 uint64_t pool; 5413 5414 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5415 5416 if (spa->spa_spares.sav_count == 0) 5417 return; 5418 5419 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5420 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5421 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5422 if (nspares != 0) { 5423 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5424 (const nvlist_t * const *)spares, nspares); 5425 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5426 &spares, &nspares)); 5427 5428 /* 5429 * Go through and find any spares which have since been 5430 * repurposed as an active spare. If this is the case, update 5431 * their status appropriately. 5432 */ 5433 for (i = 0; i < nspares; i++) { 5434 guid = fnvlist_lookup_uint64(spares[i], 5435 ZPOOL_CONFIG_GUID); 5436 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5437 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5438 if (spa_spare_exists(guid, &pool, NULL) && 5439 pool != 0ULL) { 5440 vs->vs_state = VDEV_STATE_CANT_OPEN; 5441 vs->vs_aux = VDEV_AUX_SPARED; 5442 } else { 5443 vs->vs_state = 5444 spa->spa_spares.sav_vdevs[i]->vdev_state; 5445 } 5446 } 5447 } 5448 } 5449 5450 /* 5451 * Add l2cache device information to the nvlist, including vdev stats. 5452 */ 5453 static void 5454 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5455 { 5456 nvlist_t **l2cache; 5457 uint_t i, j, nl2cache; 5458 nvlist_t *nvroot; 5459 uint64_t guid; 5460 vdev_t *vd; 5461 vdev_stat_t *vs; 5462 uint_t vsc; 5463 5464 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5465 5466 if (spa->spa_l2cache.sav_count == 0) 5467 return; 5468 5469 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5470 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5471 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5472 if (nl2cache != 0) { 5473 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5474 (const nvlist_t * const *)l2cache, nl2cache); 5475 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5476 &l2cache, &nl2cache)); 5477 5478 /* 5479 * Update level 2 cache device stats. 
5480 */ 5481 5482 for (i = 0; i < nl2cache; i++) { 5483 guid = fnvlist_lookup_uint64(l2cache[i], 5484 ZPOOL_CONFIG_GUID); 5485 5486 vd = NULL; 5487 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5488 if (guid == 5489 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5490 vd = spa->spa_l2cache.sav_vdevs[j]; 5491 break; 5492 } 5493 } 5494 ASSERT(vd != NULL); 5495 5496 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5497 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5498 vdev_get_stats(vd, vs); 5499 vdev_config_generate_stats(vd, l2cache[i]); 5500 5501 } 5502 } 5503 } 5504 5505 static void 5506 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5507 { 5508 zap_cursor_t zc; 5509 zap_attribute_t za; 5510 5511 if (spa->spa_feat_for_read_obj != 0) { 5512 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5513 spa->spa_feat_for_read_obj); 5514 zap_cursor_retrieve(&zc, &za) == 0; 5515 zap_cursor_advance(&zc)) { 5516 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5517 za.za_num_integers == 1); 5518 VERIFY0(nvlist_add_uint64(features, za.za_name, 5519 za.za_first_integer)); 5520 } 5521 zap_cursor_fini(&zc); 5522 } 5523 5524 if (spa->spa_feat_for_write_obj != 0) { 5525 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5526 spa->spa_feat_for_write_obj); 5527 zap_cursor_retrieve(&zc, &za) == 0; 5528 zap_cursor_advance(&zc)) { 5529 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5530 za.za_num_integers == 1); 5531 VERIFY0(nvlist_add_uint64(features, za.za_name, 5532 za.za_first_integer)); 5533 } 5534 zap_cursor_fini(&zc); 5535 } 5536 } 5537 5538 static void 5539 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5540 { 5541 int i; 5542 5543 for (i = 0; i < SPA_FEATURES; i++) { 5544 zfeature_info_t feature = spa_feature_table[i]; 5545 uint64_t refcount; 5546 5547 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5548 continue; 5549 5550 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5551 } 5552 } 5553 5554 /* 5555 * Store a list of pool features and their reference counts in the 5556 * config. 5557 * 5558 * The first time this is called on a spa, allocate a new nvlist, fetch 5559 * the pool features and reference counts from disk, then save the list 5560 * in the spa. In subsequent calls on the same spa use the saved nvlist 5561 * and refresh its values from the cached reference counts. This 5562 * ensures we don't block here on I/O on a suspended pool so 'zpool 5563 * clear' can resume the pool. 
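 *
 * (Editorial note, not upstream spa.c text.)  A hedged sketch of the
 * lazy-cache pattern described above: the first call allocates the list
 * and fills it from disk, later calls only refresh values from the
 * in-memory refcount cache.  The types and helpers are stand-ins, not
 * real ZFS APIs:
 *
 *	#include <stdlib.h>
 *
 *	struct stats { unsigned long long counts[16]; };
 *
 *	static struct stats *
 *	get_feature_stats(struct stats **cached,
 *	    void (*fill_from_disk)(struct stats *),
 *	    void (*refresh_from_memory)(struct stats *))
 *	{
 *		if (*cached != NULL) {
 *			refresh_from_memory(*cached);	// never touches disk
 *			return (*cached);
 *		}
 *		*cached = calloc(1, sizeof (**cached));
 *		if (*cached != NULL)
 *			fill_from_disk(*cached);	// only on the first call
 *		return (*cached);
 *	}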
5564 */ 5565 static void 5566 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5567 { 5568 nvlist_t *features; 5569 5570 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5571 5572 mutex_enter(&spa->spa_feat_stats_lock); 5573 features = spa->spa_feat_stats; 5574 5575 if (features != NULL) { 5576 spa_feature_stats_from_cache(spa, features); 5577 } else { 5578 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 5579 spa->spa_feat_stats = features; 5580 spa_feature_stats_from_disk(spa, features); 5581 } 5582 5583 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 5584 features)); 5585 5586 mutex_exit(&spa->spa_feat_stats_lock); 5587 } 5588 5589 int 5590 spa_get_stats(const char *name, nvlist_t **config, 5591 char *altroot, size_t buflen) 5592 { 5593 int error; 5594 spa_t *spa; 5595 5596 *config = NULL; 5597 error = spa_open_common(name, &spa, FTAG, NULL, config); 5598 5599 if (spa != NULL) { 5600 /* 5601 * This still leaves a window of inconsistency where the spares 5602 * or l2cache devices could change and the config would be 5603 * self-inconsistent. 5604 */ 5605 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5606 5607 if (*config != NULL) { 5608 uint64_t loadtimes[2]; 5609 5610 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 5611 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 5612 fnvlist_add_uint64_array(*config, 5613 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 5614 5615 fnvlist_add_uint64(*config, 5616 ZPOOL_CONFIG_ERRCOUNT, 5617 spa_approx_errlog_size(spa)); 5618 5619 if (spa_suspended(spa)) { 5620 fnvlist_add_uint64(*config, 5621 ZPOOL_CONFIG_SUSPENDED, 5622 spa->spa_failmode); 5623 fnvlist_add_uint64(*config, 5624 ZPOOL_CONFIG_SUSPENDED_REASON, 5625 spa->spa_suspended); 5626 } 5627 5628 spa_add_spares(spa, *config); 5629 spa_add_l2cache(spa, *config); 5630 spa_add_feature_stats(spa, *config); 5631 } 5632 } 5633 5634 /* 5635 * We want to get the alternate root even for faulted pools, so we cheat 5636 * and call spa_lookup() directly. 5637 */ 5638 if (altroot) { 5639 if (spa == NULL) { 5640 mutex_enter(&spa_namespace_lock); 5641 spa = spa_lookup(name); 5642 if (spa) 5643 spa_altroot(spa, altroot, buflen); 5644 else 5645 altroot[0] = '\0'; 5646 spa = NULL; 5647 mutex_exit(&spa_namespace_lock); 5648 } else { 5649 spa_altroot(spa, altroot, buflen); 5650 } 5651 } 5652 5653 if (spa != NULL) { 5654 spa_config_exit(spa, SCL_CONFIG, FTAG); 5655 spa_close(spa, FTAG); 5656 } 5657 5658 return (error); 5659 } 5660 5661 /* 5662 * Validate that the auxiliary device array is well formed. We must have an 5663 * array of nvlists, each which describes a valid leaf vdev. If this is an 5664 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 5665 * specified, as long as they are well-formed. 5666 */ 5667 static int 5668 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 5669 spa_aux_vdev_t *sav, const char *config, uint64_t version, 5670 vdev_labeltype_t label) 5671 { 5672 nvlist_t **dev; 5673 uint_t i, ndev; 5674 vdev_t *vd; 5675 int error; 5676 5677 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5678 5679 /* 5680 * It's acceptable to have no devs specified. 5681 */ 5682 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 5683 return (0); 5684 5685 if (ndev == 0) 5686 return (SET_ERROR(EINVAL)); 5687 5688 /* 5689 * Make sure the pool is formatted with a version that supports this 5690 * device type. 
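 *
 * (spa_validate_aux() below passes SPA_VERSION_SPARES for hot spares and
 * SPA_VERSION_L2CACHE for cache devices, so pools created before those
 * versions reject such devices here with ENOTSUP.)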
5691 */ 5692 if (spa_version(spa) < version) 5693 return (SET_ERROR(ENOTSUP)); 5694 5695 /* 5696 * Set the pending device list so we correctly handle device in-use 5697 * checking. 5698 */ 5699 sav->sav_pending = dev; 5700 sav->sav_npending = ndev; 5701 5702 for (i = 0; i < ndev; i++) { 5703 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 5704 mode)) != 0) 5705 goto out; 5706 5707 if (!vd->vdev_ops->vdev_op_leaf) { 5708 vdev_free(vd); 5709 error = SET_ERROR(EINVAL); 5710 goto out; 5711 } 5712 5713 vd->vdev_top = vd; 5714 5715 if ((error = vdev_open(vd)) == 0 && 5716 (error = vdev_label_init(vd, crtxg, label)) == 0) { 5717 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 5718 vd->vdev_guid); 5719 } 5720 5721 vdev_free(vd); 5722 5723 if (error && 5724 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 5725 goto out; 5726 else 5727 error = 0; 5728 } 5729 5730 out: 5731 sav->sav_pending = NULL; 5732 sav->sav_npending = 0; 5733 return (error); 5734 } 5735 5736 static int 5737 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 5738 { 5739 int error; 5740 5741 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5742 5743 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5744 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 5745 VDEV_LABEL_SPARE)) != 0) { 5746 return (error); 5747 } 5748 5749 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5750 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 5751 VDEV_LABEL_L2CACHE)); 5752 } 5753 5754 static void 5755 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 5756 const char *config) 5757 { 5758 int i; 5759 5760 if (sav->sav_config != NULL) { 5761 nvlist_t **olddevs; 5762 uint_t oldndevs; 5763 nvlist_t **newdevs; 5764 5765 /* 5766 * Generate new dev list by concatenating with the 5767 * current dev list. 5768 */ 5769 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 5770 &olddevs, &oldndevs)); 5771 5772 newdevs = kmem_alloc(sizeof (void *) * 5773 (ndevs + oldndevs), KM_SLEEP); 5774 for (i = 0; i < oldndevs; i++) 5775 newdevs[i] = fnvlist_dup(olddevs[i]); 5776 for (i = 0; i < ndevs; i++) 5777 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 5778 5779 fnvlist_remove(sav->sav_config, config); 5780 5781 fnvlist_add_nvlist_array(sav->sav_config, config, 5782 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 5783 for (i = 0; i < oldndevs + ndevs; i++) 5784 nvlist_free(newdevs[i]); 5785 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 5786 } else { 5787 /* 5788 * Generate a new dev list. 5789 */ 5790 sav->sav_config = fnvlist_alloc(); 5791 fnvlist_add_nvlist_array(sav->sav_config, config, 5792 (const nvlist_t * const *)devs, ndevs); 5793 } 5794 } 5795 5796 /* 5797 * Stop and drop level 2 ARC devices 5798 */ 5799 void 5800 spa_l2cache_drop(spa_t *spa) 5801 { 5802 vdev_t *vd; 5803 int i; 5804 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5805 5806 for (i = 0; i < sav->sav_count; i++) { 5807 uint64_t pool; 5808 5809 vd = sav->sav_vdevs[i]; 5810 ASSERT(vd != NULL); 5811 5812 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 5813 pool != 0ULL && l2arc_vdev_present(vd)) 5814 l2arc_remove_vdev(vd); 5815 } 5816 } 5817 5818 /* 5819 * Verify encryption parameters for spa creation. If we are encrypting, we must 5820 * have the encryption feature flag enabled. 
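 *
 * For example (approximate zpool(8) syntax, shown only as an illustration):
 * "zpool create -O encryption=on -O keyformat=passphrase tank sda" passes
 * this check because newly created pools enable feature@encryption by
 * default, while adding "-o feature@encryption=disabled" (or creating with
 * "-d") makes the check fail with ENOTSUP before any labels are written.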
5821 */ 5822 static int 5823 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 5824 boolean_t has_encryption) 5825 { 5826 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 5827 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 5828 !has_encryption) 5829 return (SET_ERROR(ENOTSUP)); 5830 5831 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 5832 } 5833 5834 /* 5835 * Pool Creation 5836 */ 5837 int 5838 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 5839 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 5840 { 5841 spa_t *spa; 5842 const char *altroot = NULL; 5843 vdev_t *rvd; 5844 dsl_pool_t *dp; 5845 dmu_tx_t *tx; 5846 int error = 0; 5847 uint64_t txg = TXG_INITIAL; 5848 nvlist_t **spares, **l2cache; 5849 uint_t nspares, nl2cache; 5850 uint64_t version, obj, ndraid = 0; 5851 boolean_t has_features; 5852 boolean_t has_encryption; 5853 boolean_t has_allocclass; 5854 spa_feature_t feat; 5855 const char *feat_name; 5856 const char *poolname; 5857 nvlist_t *nvl; 5858 5859 if (props == NULL || 5860 nvlist_lookup_string(props, "tname", &poolname) != 0) 5861 poolname = (char *)pool; 5862 5863 /* 5864 * If this pool already exists, return failure. 5865 */ 5866 mutex_enter(&spa_namespace_lock); 5867 if (spa_lookup(poolname) != NULL) { 5868 mutex_exit(&spa_namespace_lock); 5869 return (SET_ERROR(EEXIST)); 5870 } 5871 5872 /* 5873 * Allocate a new spa_t structure. 5874 */ 5875 nvl = fnvlist_alloc(); 5876 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 5877 (void) nvlist_lookup_string(props, 5878 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5879 spa = spa_add(poolname, nvl, altroot); 5880 fnvlist_free(nvl); 5881 spa_activate(spa, spa_mode_global); 5882 5883 if (props && (error = spa_prop_validate(spa, props))) { 5884 spa_deactivate(spa); 5885 spa_remove(spa); 5886 mutex_exit(&spa_namespace_lock); 5887 return (error); 5888 } 5889 5890 /* 5891 * Temporary pool names should never be written to disk. 
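 *
 * For example (approximate zpool(8) syntax): "zpool create -t tmp tank ..."
 * creates a pool whose on-disk name is "tank" but which is inserted into
 * the namespace as "tmp"; the ZFS_IMPORT_TEMP_NAME flag set below keeps
 * the temporary name out of the labels and the config cache.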
5892 */ 5893 if (poolname != pool) 5894 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 5895 5896 has_features = B_FALSE; 5897 has_encryption = B_FALSE; 5898 has_allocclass = B_FALSE; 5899 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 5900 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 5901 if (zpool_prop_feature(nvpair_name(elem))) { 5902 has_features = B_TRUE; 5903 5904 feat_name = strchr(nvpair_name(elem), '@') + 1; 5905 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 5906 if (feat == SPA_FEATURE_ENCRYPTION) 5907 has_encryption = B_TRUE; 5908 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 5909 has_allocclass = B_TRUE; 5910 } 5911 } 5912 5913 /* verify encryption params, if they were provided */ 5914 if (dcp != NULL) { 5915 error = spa_create_check_encryption_params(dcp, has_encryption); 5916 if (error != 0) { 5917 spa_deactivate(spa); 5918 spa_remove(spa); 5919 mutex_exit(&spa_namespace_lock); 5920 return (error); 5921 } 5922 } 5923 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 5924 spa_deactivate(spa); 5925 spa_remove(spa); 5926 mutex_exit(&spa_namespace_lock); 5927 return (ENOTSUP); 5928 } 5929 5930 if (has_features || nvlist_lookup_uint64(props, 5931 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 5932 version = SPA_VERSION; 5933 } 5934 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 5935 5936 spa->spa_first_txg = txg; 5937 spa->spa_uberblock.ub_txg = txg - 1; 5938 spa->spa_uberblock.ub_version = version; 5939 spa->spa_ubsync = spa->spa_uberblock; 5940 spa->spa_load_state = SPA_LOAD_CREATE; 5941 spa->spa_removing_phys.sr_state = DSS_NONE; 5942 spa->spa_removing_phys.sr_removing_vdev = -1; 5943 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 5944 spa->spa_indirect_vdevs_loaded = B_TRUE; 5945 5946 /* 5947 * Create "The Godfather" zio to hold all async IOs 5948 */ 5949 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 5950 KM_SLEEP); 5951 for (int i = 0; i < max_ncpus; i++) { 5952 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 5953 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 5954 ZIO_FLAG_GODFATHER); 5955 } 5956 5957 /* 5958 * Create the root vdev. 5959 */ 5960 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5961 5962 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 5963 5964 ASSERT(error != 0 || rvd != NULL); 5965 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 5966 5967 if (error == 0 && !zfs_allocatable_devs(nvroot)) 5968 error = SET_ERROR(EINVAL); 5969 5970 if (error == 0 && 5971 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 5972 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 5973 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 5974 /* 5975 * instantiate the metaslab groups (this will dirty the vdevs) 5976 * we can no longer error exit past this point 5977 */ 5978 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 5979 vdev_t *vd = rvd->vdev_child[c]; 5980 5981 vdev_metaslab_set_size(vd); 5982 vdev_expand(vd, txg); 5983 } 5984 } 5985 5986 spa_config_exit(spa, SCL_ALL, FTAG); 5987 5988 if (error != 0) { 5989 spa_unload(spa); 5990 spa_deactivate(spa); 5991 spa_remove(spa); 5992 mutex_exit(&spa_namespace_lock); 5993 return (error); 5994 } 5995 5996 /* 5997 * Get the list of spares, if specified. 
5998 */ 5999 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6000 &spares, &nspares) == 0) { 6001 spa->spa_spares.sav_config = fnvlist_alloc(); 6002 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6003 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6004 nspares); 6005 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6006 spa_load_spares(spa); 6007 spa_config_exit(spa, SCL_ALL, FTAG); 6008 spa->spa_spares.sav_sync = B_TRUE; 6009 } 6010 6011 /* 6012 * Get the list of level 2 cache devices, if specified. 6013 */ 6014 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6015 &l2cache, &nl2cache) == 0) { 6016 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6017 NV_UNIQUE_NAME, KM_SLEEP)); 6018 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6019 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6020 nl2cache); 6021 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6022 spa_load_l2cache(spa); 6023 spa_config_exit(spa, SCL_ALL, FTAG); 6024 spa->spa_l2cache.sav_sync = B_TRUE; 6025 } 6026 6027 spa->spa_is_initializing = B_TRUE; 6028 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6029 spa->spa_is_initializing = B_FALSE; 6030 6031 /* 6032 * Create DDTs (dedup tables). 6033 */ 6034 ddt_create(spa); 6035 /* 6036 * Create BRT table and BRT table object. 6037 */ 6038 brt_create(spa); 6039 6040 spa_update_dspace(spa); 6041 6042 tx = dmu_tx_create_assigned(dp, txg); 6043 6044 /* 6045 * Create the pool's history object. 6046 */ 6047 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6048 spa_history_create_obj(spa, tx); 6049 6050 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6051 spa_history_log_version(spa, "create", tx); 6052 6053 /* 6054 * Create the pool config object. 6055 */ 6056 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6057 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6058 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6059 6060 if (zap_add(spa->spa_meta_objset, 6061 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6062 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6063 cmn_err(CE_PANIC, "failed to add pool config"); 6064 } 6065 6066 if (zap_add(spa->spa_meta_objset, 6067 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6068 sizeof (uint64_t), 1, &version, tx) != 0) { 6069 cmn_err(CE_PANIC, "failed to add pool version"); 6070 } 6071 6072 /* Newly created pools with the right version are always deflated. */ 6073 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6074 spa->spa_deflate = TRUE; 6075 if (zap_add(spa->spa_meta_objset, 6076 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6077 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6078 cmn_err(CE_PANIC, "failed to add deflate"); 6079 } 6080 } 6081 6082 /* 6083 * Create the deferred-free bpobj. Turn off compression 6084 * because sync-to-convergence takes longer if the blocksize 6085 * keeps changing. 6086 */ 6087 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6088 dmu_object_set_compress(spa->spa_meta_objset, obj, 6089 ZIO_COMPRESS_OFF, tx); 6090 if (zap_add(spa->spa_meta_objset, 6091 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6092 sizeof (uint64_t), 1, &obj, tx) != 0) { 6093 cmn_err(CE_PANIC, "failed to add bpobj"); 6094 } 6095 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6096 spa->spa_meta_objset, obj)); 6097 6098 /* 6099 * Generate some random noise for salted checksums to operate on. 
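 *
 * (The salt is consumed by the salted checksum algorithms, e.g. edonr and
 * skein; it is generated once at pool creation and presumably persisted
 * with the pool so it remains stable across export/import.)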
6100 */ 6101 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6102 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6103 6104 /* 6105 * Set pool properties. 6106 */ 6107 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6108 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6109 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6110 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6111 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6112 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6113 6114 if (props != NULL) { 6115 spa_configfile_set(spa, props, B_FALSE); 6116 spa_sync_props(props, tx); 6117 } 6118 6119 for (int i = 0; i < ndraid; i++) 6120 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6121 6122 dmu_tx_commit(tx); 6123 6124 spa->spa_sync_on = B_TRUE; 6125 txg_sync_start(dp); 6126 mmp_thread_start(spa); 6127 txg_wait_synced(dp, txg); 6128 6129 spa_spawn_aux_threads(spa); 6130 6131 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6132 6133 /* 6134 * Don't count references from objsets that are already closed 6135 * and are making their way through the eviction process. 6136 */ 6137 spa_evicting_os_wait(spa); 6138 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6139 spa->spa_load_state = SPA_LOAD_NONE; 6140 6141 spa_import_os(spa); 6142 6143 mutex_exit(&spa_namespace_lock); 6144 6145 return (0); 6146 } 6147 6148 /* 6149 * Import a non-root pool into the system. 6150 */ 6151 int 6152 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6153 { 6154 spa_t *spa; 6155 const char *altroot = NULL; 6156 spa_load_state_t state = SPA_LOAD_IMPORT; 6157 zpool_load_policy_t policy; 6158 spa_mode_t mode = spa_mode_global; 6159 uint64_t readonly = B_FALSE; 6160 int error; 6161 nvlist_t *nvroot; 6162 nvlist_t **spares, **l2cache; 6163 uint_t nspares, nl2cache; 6164 6165 /* 6166 * If a pool with this name exists, return failure. 6167 */ 6168 mutex_enter(&spa_namespace_lock); 6169 if (spa_lookup(pool) != NULL) { 6170 mutex_exit(&spa_namespace_lock); 6171 return (SET_ERROR(EEXIST)); 6172 } 6173 6174 /* 6175 * Create and initialize the spa structure. 6176 */ 6177 (void) nvlist_lookup_string(props, 6178 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6179 (void) nvlist_lookup_uint64(props, 6180 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6181 if (readonly) 6182 mode = SPA_MODE_READ; 6183 spa = spa_add(pool, config, altroot); 6184 spa->spa_import_flags = flags; 6185 6186 /* 6187 * Verbatim import - Take a pool and insert it into the namespace 6188 * as if it had been loaded at boot. 6189 */ 6190 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6191 if (props != NULL) 6192 spa_configfile_set(spa, props, B_FALSE); 6193 6194 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6195 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6196 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6197 mutex_exit(&spa_namespace_lock); 6198 return (0); 6199 } 6200 6201 spa_activate(spa, mode); 6202 6203 /* 6204 * Don't start async tasks until we know everything is healthy. 
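 *
 * (spa_async_suspend() below is paired with the spa_async_resume() call
 * made once the load and any property updates succeed; on failure the
 * pool is unloaded and removed while async tasks are still suspended.)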
6205 */ 6206 spa_async_suspend(spa); 6207 6208 zpool_get_load_policy(config, &policy); 6209 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6210 state = SPA_LOAD_RECOVER; 6211 6212 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6213 6214 if (state != SPA_LOAD_RECOVER) { 6215 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6216 zfs_dbgmsg("spa_import: importing %s", pool); 6217 } else { 6218 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6219 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6220 } 6221 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6222 6223 /* 6224 * Propagate anything learned while loading the pool and pass it 6225 * back to caller (i.e. rewind info, missing devices, etc). 6226 */ 6227 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6228 6229 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6230 /* 6231 * Toss any existing sparelist, as it doesn't have any validity 6232 * anymore, and conflicts with spa_has_spare(). 6233 */ 6234 if (spa->spa_spares.sav_config) { 6235 nvlist_free(spa->spa_spares.sav_config); 6236 spa->spa_spares.sav_config = NULL; 6237 spa_load_spares(spa); 6238 } 6239 if (spa->spa_l2cache.sav_config) { 6240 nvlist_free(spa->spa_l2cache.sav_config); 6241 spa->spa_l2cache.sav_config = NULL; 6242 spa_load_l2cache(spa); 6243 } 6244 6245 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6246 spa_config_exit(spa, SCL_ALL, FTAG); 6247 6248 if (props != NULL) 6249 spa_configfile_set(spa, props, B_FALSE); 6250 6251 if (error != 0 || (props && spa_writeable(spa) && 6252 (error = spa_prop_set(spa, props)))) { 6253 spa_unload(spa); 6254 spa_deactivate(spa); 6255 spa_remove(spa); 6256 mutex_exit(&spa_namespace_lock); 6257 return (error); 6258 } 6259 6260 spa_async_resume(spa); 6261 6262 /* 6263 * Override any spares and level 2 cache devices as specified by 6264 * the user, as these may have correct device names/devids, etc. 6265 */ 6266 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6267 &spares, &nspares) == 0) { 6268 if (spa->spa_spares.sav_config) 6269 fnvlist_remove(spa->spa_spares.sav_config, 6270 ZPOOL_CONFIG_SPARES); 6271 else 6272 spa->spa_spares.sav_config = fnvlist_alloc(); 6273 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6274 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6275 nspares); 6276 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6277 spa_load_spares(spa); 6278 spa_config_exit(spa, SCL_ALL, FTAG); 6279 spa->spa_spares.sav_sync = B_TRUE; 6280 } 6281 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6282 &l2cache, &nl2cache) == 0) { 6283 if (spa->spa_l2cache.sav_config) 6284 fnvlist_remove(spa->spa_l2cache.sav_config, 6285 ZPOOL_CONFIG_L2CACHE); 6286 else 6287 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6288 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6289 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6290 nl2cache); 6291 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6292 spa_load_l2cache(spa); 6293 spa_config_exit(spa, SCL_ALL, FTAG); 6294 spa->spa_l2cache.sav_sync = B_TRUE; 6295 } 6296 6297 /* 6298 * Check for any removed devices. 6299 */ 6300 if (spa->spa_autoreplace) { 6301 spa_aux_check_removed(&spa->spa_spares); 6302 spa_aux_check_removed(&spa->spa_l2cache); 6303 } 6304 6305 if (spa_writeable(spa)) { 6306 /* 6307 * Update the config cache to include the newly-imported pool. 
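 *
 * (spa_config_update() rewrites the on-disk configuration cache, typically
 * /etc/zfs/zpool.cache, subject to the pool's cachefile property, so the
 * pool can be picked up again automatically on the next boot.)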
6308 */ 6309 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6310 } 6311 6312 /* 6313 * It's possible that the pool was expanded while it was exported. 6314 * We kick off an async task to handle this for us. 6315 */ 6316 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6317 6318 spa_history_log_version(spa, "import", NULL); 6319 6320 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6321 6322 mutex_exit(&spa_namespace_lock); 6323 6324 zvol_create_minors_recursive(pool); 6325 6326 spa_import_os(spa); 6327 6328 return (0); 6329 } 6330 6331 nvlist_t * 6332 spa_tryimport(nvlist_t *tryconfig) 6333 { 6334 nvlist_t *config = NULL; 6335 const char *poolname, *cachefile; 6336 spa_t *spa; 6337 uint64_t state; 6338 int error; 6339 zpool_load_policy_t policy; 6340 6341 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6342 return (NULL); 6343 6344 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6345 return (NULL); 6346 6347 /* 6348 * Create and initialize the spa structure. 6349 */ 6350 mutex_enter(&spa_namespace_lock); 6351 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 6352 spa_activate(spa, SPA_MODE_READ); 6353 6354 /* 6355 * Rewind pool if a max txg was provided. 6356 */ 6357 zpool_get_load_policy(spa->spa_config, &policy); 6358 if (policy.zlp_txg != UINT64_MAX) { 6359 spa->spa_load_max_txg = policy.zlp_txg; 6360 spa->spa_extreme_rewind = B_TRUE; 6361 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6362 poolname, (longlong_t)policy.zlp_txg); 6363 } else { 6364 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6365 } 6366 6367 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6368 == 0) { 6369 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6370 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6371 } else { 6372 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6373 } 6374 6375 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6376 6377 /* 6378 * If 'tryconfig' was at least parsable, return the current config. 6379 */ 6380 if (spa->spa_root_vdev != NULL) { 6381 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6382 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6383 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6384 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6385 spa->spa_uberblock.ub_timestamp); 6386 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6387 spa->spa_load_info); 6388 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6389 spa->spa_errata); 6390 6391 /* 6392 * If the bootfs property exists on this pool then we 6393 * copy it out so that external consumers can tell which 6394 * pools are bootable. 6395 */ 6396 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6397 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6398 6399 /* 6400 * We have to play games with the name since the 6401 * pool was opened as TRYIMPORT_NAME. 6402 */ 6403 if (dsl_dsobj_to_dsname(spa_name(spa), 6404 spa->spa_bootfs, tmpname) == 0) { 6405 char *cp; 6406 char *dsname; 6407 6408 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6409 6410 cp = strchr(tmpname, '/'); 6411 if (cp == NULL) { 6412 (void) strlcpy(dsname, tmpname, 6413 MAXPATHLEN); 6414 } else { 6415 (void) snprintf(dsname, MAXPATHLEN, 6416 "%s/%s", poolname, ++cp); 6417 } 6418 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6419 dsname); 6420 kmem_free(dsname, MAXPATHLEN); 6421 } 6422 kmem_free(tmpname, MAXPATHLEN); 6423 } 6424 6425 /* 6426 * Add the list of hot spares and level 2 cache devices. 
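 *
 * (Both spa_add_spares() and spa_add_l2cache() ASSERT that SCL_CONFIG is
 * held as reader, hence the explicit spa_config_enter()/exit() pair
 * around them below.)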
6427 */ 6428 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6429 spa_add_spares(spa, config); 6430 spa_add_l2cache(spa, config); 6431 spa_config_exit(spa, SCL_CONFIG, FTAG); 6432 } 6433 6434 spa_unload(spa); 6435 spa_deactivate(spa); 6436 spa_remove(spa); 6437 mutex_exit(&spa_namespace_lock); 6438 6439 return (config); 6440 } 6441 6442 /* 6443 * Pool export/destroy 6444 * 6445 * The act of destroying or exporting a pool is very simple. We make sure there 6446 * is no more pending I/O and any references to the pool are gone. Then, we 6447 * update the pool state and sync all the labels to disk, removing the 6448 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6449 * we don't sync the labels or remove the configuration cache. 6450 */ 6451 static int 6452 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6453 boolean_t force, boolean_t hardforce) 6454 { 6455 int error; 6456 spa_t *spa; 6457 6458 if (oldconfig) 6459 *oldconfig = NULL; 6460 6461 if (!(spa_mode_global & SPA_MODE_WRITE)) 6462 return (SET_ERROR(EROFS)); 6463 6464 mutex_enter(&spa_namespace_lock); 6465 if ((spa = spa_lookup(pool)) == NULL) { 6466 mutex_exit(&spa_namespace_lock); 6467 return (SET_ERROR(ENOENT)); 6468 } 6469 6470 if (spa->spa_is_exporting) { 6471 /* the pool is being exported by another thread */ 6472 mutex_exit(&spa_namespace_lock); 6473 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6474 } 6475 spa->spa_is_exporting = B_TRUE; 6476 6477 /* 6478 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6479 * reacquire the namespace lock, and see if we can export. 6480 */ 6481 spa_open_ref(spa, FTAG); 6482 mutex_exit(&spa_namespace_lock); 6483 spa_async_suspend(spa); 6484 if (spa->spa_zvol_taskq) { 6485 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6486 taskq_wait(spa->spa_zvol_taskq); 6487 } 6488 mutex_enter(&spa_namespace_lock); 6489 spa_close(spa, FTAG); 6490 6491 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6492 goto export_spa; 6493 /* 6494 * The pool will be in core if it's openable, in which case we can 6495 * modify its state. Objsets may be open only because they're dirty, 6496 * so we have to force it to sync before checking spa_refcnt. 6497 */ 6498 if (spa->spa_sync_on) { 6499 txg_wait_synced(spa->spa_dsl_pool, 0); 6500 spa_evicting_os_wait(spa); 6501 } 6502 6503 /* 6504 * A pool cannot be exported or destroyed if there are active 6505 * references. If we are resetting a pool, allow references by 6506 * fault injection handlers. 6507 */ 6508 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6509 error = SET_ERROR(EBUSY); 6510 goto fail; 6511 } 6512 6513 if (spa->spa_sync_on) { 6514 vdev_t *rvd = spa->spa_root_vdev; 6515 /* 6516 * A pool cannot be exported if it has an active shared spare. 6517 * This is to prevent other pools stealing the active spare 6518 * from an exported pool. At user's own will, such pool can 6519 * be forcedly exported. 6520 */ 6521 if (!force && new_state == POOL_STATE_EXPORTED && 6522 spa_has_active_shared_spare(spa)) { 6523 error = SET_ERROR(EXDEV); 6524 goto fail; 6525 } 6526 6527 /* 6528 * We're about to export or destroy this pool. Make sure 6529 * we stop all initialization and trim activity here before 6530 * we set the spa_final_txg. This will ensure that all 6531 * dirty data resulting from the initialization is 6532 * committed to disk before we unload the pool. 
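 *
 * (Stopping with the *_ACTIVE state arguments below leaves the on-disk
 * progress marked as active, so an interrupted "zpool initialize" or
 * "zpool trim" can presumably resume where it left off once the pool is
 * imported again.)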
6533 */ 6534 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 6535 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 6536 vdev_autotrim_stop_all(spa); 6537 vdev_rebuild_stop_all(spa); 6538 6539 /* 6540 * We want this to be reflected on every label, 6541 * so mark them all dirty. spa_unload() will do the 6542 * final sync that pushes these changes out. 6543 */ 6544 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6545 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6546 spa->spa_state = new_state; 6547 vdev_config_dirty(rvd); 6548 spa_config_exit(spa, SCL_ALL, FTAG); 6549 } 6550 6551 /* 6552 * If the log space map feature is enabled and the pool is 6553 * getting exported (but not destroyed), we want to spend some 6554 * time flushing as many metaslabs as we can in an attempt to 6555 * destroy log space maps and save import time. This has to be 6556 * done before we set the spa_final_txg, otherwise 6557 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 6558 * spa_should_flush_logs_on_unload() should be called after 6559 * spa_state has been set to the new_state. 6560 */ 6561 if (spa_should_flush_logs_on_unload(spa)) 6562 spa_unload_log_sm_flush_all(spa); 6563 6564 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6565 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6566 spa->spa_final_txg = spa_last_synced_txg(spa) + 6567 TXG_DEFER_SIZE + 1; 6568 spa_config_exit(spa, SCL_ALL, FTAG); 6569 } 6570 } 6571 6572 export_spa: 6573 spa_export_os(spa); 6574 6575 if (new_state == POOL_STATE_DESTROYED) 6576 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 6577 else if (new_state == POOL_STATE_EXPORTED) 6578 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 6579 6580 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6581 spa_unload(spa); 6582 spa_deactivate(spa); 6583 } 6584 6585 if (oldconfig && spa->spa_config) 6586 *oldconfig = fnvlist_dup(spa->spa_config); 6587 6588 if (new_state != POOL_STATE_UNINITIALIZED) { 6589 if (!hardforce) 6590 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 6591 spa_remove(spa); 6592 } else { 6593 /* 6594 * If spa_remove() is not called for this spa_t and 6595 * there is any possibility that it can be reused, 6596 * we make sure to reset the exporting flag. 6597 */ 6598 spa->spa_is_exporting = B_FALSE; 6599 } 6600 6601 mutex_exit(&spa_namespace_lock); 6602 return (0); 6603 6604 fail: 6605 spa->spa_is_exporting = B_FALSE; 6606 spa_async_resume(spa); 6607 mutex_exit(&spa_namespace_lock); 6608 return (error); 6609 } 6610 6611 /* 6612 * Destroy a storage pool. 6613 */ 6614 int 6615 spa_destroy(const char *pool) 6616 { 6617 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 6618 B_FALSE, B_FALSE)); 6619 } 6620 6621 /* 6622 * Export a storage pool. 6623 */ 6624 int 6625 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 6626 boolean_t hardforce) 6627 { 6628 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 6629 force, hardforce)); 6630 } 6631 6632 /* 6633 * Similar to spa_export(), this unloads the spa_t without actually removing it 6634 * from the namespace in any way. 
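 *
 * (The POOL_STATE_UNINITIALIZED new_state used here makes
 * spa_export_common() skip the label state change and the cachefile
 * update, and the spa_t is not removed from the namespace, so the pool
 * can simply be reopened later under the same name.)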
6635 */ 6636 int 6637 spa_reset(const char *pool) 6638 { 6639 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 6640 B_FALSE, B_FALSE)); 6641 } 6642 6643 /* 6644 * ========================================================================== 6645 * Device manipulation 6646 * ========================================================================== 6647 */ 6648 6649 /* 6650 * This is called as a synctask to increment the draid feature flag 6651 */ 6652 static void 6653 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 6654 { 6655 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6656 int draid = (int)(uintptr_t)arg; 6657 6658 for (int c = 0; c < draid; c++) 6659 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6660 } 6661 6662 /* 6663 * Add a device to a storage pool. 6664 */ 6665 int 6666 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 6667 { 6668 uint64_t txg, ndraid = 0; 6669 int error; 6670 vdev_t *rvd = spa->spa_root_vdev; 6671 vdev_t *vd, *tvd; 6672 nvlist_t **spares, **l2cache; 6673 uint_t nspares, nl2cache; 6674 6675 ASSERT(spa_writeable(spa)); 6676 6677 txg = spa_vdev_enter(spa); 6678 6679 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 6680 VDEV_ALLOC_ADD)) != 0) 6681 return (spa_vdev_exit(spa, NULL, txg, error)); 6682 6683 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 6684 6685 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 6686 &nspares) != 0) 6687 nspares = 0; 6688 6689 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 6690 &nl2cache) != 0) 6691 nl2cache = 0; 6692 6693 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 6694 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6695 6696 if (vd->vdev_children != 0 && 6697 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 6698 return (spa_vdev_exit(spa, vd, txg, error)); 6699 } 6700 6701 /* 6702 * The virtual dRAID spares must be added after vdev tree is created 6703 * and the vdev guids are generated. The guid of their associated 6704 * dRAID is stored in the config and used when opening the spare. 6705 */ 6706 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 6707 rvd->vdev_children)) == 0) { 6708 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 6709 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 6710 nspares = 0; 6711 } else { 6712 return (spa_vdev_exit(spa, vd, txg, error)); 6713 } 6714 6715 /* 6716 * We must validate the spares and l2cache devices after checking the 6717 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 6718 */ 6719 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 6720 return (spa_vdev_exit(spa, vd, txg, error)); 6721 6722 /* 6723 * If we are in the middle of a device removal, we can only add 6724 * devices which match the existing devices in the pool. 6725 * If we are in the middle of a removal, or have some indirect 6726 * vdevs, we can not add raidz or dRAID top levels. 
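 *
 * For example (approximate zpool(8) syntax): while a top-level device
 * removal is in progress, "zpool add tank raidz sdb sdc sdd" is rejected
 * with EINVAL by the nparity check below, and a plain "zpool add tank sdb"
 * is rejected unless sdb's ashift matches the pool's spa_max_ashift.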
6727 */ 6728 if (spa->spa_vdev_removal != NULL || 6729 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 6730 for (int c = 0; c < vd->vdev_children; c++) { 6731 tvd = vd->vdev_child[c]; 6732 if (spa->spa_vdev_removal != NULL && 6733 tvd->vdev_ashift != spa->spa_max_ashift) { 6734 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6735 } 6736 /* Fail if top level vdev is raidz or a dRAID */ 6737 if (vdev_get_nparity(tvd) != 0) 6738 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6739 6740 /* 6741 * Need the top level mirror to be 6742 * a mirror of leaf vdevs only 6743 */ 6744 if (tvd->vdev_ops == &vdev_mirror_ops) { 6745 for (uint64_t cid = 0; 6746 cid < tvd->vdev_children; cid++) { 6747 vdev_t *cvd = tvd->vdev_child[cid]; 6748 if (!cvd->vdev_ops->vdev_op_leaf) { 6749 return (spa_vdev_exit(spa, vd, 6750 txg, EINVAL)); 6751 } 6752 } 6753 } 6754 } 6755 } 6756 6757 for (int c = 0; c < vd->vdev_children; c++) { 6758 tvd = vd->vdev_child[c]; 6759 vdev_remove_child(vd, tvd); 6760 tvd->vdev_id = rvd->vdev_children; 6761 vdev_add_child(rvd, tvd); 6762 vdev_config_dirty(tvd); 6763 } 6764 6765 if (nspares != 0) { 6766 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 6767 ZPOOL_CONFIG_SPARES); 6768 spa_load_spares(spa); 6769 spa->spa_spares.sav_sync = B_TRUE; 6770 } 6771 6772 if (nl2cache != 0) { 6773 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 6774 ZPOOL_CONFIG_L2CACHE); 6775 spa_load_l2cache(spa); 6776 spa->spa_l2cache.sav_sync = B_TRUE; 6777 } 6778 6779 /* 6780 * We can't increment a feature while holding spa_vdev so we 6781 * have to do it in a synctask. 6782 */ 6783 if (ndraid != 0) { 6784 dmu_tx_t *tx; 6785 6786 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 6787 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 6788 (void *)(uintptr_t)ndraid, tx); 6789 dmu_tx_commit(tx); 6790 } 6791 6792 /* 6793 * We have to be careful when adding new vdevs to an existing pool. 6794 * If other threads start allocating from these vdevs before we 6795 * sync the config cache, and we lose power, then upon reboot we may 6796 * fail to open the pool because there are DVAs that the config cache 6797 * can't translate. Therefore, we first add the vdevs without 6798 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 6799 * and then let spa_config_update() initialize the new metaslabs. 6800 * 6801 * spa_load() checks for added-but-not-initialized vdevs, so that 6802 * if we lose power at any point in this sequence, the remaining 6803 * steps will be completed the next time we load the pool. 6804 */ 6805 (void) spa_vdev_exit(spa, vd, txg, 0); 6806 6807 mutex_enter(&spa_namespace_lock); 6808 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6809 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 6810 mutex_exit(&spa_namespace_lock); 6811 6812 return (0); 6813 } 6814 6815 /* 6816 * Attach a device to a mirror. The arguments are the path to any device 6817 * in the mirror, and the nvroot for the new device. If the path specifies 6818 * a device that is not mirrored, we automatically insert the mirror vdev. 
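 *
 * For example (approximate zpool(8) syntax): "zpool attach tank sda sdb"
 * turns the single-disk top-level vdev backed by sda into a two-way
 * mirror of sda and sdb.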
6819 * 6820 * If 'replacing' is specified, the new device is intended to replace the 6821 * existing device; in this case the two devices are made into their own 6822 * mirror using the 'replacing' vdev, which is functionally identical to 6823 * the mirror vdev (it actually reuses all the same ops) but has a few 6824 * extra rules: you can't attach to it after it's been created, and upon 6825 * completion of resilvering, the first disk (the one being replaced) 6826 * is automatically detached. 6827 * 6828 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 6829 * should be performed instead of traditional healing reconstruction. From 6830 * an administrators perspective these are both resilver operations. 6831 */ 6832 int 6833 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 6834 int rebuild) 6835 { 6836 uint64_t txg, dtl_max_txg; 6837 vdev_t *rvd = spa->spa_root_vdev; 6838 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 6839 vdev_ops_t *pvops; 6840 char *oldvdpath, *newvdpath; 6841 int newvd_isspare; 6842 int error; 6843 6844 ASSERT(spa_writeable(spa)); 6845 6846 txg = spa_vdev_enter(spa); 6847 6848 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 6849 6850 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6851 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6852 error = (spa_has_checkpoint(spa)) ? 6853 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6854 return (spa_vdev_exit(spa, NULL, txg, error)); 6855 } 6856 6857 if (rebuild) { 6858 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 6859 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6860 6861 if (dsl_scan_resilvering(spa_get_dsl(spa))) 6862 return (spa_vdev_exit(spa, NULL, txg, 6863 ZFS_ERR_RESILVER_IN_PROGRESS)); 6864 } else { 6865 if (vdev_rebuild_active(rvd)) 6866 return (spa_vdev_exit(spa, NULL, txg, 6867 ZFS_ERR_REBUILD_IN_PROGRESS)); 6868 } 6869 6870 if (spa->spa_vdev_removal != NULL) 6871 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6872 6873 if (oldvd == NULL) 6874 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6875 6876 if (!oldvd->vdev_ops->vdev_op_leaf) 6877 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6878 6879 pvd = oldvd->vdev_parent; 6880 6881 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 6882 VDEV_ALLOC_ATTACH) != 0) 6883 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6884 6885 if (newrootvd->vdev_children != 1) 6886 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6887 6888 newvd = newrootvd->vdev_child[0]; 6889 6890 if (!newvd->vdev_ops->vdev_op_leaf) 6891 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6892 6893 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 6894 return (spa_vdev_exit(spa, newrootvd, txg, error)); 6895 6896 /* 6897 * log, dedup and special vdevs should not be replaced by spares. 6898 */ 6899 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 6900 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 6901 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6902 } 6903 6904 /* 6905 * A dRAID spare can only replace a child of its parent dRAID vdev. 6906 */ 6907 if (newvd->vdev_ops == &vdev_draid_spare_ops && 6908 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 6909 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6910 } 6911 6912 if (rebuild) { 6913 /* 6914 * For rebuilds, the top vdev must support reconstruction 6915 * using only space maps. This means the only allowable 6916 * vdevs types are the root vdev, a mirror, or dRAID. 
6917 */ 6918 tvd = pvd; 6919 if (pvd->vdev_top != NULL) 6920 tvd = pvd->vdev_top; 6921 6922 if (tvd->vdev_ops != &vdev_mirror_ops && 6923 tvd->vdev_ops != &vdev_root_ops && 6924 tvd->vdev_ops != &vdev_draid_ops) { 6925 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6926 } 6927 } 6928 6929 if (!replacing) { 6930 /* 6931 * For attach, the only allowable parent is a mirror or the root 6932 * vdev. 6933 */ 6934 if (pvd->vdev_ops != &vdev_mirror_ops && 6935 pvd->vdev_ops != &vdev_root_ops) 6936 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6937 6938 pvops = &vdev_mirror_ops; 6939 } else { 6940 /* 6941 * Active hot spares can only be replaced by inactive hot 6942 * spares. 6943 */ 6944 if (pvd->vdev_ops == &vdev_spare_ops && 6945 oldvd->vdev_isspare && 6946 !spa_has_spare(spa, newvd->vdev_guid)) 6947 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6948 6949 /* 6950 * If the source is a hot spare, and the parent isn't already a 6951 * spare, then we want to create a new hot spare. Otherwise, we 6952 * want to create a replacing vdev. The user is not allowed to 6953 * attach to a spared vdev child unless the 'isspare' state is 6954 * the same (spare replaces spare, non-spare replaces 6955 * non-spare). 6956 */ 6957 if (pvd->vdev_ops == &vdev_replacing_ops && 6958 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6959 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6960 } else if (pvd->vdev_ops == &vdev_spare_ops && 6961 newvd->vdev_isspare != oldvd->vdev_isspare) { 6962 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6963 } 6964 6965 if (newvd->vdev_isspare) 6966 pvops = &vdev_spare_ops; 6967 else 6968 pvops = &vdev_replacing_ops; 6969 } 6970 6971 /* 6972 * Make sure the new device is big enough. 6973 */ 6974 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6975 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6976 6977 /* 6978 * The new device cannot have a higher alignment requirement 6979 * than the top-level vdev. 6980 */ 6981 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6982 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6983 6984 /* 6985 * If this is an in-place replacement, update oldvd's path and devid 6986 * to make it distinguishable from newvd, and unopenable from now on. 6987 */ 6988 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 6989 spa_strfree(oldvd->vdev_path); 6990 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 6991 KM_SLEEP); 6992 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, 6993 "%s/%s", newvd->vdev_path, "old"); 6994 if (oldvd->vdev_devid != NULL) { 6995 spa_strfree(oldvd->vdev_devid); 6996 oldvd->vdev_devid = NULL; 6997 } 6998 } 6999 7000 /* 7001 * If the parent is not a mirror, or if we're replacing, insert the new 7002 * mirror/replacing/spare vdev above oldvd. 7003 */ 7004 if (pvd->vdev_ops != pvops) 7005 pvd = vdev_add_parent(oldvd, pvops); 7006 7007 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7008 ASSERT(pvd->vdev_ops == pvops); 7009 ASSERT(oldvd->vdev_parent == pvd); 7010 7011 /* 7012 * Extract the new device from its root and add it to pvd. 7013 */ 7014 vdev_remove_child(newrootvd, newvd); 7015 newvd->vdev_id = pvd->vdev_children; 7016 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7017 vdev_add_child(pvd, newvd); 7018 7019 /* 7020 * Reevaluate the parent vdev state. 
7021 */ 7022 vdev_propagate_state(pvd); 7023 7024 tvd = newvd->vdev_top; 7025 ASSERT(pvd->vdev_top == tvd); 7026 ASSERT(tvd->vdev_parent == rvd); 7027 7028 vdev_config_dirty(tvd); 7029 7030 /* 7031 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7032 * for any dmu_sync-ed blocks. It will propagate upward when 7033 * spa_vdev_exit() calls vdev_dtl_reassess(). 7034 */ 7035 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7036 7037 vdev_dtl_dirty(newvd, DTL_MISSING, 7038 TXG_INITIAL, dtl_max_txg - TXG_INITIAL); 7039 7040 if (newvd->vdev_isspare) { 7041 spa_spare_activate(newvd); 7042 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7043 } 7044 7045 oldvdpath = spa_strdup(oldvd->vdev_path); 7046 newvdpath = spa_strdup(newvd->vdev_path); 7047 newvd_isspare = newvd->vdev_isspare; 7048 7049 /* 7050 * Mark newvd's DTL dirty in this txg. 7051 */ 7052 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7053 7054 /* 7055 * Schedule the resilver or rebuild to restart in the future. We do 7056 * this to ensure that dmu_sync-ed blocks have been stitched into the 7057 * respective datasets. 7058 */ 7059 if (rebuild) { 7060 newvd->vdev_rebuild_txg = txg; 7061 7062 vdev_rebuild(tvd); 7063 } else { 7064 newvd->vdev_resilver_txg = txg; 7065 7066 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 7067 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { 7068 vdev_defer_resilver(newvd); 7069 } else { 7070 dsl_scan_restart_resilver(spa->spa_dsl_pool, 7071 dtl_max_txg); 7072 } 7073 } 7074 7075 if (spa->spa_bootfs) 7076 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 7077 7078 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 7079 7080 /* 7081 * Commit the config 7082 */ 7083 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 7084 7085 spa_history_log_internal(spa, "vdev attach", NULL, 7086 "%s vdev=%s %s vdev=%s", 7087 replacing && newvd_isspare ? "spare in" : 7088 replacing ? "replace" : "attach", newvdpath, 7089 replacing ? "for" : "to", oldvdpath); 7090 7091 spa_strfree(oldvdpath); 7092 spa_strfree(newvdpath); 7093 7094 return (0); 7095 } 7096 7097 /* 7098 * Detach a device from a mirror or replacing vdev. 7099 * 7100 * If 'replace_done' is specified, only detach if the parent 7101 * is a replacing vdev. 7102 */ 7103 int 7104 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7105 { 7106 uint64_t txg; 7107 int error; 7108 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7109 vdev_t *vd, *pvd, *cvd, *tvd; 7110 boolean_t unspare = B_FALSE; 7111 uint64_t unspare_guid = 0; 7112 char *vdpath; 7113 7114 ASSERT(spa_writeable(spa)); 7115 7116 txg = spa_vdev_detach_enter(spa, guid); 7117 7118 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7119 7120 /* 7121 * Besides being called directly from the userland through the 7122 * ioctl interface, spa_vdev_detach() can be potentially called 7123 * at the end of spa_vdev_resilver_done(). 7124 * 7125 * In the regular case, when we have a checkpoint this shouldn't 7126 * happen as we never empty the DTLs of a vdev during the scrub 7127 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 7128 * should never get here when we have a checkpoint. 7129 * 7130 * That said, even in a case when we checkpoint the pool exactly 7131 * as spa_vdev_resilver_done() calls this function everything 7132 * should be fine as the resilver will return right away. 7133 */ 7134 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7135 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7136 error = (spa_has_checkpoint(spa)) ? 
7137 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7138 return (spa_vdev_exit(spa, NULL, txg, error)); 7139 } 7140 7141 if (vd == NULL) 7142 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7143 7144 if (!vd->vdev_ops->vdev_op_leaf) 7145 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7146 7147 pvd = vd->vdev_parent; 7148 7149 /* 7150 * If the parent/child relationship is not as expected, don't do it. 7151 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7152 * vdev that's replacing B with C. The user's intent in replacing 7153 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7154 * the replace by detaching C, the expected behavior is to end up 7155 * M(A,B). But suppose that right after deciding to detach C, 7156 * the replacement of B completes. We would have M(A,C), and then 7157 * ask to detach C, which would leave us with just A -- not what 7158 * the user wanted. To prevent this, we make sure that the 7159 * parent/child relationship hasn't changed -- in this example, 7160 * that C's parent is still the replacing vdev R. 7161 */ 7162 if (pvd->vdev_guid != pguid && pguid != 0) 7163 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7164 7165 /* 7166 * Only 'replacing' or 'spare' vdevs can be replaced. 7167 */ 7168 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7169 pvd->vdev_ops != &vdev_spare_ops) 7170 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7171 7172 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7173 spa_version(spa) >= SPA_VERSION_SPARES); 7174 7175 /* 7176 * Only mirror, replacing, and spare vdevs support detach. 7177 */ 7178 if (pvd->vdev_ops != &vdev_replacing_ops && 7179 pvd->vdev_ops != &vdev_mirror_ops && 7180 pvd->vdev_ops != &vdev_spare_ops) 7181 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7182 7183 /* 7184 * If this device has the only valid copy of some data, 7185 * we cannot safely detach it. 7186 */ 7187 if (vdev_dtl_required(vd)) 7188 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7189 7190 ASSERT(pvd->vdev_children >= 2); 7191 7192 /* 7193 * If we are detaching the second disk from a replacing vdev, then 7194 * check to see if we changed the original vdev's path to have "/old" 7195 * at the end in spa_vdev_attach(). If so, undo that change now. 7196 */ 7197 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7198 vd->vdev_path != NULL) { 7199 size_t len = strlen(vd->vdev_path); 7200 7201 for (int c = 0; c < pvd->vdev_children; c++) { 7202 cvd = pvd->vdev_child[c]; 7203 7204 if (cvd == vd || cvd->vdev_path == NULL) 7205 continue; 7206 7207 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7208 strcmp(cvd->vdev_path + len, "/old") == 0) { 7209 spa_strfree(cvd->vdev_path); 7210 cvd->vdev_path = spa_strdup(vd->vdev_path); 7211 break; 7212 } 7213 } 7214 } 7215 7216 /* 7217 * If we are detaching the original disk from a normal spare, then it 7218 * implies that the spare should become a real disk, and be removed 7219 * from the active spare list for the pool. dRAID spares on the 7220 * other hand are coupled to the pool and thus should never be removed 7221 * from the spares list. 7222 */ 7223 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7224 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7225 7226 if (last_cvd->vdev_isspare && 7227 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7228 unspare = B_TRUE; 7229 } 7230 } 7231 7232 /* 7233 * Erase the disk labels so the disk can be used for other things. 
7234 * This must be done after all other error cases are handled, 7235 * but before we disembowel vd (so we can still do I/O to it). 7236 * But if we can't do it, don't treat the error as fatal -- 7237 * it may be that the unwritability of the disk is the reason 7238 * it's being detached! 7239 */ 7240 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7241 7242 /* 7243 * Remove vd from its parent and compact the parent's children. 7244 */ 7245 vdev_remove_child(pvd, vd); 7246 vdev_compact_children(pvd); 7247 7248 /* 7249 * Remember one of the remaining children so we can get tvd below. 7250 */ 7251 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7252 7253 /* 7254 * If we need to remove the remaining child from the list of hot spares, 7255 * do it now, marking the vdev as no longer a spare in the process. 7256 * We must do this before vdev_remove_parent(), because that can 7257 * change the GUID if it creates a new toplevel GUID. For a similar 7258 * reason, we must remove the spare now, in the same txg as the detach; 7259 * otherwise someone could attach a new sibling, change the GUID, and 7260 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7261 */ 7262 if (unspare) { 7263 ASSERT(cvd->vdev_isspare); 7264 spa_spare_remove(cvd); 7265 unspare_guid = cvd->vdev_guid; 7266 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7267 cvd->vdev_unspare = B_TRUE; 7268 } 7269 7270 /* 7271 * If the parent mirror/replacing vdev only has one child, 7272 * the parent is no longer needed. Remove it from the tree. 7273 */ 7274 if (pvd->vdev_children == 1) { 7275 if (pvd->vdev_ops == &vdev_spare_ops) 7276 cvd->vdev_unspare = B_FALSE; 7277 vdev_remove_parent(cvd); 7278 } 7279 7280 /* 7281 * We don't set tvd until now because the parent we just removed 7282 * may have been the previous top-level vdev. 7283 */ 7284 tvd = cvd->vdev_top; 7285 ASSERT(tvd->vdev_parent == rvd); 7286 7287 /* 7288 * Reevaluate the parent vdev state. 7289 */ 7290 vdev_propagate_state(cvd); 7291 7292 /* 7293 * If the 'autoexpand' property is set on the pool then automatically 7294 * try to expand the size of the pool. For example if the device we 7295 * just detached was smaller than the others, it may be possible to 7296 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7297 * first so that we can obtain the updated sizes of the leaf vdevs. 7298 */ 7299 if (spa->spa_autoexpand) { 7300 vdev_reopen(tvd); 7301 vdev_expand(tvd, txg); 7302 } 7303 7304 vdev_config_dirty(tvd); 7305 7306 /* 7307 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7308 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7309 * But first make sure we're not on any *other* txg's DTL list, to 7310 * prevent vd from being accessed after it's freed. 7311 */ 7312 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 7313 for (int t = 0; t < TXG_SIZE; t++) 7314 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7315 vd->vdev_detached = B_TRUE; 7316 vdev_dirty(tvd, VDD_DTL, vd, txg); 7317 7318 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7319 spa_notify_waiters(spa); 7320 7321 /* hang on to the spa before we release the lock */ 7322 spa_open_ref(spa, FTAG); 7323 7324 error = spa_vdev_exit(spa, vd, txg, 0); 7325 7326 spa_history_log_internal(spa, "detach", NULL, 7327 "vdev=%s", vdpath); 7328 spa_strfree(vdpath); 7329 7330 /* 7331 * If this was the removal of the original device in a hot spare vdev, 7332 * then we want to go through and remove the device from the hot spare 7333 * list of every other pool. 7334 */ 7335 if (unspare) { 7336 spa_t *altspa = NULL; 7337 7338 mutex_enter(&spa_namespace_lock); 7339 while ((altspa = spa_next(altspa)) != NULL) { 7340 if (altspa->spa_state != POOL_STATE_ACTIVE || 7341 altspa == spa) 7342 continue; 7343 7344 spa_open_ref(altspa, FTAG); 7345 mutex_exit(&spa_namespace_lock); 7346 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7347 mutex_enter(&spa_namespace_lock); 7348 spa_close(altspa, FTAG); 7349 } 7350 mutex_exit(&spa_namespace_lock); 7351 7352 /* search the rest of the vdevs for spares to remove */ 7353 spa_vdev_resilver_done(spa); 7354 } 7355 7356 /* all done with the spa; OK to release */ 7357 mutex_enter(&spa_namespace_lock); 7358 spa_close(spa, FTAG); 7359 mutex_exit(&spa_namespace_lock); 7360 7361 return (error); 7362 } 7363 7364 static int 7365 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7366 list_t *vd_list) 7367 { 7368 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7369 7370 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7371 7372 /* Look up vdev and ensure it's a leaf. */ 7373 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7374 if (vd == NULL || vd->vdev_detached) { 7375 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7376 return (SET_ERROR(ENODEV)); 7377 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7378 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7379 return (SET_ERROR(EINVAL)); 7380 } else if (!vdev_writeable(vd)) { 7381 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7382 return (SET_ERROR(EROFS)); 7383 } 7384 mutex_enter(&vd->vdev_initialize_lock); 7385 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7386 7387 /* 7388 * When we activate an initialize action we check to see 7389 * if the vdev_initialize_thread is NULL. We do this instead 7390 * of using the vdev_initialize_state since there might be 7391 * a previous initialization process which has completed but 7392 * the thread is not exited. 
7393 */ 7394 if (cmd_type == POOL_INITIALIZE_START && 7395 (vd->vdev_initialize_thread != NULL || 7396 vd->vdev_top->vdev_removing)) { 7397 mutex_exit(&vd->vdev_initialize_lock); 7398 return (SET_ERROR(EBUSY)); 7399 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7400 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7401 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7402 mutex_exit(&vd->vdev_initialize_lock); 7403 return (SET_ERROR(ESRCH)); 7404 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7405 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7406 mutex_exit(&vd->vdev_initialize_lock); 7407 return (SET_ERROR(ESRCH)); 7408 } 7409 7410 switch (cmd_type) { 7411 case POOL_INITIALIZE_START: 7412 vdev_initialize(vd); 7413 break; 7414 case POOL_INITIALIZE_CANCEL: 7415 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7416 break; 7417 case POOL_INITIALIZE_SUSPEND: 7418 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7419 break; 7420 default: 7421 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7422 } 7423 mutex_exit(&vd->vdev_initialize_lock); 7424 7425 return (0); 7426 } 7427 7428 int 7429 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7430 nvlist_t *vdev_errlist) 7431 { 7432 int total_errors = 0; 7433 list_t vd_list; 7434 7435 list_create(&vd_list, sizeof (vdev_t), 7436 offsetof(vdev_t, vdev_initialize_node)); 7437 7438 /* 7439 * We hold the namespace lock through the whole function 7440 * to prevent any changes to the pool while we're starting or 7441 * stopping initialization. The config and state locks are held so that 7442 * we can properly assess the vdev state before we commit to 7443 * the initializing operation. 7444 */ 7445 mutex_enter(&spa_namespace_lock); 7446 7447 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7448 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7449 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7450 7451 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7452 &vd_list); 7453 if (error != 0) { 7454 char guid_as_str[MAXNAMELEN]; 7455 7456 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7457 "%llu", (unsigned long long)vdev_guid); 7458 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7459 total_errors++; 7460 } 7461 } 7462 7463 /* Wait for all initialize threads to stop. */ 7464 vdev_initialize_stop_wait(spa, &vd_list); 7465 7466 /* Sync out the initializing state */ 7467 txg_wait_synced(spa->spa_dsl_pool, 0); 7468 mutex_exit(&spa_namespace_lock); 7469 7470 list_destroy(&vd_list); 7471 7472 return (total_errors); 7473 } 7474 7475 static int 7476 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7477 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 7478 { 7479 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7480 7481 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7482 7483 /* Look up vdev and ensure it's a leaf. 
*/ 7484 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7485 if (vd == NULL || vd->vdev_detached) { 7486 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7487 return (SET_ERROR(ENODEV)); 7488 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7489 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7490 return (SET_ERROR(EINVAL)); 7491 } else if (!vdev_writeable(vd)) { 7492 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7493 return (SET_ERROR(EROFS)); 7494 } else if (!vd->vdev_has_trim) { 7495 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7496 return (SET_ERROR(EOPNOTSUPP)); 7497 } else if (secure && !vd->vdev_has_securetrim) { 7498 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7499 return (SET_ERROR(EOPNOTSUPP)); 7500 } 7501 mutex_enter(&vd->vdev_trim_lock); 7502 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7503 7504 /* 7505 * When we activate a TRIM action, we check to see if the 7506 * vdev_trim_thread is NULL. We do this instead of using the 7507 * vdev_trim_state since there might be a previous TRIM process 7508 * which has completed but whose thread has not yet exited. 7509 */ 7510 if (cmd_type == POOL_TRIM_START && 7511 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { 7512 mutex_exit(&vd->vdev_trim_lock); 7513 return (SET_ERROR(EBUSY)); 7514 } else if (cmd_type == POOL_TRIM_CANCEL && 7515 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 7516 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 7517 mutex_exit(&vd->vdev_trim_lock); 7518 return (SET_ERROR(ESRCH)); 7519 } else if (cmd_type == POOL_TRIM_SUSPEND && 7520 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 7521 mutex_exit(&vd->vdev_trim_lock); 7522 return (SET_ERROR(ESRCH)); 7523 } 7524 7525 switch (cmd_type) { 7526 case POOL_TRIM_START: 7527 vdev_trim(vd, rate, partial, secure); 7528 break; 7529 case POOL_TRIM_CANCEL: 7530 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 7531 break; 7532 case POOL_TRIM_SUSPEND: 7533 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 7534 break; 7535 default: 7536 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7537 } 7538 mutex_exit(&vd->vdev_trim_lock); 7539 7540 return (0); 7541 } 7542 7543 /* 7544 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 7545 * TRIM threads for each child vdev. These threads pass over all of the free 7546 * space in the vdev's metaslabs and issue TRIM commands for that space. 7547 */ 7548 int 7549 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 7550 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 7551 { 7552 int total_errors = 0; 7553 list_t vd_list; 7554 7555 list_create(&vd_list, sizeof (vdev_t), 7556 offsetof(vdev_t, vdev_trim_node)); 7557 7558 /* 7559 * We hold the namespace lock through the whole function 7560 * to prevent any changes to the pool while we're starting or 7561 * stopping TRIM. The config and state locks are held so that 7562 * we can properly assess the vdev state before we commit to 7563 * the TRIM operation.
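 *
 * A minimal caller-side sketch (added for illustration; the nvlists are
 * normally assembled by the userspace/ioctl path, the key string is made
 * up, and leaf_guid stands for the target leaf vdev's GUID):
 *
 *	nvlist_t *vds = fnvlist_alloc();
 *	nvlist_t *errlist = fnvlist_alloc();
 *
 *	fnvlist_add_uint64(vds, "example-vdev", leaf_guid);
 *	error = spa_vdev_trim(spa, vds, POOL_TRIM_START,
 *	    0, B_FALSE, B_FALSE, errlist);
 *
 *	fnvlist_free(errlist);
 *	fnvlist_free(vds);
 *
 * The trailing arguments are rate (0 here, i.e. no rate limit), partial
 * and secure. Per-vdev failures are returned in errlist keyed by the
 * GUID rendered as a decimal string, and the return value is the total
 * error count.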
7564 */ 7565 mutex_enter(&spa_namespace_lock); 7566 7567 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7568 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7569 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7570 7571 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 7572 rate, partial, secure, &vd_list); 7573 if (error != 0) { 7574 char guid_as_str[MAXNAMELEN]; 7575 7576 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7577 "%llu", (unsigned long long)vdev_guid); 7578 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7579 total_errors++; 7580 } 7581 } 7582 7583 /* Wait for all TRIM threads to stop. */ 7584 vdev_trim_stop_wait(spa, &vd_list); 7585 7586 /* Sync out the TRIM state */ 7587 txg_wait_synced(spa->spa_dsl_pool, 0); 7588 mutex_exit(&spa_namespace_lock); 7589 7590 list_destroy(&vd_list); 7591 7592 return (total_errors); 7593 } 7594 7595 /* 7596 * Split a set of devices from their mirrors, and create a new pool from them. 7597 */ 7598 int 7599 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 7600 nvlist_t *props, boolean_t exp) 7601 { 7602 int error = 0; 7603 uint64_t txg, *glist; 7604 spa_t *newspa; 7605 uint_t c, children, lastlog; 7606 nvlist_t **child, *nvl, *tmp; 7607 dmu_tx_t *tx; 7608 const char *altroot = NULL; 7609 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 7610 boolean_t activate_slog; 7611 7612 ASSERT(spa_writeable(spa)); 7613 7614 txg = spa_vdev_enter(spa); 7615 7616 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7617 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7618 error = (spa_has_checkpoint(spa)) ? 7619 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7620 return (spa_vdev_exit(spa, NULL, txg, error)); 7621 } 7622 7623 /* clear the log and flush everything up to now */ 7624 activate_slog = spa_passivate_log(spa); 7625 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7626 error = spa_reset_logs(spa); 7627 txg = spa_vdev_config_enter(spa); 7628 7629 if (activate_slog) 7630 spa_activate_log(spa); 7631 7632 if (error != 0) 7633 return (spa_vdev_exit(spa, NULL, txg, error)); 7634 7635 /* check new spa name before going any further */ 7636 if (spa_lookup(newname) != NULL) 7637 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 7638 7639 /* 7640 * scan through all the children to ensure they're all mirrors 7641 */ 7642 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 7643 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 7644 &children) != 0) 7645 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7646 7647 /* first, check to ensure we've got the right child count */ 7648 rvd = spa->spa_root_vdev; 7649 lastlog = 0; 7650 for (c = 0; c < rvd->vdev_children; c++) { 7651 vdev_t *vd = rvd->vdev_child[c]; 7652 7653 /* don't count the holes & logs as children */ 7654 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 7655 !vdev_is_concrete(vd))) { 7656 if (lastlog == 0) 7657 lastlog = c; 7658 continue; 7659 } 7660 7661 lastlog = 0; 7662 } 7663 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 7664 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7665 7666 /* next, ensure no spare or cache devices are part of the split */ 7667 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 7668 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 7669 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7670 7671 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 7672 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 7673 7674 /* then, loop over each vdev and validate it */ 7675 for (c = 0; c < children; c++) { 7676 uint64_t is_hole = 0; 7677 7678 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 7679 &is_hole); 7680 7681 if (is_hole != 0) { 7682 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 7683 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 7684 continue; 7685 } else { 7686 error = SET_ERROR(EINVAL); 7687 break; 7688 } 7689 } 7690 7691 /* deal with indirect vdevs */ 7692 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 7693 &vdev_indirect_ops) 7694 continue; 7695 7696 /* which disk is going to be split? */ 7697 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 7698 &glist[c]) != 0) { 7699 error = SET_ERROR(EINVAL); 7700 break; 7701 } 7702 7703 /* look it up in the spa */ 7704 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 7705 if (vml[c] == NULL) { 7706 error = SET_ERROR(ENODEV); 7707 break; 7708 } 7709 7710 /* make sure there's nothing stopping the split */ 7711 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 7712 vml[c]->vdev_islog || 7713 !vdev_is_concrete(vml[c]) || 7714 vml[c]->vdev_isspare || 7715 vml[c]->vdev_isl2cache || 7716 !vdev_writeable(vml[c]) || 7717 vml[c]->vdev_children != 0 || 7718 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 7719 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 7720 error = SET_ERROR(EINVAL); 7721 break; 7722 } 7723 7724 if (vdev_dtl_required(vml[c]) || 7725 vdev_resilver_needed(vml[c], NULL, NULL)) { 7726 error = SET_ERROR(EBUSY); 7727 break; 7728 } 7729 7730 /* we need certain info from the top level */ 7731 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 7732 vml[c]->vdev_top->vdev_ms_array); 7733 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 7734 vml[c]->vdev_top->vdev_ms_shift); 7735 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 7736 vml[c]->vdev_top->vdev_asize); 7737 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 7738 vml[c]->vdev_top->vdev_ashift); 7739 7740 /* transfer per-vdev ZAPs */ 7741 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 7742 VERIFY0(nvlist_add_uint64(child[c], 7743 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 7744 7745 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 7746 VERIFY0(nvlist_add_uint64(child[c], 7747 ZPOOL_CONFIG_VDEV_TOP_ZAP, 7748 vml[c]->vdev_parent->vdev_top_zap)); 7749 } 7750 7751 if (error != 0) { 7752 kmem_free(vml, children * sizeof (vdev_t *)); 7753 kmem_free(glist, children * sizeof (uint64_t)); 7754 return (spa_vdev_exit(spa, NULL, txg, error)); 7755 } 7756 7757 /* stop writers from using the disks */ 7758 for (c = 0; c < children; c++) { 7759 if (vml[c] != NULL) 7760 vml[c]->vdev_offline = B_TRUE; 7761 } 7762 vdev_reopen(spa->spa_root_vdev); 7763 7764 /* 7765 * Temporarily record the splitting vdevs in the spa config. This 7766 * will disappear once the config is regenerated. 
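 *
 * Sketch of the transient layout (added for illustration; the nesting is
 * inferred from the code below):
 *
 *	spa_config:
 *		...
 *		ZPOOL_CONFIG_SPLIT:
 *			ZPOOL_CONFIG_SPLIT_LIST: [ guid0, guid1, ... ]
 *
 * Once the new pool has been loaded, its spa_config_splitting nvlist
 * instead carries ZPOOL_CONFIG_SPLIT_GUID (the GUID of the pool it was
 * split from), so that an interrupted split can be recognized later.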
7767 */ 7768 nvl = fnvlist_alloc(); 7769 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 7770 kmem_free(glist, children * sizeof (uint64_t)); 7771 7772 mutex_enter(&spa->spa_props_lock); 7773 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 7774 mutex_exit(&spa->spa_props_lock); 7775 spa->spa_config_splitting = nvl; 7776 vdev_config_dirty(spa->spa_root_vdev); 7777 7778 /* configure and create the new pool */ 7779 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 7780 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 7781 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 7782 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 7783 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 7784 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 7785 spa_generate_guid(NULL)); 7786 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 7787 (void) nvlist_lookup_string(props, 7788 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 7789 7790 /* add the new pool to the namespace */ 7791 newspa = spa_add(newname, config, altroot); 7792 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 7793 newspa->spa_config_txg = spa->spa_config_txg; 7794 spa_set_log_state(newspa, SPA_LOG_CLEAR); 7795 7796 /* release the spa config lock, retaining the namespace lock */ 7797 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7798 7799 if (zio_injection_enabled) 7800 zio_handle_panic_injection(spa, FTAG, 1); 7801 7802 spa_activate(newspa, spa_mode_global); 7803 spa_async_suspend(newspa); 7804 7805 /* 7806 * Temporarily stop the initializing and TRIM activity. We set the 7807 * state to ACTIVE so that we know to resume initializing or TRIM 7808 * once the split has completed. 7809 */ 7810 list_t vd_initialize_list; 7811 list_create(&vd_initialize_list, sizeof (vdev_t), 7812 offsetof(vdev_t, vdev_initialize_node)); 7813 7814 list_t vd_trim_list; 7815 list_create(&vd_trim_list, sizeof (vdev_t), 7816 offsetof(vdev_t, vdev_trim_node)); 7817 7818 for (c = 0; c < children; c++) { 7819 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7820 mutex_enter(&vml[c]->vdev_initialize_lock); 7821 vdev_initialize_stop(vml[c], 7822 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 7823 mutex_exit(&vml[c]->vdev_initialize_lock); 7824 7825 mutex_enter(&vml[c]->vdev_trim_lock); 7826 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 7827 mutex_exit(&vml[c]->vdev_trim_lock); 7828 } 7829 } 7830 7831 vdev_initialize_stop_wait(spa, &vd_initialize_list); 7832 vdev_trim_stop_wait(spa, &vd_trim_list); 7833 7834 list_destroy(&vd_initialize_list); 7835 list_destroy(&vd_trim_list); 7836 7837 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 7838 newspa->spa_is_splitting = B_TRUE; 7839 7840 /* create the new pool from the disks of the original pool */ 7841 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 7842 if (error) 7843 goto out; 7844 7845 /* if that worked, generate a real config for the new pool */ 7846 if (newspa->spa_root_vdev != NULL) { 7847 newspa->spa_config_splitting = fnvlist_alloc(); 7848 fnvlist_add_uint64(newspa->spa_config_splitting, 7849 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 7850 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 7851 B_TRUE)); 7852 } 7853 7854 /* set the props */ 7855 if (props != NULL) { 7856 spa_configfile_set(newspa, props, B_FALSE); 7857 error = spa_prop_set(newspa, props); 7858 if (error) 7859 goto out; 7860 } 7861 7862 /* flush everything */ 7863 txg = 
spa_vdev_config_enter(newspa); 7864 vdev_config_dirty(newspa->spa_root_vdev); 7865 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 7866 7867 if (zio_injection_enabled) 7868 zio_handle_panic_injection(spa, FTAG, 2); 7869 7870 spa_async_resume(newspa); 7871 7872 /* finally, update the original pool's config */ 7873 txg = spa_vdev_config_enter(spa); 7874 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 7875 error = dmu_tx_assign(tx, TXG_WAIT); 7876 if (error != 0) 7877 dmu_tx_abort(tx); 7878 for (c = 0; c < children; c++) { 7879 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7880 vdev_t *tvd = vml[c]->vdev_top; 7881 7882 /* 7883 * Need to be sure the detachable VDEV is not 7884 * on any *other* txg's DTL list to prevent it 7885 * from being accessed after it's freed. 7886 */ 7887 for (int t = 0; t < TXG_SIZE; t++) { 7888 (void) txg_list_remove_this( 7889 &tvd->vdev_dtl_list, vml[c], t); 7890 } 7891 7892 vdev_split(vml[c]); 7893 if (error == 0) 7894 spa_history_log_internal(spa, "detach", tx, 7895 "vdev=%s", vml[c]->vdev_path); 7896 7897 vdev_free(vml[c]); 7898 } 7899 } 7900 spa->spa_avz_action = AVZ_ACTION_REBUILD; 7901 vdev_config_dirty(spa->spa_root_vdev); 7902 spa->spa_config_splitting = NULL; 7903 nvlist_free(nvl); 7904 if (error == 0) 7905 dmu_tx_commit(tx); 7906 (void) spa_vdev_exit(spa, NULL, txg, 0); 7907 7908 if (zio_injection_enabled) 7909 zio_handle_panic_injection(spa, FTAG, 3); 7910 7911 /* split is complete; log a history record */ 7912 spa_history_log_internal(newspa, "split", NULL, 7913 "from pool %s", spa_name(spa)); 7914 7915 newspa->spa_is_splitting = B_FALSE; 7916 kmem_free(vml, children * sizeof (vdev_t *)); 7917 7918 /* if we're not going to mount the filesystems in userland, export */ 7919 if (exp) 7920 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 7921 B_FALSE, B_FALSE); 7922 7923 return (error); 7924 7925 out: 7926 spa_unload(newspa); 7927 spa_deactivate(newspa); 7928 spa_remove(newspa); 7929 7930 txg = spa_vdev_config_enter(spa); 7931 7932 /* re-online all offlined disks */ 7933 for (c = 0; c < children; c++) { 7934 if (vml[c] != NULL) 7935 vml[c]->vdev_offline = B_FALSE; 7936 } 7937 7938 /* restart initializing or trimming disks as necessary */ 7939 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 7940 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 7941 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 7942 7943 vdev_reopen(spa->spa_root_vdev); 7944 7945 nvlist_free(spa->spa_config_splitting); 7946 spa->spa_config_splitting = NULL; 7947 (void) spa_vdev_exit(spa, NULL, txg, error); 7948 7949 kmem_free(vml, children * sizeof (vdev_t *)); 7950 return (error); 7951 } 7952 7953 /* 7954 * Find any device that's done replacing, or a vdev marked 'unspare' that's 7955 * currently spared, so we can detach it. 7956 */ 7957 static vdev_t * 7958 spa_vdev_resilver_done_hunt(vdev_t *vd) 7959 { 7960 vdev_t *newvd, *oldvd; 7961 7962 for (int c = 0; c < vd->vdev_children; c++) { 7963 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 7964 if (oldvd != NULL) 7965 return (oldvd); 7966 } 7967 7968 /* 7969 * Check for a completed replacement. We always consider the first 7970 * vdev in the list to be the oldest vdev, and the last one to be 7971 * the newest (see spa_vdev_attach() for how that works). In 7972 * the case where the newest vdev is faulted, we will not automatically 7973 * remove it after a resilver completes. This is OK as it will require 7974 * user intervention to determine which disk the admin wishes to keep. 
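 *
 * Hypothetical example (added for illustration): after something like
 * "zpool replace tank A B", the relevant subtree is
 *
 *	replacing-0
 *	    A		<- child[0], the oldest vdev
 *	    B		<- child[children - 1], the newest vdev
 *
 * and A is the vdev returned for detach once B's DTL_MISSING and
 * DTL_OUTAGE are empty and A is no longer required.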
7975 */ 7976 if (vd->vdev_ops == &vdev_replacing_ops) { 7977 ASSERT(vd->vdev_children > 1); 7978 7979 newvd = vd->vdev_child[vd->vdev_children - 1]; 7980 oldvd = vd->vdev_child[0]; 7981 7982 if (vdev_dtl_empty(newvd, DTL_MISSING) && 7983 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7984 !vdev_dtl_required(oldvd)) 7985 return (oldvd); 7986 } 7987 7988 /* 7989 * Check for a completed resilver with the 'unspare' flag set. 7990 * Also potentially update faulted state. 7991 */ 7992 if (vd->vdev_ops == &vdev_spare_ops) { 7993 vdev_t *first = vd->vdev_child[0]; 7994 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 7995 7996 if (last->vdev_unspare) { 7997 oldvd = first; 7998 newvd = last; 7999 } else if (first->vdev_unspare) { 8000 oldvd = last; 8001 newvd = first; 8002 } else { 8003 oldvd = NULL; 8004 } 8005 8006 if (oldvd != NULL && 8007 vdev_dtl_empty(newvd, DTL_MISSING) && 8008 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8009 !vdev_dtl_required(oldvd)) 8010 return (oldvd); 8011 8012 vdev_propagate_state(vd); 8013 8014 /* 8015 * If there are more than two spares attached to a disk, 8016 * and those spares are not required, then we want to 8017 * attempt to free them up now so that they can be used 8018 * by other pools. Once we're back down to a single 8019 * disk+spare, we stop removing them. 8020 */ 8021 if (vd->vdev_children > 2) { 8022 newvd = vd->vdev_child[1]; 8023 8024 if (newvd->vdev_isspare && last->vdev_isspare && 8025 vdev_dtl_empty(last, DTL_MISSING) && 8026 vdev_dtl_empty(last, DTL_OUTAGE) && 8027 !vdev_dtl_required(newvd)) 8028 return (newvd); 8029 } 8030 } 8031 8032 return (NULL); 8033 } 8034 8035 static void 8036 spa_vdev_resilver_done(spa_t *spa) 8037 { 8038 vdev_t *vd, *pvd, *ppvd; 8039 uint64_t guid, sguid, pguid, ppguid; 8040 8041 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8042 8043 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8044 pvd = vd->vdev_parent; 8045 ppvd = pvd->vdev_parent; 8046 guid = vd->vdev_guid; 8047 pguid = pvd->vdev_guid; 8048 ppguid = ppvd->vdev_guid; 8049 sguid = 0; 8050 /* 8051 * If we have just finished replacing a hot spared device, then 8052 * we need to detach the parent's first child (the original hot 8053 * spare) as well. 8054 */ 8055 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8056 ppvd->vdev_children == 2) { 8057 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8058 sguid = ppvd->vdev_child[1]->vdev_guid; 8059 } 8060 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8061 8062 spa_config_exit(spa, SCL_ALL, FTAG); 8063 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8064 return; 8065 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8066 return; 8067 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8068 } 8069 8070 spa_config_exit(spa, SCL_ALL, FTAG); 8071 8072 /* 8073 * If a detach was not performed above replace waiters will not have 8074 * been notified. In which case we must do so now. 8075 */ 8076 spa_notify_waiters(spa); 8077 } 8078 8079 /* 8080 * Update the stored path or FRU for this vdev. 
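 *
 * Hedged usage sketch (the device path below is made up; the usual
 * callers are the corresponding vdev "set path"/"set FRU" ioctl
 * handlers):
 *
 *	error = spa_vdev_setpath(spa, guid,
 *	    "/dev/disk/by-id/ata-EXAMPLE-part1");
 *
 * Both setters funnel into spa_vdev_set_common(), which only requests a
 * config sync when the stored value actually changes.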
8081 */ 8082 static int 8083 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8084 boolean_t ispath) 8085 { 8086 vdev_t *vd; 8087 boolean_t sync = B_FALSE; 8088 8089 ASSERT(spa_writeable(spa)); 8090 8091 spa_vdev_state_enter(spa, SCL_ALL); 8092 8093 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8094 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8095 8096 if (!vd->vdev_ops->vdev_op_leaf) 8097 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8098 8099 if (ispath) { 8100 if (strcmp(value, vd->vdev_path) != 0) { 8101 spa_strfree(vd->vdev_path); 8102 vd->vdev_path = spa_strdup(value); 8103 sync = B_TRUE; 8104 } 8105 } else { 8106 if (vd->vdev_fru == NULL) { 8107 vd->vdev_fru = spa_strdup(value); 8108 sync = B_TRUE; 8109 } else if (strcmp(value, vd->vdev_fru) != 0) { 8110 spa_strfree(vd->vdev_fru); 8111 vd->vdev_fru = spa_strdup(value); 8112 sync = B_TRUE; 8113 } 8114 } 8115 8116 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8117 } 8118 8119 int 8120 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8121 { 8122 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8123 } 8124 8125 int 8126 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8127 { 8128 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8129 } 8130 8131 /* 8132 * ========================================================================== 8133 * SPA Scanning 8134 * ========================================================================== 8135 */ 8136 int 8137 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8138 { 8139 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8140 8141 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8142 return (SET_ERROR(EBUSY)); 8143 8144 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8145 } 8146 8147 int 8148 spa_scan_stop(spa_t *spa) 8149 { 8150 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8151 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8152 return (SET_ERROR(EBUSY)); 8153 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8154 } 8155 8156 int 8157 spa_scan(spa_t *spa, pool_scan_func_t func) 8158 { 8159 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8160 8161 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8162 return (SET_ERROR(ENOTSUP)); 8163 8164 if (func == POOL_SCAN_RESILVER && 8165 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8166 return (SET_ERROR(ENOTSUP)); 8167 8168 /* 8169 * If a resilver was requested, but there is no DTL on a 8170 * writeable leaf device, we have nothing to do. 8171 */ 8172 if (func == POOL_SCAN_RESILVER && 8173 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8174 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8175 return (0); 8176 } 8177 8178 return (dsl_scan(spa->spa_dsl_pool, func)); 8179 } 8180 8181 /* 8182 * ========================================================================== 8183 * SPA async task processing 8184 * ========================================================================== 8185 */ 8186 8187 static void 8188 spa_async_remove(spa_t *spa, vdev_t *vd) 8189 { 8190 if (vd->vdev_remove_wanted) { 8191 vd->vdev_remove_wanted = B_FALSE; 8192 vd->vdev_delayed_close = B_FALSE; 8193 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8194 8195 /* 8196 * We want to clear the stats, but we don't want to do a full 8197 * vdev_clear() as that will cause us to throw away 8198 * degraded/faulted state as well as attempt to reopen the 8199 * device, all of which is a waste. 
8200 */ 8201 vd->vdev_stat.vs_read_errors = 0; 8202 vd->vdev_stat.vs_write_errors = 0; 8203 vd->vdev_stat.vs_checksum_errors = 0; 8204 8205 vdev_state_dirty(vd->vdev_top); 8206 8207 /* Tell userspace that the vdev is gone. */ 8208 zfs_post_remove(spa, vd); 8209 } 8210 8211 for (int c = 0; c < vd->vdev_children; c++) 8212 spa_async_remove(spa, vd->vdev_child[c]); 8213 } 8214 8215 static void 8216 spa_async_probe(spa_t *spa, vdev_t *vd) 8217 { 8218 if (vd->vdev_probe_wanted) { 8219 vd->vdev_probe_wanted = B_FALSE; 8220 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8221 } 8222 8223 for (int c = 0; c < vd->vdev_children; c++) 8224 spa_async_probe(spa, vd->vdev_child[c]); 8225 } 8226 8227 static void 8228 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8229 { 8230 if (!spa->spa_autoexpand) 8231 return; 8232 8233 for (int c = 0; c < vd->vdev_children; c++) { 8234 vdev_t *cvd = vd->vdev_child[c]; 8235 spa_async_autoexpand(spa, cvd); 8236 } 8237 8238 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8239 return; 8240 8241 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8242 } 8243 8244 static __attribute__((noreturn)) void 8245 spa_async_thread(void *arg) 8246 { 8247 spa_t *spa = (spa_t *)arg; 8248 dsl_pool_t *dp = spa->spa_dsl_pool; 8249 int tasks; 8250 8251 ASSERT(spa->spa_sync_on); 8252 8253 mutex_enter(&spa->spa_async_lock); 8254 tasks = spa->spa_async_tasks; 8255 spa->spa_async_tasks = 0; 8256 mutex_exit(&spa->spa_async_lock); 8257 8258 /* 8259 * See if the config needs to be updated. 8260 */ 8261 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8262 uint64_t old_space, new_space; 8263 8264 mutex_enter(&spa_namespace_lock); 8265 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8266 old_space += metaslab_class_get_space(spa_special_class(spa)); 8267 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8268 old_space += metaslab_class_get_space( 8269 spa_embedded_log_class(spa)); 8270 8271 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8272 8273 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8274 new_space += metaslab_class_get_space(spa_special_class(spa)); 8275 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8276 new_space += metaslab_class_get_space( 8277 spa_embedded_log_class(spa)); 8278 mutex_exit(&spa_namespace_lock); 8279 8280 /* 8281 * If the pool grew as a result of the config update, 8282 * then log an internal history event. 8283 */ 8284 if (new_space != old_space) { 8285 spa_history_log_internal(spa, "vdev online", NULL, 8286 "pool '%s' size: %llu(+%llu)", 8287 spa_name(spa), (u_longlong_t)new_space, 8288 (u_longlong_t)(new_space - old_space)); 8289 } 8290 } 8291 8292 /* 8293 * See if any devices need to be marked REMOVED. 8294 */ 8295 if (tasks & SPA_ASYNC_REMOVE) { 8296 spa_vdev_state_enter(spa, SCL_NONE); 8297 spa_async_remove(spa, spa->spa_root_vdev); 8298 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8299 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8300 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8301 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8302 (void) spa_vdev_state_exit(spa, NULL, 0); 8303 } 8304 8305 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8306 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8307 spa_async_autoexpand(spa, spa->spa_root_vdev); 8308 spa_config_exit(spa, SCL_CONFIG, FTAG); 8309 } 8310 8311 /* 8312 * See if any devices need to be probed. 
8313 */ 8314 if (tasks & SPA_ASYNC_PROBE) { 8315 spa_vdev_state_enter(spa, SCL_NONE); 8316 spa_async_probe(spa, spa->spa_root_vdev); 8317 (void) spa_vdev_state_exit(spa, NULL, 0); 8318 } 8319 8320 /* 8321 * If any devices are done replacing, detach them. 8322 */ 8323 if (tasks & SPA_ASYNC_RESILVER_DONE || 8324 tasks & SPA_ASYNC_REBUILD_DONE) { 8325 spa_vdev_resilver_done(spa); 8326 } 8327 8328 /* 8329 * Kick off a resilver. 8330 */ 8331 if (tasks & SPA_ASYNC_RESILVER && 8332 !vdev_rebuild_active(spa->spa_root_vdev) && 8333 (!dsl_scan_resilvering(dp) || 8334 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8335 dsl_scan_restart_resilver(dp, 0); 8336 8337 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8338 mutex_enter(&spa_namespace_lock); 8339 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8340 vdev_initialize_restart(spa->spa_root_vdev); 8341 spa_config_exit(spa, SCL_CONFIG, FTAG); 8342 mutex_exit(&spa_namespace_lock); 8343 } 8344 8345 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8346 mutex_enter(&spa_namespace_lock); 8347 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8348 vdev_trim_restart(spa->spa_root_vdev); 8349 spa_config_exit(spa, SCL_CONFIG, FTAG); 8350 mutex_exit(&spa_namespace_lock); 8351 } 8352 8353 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8354 mutex_enter(&spa_namespace_lock); 8355 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8356 vdev_autotrim_restart(spa); 8357 spa_config_exit(spa, SCL_CONFIG, FTAG); 8358 mutex_exit(&spa_namespace_lock); 8359 } 8360 8361 /* 8362 * Kick off L2 cache whole device TRIM. 8363 */ 8364 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8365 mutex_enter(&spa_namespace_lock); 8366 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8367 vdev_trim_l2arc(spa); 8368 spa_config_exit(spa, SCL_CONFIG, FTAG); 8369 mutex_exit(&spa_namespace_lock); 8370 } 8371 8372 /* 8373 * Kick off L2 cache rebuilding. 8374 */ 8375 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8376 mutex_enter(&spa_namespace_lock); 8377 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8378 l2arc_spa_rebuild_start(spa); 8379 spa_config_exit(spa, SCL_L2ARC, FTAG); 8380 mutex_exit(&spa_namespace_lock); 8381 } 8382 8383 /* 8384 * Let the world know that we're done. 
8385 */ 8386 mutex_enter(&spa->spa_async_lock); 8387 spa->spa_async_thread = NULL; 8388 cv_broadcast(&spa->spa_async_cv); 8389 mutex_exit(&spa->spa_async_lock); 8390 thread_exit(); 8391 } 8392 8393 void 8394 spa_async_suspend(spa_t *spa) 8395 { 8396 mutex_enter(&spa->spa_async_lock); 8397 spa->spa_async_suspended++; 8398 while (spa->spa_async_thread != NULL) 8399 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8400 mutex_exit(&spa->spa_async_lock); 8401 8402 spa_vdev_remove_suspend(spa); 8403 8404 zthr_t *condense_thread = spa->spa_condense_zthr; 8405 if (condense_thread != NULL) 8406 zthr_cancel(condense_thread); 8407 8408 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8409 if (discard_thread != NULL) 8410 zthr_cancel(discard_thread); 8411 8412 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8413 if (ll_delete_thread != NULL) 8414 zthr_cancel(ll_delete_thread); 8415 8416 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8417 if (ll_condense_thread != NULL) 8418 zthr_cancel(ll_condense_thread); 8419 } 8420 8421 void 8422 spa_async_resume(spa_t *spa) 8423 { 8424 mutex_enter(&spa->spa_async_lock); 8425 ASSERT(spa->spa_async_suspended != 0); 8426 spa->spa_async_suspended--; 8427 mutex_exit(&spa->spa_async_lock); 8428 spa_restart_removal(spa); 8429 8430 zthr_t *condense_thread = spa->spa_condense_zthr; 8431 if (condense_thread != NULL) 8432 zthr_resume(condense_thread); 8433 8434 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8435 if (discard_thread != NULL) 8436 zthr_resume(discard_thread); 8437 8438 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8439 if (ll_delete_thread != NULL) 8440 zthr_resume(ll_delete_thread); 8441 8442 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8443 if (ll_condense_thread != NULL) 8444 zthr_resume(ll_condense_thread); 8445 } 8446 8447 static boolean_t 8448 spa_async_tasks_pending(spa_t *spa) 8449 { 8450 uint_t non_config_tasks; 8451 uint_t config_task; 8452 boolean_t config_task_suspended; 8453 8454 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 8455 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 8456 if (spa->spa_ccw_fail_time == 0) { 8457 config_task_suspended = B_FALSE; 8458 } else { 8459 config_task_suspended = 8460 (gethrtime() - spa->spa_ccw_fail_time) < 8461 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 8462 } 8463 8464 return (non_config_tasks || (config_task && !config_task_suspended)); 8465 } 8466 8467 static void 8468 spa_async_dispatch(spa_t *spa) 8469 { 8470 mutex_enter(&spa->spa_async_lock); 8471 if (spa_async_tasks_pending(spa) && 8472 !spa->spa_async_suspended && 8473 spa->spa_async_thread == NULL) 8474 spa->spa_async_thread = thread_create(NULL, 0, 8475 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 8476 mutex_exit(&spa->spa_async_lock); 8477 } 8478 8479 void 8480 spa_async_request(spa_t *spa, int task) 8481 { 8482 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 8483 mutex_enter(&spa->spa_async_lock); 8484 spa->spa_async_tasks |= task; 8485 mutex_exit(&spa->spa_async_lock); 8486 } 8487 8488 int 8489 spa_async_tasks(spa_t *spa) 8490 { 8491 return (spa->spa_async_tasks); 8492 } 8493 8494 /* 8495 * ========================================================================== 8496 * SPA syncing routines 8497 * ========================================================================== 8498 */ 8499 8500 8501 static int 8502 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8503 dmu_tx_t *tx) 8504 { 8505 
bpobj_t *bpo = arg; 8506 bpobj_enqueue(bpo, bp, bp_freed, tx); 8507 return (0); 8508 } 8509 8510 int 8511 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8512 { 8513 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 8514 } 8515 8516 int 8517 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8518 { 8519 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 8520 } 8521 8522 static int 8523 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8524 { 8525 zio_t *pio = arg; 8526 8527 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 8528 pio->io_flags)); 8529 return (0); 8530 } 8531 8532 static int 8533 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8534 dmu_tx_t *tx) 8535 { 8536 ASSERT(!bp_freed); 8537 return (spa_free_sync_cb(arg, bp, tx)); 8538 } 8539 8540 /* 8541 * Note: this simple function is not inlined to make it easier to dtrace the 8542 * amount of time spent syncing frees. 8543 */ 8544 static void 8545 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 8546 { 8547 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8548 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 8549 VERIFY(zio_wait(zio) == 0); 8550 } 8551 8552 /* 8553 * Note: this simple function is not inlined to make it easier to dtrace the 8554 * amount of time spent syncing deferred frees. 8555 */ 8556 static void 8557 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 8558 { 8559 if (spa_sync_pass(spa) != 1) 8560 return; 8561 8562 /* 8563 * Note: 8564 * If the log space map feature is active, we stop deferring 8565 * frees to the next TXG and therefore running this function 8566 * would be considered a no-op as spa_deferred_bpobj should 8567 * not have any entries. 8568 * 8569 * That said we run this function anyway (instead of returning 8570 * immediately) for the edge-case scenario where we just 8571 * activated the log space map feature in this TXG but we have 8572 * deferred frees from the previous TXG. 8573 */ 8574 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8575 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 8576 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 8577 VERIFY0(zio_wait(zio)); 8578 } 8579 8580 static void 8581 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 8582 { 8583 char *packed = NULL; 8584 size_t bufsize; 8585 size_t nvsize = 0; 8586 dmu_buf_t *db; 8587 8588 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 8589 8590 /* 8591 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 8592 * information. This avoids the dmu_buf_will_dirty() path and 8593 * saves us a pre-read to get data we don't actually care about. 8594 */ 8595 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 8596 packed = vmem_alloc(bufsize, KM_SLEEP); 8597 8598 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 8599 KM_SLEEP) == 0); 8600 memset(packed + nvsize, 0, bufsize - nvsize); 8601 8602 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 8603 8604 vmem_free(packed, bufsize); 8605 8606 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 8607 dmu_buf_will_dirty(db, tx); 8608 *(uint64_t *)db->db_data = nvsize; 8609 dmu_buf_rele(db, FTAG); 8610 } 8611 8612 static void 8613 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 8614 const char *config, const char *entry) 8615 { 8616 nvlist_t *nvroot; 8617 nvlist_t **list; 8618 int i; 8619 8620 if (!sav->sav_sync) 8621 return; 8622 8623 /* 8624 * Update the MOS nvlist describing the list of available devices. 
8625 * spa_validate_aux() will have already made sure this nvlist is 8626 * valid and the vdevs are labeled appropriately. 8627 */ 8628 if (sav->sav_object == 0) { 8629 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 8630 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 8631 sizeof (uint64_t), tx); 8632 VERIFY(zap_update(spa->spa_meta_objset, 8633 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 8634 &sav->sav_object, tx) == 0); 8635 } 8636 8637 nvroot = fnvlist_alloc(); 8638 if (sav->sav_count == 0) { 8639 fnvlist_add_nvlist_array(nvroot, config, 8640 (const nvlist_t * const *)NULL, 0); 8641 } else { 8642 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 8643 for (i = 0; i < sav->sav_count; i++) 8644 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 8645 B_FALSE, VDEV_CONFIG_L2CACHE); 8646 fnvlist_add_nvlist_array(nvroot, config, 8647 (const nvlist_t * const *)list, sav->sav_count); 8648 for (i = 0; i < sav->sav_count; i++) 8649 nvlist_free(list[i]); 8650 kmem_free(list, sav->sav_count * sizeof (void *)); 8651 } 8652 8653 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 8654 nvlist_free(nvroot); 8655 8656 sav->sav_sync = B_FALSE; 8657 } 8658 8659 /* 8660 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 8661 * The all-vdev ZAP must be empty. 8662 */ 8663 static void 8664 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 8665 { 8666 spa_t *spa = vd->vdev_spa; 8667 8668 if (vd->vdev_top_zap != 0) { 8669 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8670 vd->vdev_top_zap, tx)); 8671 } 8672 if (vd->vdev_leaf_zap != 0) { 8673 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8674 vd->vdev_leaf_zap, tx)); 8675 } 8676 for (uint64_t i = 0; i < vd->vdev_children; i++) { 8677 spa_avz_build(vd->vdev_child[i], avz, tx); 8678 } 8679 } 8680 8681 static void 8682 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 8683 { 8684 nvlist_t *config; 8685 8686 /* 8687 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 8688 * its config may not be dirty but we still need to build per-vdev ZAPs. 8689 * Similarly, if the pool is being assembled (e.g. after a split), we 8690 * need to rebuild the AVZ although the config may not be dirty. 
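 *
 * For illustration (layout inferred from spa_avz_build() below): the
 * all-vdev ZAP is simply a ZAP whose integer keys are the object numbers
 * of every per-vdev ZAP, e.g.
 *
 *	AVZ: { <top-level vdev ZAP obj#>, <leaf vdev ZAP obj#>, ... }
 *
 * so rebuilding it is just a walk of the vdev tree re-adding each
 * vdev_top_zap and vdev_leaf_zap with zap_add_int().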
8691 */ 8692 if (list_is_empty(&spa->spa_config_dirty_list) && 8693 spa->spa_avz_action == AVZ_ACTION_NONE) 8694 return; 8695 8696 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8697 8698 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 8699 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 8700 spa->spa_all_vdev_zaps != 0); 8701 8702 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 8703 /* Make and build the new AVZ */ 8704 uint64_t new_avz = zap_create(spa->spa_meta_objset, 8705 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 8706 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 8707 8708 /* Diff old AVZ with new one */ 8709 zap_cursor_t zc; 8710 zap_attribute_t za; 8711 8712 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8713 spa->spa_all_vdev_zaps); 8714 zap_cursor_retrieve(&zc, &za) == 0; 8715 zap_cursor_advance(&zc)) { 8716 uint64_t vdzap = za.za_first_integer; 8717 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 8718 vdzap) == ENOENT) { 8719 /* 8720 * ZAP is listed in old AVZ but not in new one; 8721 * destroy it 8722 */ 8723 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 8724 tx)); 8725 } 8726 } 8727 8728 zap_cursor_fini(&zc); 8729 8730 /* Destroy the old AVZ */ 8731 VERIFY0(zap_destroy(spa->spa_meta_objset, 8732 spa->spa_all_vdev_zaps, tx)); 8733 8734 /* Replace the old AVZ in the dir obj with the new one */ 8735 VERIFY0(zap_update(spa->spa_meta_objset, 8736 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 8737 sizeof (new_avz), 1, &new_avz, tx)); 8738 8739 spa->spa_all_vdev_zaps = new_avz; 8740 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 8741 zap_cursor_t zc; 8742 zap_attribute_t za; 8743 8744 /* Walk through the AVZ and destroy all listed ZAPs */ 8745 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8746 spa->spa_all_vdev_zaps); 8747 zap_cursor_retrieve(&zc, &za) == 0; 8748 zap_cursor_advance(&zc)) { 8749 uint64_t zap = za.za_first_integer; 8750 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 8751 } 8752 8753 zap_cursor_fini(&zc); 8754 8755 /* Destroy and unlink the AVZ itself */ 8756 VERIFY0(zap_destroy(spa->spa_meta_objset, 8757 spa->spa_all_vdev_zaps, tx)); 8758 VERIFY0(zap_remove(spa->spa_meta_objset, 8759 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 8760 spa->spa_all_vdev_zaps = 0; 8761 } 8762 8763 if (spa->spa_all_vdev_zaps == 0) { 8764 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 8765 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 8766 DMU_POOL_VDEV_ZAP_MAP, tx); 8767 } 8768 spa->spa_avz_action = AVZ_ACTION_NONE; 8769 8770 /* Create ZAPs for vdevs that don't have them. */ 8771 vdev_construct_zaps(spa->spa_root_vdev, tx); 8772 8773 config = spa_config_generate(spa, spa->spa_root_vdev, 8774 dmu_tx_get_txg(tx), B_FALSE); 8775 8776 /* 8777 * If we're upgrading the spa version then make sure that 8778 * the config object gets updated with the correct version. 8779 */ 8780 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 8781 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 8782 spa->spa_uberblock.ub_version); 8783 8784 spa_config_exit(spa, SCL_STATE, FTAG); 8785 8786 nvlist_free(spa->spa_config_syncing); 8787 spa->spa_config_syncing = config; 8788 8789 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 8790 } 8791 8792 static void 8793 spa_sync_version(void *arg, dmu_tx_t *tx) 8794 { 8795 uint64_t *versionp = arg; 8796 uint64_t version = *versionp; 8797 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8798 8799 /* 8800 * Setting the version is special cased when first creating the pool. 
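 *
 * For context, a sketch of how this sync task is normally reached
 * outside of pool creation (the exact arguments are an assumption here,
 * mirroring the usual dsl_sync_task() pattern used for property sets):
 *
 *	uint64_t ver = SPA_VERSION;
 *	error = dsl_sync_task(spa_name(spa), NULL, spa_sync_version,
 *	    &ver, 6, ZFS_SPACE_CHECK_RESERVED);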
8801 */ 8802 ASSERT(tx->tx_txg != TXG_INITIAL); 8803 8804 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 8805 ASSERT(version >= spa_version(spa)); 8806 8807 spa->spa_uberblock.ub_version = version; 8808 vdev_config_dirty(spa->spa_root_vdev); 8809 spa_history_log_internal(spa, "set", tx, "version=%lld", 8810 (longlong_t)version); 8811 } 8812 8813 /* 8814 * Set zpool properties. 8815 */ 8816 static void 8817 spa_sync_props(void *arg, dmu_tx_t *tx) 8818 { 8819 nvlist_t *nvp = arg; 8820 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8821 objset_t *mos = spa->spa_meta_objset; 8822 nvpair_t *elem = NULL; 8823 8824 mutex_enter(&spa->spa_props_lock); 8825 8826 while ((elem = nvlist_next_nvpair(nvp, elem))) { 8827 uint64_t intval; 8828 const char *strval, *fname; 8829 zpool_prop_t prop; 8830 const char *propname; 8831 const char *elemname = nvpair_name(elem); 8832 zprop_type_t proptype; 8833 spa_feature_t fid; 8834 8835 switch (prop = zpool_name_to_prop(elemname)) { 8836 case ZPOOL_PROP_VERSION: 8837 intval = fnvpair_value_uint64(elem); 8838 /* 8839 * The version is synced separately before other 8840 * properties and should be correct by now. 8841 */ 8842 ASSERT3U(spa_version(spa), >=, intval); 8843 break; 8844 8845 case ZPOOL_PROP_ALTROOT: 8846 /* 8847 * 'altroot' is a non-persistent property. It should 8848 * have been set temporarily at creation or import time. 8849 */ 8850 ASSERT(spa->spa_root != NULL); 8851 break; 8852 8853 case ZPOOL_PROP_READONLY: 8854 case ZPOOL_PROP_CACHEFILE: 8855 /* 8856 * 'readonly' and 'cachefile' are also non-persistent 8857 * properties. 8858 */ 8859 break; 8860 case ZPOOL_PROP_COMMENT: 8861 strval = fnvpair_value_string(elem); 8862 if (spa->spa_comment != NULL) 8863 spa_strfree(spa->spa_comment); 8864 spa->spa_comment = spa_strdup(strval); 8865 /* 8866 * We need to dirty the configuration on all the vdevs 8867 * so that their labels get updated. We also need to 8868 * update the cache file to keep it in sync with the 8869 * MOS version. It's unnecessary to do this for pool 8870 * creation since the vdev's configuration has already 8871 * been dirtied. 8872 */ 8873 if (tx->tx_txg != TXG_INITIAL) { 8874 vdev_config_dirty(spa->spa_root_vdev); 8875 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8876 } 8877 spa_history_log_internal(spa, "set", tx, 8878 "%s=%s", elemname, strval); 8879 break; 8880 case ZPOOL_PROP_COMPATIBILITY: 8881 strval = fnvpair_value_string(elem); 8882 if (spa->spa_compatibility != NULL) 8883 spa_strfree(spa->spa_compatibility); 8884 spa->spa_compatibility = spa_strdup(strval); 8885 /* 8886 * Dirty the configuration on vdevs as above. 8887 */ 8888 if (tx->tx_txg != TXG_INITIAL) { 8889 vdev_config_dirty(spa->spa_root_vdev); 8890 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8891 } 8892 8893 spa_history_log_internal(spa, "set", tx, 8894 "%s=%s", nvpair_name(elem), strval); 8895 break; 8896 8897 case ZPOOL_PROP_INVAL: 8898 if (zpool_prop_feature(elemname)) { 8899 fname = strchr(elemname, '@') + 1; 8900 VERIFY0(zfeature_lookup_name(fname, &fid)); 8901 8902 spa_feature_enable(spa, fid, tx); 8903 spa_history_log_internal(spa, "set", tx, 8904 "%s=enabled", elemname); 8905 break; 8906 } else if (!zfs_prop_user(elemname)) { 8907 ASSERT(zpool_prop_feature(elemname)); 8908 break; 8909 } 8910 zfs_fallthrough; 8911 default: 8912 /* 8913 * Set pool property values in the poolprops mos object. 
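 *
 * Illustrative layout of the result (the values are made-up examples):
 *
 *	MOS pool directory (DMU_POOL_DIRECTORY_OBJECT):
 *		DMU_POOL_PROPS -> <pool props ZAP>
 *	<pool props ZAP>:
 *		"comment"  -> "rack 12, bay 3"	(string property)
 *		"failmode" -> 1			(uint64 / index property)
 *
 * String values are stored with an integer size of 1 and strlen()+1
 * entries; numeric values as a single uint64 (see the zap_update()
 * calls below).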
8914 */ 8915 if (spa->spa_pool_props_object == 0) { 8916 spa->spa_pool_props_object = 8917 zap_create_link(mos, DMU_OT_POOL_PROPS, 8918 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 8919 tx); 8920 } 8921 8922 /* normalize the property name */ 8923 propname = zpool_prop_to_name(prop); 8924 proptype = zpool_prop_get_type(prop); 8925 if (prop == ZPOOL_PROP_INVAL && 8926 zfs_prop_user(elemname)) { 8927 propname = elemname; 8928 proptype = PROP_TYPE_STRING; 8929 } 8930 8931 if (nvpair_type(elem) == DATA_TYPE_STRING) { 8932 ASSERT(proptype == PROP_TYPE_STRING); 8933 strval = fnvpair_value_string(elem); 8934 VERIFY0(zap_update(mos, 8935 spa->spa_pool_props_object, propname, 8936 1, strlen(strval) + 1, strval, tx)); 8937 spa_history_log_internal(spa, "set", tx, 8938 "%s=%s", elemname, strval); 8939 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 8940 intval = fnvpair_value_uint64(elem); 8941 8942 if (proptype == PROP_TYPE_INDEX) { 8943 const char *unused; 8944 VERIFY0(zpool_prop_index_to_string( 8945 prop, intval, &unused)); 8946 } 8947 VERIFY0(zap_update(mos, 8948 spa->spa_pool_props_object, propname, 8949 8, 1, &intval, tx)); 8950 spa_history_log_internal(spa, "set", tx, 8951 "%s=%lld", elemname, 8952 (longlong_t)intval); 8953 8954 switch (prop) { 8955 case ZPOOL_PROP_DELEGATION: 8956 spa->spa_delegation = intval; 8957 break; 8958 case ZPOOL_PROP_BOOTFS: 8959 spa->spa_bootfs = intval; 8960 break; 8961 case ZPOOL_PROP_FAILUREMODE: 8962 spa->spa_failmode = intval; 8963 break; 8964 case ZPOOL_PROP_AUTOTRIM: 8965 spa->spa_autotrim = intval; 8966 spa_async_request(spa, 8967 SPA_ASYNC_AUTOTRIM_RESTART); 8968 break; 8969 case ZPOOL_PROP_AUTOEXPAND: 8970 spa->spa_autoexpand = intval; 8971 if (tx->tx_txg != TXG_INITIAL) 8972 spa_async_request(spa, 8973 SPA_ASYNC_AUTOEXPAND); 8974 break; 8975 case ZPOOL_PROP_MULTIHOST: 8976 spa->spa_multihost = intval; 8977 break; 8978 default: 8979 break; 8980 } 8981 } else { 8982 ASSERT(0); /* not allowed */ 8983 } 8984 } 8985 8986 } 8987 8988 mutex_exit(&spa->spa_props_lock); 8989 } 8990 8991 /* 8992 * Perform one-time upgrade on-disk changes. spa_version() does not 8993 * reflect the new version this txg, so there must be no changes this 8994 * txg to anything that the upgrade code depends on after it executes. 8995 * Therefore this must be called after dsl_pool_sync() does the sync 8996 * tasks. 
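 *
 * For example, the per-pass ordering in spa_sync_iterate_to_convergence()
 * below is (abridged):
 *
 *	dsl_pool_sync(dp, txg);		<- runs the sync tasks
 *	...
 *	spa_sync_upgrades(spa, tx);	<- safe: the sync tasks already ran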
8997 */ 8998 static void 8999 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9000 { 9001 if (spa_sync_pass(spa) != 1) 9002 return; 9003 9004 dsl_pool_t *dp = spa->spa_dsl_pool; 9005 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9006 9007 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9008 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9009 dsl_pool_create_origin(dp, tx); 9010 9011 /* Keeping the origin open increases spa_minref */ 9012 spa->spa_minref += 3; 9013 } 9014 9015 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9016 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9017 dsl_pool_upgrade_clones(dp, tx); 9018 } 9019 9020 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9021 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9022 dsl_pool_upgrade_dir_clones(dp, tx); 9023 9024 /* Keeping the freedir open increases spa_minref */ 9025 spa->spa_minref += 3; 9026 } 9027 9028 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9029 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9030 spa_feature_create_zap_objects(spa, tx); 9031 } 9032 9033 /* 9034 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9035 * when the possibility to use lz4 compression for metadata was added. 9036 * Old pools that have this feature enabled must be upgraded to have 9037 * this feature active. 9038 */ 9039 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9040 boolean_t lz4_en = spa_feature_is_enabled(spa, 9041 SPA_FEATURE_LZ4_COMPRESS); 9042 boolean_t lz4_ac = spa_feature_is_active(spa, 9043 SPA_FEATURE_LZ4_COMPRESS); 9044 9045 if (lz4_en && !lz4_ac) 9046 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9047 } 9048 9049 /* 9050 * If we haven't written the salt, do so now. Note that the 9051 * feature may not be activated yet, but that's fine since 9052 * the presence of this ZAP entry is backwards compatible. 9053 */ 9054 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9055 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9056 VERIFY0(zap_add(spa->spa_meta_objset, 9057 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9058 sizeof (spa->spa_cksum_salt.zcs_bytes), 9059 spa->spa_cksum_salt.zcs_bytes, tx)); 9060 } 9061 9062 rrw_exit(&dp->dp_config_rwlock, FTAG); 9063 } 9064 9065 static void 9066 vdev_indirect_state_sync_verify(vdev_t *vd) 9067 { 9068 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9069 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9070 9071 if (vd->vdev_ops == &vdev_indirect_ops) { 9072 ASSERT(vim != NULL); 9073 ASSERT(vib != NULL); 9074 } 9075 9076 uint64_t obsolete_sm_object = 0; 9077 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9078 if (obsolete_sm_object != 0) { 9079 ASSERT(vd->vdev_obsolete_sm != NULL); 9080 ASSERT(vd->vdev_removing || 9081 vd->vdev_ops == &vdev_indirect_ops); 9082 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9083 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9084 ASSERT3U(obsolete_sm_object, ==, 9085 space_map_object(vd->vdev_obsolete_sm)); 9086 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9087 space_map_allocated(vd->vdev_obsolete_sm)); 9088 } 9089 ASSERT(vd->vdev_obsolete_segments != NULL); 9090 9091 /* 9092 * Since frees / remaps to an indirect vdev can only 9093 * happen in syncing context, the obsolete segments 9094 * tree must be empty when we start syncing.
9095 */ 9096 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9097 } 9098 9099 /* 9100 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9101 * async write queue depth in case it changed. The max queue depth will 9102 * not change in the middle of syncing out this txg. 9103 */ 9104 static void 9105 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9106 { 9107 ASSERT(spa_writeable(spa)); 9108 9109 vdev_t *rvd = spa->spa_root_vdev; 9110 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9111 zfs_vdev_queue_depth_pct / 100; 9112 metaslab_class_t *normal = spa_normal_class(spa); 9113 metaslab_class_t *special = spa_special_class(spa); 9114 metaslab_class_t *dedup = spa_dedup_class(spa); 9115 9116 uint64_t slots_per_allocator = 0; 9117 for (int c = 0; c < rvd->vdev_children; c++) { 9118 vdev_t *tvd = rvd->vdev_child[c]; 9119 9120 metaslab_group_t *mg = tvd->vdev_mg; 9121 if (mg == NULL || !metaslab_group_initialized(mg)) 9122 continue; 9123 9124 metaslab_class_t *mc = mg->mg_class; 9125 if (mc != normal && mc != special && mc != dedup) 9126 continue; 9127 9128 /* 9129 * It is safe to do a lock-free check here because only async 9130 * allocations look at mg_max_alloc_queue_depth, and async 9131 * allocations all happen from spa_sync(). 9132 */ 9133 for (int i = 0; i < mg->mg_allocators; i++) { 9134 ASSERT0(zfs_refcount_count( 9135 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9136 } 9137 mg->mg_max_alloc_queue_depth = max_queue_depth; 9138 9139 for (int i = 0; i < mg->mg_allocators; i++) { 9140 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9141 zfs_vdev_def_queue_depth; 9142 } 9143 slots_per_allocator += zfs_vdev_def_queue_depth; 9144 } 9145 9146 for (int i = 0; i < spa->spa_alloc_count; i++) { 9147 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9148 mca_alloc_slots)); 9149 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9150 mca_alloc_slots)); 9151 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
9152 mca_alloc_slots)); 9153 normal->mc_allocator[i].mca_alloc_max_slots = 9154 slots_per_allocator; 9155 special->mc_allocator[i].mca_alloc_max_slots = 9156 slots_per_allocator; 9157 dedup->mc_allocator[i].mca_alloc_max_slots = 9158 slots_per_allocator; 9159 } 9160 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9161 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9162 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9163 } 9164 9165 static void 9166 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9167 { 9168 ASSERT(spa_writeable(spa)); 9169 9170 vdev_t *rvd = spa->spa_root_vdev; 9171 for (int c = 0; c < rvd->vdev_children; c++) { 9172 vdev_t *vd = rvd->vdev_child[c]; 9173 vdev_indirect_state_sync_verify(vd); 9174 9175 if (vdev_indirect_should_condense(vd)) { 9176 spa_condense_indirect_start_sync(vd, tx); 9177 break; 9178 } 9179 } 9180 } 9181 9182 static void 9183 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9184 { 9185 objset_t *mos = spa->spa_meta_objset; 9186 dsl_pool_t *dp = spa->spa_dsl_pool; 9187 uint64_t txg = tx->tx_txg; 9188 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9189 9190 do { 9191 int pass = ++spa->spa_sync_pass; 9192 9193 spa_sync_config_object(spa, tx); 9194 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9195 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9196 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9197 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9198 spa_errlog_sync(spa, txg); 9199 dsl_pool_sync(dp, txg); 9200 9201 if (pass < zfs_sync_pass_deferred_free || 9202 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9203 /* 9204 * If the log space map feature is active we don't 9205 * care about deferred frees and the deferred bpobj 9206 * as the log space map should effectively have the 9207 * same results (i.e. appending only to one object). 9208 */ 9209 spa_sync_frees(spa, free_bpl, tx); 9210 } else { 9211 /* 9212 * We can not defer frees in pass 1, because 9213 * we sync the deferred frees later in pass 1. 9214 */ 9215 ASSERT3U(pass, >, 1); 9216 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9217 &spa->spa_deferred_bpobj, tx); 9218 } 9219 9220 brt_sync(spa, txg); 9221 ddt_sync(spa, txg); 9222 dsl_scan_sync(dp, tx); 9223 svr_sync(spa, tx); 9224 spa_sync_upgrades(spa, tx); 9225 9226 spa_flush_metaslabs(spa, tx); 9227 9228 vdev_t *vd = NULL; 9229 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9230 != NULL) 9231 vdev_sync(vd, txg); 9232 9233 /* 9234 * Note: We need to check if the MOS is dirty because we could 9235 * have marked the MOS dirty without updating the uberblock 9236 * (e.g. if we have sync tasks but no dirty user data). We need 9237 * to check the uberblock's rootbp because it is updated if we 9238 * have synced out dirty data (though in this case the MOS will 9239 * most likely also be dirty due to second order effects, we 9240 * don't want to rely on that here). 9241 */ 9242 if (pass == 1 && 9243 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9244 !dmu_objset_is_dirty(mos, txg)) { 9245 /* 9246 * Nothing changed on the first pass, therefore this 9247 * TXG is a no-op. Avoid syncing deferred frees, so 9248 * that we can keep this TXG as a no-op. 
9249 */ 9250 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9251 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9252 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9253 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9254 break; 9255 } 9256 9257 spa_sync_deferred_frees(spa, tx); 9258 } while (dmu_objset_is_dirty(mos, txg)); 9259 } 9260 9261 /* 9262 * Rewrite the vdev configuration (which includes the uberblock) to 9263 * commit the transaction group. 9264 * 9265 * If there are no dirty vdevs, we sync the uberblock to a few random 9266 * top-level vdevs that are known to be visible in the config cache 9267 * (see spa_vdev_add() for a complete description). If there *are* dirty 9268 * vdevs, sync the uberblock to all vdevs. 9269 */ 9270 static void 9271 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9272 { 9273 vdev_t *rvd = spa->spa_root_vdev; 9274 uint64_t txg = tx->tx_txg; 9275 9276 for (;;) { 9277 int error = 0; 9278 9279 /* 9280 * We hold SCL_STATE to prevent vdev open/close/etc. 9281 * while we're attempting to write the vdev labels. 9282 */ 9283 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9284 9285 if (list_is_empty(&spa->spa_config_dirty_list)) { 9286 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9287 int svdcount = 0; 9288 int children = rvd->vdev_children; 9289 int c0 = random_in_range(children); 9290 9291 for (int c = 0; c < children; c++) { 9292 vdev_t *vd = 9293 rvd->vdev_child[(c0 + c) % children]; 9294 9295 /* Stop when revisiting the first vdev */ 9296 if (c > 0 && svd[0] == vd) 9297 break; 9298 9299 if (vd->vdev_ms_array == 0 || 9300 vd->vdev_islog || 9301 !vdev_is_concrete(vd)) 9302 continue; 9303 9304 svd[svdcount++] = vd; 9305 if (svdcount == SPA_SYNC_MIN_VDEVS) 9306 break; 9307 } 9308 error = vdev_config_sync(svd, svdcount, txg); 9309 } else { 9310 error = vdev_config_sync(rvd->vdev_child, 9311 rvd->vdev_children, txg); 9312 } 9313 9314 if (error == 0) 9315 spa->spa_last_synced_guid = rvd->vdev_guid; 9316 9317 spa_config_exit(spa, SCL_STATE, FTAG); 9318 9319 if (error == 0) 9320 break; 9321 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9322 zio_resume_wait(spa); 9323 } 9324 } 9325 9326 /* 9327 * Sync the specified transaction group. New blocks may be dirtied as 9328 * part of the process, so we iterate until it converges. 9329 */ 9330 void 9331 spa_sync(spa_t *spa, uint64_t txg) 9332 { 9333 vdev_t *vd = NULL; 9334 9335 VERIFY(spa_writeable(spa)); 9336 9337 /* 9338 * Wait for i/os issued in open context that need to complete 9339 * before this txg syncs. 9340 */ 9341 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9342 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9343 ZIO_FLAG_CANFAIL); 9344 9345 /* 9346 * Now that there can be no more cloning in this transaction group, 9347 * but we are still before issuing frees, we can process pending BRT 9348 * updates. 9349 */ 9350 brt_pending_apply(spa, txg); 9351 9352 /* 9353 * Lock out configuration changes. 9354 */ 9355 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9356 9357 spa->spa_syncing_txg = txg; 9358 spa->spa_sync_pass = 0; 9359 9360 for (int i = 0; i < spa->spa_alloc_count; i++) { 9361 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9362 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9363 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9364 } 9365 9366 /* 9367 * If there are any pending vdev state changes, convert them 9368 * into config changes that go out with this transaction group. 
9369 */ 9370 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9371 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9372 /* Avoid holding the write lock unless actually necessary */ 9373 if (vd->vdev_aux == NULL) { 9374 vdev_state_clean(vd); 9375 vdev_config_dirty(vd); 9376 continue; 9377 } 9378 /* 9379 * We need the write lock here because, for aux vdevs, 9380 * calling vdev_config_dirty() modifies sav_config. 9381 * This is ugly and will become unnecessary when we 9382 * eliminate the aux vdev wart by integrating all vdevs 9383 * into the root vdev tree. 9384 */ 9385 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9386 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9387 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9388 vdev_state_clean(vd); 9389 vdev_config_dirty(vd); 9390 } 9391 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9392 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9393 } 9394 spa_config_exit(spa, SCL_STATE, FTAG); 9395 9396 dsl_pool_t *dp = spa->spa_dsl_pool; 9397 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9398 9399 spa->spa_sync_starttime = gethrtime(); 9400 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9401 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9402 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9403 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9404 9405 /* 9406 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9407 * set spa_deflate if we have no raid-z vdevs. 9408 */ 9409 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9410 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9411 vdev_t *rvd = spa->spa_root_vdev; 9412 9413 int i; 9414 for (i = 0; i < rvd->vdev_children; i++) { 9415 vd = rvd->vdev_child[i]; 9416 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9417 break; 9418 } 9419 if (i == rvd->vdev_children) { 9420 spa->spa_deflate = TRUE; 9421 VERIFY0(zap_add(spa->spa_meta_objset, 9422 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 9423 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 9424 } 9425 } 9426 9427 spa_sync_adjust_vdev_max_queue_depth(spa); 9428 9429 spa_sync_condense_indirect(spa, tx); 9430 9431 spa_sync_iterate_to_convergence(spa, tx); 9432 9433 #ifdef ZFS_DEBUG 9434 if (!list_is_empty(&spa->spa_config_dirty_list)) { 9435 /* 9436 * Make sure that the number of ZAPs for all the vdevs matches 9437 * the number of ZAPs in the per-vdev ZAP list. This only gets 9438 * called if the config is dirty; otherwise there may be 9439 * outstanding AVZ operations that weren't completed in 9440 * spa_sync_config_object. 9441 */ 9442 uint64_t all_vdev_zap_entry_count; 9443 ASSERT0(zap_count(spa->spa_meta_objset, 9444 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 9445 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 9446 all_vdev_zap_entry_count); 9447 } 9448 #endif 9449 9450 if (spa->spa_vdev_removal != NULL) { 9451 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 9452 } 9453 9454 spa_sync_rewrite_vdev_config(spa, tx); 9455 dmu_tx_commit(tx); 9456 9457 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9458 spa->spa_deadman_tqid = 0; 9459 9460 /* 9461 * Clear the dirty config list. 9462 */ 9463 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 9464 vdev_config_clean(vd); 9465 9466 /* 9467 * Now that the new config has synced transactionally, 9468 * let it become visible to the config cache. 
9469 */ 9470 if (spa->spa_config_syncing != NULL) { 9471 spa_config_set(spa, spa->spa_config_syncing); 9472 spa->spa_config_txg = txg; 9473 spa->spa_config_syncing = NULL; 9474 } 9475 9476 dsl_pool_sync_done(dp, txg); 9477 9478 for (int i = 0; i < spa->spa_alloc_count; i++) { 9479 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9480 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9481 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9482 } 9483 9484 /* 9485 * Update usable space statistics. 9486 */ 9487 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 9488 != NULL) 9489 vdev_sync_done(vd, txg); 9490 9491 metaslab_class_evict_old(spa->spa_normal_class, txg); 9492 metaslab_class_evict_old(spa->spa_log_class, txg); 9493 9494 spa_sync_close_syncing_log_sm(spa); 9495 9496 spa_update_dspace(spa); 9497 9498 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 9499 vdev_autotrim_kick(spa); 9500 9501 /* 9502 * It had better be the case that we didn't dirty anything 9503 * since vdev_config_sync(). 9504 */ 9505 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9506 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9507 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 9508 9509 while (zfs_pause_spa_sync) 9510 delay(1); 9511 9512 spa->spa_sync_pass = 0; 9513 9514 /* 9515 * Update the last synced uberblock here. We want to do this at 9516 * the end of spa_sync() so that consumers of spa_last_synced_txg() 9517 * will be guaranteed that all the processing associated with 9518 * that txg has been completed. 9519 */ 9520 spa->spa_ubsync = spa->spa_uberblock; 9521 spa_config_exit(spa, SCL_CONFIG, FTAG); 9522 9523 spa_handle_ignored_writes(spa); 9524 9525 /* 9526 * If any async tasks have been requested, kick them off. 9527 */ 9528 spa_async_dispatch(spa); 9529 } 9530 9531 /* 9532 * Sync all pools. We don't want to hold the namespace lock across these 9533 * operations, so we take a reference on the spa_t and drop the lock during the 9534 * sync. 9535 */ 9536 void 9537 spa_sync_allpools(void) 9538 { 9539 spa_t *spa = NULL; 9540 mutex_enter(&spa_namespace_lock); 9541 while ((spa = spa_next(spa)) != NULL) { 9542 if (spa_state(spa) != POOL_STATE_ACTIVE || 9543 !spa_writeable(spa) || spa_suspended(spa)) 9544 continue; 9545 spa_open_ref(spa, FTAG); 9546 mutex_exit(&spa_namespace_lock); 9547 txg_wait_synced(spa_get_dsl(spa), 0); 9548 mutex_enter(&spa_namespace_lock); 9549 spa_close(spa, FTAG); 9550 } 9551 mutex_exit(&spa_namespace_lock); 9552 } 9553 9554 /* 9555 * ========================================================================== 9556 * Miscellaneous routines 9557 * ========================================================================== 9558 */ 9559 9560 /* 9561 * Remove all pools in the system. 9562 */ 9563 void 9564 spa_evict_all(void) 9565 { 9566 spa_t *spa; 9567 9568 /* 9569 * Remove all cached state. All pools should be closed now, 9570 * so every spa in the AVL tree should be unreferenced. 9571 */ 9572 mutex_enter(&spa_namespace_lock); 9573 while ((spa = spa_next(NULL)) != NULL) { 9574 /* 9575 * Stop async tasks. The async thread may need to detach 9576 * a device that's been replaced, which requires grabbing 9577 * spa_namespace_lock, so we must drop it here. 
9578 */ 9579 spa_open_ref(spa, FTAG); 9580 mutex_exit(&spa_namespace_lock); 9581 spa_async_suspend(spa); 9582 mutex_enter(&spa_namespace_lock); 9583 spa_close(spa, FTAG); 9584 9585 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 9586 spa_unload(spa); 9587 spa_deactivate(spa); 9588 } 9589 spa_remove(spa); 9590 } 9591 mutex_exit(&spa_namespace_lock); 9592 } 9593 9594 vdev_t * 9595 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 9596 { 9597 vdev_t *vd; 9598 int i; 9599 9600 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 9601 return (vd); 9602 9603 if (aux) { 9604 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 9605 vd = spa->spa_l2cache.sav_vdevs[i]; 9606 if (vd->vdev_guid == guid) 9607 return (vd); 9608 } 9609 9610 for (i = 0; i < spa->spa_spares.sav_count; i++) { 9611 vd = spa->spa_spares.sav_vdevs[i]; 9612 if (vd->vdev_guid == guid) 9613 return (vd); 9614 } 9615 } 9616 9617 return (NULL); 9618 } 9619 9620 void 9621 spa_upgrade(spa_t *spa, uint64_t version) 9622 { 9623 ASSERT(spa_writeable(spa)); 9624 9625 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9626 9627 /* 9628 * This should only be called for a non-faulted pool, and since a 9629 * future version would result in an unopenable pool, this shouldn't be 9630 * possible. 9631 */ 9632 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 9633 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 9634 9635 spa->spa_uberblock.ub_version = version; 9636 vdev_config_dirty(spa->spa_root_vdev); 9637 9638 spa_config_exit(spa, SCL_ALL, FTAG); 9639 9640 txg_wait_synced(spa_get_dsl(spa), 0); 9641 } 9642 9643 static boolean_t 9644 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 9645 { 9646 (void) spa; 9647 int i; 9648 uint64_t vdev_guid; 9649 9650 for (i = 0; i < sav->sav_count; i++) 9651 if (sav->sav_vdevs[i]->vdev_guid == guid) 9652 return (B_TRUE); 9653 9654 for (i = 0; i < sav->sav_npending; i++) { 9655 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 9656 &vdev_guid) == 0 && vdev_guid == guid) 9657 return (B_TRUE); 9658 } 9659 9660 return (B_FALSE); 9661 } 9662 9663 boolean_t 9664 spa_has_l2cache(spa_t *spa, uint64_t guid) 9665 { 9666 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 9667 } 9668 9669 boolean_t 9670 spa_has_spare(spa_t *spa, uint64_t guid) 9671 { 9672 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 9673 } 9674 9675 /* 9676 * Check if a pool has an active shared spare device. 9677 * Note: reference count of an active spare is 2, as a spare and as a replace 9678 */ 9679 static boolean_t 9680 spa_has_active_shared_spare(spa_t *spa) 9681 { 9682 int i, refcnt; 9683 uint64_t pool; 9684 spa_aux_vdev_t *sav = &spa->spa_spares; 9685 9686 for (i = 0; i < sav->sav_count; i++) { 9687 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 9688 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 9689 refcnt > 2) 9690 return (B_TRUE); 9691 } 9692 9693 return (B_FALSE); 9694 } 9695 9696 uint64_t 9697 spa_total_metaslabs(spa_t *spa) 9698 { 9699 vdev_t *rvd = spa->spa_root_vdev; 9700 9701 uint64_t m = 0; 9702 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 9703 vdev_t *vd = rvd->vdev_child[c]; 9704 if (!vdev_is_concrete(vd)) 9705 continue; 9706 m += vd->vdev_ms_count; 9707 } 9708 return (m); 9709 } 9710 9711 /* 9712 * Notify any waiting threads that some activity has switched from being in- 9713 * progress to not-in-progress so that the thread can wake up and determine 9714 * whether it is finished waiting. 
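 *
 * (Added note, descriptive only.) This is the completer-side half of the
 * handshake described in the "Locking for waiting threads" comment further
 * below: the caller is expected to update the activity's in-memory state
 * first, under whatever activity-specific lock protects that state, and only
 * then call this function.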
9715 */ 9716 void 9717 spa_notify_waiters(spa_t *spa) 9718 { 9719 /* 9720 * Acquiring spa_activities_lock here prevents the cv_broadcast from 9721 * happening between the waiting thread's check and cv_wait. 9722 */ 9723 mutex_enter(&spa->spa_activities_lock); 9724 cv_broadcast(&spa->spa_activities_cv); 9725 mutex_exit(&spa->spa_activities_lock); 9726 } 9727 9728 /* 9729 * Notify any waiting threads that the pool is exporting, and then block until 9730 * they are finished using the spa_t. 9731 */ 9732 void 9733 spa_wake_waiters(spa_t *spa) 9734 { 9735 mutex_enter(&spa->spa_activities_lock); 9736 spa->spa_waiters_cancel = B_TRUE; 9737 cv_broadcast(&spa->spa_activities_cv); 9738 while (spa->spa_waiters != 0) 9739 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 9740 spa->spa_waiters_cancel = B_FALSE; 9741 mutex_exit(&spa->spa_activities_lock); 9742 } 9743 9744 /* Whether the vdev or any of its descendants are being initialized/trimmed. */ 9745 static boolean_t 9746 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 9747 { 9748 spa_t *spa = vd->vdev_spa; 9749 9750 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 9751 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9752 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 9753 activity == ZPOOL_WAIT_TRIM); 9754 9755 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 9756 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 9757 9758 mutex_exit(&spa->spa_activities_lock); 9759 mutex_enter(lock); 9760 mutex_enter(&spa->spa_activities_lock); 9761 9762 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 9763 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 9764 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 9765 mutex_exit(lock); 9766 9767 if (in_progress) 9768 return (B_TRUE); 9769 9770 for (int i = 0; i < vd->vdev_children; i++) { 9771 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 9772 activity)) 9773 return (B_TRUE); 9774 } 9775 9776 return (B_FALSE); 9777 } 9778 9779 /* 9780 * If use_guid is true, this checks whether the vdev specified by guid is 9781 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 9782 * is being initialized/trimmed. The caller must hold the config lock and 9783 * spa_activities_lock. 9784 */ 9785 static int 9786 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 9787 zpool_wait_activity_t activity, boolean_t *in_progress) 9788 { 9789 mutex_exit(&spa->spa_activities_lock); 9790 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9791 mutex_enter(&spa->spa_activities_lock); 9792 9793 vdev_t *vd; 9794 if (use_guid) { 9795 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 9796 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 9797 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9798 return (EINVAL); 9799 } 9800 } else { 9801 vd = spa->spa_root_vdev; 9802 } 9803 9804 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 9805 9806 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9807 return (0); 9808 } 9809 9810 /* 9811 * Locking for waiting threads 9812 * --------------------------- 9813 * 9814 * Waiting threads need a way to check whether a given activity is in progress, 9815 * and then, if it is, wait for it to complete. Each activity will have some 9816 * in-memory representation of the relevant on-disk state which can be used to 9817 * determine whether or not the activity is in progress. 
The in-memory state and 9818 * the locking used to protect it will be different for each activity, and may 9819 * not be suitable for use with a cvar (e.g., some state is protected by the 9820 * config lock). To allow waiting threads to wait without any races, another 9821 * lock, spa_activities_lock, is used. 9822 * 9823 * When the state is checked, both the activity-specific lock (if there is one) 9824 * and spa_activities_lock are held. In some cases, the activity-specific lock 9825 * is acquired explicitly (e.g. the config lock). In others, the locking is 9826 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 9827 * thread releases the activity-specific lock and, if the activity is in 9828 * progress, then cv_waits using spa_activities_lock. 9829 * 9830 * The waiting thread is woken when another thread, one completing some 9831 * activity, updates the state of the activity and then calls 9832 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 9833 * needs to hold its activity-specific lock when updating the state, and this 9834 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 9835 * 9836 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 9837 * and because it is held when the waiting thread checks the state of the 9838 * activity, it can never be the case that the completing thread both updates 9839 * the activity state and cv_broadcasts in between the waiting thread's check 9840 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 9841 * 9842 * In order to prevent deadlock, when the waiting thread does its check, in some 9843 * cases it will temporarily drop spa_activities_lock in order to acquire the 9844 * activity-specific lock. The order in which spa_activities_lock and the 9845 * activity specific lock are acquired in the waiting thread is determined by 9846 * the order in which they are acquired in the completing thread; if the 9847 * completing thread calls spa_notify_waiters with the activity-specific lock 9848 * held, then the waiting thread must also acquire the activity-specific lock 9849 * first. 
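 *
 * As a rough sketch (illustrative pseudocode only, not an exact transcription
 * of the functions in this file), the two sides look like:
 *
 *	Waiting thread:
 *		mutex_enter(&spa->spa_activities_lock);
 *		while (<activity in progress, checked under its own lock>)
 *			cv_wait_sig(&spa->spa_activities_cv,
 *			    &spa->spa_activities_lock);
 *		mutex_exit(&spa->spa_activities_lock);
 *
 *	Completing thread:
 *		<update the activity's in-memory state, holding its lock>
 *		spa_notify_waiters(spa);	<- takes spa_activities_lock
 *						   and cv_broadcast()s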
9850 */ 9851 9852 static int 9853 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 9854 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 9855 { 9856 int error = 0; 9857 9858 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9859 9860 switch (activity) { 9861 case ZPOOL_WAIT_CKPT_DISCARD: 9862 *in_progress = 9863 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 9864 zap_contains(spa_meta_objset(spa), 9865 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 9866 ENOENT); 9867 break; 9868 case ZPOOL_WAIT_FREE: 9869 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 9870 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 9871 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 9872 spa_livelist_delete_check(spa)); 9873 break; 9874 case ZPOOL_WAIT_INITIALIZE: 9875 case ZPOOL_WAIT_TRIM: 9876 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 9877 activity, in_progress); 9878 break; 9879 case ZPOOL_WAIT_REPLACE: 9880 mutex_exit(&spa->spa_activities_lock); 9881 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9882 mutex_enter(&spa->spa_activities_lock); 9883 9884 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 9885 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9886 break; 9887 case ZPOOL_WAIT_REMOVE: 9888 *in_progress = (spa->spa_removing_phys.sr_state == 9889 DSS_SCANNING); 9890 break; 9891 case ZPOOL_WAIT_RESILVER: 9892 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) 9893 break; 9894 zfs_fallthrough; 9895 case ZPOOL_WAIT_SCRUB: 9896 { 9897 boolean_t scanning, paused, is_scrub; 9898 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 9899 9900 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 9901 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 9902 paused = dsl_scan_is_paused_scrub(scn); 9903 *in_progress = (scanning && !paused && 9904 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 9905 break; 9906 } 9907 default: 9908 panic("unrecognized value for activity %d", activity); 9909 } 9910 9911 return (error); 9912 } 9913 9914 static int 9915 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 9916 boolean_t use_tag, uint64_t tag, boolean_t *waited) 9917 { 9918 /* 9919 * The tag is used to distinguish between instances of an activity. 9920 * 'initialize' and 'trim' are the only activities that we use this for. 9921 * The other activities can only have a single instance in progress in a 9922 * pool at one time, making the tag unnecessary. 9923 * 9924 * There can be multiple devices being replaced at once, but since they 9925 * all finish once resilvering finishes, we don't bother keeping track 9926 * of them individually, we just wait for them all to finish. 9927 */ 9928 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 9929 activity != ZPOOL_WAIT_TRIM) 9930 return (EINVAL); 9931 9932 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 9933 return (EINVAL); 9934 9935 spa_t *spa; 9936 int error = spa_open(pool, &spa, FTAG); 9937 if (error != 0) 9938 return (error); 9939 9940 /* 9941 * Increment the spa's waiter count so that we can call spa_close and 9942 * still ensure that the spa_t doesn't get freed before this thread is 9943 * finished with it when the pool is exported. We want to call spa_close 9944 * before we start waiting because otherwise the additional ref would 9945 * prevent the pool from being exported or destroyed throughout the 9946 * potentially long wait. 
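 *
 * (Added note, descriptive only.) The export path (spa_wake_waiters()) sets
 * spa_waiters_cancel, broadcasts spa_activities_cv, and then blocks on
 * spa_waiters_cv until spa_waiters drops back to zero; that is why the loop
 * below checks spa_waiters_cancel, and why we cv_signal(&spa->spa_waiters_cv)
 * once we are done.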
9947 */
9948 mutex_enter(&spa->spa_activities_lock);
9949 spa->spa_waiters++;
9950 spa_close(spa, FTAG);
9951
9952 *waited = B_FALSE;
9953 for (;;) {
9954 boolean_t in_progress;
9955 error = spa_activity_in_progress(spa, activity, use_tag, tag,
9956 &in_progress);
9957
9958 if (error || !in_progress || spa->spa_waiters_cancel)
9959 break;
9960
9961 *waited = B_TRUE;
9962
9963 if (cv_wait_sig(&spa->spa_activities_cv,
9964 &spa->spa_activities_lock) == 0) {
9965 error = EINTR;
9966 break;
9967 }
9968 }
9969
9970 spa->spa_waiters--;
9971 cv_signal(&spa->spa_waiters_cv);
9972 mutex_exit(&spa->spa_activities_lock);
9973
9974 return (error);
9975 }
9976
9977 /*
9978 * Wait for a particular instance of the specified activity to complete, where
9979 * the instance is identified by 'tag'
9980 */
9981 int
9982 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
9983 boolean_t *waited)
9984 {
9985 return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
9986 }
9987
9988 /*
9989 * Wait for all instances of the specified activity to complete.
9990 */
9991 int
9992 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
9993 {
9994
9995 return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
9996 }
9997
9998 sysevent_t *
9999 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
10000 {
10001 sysevent_t *ev = NULL;
10002 #ifdef _KERNEL
10003 nvlist_t *resource;
10004
10005 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
10006 if (resource) {
10007 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
10008 ev->resource = resource;
10009 }
10010 #else
10011 (void) spa, (void) vd, (void) hist_nvl, (void) name;
10012 #endif
10013 return (ev);
10014 }
10015
10016 void
10017 spa_event_post(sysevent_t *ev)
10018 {
10019 #ifdef _KERNEL
10020 if (ev) {
10021 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
10022 kmem_free(ev, sizeof (*ev));
10023 }
10024 #else
10025 (void) ev;
10026 #endif
10027 }
10028
10029 /*
10030 * Post a zevent corresponding to the given sysevent. The 'name' must be one
10031 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
10032 * filled in from the spa and (optionally) the vdev. This doesn't do anything
10033 * in the userland libzpool, as we don't want consumers to misinterpret ztest
10034 * or zdb as real changes.
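 *
 * As an illustrative example only (the class name is simply one of the
 * ESC_ZFS_* definitions mentioned above), a caller that has just finished a
 * resilver might post:
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_RESILVER_FINISH);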
10035 */
10036 void
10037 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
10038 {
10039 spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
10040 }
10041
10042 /* state manipulation functions */
10043 EXPORT_SYMBOL(spa_open);
10044 EXPORT_SYMBOL(spa_open_rewind);
10045 EXPORT_SYMBOL(spa_get_stats);
10046 EXPORT_SYMBOL(spa_create);
10047 EXPORT_SYMBOL(spa_import);
10048 EXPORT_SYMBOL(spa_tryimport);
10049 EXPORT_SYMBOL(spa_destroy);
10050 EXPORT_SYMBOL(spa_export);
10051 EXPORT_SYMBOL(spa_reset);
10052 EXPORT_SYMBOL(spa_async_request);
10053 EXPORT_SYMBOL(spa_async_suspend);
10054 EXPORT_SYMBOL(spa_async_resume);
10055 EXPORT_SYMBOL(spa_inject_addref);
10056 EXPORT_SYMBOL(spa_inject_delref);
10057 EXPORT_SYMBOL(spa_scan_stat_init);
10058 EXPORT_SYMBOL(spa_scan_get_stats);
10059
10060 /* device manipulation */
10061 EXPORT_SYMBOL(spa_vdev_add);
10062 EXPORT_SYMBOL(spa_vdev_attach);
10063 EXPORT_SYMBOL(spa_vdev_detach);
10064 EXPORT_SYMBOL(spa_vdev_setpath);
10065 EXPORT_SYMBOL(spa_vdev_setfru);
10066 EXPORT_SYMBOL(spa_vdev_split_mirror);
10067
10068 /* spare state (which is global across all pools) */
10069 EXPORT_SYMBOL(spa_spare_add);
10070 EXPORT_SYMBOL(spa_spare_remove);
10071 EXPORT_SYMBOL(spa_spare_exists);
10072 EXPORT_SYMBOL(spa_spare_activate);
10073
10074 /* L2ARC state (which is global across all pools) */
10075 EXPORT_SYMBOL(spa_l2cache_add);
10076 EXPORT_SYMBOL(spa_l2cache_remove);
10077 EXPORT_SYMBOL(spa_l2cache_exists);
10078 EXPORT_SYMBOL(spa_l2cache_activate);
10079 EXPORT_SYMBOL(spa_l2cache_drop);
10080
10081 /* scanning */
10082 EXPORT_SYMBOL(spa_scan);
10083 EXPORT_SYMBOL(spa_scan_stop);
10084
10085 /* spa syncing */
10086 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
10087 EXPORT_SYMBOL(spa_sync_allpools);
10088
10089 /* properties */
10090 EXPORT_SYMBOL(spa_prop_set);
10091 EXPORT_SYMBOL(spa_prop_get);
10092 EXPORT_SYMBOL(spa_prop_clear_bootfs);
10093
10094 /* asynchronous event notification */
10095 EXPORT_SYMBOL(spa_event_notify);
10096
10097 /* BEGIN CSTYLED */
10098 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
10099 "log2 fraction of arc that can be used by inflight I/Os when "
10100 "verifying pool during import");
10101 /* END CSTYLED */
10102
10103 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
10104 "Set to traverse metadata on pool import");
10105
10106 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
10107 "Set to traverse data on pool import");
10108
10109 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
10110 "Print vdev tree to zfs_dbgmsg during pool import");
10111
10112 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
10113 "Percentage of CPUs to run an IO worker thread");
10114
10115 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
10116 "Number of threads per IO worker taskqueue");
10117
10118 /* BEGIN CSTYLED */
10119 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
10120 "Allow importing pool with up to this number of missing top-level "
10121 "vdevs (in read-only mode)");
10122 /* END CSTYLED */
10123
10124 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
10125 ZMOD_RW, "Set the livelist condense zthr to pause");
10126
10127 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
10128 ZMOD_RW, "Set the livelist condense synctask to pause");
10129
10130 /* BEGIN CSTYLED */
10131 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_,
sync_cancel, 10132 INT, ZMOD_RW, 10133 "Whether livelist condensing was canceled in the synctask"); 10134 10135 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 10136 INT, ZMOD_RW, 10137 "Whether livelist condensing was canceled in the zthr function"); 10138 10139 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 10140 ZMOD_RW, 10141 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 10142 "was being condensed"); 10143 /* END CSTYLED */ 10144
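
/*
 * Informational note: on Linux builds the ZFS_MODULE_PARAM() entries above
 * surface as module parameters of the zfs module (prefix plus name, e.g.
 * spa_load_verify_metadata or zfs_livelist_condense_zthr_pause), so they can
 * be inspected under /sys/module/zfs/parameters/ and, where marked ZMOD_RW,
 * adjusted at runtime.
 */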