/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
46 */ 47 48 #include <sys/zfs_context.h> 49 #include <sys/fm/fs/zfs.h> 50 #include <sys/spa_impl.h> 51 #include <sys/zio.h> 52 #include <sys/zio_checksum.h> 53 #include <sys/dmu.h> 54 #include <sys/dmu_tx.h> 55 #include <sys/zap.h> 56 #include <sys/zil.h> 57 #include <sys/brt.h> 58 #include <sys/ddt.h> 59 #include <sys/vdev_impl.h> 60 #include <sys/vdev_removal.h> 61 #include <sys/vdev_indirect_mapping.h> 62 #include <sys/vdev_indirect_births.h> 63 #include <sys/vdev_initialize.h> 64 #include <sys/vdev_rebuild.h> 65 #include <sys/vdev_trim.h> 66 #include <sys/vdev_disk.h> 67 #include <sys/vdev_raidz.h> 68 #include <sys/vdev_draid.h> 69 #include <sys/metaslab.h> 70 #include <sys/metaslab_impl.h> 71 #include <sys/mmp.h> 72 #include <sys/uberblock_impl.h> 73 #include <sys/txg.h> 74 #include <sys/avl.h> 75 #include <sys/bpobj.h> 76 #include <sys/dmu_traverse.h> 77 #include <sys/dmu_objset.h> 78 #include <sys/unique.h> 79 #include <sys/dsl_pool.h> 80 #include <sys/dsl_dataset.h> 81 #include <sys/dsl_dir.h> 82 #include <sys/dsl_prop.h> 83 #include <sys/dsl_synctask.h> 84 #include <sys/fs/zfs.h> 85 #include <sys/arc.h> 86 #include <sys/callb.h> 87 #include <sys/systeminfo.h> 88 #include <sys/zfs_ioctl.h> 89 #include <sys/dsl_scan.h> 90 #include <sys/zfeature.h> 91 #include <sys/dsl_destroy.h> 92 #include <sys/zvol.h> 93 94 #ifdef _KERNEL 95 #include <sys/fm/protocol.h> 96 #include <sys/fm/util.h> 97 #include <sys/callb.h> 98 #include <sys/zone.h> 99 #include <sys/vmsystm.h> 100 #endif /* _KERNEL */ 101 102 #include "zfs_prop.h" 103 #include "zfs_comutil.h" 104 #include <cityhash.h> 105 106 /* 107 * spa_thread() existed on Illumos as a parent thread for the various worker 108 * threads that actually run the pool, as a way to both reference the entire 109 * pool work as a single object, and to share properties like scheduling 110 * options. It has not yet been adapted to Linux or FreeBSD. This define is 111 * used to mark related parts of the code to make things easier for the reader, 112 * and to compile this code out. It can be removed when someone implements it, 113 * moves it to some Illumos-specific place, or removes it entirely. 114 */ 115 #undef HAVE_SPA_THREAD 116 117 /* 118 * The "System Duty Cycle" scheduling class is an Illumos feature to help 119 * prevent CPU-intensive kernel threads from affecting latency on interactive 120 * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is 121 * gated behind a define. On Illumos SDC depends on spa_thread(), but 122 * spa_thread() also has other uses, so this is a separate define. 123 */ 124 #undef HAVE_SYSDC 125 126 /* 127 * The interval, in seconds, at which failed configuration cache file writes 128 * should be retried. 129 */ 130 int zfs_ccw_retry_interval = 300; 131 132 typedef enum zti_modes { 133 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 134 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
*/ 135 ZTI_MODE_SYNC, /* sync thread assigned */ 136 ZTI_MODE_NULL, /* don't create a taskq */ 137 ZTI_NMODES 138 } zti_modes_t; 139 140 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 141 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 142 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 143 #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } 144 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 145 146 #define ZTI_N(n) ZTI_P(n, 1) 147 #define ZTI_ONE ZTI_N(1) 148 149 typedef struct zio_taskq_info { 150 zti_modes_t zti_mode; 151 uint_t zti_value; 152 uint_t zti_count; 153 } zio_taskq_info_t; 154 155 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 156 "iss", "iss_h", "int", "int_h" 157 }; 158 159 /* 160 * This table defines the taskq settings for each ZFS I/O type. When 161 * initializing a pool, we use this table to create an appropriately sized 162 * taskq. Some operations are low volume and therefore have a small, static 163 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 164 * macros. Other operations process a large amount of data; the ZTI_SCALE 165 * macro causes us to create a taskq oriented for throughput. Some operations 166 * are so high frequency and short-lived that the taskq itself can become a 167 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 168 * additional degree of parallelism specified by the number of threads per- 169 * taskq and the number of taskqs; when dispatching an event in this case, the 170 * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs 171 * that scales with the number of CPUs. 172 * 173 * The different taskq priorities are to handle the different contexts (issue 174 * and interrupt) and then to reserve threads for high priority I/Os that 175 * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT 176 * implementation, so separate high priority threads are used there. 177 */ 178 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 179 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 180 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 181 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 182 #ifdef illumos 183 { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 184 #else 185 { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ 186 #endif 187 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 188 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 189 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ 190 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 191 }; 192 193 static void spa_sync_version(void *arg, dmu_tx_t *tx); 194 static void spa_sync_props(void *arg, dmu_tx_t *tx); 195 static boolean_t spa_has_active_shared_spare(spa_t *spa); 196 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 197 const char **ereport); 198 static void spa_vdev_resilver_done(spa_t *spa); 199 200 /* 201 * Percentage of all CPUs that can be used by the metaslab preload taskq. 
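 *
 * For example (illustrative numbers only): with the default of 50 on a
 * system with 8 online CPUs, the z_metaslab taskq created in
 * spa_activate() below may use up to 4 threads, since it is created
 * with TASKQ_THREADS_CPU_PCT.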
202 */ 203 static uint_t metaslab_preload_pct = 50; 204 205 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 206 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 207 208 #ifdef HAVE_SYSDC 209 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 210 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 211 #endif 212 213 #ifdef HAVE_SPA_THREAD 214 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 215 #endif 216 217 static uint_t zio_taskq_write_tpq = 16; 218 219 /* 220 * Report any spa_load_verify errors found, but do not fail spa_load. 221 * This is used by zdb to analyze non-idle pools. 222 */ 223 boolean_t spa_load_verify_dryrun = B_FALSE; 224 225 /* 226 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 227 * This is used by zdb for spacemaps verification. 228 */ 229 boolean_t spa_mode_readable_spacemaps = B_FALSE; 230 231 /* 232 * This (illegal) pool name is used when temporarily importing a spa_t in order 233 * to get the vdev stats associated with the imported devices. 234 */ 235 #define TRYIMPORT_NAME "$import" 236 237 /* 238 * For debugging purposes: print out vdev tree during pool import. 239 */ 240 static int spa_load_print_vdev_tree = B_FALSE; 241 242 /* 243 * A non-zero value for zfs_max_missing_tvds means that we allow importing 244 * pools with missing top-level vdevs. This is strictly intended for advanced 245 * pool recovery cases since missing data is almost inevitable. Pools with 246 * missing devices can only be imported read-only for safety reasons, and their 247 * fail-mode will be automatically set to "continue". 248 * 249 * With 1 missing vdev we should be able to import the pool and mount all 250 * datasets. User data that was not modified after the missing device has been 251 * added should be recoverable. This means that snapshots created prior to the 252 * addition of that device should be completely intact. 253 * 254 * With 2 missing vdevs, some datasets may fail to mount since there are 255 * dataset statistics that are stored as regular metadata. Some data might be 256 * recoverable if those vdevs were added recently. 257 * 258 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 259 * may be missing entirely. Chances of data recovery are very low. Note that 260 * there are also risks of performing an inadvertent rewind as we might be 261 * missing all the vdevs with the latest uberblocks. 262 */ 263 uint64_t zfs_max_missing_tvds = 0; 264 265 /* 266 * The parameters below are similar to zfs_max_missing_tvds but are only 267 * intended for a preliminary open of the pool with an untrusted config which 268 * might be incomplete or out-dated. 269 * 270 * We are more tolerant for pools opened from a cachefile since we could have 271 * an out-dated cachefile where a device removal was not registered. 272 * We could have set the limit arbitrarily high but in the case where devices 273 * are really missing we would want to return the proper error codes; we chose 274 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 275 * and we get a chance to retrieve the trusted config. 276 */ 277 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 278 279 /* 280 * In the case where config was assembled by scanning device paths (/dev/dsks 281 * by default) we are less tolerant since all the existing devices should have 282 * been detected and we want spa_load to return the right error codes. 
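 *
 * The default of zero therefore means that no missing top-level vdevs
 * are tolerated at all when the config was assembled by scanning.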
283 */ 284 uint64_t zfs_max_missing_tvds_scan = 0; 285 286 /* 287 * Debugging aid that pauses spa_sync() towards the end. 288 */ 289 static const boolean_t zfs_pause_spa_sync = B_FALSE; 290 291 /* 292 * Variables to indicate the livelist condense zthr func should wait at certain 293 * points for the livelist to be removed - used to test condense/destroy races 294 */ 295 static int zfs_livelist_condense_zthr_pause = 0; 296 static int zfs_livelist_condense_sync_pause = 0; 297 298 /* 299 * Variables to track whether or not condense cancellation has been 300 * triggered in testing. 301 */ 302 static int zfs_livelist_condense_sync_cancel = 0; 303 static int zfs_livelist_condense_zthr_cancel = 0; 304 305 /* 306 * Variable to track whether or not extra ALLOC blkptrs were added to a 307 * livelist entry while it was being condensed (caused by the way we track 308 * remapped blkptrs in dbuf_remap_impl) 309 */ 310 static int zfs_livelist_condense_new_alloc = 0; 311 312 /* 313 * ========================================================================== 314 * SPA properties routines 315 * ========================================================================== 316 */ 317 318 /* 319 * Add a (source=src, propname=propval) list to an nvlist. 320 */ 321 static void 322 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 323 uint64_t intval, zprop_source_t src) 324 { 325 const char *propname = zpool_prop_to_name(prop); 326 nvlist_t *propval; 327 328 propval = fnvlist_alloc(); 329 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 330 331 if (strval != NULL) 332 fnvlist_add_string(propval, ZPROP_VALUE, strval); 333 else 334 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 335 336 fnvlist_add_nvlist(nvl, propname, propval); 337 nvlist_free(propval); 338 } 339 340 static int 341 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) 342 { 343 zpool_prop_t prop = zpool_name_to_prop(propname); 344 zprop_source_t src = ZPROP_SRC_NONE; 345 uint64_t intval; 346 int err; 347 348 /* 349 * NB: Not all properties lookups via this API require 350 * the spa props lock, so they must explicitly grab it here. 351 */ 352 switch (prop) { 353 case ZPOOL_PROP_DEDUPCACHED: 354 err = ddt_get_pool_dedup_cached(spa, &intval); 355 if (err != 0) 356 return (SET_ERROR(err)); 357 break; 358 default: 359 return (SET_ERROR(EINVAL)); 360 } 361 362 spa_prop_add_list(outnvl, prop, NULL, intval, src); 363 364 return (0); 365 } 366 367 int 368 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, 369 nvlist_t **outnvl) 370 { 371 int err = 0; 372 373 if (props == NULL) 374 return (0); 375 376 if (*outnvl == NULL) { 377 err = nvlist_alloc(outnvl, NV_UNIQUE_NAME, KM_SLEEP); 378 if (err) 379 return (err); 380 } 381 382 for (unsigned int i = 0; i < n_props && err == 0; i++) { 383 err = spa_prop_add(spa, props[i], *outnvl); 384 } 385 386 return (err); 387 } 388 389 /* 390 * Add a user property (source=src, propname=propval) to an nvlist. 391 */ 392 static void 393 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 394 zprop_source_t src) 395 { 396 nvlist_t *propval; 397 398 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 399 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 400 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 401 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 402 nvlist_free(propval); 403 } 404 405 /* 406 * Get property values from the spa configuration. 
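 *
 * Each property added here via spa_prop_add_list() becomes a nested
 * nvlist keyed by the property name, holding a ZPROP_SOURCE entry and a
 * ZPROP_VALUE entry (string or uint64). Conceptually, and with made-up
 * numbers:
 *
 *	capacity -> { source = ZPROP_SRC_NONE, value = 42 }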
407 */ 408 static void 409 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 410 { 411 vdev_t *rvd = spa->spa_root_vdev; 412 dsl_pool_t *pool = spa->spa_dsl_pool; 413 uint64_t size, alloc, cap, version; 414 const zprop_source_t src = ZPROP_SRC_NONE; 415 spa_config_dirent_t *dp; 416 metaslab_class_t *mc = spa_normal_class(spa); 417 418 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 419 420 if (rvd != NULL) { 421 alloc = metaslab_class_get_alloc(mc); 422 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 423 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 424 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 425 426 size = metaslab_class_get_space(mc); 427 size += metaslab_class_get_space(spa_special_class(spa)); 428 size += metaslab_class_get_space(spa_dedup_class(spa)); 429 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 430 431 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 432 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 433 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 434 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 435 size - alloc, src); 436 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 437 spa->spa_checkpoint_info.sci_dspace, src); 438 439 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 440 metaslab_class_fragmentation(mc), src); 441 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 442 metaslab_class_expandable_space(mc), src); 443 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 444 (spa_mode(spa) == SPA_MODE_READ), src); 445 446 cap = (size == 0) ? 0 : (alloc * 100 / size); 447 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 448 449 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 450 ddt_get_pool_dedup_ratio(spa), src); 451 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, 452 brt_get_used(spa), src); 453 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, 454 brt_get_saved(spa), src); 455 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, 456 brt_get_ratio(spa), src); 457 458 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, 459 ddt_get_ddt_dsize(spa), src); 460 461 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 462 rvd->vdev_state, src); 463 464 version = spa_version(spa); 465 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 466 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 467 version, ZPROP_SRC_DEFAULT); 468 } else { 469 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 470 version, ZPROP_SRC_LOCAL); 471 } 472 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 473 NULL, spa_load_guid(spa), src); 474 } 475 476 if (pool != NULL) { 477 /* 478 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 479 * when opening pools before this version freedir will be NULL. 
480 */ 481 if (pool->dp_free_dir != NULL) { 482 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 483 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 484 src); 485 } else { 486 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 487 NULL, 0, src); 488 } 489 490 if (pool->dp_leak_dir != NULL) { 491 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 492 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 493 src); 494 } else { 495 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 496 NULL, 0, src); 497 } 498 } 499 500 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 501 502 if (spa->spa_comment != NULL) { 503 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 504 0, ZPROP_SRC_LOCAL); 505 } 506 507 if (spa->spa_compatibility != NULL) { 508 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 509 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 510 } 511 512 if (spa->spa_root != NULL) 513 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 514 0, ZPROP_SRC_LOCAL); 515 516 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 517 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 518 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 519 } else { 520 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 521 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 522 } 523 524 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 525 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 526 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 527 } else { 528 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 529 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 530 } 531 532 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 533 if (dp->scd_path == NULL) { 534 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 535 "none", 0, ZPROP_SRC_LOCAL); 536 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 537 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 538 dp->scd_path, 0, ZPROP_SRC_LOCAL); 539 } 540 } 541 } 542 543 /* 544 * Get zpool property values. 545 */ 546 int 547 spa_prop_get(spa_t *spa, nvlist_t **nvp) 548 { 549 objset_t *mos = spa->spa_meta_objset; 550 zap_cursor_t zc; 551 zap_attribute_t za; 552 dsl_pool_t *dp; 553 int err; 554 555 if (*nvp == NULL) { 556 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 557 if (err) 558 return (err); 559 } 560 561 dp = spa_get_dsl(spa); 562 dsl_pool_config_enter(dp, FTAG); 563 mutex_enter(&spa->spa_props_lock); 564 565 /* 566 * Get properties from the spa config. 567 */ 568 spa_prop_get_config(spa, nvp); 569 570 /* If no pool property object, no more prop to get. */ 571 if (mos == NULL || spa->spa_pool_props_object == 0) 572 goto out; 573 574 /* 575 * Get properties from the MOS pool property object. 
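 *
 * Entries are walked with a ZAP cursor. Eight-byte entries are numeric
 * properties (bootfs is translated back into a dataset name); one-byte
 * entries are strings, which covers both string-valued pool properties
 * and user properties that zpool_name_to_prop() does not recognize.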
576 */ 577 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 578 (err = zap_cursor_retrieve(&zc, &za)) == 0; 579 zap_cursor_advance(&zc)) { 580 uint64_t intval = 0; 581 char *strval = NULL; 582 zprop_source_t src = ZPROP_SRC_DEFAULT; 583 zpool_prop_t prop; 584 585 if ((prop = zpool_name_to_prop(za.za_name)) == 586 ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) 587 continue; 588 589 switch (za.za_integer_length) { 590 case 8: 591 /* integer property */ 592 if (za.za_first_integer != 593 zpool_prop_default_numeric(prop)) 594 src = ZPROP_SRC_LOCAL; 595 596 if (prop == ZPOOL_PROP_BOOTFS) { 597 dsl_dataset_t *ds = NULL; 598 599 err = dsl_dataset_hold_obj(dp, 600 za.za_first_integer, FTAG, &ds); 601 if (err != 0) 602 break; 603 604 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 605 KM_SLEEP); 606 dsl_dataset_name(ds, strval); 607 dsl_dataset_rele(ds, FTAG); 608 } else { 609 strval = NULL; 610 intval = za.za_first_integer; 611 } 612 613 spa_prop_add_list(*nvp, prop, strval, intval, src); 614 615 if (strval != NULL) 616 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 617 618 break; 619 620 case 1: 621 /* string property */ 622 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 623 err = zap_lookup(mos, spa->spa_pool_props_object, 624 za.za_name, 1, za.za_num_integers, strval); 625 if (err) { 626 kmem_free(strval, za.za_num_integers); 627 break; 628 } 629 if (prop != ZPOOL_PROP_INVAL) { 630 spa_prop_add_list(*nvp, prop, strval, 0, src); 631 } else { 632 src = ZPROP_SRC_LOCAL; 633 spa_prop_add_user(*nvp, za.za_name, strval, 634 src); 635 } 636 kmem_free(strval, za.za_num_integers); 637 break; 638 639 default: 640 break; 641 } 642 } 643 zap_cursor_fini(&zc); 644 out: 645 mutex_exit(&spa->spa_props_lock); 646 dsl_pool_config_exit(dp, FTAG); 647 if (err && err != ENOENT) { 648 nvlist_free(*nvp); 649 *nvp = NULL; 650 return (err); 651 } 652 653 return (0); 654 } 655 656 /* 657 * Validate the given pool properties nvlist and modify the list 658 * for the property values to be set. 659 */ 660 static int 661 spa_prop_validate(spa_t *spa, nvlist_t *props) 662 { 663 nvpair_t *elem; 664 int error = 0, reset_bootfs = 0; 665 uint64_t objnum = 0; 666 boolean_t has_feature = B_FALSE; 667 668 elem = NULL; 669 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 670 uint64_t intval; 671 const char *strval, *slash, *check, *fname; 672 const char *propname = nvpair_name(elem); 673 zpool_prop_t prop = zpool_name_to_prop(propname); 674 675 switch (prop) { 676 case ZPOOL_PROP_INVAL: 677 /* 678 * Sanitize the input. 
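 *
 * "Input" here is either a user property, e.g. "com.example:note",
 * whose name and value lengths are checked against the ZAP limits, or
 * a feature property, e.g. "feature@<name>", which must be a uint64
 * set to 0 and must name a feature known to zfeature_lookup_name().
 * (Both names are just examples.)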
679 */ 680 if (zfs_prop_user(propname)) { 681 if (strlen(propname) >= ZAP_MAXNAMELEN) { 682 error = SET_ERROR(ENAMETOOLONG); 683 break; 684 } 685 686 if (strlen(fnvpair_value_string(elem)) >= 687 ZAP_MAXVALUELEN) { 688 error = SET_ERROR(E2BIG); 689 break; 690 } 691 } else if (zpool_prop_feature(propname)) { 692 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 693 error = SET_ERROR(EINVAL); 694 break; 695 } 696 697 if (nvpair_value_uint64(elem, &intval) != 0) { 698 error = SET_ERROR(EINVAL); 699 break; 700 } 701 702 if (intval != 0) { 703 error = SET_ERROR(EINVAL); 704 break; 705 } 706 707 fname = strchr(propname, '@') + 1; 708 if (zfeature_lookup_name(fname, NULL) != 0) { 709 error = SET_ERROR(EINVAL); 710 break; 711 } 712 713 has_feature = B_TRUE; 714 } else { 715 error = SET_ERROR(EINVAL); 716 break; 717 } 718 break; 719 720 case ZPOOL_PROP_VERSION: 721 error = nvpair_value_uint64(elem, &intval); 722 if (!error && 723 (intval < spa_version(spa) || 724 intval > SPA_VERSION_BEFORE_FEATURES || 725 has_feature)) 726 error = SET_ERROR(EINVAL); 727 break; 728 729 case ZPOOL_PROP_DEDUP_TABLE_QUOTA: 730 error = nvpair_value_uint64(elem, &intval); 731 break; 732 733 case ZPOOL_PROP_DELEGATION: 734 case ZPOOL_PROP_AUTOREPLACE: 735 case ZPOOL_PROP_LISTSNAPS: 736 case ZPOOL_PROP_AUTOEXPAND: 737 case ZPOOL_PROP_AUTOTRIM: 738 error = nvpair_value_uint64(elem, &intval); 739 if (!error && intval > 1) 740 error = SET_ERROR(EINVAL); 741 break; 742 743 case ZPOOL_PROP_MULTIHOST: 744 error = nvpair_value_uint64(elem, &intval); 745 if (!error && intval > 1) 746 error = SET_ERROR(EINVAL); 747 748 if (!error) { 749 uint32_t hostid = zone_get_hostid(NULL); 750 if (hostid) 751 spa->spa_hostid = hostid; 752 else 753 error = SET_ERROR(ENOTSUP); 754 } 755 756 break; 757 758 case ZPOOL_PROP_BOOTFS: 759 /* 760 * If the pool version is less than SPA_VERSION_BOOTFS, 761 * or the pool is still being created (version == 0), 762 * the bootfs property cannot be set. 763 */ 764 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 765 error = SET_ERROR(ENOTSUP); 766 break; 767 } 768 769 /* 770 * Make sure the vdev config is bootable 771 */ 772 if (!vdev_is_bootable(spa->spa_root_vdev)) { 773 error = SET_ERROR(ENOTSUP); 774 break; 775 } 776 777 reset_bootfs = 1; 778 779 error = nvpair_value_string(elem, &strval); 780 781 if (!error) { 782 objset_t *os; 783 784 if (strval == NULL || strval[0] == '\0') { 785 objnum = zpool_prop_default_numeric( 786 ZPOOL_PROP_BOOTFS); 787 break; 788 } 789 790 error = dmu_objset_hold(strval, FTAG, &os); 791 if (error != 0) 792 break; 793 794 /* Must be ZPL. */ 795 if (dmu_objset_type(os) != DMU_OST_ZFS) { 796 error = SET_ERROR(ENOTSUP); 797 } else { 798 objnum = dmu_objset_id(os); 799 } 800 dmu_objset_rele(os, FTAG); 801 } 802 break; 803 804 case ZPOOL_PROP_FAILUREMODE: 805 error = nvpair_value_uint64(elem, &intval); 806 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 807 error = SET_ERROR(EINVAL); 808 809 /* 810 * This is a special case which only occurs when 811 * the pool has completely failed. This allows 812 * the user to change the in-core failmode property 813 * without syncing it out to disk (I/Os might 814 * currently be blocked). We do this by returning 815 * EIO to the caller (spa_prop_set) to trick it 816 * into thinking we encountered a property validation 817 * error. 
818 */ 819 if (!error && spa_suspended(spa)) { 820 spa->spa_failmode = intval; 821 error = SET_ERROR(EIO); 822 } 823 break; 824 825 case ZPOOL_PROP_CACHEFILE: 826 if ((error = nvpair_value_string(elem, &strval)) != 0) 827 break; 828 829 if (strval[0] == '\0') 830 break; 831 832 if (strcmp(strval, "none") == 0) 833 break; 834 835 if (strval[0] != '/') { 836 error = SET_ERROR(EINVAL); 837 break; 838 } 839 840 slash = strrchr(strval, '/'); 841 ASSERT(slash != NULL); 842 843 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 844 strcmp(slash, "/..") == 0) 845 error = SET_ERROR(EINVAL); 846 break; 847 848 case ZPOOL_PROP_COMMENT: 849 if ((error = nvpair_value_string(elem, &strval)) != 0) 850 break; 851 for (check = strval; *check != '\0'; check++) { 852 if (!isprint(*check)) { 853 error = SET_ERROR(EINVAL); 854 break; 855 } 856 } 857 if (strlen(strval) > ZPROP_MAX_COMMENT) 858 error = SET_ERROR(E2BIG); 859 break; 860 861 default: 862 break; 863 } 864 865 if (error) 866 break; 867 } 868 869 (void) nvlist_remove_all(props, 870 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 871 872 if (!error && reset_bootfs) { 873 error = nvlist_remove(props, 874 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 875 876 if (!error) { 877 error = nvlist_add_uint64(props, 878 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 879 } 880 } 881 882 return (error); 883 } 884 885 void 886 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 887 { 888 const char *cachefile; 889 spa_config_dirent_t *dp; 890 891 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 892 &cachefile) != 0) 893 return; 894 895 dp = kmem_alloc(sizeof (spa_config_dirent_t), 896 KM_SLEEP); 897 898 if (cachefile[0] == '\0') 899 dp->scd_path = spa_strdup(spa_config_path); 900 else if (strcmp(cachefile, "none") == 0) 901 dp->scd_path = NULL; 902 else 903 dp->scd_path = spa_strdup(cachefile); 904 905 list_insert_head(&spa->spa_config_list, dp); 906 if (need_sync) 907 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 908 } 909 910 int 911 spa_prop_set(spa_t *spa, nvlist_t *nvp) 912 { 913 int error; 914 nvpair_t *elem = NULL; 915 boolean_t need_sync = B_FALSE; 916 917 if ((error = spa_prop_validate(spa, nvp)) != 0) 918 return (error); 919 920 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 921 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 922 923 if (prop == ZPOOL_PROP_CACHEFILE || 924 prop == ZPOOL_PROP_ALTROOT || 925 prop == ZPOOL_PROP_READONLY) 926 continue; 927 928 if (prop == ZPOOL_PROP_INVAL && 929 zfs_prop_user(nvpair_name(elem))) { 930 need_sync = B_TRUE; 931 break; 932 } 933 934 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 935 uint64_t ver = 0; 936 937 if (prop == ZPOOL_PROP_VERSION) { 938 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 939 } else { 940 ASSERT(zpool_prop_feature(nvpair_name(elem))); 941 ver = SPA_VERSION_FEATURES; 942 need_sync = B_TRUE; 943 } 944 945 /* Save time if the version is already set. */ 946 if (ver == spa_version(spa)) 947 continue; 948 949 /* 950 * In addition to the pool directory object, we might 951 * create the pool properties object, the features for 952 * read object, the features for write object, or the 953 * feature descriptions object. 
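 *
 * That is the reason 6 is passed to dsl_sync_task() below as the
 * blocks-modified estimate used for the space check.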
954 */ 955 error = dsl_sync_task(spa->spa_name, NULL, 956 spa_sync_version, &ver, 957 6, ZFS_SPACE_CHECK_RESERVED); 958 if (error) 959 return (error); 960 continue; 961 } 962 963 need_sync = B_TRUE; 964 break; 965 } 966 967 if (need_sync) { 968 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 969 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 970 } 971 972 return (0); 973 } 974 975 /* 976 * If the bootfs property value is dsobj, clear it. 977 */ 978 void 979 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 980 { 981 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 982 VERIFY(zap_remove(spa->spa_meta_objset, 983 spa->spa_pool_props_object, 984 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 985 spa->spa_bootfs = 0; 986 } 987 } 988 989 static int 990 spa_change_guid_check(void *arg, dmu_tx_t *tx) 991 { 992 uint64_t *newguid __maybe_unused = arg; 993 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 994 vdev_t *rvd = spa->spa_root_vdev; 995 uint64_t vdev_state; 996 997 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 998 int error = (spa_has_checkpoint(spa)) ? 999 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 1000 return (SET_ERROR(error)); 1001 } 1002 1003 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 1004 vdev_state = rvd->vdev_state; 1005 spa_config_exit(spa, SCL_STATE, FTAG); 1006 1007 if (vdev_state != VDEV_STATE_HEALTHY) 1008 return (SET_ERROR(ENXIO)); 1009 1010 ASSERT3U(spa_guid(spa), !=, *newguid); 1011 1012 return (0); 1013 } 1014 1015 static void 1016 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 1017 { 1018 uint64_t *newguid = arg; 1019 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1020 uint64_t oldguid; 1021 vdev_t *rvd = spa->spa_root_vdev; 1022 1023 oldguid = spa_guid(spa); 1024 1025 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 1026 rvd->vdev_guid = *newguid; 1027 rvd->vdev_guid_sum += (*newguid - oldguid); 1028 vdev_config_dirty(rvd); 1029 spa_config_exit(spa, SCL_STATE, FTAG); 1030 1031 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 1032 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 1033 } 1034 1035 /* 1036 * Change the GUID for the pool. This is done so that we can later 1037 * re-import a pool built from a clone of our own vdevs. We will modify 1038 * the root vdev's guid, our own pool guid, and then mark all of our 1039 * vdevs dirty. Note that we must make sure that all our vdevs are 1040 * online when we do this, or else any vdevs that weren't present 1041 * would be orphaned from our pool. We are also going to issue a 1042 * sysevent to update any watchers. 1043 */ 1044 int 1045 spa_change_guid(spa_t *spa) 1046 { 1047 int error; 1048 uint64_t guid; 1049 1050 mutex_enter(&spa->spa_vdev_top_lock); 1051 mutex_enter(&spa_namespace_lock); 1052 guid = spa_generate_guid(NULL); 1053 1054 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 1055 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 1056 1057 if (error == 0) { 1058 /* 1059 * Clear the kobj flag from all the vdevs to allow 1060 * vdev_cache_process_kobj_evt() to post events to all the 1061 * vdevs since GUID is updated. 
1062 */ 1063 vdev_clear_kobj_evt(spa->spa_root_vdev); 1064 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 1065 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 1066 1067 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 1068 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 1069 } 1070 1071 mutex_exit(&spa_namespace_lock); 1072 mutex_exit(&spa->spa_vdev_top_lock); 1073 1074 return (error); 1075 } 1076 1077 /* 1078 * ========================================================================== 1079 * SPA state manipulation (open/create/destroy/import/export) 1080 * ========================================================================== 1081 */ 1082 1083 static int 1084 spa_error_entry_compare(const void *a, const void *b) 1085 { 1086 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 1087 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 1088 int ret; 1089 1090 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 1091 sizeof (zbookmark_phys_t)); 1092 1093 return (TREE_ISIGN(ret)); 1094 } 1095 1096 /* 1097 * Utility function which retrieves copies of the current logs and 1098 * re-initializes them in the process. 1099 */ 1100 void 1101 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1102 { 1103 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1104 1105 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1106 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1107 1108 avl_create(&spa->spa_errlist_scrub, 1109 spa_error_entry_compare, sizeof (spa_error_entry_t), 1110 offsetof(spa_error_entry_t, se_avl)); 1111 avl_create(&spa->spa_errlist_last, 1112 spa_error_entry_compare, sizeof (spa_error_entry_t), 1113 offsetof(spa_error_entry_t, se_avl)); 1114 } 1115 1116 static void 1117 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1118 { 1119 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1120 enum zti_modes mode = ztip->zti_mode; 1121 uint_t value = ztip->zti_value; 1122 uint_t count = ztip->zti_count; 1123 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1124 uint_t cpus, flags = TASKQ_DYNAMIC; 1125 1126 switch (mode) { 1127 case ZTI_MODE_FIXED: 1128 ASSERT3U(value, >, 0); 1129 break; 1130 1131 case ZTI_MODE_SYNC: 1132 1133 /* 1134 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, 1135 * not to exceed the number of spa allocators, and align to it. 1136 */ 1137 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1138 count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); 1139 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1140 count = MIN(count, spa->spa_alloc_count); 1141 while (spa->spa_alloc_count % count != 0 && 1142 spa->spa_alloc_count < count * 2) 1143 count--; 1144 1145 /* 1146 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no 1147 * single taskq may have more threads than 100% of online cpus. 1148 */ 1149 value = (zio_taskq_batch_pct + count / 2) / count; 1150 value = MIN(value, 100); 1151 flags |= TASKQ_THREADS_CPU_PCT; 1152 break; 1153 1154 case ZTI_MODE_SCALE: 1155 flags |= TASKQ_THREADS_CPU_PCT; 1156 /* 1157 * We want more taskqs to reduce lock contention, but we want 1158 * less for better request ordering and CPU utilization. 1159 */ 1160 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1161 if (zio_taskq_batch_tpq > 0) { 1162 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 1163 zio_taskq_batch_tpq); 1164 } else { 1165 /* 1166 * Prefer 6 threads per taskq, but no more taskqs 1167 * than threads in them on large systems. 
For 80%: 1168 * 1169 * taskq taskq total 1170 * cpus taskqs percent threads threads 1171 * ------- ------- ------- ------- ------- 1172 * 1 1 80% 1 1 1173 * 2 1 80% 1 1 1174 * 4 1 80% 3 3 1175 * 8 2 40% 3 6 1176 * 16 3 27% 4 12 1177 * 32 5 16% 5 25 1178 * 64 7 11% 7 49 1179 * 128 10 8% 10 100 1180 * 256 14 6% 15 210 1181 */ 1182 count = 1 + cpus / 6; 1183 while (count * count > cpus) 1184 count--; 1185 } 1186 /* Limit each taskq within 100% to not trigger assertion. */ 1187 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1188 value = (zio_taskq_batch_pct + count / 2) / count; 1189 break; 1190 1191 case ZTI_MODE_NULL: 1192 tqs->stqs_count = 0; 1193 tqs->stqs_taskq = NULL; 1194 return; 1195 1196 default: 1197 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1198 "spa_taskqs_init()", 1199 zio_type_name[t], zio_taskq_types[q], mode, value); 1200 break; 1201 } 1202 1203 ASSERT3U(count, >, 0); 1204 tqs->stqs_count = count; 1205 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1206 1207 for (uint_t i = 0; i < count; i++) { 1208 taskq_t *tq; 1209 char name[32]; 1210 1211 if (count > 1) 1212 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1213 zio_type_name[t], zio_taskq_types[q], i); 1214 else 1215 (void) snprintf(name, sizeof (name), "%s_%s", 1216 zio_type_name[t], zio_taskq_types[q]); 1217 1218 #ifdef HAVE_SYSDC 1219 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1220 (void) zio_taskq_basedc; 1221 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1222 spa->spa_proc, zio_taskq_basedc, flags); 1223 } else { 1224 #endif 1225 pri_t pri = maxclsyspri; 1226 /* 1227 * The write issue taskq can be extremely CPU 1228 * intensive. Run it at slightly less important 1229 * priority than the other taskqs. 1230 * 1231 * Under Linux and FreeBSD this means incrementing 1232 * the priority value as opposed to platforms like 1233 * illumos where it should be decremented. 1234 * 1235 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1236 * are equal then a difference between them is 1237 * insignificant. 1238 */ 1239 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1240 #if defined(__linux__) 1241 pri++; 1242 #elif defined(__FreeBSD__) 1243 pri += 4; 1244 #else 1245 #error "unknown OS" 1246 #endif 1247 } 1248 tq = taskq_create_proc(name, value, pri, 50, 1249 INT_MAX, spa->spa_proc, flags); 1250 #ifdef HAVE_SYSDC 1251 } 1252 #endif 1253 1254 tqs->stqs_taskq[i] = tq; 1255 } 1256 } 1257 1258 static void 1259 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1260 { 1261 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1262 1263 if (tqs->stqs_taskq == NULL) { 1264 ASSERT3U(tqs->stqs_count, ==, 0); 1265 return; 1266 } 1267 1268 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1269 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1270 taskq_destroy(tqs->stqs_taskq[i]); 1271 } 1272 1273 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1274 tqs->stqs_taskq = NULL; 1275 } 1276 1277 #ifdef _KERNEL 1278 /* 1279 * The READ and WRITE rows of zio_taskqs are configurable at module load time 1280 * by setting zio_taskq_read or zio_taskq_write. 1281 * 1282 * Example (the defaults for READ and WRITE) 1283 * zio_taskq_read='fixed,1,8 null scale null' 1284 * zio_taskq_write='sync null scale null' 1285 * 1286 * Each sets the entire row at a time. 1287 * 1288 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number 1289 * of threads per taskq. 
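 *
 * For example, the default READ row above, 'fixed,1,8 null scale null',
 * gives the issue queue a single taskq of eight threads, leaves both
 * high-priority queues unset, and lets the interrupt queue scale with
 * the number of CPUs.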
1290 * 1291 * 'null' can only be set on the high-priority queues (queue selection for 1292 * high-priority queues will fall back to the regular queue if the high-pri 1293 * is NULL. 1294 */ 1295 static const char *const modes[ZTI_NMODES] = { 1296 "fixed", "scale", "sync", "null" 1297 }; 1298 1299 /* Parse the incoming config string. Modifies cfg */ 1300 static int 1301 spa_taskq_param_set(zio_type_t t, char *cfg) 1302 { 1303 int err = 0; 1304 1305 zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; 1306 1307 char *next = cfg, *tok, *c; 1308 1309 /* 1310 * Parse out each element from the string and fill `row`. The entire 1311 * row has to be set at once, so any errors are flagged by just 1312 * breaking out of this loop early. 1313 */ 1314 uint_t q; 1315 for (q = 0; q < ZIO_TASKQ_TYPES; q++) { 1316 /* `next` is the start of the config */ 1317 if (next == NULL) 1318 break; 1319 1320 /* Eat up leading space */ 1321 while (isspace(*next)) 1322 next++; 1323 if (*next == '\0') 1324 break; 1325 1326 /* Mode ends at space or end of string */ 1327 tok = next; 1328 next = strchr(tok, ' '); 1329 if (next != NULL) *next++ = '\0'; 1330 1331 /* Parameters start after a comma */ 1332 c = strchr(tok, ','); 1333 if (c != NULL) *c++ = '\0'; 1334 1335 /* Match mode string */ 1336 uint_t mode; 1337 for (mode = 0; mode < ZTI_NMODES; mode++) 1338 if (strcmp(tok, modes[mode]) == 0) 1339 break; 1340 if (mode == ZTI_NMODES) 1341 break; 1342 1343 /* Invalid canary */ 1344 row[q].zti_mode = ZTI_NMODES; 1345 1346 /* Per-mode setup */ 1347 switch (mode) { 1348 1349 /* 1350 * FIXED is parameterised: number of queues, and number of 1351 * threads per queue. 1352 */ 1353 case ZTI_MODE_FIXED: { 1354 /* No parameters? */ 1355 if (c == NULL || *c == '\0') 1356 break; 1357 1358 /* Find next parameter */ 1359 tok = c; 1360 c = strchr(tok, ','); 1361 if (c == NULL) 1362 break; 1363 1364 /* Take digits and convert */ 1365 unsigned long long nq; 1366 if (!(isdigit(*tok))) 1367 break; 1368 err = ddi_strtoull(tok, &tok, 10, &nq); 1369 /* Must succeed and also end at the next param sep */ 1370 if (err != 0 || tok != c) 1371 break; 1372 1373 /* Move past the comma */ 1374 tok++; 1375 /* Need another number */ 1376 if (!(isdigit(*tok))) 1377 break; 1378 /* Remember start to make sure we moved */ 1379 c = tok; 1380 1381 /* Take digits */ 1382 unsigned long long ntpq; 1383 err = ddi_strtoull(tok, &tok, 10, &ntpq); 1384 /* Must succeed, and moved forward */ 1385 if (err != 0 || tok == c || *tok != '\0') 1386 break; 1387 1388 /* 1389 * sanity; zero queues/threads make no sense, and 1390 * 16K is almost certainly more than anyone will ever 1391 * need and avoids silly numbers like UINT32_MAX 1392 */ 1393 if (nq == 0 || nq >= 16384 || 1394 ntpq == 0 || ntpq >= 16384) 1395 break; 1396 1397 const zio_taskq_info_t zti = ZTI_P(ntpq, nq); 1398 row[q] = zti; 1399 break; 1400 } 1401 1402 case ZTI_MODE_SCALE: { 1403 const zio_taskq_info_t zti = ZTI_SCALE; 1404 row[q] = zti; 1405 break; 1406 } 1407 1408 case ZTI_MODE_SYNC: { 1409 const zio_taskq_info_t zti = ZTI_SYNC; 1410 row[q] = zti; 1411 break; 1412 } 1413 1414 case ZTI_MODE_NULL: { 1415 /* 1416 * Can only null the high-priority queues; the general- 1417 * purpose ones have to exist. 
1418 */ 1419 if (q != ZIO_TASKQ_ISSUE_HIGH && 1420 q != ZIO_TASKQ_INTERRUPT_HIGH) 1421 break; 1422 1423 const zio_taskq_info_t zti = ZTI_NULL; 1424 row[q] = zti; 1425 break; 1426 } 1427 1428 default: 1429 break; 1430 } 1431 1432 /* Ensure we set a mode */ 1433 if (row[q].zti_mode == ZTI_NMODES) 1434 break; 1435 } 1436 1437 /* Didn't get a full row, fail */ 1438 if (q < ZIO_TASKQ_TYPES) 1439 return (SET_ERROR(EINVAL)); 1440 1441 /* Eat trailing space */ 1442 if (next != NULL) 1443 while (isspace(*next)) 1444 next++; 1445 1446 /* If there's anything left over then fail */ 1447 if (next != NULL && *next != '\0') 1448 return (SET_ERROR(EINVAL)); 1449 1450 /* Success! Copy it into the real config */ 1451 for (q = 0; q < ZIO_TASKQ_TYPES; q++) 1452 zio_taskqs[t][q] = row[q]; 1453 1454 return (0); 1455 } 1456 1457 static int 1458 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) 1459 { 1460 int pos = 0; 1461 1462 /* Build paramater string from live config */ 1463 const char *sep = ""; 1464 for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { 1465 const zio_taskq_info_t *zti = &zio_taskqs[t][q]; 1466 if (zti->zti_mode == ZTI_MODE_FIXED) 1467 pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, 1468 modes[zti->zti_mode], zti->zti_count, 1469 zti->zti_value); 1470 else 1471 pos += sprintf(&buf[pos], "%s%s", sep, 1472 modes[zti->zti_mode]); 1473 sep = " "; 1474 } 1475 1476 if (add_newline) 1477 buf[pos++] = '\n'; 1478 buf[pos] = '\0'; 1479 1480 return (pos); 1481 } 1482 1483 #ifdef __linux__ 1484 static int 1485 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) 1486 { 1487 char *cfg = kmem_strdup(val); 1488 int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); 1489 kmem_free(cfg, strlen(val)+1); 1490 return (-err); 1491 } 1492 static int 1493 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) 1494 { 1495 return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); 1496 } 1497 1498 static int 1499 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) 1500 { 1501 char *cfg = kmem_strdup(val); 1502 int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); 1503 kmem_free(cfg, strlen(val)+1); 1504 return (-err); 1505 } 1506 static int 1507 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) 1508 { 1509 return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); 1510 } 1511 #else 1512 /* 1513 * On FreeBSD load-time parameters can be set up before malloc() is available, 1514 * so we have to do all the parsing work on the stack. 1515 */ 1516 #define SPA_TASKQ_PARAM_MAX (128) 1517 1518 static int 1519 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) 1520 { 1521 char buf[SPA_TASKQ_PARAM_MAX]; 1522 int err; 1523 1524 (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); 1525 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1526 if (err || req->newptr == NULL) 1527 return (err); 1528 return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); 1529 } 1530 1531 static int 1532 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) 1533 { 1534 char buf[SPA_TASKQ_PARAM_MAX]; 1535 int err; 1536 1537 (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); 1538 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1539 if (err || req->newptr == NULL) 1540 return (err); 1541 return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); 1542 } 1543 #endif 1544 #endif /* _KERNEL */ 1545 1546 /* 1547 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1548 * Note that a type may have multiple discrete taskqs to avoid lock contention 1549 * on the taskq itself. 
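 *
 * When a pair has more than one taskq, WRITE/ISSUE zios with a valid
 * allocator are spread by zio->io_allocator; everything else picks a
 * taskq pseudo-randomly. A typical call (a sketch, not a literal call
 * site) looks like:
 *
 *	spa_taskq_dispatch(spa, ZIO_TYPE_READ, ZIO_TASKQ_INTERRUPT,
 *	    zio_execute, zio, B_FALSE);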
1550 */ 1551 void 1552 spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1553 task_func_t *func, zio_t *zio, boolean_t cutinline) 1554 { 1555 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1556 taskq_t *tq; 1557 1558 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1559 ASSERT3U(tqs->stqs_count, !=, 0); 1560 1561 /* 1562 * NB: We are assuming that the zio can only be dispatched 1563 * to a single taskq at a time. It would be a grievous error 1564 * to dispatch the zio to another taskq at the same time. 1565 */ 1566 ASSERT(zio); 1567 ASSERT(taskq_empty_ent(&zio->io_tqent)); 1568 1569 if (tqs->stqs_count == 1) { 1570 tq = tqs->stqs_taskq[0]; 1571 } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && 1572 ZIO_HAS_ALLOCATOR(zio)) { 1573 tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; 1574 } else { 1575 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1576 } 1577 1578 taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, 1579 &zio->io_tqent); 1580 } 1581 1582 static void 1583 spa_create_zio_taskqs(spa_t *spa) 1584 { 1585 for (int t = 0; t < ZIO_TYPES; t++) { 1586 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1587 spa_taskqs_init(spa, t, q); 1588 } 1589 } 1590 } 1591 1592 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1593 static void 1594 spa_thread(void *arg) 1595 { 1596 psetid_t zio_taskq_psrset_bind = PS_NONE; 1597 callb_cpr_t cprinfo; 1598 1599 spa_t *spa = arg; 1600 user_t *pu = PTOU(curproc); 1601 1602 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1603 spa->spa_name); 1604 1605 ASSERT(curproc != &p0); 1606 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1607 "zpool-%s", spa->spa_name); 1608 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1609 1610 /* bind this thread to the requested psrset */ 1611 if (zio_taskq_psrset_bind != PS_NONE) { 1612 pool_lock(); 1613 mutex_enter(&cpu_lock); 1614 mutex_enter(&pidlock); 1615 mutex_enter(&curproc->p_lock); 1616 1617 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1618 0, NULL, NULL) == 0) { 1619 curthread->t_bind_pset = zio_taskq_psrset_bind; 1620 } else { 1621 cmn_err(CE_WARN, 1622 "Couldn't bind process for zfs pool \"%s\" to " 1623 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1624 } 1625 1626 mutex_exit(&curproc->p_lock); 1627 mutex_exit(&pidlock); 1628 mutex_exit(&cpu_lock); 1629 pool_unlock(); 1630 } 1631 1632 #ifdef HAVE_SYSDC 1633 if (zio_taskq_sysdc) { 1634 sysdc_thread_enter(curthread, 100, 0); 1635 } 1636 #endif 1637 1638 spa->spa_proc = curproc; 1639 spa->spa_did = curthread->t_did; 1640 1641 spa_create_zio_taskqs(spa); 1642 1643 mutex_enter(&spa->spa_proc_lock); 1644 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1645 1646 spa->spa_proc_state = SPA_PROC_ACTIVE; 1647 cv_broadcast(&spa->spa_proc_cv); 1648 1649 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1650 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1651 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1652 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1653 1654 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1655 spa->spa_proc_state = SPA_PROC_GONE; 1656 spa->spa_proc = &p0; 1657 cv_broadcast(&spa->spa_proc_cv); 1658 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1659 1660 mutex_enter(&curproc->p_lock); 1661 lwp_exit(); 1662 } 1663 #endif 1664 1665 extern metaslab_ops_t *metaslab_allocator(spa_t *spa); 1666 1667 /* 1668 * Activate an uninitialized pool. 
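 *
 * Activation moves the pool to POOL_STATE_ACTIVE and sets up everything
 * needed to run it: the metaslab classes, the per-type zio taskqs
 * (directly, or via spa_thread() where available), the config/state
 * dirty lists and error trees, the keystore, and the auxiliary taskqs
 * (z_zvol, z_metaslab, z_prefetch, z_upgrade).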
1669 */ 1670 static void 1671 spa_activate(spa_t *spa, spa_mode_t mode) 1672 { 1673 metaslab_ops_t *msp = metaslab_allocator(spa); 1674 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1675 1676 spa->spa_state = POOL_STATE_ACTIVE; 1677 spa->spa_mode = mode; 1678 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1679 1680 spa->spa_normal_class = metaslab_class_create(spa, msp); 1681 spa->spa_log_class = metaslab_class_create(spa, msp); 1682 spa->spa_embedded_log_class = metaslab_class_create(spa, msp); 1683 spa->spa_special_class = metaslab_class_create(spa, msp); 1684 spa->spa_dedup_class = metaslab_class_create(spa, msp); 1685 1686 /* Try to create a covering process */ 1687 mutex_enter(&spa->spa_proc_lock); 1688 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1689 ASSERT(spa->spa_proc == &p0); 1690 spa->spa_did = 0; 1691 1692 #ifdef HAVE_SPA_THREAD 1693 /* Only create a process if we're going to be around a while. */ 1694 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1695 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1696 NULL, 0) == 0) { 1697 spa->spa_proc_state = SPA_PROC_CREATED; 1698 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1699 cv_wait(&spa->spa_proc_cv, 1700 &spa->spa_proc_lock); 1701 } 1702 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1703 ASSERT(spa->spa_proc != &p0); 1704 ASSERT(spa->spa_did != 0); 1705 } else { 1706 #ifdef _KERNEL 1707 cmn_err(CE_WARN, 1708 "Couldn't create process for zfs pool \"%s\"\n", 1709 spa->spa_name); 1710 #endif 1711 } 1712 } 1713 #endif /* HAVE_SPA_THREAD */ 1714 mutex_exit(&spa->spa_proc_lock); 1715 1716 /* If we didn't create a process, we need to create our taskqs. */ 1717 if (spa->spa_proc == &p0) { 1718 spa_create_zio_taskqs(spa); 1719 } 1720 1721 for (size_t i = 0; i < TXG_SIZE; i++) { 1722 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1723 ZIO_FLAG_CANFAIL); 1724 } 1725 1726 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1727 offsetof(vdev_t, vdev_config_dirty_node)); 1728 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1729 offsetof(objset_t, os_evicting_node)); 1730 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1731 offsetof(vdev_t, vdev_state_dirty_node)); 1732 1733 txg_list_create(&spa->spa_vdev_txg_list, spa, 1734 offsetof(struct vdev, vdev_txg_node)); 1735 1736 avl_create(&spa->spa_errlist_scrub, 1737 spa_error_entry_compare, sizeof (spa_error_entry_t), 1738 offsetof(spa_error_entry_t, se_avl)); 1739 avl_create(&spa->spa_errlist_last, 1740 spa_error_entry_compare, sizeof (spa_error_entry_t), 1741 offsetof(spa_error_entry_t, se_avl)); 1742 avl_create(&spa->spa_errlist_healed, 1743 spa_error_entry_compare, sizeof (spa_error_entry_t), 1744 offsetof(spa_error_entry_t, se_avl)); 1745 1746 spa_activate_os(spa); 1747 1748 spa_keystore_init(&spa->spa_keystore); 1749 1750 /* 1751 * This taskq is used to perform zvol-minor-related tasks 1752 * asynchronously. This has several advantages, including easy 1753 * resolution of various deadlocks. 1754 * 1755 * The taskq must be single threaded to ensure tasks are always 1756 * processed in the order in which they were dispatched. 1757 * 1758 * A taskq per pool allows one to keep the pools independent. 1759 * This way if one pool is suspended, it will not impact another. 1760 * 1761 * The preferred location to dispatch a zvol minor task is a sync 1762 * task. In this context, there is easy access to the spa_t and minimal 1763 * error handling is required because the sync task must succeed. 
1764 */ 1765 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1766 1, INT_MAX, 0); 1767 1768 /* 1769 * The taskq to preload metaslabs. 1770 */ 1771 spa->spa_metaslab_taskq = taskq_create("z_metaslab", 1772 metaslab_preload_pct, maxclsyspri, 1, INT_MAX, 1773 TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1774 1775 /* 1776 * Taskq dedicated to prefetcher threads: this is used to prevent the 1777 * pool traverse code from monopolizing the global (and limited) 1778 * system_taskq by inappropriately scheduling long running tasks on it. 1779 */ 1780 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1781 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1782 1783 /* 1784 * The taskq to upgrade datasets in this pool. Currently used by 1785 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1786 */ 1787 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1788 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1789 } 1790 1791 /* 1792 * Opposite of spa_activate(). 1793 */ 1794 static void 1795 spa_deactivate(spa_t *spa) 1796 { 1797 ASSERT(spa->spa_sync_on == B_FALSE); 1798 ASSERT(spa->spa_dsl_pool == NULL); 1799 ASSERT(spa->spa_root_vdev == NULL); 1800 ASSERT(spa->spa_async_zio_root == NULL); 1801 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1802 1803 spa_evicting_os_wait(spa); 1804 1805 if (spa->spa_zvol_taskq) { 1806 taskq_destroy(spa->spa_zvol_taskq); 1807 spa->spa_zvol_taskq = NULL; 1808 } 1809 1810 if (spa->spa_metaslab_taskq) { 1811 taskq_destroy(spa->spa_metaslab_taskq); 1812 spa->spa_metaslab_taskq = NULL; 1813 } 1814 1815 if (spa->spa_prefetch_taskq) { 1816 taskq_destroy(spa->spa_prefetch_taskq); 1817 spa->spa_prefetch_taskq = NULL; 1818 } 1819 1820 if (spa->spa_upgrade_taskq) { 1821 taskq_destroy(spa->spa_upgrade_taskq); 1822 spa->spa_upgrade_taskq = NULL; 1823 } 1824 1825 txg_list_destroy(&spa->spa_vdev_txg_list); 1826 1827 list_destroy(&spa->spa_config_dirty_list); 1828 list_destroy(&spa->spa_evicting_os_list); 1829 list_destroy(&spa->spa_state_dirty_list); 1830 1831 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1832 1833 for (int t = 0; t < ZIO_TYPES; t++) { 1834 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1835 spa_taskqs_fini(spa, t, q); 1836 } 1837 } 1838 1839 for (size_t i = 0; i < TXG_SIZE; i++) { 1840 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1841 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1842 spa->spa_txg_zio[i] = NULL; 1843 } 1844 1845 metaslab_class_destroy(spa->spa_normal_class); 1846 spa->spa_normal_class = NULL; 1847 1848 metaslab_class_destroy(spa->spa_log_class); 1849 spa->spa_log_class = NULL; 1850 1851 metaslab_class_destroy(spa->spa_embedded_log_class); 1852 spa->spa_embedded_log_class = NULL; 1853 1854 metaslab_class_destroy(spa->spa_special_class); 1855 spa->spa_special_class = NULL; 1856 1857 metaslab_class_destroy(spa->spa_dedup_class); 1858 spa->spa_dedup_class = NULL; 1859 1860 /* 1861 * If this was part of an import or the open otherwise failed, we may 1862 * still have errors left in the queues. Empty them just in case. 
1863 */ 1864 spa_errlog_drain(spa); 1865 avl_destroy(&spa->spa_errlist_scrub); 1866 avl_destroy(&spa->spa_errlist_last); 1867 avl_destroy(&spa->spa_errlist_healed); 1868 1869 spa_keystore_fini(&spa->spa_keystore); 1870 1871 spa->spa_state = POOL_STATE_UNINITIALIZED; 1872 1873 mutex_enter(&spa->spa_proc_lock); 1874 if (spa->spa_proc_state != SPA_PROC_NONE) { 1875 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1876 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1877 cv_broadcast(&spa->spa_proc_cv); 1878 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1879 ASSERT(spa->spa_proc != &p0); 1880 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1881 } 1882 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1883 spa->spa_proc_state = SPA_PROC_NONE; 1884 } 1885 ASSERT(spa->spa_proc == &p0); 1886 mutex_exit(&spa->spa_proc_lock); 1887 1888 /* 1889 * We want to make sure spa_thread() has actually exited the ZFS 1890 * module, so that the module can't be unloaded out from underneath 1891 * it. 1892 */ 1893 if (spa->spa_did != 0) { 1894 thread_join(spa->spa_did); 1895 spa->spa_did = 0; 1896 } 1897 1898 spa_deactivate_os(spa); 1899 1900 } 1901 1902 /* 1903 * Verify a pool configuration, and construct the vdev tree appropriately. This 1904 * will create all the necessary vdevs in the appropriate layout, with each vdev 1905 * in the CLOSED state. This will prep the pool before open/creation/import. 1906 * All vdev validation is done by the vdev_alloc() routine. 1907 */ 1908 int 1909 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1910 uint_t id, int atype) 1911 { 1912 nvlist_t **child; 1913 uint_t children; 1914 int error; 1915 1916 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1917 return (error); 1918 1919 if ((*vdp)->vdev_ops->vdev_op_leaf) 1920 return (0); 1921 1922 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1923 &child, &children); 1924 1925 if (error == ENOENT) 1926 return (0); 1927 1928 if (error) { 1929 vdev_free(*vdp); 1930 *vdp = NULL; 1931 return (SET_ERROR(EINVAL)); 1932 } 1933 1934 for (int c = 0; c < children; c++) { 1935 vdev_t *vd; 1936 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1937 atype)) != 0) { 1938 vdev_free(*vdp); 1939 *vdp = NULL; 1940 return (error); 1941 } 1942 } 1943 1944 ASSERT(*vdp != NULL); 1945 1946 return (0); 1947 } 1948 1949 static boolean_t 1950 spa_should_flush_logs_on_unload(spa_t *spa) 1951 { 1952 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1953 return (B_FALSE); 1954 1955 if (!spa_writeable(spa)) 1956 return (B_FALSE); 1957 1958 if (!spa->spa_sync_on) 1959 return (B_FALSE); 1960 1961 if (spa_state(spa) != POOL_STATE_EXPORTED) 1962 return (B_FALSE); 1963 1964 if (zfs_keep_log_spacemaps_at_export) 1965 return (B_FALSE); 1966 1967 return (B_TRUE); 1968 } 1969 1970 /* 1971 * Opens a transaction that will set the flag that will instruct 1972 * spa_sync to attempt to flush all the metaslabs for that txg. 
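 *
 * The flag in question is spa_log_flushall_txg; it is set to the txg of
 * the transaction created below, and the function then waits for that
 * txg to be synced before returning.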
1973 */ 1974 static void 1975 spa_unload_log_sm_flush_all(spa_t *spa) 1976 { 1977 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1978 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1979 1980 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1981 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1982 1983 dmu_tx_commit(tx); 1984 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1985 } 1986 1987 static void 1988 spa_unload_log_sm_metadata(spa_t *spa) 1989 { 1990 void *cookie = NULL; 1991 spa_log_sm_t *sls; 1992 log_summary_entry_t *e; 1993 1994 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1995 &cookie)) != NULL) { 1996 VERIFY0(sls->sls_mscount); 1997 kmem_free(sls, sizeof (spa_log_sm_t)); 1998 } 1999 2000 while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { 2001 VERIFY0(e->lse_mscount); 2002 kmem_free(e, sizeof (log_summary_entry_t)); 2003 } 2004 2005 spa->spa_unflushed_stats.sus_nblocks = 0; 2006 spa->spa_unflushed_stats.sus_memused = 0; 2007 spa->spa_unflushed_stats.sus_blocklimit = 0; 2008 } 2009 2010 static void 2011 spa_destroy_aux_threads(spa_t *spa) 2012 { 2013 if (spa->spa_condense_zthr != NULL) { 2014 zthr_destroy(spa->spa_condense_zthr); 2015 spa->spa_condense_zthr = NULL; 2016 } 2017 if (spa->spa_checkpoint_discard_zthr != NULL) { 2018 zthr_destroy(spa->spa_checkpoint_discard_zthr); 2019 spa->spa_checkpoint_discard_zthr = NULL; 2020 } 2021 if (spa->spa_livelist_delete_zthr != NULL) { 2022 zthr_destroy(spa->spa_livelist_delete_zthr); 2023 spa->spa_livelist_delete_zthr = NULL; 2024 } 2025 if (spa->spa_livelist_condense_zthr != NULL) { 2026 zthr_destroy(spa->spa_livelist_condense_zthr); 2027 spa->spa_livelist_condense_zthr = NULL; 2028 } 2029 if (spa->spa_raidz_expand_zthr != NULL) { 2030 zthr_destroy(spa->spa_raidz_expand_zthr); 2031 spa->spa_raidz_expand_zthr = NULL; 2032 } 2033 } 2034 2035 /* 2036 * Opposite of spa_load(). 2037 */ 2038 static void 2039 spa_unload(spa_t *spa) 2040 { 2041 ASSERT(MUTEX_HELD(&spa_namespace_lock) || 2042 spa->spa_export_thread == curthread); 2043 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 2044 2045 spa_import_progress_remove(spa_guid(spa)); 2046 spa_load_note(spa, "UNLOADING"); 2047 2048 spa_wake_waiters(spa); 2049 2050 /* 2051 * If we have set the spa_final_txg, we have already performed the 2052 * tasks below in spa_export_common(). We should not redo it here since 2053 * we delay the final TXGs beyond what spa_final_txg is set at. 2054 */ 2055 if (spa->spa_final_txg == UINT64_MAX) { 2056 /* 2057 * If the log space map feature is enabled and the pool is 2058 * getting exported (but not destroyed), we want to spend some 2059 * time flushing as many metaslabs as we can in an attempt to 2060 * destroy log space maps and save import time. 2061 */ 2062 if (spa_should_flush_logs_on_unload(spa)) 2063 spa_unload_log_sm_flush_all(spa); 2064 2065 /* 2066 * Stop async tasks. 2067 */ 2068 spa_async_suspend(spa); 2069 2070 if (spa->spa_root_vdev) { 2071 vdev_t *root_vdev = spa->spa_root_vdev; 2072 vdev_initialize_stop_all(root_vdev, 2073 VDEV_INITIALIZE_ACTIVE); 2074 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 2075 vdev_autotrim_stop_all(spa); 2076 vdev_rebuild_stop_all(spa); 2077 } 2078 } 2079 2080 /* 2081 * Stop syncing. 2082 */ 2083 if (spa->spa_sync_on) { 2084 txg_sync_stop(spa->spa_dsl_pool); 2085 spa->spa_sync_on = B_FALSE; 2086 } 2087 2088 /* 2089 * This ensures that there is no async metaslab prefetching 2090 * while we attempt to unload the spa. 
2091 */ 2092 taskq_wait(spa->spa_metaslab_taskq); 2093 2094 if (spa->spa_mmp.mmp_thread) 2095 mmp_thread_stop(spa); 2096 2097 /* 2098 * Wait for any outstanding async I/O to complete. 2099 */ 2100 if (spa->spa_async_zio_root != NULL) { 2101 for (int i = 0; i < max_ncpus; i++) 2102 (void) zio_wait(spa->spa_async_zio_root[i]); 2103 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 2104 spa->spa_async_zio_root = NULL; 2105 } 2106 2107 if (spa->spa_vdev_removal != NULL) { 2108 spa_vdev_removal_destroy(spa->spa_vdev_removal); 2109 spa->spa_vdev_removal = NULL; 2110 } 2111 2112 spa_destroy_aux_threads(spa); 2113 2114 spa_condense_fini(spa); 2115 2116 bpobj_close(&spa->spa_deferred_bpobj); 2117 2118 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 2119 2120 /* 2121 * Close all vdevs. 2122 */ 2123 if (spa->spa_root_vdev) 2124 vdev_free(spa->spa_root_vdev); 2125 ASSERT(spa->spa_root_vdev == NULL); 2126 2127 /* 2128 * Close the dsl pool. 2129 */ 2130 if (spa->spa_dsl_pool) { 2131 dsl_pool_close(spa->spa_dsl_pool); 2132 spa->spa_dsl_pool = NULL; 2133 spa->spa_meta_objset = NULL; 2134 } 2135 2136 ddt_unload(spa); 2137 brt_unload(spa); 2138 spa_unload_log_sm_metadata(spa); 2139 2140 /* 2141 * Drop and purge level 2 cache 2142 */ 2143 spa_l2cache_drop(spa); 2144 2145 if (spa->spa_spares.sav_vdevs) { 2146 for (int i = 0; i < spa->spa_spares.sav_count; i++) 2147 vdev_free(spa->spa_spares.sav_vdevs[i]); 2148 kmem_free(spa->spa_spares.sav_vdevs, 2149 spa->spa_spares.sav_count * sizeof (void *)); 2150 spa->spa_spares.sav_vdevs = NULL; 2151 } 2152 if (spa->spa_spares.sav_config) { 2153 nvlist_free(spa->spa_spares.sav_config); 2154 spa->spa_spares.sav_config = NULL; 2155 } 2156 spa->spa_spares.sav_count = 0; 2157 2158 if (spa->spa_l2cache.sav_vdevs) { 2159 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 2160 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 2161 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 2162 } 2163 kmem_free(spa->spa_l2cache.sav_vdevs, 2164 spa->spa_l2cache.sav_count * sizeof (void *)); 2165 spa->spa_l2cache.sav_vdevs = NULL; 2166 } 2167 if (spa->spa_l2cache.sav_config) { 2168 nvlist_free(spa->spa_l2cache.sav_config); 2169 spa->spa_l2cache.sav_config = NULL; 2170 } 2171 spa->spa_l2cache.sav_count = 0; 2172 2173 spa->spa_async_suspended = 0; 2174 2175 spa->spa_indirect_vdevs_loaded = B_FALSE; 2176 2177 if (spa->spa_comment != NULL) { 2178 spa_strfree(spa->spa_comment); 2179 spa->spa_comment = NULL; 2180 } 2181 if (spa->spa_compatibility != NULL) { 2182 spa_strfree(spa->spa_compatibility); 2183 spa->spa_compatibility = NULL; 2184 } 2185 2186 spa->spa_raidz_expand = NULL; 2187 2188 spa_config_exit(spa, SCL_ALL, spa); 2189 } 2190 2191 /* 2192 * Load (or re-load) the current list of vdevs describing the active spares for 2193 * this pool. When this is called, we have some form of basic information in 2194 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 2195 * then re-generate a more complete list including status information. 2196 */ 2197 void 2198 spa_load_spares(spa_t *spa) 2199 { 2200 nvlist_t **spares; 2201 uint_t nspares; 2202 int i; 2203 vdev_t *vd, *tvd; 2204 2205 #ifndef _KERNEL 2206 /* 2207 * zdb opens both the current state of the pool and the 2208 * checkpointed state (if present), with a different spa_t. 2209 * 2210 * As spare vdevs are shared among open pools, we skip loading 2211 * them when we load the checkpointed state of the pool. 
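 *
 * (The writeable check below is how that case is detected: the
 * checkpointed state is only ever opened read-only.  At the time of
 * writing the main consumer is "zdb -k <pool>", which opens a second
 * spa_t for the checkpointed state; loading the shared spare vdevs a
 * second time would confuse the global spare accounting done by
 * spa_spare_add()/spa_spare_exists().)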
2212 */ 2213 if (!spa_writeable(spa)) 2214 return; 2215 #endif 2216 2217 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2218 2219 /* 2220 * First, close and free any existing spare vdevs. 2221 */ 2222 if (spa->spa_spares.sav_vdevs) { 2223 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2224 vd = spa->spa_spares.sav_vdevs[i]; 2225 2226 /* Undo the call to spa_spare_add() below */ 2227 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2228 B_FALSE)) != NULL && tvd->vdev_isspare) 2229 spa_spare_remove(tvd); 2230 vdev_close(vd); 2231 vdev_free(vd); 2232 } 2233 2234 kmem_free(spa->spa_spares.sav_vdevs, 2235 spa->spa_spares.sav_count * sizeof (void *)); 2236 } 2237 2238 if (spa->spa_spares.sav_config == NULL) 2239 nspares = 0; 2240 else 2241 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2242 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 2243 2244 spa->spa_spares.sav_count = (int)nspares; 2245 spa->spa_spares.sav_vdevs = NULL; 2246 2247 if (nspares == 0) 2248 return; 2249 2250 /* 2251 * Construct the array of vdevs, opening them to get status in the 2252 * process. For each spare, there are potentially two different vdev_t 2253 * structures associated with it: one in the list of spares (used only 2254 * for basic validation purposes) and one in the active vdev 2255 * configuration (if it's spared in). During this phase we open and 2256 * validate each vdev on the spare list. If the vdev also exists in the 2257 * active configuration, then we also mark this vdev as an active spare. 2258 */ 2259 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 2260 KM_SLEEP); 2261 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2262 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 2263 VDEV_ALLOC_SPARE) == 0); 2264 ASSERT(vd != NULL); 2265 2266 spa->spa_spares.sav_vdevs[i] = vd; 2267 2268 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2269 B_FALSE)) != NULL) { 2270 if (!tvd->vdev_isspare) 2271 spa_spare_add(tvd); 2272 2273 /* 2274 * We only mark the spare active if we were successfully 2275 * able to load the vdev. Otherwise, importing a pool 2276 * with a bad active spare would result in strange 2277 * behavior, because multiple pools would think the spare 2278 * is actively in use. 2279 * 2280 * There is a vulnerability here to an equally bizarre 2281 * circumstance, where a dead active spare is later 2282 * brought back to life (onlined or otherwise). Given 2283 * the rarity of this scenario, and the extra complexity 2284 * it adds, we ignore the possibility. 2285 */ 2286 if (!vdev_is_dead(tvd)) 2287 spa_spare_activate(tvd); 2288 } 2289 2290 vd->vdev_top = vd; 2291 vd->vdev_aux = &spa->spa_spares; 2292 2293 if (vdev_open(vd) != 0) 2294 continue; 2295 2296 if (vdev_validate_aux(vd) == 0) 2297 spa_spare_add(vd); 2298 } 2299 2300 /* 2301 * Recompute the stashed list of spares, with status information 2302 * this time.
2303 */ 2304 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 2305 2306 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 2307 KM_SLEEP); 2308 for (i = 0; i < spa->spa_spares.sav_count; i++) 2309 spares[i] = vdev_config_generate(spa, 2310 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 2311 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 2312 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 2313 spa->spa_spares.sav_count); 2314 for (i = 0; i < spa->spa_spares.sav_count; i++) 2315 nvlist_free(spares[i]); 2316 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 2317 } 2318 2319 /* 2320 * Load (or re-load) the current list of vdevs describing the active l2cache for 2321 * this pool. When this is called, we have some form of basic information in 2322 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 2323 * then re-generate a more complete list including status information. 2324 * Devices which are already active have their details maintained, and are 2325 * not re-opened. 2326 */ 2327 void 2328 spa_load_l2cache(spa_t *spa) 2329 { 2330 nvlist_t **l2cache = NULL; 2331 uint_t nl2cache; 2332 int i, j, oldnvdevs; 2333 uint64_t guid; 2334 vdev_t *vd, **oldvdevs, **newvdevs; 2335 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2336 2337 #ifndef _KERNEL 2338 /* 2339 * zdb opens both the current state of the pool and the 2340 * checkpointed state (if present), with a different spa_t. 2341 * 2342 * As L2 caches are part of the ARC which is shared among open 2343 * pools, we skip loading them when we load the checkpointed 2344 * state of the pool. 2345 */ 2346 if (!spa_writeable(spa)) 2347 return; 2348 #endif 2349 2350 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2351 2352 oldvdevs = sav->sav_vdevs; 2353 oldnvdevs = sav->sav_count; 2354 sav->sav_vdevs = NULL; 2355 sav->sav_count = 0; 2356 2357 if (sav->sav_config == NULL) { 2358 nl2cache = 0; 2359 newvdevs = NULL; 2360 goto out; 2361 } 2362 2363 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 2364 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 2365 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 2366 2367 /* 2368 * Process new nvlist of vdevs. 2369 */ 2370 for (i = 0; i < nl2cache; i++) { 2371 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 2372 2373 newvdevs[i] = NULL; 2374 for (j = 0; j < oldnvdevs; j++) { 2375 vd = oldvdevs[j]; 2376 if (vd != NULL && guid == vd->vdev_guid) { 2377 /* 2378 * Retain previous vdev for add/remove ops. 2379 */ 2380 newvdevs[i] = vd; 2381 oldvdevs[j] = NULL; 2382 break; 2383 } 2384 } 2385 2386 if (newvdevs[i] == NULL) { 2387 /* 2388 * Create new vdev 2389 */ 2390 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2391 VDEV_ALLOC_L2CACHE) == 0); 2392 ASSERT(vd != NULL); 2393 newvdevs[i] = vd; 2394 2395 /* 2396 * Commit this vdev as an l2cache device, 2397 * even if it fails to open. 2398 */ 2399 spa_l2cache_add(vd); 2400 2401 vd->vdev_top = vd; 2402 vd->vdev_aux = sav; 2403 2404 spa_l2cache_activate(vd); 2405 2406 if (vdev_open(vd) != 0) 2407 continue; 2408 2409 (void) vdev_validate_aux(vd); 2410 2411 if (!vdev_is_dead(vd)) 2412 l2arc_add_vdev(spa, vd); 2413 2414 /* 2415 * Upon cache device addition to a pool or pool 2416 * creation with a cache device or if the header 2417 * of the device is invalid we issue an async 2418 * TRIM command for the whole device which will 2419 * execute if l2arc_trim_ahead > 0. 
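 *
 * l2arc_trim_ahead is a zfs module parameter (a percentage; the
 * default of 0 leaves cache devices untrimmed).  On Linux, for
 * example, something like
 *
 *	echo 100 > /sys/module/zfs/parameters/l2arc_trim_ahead
 *
 * enables it, at which point the SPA_ASYNC_L2CACHE_TRIM request
 * issued below actually trims the device.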
2420 */ 2421 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2422 } 2423 } 2424 2425 sav->sav_vdevs = newvdevs; 2426 sav->sav_count = (int)nl2cache; 2427 2428 /* 2429 * Recompute the stashed list of l2cache devices, with status 2430 * information this time. 2431 */ 2432 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2433 2434 if (sav->sav_count > 0) 2435 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2436 KM_SLEEP); 2437 for (i = 0; i < sav->sav_count; i++) 2438 l2cache[i] = vdev_config_generate(spa, 2439 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2440 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2441 (const nvlist_t * const *)l2cache, sav->sav_count); 2442 2443 out: 2444 /* 2445 * Purge vdevs that were dropped 2446 */ 2447 if (oldvdevs) { 2448 for (i = 0; i < oldnvdevs; i++) { 2449 uint64_t pool; 2450 2451 vd = oldvdevs[i]; 2452 if (vd != NULL) { 2453 ASSERT(vd->vdev_isl2cache); 2454 2455 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2456 pool != 0ULL && l2arc_vdev_present(vd)) 2457 l2arc_remove_vdev(vd); 2458 vdev_clear_stats(vd); 2459 vdev_free(vd); 2460 } 2461 } 2462 2463 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2464 } 2465 2466 for (i = 0; i < sav->sav_count; i++) 2467 nvlist_free(l2cache[i]); 2468 if (sav->sav_count) 2469 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2470 } 2471 2472 static int 2473 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2474 { 2475 dmu_buf_t *db; 2476 char *packed = NULL; 2477 size_t nvsize = 0; 2478 int error; 2479 *value = NULL; 2480 2481 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2482 if (error) 2483 return (error); 2484 2485 nvsize = *(uint64_t *)db->db_data; 2486 dmu_buf_rele(db, FTAG); 2487 2488 packed = vmem_alloc(nvsize, KM_SLEEP); 2489 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2490 DMU_READ_PREFETCH); 2491 if (error == 0) 2492 error = nvlist_unpack(packed, nvsize, value, 0); 2493 vmem_free(packed, nvsize); 2494 2495 return (error); 2496 } 2497 2498 /* 2499 * Concrete top-level vdevs that are not missing and are not logs. At every 2500 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2501 */ 2502 static uint64_t 2503 spa_healthy_core_tvds(spa_t *spa) 2504 { 2505 vdev_t *rvd = spa->spa_root_vdev; 2506 uint64_t tvds = 0; 2507 2508 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2509 vdev_t *vd = rvd->vdev_child[i]; 2510 if (vd->vdev_islog) 2511 continue; 2512 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2513 tvds++; 2514 } 2515 2516 return (tvds); 2517 } 2518 2519 /* 2520 * Checks to see if the given vdev could not be opened, in which case we post a 2521 * sysevent to notify the autoreplace code that the device has been removed. 2522 */ 2523 static void 2524 spa_check_removed(vdev_t *vd) 2525 { 2526 for (uint64_t c = 0; c < vd->vdev_children; c++) 2527 spa_check_removed(vd->vdev_child[c]); 2528 2529 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2530 vdev_is_concrete(vd)) { 2531 zfs_post_autoreplace(vd->vdev_spa, vd); 2532 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2533 } 2534 } 2535 2536 static int 2537 spa_check_for_missing_logs(spa_t *spa) 2538 { 2539 vdev_t *rvd = spa->spa_root_vdev; 2540 2541 /* 2542 * If we're doing a normal import, then build up any additional 2543 * diagnostic information about missing log devices. 2544 * We'll pass this up to the user for further processing. 
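 *
 * The result is attached to spa_load_info in roughly this shape (a
 * sketch of the nvlist assembled below, not a literal dump):
 *
 *	ZPOOL_CONFIG_MISSING_DEVICES
 *		ZPOOL_CONFIG_CHILDREN = [ <missing log vdev config>, ... ]
 *
 * so that userland (e.g. "zpool import") can report exactly which log
 * devices need to be reattached.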
2545 */ 2546 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2547 nvlist_t **child, *nv; 2548 uint64_t idx = 0; 2549 2550 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2551 KM_SLEEP); 2552 nv = fnvlist_alloc(); 2553 2554 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2555 vdev_t *tvd = rvd->vdev_child[c]; 2556 2557 /* 2558 * We consider a device as missing only if it failed 2559 * to open (i.e. offline or faulted is not considered 2560 * as missing). 2561 */ 2562 if (tvd->vdev_islog && 2563 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2564 child[idx++] = vdev_config_generate(spa, tvd, 2565 B_FALSE, VDEV_CONFIG_MISSING); 2566 } 2567 } 2568 2569 if (idx > 0) { 2570 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2571 (const nvlist_t * const *)child, idx); 2572 fnvlist_add_nvlist(spa->spa_load_info, 2573 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2574 2575 for (uint64_t i = 0; i < idx; i++) 2576 nvlist_free(child[i]); 2577 } 2578 nvlist_free(nv); 2579 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2580 2581 if (idx > 0) { 2582 spa_load_failed(spa, "some log devices are missing"); 2583 vdev_dbgmsg_print_tree(rvd, 2); 2584 return (SET_ERROR(ENXIO)); 2585 } 2586 } else { 2587 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2588 vdev_t *tvd = rvd->vdev_child[c]; 2589 2590 if (tvd->vdev_islog && 2591 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2592 spa_set_log_state(spa, SPA_LOG_CLEAR); 2593 spa_load_note(spa, "some log devices are " 2594 "missing, ZIL is dropped."); 2595 vdev_dbgmsg_print_tree(rvd, 2); 2596 break; 2597 } 2598 } 2599 } 2600 2601 return (0); 2602 } 2603 2604 /* 2605 * Check for missing log devices 2606 */ 2607 static boolean_t 2608 spa_check_logs(spa_t *spa) 2609 { 2610 boolean_t rv = B_FALSE; 2611 dsl_pool_t *dp = spa_get_dsl(spa); 2612 2613 switch (spa->spa_log_state) { 2614 default: 2615 break; 2616 case SPA_LOG_MISSING: 2617 /* need to recheck in case slog has been restored */ 2618 case SPA_LOG_UNKNOWN: 2619 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2620 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2621 if (rv) 2622 spa_set_log_state(spa, SPA_LOG_MISSING); 2623 break; 2624 } 2625 return (rv); 2626 } 2627 2628 /* 2629 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2630 */ 2631 static boolean_t 2632 spa_passivate_log(spa_t *spa) 2633 { 2634 vdev_t *rvd = spa->spa_root_vdev; 2635 boolean_t slog_found = B_FALSE; 2636 2637 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2638 2639 for (int c = 0; c < rvd->vdev_children; c++) { 2640 vdev_t *tvd = rvd->vdev_child[c]; 2641 2642 if (tvd->vdev_islog) { 2643 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2644 metaslab_group_passivate(tvd->vdev_mg); 2645 slog_found = B_TRUE; 2646 } 2647 } 2648 2649 return (slog_found); 2650 } 2651 2652 /* 2653 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
2654 */ 2655 static void 2656 spa_activate_log(spa_t *spa) 2657 { 2658 vdev_t *rvd = spa->spa_root_vdev; 2659 2660 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2661 2662 for (int c = 0; c < rvd->vdev_children; c++) { 2663 vdev_t *tvd = rvd->vdev_child[c]; 2664 2665 if (tvd->vdev_islog) { 2666 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2667 metaslab_group_activate(tvd->vdev_mg); 2668 } 2669 } 2670 } 2671 2672 int 2673 spa_reset_logs(spa_t *spa) 2674 { 2675 int error; 2676 2677 error = dmu_objset_find(spa_name(spa), zil_reset, 2678 NULL, DS_FIND_CHILDREN); 2679 if (error == 0) { 2680 /* 2681 * We successfully offlined the log device, sync out the 2682 * current txg so that the "stubby" block can be removed 2683 * by zil_sync(). 2684 */ 2685 txg_wait_synced(spa->spa_dsl_pool, 0); 2686 } 2687 return (error); 2688 } 2689 2690 static void 2691 spa_aux_check_removed(spa_aux_vdev_t *sav) 2692 { 2693 for (int i = 0; i < sav->sav_count; i++) 2694 spa_check_removed(sav->sav_vdevs[i]); 2695 } 2696 2697 void 2698 spa_claim_notify(zio_t *zio) 2699 { 2700 spa_t *spa = zio->io_spa; 2701 2702 if (zio->io_error) 2703 return; 2704 2705 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2706 if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) 2707 spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); 2708 mutex_exit(&spa->spa_props_lock); 2709 } 2710 2711 typedef struct spa_load_error { 2712 boolean_t sle_verify_data; 2713 uint64_t sle_meta_count; 2714 uint64_t sle_data_count; 2715 } spa_load_error_t; 2716 2717 static void 2718 spa_load_verify_done(zio_t *zio) 2719 { 2720 blkptr_t *bp = zio->io_bp; 2721 spa_load_error_t *sle = zio->io_private; 2722 dmu_object_type_t type = BP_GET_TYPE(bp); 2723 int error = zio->io_error; 2724 spa_t *spa = zio->io_spa; 2725 2726 abd_free(zio->io_abd); 2727 if (error) { 2728 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2729 type != DMU_OT_INTENT_LOG) 2730 atomic_inc_64(&sle->sle_meta_count); 2731 else 2732 atomic_inc_64(&sle->sle_data_count); 2733 } 2734 2735 mutex_enter(&spa->spa_scrub_lock); 2736 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2737 cv_broadcast(&spa->spa_scrub_io_cv); 2738 mutex_exit(&spa->spa_scrub_lock); 2739 } 2740 2741 /* 2742 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2743 * By default, we set it to 1/16th of the arc. 2744 */ 2745 static uint_t spa_load_verify_shift = 4; 2746 static int spa_load_verify_metadata = B_TRUE; 2747 static int spa_load_verify_data = B_TRUE; 2748 2749 static int 2750 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2751 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2752 { 2753 zio_t *rio = arg; 2754 spa_load_error_t *sle = rio->io_private; 2755 2756 (void) zilog, (void) dnp; 2757 2758 /* 2759 * Note: normally this routine will not be called if 2760 * spa_load_verify_metadata is not set. However, it may be useful 2761 * to manually set the flag after the traversal has begun. 2762 */ 2763 if (!spa_load_verify_metadata) 2764 return (0); 2765 2766 /* 2767 * Sanity check the block pointer in order to detect obvious damage 2768 * before using the contents in subsequent checks or in zio_read(). 2769 * When damaged consider it to be a metadata error since we cannot 2770 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
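 *
 * In other words, a damaged bp is simply charged to sle_meta_count
 * (which spa_load_verify() compares against the rewind policy's
 * zlp_maxmeta limit) and we return 0 so the traversal continues,
 * rather than handing a bogus pointer to zio_read().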
2771 */ 2772 if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 2773 atomic_inc_64(&sle->sle_meta_count); 2774 return (0); 2775 } 2776 2777 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2778 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2779 return (0); 2780 2781 if (!BP_IS_METADATA(bp) && 2782 (!spa_load_verify_data || !sle->sle_verify_data)) 2783 return (0); 2784 2785 uint64_t maxinflight_bytes = 2786 arc_target_bytes() >> spa_load_verify_shift; 2787 size_t size = BP_GET_PSIZE(bp); 2788 2789 mutex_enter(&spa->spa_scrub_lock); 2790 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2791 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2792 spa->spa_load_verify_bytes += size; 2793 mutex_exit(&spa->spa_scrub_lock); 2794 2795 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2796 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2797 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2798 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2799 return (0); 2800 } 2801 2802 static int 2803 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2804 { 2805 (void) dp, (void) arg; 2806 2807 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2808 return (SET_ERROR(ENAMETOOLONG)); 2809 2810 return (0); 2811 } 2812 2813 static int 2814 spa_load_verify(spa_t *spa) 2815 { 2816 zio_t *rio; 2817 spa_load_error_t sle = { 0 }; 2818 zpool_load_policy_t policy; 2819 boolean_t verify_ok = B_FALSE; 2820 int error = 0; 2821 2822 zpool_get_load_policy(spa->spa_config, &policy); 2823 2824 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2825 policy.zlp_maxmeta == UINT64_MAX) 2826 return (0); 2827 2828 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2829 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2830 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2831 DS_FIND_CHILDREN); 2832 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2833 if (error != 0) 2834 return (error); 2835 2836 /* 2837 * Verify data only if we are rewinding or error limit was set. 2838 * Otherwise nothing except dbgmsg care about it to waste time. 2839 */ 2840 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2841 (policy.zlp_maxdata < UINT64_MAX); 2842 2843 rio = zio_root(spa, NULL, &sle, 2844 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2845 2846 if (spa_load_verify_metadata) { 2847 if (spa->spa_extreme_rewind) { 2848 spa_load_note(spa, "performing a complete scan of the " 2849 "pool since extreme rewind is on. 
This may take " 2850 "a very long time.\n (spa_load_verify_data=%u, " 2851 "spa_load_verify_metadata=%u)", 2852 spa_load_verify_data, spa_load_verify_metadata); 2853 } 2854 2855 error = traverse_pool(spa, spa->spa_verify_min_txg, 2856 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2857 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2858 } 2859 2860 (void) zio_wait(rio); 2861 ASSERT0(spa->spa_load_verify_bytes); 2862 2863 spa->spa_load_meta_errors = sle.sle_meta_count; 2864 spa->spa_load_data_errors = sle.sle_data_count; 2865 2866 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2867 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2868 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2869 (u_longlong_t)sle.sle_data_count); 2870 } 2871 2872 if (spa_load_verify_dryrun || 2873 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2874 sle.sle_data_count <= policy.zlp_maxdata)) { 2875 int64_t loss = 0; 2876 2877 verify_ok = B_TRUE; 2878 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2879 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2880 2881 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2882 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2883 spa->spa_load_txg_ts); 2884 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2885 loss); 2886 fnvlist_add_uint64(spa->spa_load_info, 2887 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2888 fnvlist_add_uint64(spa->spa_load_info, 2889 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2890 } else { 2891 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2892 } 2893 2894 if (spa_load_verify_dryrun) 2895 return (0); 2896 2897 if (error) { 2898 if (error != ENXIO && error != EIO) 2899 error = SET_ERROR(EIO); 2900 return (error); 2901 } 2902 2903 return (verify_ok ? 0 : EIO); 2904 } 2905 2906 /* 2907 * Find a value in the pool props object. 2908 */ 2909 static void 2910 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2911 { 2912 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2913 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2914 } 2915 2916 /* 2917 * Find a value in the pool directory object. 
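 *
 * Callers later in this file use it roughly like this (a typical
 * pattern, shown only as an illustration):
 *
 *	if (spa_dir_prop(spa, DMU_POOL_CONFIG,
 *	    &spa->spa_config_object, B_TRUE) != 0)
 *		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 *
 * i.e. a missing or unreadable entry is usually treated as corrupted
 * pool metadata.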
2918 */ 2919 static int 2920 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2921 { 2922 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2923 name, sizeof (uint64_t), 1, val); 2924 2925 if (error != 0 && (error != ENOENT || log_enoent)) { 2926 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2927 "[error=%d]", name, error); 2928 } 2929 2930 return (error); 2931 } 2932 2933 static int 2934 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2935 { 2936 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2937 return (SET_ERROR(err)); 2938 } 2939 2940 boolean_t 2941 spa_livelist_delete_check(spa_t *spa) 2942 { 2943 return (spa->spa_livelists_to_delete != 0); 2944 } 2945 2946 static boolean_t 2947 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2948 { 2949 (void) z; 2950 spa_t *spa = arg; 2951 return (spa_livelist_delete_check(spa)); 2952 } 2953 2954 static int 2955 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2956 { 2957 spa_t *spa = arg; 2958 zio_free(spa, tx->tx_txg, bp); 2959 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2960 -bp_get_dsize_sync(spa, bp), 2961 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2962 return (0); 2963 } 2964 2965 static int 2966 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2967 { 2968 int err; 2969 zap_cursor_t zc; 2970 zap_attribute_t za; 2971 zap_cursor_init(&zc, os, zap_obj); 2972 err = zap_cursor_retrieve(&zc, &za); 2973 zap_cursor_fini(&zc); 2974 if (err == 0) 2975 *llp = za.za_first_integer; 2976 return (err); 2977 } 2978 2979 /* 2980 * Components of livelist deletion that must be performed in syncing 2981 * context: freeing block pointers and updating the pool-wide data 2982 * structures to indicate how much work is left to do 2983 */ 2984 typedef struct sublist_delete_arg { 2985 spa_t *spa; 2986 dsl_deadlist_t *ll; 2987 uint64_t key; 2988 bplist_t *to_free; 2989 } sublist_delete_arg_t; 2990 2991 static void 2992 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2993 { 2994 sublist_delete_arg_t *sda = arg; 2995 spa_t *spa = sda->spa; 2996 dsl_deadlist_t *ll = sda->ll; 2997 uint64_t key = sda->key; 2998 bplist_t *to_free = sda->to_free; 2999 3000 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 3001 dsl_deadlist_remove_entry(ll, key, tx); 3002 } 3003 3004 typedef struct livelist_delete_arg { 3005 spa_t *spa; 3006 uint64_t ll_obj; 3007 uint64_t zap_obj; 3008 } livelist_delete_arg_t; 3009 3010 static void 3011 livelist_delete_sync(void *arg, dmu_tx_t *tx) 3012 { 3013 livelist_delete_arg_t *lda = arg; 3014 spa_t *spa = lda->spa; 3015 uint64_t ll_obj = lda->ll_obj; 3016 uint64_t zap_obj = lda->zap_obj; 3017 objset_t *mos = spa->spa_meta_objset; 3018 uint64_t count; 3019 3020 /* free the livelist and decrement the feature count */ 3021 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 3022 dsl_deadlist_free(mos, ll_obj, tx); 3023 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 3024 VERIFY0(zap_count(mos, zap_obj, &count)); 3025 if (count == 0) { 3026 /* no more livelists to delete */ 3027 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 3028 DMU_POOL_DELETED_CLONES, tx)); 3029 VERIFY0(zap_destroy(mos, zap_obj, tx)); 3030 spa->spa_livelists_to_delete = 0; 3031 spa_notify_waiters(spa); 3032 } 3033 } 3034 3035 /* 3036 * Load in the value for the livelist to be removed and open it. Then, 3037 * load its first sublist and determine which block pointers should actually 3038 * be freed. 
Then, call a synctask which performs the actual frees and updates 3039 * the pool-wide livelist data. 3040 */ 3041 static void 3042 spa_livelist_delete_cb(void *arg, zthr_t *z) 3043 { 3044 spa_t *spa = arg; 3045 uint64_t ll_obj = 0, count; 3046 objset_t *mos = spa->spa_meta_objset; 3047 uint64_t zap_obj = spa->spa_livelists_to_delete; 3048 /* 3049 * Determine the next livelist to delete. This function should only 3050 * be called if there is at least one deleted clone. 3051 */ 3052 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 3053 VERIFY0(zap_count(mos, ll_obj, &count)); 3054 if (count > 0) { 3055 dsl_deadlist_t *ll; 3056 dsl_deadlist_entry_t *dle; 3057 bplist_t to_free; 3058 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 3059 dsl_deadlist_open(ll, mos, ll_obj); 3060 dle = dsl_deadlist_first(ll); 3061 ASSERT3P(dle, !=, NULL); 3062 bplist_create(&to_free); 3063 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 3064 z, NULL); 3065 if (err == 0) { 3066 sublist_delete_arg_t sync_arg = { 3067 .spa = spa, 3068 .ll = ll, 3069 .key = dle->dle_mintxg, 3070 .to_free = &to_free 3071 }; 3072 zfs_dbgmsg("deleting sublist (id %llu) from" 3073 " livelist %llu, %lld remaining", 3074 (u_longlong_t)dle->dle_bpobj.bpo_object, 3075 (u_longlong_t)ll_obj, (longlong_t)count - 1); 3076 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 3077 sublist_delete_sync, &sync_arg, 0, 3078 ZFS_SPACE_CHECK_DESTROY)); 3079 } else { 3080 VERIFY3U(err, ==, EINTR); 3081 } 3082 bplist_clear(&to_free); 3083 bplist_destroy(&to_free); 3084 dsl_deadlist_close(ll); 3085 kmem_free(ll, sizeof (dsl_deadlist_t)); 3086 } else { 3087 livelist_delete_arg_t sync_arg = { 3088 .spa = spa, 3089 .ll_obj = ll_obj, 3090 .zap_obj = zap_obj 3091 }; 3092 zfs_dbgmsg("deletion of livelist %llu completed", 3093 (u_longlong_t)ll_obj); 3094 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 3095 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 3096 } 3097 } 3098 3099 static void 3100 spa_start_livelist_destroy_thread(spa_t *spa) 3101 { 3102 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 3103 spa->spa_livelist_delete_zthr = 3104 zthr_create("z_livelist_destroy", 3105 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 3106 minclsyspri); 3107 } 3108 3109 typedef struct livelist_new_arg { 3110 bplist_t *allocs; 3111 bplist_t *frees; 3112 } livelist_new_arg_t; 3113 3114 static int 3115 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3116 dmu_tx_t *tx) 3117 { 3118 ASSERT(tx == NULL); 3119 livelist_new_arg_t *lna = arg; 3120 if (bp_freed) { 3121 bplist_append(lna->frees, bp); 3122 } else { 3123 bplist_append(lna->allocs, bp); 3124 zfs_livelist_condense_new_alloc++; 3125 } 3126 return (0); 3127 } 3128 3129 typedef struct livelist_condense_arg { 3130 spa_t *spa; 3131 bplist_t to_keep; 3132 uint64_t first_size; 3133 uint64_t next_size; 3134 } livelist_condense_arg_t; 3135 3136 static void 3137 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 3138 { 3139 livelist_condense_arg_t *lca = arg; 3140 spa_t *spa = lca->spa; 3141 bplist_t new_frees; 3142 dsl_dataset_t *ds = spa->spa_to_condense.ds; 3143 3144 /* Have we been cancelled? 
*/ 3145 if (spa->spa_to_condense.cancelled) { 3146 zfs_livelist_condense_sync_cancel++; 3147 goto out; 3148 } 3149 3150 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3151 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3152 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 3153 3154 /* 3155 * It's possible that the livelist was changed while the zthr was 3156 * running. Therefore, we need to check for new blkptrs in the two 3157 * entries being condensed and continue to track them in the livelist. 3158 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 3159 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 3160 * we need to sort them into two different bplists. 3161 */ 3162 uint64_t first_obj = first->dle_bpobj.bpo_object; 3163 uint64_t next_obj = next->dle_bpobj.bpo_object; 3164 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3165 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3166 3167 bplist_create(&new_frees); 3168 livelist_new_arg_t new_bps = { 3169 .allocs = &lca->to_keep, 3170 .frees = &new_frees, 3171 }; 3172 3173 if (cur_first_size > lca->first_size) { 3174 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 3175 livelist_track_new_cb, &new_bps, lca->first_size)); 3176 } 3177 if (cur_next_size > lca->next_size) { 3178 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 3179 livelist_track_new_cb, &new_bps, lca->next_size)); 3180 } 3181 3182 dsl_deadlist_clear_entry(first, ll, tx); 3183 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 3184 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 3185 3186 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 3187 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 3188 bplist_destroy(&new_frees); 3189 3190 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 3191 dsl_dataset_name(ds, dsname); 3192 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 3193 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 3194 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 3195 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 3196 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 3197 (u_longlong_t)cur_next_size, 3198 (u_longlong_t)first->dle_bpobj.bpo_object, 3199 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 3200 out: 3201 dmu_buf_rele(ds->ds_dbuf, spa); 3202 spa->spa_to_condense.ds = NULL; 3203 bplist_clear(&lca->to_keep); 3204 bplist_destroy(&lca->to_keep); 3205 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3206 spa->spa_to_condense.syncing = B_FALSE; 3207 } 3208 3209 static void 3210 spa_livelist_condense_cb(void *arg, zthr_t *t) 3211 { 3212 while (zfs_livelist_condense_zthr_pause && 3213 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3214 delay(1); 3215 3216 spa_t *spa = arg; 3217 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3218 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3219 uint64_t first_size, next_size; 3220 3221 livelist_condense_arg_t *lca = 3222 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 3223 bplist_create(&lca->to_keep); 3224 3225 /* 3226 * Process the livelists (matching FREEs and ALLOCs) in open context 3227 * so we have minimal work in syncing context to condense. 3228 * 3229 * We save bpobj sizes (first_size and next_size) to use later in 3230 * syncing context to determine if entries were added to these sublists 3231 * while in open context. 
This is possible because the clone is still 3232 * active and open for normal writes and we want to make sure the new, 3233 * unprocessed blockpointers are inserted into the livelist normally. 3234 * 3235 * Note that dsl_process_sub_livelist() both stores the number of 3236 * blockpointers and iterates over them while the bpobj's lock is held, 3237 * so the sizes returned to us are consistent with what was actually 3238 * processed. 3239 */ 3240 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 3241 &first_size); 3242 if (err == 0) 3243 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 3244 t, &next_size); 3245 3246 if (err == 0) { 3247 while (zfs_livelist_condense_sync_pause && 3248 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3249 delay(1); 3250 3251 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3252 dmu_tx_mark_netfree(tx); 3253 dmu_tx_hold_space(tx, 1); 3254 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 3255 if (err == 0) { 3256 /* 3257 * Prevent the condense zthr from restarting before 3258 * the synctask completes. 3259 */ 3260 spa->spa_to_condense.syncing = B_TRUE; 3261 lca->spa = spa; 3262 lca->first_size = first_size; 3263 lca->next_size = next_size; 3264 dsl_sync_task_nowait(spa_get_dsl(spa), 3265 spa_livelist_condense_sync, lca, tx); 3266 dmu_tx_commit(tx); 3267 return; 3268 } 3269 } 3270 /* 3271 * Condensing cannot continue: either it was externally stopped or 3272 * we were unable to assign to a tx because the pool has run out of 3273 * space. In the second case, we'll just end up trying to condense 3274 * again in a later txg. 3275 */ 3276 ASSERT(err != 0); 3277 bplist_clear(&lca->to_keep); 3278 bplist_destroy(&lca->to_keep); 3279 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3280 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 3281 spa->spa_to_condense.ds = NULL; 3282 if (err == EINTR) 3283 zfs_livelist_condense_zthr_cancel++; 3284 } 3285 3286 /* 3287 * Check that there is something to condense but that a condense is not 3288 * already in progress and that condensing has not been cancelled.
3289 */ 3290 static boolean_t 3291 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 3292 { 3293 (void) z; 3294 spa_t *spa = arg; 3295 if ((spa->spa_to_condense.ds != NULL) && 3296 (spa->spa_to_condense.syncing == B_FALSE) && 3297 (spa->spa_to_condense.cancelled == B_FALSE)) { 3298 return (B_TRUE); 3299 } 3300 return (B_FALSE); 3301 } 3302 3303 static void 3304 spa_start_livelist_condensing_thread(spa_t *spa) 3305 { 3306 spa->spa_to_condense.ds = NULL; 3307 spa->spa_to_condense.first = NULL; 3308 spa->spa_to_condense.next = NULL; 3309 spa->spa_to_condense.syncing = B_FALSE; 3310 spa->spa_to_condense.cancelled = B_FALSE; 3311 3312 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 3313 spa->spa_livelist_condense_zthr = 3314 zthr_create("z_livelist_condense", 3315 spa_livelist_condense_cb_check, 3316 spa_livelist_condense_cb, spa, minclsyspri); 3317 } 3318 3319 static void 3320 spa_spawn_aux_threads(spa_t *spa) 3321 { 3322 ASSERT(spa_writeable(spa)); 3323 3324 spa_start_raidz_expansion_thread(spa); 3325 spa_start_indirect_condensing_thread(spa); 3326 spa_start_livelist_destroy_thread(spa); 3327 spa_start_livelist_condensing_thread(spa); 3328 3329 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 3330 spa->spa_checkpoint_discard_zthr = 3331 zthr_create("z_checkpoint_discard", 3332 spa_checkpoint_discard_thread_check, 3333 spa_checkpoint_discard_thread, spa, minclsyspri); 3334 } 3335 3336 /* 3337 * Fix up config after a partly-completed split. This is done with the 3338 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 3339 * pool have that entry in their config, but only the splitting one contains 3340 * a list of all the guids of the vdevs that are being split off. 3341 * 3342 * This function determines what to do with that list: either rejoin 3343 * all the disks to the pool, or complete the splitting process. To attempt 3344 * the rejoin, each disk that is offlined is marked online again, and 3345 * we do a reopen() call. If the vdev label for every disk that was 3346 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 3347 * then we call vdev_split() on each disk, and complete the split. 3348 * 3349 * Otherwise we leave the config alone, with all the vdevs in place in 3350 * the original pool. 3351 */ 3352 static void 3353 spa_try_repair(spa_t *spa, nvlist_t *config) 3354 { 3355 uint_t extracted; 3356 uint64_t *glist; 3357 uint_t i, gcount; 3358 nvlist_t *nvl; 3359 vdev_t **vd; 3360 boolean_t attempt_reopen; 3361 3362 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 3363 return; 3364 3365 /* check that the config is complete */ 3366 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 3367 &glist, &gcount) != 0) 3368 return; 3369 3370 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 3371 3372 /* attempt to online all the vdevs & validate */ 3373 attempt_reopen = B_TRUE; 3374 for (i = 0; i < gcount; i++) { 3375 if (glist[i] == 0) /* vdev is hole */ 3376 continue; 3377 3378 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 3379 if (vd[i] == NULL) { 3380 /* 3381 * Don't bother attempting to reopen the disks; 3382 * just do the split. 
3383 */ 3384 attempt_reopen = B_FALSE; 3385 } else { 3386 /* attempt to re-online it */ 3387 vd[i]->vdev_offline = B_FALSE; 3388 } 3389 } 3390 3391 if (attempt_reopen) { 3392 vdev_reopen(spa->spa_root_vdev); 3393 3394 /* check each device to see what state it's in */ 3395 for (extracted = 0, i = 0; i < gcount; i++) { 3396 if (vd[i] != NULL && 3397 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3398 break; 3399 ++extracted; 3400 } 3401 } 3402 3403 /* 3404 * If every disk has been moved to the new pool, or if we never 3405 * even attempted to look at them, then we split them off for 3406 * good. 3407 */ 3408 if (!attempt_reopen || gcount == extracted) { 3409 for (i = 0; i < gcount; i++) 3410 if (vd[i] != NULL) 3411 vdev_split(vd[i]); 3412 vdev_reopen(spa->spa_root_vdev); 3413 } 3414 3415 kmem_free(vd, gcount * sizeof (vdev_t *)); 3416 } 3417 3418 static int 3419 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3420 { 3421 const char *ereport = FM_EREPORT_ZFS_POOL; 3422 int error; 3423 3424 spa->spa_load_state = state; 3425 (void) spa_import_progress_set_state(spa_guid(spa), 3426 spa_load_state(spa)); 3427 spa_import_progress_set_notes(spa, "spa_load()"); 3428 3429 gethrestime(&spa->spa_loaded_ts); 3430 error = spa_load_impl(spa, type, &ereport); 3431 3432 /* 3433 * Don't count references from objsets that are already closed 3434 * and are making their way through the eviction process. 3435 */ 3436 spa_evicting_os_wait(spa); 3437 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3438 if (error) { 3439 if (error != EEXIST) { 3440 spa->spa_loaded_ts.tv_sec = 0; 3441 spa->spa_loaded_ts.tv_nsec = 0; 3442 } 3443 if (error != EBADF) { 3444 (void) zfs_ereport_post(ereport, spa, 3445 NULL, NULL, NULL, 0); 3446 } 3447 } 3448 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3449 spa->spa_ena = 0; 3450 3451 (void) spa_import_progress_set_state(spa_guid(spa), 3452 spa_load_state(spa)); 3453 3454 return (error); 3455 } 3456 3457 #ifdef ZFS_DEBUG 3458 /* 3459 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3460 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3461 * spa's per-vdev ZAP list. 3462 */ 3463 static uint64_t 3464 vdev_count_verify_zaps(vdev_t *vd) 3465 { 3466 spa_t *spa = vd->vdev_spa; 3467 uint64_t total = 0; 3468 3469 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3470 vd->vdev_root_zap != 0) { 3471 total++; 3472 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3473 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3474 } 3475 if (vd->vdev_top_zap != 0) { 3476 total++; 3477 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3478 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3479 } 3480 if (vd->vdev_leaf_zap != 0) { 3481 total++; 3482 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3483 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3484 } 3485 3486 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3487 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3488 } 3489 3490 return (total); 3491 } 3492 #else 3493 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3494 #endif 3495 3496 /* 3497 * Determine whether the activity check is required. 
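 *
 * In summary (mirroring the checks below), the activity check is
 * skipped when:
 *  - the caller asked for it to be skipped (ZFS_IMPORT_SKIP_MMP, zdb),
 *  - the MMP feature is not in use (ub_mmp_delay == 0),
 *  - an earlier tryimport already performed the check and the
 *    uberblock has not changed since (the tryconfig_* values match),
 *  - the hostid in the label matches our own hostid, or
 *  - the pool was cleanly exported (state != POOL_STATE_ACTIVE).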
3498 */ 3499 static boolean_t 3500 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3501 nvlist_t *config) 3502 { 3503 uint64_t state = 0; 3504 uint64_t hostid = 0; 3505 uint64_t tryconfig_txg = 0; 3506 uint64_t tryconfig_timestamp = 0; 3507 uint16_t tryconfig_mmp_seq = 0; 3508 nvlist_t *nvinfo; 3509 3510 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3511 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3512 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3513 &tryconfig_txg); 3514 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3515 &tryconfig_timestamp); 3516 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3517 &tryconfig_mmp_seq); 3518 } 3519 3520 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3521 3522 /* 3523 * Disable the MMP activity check - This is used by zdb which 3524 * is intended to be used on potentially active pools. 3525 */ 3526 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3527 return (B_FALSE); 3528 3529 /* 3530 * Skip the activity check when the MMP feature is disabled. 3531 */ 3532 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3533 return (B_FALSE); 3534 3535 /* 3536 * If the tryconfig_ values are nonzero, they are the results of an 3537 * earlier tryimport. If they all match the uberblock we just found, 3538 * then the pool has not changed and we return false so we do not test 3539 * a second time. 3540 */ 3541 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3542 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3543 tryconfig_mmp_seq && tryconfig_mmp_seq == 3544 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3545 return (B_FALSE); 3546 3547 /* 3548 * Allow the activity check to be skipped when importing the pool 3549 * on the same host which last imported it. Since the hostid from 3550 * configuration may be stale use the one read from the label. 3551 */ 3552 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3553 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3554 3555 if (hostid == spa_get_hostid(spa)) 3556 return (B_FALSE); 3557 3558 /* 3559 * Skip the activity test when the pool was cleanly exported. 3560 */ 3561 if (state != POOL_STATE_ACTIVE) 3562 return (B_FALSE); 3563 3564 return (B_TRUE); 3565 } 3566 3567 /* 3568 * Nanoseconds the activity check must watch for changes on-disk. 3569 */ 3570 static uint64_t 3571 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3572 { 3573 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3574 uint64_t multihost_interval = MSEC2NSEC( 3575 MMP_INTERVAL_OK(zfs_multihost_interval)); 3576 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3577 multihost_interval); 3578 3579 /* 3580 * Local tunables determine a minimum duration except for the case 3581 * where we know when the remote host will suspend the pool if MMP 3582 * writes do not land. 3583 * 3584 * See Big Theory comment at the top of mmp.c for the reasoning behind 3585 * these cases and times. 
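 *
 * As a worked example: with the default tunables (nominally
 * zfs_multihost_interval = 1000 ms and
 * zfs_multihost_import_intervals = 20), the locally derived floor
 * computed above is MAX(NANOSEC, 20 * 1000 ms) = 20 seconds, before
 * any of the uberblock-driven adjustments below are applied.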
3586 */ 3587 3588 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3589 3590 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3591 MMP_FAIL_INT(ub) > 0) { 3592 3593 /* MMP on remote host will suspend pool after failed writes */ 3594 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3595 MMP_IMPORT_SAFETY_FACTOR / 100; 3596 3597 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3598 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3599 "import_intervals=%llu", (u_longlong_t)import_delay, 3600 (u_longlong_t)MMP_FAIL_INT(ub), 3601 (u_longlong_t)MMP_INTERVAL(ub), 3602 (u_longlong_t)import_intervals); 3603 3604 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3605 MMP_FAIL_INT(ub) == 0) { 3606 3607 /* MMP on remote host will never suspend pool */ 3608 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3609 ub->ub_mmp_delay) * import_intervals); 3610 3611 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3612 "mmp_interval=%llu ub_mmp_delay=%llu " 3613 "import_intervals=%llu", (u_longlong_t)import_delay, 3614 (u_longlong_t)MMP_INTERVAL(ub), 3615 (u_longlong_t)ub->ub_mmp_delay, 3616 (u_longlong_t)import_intervals); 3617 3618 } else if (MMP_VALID(ub)) { 3619 /* 3620 * zfs-0.7 compatibility case 3621 */ 3622 3623 import_delay = MAX(import_delay, (multihost_interval + 3624 ub->ub_mmp_delay) * import_intervals); 3625 3626 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3627 "import_intervals=%llu leaves=%u", 3628 (u_longlong_t)import_delay, 3629 (u_longlong_t)ub->ub_mmp_delay, 3630 (u_longlong_t)import_intervals, 3631 vdev_count_leaves(spa)); 3632 } else { 3633 /* Using local tunings is the only reasonable option */ 3634 zfs_dbgmsg("pool last imported on non-MMP aware " 3635 "host using import_delay=%llu multihost_interval=%llu " 3636 "import_intervals=%llu", (u_longlong_t)import_delay, 3637 (u_longlong_t)multihost_interval, 3638 (u_longlong_t)import_intervals); 3639 } 3640 3641 return (import_delay); 3642 } 3643 3644 /* 3645 * Remote host activity check. 3646 * 3647 * error results: 3648 * 0 - no activity detected 3649 * EREMOTEIO - remote activity detected 3650 * EINTR - user canceled the operation 3651 */ 3652 static int 3653 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, 3654 boolean_t importing) 3655 { 3656 uint64_t txg = ub->ub_txg; 3657 uint64_t timestamp = ub->ub_timestamp; 3658 uint64_t mmp_config = ub->ub_mmp_config; 3659 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3660 uint64_t import_delay; 3661 hrtime_t import_expire, now; 3662 nvlist_t *mmp_label = NULL; 3663 vdev_t *rvd = spa->spa_root_vdev; 3664 kcondvar_t cv; 3665 kmutex_t mtx; 3666 int error = 0; 3667 3668 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3669 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3670 mutex_enter(&mtx); 3671 3672 /* 3673 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3674 * during the earlier tryimport. If the txg recorded there is 0 then 3675 * the pool is known to be active on another host. 3676 * 3677 * Otherwise, the pool might be in use on another host. Check for 3678 * changes in the uberblocks on disk if necessary. 
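 *
 * The check itself is the loop below: re-read the best uberblock
 * roughly once per second until import_expire, and treat any change
 * in its txg, timestamp or MMP sequence number as proof that another
 * host is still writing to the pool (EREMOTEIO).  A signal delivered
 * while waiting aborts the check with EINTR.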
3679 */ 3680 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3681 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3682 ZPOOL_CONFIG_LOAD_INFO); 3683 3684 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3685 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3686 vdev_uberblock_load(rvd, ub, &mmp_label); 3687 error = SET_ERROR(EREMOTEIO); 3688 goto out; 3689 } 3690 } 3691 3692 import_delay = spa_activity_check_duration(spa, ub); 3693 3694 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3695 import_delay += import_delay * random_in_range(250) / 1000; 3696 3697 import_expire = gethrtime() + import_delay; 3698 3699 if (importing) { 3700 spa_import_progress_set_notes(spa, "Checking MMP activity, " 3701 "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); 3702 } 3703 3704 int iterations = 0; 3705 while ((now = gethrtime()) < import_expire) { 3706 if (importing && iterations++ % 30 == 0) { 3707 spa_import_progress_set_notes(spa, "Checking MMP " 3708 "activity, %llu ms remaining", 3709 (u_longlong_t)NSEC2MSEC(import_expire - now)); 3710 } 3711 3712 if (importing) { 3713 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3714 NSEC2SEC(import_expire - gethrtime())); 3715 } 3716 3717 vdev_uberblock_load(rvd, ub, &mmp_label); 3718 3719 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3720 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3721 zfs_dbgmsg("multihost activity detected " 3722 "txg %llu ub_txg %llu " 3723 "timestamp %llu ub_timestamp %llu " 3724 "mmp_config %#llx ub_mmp_config %#llx", 3725 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3726 (u_longlong_t)timestamp, 3727 (u_longlong_t)ub->ub_timestamp, 3728 (u_longlong_t)mmp_config, 3729 (u_longlong_t)ub->ub_mmp_config); 3730 3731 error = SET_ERROR(EREMOTEIO); 3732 break; 3733 } 3734 3735 if (mmp_label) { 3736 nvlist_free(mmp_label); 3737 mmp_label = NULL; 3738 } 3739 3740 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3741 if (error != -1) { 3742 error = SET_ERROR(EINTR); 3743 break; 3744 } 3745 error = 0; 3746 } 3747 3748 out: 3749 mutex_exit(&mtx); 3750 mutex_destroy(&mtx); 3751 cv_destroy(&cv); 3752 3753 /* 3754 * If the pool is determined to be active store the status in the 3755 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3756 * available from configuration read from disk store them as well. 3757 * This allows 'zpool import' to generate a more useful message. 
3758 * 3759 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3760 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3761 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3762 */ 3763 if (error == EREMOTEIO) { 3764 const char *hostname = "<unknown>"; 3765 uint64_t hostid = 0; 3766 3767 if (mmp_label) { 3768 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3769 hostname = fnvlist_lookup_string(mmp_label, 3770 ZPOOL_CONFIG_HOSTNAME); 3771 fnvlist_add_string(spa->spa_load_info, 3772 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3773 } 3774 3775 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3776 hostid = fnvlist_lookup_uint64(mmp_label, 3777 ZPOOL_CONFIG_HOSTID); 3778 fnvlist_add_uint64(spa->spa_load_info, 3779 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3780 } 3781 } 3782 3783 fnvlist_add_uint64(spa->spa_load_info, 3784 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3785 fnvlist_add_uint64(spa->spa_load_info, 3786 ZPOOL_CONFIG_MMP_TXG, 0); 3787 3788 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3789 } 3790 3791 if (mmp_label) 3792 nvlist_free(mmp_label); 3793 3794 return (error); 3795 } 3796 3797 /* 3798 * Called from zfs_ioc_clear for a pool that was suspended 3799 * after failing mmp write checks. 3800 */ 3801 boolean_t 3802 spa_mmp_remote_host_activity(spa_t *spa) 3803 { 3804 ASSERT(spa_multihost(spa) && spa_suspended(spa)); 3805 3806 nvlist_t *best_label; 3807 uberblock_t best_ub; 3808 3809 /* 3810 * Locate the best uberblock on disk 3811 */ 3812 vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); 3813 if (best_label) { 3814 /* 3815 * confirm that the best hostid matches our hostid 3816 */ 3817 if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && 3818 spa_get_hostid(spa) != 3819 fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { 3820 nvlist_free(best_label); 3821 return (B_TRUE); 3822 } 3823 nvlist_free(best_label); 3824 } else { 3825 return (B_TRUE); 3826 } 3827 3828 if (!MMP_VALID(&best_ub) || 3829 !MMP_FAIL_INT_VALID(&best_ub) || 3830 MMP_FAIL_INT(&best_ub) == 0) { 3831 return (B_TRUE); 3832 } 3833 3834 if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || 3835 best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { 3836 zfs_dbgmsg("txg mismatch detected during pool clear " 3837 "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", 3838 (u_longlong_t)spa->spa_uberblock.ub_txg, 3839 (u_longlong_t)best_ub.ub_txg, 3840 (u_longlong_t)spa->spa_uberblock.ub_timestamp, 3841 (u_longlong_t)best_ub.ub_timestamp); 3842 return (B_TRUE); 3843 } 3844 3845 /* 3846 * Perform an activity check looking for any remote writer 3847 */ 3848 return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, 3849 B_FALSE) != 0); 3850 } 3851 3852 static int 3853 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3854 { 3855 uint64_t hostid; 3856 const char *hostname; 3857 uint64_t myhostid = 0; 3858 3859 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3860 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3861 hostname = fnvlist_lookup_string(mos_config, 3862 ZPOOL_CONFIG_HOSTNAME); 3863 3864 myhostid = zone_get_hostid(NULL); 3865 3866 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3867 cmn_err(CE_WARN, "pool '%s' could not be " 3868 "loaded as it was last accessed by " 3869 "another system (host: %s hostid: 0x%llx). 
" 3870 "See: https://openzfs.github.io/openzfs-docs/msg/" 3871 "ZFS-8000-EY", 3872 spa_name(spa), hostname, (u_longlong_t)hostid); 3873 spa_load_failed(spa, "hostid verification failed: pool " 3874 "last accessed by host: %s (hostid: 0x%llx)", 3875 hostname, (u_longlong_t)hostid); 3876 return (SET_ERROR(EBADF)); 3877 } 3878 } 3879 3880 return (0); 3881 } 3882 3883 static int 3884 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3885 { 3886 int error = 0; 3887 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3888 int parse; 3889 vdev_t *rvd; 3890 uint64_t pool_guid; 3891 const char *comment; 3892 const char *compatibility; 3893 3894 /* 3895 * Versioning wasn't explicitly added to the label until later, so if 3896 * it's not present treat it as the initial version. 3897 */ 3898 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3899 &spa->spa_ubsync.ub_version) != 0) 3900 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3901 3902 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3903 spa_load_failed(spa, "invalid config provided: '%s' missing", 3904 ZPOOL_CONFIG_POOL_GUID); 3905 return (SET_ERROR(EINVAL)); 3906 } 3907 3908 /* 3909 * If we are doing an import, ensure that the pool is not already 3910 * imported by checking if its pool guid already exists in the 3911 * spa namespace. 3912 * 3913 * The only case that we allow an already imported pool to be 3914 * imported again, is when the pool is checkpointed and we want to 3915 * look at its checkpointed state from userland tools like zdb. 3916 */ 3917 #ifdef _KERNEL 3918 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3919 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3920 spa_guid_exists(pool_guid, 0)) { 3921 #else 3922 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3923 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3924 spa_guid_exists(pool_guid, 0) && 3925 !spa_importing_readonly_checkpoint(spa)) { 3926 #endif 3927 spa_load_failed(spa, "a pool with guid %llu is already open", 3928 (u_longlong_t)pool_guid); 3929 return (SET_ERROR(EEXIST)); 3930 } 3931 3932 spa->spa_config_guid = pool_guid; 3933 3934 nvlist_free(spa->spa_load_info); 3935 spa->spa_load_info = fnvlist_alloc(); 3936 3937 ASSERT(spa->spa_comment == NULL); 3938 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3939 spa->spa_comment = spa_strdup(comment); 3940 3941 ASSERT(spa->spa_compatibility == NULL); 3942 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3943 &compatibility) == 0) 3944 spa->spa_compatibility = spa_strdup(compatibility); 3945 3946 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3947 &spa->spa_config_txg); 3948 3949 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3950 spa->spa_config_splitting = fnvlist_dup(nvl); 3951 3952 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3953 spa_load_failed(spa, "invalid config provided: '%s' missing", 3954 ZPOOL_CONFIG_VDEV_TREE); 3955 return (SET_ERROR(EINVAL)); 3956 } 3957 3958 /* 3959 * Create "The Godfather" zio to hold all async IOs 3960 */ 3961 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3962 KM_SLEEP); 3963 for (int i = 0; i < max_ncpus; i++) { 3964 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3965 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3966 ZIO_FLAG_GODFATHER); 3967 } 3968 3969 /* 3970 * Parse the configuration into a vdev tree. 
We explicitly set the 3971 * value that will be returned by spa_version() since parsing the 3972 * configuration requires knowing the version number. 3973 */ 3974 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3975 parse = (type == SPA_IMPORT_EXISTING ? 3976 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3977 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3978 spa_config_exit(spa, SCL_ALL, FTAG); 3979 3980 if (error != 0) { 3981 spa_load_failed(spa, "unable to parse config [error=%d]", 3982 error); 3983 return (error); 3984 } 3985 3986 ASSERT(spa->spa_root_vdev == rvd); 3987 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3988 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3989 3990 if (type != SPA_IMPORT_ASSEMBLE) { 3991 ASSERT(spa_guid(spa) == pool_guid); 3992 } 3993 3994 return (0); 3995 } 3996 3997 /* 3998 * Recursively open all vdevs in the vdev tree. This function is called twice: 3999 * first with the untrusted config, then with the trusted config. 4000 */ 4001 static int 4002 spa_ld_open_vdevs(spa_t *spa) 4003 { 4004 int error = 0; 4005 4006 /* 4007 * spa_missing_tvds_allowed defines how many top-level vdevs can be 4008 * missing/unopenable for the root vdev to be still considered openable. 4009 */ 4010 if (spa->spa_trust_config) { 4011 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 4012 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 4013 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 4014 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 4015 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 4016 } else { 4017 spa->spa_missing_tvds_allowed = 0; 4018 } 4019 4020 spa->spa_missing_tvds_allowed = 4021 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 4022 4023 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4024 error = vdev_open(spa->spa_root_vdev); 4025 spa_config_exit(spa, SCL_ALL, FTAG); 4026 4027 if (spa->spa_missing_tvds != 0) { 4028 spa_load_note(spa, "vdev tree has %lld missing top-level " 4029 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 4030 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 4031 /* 4032 * Although theoretically we could allow users to open 4033 * incomplete pools in RW mode, we'd need to add a lot 4034 * of extra logic (e.g. adjust pool space to account 4035 * for missing vdevs). 4036 * This limitation also prevents users from accidentally 4037 * opening the pool in RW mode during data recovery and 4038 * damaging it further. 4039 */ 4040 spa_load_note(spa, "pools with missing top-level " 4041 "vdevs can only be opened in read-only mode."); 4042 error = SET_ERROR(ENXIO); 4043 } else { 4044 spa_load_note(spa, "current settings allow for maximum " 4045 "%lld missing top-level vdevs at this stage.", 4046 (u_longlong_t)spa->spa_missing_tvds_allowed); 4047 } 4048 } 4049 if (error != 0) { 4050 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 4051 error); 4052 } 4053 if (spa->spa_missing_tvds != 0 || error != 0) 4054 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 4055 4056 return (error); 4057 } 4058 4059 /* 4060 * We need to validate the vdev labels against the configuration that 4061 * we have in hand. This function is called twice: first with an untrusted 4062 * config, then with a trusted config. The validation is more strict when the 4063 * config is trusted. 
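 * vdev_validate() compares what each leaf label records (roughly: the pool
 * GUID, the vdev's own GUID, the pool state and, once the config is trusted,
 * the label txg) against the config in hand and marks mismatching vdevs
 * unusable; if that leaves the root vdev unopenable we fail with ENXIO below.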
4064 */ 4065 static int 4066 spa_ld_validate_vdevs(spa_t *spa) 4067 { 4068 int error = 0; 4069 vdev_t *rvd = spa->spa_root_vdev; 4070 4071 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4072 error = vdev_validate(rvd); 4073 spa_config_exit(spa, SCL_ALL, FTAG); 4074 4075 if (error != 0) { 4076 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 4077 return (error); 4078 } 4079 4080 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 4081 spa_load_failed(spa, "cannot open vdev tree after invalidating " 4082 "some vdevs"); 4083 vdev_dbgmsg_print_tree(rvd, 2); 4084 return (SET_ERROR(ENXIO)); 4085 } 4086 4087 return (0); 4088 } 4089 4090 static void 4091 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 4092 { 4093 spa->spa_state = POOL_STATE_ACTIVE; 4094 spa->spa_ubsync = spa->spa_uberblock; 4095 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 4096 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 4097 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 4098 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 4099 spa->spa_claim_max_txg = spa->spa_first_txg; 4100 spa->spa_prev_software_version = ub->ub_software_version; 4101 } 4102 4103 static int 4104 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 4105 { 4106 vdev_t *rvd = spa->spa_root_vdev; 4107 nvlist_t *label; 4108 uberblock_t *ub = &spa->spa_uberblock; 4109 boolean_t activity_check = B_FALSE; 4110 4111 /* 4112 * If we are opening the checkpointed state of the pool by 4113 * rewinding to it, at this point we will have written the 4114 * checkpointed uberblock to the vdev labels, so searching 4115 * the labels will find the right uberblock. However, if 4116 * we are opening the checkpointed state read-only, we have 4117 * not modified the labels. Therefore, we must ignore the 4118 * labels and continue using the spa_uberblock that was set 4119 * by spa_ld_checkpoint_rewind. 4120 * 4121 * Note that it would be fine to ignore the labels when 4122 * rewinding (opening writeable) as well. However, if we 4123 * crash just after writing the labels, we will end up 4124 * searching the labels. Doing so in the common case means 4125 * that this code path gets exercised normally, rather than 4126 * just in the edge case. 4127 */ 4128 if (ub->ub_checkpoint_txg != 0 && 4129 spa_importing_readonly_checkpoint(spa)) { 4130 spa_ld_select_uberblock_done(spa, ub); 4131 return (0); 4132 } 4133 4134 /* 4135 * Find the best uberblock. 4136 */ 4137 vdev_uberblock_load(rvd, ub, &label); 4138 4139 /* 4140 * If we weren't able to find a single valid uberblock, return failure. 4141 */ 4142 if (ub->ub_txg == 0) { 4143 nvlist_free(label); 4144 spa_load_failed(spa, "no valid uberblock found"); 4145 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 4146 } 4147 4148 if (spa->spa_load_max_txg != UINT64_MAX) { 4149 (void) spa_import_progress_set_max_txg(spa_guid(spa), 4150 (u_longlong_t)spa->spa_load_max_txg); 4151 } 4152 spa_load_note(spa, "using uberblock with txg=%llu", 4153 (u_longlong_t)ub->ub_txg); 4154 if (ub->ub_raidz_reflow_info != 0) { 4155 spa_load_note(spa, "uberblock raidz_reflow_info: " 4156 "state=%u offset=%llu", 4157 (int)RRSS_GET_STATE(ub), 4158 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4159 } 4160 4161 4162 /* 4163 * For pools which have the multihost property on determine if the 4164 * pool is truly inactive and can be safely imported. Prevent 4165 * hosts which don't have a hostid set from importing the pool. 
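 * The outcome is recorded in spa_load_info: MMP_STATE_NO_HOSTID if our own
 * hostid is zero, MMP_STATE_ACTIVE (set by spa_activity_check()) if another
 * host appears to be writing, and MMP_STATE_INACTIVE together with the
 * observed txg and MMP sequence number if the pool looks safe to import.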
4166 */ 4167 activity_check = spa_activity_check_required(spa, ub, label, 4168 spa->spa_config); 4169 if (activity_check) { 4170 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 4171 spa_get_hostid(spa) == 0) { 4172 nvlist_free(label); 4173 fnvlist_add_uint64(spa->spa_load_info, 4174 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4175 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4176 } 4177 4178 int error = 4179 spa_activity_check(spa, ub, spa->spa_config, B_TRUE); 4180 if (error) { 4181 nvlist_free(label); 4182 return (error); 4183 } 4184 4185 fnvlist_add_uint64(spa->spa_load_info, 4186 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 4187 fnvlist_add_uint64(spa->spa_load_info, 4188 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 4189 fnvlist_add_uint16(spa->spa_load_info, 4190 ZPOOL_CONFIG_MMP_SEQ, 4191 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 4192 } 4193 4194 /* 4195 * If the pool has an unsupported version we can't open it. 4196 */ 4197 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 4198 nvlist_free(label); 4199 spa_load_failed(spa, "version %llu is not supported", 4200 (u_longlong_t)ub->ub_version); 4201 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 4202 } 4203 4204 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4205 nvlist_t *features; 4206 4207 /* 4208 * If we weren't able to find what's necessary for reading the 4209 * MOS in the label, return failure. 4210 */ 4211 if (label == NULL) { 4212 spa_load_failed(spa, "label config unavailable"); 4213 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4214 ENXIO)); 4215 } 4216 4217 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 4218 &features) != 0) { 4219 nvlist_free(label); 4220 spa_load_failed(spa, "invalid label: '%s' missing", 4221 ZPOOL_CONFIG_FEATURES_FOR_READ); 4222 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4223 ENXIO)); 4224 } 4225 4226 /* 4227 * Update our in-core representation with the definitive values 4228 * from the label. 4229 */ 4230 nvlist_free(spa->spa_label_features); 4231 spa->spa_label_features = fnvlist_dup(features); 4232 } 4233 4234 nvlist_free(label); 4235 4236 /* 4237 * Look through entries in the label nvlist's features_for_read. If 4238 * there is a feature listed there which we don't understand then we 4239 * cannot open a pool. 4240 */ 4241 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4242 nvlist_t *unsup_feat; 4243 4244 unsup_feat = fnvlist_alloc(); 4245 4246 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 4247 NULL); nvp != NULL; 4248 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 4249 if (!zfeature_is_supported(nvpair_name(nvp))) { 4250 fnvlist_add_string(unsup_feat, 4251 nvpair_name(nvp), ""); 4252 } 4253 } 4254 4255 if (!nvlist_empty(unsup_feat)) { 4256 fnvlist_add_nvlist(spa->spa_load_info, 4257 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4258 nvlist_free(unsup_feat); 4259 spa_load_failed(spa, "some features are unsupported"); 4260 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4261 ENOTSUP)); 4262 } 4263 4264 nvlist_free(unsup_feat); 4265 } 4266 4267 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 4268 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4269 spa_try_repair(spa, spa->spa_config); 4270 spa_config_exit(spa, SCL_ALL, FTAG); 4271 nvlist_free(spa->spa_config_splitting); 4272 spa->spa_config_splitting = NULL; 4273 } 4274 4275 /* 4276 * Initialize internal SPA structures. 
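 * In particular, record the selected uberblock as spa_ubsync, derive
 * spa_first_txg (the first txg we will sync) and the initial
 * spa_claim_max_txg used when claiming ZIL blocks, and set the verification
 * floor used by extreme rewind.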
4277 */ 4278 spa_ld_select_uberblock_done(spa, ub); 4279 4280 return (0); 4281 } 4282 4283 static int 4284 spa_ld_open_rootbp(spa_t *spa) 4285 { 4286 int error = 0; 4287 vdev_t *rvd = spa->spa_root_vdev; 4288 4289 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 4290 if (error != 0) { 4291 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 4292 "[error=%d]", error); 4293 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4294 } 4295 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 4296 4297 return (0); 4298 } 4299 4300 static int 4301 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 4302 boolean_t reloading) 4303 { 4304 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 4305 nvlist_t *nv, *mos_config, *policy; 4306 int error = 0, copy_error; 4307 uint64_t healthy_tvds, healthy_tvds_mos; 4308 uint64_t mos_config_txg; 4309 4310 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 4311 != 0) 4312 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4313 4314 /* 4315 * If we're assembling a pool from a split, the config provided is 4316 * already trusted so there is nothing to do. 4317 */ 4318 if (type == SPA_IMPORT_ASSEMBLE) 4319 return (0); 4320 4321 healthy_tvds = spa_healthy_core_tvds(spa); 4322 4323 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 4324 != 0) { 4325 spa_load_failed(spa, "unable to retrieve MOS config"); 4326 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4327 } 4328 4329 /* 4330 * If we are doing an open, pool owner wasn't verified yet, thus do 4331 * the verification here. 4332 */ 4333 if (spa->spa_load_state == SPA_LOAD_OPEN) { 4334 error = spa_verify_host(spa, mos_config); 4335 if (error != 0) { 4336 nvlist_free(mos_config); 4337 return (error); 4338 } 4339 } 4340 4341 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 4342 4343 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4344 4345 /* 4346 * Build a new vdev tree from the trusted config 4347 */ 4348 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 4349 if (error != 0) { 4350 nvlist_free(mos_config); 4351 spa_config_exit(spa, SCL_ALL, FTAG); 4352 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 4353 error); 4354 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4355 } 4356 4357 /* 4358 * Vdev paths in the MOS may be obsolete. If the untrusted config was 4359 * obtained by scanning /dev/dsk, then it will have the right vdev 4360 * paths. We update the trusted MOS config with this information. 4361 * We first try to copy the paths with vdev_copy_path_strict, which 4362 * succeeds only when both configs have exactly the same vdev tree. 4363 * If that fails, we fall back to a more flexible method that has a 4364 * best effort policy. 
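 * For example, a disk that moved to a different device node between boots
 * carries its new path in the scanned config; copying it here keeps the
 * regenerated MOS config (and hence the paths userland displays) pointing
 * at a device that actually exists.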
4365 */ 4366 copy_error = vdev_copy_path_strict(rvd, mrvd); 4367 if (copy_error != 0 || spa_load_print_vdev_tree) { 4368 spa_load_note(spa, "provided vdev tree:"); 4369 vdev_dbgmsg_print_tree(rvd, 2); 4370 spa_load_note(spa, "MOS vdev tree:"); 4371 vdev_dbgmsg_print_tree(mrvd, 2); 4372 } 4373 if (copy_error != 0) { 4374 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 4375 "back to vdev_copy_path_relaxed"); 4376 vdev_copy_path_relaxed(rvd, mrvd); 4377 } 4378 4379 vdev_close(rvd); 4380 vdev_free(rvd); 4381 spa->spa_root_vdev = mrvd; 4382 rvd = mrvd; 4383 spa_config_exit(spa, SCL_ALL, FTAG); 4384 4385 /* 4386 * If 'zpool import' used a cached config, then the on-disk hostid and 4387 * hostname may be different to the cached config in ways that should 4388 * prevent import. Userspace can't discover this without a scan, but 4389 * we know, so we add these values to LOAD_INFO so the caller can know 4390 * the difference. 4391 * 4392 * Note that we have to do this before the config is regenerated, 4393 * because the new config will have the hostid and hostname for this 4394 * host, in readiness for import. 4395 */ 4396 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) 4397 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, 4398 fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); 4399 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) 4400 fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, 4401 fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); 4402 4403 /* 4404 * We will use spa_config if we decide to reload the spa or if spa_load 4405 * fails and we rewind. We must thus regenerate the config using the 4406 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 4407 * pass settings on how to load the pool and is not stored in the MOS. 4408 * We copy it over to our new, trusted config. 4409 */ 4410 mos_config_txg = fnvlist_lookup_uint64(mos_config, 4411 ZPOOL_CONFIG_POOL_TXG); 4412 nvlist_free(mos_config); 4413 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 4414 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 4415 &policy) == 0) 4416 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 4417 spa_config_set(spa, mos_config); 4418 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 4419 4420 /* 4421 * Now that we got the config from the MOS, we should be more strict 4422 * in checking blkptrs and can make assumptions about the consistency 4423 * of the vdev tree. spa_trust_config must be set to true before opening 4424 * vdevs in order for them to be writeable. 4425 */ 4426 spa->spa_trust_config = B_TRUE; 4427 4428 /* 4429 * Open and validate the new vdev tree 4430 */ 4431 error = spa_ld_open_vdevs(spa); 4432 if (error != 0) 4433 return (error); 4434 4435 error = spa_ld_validate_vdevs(spa); 4436 if (error != 0) 4437 return (error); 4438 4439 if (copy_error != 0 || spa_load_print_vdev_tree) { 4440 spa_load_note(spa, "final vdev tree:"); 4441 vdev_dbgmsg_print_tree(rvd, 2); 4442 } 4443 4444 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 4445 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 4446 /* 4447 * Sanity check to make sure that we are indeed loading the 4448 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 4449 * in the config provided and they happened to be the only ones 4450 * to have the latest uberblock, we could involuntarily perform 4451 * an extreme rewind. 
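 * If that situation is detected on a normal load we return EAGAIN so the
 * caller reloads the pool using the MOS config itself; if the config already
 * came from the MOS, it is treated as corruption instead.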
4452 */ 4453 healthy_tvds_mos = spa_healthy_core_tvds(spa); 4454 if (healthy_tvds_mos - healthy_tvds >= 4455 SPA_SYNC_MIN_VDEVS) { 4456 spa_load_note(spa, "config provided misses too many " 4457 "top-level vdevs compared to MOS (%lld vs %lld). ", 4458 (u_longlong_t)healthy_tvds, 4459 (u_longlong_t)healthy_tvds_mos); 4460 spa_load_note(spa, "vdev tree:"); 4461 vdev_dbgmsg_print_tree(rvd, 2); 4462 if (reloading) { 4463 spa_load_failed(spa, "config was already " 4464 "provided from MOS. Aborting."); 4465 return (spa_vdev_err(rvd, 4466 VDEV_AUX_CORRUPT_DATA, EIO)); 4467 } 4468 spa_load_note(spa, "spa must be reloaded using MOS " 4469 "config"); 4470 return (SET_ERROR(EAGAIN)); 4471 } 4472 } 4473 4474 error = spa_check_for_missing_logs(spa); 4475 if (error != 0) 4476 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 4477 4478 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 4479 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 4480 "guid sum (%llu != %llu)", 4481 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4482 (u_longlong_t)rvd->vdev_guid_sum); 4483 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4484 ENXIO)); 4485 } 4486 4487 return (0); 4488 } 4489 4490 static int 4491 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4492 { 4493 int error = 0; 4494 vdev_t *rvd = spa->spa_root_vdev; 4495 4496 /* 4497 * Everything that we read before spa_remove_init() must be stored 4498 * on concreted vdevs. Therefore we do this as early as possible. 4499 */ 4500 error = spa_remove_init(spa); 4501 if (error != 0) { 4502 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4503 error); 4504 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4505 } 4506 4507 /* 4508 * Retrieve information needed to condense indirect vdev mappings. 
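 * That is, the state of any partially-completed condense recorded in the
 * MOS, so the operation can be resumed once the pool is writeable.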
4509 */ 4510 error = spa_condense_init(spa); 4511 if (error != 0) { 4512 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4513 error); 4514 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4515 } 4516 4517 return (0); 4518 } 4519 4520 static int 4521 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4522 { 4523 int error = 0; 4524 vdev_t *rvd = spa->spa_root_vdev; 4525 4526 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4527 boolean_t missing_feat_read = B_FALSE; 4528 nvlist_t *unsup_feat, *enabled_feat; 4529 4530 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4531 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4532 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4533 } 4534 4535 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4536 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4537 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4538 } 4539 4540 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4541 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4542 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4543 } 4544 4545 enabled_feat = fnvlist_alloc(); 4546 unsup_feat = fnvlist_alloc(); 4547 4548 if (!spa_features_check(spa, B_FALSE, 4549 unsup_feat, enabled_feat)) 4550 missing_feat_read = B_TRUE; 4551 4552 if (spa_writeable(spa) || 4553 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4554 if (!spa_features_check(spa, B_TRUE, 4555 unsup_feat, enabled_feat)) { 4556 *missing_feat_writep = B_TRUE; 4557 } 4558 } 4559 4560 fnvlist_add_nvlist(spa->spa_load_info, 4561 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4562 4563 if (!nvlist_empty(unsup_feat)) { 4564 fnvlist_add_nvlist(spa->spa_load_info, 4565 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4566 } 4567 4568 fnvlist_free(enabled_feat); 4569 fnvlist_free(unsup_feat); 4570 4571 if (!missing_feat_read) { 4572 fnvlist_add_boolean(spa->spa_load_info, 4573 ZPOOL_CONFIG_CAN_RDONLY); 4574 } 4575 4576 /* 4577 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4578 * twofold: to determine whether the pool is available for 4579 * import in read-write mode and (if it is not) whether the 4580 * pool is available for import in read-only mode. If the pool 4581 * is available for import in read-write mode, it is displayed 4582 * as available in userland; if it is not available for import 4583 * in read-only mode, it is displayed as unavailable in 4584 * userland. If the pool is available for import in read-only 4585 * mode but not read-write mode, it is displayed as unavailable 4586 * in userland with a special note that the pool is actually 4587 * available for open in read-only mode. 4588 * 4589 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4590 * missing a feature for write, we must first determine whether 4591 * the pool can be opened read-only before returning to 4592 * userland in order to know whether to display the 4593 * abovementioned note. 4594 */ 4595 if (missing_feat_read || (*missing_feat_writep && 4596 spa_writeable(spa))) { 4597 spa_load_failed(spa, "pool uses unsupported features"); 4598 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4599 ENOTSUP)); 4600 } 4601 4602 /* 4603 * Load refcounts for ZFS features from disk into an in-memory 4604 * cache during SPA initialization. 
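 * Features with no refcount object on disk are simply cached as
 * SPA_FEATURE_DISABLED; any other lookup error is treated as MOS corruption.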
4605 */ 4606 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4607 uint64_t refcount; 4608 4609 error = feature_get_refcount_from_disk(spa, 4610 &spa_feature_table[i], &refcount); 4611 if (error == 0) { 4612 spa->spa_feat_refcount_cache[i] = refcount; 4613 } else if (error == ENOTSUP) { 4614 spa->spa_feat_refcount_cache[i] = 4615 SPA_FEATURE_DISABLED; 4616 } else { 4617 spa_load_failed(spa, "error getting refcount " 4618 "for feature %s [error=%d]", 4619 spa_feature_table[i].fi_guid, error); 4620 return (spa_vdev_err(rvd, 4621 VDEV_AUX_CORRUPT_DATA, EIO)); 4622 } 4623 } 4624 } 4625 4626 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4627 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4628 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4629 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4630 } 4631 4632 /* 4633 * Encryption was added before bookmark_v2, even though bookmark_v2 4634 * is now a dependency. If this pool has encryption enabled without 4635 * bookmark_v2, trigger an errata message. 4636 */ 4637 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4638 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4639 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4640 } 4641 4642 return (0); 4643 } 4644 4645 static int 4646 spa_ld_load_special_directories(spa_t *spa) 4647 { 4648 int error = 0; 4649 vdev_t *rvd = spa->spa_root_vdev; 4650 4651 spa->spa_is_initializing = B_TRUE; 4652 error = dsl_pool_open(spa->spa_dsl_pool); 4653 spa->spa_is_initializing = B_FALSE; 4654 if (error != 0) { 4655 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4656 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4657 } 4658 4659 return (0); 4660 } 4661 4662 static int 4663 spa_ld_get_props(spa_t *spa) 4664 { 4665 int error = 0; 4666 uint64_t obj; 4667 vdev_t *rvd = spa->spa_root_vdev; 4668 4669 /* Grab the checksum salt from the MOS. */ 4670 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4671 DMU_POOL_CHECKSUM_SALT, 1, 4672 sizeof (spa->spa_cksum_salt.zcs_bytes), 4673 spa->spa_cksum_salt.zcs_bytes); 4674 if (error == ENOENT) { 4675 /* Generate a new salt for subsequent use */ 4676 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4677 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4678 } else if (error != 0) { 4679 spa_load_failed(spa, "unable to retrieve checksum salt from " 4680 "MOS [error=%d]", error); 4681 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4682 } 4683 4684 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4685 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4686 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4687 if (error != 0) { 4688 spa_load_failed(spa, "error opening deferred-frees bpobj " 4689 "[error=%d]", error); 4690 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4691 } 4692 4693 /* 4694 * Load the bit that tells us to use the new accounting function 4695 * (raid-z deflation). If we have an older pool, this will not 4696 * be present. 4697 */ 4698 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4699 if (error != 0 && error != ENOENT) 4700 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4701 4702 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4703 &spa->spa_creation_version, B_FALSE); 4704 if (error != 0 && error != ENOENT) 4705 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4706 4707 /* 4708 * Load the persistent error log. If we have an older pool, this will 4709 * not be present. 
4710 */ 4711 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4712 B_FALSE); 4713 if (error != 0 && error != ENOENT) 4714 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4715 4716 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4717 &spa->spa_errlog_scrub, B_FALSE); 4718 if (error != 0 && error != ENOENT) 4719 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4720 4721 /* 4722 * Load the livelist deletion field. If a livelist is queued for 4723 * deletion, indicate that in the spa 4724 */ 4725 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4726 &spa->spa_livelists_to_delete, B_FALSE); 4727 if (error != 0 && error != ENOENT) 4728 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4729 4730 /* 4731 * Load the history object. If we have an older pool, this 4732 * will not be present. 4733 */ 4734 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4735 if (error != 0 && error != ENOENT) 4736 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4737 4738 /* 4739 * Load the per-vdev ZAP map. If we have an older pool, this will not 4740 * be present; in this case, defer its creation to a later time to 4741 * avoid dirtying the MOS this early / out of sync context. See 4742 * spa_sync_config_object. 4743 */ 4744 4745 /* The sentinel is only available in the MOS config. */ 4746 nvlist_t *mos_config; 4747 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4748 spa_load_failed(spa, "unable to retrieve MOS config"); 4749 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4750 } 4751 4752 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4753 &spa->spa_all_vdev_zaps, B_FALSE); 4754 4755 if (error == ENOENT) { 4756 VERIFY(!nvlist_exists(mos_config, 4757 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4758 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4759 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4760 } else if (error != 0) { 4761 nvlist_free(mos_config); 4762 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4763 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4764 /* 4765 * An older version of ZFS overwrote the sentinel value, so 4766 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4767 * destruction to later; see spa_sync_config_object. 4768 */ 4769 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4770 /* 4771 * We're assuming that no vdevs have had their ZAPs created 4772 * before this. Better be sure of it. 
4773 */ 4774 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4775 } 4776 nvlist_free(mos_config); 4777 4778 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4779 4780 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4781 B_FALSE); 4782 if (error && error != ENOENT) 4783 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4784 4785 if (error == 0) { 4786 uint64_t autoreplace = 0; 4787 4788 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4789 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4790 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4791 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4792 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4793 spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA, 4794 &spa->spa_dedup_table_quota); 4795 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4796 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4797 spa->spa_autoreplace = (autoreplace != 0); 4798 } 4799 4800 /* 4801 * If we are importing a pool with missing top-level vdevs, 4802 * we enforce that the pool doesn't panic or get suspended on 4803 * error since the likelihood of missing data is extremely high. 4804 */ 4805 if (spa->spa_missing_tvds > 0 && 4806 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4807 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4808 spa_load_note(spa, "forcing failmode to 'continue' " 4809 "as some top level vdevs are missing"); 4810 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4811 } 4812 4813 return (0); 4814 } 4815 4816 static int 4817 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4818 { 4819 int error = 0; 4820 vdev_t *rvd = spa->spa_root_vdev; 4821 4822 /* 4823 * If we're assembling the pool from the split-off vdevs of 4824 * an existing pool, we don't want to attach the spares & cache 4825 * devices. 4826 */ 4827 4828 /* 4829 * Load any hot spares for this pool. 4830 */ 4831 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4832 B_FALSE); 4833 if (error != 0 && error != ENOENT) 4834 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4835 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4836 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4837 if (load_nvlist(spa, spa->spa_spares.sav_object, 4838 &spa->spa_spares.sav_config) != 0) { 4839 spa_load_failed(spa, "error loading spares nvlist"); 4840 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4841 } 4842 4843 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4844 spa_load_spares(spa); 4845 spa_config_exit(spa, SCL_ALL, FTAG); 4846 } else if (error == 0) { 4847 spa->spa_spares.sav_sync = B_TRUE; 4848 } 4849 4850 /* 4851 * Load any level 2 ARC devices for this pool. 
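 * This mirrors the spares handling above: the cache-device nvlist lives in
 * its own MOS object, and when assembling from a split we skip attaching
 * the devices and only mark the state to be synced out later.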
4852 */ 4853 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4854 &spa->spa_l2cache.sav_object, B_FALSE); 4855 if (error != 0 && error != ENOENT) 4856 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4857 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4858 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4859 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4860 &spa->spa_l2cache.sav_config) != 0) { 4861 spa_load_failed(spa, "error loading l2cache nvlist"); 4862 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4863 } 4864 4865 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4866 spa_load_l2cache(spa); 4867 spa_config_exit(spa, SCL_ALL, FTAG); 4868 } else if (error == 0) { 4869 spa->spa_l2cache.sav_sync = B_TRUE; 4870 } 4871 4872 return (0); 4873 } 4874 4875 static int 4876 spa_ld_load_vdev_metadata(spa_t *spa) 4877 { 4878 int error = 0; 4879 vdev_t *rvd = spa->spa_root_vdev; 4880 4881 /* 4882 * If the 'multihost' property is set, then never allow a pool to 4883 * be imported when the system hostid is zero. The exception to 4884 * this rule is zdb which is always allowed to access pools. 4885 */ 4886 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4887 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4888 fnvlist_add_uint64(spa->spa_load_info, 4889 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4890 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4891 } 4892 4893 /* 4894 * If the 'autoreplace' property is set, then post a resource notifying 4895 * the ZFS DE that it should not issue any faults for unopenable 4896 * devices. We also iterate over the vdevs, and post a sysevent for any 4897 * unopenable vdevs so that the normal autoreplace handler can take 4898 * over. 4899 */ 4900 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4901 spa_check_removed(spa->spa_root_vdev); 4902 /* 4903 * For the import case, this is done in spa_import(), because 4904 * at this point we're using the spare definitions from 4905 * the MOS config, not necessarily from the userland config. 4906 */ 4907 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4908 spa_aux_check_removed(&spa->spa_spares); 4909 spa_aux_check_removed(&spa->spa_l2cache); 4910 } 4911 } 4912 4913 /* 4914 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4915 */ 4916 error = vdev_load(rvd); 4917 if (error != 0) { 4918 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4919 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4920 } 4921 4922 error = spa_ld_log_spacemaps(spa); 4923 if (error != 0) { 4924 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4925 error); 4926 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4927 } 4928 4929 /* 4930 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
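 * vdev_dtl_reassess() recomputes the aggregate DTLs of interior vdevs from
 * their children, so that later checks such as vdev_resilver_needed() see
 * an up-to-date picture.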
4931 */ 4932 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4933 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4934 spa_config_exit(spa, SCL_ALL, FTAG); 4935 4936 return (0); 4937 } 4938 4939 static int 4940 spa_ld_load_dedup_tables(spa_t *spa) 4941 { 4942 int error = 0; 4943 vdev_t *rvd = spa->spa_root_vdev; 4944 4945 error = ddt_load(spa); 4946 if (error != 0) { 4947 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4948 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4949 } 4950 4951 return (0); 4952 } 4953 4954 static int 4955 spa_ld_load_brt(spa_t *spa) 4956 { 4957 int error = 0; 4958 vdev_t *rvd = spa->spa_root_vdev; 4959 4960 error = brt_load(spa); 4961 if (error != 0) { 4962 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4963 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4964 } 4965 4966 return (0); 4967 } 4968 4969 static int 4970 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4971 { 4972 vdev_t *rvd = spa->spa_root_vdev; 4973 4974 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4975 boolean_t missing = spa_check_logs(spa); 4976 if (missing) { 4977 if (spa->spa_missing_tvds != 0) { 4978 spa_load_note(spa, "spa_check_logs failed " 4979 "so dropping the logs"); 4980 } else { 4981 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4982 spa_load_failed(spa, "spa_check_logs failed"); 4983 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4984 ENXIO)); 4985 } 4986 } 4987 } 4988 4989 return (0); 4990 } 4991 4992 static int 4993 spa_ld_verify_pool_data(spa_t *spa) 4994 { 4995 int error = 0; 4996 vdev_t *rvd = spa->spa_root_vdev; 4997 4998 /* 4999 * We've successfully opened the pool, verify that we're ready 5000 * to start pushing transactions. 5001 */ 5002 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 5003 error = spa_load_verify(spa); 5004 if (error != 0) { 5005 spa_load_failed(spa, "spa_load_verify failed " 5006 "[error=%d]", error); 5007 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 5008 error)); 5009 } 5010 } 5011 5012 return (0); 5013 } 5014 5015 static void 5016 spa_ld_claim_log_blocks(spa_t *spa) 5017 { 5018 dmu_tx_t *tx; 5019 dsl_pool_t *dp = spa_get_dsl(spa); 5020 5021 /* 5022 * Claim log blocks that haven't been committed yet. 5023 * This must all happen in a single txg. 5024 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 5025 * invoked from zil_claim_log_block()'s i/o done callback. 5026 * Price of rollback is that we abandon the log. 5027 */ 5028 spa->spa_claiming = B_TRUE; 5029 5030 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 5031 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 5032 zil_claim, tx, DS_FIND_CHILDREN); 5033 dmu_tx_commit(tx); 5034 5035 spa->spa_claiming = B_FALSE; 5036 5037 spa_set_log_state(spa, SPA_LOG_GOOD); 5038 } 5039 5040 static void 5041 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 5042 boolean_t update_config_cache) 5043 { 5044 vdev_t *rvd = spa->spa_root_vdev; 5045 int need_update = B_FALSE; 5046 5047 /* 5048 * If the config cache is stale, or we have uninitialized 5049 * metaslabs (see spa_vdev_add()), then update the config. 5050 * 5051 * If this is a verbatim import, trust the current 5052 * in-core spa_config and update the disk labels. 
5053 */ 5054 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 5055 spa->spa_load_state == SPA_LOAD_IMPORT || 5056 spa->spa_load_state == SPA_LOAD_RECOVER || 5057 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 5058 need_update = B_TRUE; 5059 5060 for (int c = 0; c < rvd->vdev_children; c++) 5061 if (rvd->vdev_child[c]->vdev_ms_array == 0) 5062 need_update = B_TRUE; 5063 5064 /* 5065 * Update the config cache asynchronously in case we're the 5066 * root pool, in which case the config cache isn't writable yet. 5067 */ 5068 if (need_update) 5069 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 5070 } 5071 5072 static void 5073 spa_ld_prepare_for_reload(spa_t *spa) 5074 { 5075 spa_mode_t mode = spa->spa_mode; 5076 int async_suspended = spa->spa_async_suspended; 5077 5078 spa_unload(spa); 5079 spa_deactivate(spa); 5080 spa_activate(spa, mode); 5081 5082 /* 5083 * We save the value of spa_async_suspended as it gets reset to 0 by 5084 * spa_unload(). We want to restore it back to the original value before 5085 * returning as we might be calling spa_async_resume() later. 5086 */ 5087 spa->spa_async_suspended = async_suspended; 5088 } 5089 5090 static int 5091 spa_ld_read_checkpoint_txg(spa_t *spa) 5092 { 5093 uberblock_t checkpoint; 5094 int error = 0; 5095 5096 ASSERT0(spa->spa_checkpoint_txg); 5097 ASSERT(MUTEX_HELD(&spa_namespace_lock) || 5098 spa->spa_load_thread == curthread); 5099 5100 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5101 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5102 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5103 5104 if (error == ENOENT) 5105 return (0); 5106 5107 if (error != 0) 5108 return (error); 5109 5110 ASSERT3U(checkpoint.ub_txg, !=, 0); 5111 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 5112 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 5113 spa->spa_checkpoint_txg = checkpoint.ub_txg; 5114 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 5115 5116 return (0); 5117 } 5118 5119 static int 5120 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 5121 { 5122 int error = 0; 5123 5124 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5125 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5126 5127 /* 5128 * Never trust the config that is provided unless we are assembling 5129 * a pool following a split. 5130 * This means don't trust blkptrs and the vdev tree in general. This 5131 * also effectively puts the spa in read-only mode since 5132 * spa_writeable() checks for spa_trust_config to be true. 5133 * We will later load a trusted config from the MOS. 5134 */ 5135 if (type != SPA_IMPORT_ASSEMBLE) 5136 spa->spa_trust_config = B_FALSE; 5137 5138 /* 5139 * Parse the config provided to create a vdev tree. 5140 */ 5141 error = spa_ld_parse_config(spa, type); 5142 if (error != 0) 5143 return (error); 5144 5145 spa_import_progress_add(spa); 5146 5147 /* 5148 * Now that we have the vdev tree, try to open each vdev. This involves 5149 * opening the underlying physical device, retrieving its geometry and 5150 * probing the vdev with a dummy I/O. The state of each vdev will be set 5151 * based on the success of those operations. After this we'll be ready 5152 * to read from the vdevs. 5153 */ 5154 error = spa_ld_open_vdevs(spa); 5155 if (error != 0) 5156 return (error); 5157 5158 /* 5159 * Read the label of each vdev and make sure that the GUIDs stored 5160 * there match the GUIDs in the config provided. 
5161 * If we're assembling a new pool that's been split off from an 5162 * existing pool, the labels haven't yet been updated so we skip 5163 * validation for now. 5164 */ 5165 if (type != SPA_IMPORT_ASSEMBLE) { 5166 error = spa_ld_validate_vdevs(spa); 5167 if (error != 0) 5168 return (error); 5169 } 5170 5171 /* 5172 * Read all vdev labels to find the best uberblock (i.e. latest, 5173 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 5174 * get the list of features required to read blkptrs in the MOS from 5175 * the vdev label with the best uberblock and verify that our version 5176 * of zfs supports them all. 5177 */ 5178 error = spa_ld_select_uberblock(spa, type); 5179 if (error != 0) 5180 return (error); 5181 5182 /* 5183 * Pass that uberblock to the dsl_pool layer which will open the root 5184 * blkptr. This blkptr points to the latest version of the MOS and will 5185 * allow us to read its contents. 5186 */ 5187 error = spa_ld_open_rootbp(spa); 5188 if (error != 0) 5189 return (error); 5190 5191 return (0); 5192 } 5193 5194 static int 5195 spa_ld_checkpoint_rewind(spa_t *spa) 5196 { 5197 uberblock_t checkpoint; 5198 int error = 0; 5199 5200 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5201 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5202 5203 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5204 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5205 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5206 5207 if (error != 0) { 5208 spa_load_failed(spa, "unable to retrieve checkpointed " 5209 "uberblock from the MOS config [error=%d]", error); 5210 5211 if (error == ENOENT) 5212 error = ZFS_ERR_NO_CHECKPOINT; 5213 5214 return (error); 5215 } 5216 5217 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 5218 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 5219 5220 /* 5221 * We need to update the txg and timestamp of the checkpointed 5222 * uberblock to be higher than the latest one. This ensures that 5223 * the checkpointed uberblock is selected if we were to close and 5224 * reopen the pool right after we've written it in the vdev labels. 5225 * (also see block comment in vdev_uberblock_compare) 5226 */ 5227 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 5228 checkpoint.ub_timestamp = gethrestime_sec(); 5229 5230 /* 5231 * Set current uberblock to be the checkpointed uberblock. 5232 */ 5233 spa->spa_uberblock = checkpoint; 5234 5235 /* 5236 * If we are doing a normal rewind, then the pool is open for 5237 * writing and we sync the "updated" checkpointed uberblock to 5238 * disk. Once this is done, we've basically rewound the whole 5239 * pool and there is no way back. 5240 * 5241 * There are cases when we don't want to attempt and sync the 5242 * checkpointed uberblock to disk because we are opening a 5243 * pool as read-only. Specifically, verifying the checkpointed 5244 * state with zdb, and importing the checkpointed state to get 5245 * a "preview" of its content. 
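 * (Typically 'zdb -k' or a read-only 'zpool import --rewind-to-checkpoint';
 * both must leave the on-disk labels untouched.)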
5246 */ 5247 if (spa_writeable(spa)) { 5248 vdev_t *rvd = spa->spa_root_vdev; 5249 5250 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5251 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 5252 int svdcount = 0; 5253 int children = rvd->vdev_children; 5254 int c0 = random_in_range(children); 5255 5256 for (int c = 0; c < children; c++) { 5257 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 5258 5259 /* Stop when revisiting the first vdev */ 5260 if (c > 0 && svd[0] == vd) 5261 break; 5262 5263 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 5264 !vdev_is_concrete(vd)) 5265 continue; 5266 5267 svd[svdcount++] = vd; 5268 if (svdcount == SPA_SYNC_MIN_VDEVS) 5269 break; 5270 } 5271 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 5272 if (error == 0) 5273 spa->spa_last_synced_guid = rvd->vdev_guid; 5274 spa_config_exit(spa, SCL_ALL, FTAG); 5275 5276 if (error != 0) { 5277 spa_load_failed(spa, "failed to write checkpointed " 5278 "uberblock to the vdev labels [error=%d]", error); 5279 return (error); 5280 } 5281 } 5282 5283 return (0); 5284 } 5285 5286 static int 5287 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 5288 boolean_t *update_config_cache) 5289 { 5290 int error; 5291 5292 /* 5293 * Parse the config for pool, open and validate vdevs, 5294 * select an uberblock, and use that uberblock to open 5295 * the MOS. 5296 */ 5297 error = spa_ld_mos_init(spa, type); 5298 if (error != 0) 5299 return (error); 5300 5301 /* 5302 * Retrieve the trusted config stored in the MOS and use it to create 5303 * a new, exact version of the vdev tree, then reopen all vdevs. 5304 */ 5305 error = spa_ld_trusted_config(spa, type, B_FALSE); 5306 if (error == EAGAIN) { 5307 if (update_config_cache != NULL) 5308 *update_config_cache = B_TRUE; 5309 5310 /* 5311 * Redo the loading process with the trusted config if it is 5312 * too different from the untrusted config. 5313 */ 5314 spa_ld_prepare_for_reload(spa); 5315 spa_load_note(spa, "RELOADING"); 5316 error = spa_ld_mos_init(spa, type); 5317 if (error != 0) 5318 return (error); 5319 5320 error = spa_ld_trusted_config(spa, type, B_TRUE); 5321 if (error != 0) 5322 return (error); 5323 5324 } else if (error != 0) { 5325 return (error); 5326 } 5327 5328 return (0); 5329 } 5330 5331 /* 5332 * Load an existing storage pool, using the config provided. This config 5333 * describes which vdevs are part of the pool and is later validated against 5334 * partial configs present in each vdev's label and an entire copy of the 5335 * config stored in the MOS. 5336 */ 5337 static int 5338 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 5339 { 5340 int error = 0; 5341 boolean_t missing_feat_write = B_FALSE; 5342 boolean_t checkpoint_rewind = 5343 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5344 boolean_t update_config_cache = B_FALSE; 5345 hrtime_t load_start = gethrtime(); 5346 5347 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5348 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5349 5350 spa_load_note(spa, "LOADING"); 5351 5352 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 5353 if (error != 0) 5354 return (error); 5355 5356 /* 5357 * If we are rewinding to the checkpoint then we need to repeat 5358 * everything we've done so far in this function but this time 5359 * selecting the checkpointed uberblock and using that to open 5360 * the MOS. 5361 */ 5362 if (checkpoint_rewind) { 5363 /* 5364 * If we are rewinding to the checkpoint update config cache 5365 * anyway. 
5366 */ 5367 update_config_cache = B_TRUE; 5368 5369 /* 5370 * Extract the checkpointed uberblock from the current MOS 5371 * and use this as the pool's uberblock from now on. If the 5372 * pool is imported as writeable we also write the checkpoint 5373 * uberblock to the labels, making the rewind permanent. 5374 */ 5375 error = spa_ld_checkpoint_rewind(spa); 5376 if (error != 0) 5377 return (error); 5378 5379 /* 5380 * Redo the loading process again with the 5381 * checkpointed uberblock. 5382 */ 5383 spa_ld_prepare_for_reload(spa); 5384 spa_load_note(spa, "LOADING checkpointed uberblock"); 5385 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 5386 if (error != 0) 5387 return (error); 5388 } 5389 5390 /* 5391 * Drop the namespace lock for the rest of the function. 5392 */ 5393 spa->spa_load_thread = curthread; 5394 mutex_exit(&spa_namespace_lock); 5395 5396 /* 5397 * Retrieve the checkpoint txg if the pool has a checkpoint. 5398 */ 5399 spa_import_progress_set_notes(spa, "Loading checkpoint txg"); 5400 error = spa_ld_read_checkpoint_txg(spa); 5401 if (error != 0) 5402 goto fail; 5403 5404 /* 5405 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 5406 * from the pool and their contents were re-mapped to other vdevs. Note 5407 * that everything that we read before this step must have been 5408 * rewritten on concrete vdevs after the last device removal was 5409 * initiated. Otherwise we could be reading from indirect vdevs before 5410 * we have loaded their mappings. 5411 */ 5412 spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); 5413 error = spa_ld_open_indirect_vdev_metadata(spa); 5414 if (error != 0) 5415 goto fail; 5416 5417 /* 5418 * Retrieve the full list of active features from the MOS and check if 5419 * they are all supported. 5420 */ 5421 spa_import_progress_set_notes(spa, "Checking feature flags"); 5422 error = spa_ld_check_features(spa, &missing_feat_write); 5423 if (error != 0) 5424 goto fail; 5425 5426 /* 5427 * Load several special directories from the MOS needed by the dsl_pool 5428 * layer. 5429 */ 5430 spa_import_progress_set_notes(spa, "Loading special MOS directories"); 5431 error = spa_ld_load_special_directories(spa); 5432 if (error != 0) 5433 goto fail; 5434 5435 /* 5436 * Retrieve pool properties from the MOS. 5437 */ 5438 spa_import_progress_set_notes(spa, "Loading properties"); 5439 error = spa_ld_get_props(spa); 5440 if (error != 0) 5441 goto fail; 5442 5443 /* 5444 * Retrieve the list of auxiliary devices - cache devices and spares - 5445 * and open them. 5446 */ 5447 spa_import_progress_set_notes(spa, "Loading AUX vdevs"); 5448 error = spa_ld_open_aux_vdevs(spa, type); 5449 if (error != 0) 5450 goto fail; 5451 5452 /* 5453 * Load the metadata for all vdevs. Also check if unopenable devices 5454 * should be autoreplaced. 5455 */ 5456 spa_import_progress_set_notes(spa, "Loading vdev metadata"); 5457 error = spa_ld_load_vdev_metadata(spa); 5458 if (error != 0) 5459 goto fail; 5460 5461 spa_import_progress_set_notes(spa, "Loading dedup tables"); 5462 error = spa_ld_load_dedup_tables(spa); 5463 if (error != 0) 5464 goto fail; 5465 5466 spa_import_progress_set_notes(spa, "Loading BRT"); 5467 error = spa_ld_load_brt(spa); 5468 if (error != 0) 5469 goto fail; 5470 5471 /* 5472 * Verify the logs now to make sure we don't have any unexpected errors 5473 * when we claim log blocks later. 
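 * If spa_check_logs() reports a problem, the load fails with
 * VDEV_AUX_BAD_LOG unless some top-level vdevs are already known to be
 * missing, in which case the logs are dropped instead.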
5474 */ 5475 spa_import_progress_set_notes(spa, "Verifying Log Devices"); 5476 error = spa_ld_verify_logs(spa, type, ereport); 5477 if (error != 0) 5478 goto fail; 5479 5480 if (missing_feat_write) { 5481 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 5482 5483 /* 5484 * At this point, we know that we can open the pool in 5485 * read-only mode but not read-write mode. We now have enough 5486 * information and can return to userland. 5487 */ 5488 error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 5489 ENOTSUP); 5490 goto fail; 5491 } 5492 5493 /* 5494 * Traverse the last txgs to make sure the pool was left off in a safe 5495 * state. When performing an extreme rewind, we verify the whole pool, 5496 * which can take a very long time. 5497 */ 5498 spa_import_progress_set_notes(spa, "Verifying pool data"); 5499 error = spa_ld_verify_pool_data(spa); 5500 if (error != 0) 5501 goto fail; 5502 5503 /* 5504 * Calculate the deflated space for the pool. This must be done before 5505 * we write anything to the pool because we'd need to update the space 5506 * accounting using the deflated sizes. 5507 */ 5508 spa_import_progress_set_notes(spa, "Calculating deflated space"); 5509 spa_update_dspace(spa); 5510 5511 /* 5512 * We have now retrieved all the information we needed to open the 5513 * pool. If we are importing the pool in read-write mode, a few 5514 * additional steps must be performed to finish the import. 5515 */ 5516 spa_import_progress_set_notes(spa, "Starting import"); 5517 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5518 spa->spa_load_max_txg == UINT64_MAX)) { 5519 uint64_t config_cache_txg = spa->spa_config_txg; 5520 5521 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5522 5523 /* 5524 * Before we do any zio_write's, complete the raidz expansion 5525 * scratch space copying, if necessary. 5526 */ 5527 if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) 5528 vdev_raidz_reflow_copy_scratch(spa); 5529 5530 /* 5531 * In case of a checkpoint rewind, log the original txg 5532 * of the checkpointed uberblock. 5533 */ 5534 if (checkpoint_rewind) { 5535 spa_history_log_internal(spa, "checkpoint rewind", 5536 NULL, "rewound state to txg=%llu", 5537 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5538 } 5539 5540 spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); 5541 /* 5542 * Traverse the ZIL and claim all blocks. 5543 */ 5544 spa_ld_claim_log_blocks(spa); 5545 5546 /* 5547 * Kick-off the syncing thread. 5548 */ 5549 spa->spa_sync_on = B_TRUE; 5550 txg_sync_start(spa->spa_dsl_pool); 5551 mmp_thread_start(spa); 5552 5553 /* 5554 * Wait for all claims to sync. We sync up to the highest 5555 * claimed log block birth time so that claimed log blocks 5556 * don't appear to be from the future. spa_claim_max_txg 5557 * will have been set for us by ZIL traversal operations 5558 * performed above. 5559 */ 5560 spa_import_progress_set_notes(spa, "Syncing ZIL claims"); 5561 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5562 5563 /* 5564 * Check if we need to request an update of the config. On the 5565 * next sync, we would update the config stored in vdev labels 5566 * and the cachefile (by default /etc/zfs/zpool.cache). 5567 */ 5568 spa_import_progress_set_notes(spa, "Updating configs"); 5569 spa_ld_check_for_config_update(spa, config_cache_txg, 5570 update_config_cache); 5571 5572 /* 5573 * Check if a rebuild was in progress and if so resume it. 5574 * Then check all DTLs to see if anything needs resilvering. 
5575 * The resilver will be deferred if a rebuild was started. 5576 */ 5577 spa_import_progress_set_notes(spa, "Starting resilvers"); 5578 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5579 vdev_rebuild_restart(spa); 5580 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5581 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5582 spa_async_request(spa, SPA_ASYNC_RESILVER); 5583 } 5584 5585 /* 5586 * Log the fact that we booted up (so that we can detect if 5587 * we rebooted in the middle of an operation). 5588 */ 5589 spa_history_log_version(spa, "open", NULL); 5590 5591 spa_import_progress_set_notes(spa, 5592 "Restarting device removals"); 5593 spa_restart_removal(spa); 5594 spa_spawn_aux_threads(spa); 5595 5596 /* 5597 * Delete any inconsistent datasets. 5598 * 5599 * Note: 5600 * Since we may be issuing deletes for clones here, 5601 * we make sure to do so after we've spawned all the 5602 * auxiliary threads above (from which the livelist 5603 * deletion zthr is part of). 5604 */ 5605 spa_import_progress_set_notes(spa, 5606 "Cleaning up inconsistent objsets"); 5607 (void) dmu_objset_find(spa_name(spa), 5608 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5609 5610 /* 5611 * Clean up any stale temporary dataset userrefs. 5612 */ 5613 spa_import_progress_set_notes(spa, 5614 "Cleaning up temporary userrefs"); 5615 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5616 5617 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5618 spa_import_progress_set_notes(spa, "Restarting initialize"); 5619 vdev_initialize_restart(spa->spa_root_vdev); 5620 spa_import_progress_set_notes(spa, "Restarting TRIM"); 5621 vdev_trim_restart(spa->spa_root_vdev); 5622 vdev_autotrim_restart(spa); 5623 spa_config_exit(spa, SCL_CONFIG, FTAG); 5624 spa_import_progress_set_notes(spa, "Finished importing"); 5625 } 5626 zio_handle_import_delay(spa, gethrtime() - load_start); 5627 5628 spa_import_progress_remove(spa_guid(spa)); 5629 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5630 5631 spa_load_note(spa, "LOADED"); 5632 fail: 5633 mutex_enter(&spa_namespace_lock); 5634 spa->spa_load_thread = NULL; 5635 cv_broadcast(&spa_namespace_cv); 5636 5637 return (error); 5638 5639 } 5640 5641 static int 5642 spa_load_retry(spa_t *spa, spa_load_state_t state) 5643 { 5644 spa_mode_t mode = spa->spa_mode; 5645 5646 spa_unload(spa); 5647 spa_deactivate(spa); 5648 5649 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5650 5651 spa_activate(spa, mode); 5652 spa_async_suspend(spa); 5653 5654 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5655 (u_longlong_t)spa->spa_load_max_txg); 5656 5657 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5658 } 5659 5660 /* 5661 * If spa_load() fails this function will try loading prior txg's. If 5662 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5663 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5664 * function will not rewind the pool and will return the same error as 5665 * spa_load(). 
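 * Rewind candidates are normally limited to the TXG_DEFER_SIZE txgs before
 * the most recently synced uberblock; ZPOOL_EXTREME_REWIND widens the search
 * back to TXG_INITIAL and enables whole-pool verification
 * (spa_extreme_rewind) for txgs outside that safe window.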
5666 */ 5667 static int 5668 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5669 int rewind_flags) 5670 { 5671 nvlist_t *loadinfo = NULL; 5672 nvlist_t *config = NULL; 5673 int load_error, rewind_error; 5674 uint64_t safe_rewind_txg; 5675 uint64_t min_txg; 5676 5677 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5678 spa->spa_load_max_txg = spa->spa_load_txg; 5679 spa_set_log_state(spa, SPA_LOG_CLEAR); 5680 } else { 5681 spa->spa_load_max_txg = max_request; 5682 if (max_request != UINT64_MAX) 5683 spa->spa_extreme_rewind = B_TRUE; 5684 } 5685 5686 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5687 if (load_error == 0) 5688 return (0); 5689 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5690 /* 5691 * When attempting checkpoint-rewind on a pool with no 5692 * checkpoint, we should not attempt to load uberblocks 5693 * from previous txgs when spa_load fails. 5694 */ 5695 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5696 spa_import_progress_remove(spa_guid(spa)); 5697 return (load_error); 5698 } 5699 5700 if (spa->spa_root_vdev != NULL) 5701 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5702 5703 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5704 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5705 5706 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5707 nvlist_free(config); 5708 spa_import_progress_remove(spa_guid(spa)); 5709 return (load_error); 5710 } 5711 5712 if (state == SPA_LOAD_RECOVER) { 5713 /* Price of rolling back is discarding txgs, including log */ 5714 spa_set_log_state(spa, SPA_LOG_CLEAR); 5715 } else { 5716 /* 5717 * If we aren't rolling back save the load info from our first 5718 * import attempt so that we can restore it after attempting 5719 * to rewind. 5720 */ 5721 loadinfo = spa->spa_load_info; 5722 spa->spa_load_info = fnvlist_alloc(); 5723 } 5724 5725 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5726 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5727 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 5728 TXG_INITIAL : safe_rewind_txg; 5729 5730 /* 5731 * Continue as long as we're finding errors, we're still within 5732 * the acceptable rewind range, and we're still finding uberblocks 5733 */ 5734 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5735 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5736 if (spa->spa_load_max_txg < safe_rewind_txg) 5737 spa->spa_extreme_rewind = B_TRUE; 5738 rewind_error = spa_load_retry(spa, state); 5739 } 5740 5741 spa->spa_extreme_rewind = B_FALSE; 5742 spa->spa_load_max_txg = UINT64_MAX; 5743 5744 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5745 spa_config_set(spa, config); 5746 else 5747 nvlist_free(config); 5748 5749 if (state == SPA_LOAD_RECOVER) { 5750 ASSERT3P(loadinfo, ==, NULL); 5751 spa_import_progress_remove(spa_guid(spa)); 5752 return (rewind_error); 5753 } else { 5754 /* Store the rewind info as part of the initial load info */ 5755 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5756 spa->spa_load_info); 5757 5758 /* Restore the initial load info */ 5759 fnvlist_free(spa->spa_load_info); 5760 spa->spa_load_info = loadinfo; 5761 5762 spa_import_progress_remove(spa_guid(spa)); 5763 return (load_error); 5764 } 5765 } 5766 5767 /* 5768 * Pool Open/Import 5769 * 5770 * The import case is identical to an open except that the configuration is sent 5771 * down from userland, instead of grabbed from the configuration cache. 
For the 5772 * case of an open, the pool configuration will exist in the 5773 * POOL_STATE_UNINITIALIZED state. 5774 * 5775 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5776 * the same time open the pool, without having to keep around the spa_t in some 5777 * ambiguous state. 5778 */ 5779 static int 5780 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5781 nvlist_t *nvpolicy, nvlist_t **config) 5782 { 5783 spa_t *spa; 5784 spa_load_state_t state = SPA_LOAD_OPEN; 5785 int error; 5786 int locked = B_FALSE; 5787 int firstopen = B_FALSE; 5788 5789 *spapp = NULL; 5790 5791 /* 5792 * As disgusting as this is, we need to support recursive calls to this 5793 * function because dsl_dir_open() is called during spa_load(), and ends 5794 * up calling spa_open() again. The real fix is to figure out how to 5795 * avoid dsl_dir_open() calling this in the first place. 5796 */ 5797 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5798 mutex_enter(&spa_namespace_lock); 5799 locked = B_TRUE; 5800 } 5801 5802 if ((spa = spa_lookup(pool)) == NULL) { 5803 if (locked) 5804 mutex_exit(&spa_namespace_lock); 5805 return (SET_ERROR(ENOENT)); 5806 } 5807 5808 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5809 zpool_load_policy_t policy; 5810 5811 firstopen = B_TRUE; 5812 5813 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5814 &policy); 5815 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5816 state = SPA_LOAD_RECOVER; 5817 5818 spa_activate(spa, spa_mode_global); 5819 5820 if (state != SPA_LOAD_RECOVER) 5821 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5822 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5823 5824 zfs_dbgmsg("spa_open_common: opening %s", pool); 5825 error = spa_load_best(spa, state, policy.zlp_txg, 5826 policy.zlp_rewind); 5827 5828 if (error == EBADF) { 5829 /* 5830 * If vdev_validate() returns failure (indicated by 5831 * EBADF), it indicates that one of the vdevs indicates 5832 * that the pool has been exported or destroyed. If 5833 * this is the case, the config cache is out of sync and 5834 * we should remove the pool from the namespace. 5835 */ 5836 spa_unload(spa); 5837 spa_deactivate(spa); 5838 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5839 spa_remove(spa); 5840 if (locked) 5841 mutex_exit(&spa_namespace_lock); 5842 return (SET_ERROR(ENOENT)); 5843 } 5844 5845 if (error) { 5846 /* 5847 * We can't open the pool, but we still have useful 5848 * information: the state of each vdev after the 5849 * attempted vdev_open(). Return this to the user. 5850 */ 5851 if (config != NULL && spa->spa_config) { 5852 *config = fnvlist_dup(spa->spa_config); 5853 fnvlist_add_nvlist(*config, 5854 ZPOOL_CONFIG_LOAD_INFO, 5855 spa->spa_load_info); 5856 } 5857 spa_unload(spa); 5858 spa_deactivate(spa); 5859 spa->spa_last_open_failed = error; 5860 if (locked) 5861 mutex_exit(&spa_namespace_lock); 5862 *spapp = NULL; 5863 return (error); 5864 } 5865 } 5866 5867 spa_open_ref(spa, tag); 5868 5869 if (config != NULL) 5870 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5871 5872 /* 5873 * If we've recovered the pool, pass back any information we 5874 * gathered while doing the load. 
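 *
 * (spa->spa_load_info is the nvlist that was filled in during
 * spa_load_best(); returning it under ZPOOL_CONFIG_LOAD_INFO lets
 * userland report, for example, what state the pool was rewound to.)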
5875 */ 5876 if (state == SPA_LOAD_RECOVER && config != NULL) { 5877 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5878 spa->spa_load_info); 5879 } 5880 5881 if (locked) { 5882 spa->spa_last_open_failed = 0; 5883 spa->spa_last_ubsync_txg = 0; 5884 spa->spa_load_txg = 0; 5885 mutex_exit(&spa_namespace_lock); 5886 } 5887 5888 if (firstopen) 5889 zvol_create_minors_recursive(spa_name(spa)); 5890 5891 *spapp = spa; 5892 5893 return (0); 5894 } 5895 5896 int 5897 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5898 nvlist_t *policy, nvlist_t **config) 5899 { 5900 return (spa_open_common(name, spapp, tag, policy, config)); 5901 } 5902 5903 int 5904 spa_open(const char *name, spa_t **spapp, const void *tag) 5905 { 5906 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5907 } 5908 5909 /* 5910 * Lookup the given spa_t, incrementing the inject count in the process, 5911 * preventing it from being exported or destroyed. 5912 */ 5913 spa_t * 5914 spa_inject_addref(char *name) 5915 { 5916 spa_t *spa; 5917 5918 mutex_enter(&spa_namespace_lock); 5919 if ((spa = spa_lookup(name)) == NULL) { 5920 mutex_exit(&spa_namespace_lock); 5921 return (NULL); 5922 } 5923 spa->spa_inject_ref++; 5924 mutex_exit(&spa_namespace_lock); 5925 5926 return (spa); 5927 } 5928 5929 void 5930 spa_inject_delref(spa_t *spa) 5931 { 5932 mutex_enter(&spa_namespace_lock); 5933 spa->spa_inject_ref--; 5934 mutex_exit(&spa_namespace_lock); 5935 } 5936 5937 /* 5938 * Add spares device information to the nvlist. 5939 */ 5940 static void 5941 spa_add_spares(spa_t *spa, nvlist_t *config) 5942 { 5943 nvlist_t **spares; 5944 uint_t i, nspares; 5945 nvlist_t *nvroot; 5946 uint64_t guid; 5947 vdev_stat_t *vs; 5948 uint_t vsc; 5949 uint64_t pool; 5950 5951 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5952 5953 if (spa->spa_spares.sav_count == 0) 5954 return; 5955 5956 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5957 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5958 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5959 if (nspares != 0) { 5960 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5961 (const nvlist_t * const *)spares, nspares); 5962 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5963 &spares, &nspares)); 5964 5965 /* 5966 * Go through and find any spares which have since been 5967 * repurposed as an active spare. If this is the case, update 5968 * their status appropriately. 5969 */ 5970 for (i = 0; i < nspares; i++) { 5971 guid = fnvlist_lookup_uint64(spares[i], 5972 ZPOOL_CONFIG_GUID); 5973 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5974 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5975 if (spa_spare_exists(guid, &pool, NULL) && 5976 pool != 0ULL) { 5977 vs->vs_state = VDEV_STATE_CANT_OPEN; 5978 vs->vs_aux = VDEV_AUX_SPARED; 5979 } else { 5980 vs->vs_state = 5981 spa->spa_spares.sav_vdevs[i]->vdev_state; 5982 } 5983 } 5984 } 5985 } 5986 5987 /* 5988 * Add l2cache device information to the nvlist, including vdev stats. 
5989 */ 5990 static void 5991 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5992 { 5993 nvlist_t **l2cache; 5994 uint_t i, j, nl2cache; 5995 nvlist_t *nvroot; 5996 uint64_t guid; 5997 vdev_t *vd; 5998 vdev_stat_t *vs; 5999 uint_t vsc; 6000 6001 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6002 6003 if (spa->spa_l2cache.sav_count == 0) 6004 return; 6005 6006 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6007 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 6008 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 6009 if (nl2cache != 0) { 6010 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6011 (const nvlist_t * const *)l2cache, nl2cache); 6012 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6013 &l2cache, &nl2cache)); 6014 6015 /* 6016 * Update level 2 cache device stats. 6017 */ 6018 6019 for (i = 0; i < nl2cache; i++) { 6020 guid = fnvlist_lookup_uint64(l2cache[i], 6021 ZPOOL_CONFIG_GUID); 6022 6023 vd = NULL; 6024 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 6025 if (guid == 6026 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 6027 vd = spa->spa_l2cache.sav_vdevs[j]; 6028 break; 6029 } 6030 } 6031 ASSERT(vd != NULL); 6032 6033 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 6034 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 6035 vdev_get_stats(vd, vs); 6036 vdev_config_generate_stats(vd, l2cache[i]); 6037 6038 } 6039 } 6040 } 6041 6042 static void 6043 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 6044 { 6045 zap_cursor_t zc; 6046 zap_attribute_t za; 6047 6048 if (spa->spa_feat_for_read_obj != 0) { 6049 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6050 spa->spa_feat_for_read_obj); 6051 zap_cursor_retrieve(&zc, &za) == 0; 6052 zap_cursor_advance(&zc)) { 6053 ASSERT(za.za_integer_length == sizeof (uint64_t) && 6054 za.za_num_integers == 1); 6055 VERIFY0(nvlist_add_uint64(features, za.za_name, 6056 za.za_first_integer)); 6057 } 6058 zap_cursor_fini(&zc); 6059 } 6060 6061 if (spa->spa_feat_for_write_obj != 0) { 6062 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6063 spa->spa_feat_for_write_obj); 6064 zap_cursor_retrieve(&zc, &za) == 0; 6065 zap_cursor_advance(&zc)) { 6066 ASSERT(za.za_integer_length == sizeof (uint64_t) && 6067 za.za_num_integers == 1); 6068 VERIFY0(nvlist_add_uint64(features, za.za_name, 6069 za.za_first_integer)); 6070 } 6071 zap_cursor_fini(&zc); 6072 } 6073 } 6074 6075 static void 6076 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 6077 { 6078 int i; 6079 6080 for (i = 0; i < SPA_FEATURES; i++) { 6081 zfeature_info_t feature = spa_feature_table[i]; 6082 uint64_t refcount; 6083 6084 if (feature_get_refcount(spa, &feature, &refcount) != 0) 6085 continue; 6086 6087 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 6088 } 6089 } 6090 6091 /* 6092 * Store a list of pool features and their reference counts in the 6093 * config. 6094 * 6095 * The first time this is called on a spa, allocate a new nvlist, fetch 6096 * the pool features and reference counts from disk, then save the list 6097 * in the spa. In subsequent calls on the same spa use the saved nvlist 6098 * and refresh its values from the cached reference counts. This 6099 * ensures we don't block here on I/O on a suspended pool so 'zpool 6100 * clear' can resume the pool. 
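 *
 * The result is stored in the config under ZPOOL_CONFIG_FEATURE_STATS
 * as a flat mapping of feature guid to reference count, along the
 * lines of (illustrative values only):
 *
 *   com.delphix:hole_birth -> 1
 *   org.open-zfs:large_blocks -> 0
 *
 * where, roughly speaking, a zero refcount corresponds to a feature
 * that is enabled but not currently active.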
6101 */ 6102 static void 6103 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 6104 { 6105 nvlist_t *features; 6106 6107 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6108 6109 mutex_enter(&spa->spa_feat_stats_lock); 6110 features = spa->spa_feat_stats; 6111 6112 if (features != NULL) { 6113 spa_feature_stats_from_cache(spa, features); 6114 } else { 6115 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 6116 spa->spa_feat_stats = features; 6117 spa_feature_stats_from_disk(spa, features); 6118 } 6119 6120 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 6121 features)); 6122 6123 mutex_exit(&spa->spa_feat_stats_lock); 6124 } 6125 6126 int 6127 spa_get_stats(const char *name, nvlist_t **config, 6128 char *altroot, size_t buflen) 6129 { 6130 int error; 6131 spa_t *spa; 6132 6133 *config = NULL; 6134 error = spa_open_common(name, &spa, FTAG, NULL, config); 6135 6136 if (spa != NULL) { 6137 /* 6138 * This still leaves a window of inconsistency where the spares 6139 * or l2cache devices could change and the config would be 6140 * self-inconsistent. 6141 */ 6142 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6143 6144 if (*config != NULL) { 6145 uint64_t loadtimes[2]; 6146 6147 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 6148 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 6149 fnvlist_add_uint64_array(*config, 6150 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 6151 6152 fnvlist_add_uint64(*config, 6153 ZPOOL_CONFIG_ERRCOUNT, 6154 spa_approx_errlog_size(spa)); 6155 6156 if (spa_suspended(spa)) { 6157 fnvlist_add_uint64(*config, 6158 ZPOOL_CONFIG_SUSPENDED, 6159 spa->spa_failmode); 6160 fnvlist_add_uint64(*config, 6161 ZPOOL_CONFIG_SUSPENDED_REASON, 6162 spa->spa_suspended); 6163 } 6164 6165 spa_add_spares(spa, *config); 6166 spa_add_l2cache(spa, *config); 6167 spa_add_feature_stats(spa, *config); 6168 } 6169 } 6170 6171 /* 6172 * We want to get the alternate root even for faulted pools, so we cheat 6173 * and call spa_lookup() directly. 6174 */ 6175 if (altroot) { 6176 if (spa == NULL) { 6177 mutex_enter(&spa_namespace_lock); 6178 spa = spa_lookup(name); 6179 if (spa) 6180 spa_altroot(spa, altroot, buflen); 6181 else 6182 altroot[0] = '\0'; 6183 spa = NULL; 6184 mutex_exit(&spa_namespace_lock); 6185 } else { 6186 spa_altroot(spa, altroot, buflen); 6187 } 6188 } 6189 6190 if (spa != NULL) { 6191 spa_config_exit(spa, SCL_CONFIG, FTAG); 6192 spa_close(spa, FTAG); 6193 } 6194 6195 return (error); 6196 } 6197 6198 /* 6199 * Validate that the auxiliary device array is well formed. We must have an 6200 * array of nvlists, each which describes a valid leaf vdev. If this is an 6201 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 6202 * specified, as long as they are well-formed. 6203 */ 6204 static int 6205 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 6206 spa_aux_vdev_t *sav, const char *config, uint64_t version, 6207 vdev_labeltype_t label) 6208 { 6209 nvlist_t **dev; 6210 uint_t i, ndev; 6211 vdev_t *vd; 6212 int error; 6213 6214 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6215 6216 /* 6217 * It's acceptable to have no devs specified. 6218 */ 6219 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 6220 return (0); 6221 6222 if (ndev == 0) 6223 return (SET_ERROR(EINVAL)); 6224 6225 /* 6226 * Make sure the pool is formatted with a version that supports this 6227 * device type. 
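 * (spa_validate_aux() below passes SPA_VERSION_SPARES for the spare
 * array and SPA_VERSION_L2CACHE for the cache device array.)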
6228 */ 6229 if (spa_version(spa) < version) 6230 return (SET_ERROR(ENOTSUP)); 6231 6232 /* 6233 * Set the pending device list so we correctly handle device in-use 6234 * checking. 6235 */ 6236 sav->sav_pending = dev; 6237 sav->sav_npending = ndev; 6238 6239 for (i = 0; i < ndev; i++) { 6240 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 6241 mode)) != 0) 6242 goto out; 6243 6244 if (!vd->vdev_ops->vdev_op_leaf) { 6245 vdev_free(vd); 6246 error = SET_ERROR(EINVAL); 6247 goto out; 6248 } 6249 6250 vd->vdev_top = vd; 6251 6252 if ((error = vdev_open(vd)) == 0 && 6253 (error = vdev_label_init(vd, crtxg, label)) == 0) { 6254 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 6255 vd->vdev_guid); 6256 } 6257 6258 vdev_free(vd); 6259 6260 if (error && 6261 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 6262 goto out; 6263 else 6264 error = 0; 6265 } 6266 6267 out: 6268 sav->sav_pending = NULL; 6269 sav->sav_npending = 0; 6270 return (error); 6271 } 6272 6273 static int 6274 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 6275 { 6276 int error; 6277 6278 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6279 6280 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6281 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 6282 VDEV_LABEL_SPARE)) != 0) { 6283 return (error); 6284 } 6285 6286 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6287 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 6288 VDEV_LABEL_L2CACHE)); 6289 } 6290 6291 static void 6292 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 6293 const char *config) 6294 { 6295 int i; 6296 6297 if (sav->sav_config != NULL) { 6298 nvlist_t **olddevs; 6299 uint_t oldndevs; 6300 nvlist_t **newdevs; 6301 6302 /* 6303 * Generate new dev list by concatenating with the 6304 * current dev list. 6305 */ 6306 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 6307 &olddevs, &oldndevs)); 6308 6309 newdevs = kmem_alloc(sizeof (void *) * 6310 (ndevs + oldndevs), KM_SLEEP); 6311 for (i = 0; i < oldndevs; i++) 6312 newdevs[i] = fnvlist_dup(olddevs[i]); 6313 for (i = 0; i < ndevs; i++) 6314 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 6315 6316 fnvlist_remove(sav->sav_config, config); 6317 6318 fnvlist_add_nvlist_array(sav->sav_config, config, 6319 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 6320 for (i = 0; i < oldndevs + ndevs; i++) 6321 nvlist_free(newdevs[i]); 6322 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 6323 } else { 6324 /* 6325 * Generate a new dev list. 6326 */ 6327 sav->sav_config = fnvlist_alloc(); 6328 fnvlist_add_nvlist_array(sav->sav_config, config, 6329 (const nvlist_t * const *)devs, ndevs); 6330 } 6331 } 6332 6333 /* 6334 * Stop and drop level 2 ARC devices 6335 */ 6336 void 6337 spa_l2cache_drop(spa_t *spa) 6338 { 6339 vdev_t *vd; 6340 int i; 6341 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6342 6343 for (i = 0; i < sav->sav_count; i++) { 6344 uint64_t pool; 6345 6346 vd = sav->sav_vdevs[i]; 6347 ASSERT(vd != NULL); 6348 6349 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 6350 pool != 0ULL && l2arc_vdev_present(vd)) 6351 l2arc_remove_vdev(vd); 6352 } 6353 } 6354 6355 /* 6356 * Verify encryption parameters for spa creation. If we are encrypting, we must 6357 * have the encryption feature flag enabled. 
6358 */ 6359 static int 6360 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 6361 boolean_t has_encryption) 6362 { 6363 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 6364 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 6365 !has_encryption) 6366 return (SET_ERROR(ENOTSUP)); 6367 6368 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 6369 } 6370 6371 /* 6372 * Pool Creation 6373 */ 6374 int 6375 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 6376 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 6377 { 6378 spa_t *spa; 6379 const char *altroot = NULL; 6380 vdev_t *rvd; 6381 dsl_pool_t *dp; 6382 dmu_tx_t *tx; 6383 int error = 0; 6384 uint64_t txg = TXG_INITIAL; 6385 nvlist_t **spares, **l2cache; 6386 uint_t nspares, nl2cache; 6387 uint64_t version, obj, ndraid = 0; 6388 boolean_t has_features; 6389 boolean_t has_encryption; 6390 boolean_t has_allocclass; 6391 spa_feature_t feat; 6392 const char *feat_name; 6393 const char *poolname; 6394 nvlist_t *nvl; 6395 6396 if (props == NULL || 6397 nvlist_lookup_string(props, 6398 zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) 6399 poolname = (char *)pool; 6400 6401 /* 6402 * If this pool already exists, return failure. 6403 */ 6404 mutex_enter(&spa_namespace_lock); 6405 if (spa_lookup(poolname) != NULL) { 6406 mutex_exit(&spa_namespace_lock); 6407 return (SET_ERROR(EEXIST)); 6408 } 6409 6410 /* 6411 * Allocate a new spa_t structure. 6412 */ 6413 nvl = fnvlist_alloc(); 6414 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 6415 (void) nvlist_lookup_string(props, 6416 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6417 spa = spa_add(poolname, nvl, altroot); 6418 fnvlist_free(nvl); 6419 spa_activate(spa, spa_mode_global); 6420 6421 if (props && (error = spa_prop_validate(spa, props))) { 6422 spa_deactivate(spa); 6423 spa_remove(spa); 6424 mutex_exit(&spa_namespace_lock); 6425 return (error); 6426 } 6427 6428 /* 6429 * Temporary pool names should never be written to disk. 
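 * (A temporary name arrives via the ZPOOL_PROP_TNAME property, e.g. a
 * "zpool create -t <tmpname> ..." invocation; in that case 'poolname'
 * and 'pool' differ and ZFS_IMPORT_TEMP_NAME is set below so the
 * temporary name is never synced out.)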
6430 */ 6431 if (poolname != pool) 6432 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 6433 6434 has_features = B_FALSE; 6435 has_encryption = B_FALSE; 6436 has_allocclass = B_FALSE; 6437 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 6438 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 6439 if (zpool_prop_feature(nvpair_name(elem))) { 6440 has_features = B_TRUE; 6441 6442 feat_name = strchr(nvpair_name(elem), '@') + 1; 6443 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 6444 if (feat == SPA_FEATURE_ENCRYPTION) 6445 has_encryption = B_TRUE; 6446 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 6447 has_allocclass = B_TRUE; 6448 } 6449 } 6450 6451 /* verify encryption params, if they were provided */ 6452 if (dcp != NULL) { 6453 error = spa_create_check_encryption_params(dcp, has_encryption); 6454 if (error != 0) { 6455 spa_deactivate(spa); 6456 spa_remove(spa); 6457 mutex_exit(&spa_namespace_lock); 6458 return (error); 6459 } 6460 } 6461 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 6462 spa_deactivate(spa); 6463 spa_remove(spa); 6464 mutex_exit(&spa_namespace_lock); 6465 return (ENOTSUP); 6466 } 6467 6468 if (has_features || nvlist_lookup_uint64(props, 6469 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 6470 version = SPA_VERSION; 6471 } 6472 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6473 6474 spa->spa_first_txg = txg; 6475 spa->spa_uberblock.ub_txg = txg - 1; 6476 spa->spa_uberblock.ub_version = version; 6477 spa->spa_ubsync = spa->spa_uberblock; 6478 spa->spa_load_state = SPA_LOAD_CREATE; 6479 spa->spa_removing_phys.sr_state = DSS_NONE; 6480 spa->spa_removing_phys.sr_removing_vdev = -1; 6481 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 6482 spa->spa_indirect_vdevs_loaded = B_TRUE; 6483 6484 /* 6485 * Create "The Godfather" zio to hold all async IOs 6486 */ 6487 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 6488 KM_SLEEP); 6489 for (int i = 0; i < max_ncpus; i++) { 6490 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 6491 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6492 ZIO_FLAG_GODFATHER); 6493 } 6494 6495 /* 6496 * Create the root vdev. 6497 */ 6498 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6499 6500 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 6501 6502 ASSERT(error != 0 || rvd != NULL); 6503 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 6504 6505 if (error == 0 && !zfs_allocatable_devs(nvroot)) 6506 error = SET_ERROR(EINVAL); 6507 6508 if (error == 0 && 6509 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 6510 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 6511 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 6512 /* 6513 * instantiate the metaslab groups (this will dirty the vdevs) 6514 * we can no longer error exit past this point 6515 */ 6516 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 6517 vdev_t *vd = rvd->vdev_child[c]; 6518 6519 vdev_metaslab_set_size(vd); 6520 vdev_expand(vd, txg); 6521 } 6522 } 6523 6524 spa_config_exit(spa, SCL_ALL, FTAG); 6525 6526 if (error != 0) { 6527 spa_unload(spa); 6528 spa_deactivate(spa); 6529 spa_remove(spa); 6530 mutex_exit(&spa_namespace_lock); 6531 return (error); 6532 } 6533 6534 /* 6535 * Get the list of spares, if specified. 
6536 */ 6537 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6538 &spares, &nspares) == 0) { 6539 spa->spa_spares.sav_config = fnvlist_alloc(); 6540 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6541 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6542 nspares); 6543 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6544 spa_load_spares(spa); 6545 spa_config_exit(spa, SCL_ALL, FTAG); 6546 spa->spa_spares.sav_sync = B_TRUE; 6547 } 6548 6549 /* 6550 * Get the list of level 2 cache devices, if specified. 6551 */ 6552 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6553 &l2cache, &nl2cache) == 0) { 6554 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6555 NV_UNIQUE_NAME, KM_SLEEP)); 6556 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6557 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6558 nl2cache); 6559 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6560 spa_load_l2cache(spa); 6561 spa_config_exit(spa, SCL_ALL, FTAG); 6562 spa->spa_l2cache.sav_sync = B_TRUE; 6563 } 6564 6565 spa->spa_is_initializing = B_TRUE; 6566 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6567 spa->spa_is_initializing = B_FALSE; 6568 6569 /* 6570 * Create DDTs (dedup tables). 6571 */ 6572 ddt_create(spa); 6573 /* 6574 * Create BRT table and BRT table object. 6575 */ 6576 brt_create(spa); 6577 6578 spa_update_dspace(spa); 6579 6580 tx = dmu_tx_create_assigned(dp, txg); 6581 6582 /* 6583 * Create the pool's history object. 6584 */ 6585 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6586 spa_history_create_obj(spa, tx); 6587 6588 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6589 spa_history_log_version(spa, "create", tx); 6590 6591 /* 6592 * Create the pool config object. 6593 */ 6594 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6595 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6596 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6597 6598 if (zap_add(spa->spa_meta_objset, 6599 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6600 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6601 cmn_err(CE_PANIC, "failed to add pool config"); 6602 } 6603 6604 if (zap_add(spa->spa_meta_objset, 6605 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6606 sizeof (uint64_t), 1, &version, tx) != 0) { 6607 cmn_err(CE_PANIC, "failed to add pool version"); 6608 } 6609 6610 /* Newly created pools with the right version are always deflated. */ 6611 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6612 spa->spa_deflate = TRUE; 6613 if (zap_add(spa->spa_meta_objset, 6614 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6615 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6616 cmn_err(CE_PANIC, "failed to add deflate"); 6617 } 6618 } 6619 6620 /* 6621 * Create the deferred-free bpobj. Turn off compression 6622 * because sync-to-convergence takes longer if the blocksize 6623 * keeps changing. 6624 */ 6625 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6626 dmu_object_set_compress(spa->spa_meta_objset, obj, 6627 ZIO_COMPRESS_OFF, tx); 6628 if (zap_add(spa->spa_meta_objset, 6629 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6630 sizeof (uint64_t), 1, &obj, tx) != 0) { 6631 cmn_err(CE_PANIC, "failed to add bpobj"); 6632 } 6633 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6634 spa->spa_meta_objset, obj)); 6635 6636 /* 6637 * Generate some random noise for salted checksums to operate on. 
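 * (The salt is consumed by the salted checksum algorithms, e.g. skein
 * and edonr, and is persisted so the same salt is used for the life of
 * the pool.)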
6638 */ 6639 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6640 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6641 6642 /* 6643 * Set pool properties. 6644 */ 6645 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6646 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6647 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6648 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6649 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6650 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6651 spa->spa_dedup_table_quota = 6652 zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA); 6653 6654 if (props != NULL) { 6655 spa_configfile_set(spa, props, B_FALSE); 6656 spa_sync_props(props, tx); 6657 } 6658 6659 for (int i = 0; i < ndraid; i++) 6660 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6661 6662 dmu_tx_commit(tx); 6663 6664 spa->spa_sync_on = B_TRUE; 6665 txg_sync_start(dp); 6666 mmp_thread_start(spa); 6667 txg_wait_synced(dp, txg); 6668 6669 spa_spawn_aux_threads(spa); 6670 6671 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6672 6673 /* 6674 * Don't count references from objsets that are already closed 6675 * and are making their way through the eviction process. 6676 */ 6677 spa_evicting_os_wait(spa); 6678 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6679 spa->spa_load_state = SPA_LOAD_NONE; 6680 6681 spa_import_os(spa); 6682 6683 mutex_exit(&spa_namespace_lock); 6684 6685 return (0); 6686 } 6687 6688 /* 6689 * Import a non-root pool into the system. 6690 */ 6691 int 6692 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6693 { 6694 spa_t *spa; 6695 const char *altroot = NULL; 6696 spa_load_state_t state = SPA_LOAD_IMPORT; 6697 zpool_load_policy_t policy; 6698 spa_mode_t mode = spa_mode_global; 6699 uint64_t readonly = B_FALSE; 6700 int error; 6701 nvlist_t *nvroot; 6702 nvlist_t **spares, **l2cache; 6703 uint_t nspares, nl2cache; 6704 6705 /* 6706 * If a pool with this name exists, return failure. 6707 */ 6708 mutex_enter(&spa_namespace_lock); 6709 if (spa_lookup(pool) != NULL) { 6710 mutex_exit(&spa_namespace_lock); 6711 return (SET_ERROR(EEXIST)); 6712 } 6713 6714 /* 6715 * Create and initialize the spa structure. 6716 */ 6717 (void) nvlist_lookup_string(props, 6718 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6719 (void) nvlist_lookup_uint64(props, 6720 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6721 if (readonly) 6722 mode = SPA_MODE_READ; 6723 spa = spa_add(pool, config, altroot); 6724 spa->spa_import_flags = flags; 6725 6726 /* 6727 * Verbatim import - Take a pool and insert it into the namespace 6728 * as if it had been loaded at boot. 6729 */ 6730 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6731 if (props != NULL) 6732 spa_configfile_set(spa, props, B_FALSE); 6733 6734 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6735 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6736 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6737 mutex_exit(&spa_namespace_lock); 6738 return (0); 6739 } 6740 6741 spa_activate(spa, mode); 6742 6743 /* 6744 * Don't start async tasks until we know everything is healthy. 
6745 */ 6746 spa_async_suspend(spa); 6747 6748 zpool_get_load_policy(config, &policy); 6749 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6750 state = SPA_LOAD_RECOVER; 6751 6752 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6753 6754 if (state != SPA_LOAD_RECOVER) { 6755 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6756 zfs_dbgmsg("spa_import: importing %s", pool); 6757 } else { 6758 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6759 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6760 } 6761 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6762 6763 /* 6764 * Propagate anything learned while loading the pool and pass it 6765 * back to caller (i.e. rewind info, missing devices, etc). 6766 */ 6767 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6768 6769 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6770 /* 6771 * Toss any existing sparelist, as it doesn't have any validity 6772 * anymore, and conflicts with spa_has_spare(). 6773 */ 6774 if (spa->spa_spares.sav_config) { 6775 nvlist_free(spa->spa_spares.sav_config); 6776 spa->spa_spares.sav_config = NULL; 6777 spa_load_spares(spa); 6778 } 6779 if (spa->spa_l2cache.sav_config) { 6780 nvlist_free(spa->spa_l2cache.sav_config); 6781 spa->spa_l2cache.sav_config = NULL; 6782 spa_load_l2cache(spa); 6783 } 6784 6785 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6786 spa_config_exit(spa, SCL_ALL, FTAG); 6787 6788 if (props != NULL) 6789 spa_configfile_set(spa, props, B_FALSE); 6790 6791 if (error != 0 || (props && spa_writeable(spa) && 6792 (error = spa_prop_set(spa, props)))) { 6793 spa_unload(spa); 6794 spa_deactivate(spa); 6795 spa_remove(spa); 6796 mutex_exit(&spa_namespace_lock); 6797 return (error); 6798 } 6799 6800 spa_async_resume(spa); 6801 6802 /* 6803 * Override any spares and level 2 cache devices as specified by 6804 * the user, as these may have correct device names/devids, etc. 6805 */ 6806 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6807 &spares, &nspares) == 0) { 6808 if (spa->spa_spares.sav_config) 6809 fnvlist_remove(spa->spa_spares.sav_config, 6810 ZPOOL_CONFIG_SPARES); 6811 else 6812 spa->spa_spares.sav_config = fnvlist_alloc(); 6813 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6814 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6815 nspares); 6816 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6817 spa_load_spares(spa); 6818 spa_config_exit(spa, SCL_ALL, FTAG); 6819 spa->spa_spares.sav_sync = B_TRUE; 6820 spa->spa_spares.sav_label_sync = B_TRUE; 6821 } 6822 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6823 &l2cache, &nl2cache) == 0) { 6824 if (spa->spa_l2cache.sav_config) 6825 fnvlist_remove(spa->spa_l2cache.sav_config, 6826 ZPOOL_CONFIG_L2CACHE); 6827 else 6828 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6829 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6830 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6831 nl2cache); 6832 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6833 spa_load_l2cache(spa); 6834 spa_config_exit(spa, SCL_ALL, FTAG); 6835 spa->spa_l2cache.sav_sync = B_TRUE; 6836 spa->spa_l2cache.sav_label_sync = B_TRUE; 6837 } 6838 6839 /* 6840 * Check for any removed devices. 6841 */ 6842 if (spa->spa_autoreplace) { 6843 spa_aux_check_removed(&spa->spa_spares); 6844 spa_aux_check_removed(&spa->spa_l2cache); 6845 } 6846 6847 if (spa_writeable(spa)) { 6848 /* 6849 * Update the config cache to include the newly-imported pool. 
6850 */
6851 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6852 }
6853
6854 /*
6855 * It's possible that the pool was expanded while it was exported.
6856 * We kick off an async task to handle this for us.
6857 */
6858 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
6859
6860 spa_history_log_version(spa, "import", NULL);
6861
6862 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
6863
6864 mutex_exit(&spa_namespace_lock);
6865
6866 zvol_create_minors_recursive(pool);
6867
6868 spa_import_os(spa);
6869
6870 return (0);
6871 }
6872
6873 nvlist_t *
6874 spa_tryimport(nvlist_t *tryconfig)
6875 {
6876 nvlist_t *config = NULL;
6877 const char *poolname, *cachefile;
6878 spa_t *spa;
6879 uint64_t state;
6880 int error;
6881 zpool_load_policy_t policy;
6882
6883 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
6884 return (NULL);
6885
6886 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
6887 return (NULL);
6888
6889 /*
6890 * Create and initialize the spa structure.
6891 */
6892 char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
6893 (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
6894 TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);
6895
6896 mutex_enter(&spa_namespace_lock);
6897 spa = spa_add(name, tryconfig, NULL);
6898 spa_activate(spa, SPA_MODE_READ);
6899 kmem_free(name, MAXPATHLEN);
6900
6901 /*
6902 * Rewind pool if a max txg was provided.
6903 */
6904 zpool_get_load_policy(spa->spa_config, &policy);
6905 if (policy.zlp_txg != UINT64_MAX) {
6906 spa->spa_load_max_txg = policy.zlp_txg;
6907 spa->spa_extreme_rewind = B_TRUE;
6908 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
6909 poolname, (longlong_t)policy.zlp_txg);
6910 } else {
6911 zfs_dbgmsg("spa_tryimport: importing %s", poolname);
6912 }
6913
6914 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
6915 == 0) {
6916 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
6917 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
6918 } else {
6919 spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
6920 }
6921
6922 /*
6923 * spa_import() relies on the pool config fetched by spa_tryimport()
6924 * to pick up any spare and cache devices. Without import flags,
6925 * spa_tryimport() would bail out early on a missing log device and
6926 * never retrieve the spare and cache configuration. Passing
6927 * ZFS_IMPORT_MISSING_LOG here makes the load fetch the complete
6928 * configuration even when a log device is missing.
6929 */
6930 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
6931
6932 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
6933
6934 /*
6935 * If 'tryconfig' was at least parsable, return the current config.
6936 */
6937 if (spa->spa_root_vdev != NULL) {
6938 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
6939 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
6940 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
6941 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
6942 spa->spa_uberblock.ub_timestamp);
6943 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
6944 spa->spa_load_info);
6945 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
6946 spa->spa_errata);
6947
6948 /*
6949 * If the bootfs property exists on this pool then we
6950 * copy it out so that external consumers can tell which
6951 * pools are bootable.
6952 */ 6953 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6954 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6955 6956 /* 6957 * We have to play games with the name since the 6958 * pool was opened as TRYIMPORT_NAME. 6959 */ 6960 if (dsl_dsobj_to_dsname(spa_name(spa), 6961 spa->spa_bootfs, tmpname) == 0) { 6962 char *cp; 6963 char *dsname; 6964 6965 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6966 6967 cp = strchr(tmpname, '/'); 6968 if (cp == NULL) { 6969 (void) strlcpy(dsname, tmpname, 6970 MAXPATHLEN); 6971 } else { 6972 (void) snprintf(dsname, MAXPATHLEN, 6973 "%s/%s", poolname, ++cp); 6974 } 6975 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6976 dsname); 6977 kmem_free(dsname, MAXPATHLEN); 6978 } 6979 kmem_free(tmpname, MAXPATHLEN); 6980 } 6981 6982 /* 6983 * Add the list of hot spares and level 2 cache devices. 6984 */ 6985 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6986 spa_add_spares(spa, config); 6987 spa_add_l2cache(spa, config); 6988 spa_config_exit(spa, SCL_CONFIG, FTAG); 6989 } 6990 6991 spa_unload(spa); 6992 spa_deactivate(spa); 6993 spa_remove(spa); 6994 mutex_exit(&spa_namespace_lock); 6995 6996 return (config); 6997 } 6998 6999 /* 7000 * Pool export/destroy 7001 * 7002 * The act of destroying or exporting a pool is very simple. We make sure there 7003 * is no more pending I/O and any references to the pool are gone. Then, we 7004 * update the pool state and sync all the labels to disk, removing the 7005 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 7006 * we don't sync the labels or remove the configuration cache. 7007 */ 7008 static int 7009 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 7010 boolean_t force, boolean_t hardforce) 7011 { 7012 int error = 0; 7013 spa_t *spa; 7014 hrtime_t export_start = gethrtime(); 7015 7016 if (oldconfig) 7017 *oldconfig = NULL; 7018 7019 if (!(spa_mode_global & SPA_MODE_WRITE)) 7020 return (SET_ERROR(EROFS)); 7021 7022 mutex_enter(&spa_namespace_lock); 7023 if ((spa = spa_lookup(pool)) == NULL) { 7024 mutex_exit(&spa_namespace_lock); 7025 return (SET_ERROR(ENOENT)); 7026 } 7027 7028 if (spa->spa_is_exporting) { 7029 /* the pool is being exported by another thread */ 7030 mutex_exit(&spa_namespace_lock); 7031 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 7032 } 7033 spa->spa_is_exporting = B_TRUE; 7034 7035 /* 7036 * Put a hold on the pool, drop the namespace lock, stop async tasks 7037 * and see if we can export. 7038 */ 7039 spa_open_ref(spa, FTAG); 7040 mutex_exit(&spa_namespace_lock); 7041 spa_async_suspend(spa); 7042 if (spa->spa_zvol_taskq) { 7043 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 7044 taskq_wait(spa->spa_zvol_taskq); 7045 } 7046 mutex_enter(&spa_namespace_lock); 7047 spa->spa_export_thread = curthread; 7048 spa_close(spa, FTAG); 7049 7050 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 7051 mutex_exit(&spa_namespace_lock); 7052 goto export_spa; 7053 } 7054 7055 /* 7056 * The pool will be in core if it's openable, in which case we can 7057 * modify its state. Objsets may be open only because they're dirty, 7058 * so we have to force it to sync before checking spa_refcnt. 7059 */ 7060 if (spa->spa_sync_on) { 7061 txg_wait_synced(spa->spa_dsl_pool, 0); 7062 spa_evicting_os_wait(spa); 7063 } 7064 7065 /* 7066 * A pool cannot be exported or destroyed if there are active 7067 * references. If we are resetting a pool, allow references by 7068 * fault injection handlers. 
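 * (Such references are taken via spa_inject_addref(), i.e. when zinject
 * has fault-injection handlers registered against this pool, and are
 * dropped again by spa_inject_delref().)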
7069 */
7070 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
7071 error = SET_ERROR(EBUSY);
7072 goto fail;
7073 }
7074
7075 mutex_exit(&spa_namespace_lock);
7076 /*
7077 * At this point we no longer hold the spa_namespace_lock and
7078 * there were no references on the spa. Future spa_lookup() calls
7079 * will notice spa->spa_export_thread and wait until we signal
7080 * that we are finished.
7081 */
7082
7083 if (spa->spa_sync_on) {
7084 vdev_t *rvd = spa->spa_root_vdev;
7085 /*
7086 * A pool cannot be exported if it has an active shared spare.
7087 * This is to prevent other pools stealing the active spare
7088 * from an exported pool. At the user's explicit request, such
7089 * a pool can still be forcibly exported.
7090 */
7091 if (!force && new_state == POOL_STATE_EXPORTED &&
7092 spa_has_active_shared_spare(spa)) {
7093 error = SET_ERROR(EXDEV);
7094 mutex_enter(&spa_namespace_lock);
7095 goto fail;
7096 }
7097
7098 /*
7099 * We're about to export or destroy this pool. Make sure
7100 * we stop all initialization and trim activity here before
7101 * we set the spa_final_txg. This will ensure that all
7102 * dirty data resulting from the initialization is
7103 * committed to disk before we unload the pool.
7104 */
7105 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
7106 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
7107 vdev_autotrim_stop_all(spa);
7108 vdev_rebuild_stop_all(spa);
7109
7110 /*
7111 * We want this to be reflected on every label,
7112 * so mark them all dirty. spa_unload() will do the
7113 * final sync that pushes these changes out.
7114 */
7115 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
7116 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7117 spa->spa_state = new_state;
7118 vdev_config_dirty(rvd);
7119 spa_config_exit(spa, SCL_ALL, FTAG);
7120 }
7121
7122 /*
7123 * If the log space map feature is enabled and the pool is
7124 * getting exported (but not destroyed), we want to spend some
7125 * time flushing as many metaslabs as we can in an attempt to
7126 * destroy log space maps and save import time. This has to be
7127 * done before we set the spa_final_txg, otherwise
7128 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
7129 * spa_should_flush_logs_on_unload() should be called after
7130 * spa_state has been set to the new_state.
7131 */ 7132 if (spa_should_flush_logs_on_unload(spa)) 7133 spa_unload_log_sm_flush_all(spa); 7134 7135 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7136 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7137 spa->spa_final_txg = spa_last_synced_txg(spa) + 7138 TXG_DEFER_SIZE + 1; 7139 spa_config_exit(spa, SCL_ALL, FTAG); 7140 } 7141 } 7142 7143 export_spa: 7144 spa_export_os(spa); 7145 7146 if (new_state == POOL_STATE_DESTROYED) 7147 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 7148 else if (new_state == POOL_STATE_EXPORTED) 7149 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 7150 7151 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7152 spa_unload(spa); 7153 spa_deactivate(spa); 7154 } 7155 7156 if (oldconfig && spa->spa_config) 7157 *oldconfig = fnvlist_dup(spa->spa_config); 7158 7159 if (new_state == POOL_STATE_EXPORTED) 7160 zio_handle_export_delay(spa, gethrtime() - export_start); 7161 7162 /* 7163 * Take the namespace lock for the actual spa_t removal 7164 */ 7165 mutex_enter(&spa_namespace_lock); 7166 if (new_state != POOL_STATE_UNINITIALIZED) { 7167 if (!hardforce) 7168 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 7169 spa_remove(spa); 7170 } else { 7171 /* 7172 * If spa_remove() is not called for this spa_t and 7173 * there is any possibility that it can be reused, 7174 * we make sure to reset the exporting flag. 7175 */ 7176 spa->spa_is_exporting = B_FALSE; 7177 spa->spa_export_thread = NULL; 7178 } 7179 7180 /* 7181 * Wake up any waiters in spa_lookup() 7182 */ 7183 cv_broadcast(&spa_namespace_cv); 7184 mutex_exit(&spa_namespace_lock); 7185 return (0); 7186 7187 fail: 7188 spa->spa_is_exporting = B_FALSE; 7189 spa->spa_export_thread = NULL; 7190 7191 spa_async_resume(spa); 7192 /* 7193 * Wake up any waiters in spa_lookup() 7194 */ 7195 cv_broadcast(&spa_namespace_cv); 7196 mutex_exit(&spa_namespace_lock); 7197 return (error); 7198 } 7199 7200 /* 7201 * Destroy a storage pool. 7202 */ 7203 int 7204 spa_destroy(const char *pool) 7205 { 7206 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 7207 B_FALSE, B_FALSE)); 7208 } 7209 7210 /* 7211 * Export a storage pool. 7212 */ 7213 int 7214 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 7215 boolean_t hardforce) 7216 { 7217 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 7218 force, hardforce)); 7219 } 7220 7221 /* 7222 * Similar to spa_export(), this unloads the spa_t without actually removing it 7223 * from the namespace in any way. 7224 */ 7225 int 7226 spa_reset(const char *pool) 7227 { 7228 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 7229 B_FALSE, B_FALSE)); 7230 } 7231 7232 /* 7233 * ========================================================================== 7234 * Device manipulation 7235 * ========================================================================== 7236 */ 7237 7238 /* 7239 * This is called as a synctask to increment the draid feature flag 7240 */ 7241 static void 7242 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 7243 { 7244 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7245 int draid = (int)(uintptr_t)arg; 7246 7247 for (int c = 0; c < draid; c++) 7248 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 7249 } 7250 7251 /* 7252 * Add a device to a storage pool. 
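 *
 * For example, "zpool add tank mirror sdc sdd" lands here with an nvroot
 * describing a single new mirror top-level vdev, while "zpool add tank
 * spare sde" arrives with no new children and only a ZPOOL_CONFIG_SPARES
 * array, which is handled through spa_set_aux_vdevs() below.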
7253 */ 7254 int 7255 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) 7256 { 7257 uint64_t txg, ndraid = 0; 7258 int error; 7259 vdev_t *rvd = spa->spa_root_vdev; 7260 vdev_t *vd, *tvd; 7261 nvlist_t **spares, **l2cache; 7262 uint_t nspares, nl2cache; 7263 7264 ASSERT(spa_writeable(spa)); 7265 7266 txg = spa_vdev_enter(spa); 7267 7268 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 7269 VDEV_ALLOC_ADD)) != 0) 7270 return (spa_vdev_exit(spa, NULL, txg, error)); 7271 7272 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 7273 7274 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 7275 &nspares) != 0) 7276 nspares = 0; 7277 7278 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 7279 &nl2cache) != 0) 7280 nl2cache = 0; 7281 7282 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 7283 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7284 7285 if (vd->vdev_children != 0 && 7286 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 7287 return (spa_vdev_exit(spa, vd, txg, error)); 7288 } 7289 7290 /* 7291 * The virtual dRAID spares must be added after vdev tree is created 7292 * and the vdev guids are generated. The guid of their associated 7293 * dRAID is stored in the config and used when opening the spare. 7294 */ 7295 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 7296 rvd->vdev_children)) == 0) { 7297 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 7298 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 7299 nspares = 0; 7300 } else { 7301 return (spa_vdev_exit(spa, vd, txg, error)); 7302 } 7303 7304 /* 7305 * We must validate the spares and l2cache devices after checking the 7306 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 7307 */ 7308 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 7309 return (spa_vdev_exit(spa, vd, txg, error)); 7310 7311 /* 7312 * If we are in the middle of a device removal, we can only add 7313 * devices which match the existing devices in the pool. 7314 * If we are in the middle of a removal, or have some indirect 7315 * vdevs, we can not add raidz or dRAID top levels. 
7316 */ 7317 if (spa->spa_vdev_removal != NULL || 7318 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7319 for (int c = 0; c < vd->vdev_children; c++) { 7320 tvd = vd->vdev_child[c]; 7321 if (spa->spa_vdev_removal != NULL && 7322 tvd->vdev_ashift != spa->spa_max_ashift) { 7323 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7324 } 7325 /* Fail if top level vdev is raidz or a dRAID */ 7326 if (vdev_get_nparity(tvd) != 0) 7327 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7328 7329 /* 7330 * Need the top level mirror to be 7331 * a mirror of leaf vdevs only 7332 */ 7333 if (tvd->vdev_ops == &vdev_mirror_ops) { 7334 for (uint64_t cid = 0; 7335 cid < tvd->vdev_children; cid++) { 7336 vdev_t *cvd = tvd->vdev_child[cid]; 7337 if (!cvd->vdev_ops->vdev_op_leaf) { 7338 return (spa_vdev_exit(spa, vd, 7339 txg, EINVAL)); 7340 } 7341 } 7342 } 7343 } 7344 } 7345 7346 if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { 7347 for (int c = 0; c < vd->vdev_children; c++) { 7348 tvd = vd->vdev_child[c]; 7349 if (tvd->vdev_ashift != spa->spa_max_ashift) { 7350 return (spa_vdev_exit(spa, vd, txg, 7351 ZFS_ERR_ASHIFT_MISMATCH)); 7352 } 7353 } 7354 } 7355 7356 for (int c = 0; c < vd->vdev_children; c++) { 7357 tvd = vd->vdev_child[c]; 7358 vdev_remove_child(vd, tvd); 7359 tvd->vdev_id = rvd->vdev_children; 7360 vdev_add_child(rvd, tvd); 7361 vdev_config_dirty(tvd); 7362 } 7363 7364 if (nspares != 0) { 7365 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 7366 ZPOOL_CONFIG_SPARES); 7367 spa_load_spares(spa); 7368 spa->spa_spares.sav_sync = B_TRUE; 7369 } 7370 7371 if (nl2cache != 0) { 7372 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 7373 ZPOOL_CONFIG_L2CACHE); 7374 spa_load_l2cache(spa); 7375 spa->spa_l2cache.sav_sync = B_TRUE; 7376 } 7377 7378 /* 7379 * We can't increment a feature while holding spa_vdev so we 7380 * have to do it in a synctask. 7381 */ 7382 if (ndraid != 0) { 7383 dmu_tx_t *tx; 7384 7385 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 7386 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 7387 (void *)(uintptr_t)ndraid, tx); 7388 dmu_tx_commit(tx); 7389 } 7390 7391 /* 7392 * We have to be careful when adding new vdevs to an existing pool. 7393 * If other threads start allocating from these vdevs before we 7394 * sync the config cache, and we lose power, then upon reboot we may 7395 * fail to open the pool because there are DVAs that the config cache 7396 * can't translate. Therefore, we first add the vdevs without 7397 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 7398 * and then let spa_config_update() initialize the new metaslabs. 7399 * 7400 * spa_load() checks for added-but-not-initialized vdevs, so that 7401 * if we lose power at any point in this sequence, the remaining 7402 * steps will be completed the next time we load the pool. 7403 */ 7404 (void) spa_vdev_exit(spa, vd, txg, 0); 7405 7406 mutex_enter(&spa_namespace_lock); 7407 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7408 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 7409 mutex_exit(&spa_namespace_lock); 7410 7411 return (0); 7412 } 7413 7414 /* 7415 * Attach a device to a vdev specified by its guid. The vdev type can be 7416 * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a 7417 * single device). When the vdev is a single device, a mirror vdev will be 7418 * automatically inserted. 
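 *
 * For example, "zpool attach tank sda sdb" attaches sdb alongside the
 * existing top-level disk sda, turning it into a two-way mirror, while
 * attaching a new disk to an existing raidz top-level vdev starts a
 * raidz expansion (handled by the raidz-specific paths below).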
7419 *
7420 * If 'replacing' is specified, the new device is intended to replace the
7421 * existing device; in this case the two devices are made into their own
7422 * mirror using the 'replacing' vdev, which is functionally identical to
7423 * the mirror vdev (it actually reuses all the same ops) but has a few
7424 * extra rules: you can't attach to it after it's been created, and upon
7425 * completion of resilvering, the first disk (the one being replaced)
7426 * is automatically detached.
7427 *
7428 * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
7429 * should be performed instead of traditional healing reconstruction. From
7430 * an administrator's perspective these are both resilver operations.
7431 */
7432 int
7433 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
7434 int rebuild)
7435 {
7436 uint64_t txg, dtl_max_txg;
7437 vdev_t *rvd = spa->spa_root_vdev;
7438 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
7439 vdev_ops_t *pvops;
7440 char *oldvdpath, *newvdpath;
7441 int newvd_isspare = B_FALSE;
7442 int error;
7443
7444 ASSERT(spa_writeable(spa));
7445
7446 txg = spa_vdev_enter(spa);
7447
7448 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
7449
7450 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7451 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7452 error = (spa_has_checkpoint(spa)) ?
7453 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7454 return (spa_vdev_exit(spa, NULL, txg, error));
7455 }
7456
7457 if (rebuild) {
7458 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
7459 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7460
7461 if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
7462 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
7463 return (spa_vdev_exit(spa, NULL, txg,
7464 ZFS_ERR_RESILVER_IN_PROGRESS));
7465 }
7466 } else {
7467 if (vdev_rebuild_active(rvd))
7468 return (spa_vdev_exit(spa, NULL, txg,
7469 ZFS_ERR_REBUILD_IN_PROGRESS));
7470 }
7471
7472 if (spa->spa_vdev_removal != NULL) {
7473 return (spa_vdev_exit(spa, NULL, txg,
7474 ZFS_ERR_DEVRM_IN_PROGRESS));
7475 }
7476
7477 if (oldvd == NULL)
7478 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
7479
7480 boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
7481
7482 if (raidz) {
7483 if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
7484 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7485
7486 /*
7487 * Can't expand a raidz while a prior expansion is in progress.
7488 */
7489 if (spa->spa_raidz_expand != NULL) {
7490 return (spa_vdev_exit(spa, NULL, txg,
7491 ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
7492 }
7493 } else if (!oldvd->vdev_ops->vdev_op_leaf) {
7494 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7495 }
7496
7497 if (raidz)
7498 pvd = oldvd;
7499 else
7500 pvd = oldvd->vdev_parent;
7501
7502 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
7503 VDEV_ALLOC_ATTACH) != 0)
7504 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7505
7506 if (newrootvd->vdev_children != 1)
7507 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
7508
7509 newvd = newrootvd->vdev_child[0];
7510
7511 if (!newvd->vdev_ops->vdev_op_leaf)
7512 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
7513
7514 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
7515 return (spa_vdev_exit(spa, newrootvd, txg, error));
7516
7517 /*
7518 * Log, dedup, and special vdevs should not be replaced by spares.
7519 */ 7520 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 7521 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 7522 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7523 } 7524 7525 /* 7526 * A dRAID spare can only replace a child of its parent dRAID vdev. 7527 */ 7528 if (newvd->vdev_ops == &vdev_draid_spare_ops && 7529 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 7530 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7531 } 7532 7533 if (rebuild) { 7534 /* 7535 * For rebuilds, the top vdev must support reconstruction 7536 * using only space maps. This means the only allowable 7537 * vdevs types are the root vdev, a mirror, or dRAID. 7538 */ 7539 tvd = pvd; 7540 if (pvd->vdev_top != NULL) 7541 tvd = pvd->vdev_top; 7542 7543 if (tvd->vdev_ops != &vdev_mirror_ops && 7544 tvd->vdev_ops != &vdev_root_ops && 7545 tvd->vdev_ops != &vdev_draid_ops) { 7546 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7547 } 7548 } 7549 7550 if (!replacing) { 7551 /* 7552 * For attach, the only allowable parent is a mirror or 7553 * the root vdev. A raidz vdev can be attached to, but 7554 * you cannot attach to a raidz child. 7555 */ 7556 if (pvd->vdev_ops != &vdev_mirror_ops && 7557 pvd->vdev_ops != &vdev_root_ops && 7558 !raidz) 7559 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7560 7561 pvops = &vdev_mirror_ops; 7562 } else { 7563 /* 7564 * Active hot spares can only be replaced by inactive hot 7565 * spares. 7566 */ 7567 if (pvd->vdev_ops == &vdev_spare_ops && 7568 oldvd->vdev_isspare && 7569 !spa_has_spare(spa, newvd->vdev_guid)) 7570 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7571 7572 /* 7573 * If the source is a hot spare, and the parent isn't already a 7574 * spare, then we want to create a new hot spare. Otherwise, we 7575 * want to create a replacing vdev. The user is not allowed to 7576 * attach to a spared vdev child unless the 'isspare' state is 7577 * the same (spare replaces spare, non-spare replaces 7578 * non-spare). 7579 */ 7580 if (pvd->vdev_ops == &vdev_replacing_ops && 7581 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 7582 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7583 } else if (pvd->vdev_ops == &vdev_spare_ops && 7584 newvd->vdev_isspare != oldvd->vdev_isspare) { 7585 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7586 } 7587 7588 if (newvd->vdev_isspare) 7589 pvops = &vdev_spare_ops; 7590 else 7591 pvops = &vdev_replacing_ops; 7592 } 7593 7594 /* 7595 * Make sure the new device is big enough. 7596 */ 7597 vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; 7598 if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) 7599 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 7600 7601 /* 7602 * The new device cannot have a higher alignment requirement 7603 * than the top-level vdev. 7604 */ 7605 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 7606 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7607 7608 /* 7609 * RAIDZ-expansion-specific checks. 
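 * (This path is reached by attaching a new child to a raidz top-level
 * vdev, e.g. "zpool attach tank raidz1-0 sdf", and requires the
 * SPA_FEATURE_RAIDZ_EXPANSION feature checked earlier.)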
7610 */ 7611 if (raidz) { 7612 if (vdev_raidz_attach_check(newvd) != 0) 7613 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7614 7615 /* 7616 * Fail early if a child is not healthy or being replaced 7617 */ 7618 for (int i = 0; i < oldvd->vdev_children; i++) { 7619 if (vdev_is_dead(oldvd->vdev_child[i]) || 7620 !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { 7621 return (spa_vdev_exit(spa, newrootvd, txg, 7622 ENXIO)); 7623 } 7624 /* Also fail if reserved boot area is in-use */ 7625 if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) 7626 != 0) { 7627 return (spa_vdev_exit(spa, newrootvd, txg, 7628 EADDRINUSE)); 7629 } 7630 } 7631 } 7632 7633 if (raidz) { 7634 /* 7635 * Note: oldvdpath is freed by spa_strfree(), but 7636 * kmem_asprintf() is freed by kmem_strfree(), so we have to 7637 * move it to a spa_strdup-ed string. 7638 */ 7639 char *tmp = kmem_asprintf("raidz%u-%u", 7640 (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); 7641 oldvdpath = spa_strdup(tmp); 7642 kmem_strfree(tmp); 7643 } else { 7644 oldvdpath = spa_strdup(oldvd->vdev_path); 7645 } 7646 newvdpath = spa_strdup(newvd->vdev_path); 7647 7648 /* 7649 * If this is an in-place replacement, update oldvd's path and devid 7650 * to make it distinguishable from newvd, and unopenable from now on. 7651 */ 7652 if (strcmp(oldvdpath, newvdpath) == 0) { 7653 spa_strfree(oldvd->vdev_path); 7654 oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, 7655 KM_SLEEP); 7656 (void) sprintf(oldvd->vdev_path, "%s/old", 7657 newvdpath); 7658 if (oldvd->vdev_devid != NULL) { 7659 spa_strfree(oldvd->vdev_devid); 7660 oldvd->vdev_devid = NULL; 7661 } 7662 spa_strfree(oldvdpath); 7663 oldvdpath = spa_strdup(oldvd->vdev_path); 7664 } 7665 7666 /* 7667 * If the parent is not a mirror, or if we're replacing, insert the new 7668 * mirror/replacing/spare vdev above oldvd. 7669 */ 7670 if (!raidz && pvd->vdev_ops != pvops) { 7671 pvd = vdev_add_parent(oldvd, pvops); 7672 ASSERT(pvd->vdev_ops == pvops); 7673 ASSERT(oldvd->vdev_parent == pvd); 7674 } 7675 7676 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7677 7678 /* 7679 * Extract the new device from its root and add it to pvd. 7680 */ 7681 vdev_remove_child(newrootvd, newvd); 7682 newvd->vdev_id = pvd->vdev_children; 7683 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7684 vdev_add_child(pvd, newvd); 7685 7686 /* 7687 * Reevaluate the parent vdev state. 7688 */ 7689 vdev_propagate_state(pvd); 7690 7691 tvd = newvd->vdev_top; 7692 ASSERT(pvd->vdev_top == tvd); 7693 ASSERT(tvd->vdev_parent == rvd); 7694 7695 vdev_config_dirty(tvd); 7696 7697 /* 7698 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7699 * for any dmu_sync-ed blocks. It will propagate upward when 7700 * spa_vdev_exit() calls vdev_dtl_reassess(). 7701 */ 7702 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7703 7704 if (raidz) { 7705 /* 7706 * Wait for the youngest allocations and frees to sync, 7707 * and then wait for the deferral of those frees to finish. 
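 *
 * Added worked example (assuming the usual txg.h definitions of
 * TXG_CONCURRENT_STATES == 3 and TXG_DEFER_SIZE == 2): if the attach
 * entered at txg 1000, the config-exit below waits until txg
 * 1000 + 3 + 2 = 1005 has synced, so every allocation or free that was
 * in flight when we entered, plus the deferral of those frees, is on
 * disk before the expansion state is dirtied.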
7708 */ 7709 spa_vdev_config_exit(spa, NULL, 7710 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 7711 7712 vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); 7713 vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); 7714 vdev_autotrim_stop_wait(tvd); 7715 7716 dtl_max_txg = spa_vdev_config_enter(spa); 7717 7718 tvd->vdev_rz_expanding = B_TRUE; 7719 7720 vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); 7721 vdev_config_dirty(tvd); 7722 7723 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 7724 dtl_max_txg); 7725 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, 7726 newvd, tx); 7727 dmu_tx_commit(tx); 7728 } else { 7729 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 7730 dtl_max_txg - TXG_INITIAL); 7731 7732 if (newvd->vdev_isspare) { 7733 spa_spare_activate(newvd); 7734 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7735 } 7736 7737 newvd_isspare = newvd->vdev_isspare; 7738 7739 /* 7740 * Mark newvd's DTL dirty in this txg. 7741 */ 7742 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7743 7744 /* 7745 * Schedule the resilver or rebuild to restart in the future. 7746 * We do this to ensure that dmu_sync-ed blocks have been 7747 * stitched into the respective datasets. 7748 */ 7749 if (rebuild) { 7750 newvd->vdev_rebuild_txg = txg; 7751 7752 vdev_rebuild(tvd); 7753 } else { 7754 newvd->vdev_resilver_txg = txg; 7755 7756 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 7757 spa_feature_is_enabled(spa, 7758 SPA_FEATURE_RESILVER_DEFER)) { 7759 vdev_defer_resilver(newvd); 7760 } else { 7761 dsl_scan_restart_resilver(spa->spa_dsl_pool, 7762 dtl_max_txg); 7763 } 7764 } 7765 } 7766 7767 if (spa->spa_bootfs) 7768 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 7769 7770 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 7771 7772 /* 7773 * Commit the config 7774 */ 7775 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 7776 7777 spa_history_log_internal(spa, "vdev attach", NULL, 7778 "%s vdev=%s %s vdev=%s", 7779 replacing && newvd_isspare ? "spare in" : 7780 replacing ? "replace" : "attach", newvdpath, 7781 replacing ? "for" : "to", oldvdpath); 7782 7783 spa_strfree(oldvdpath); 7784 spa_strfree(newvdpath); 7785 7786 return (0); 7787 } 7788 7789 /* 7790 * Detach a device from a mirror or replacing vdev. 7791 * 7792 * If 'replace_done' is specified, only detach if the parent 7793 * is a replacing or a spare vdev. 7794 */ 7795 int 7796 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7797 { 7798 uint64_t txg; 7799 int error; 7800 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7801 vdev_t *vd, *pvd, *cvd, *tvd; 7802 boolean_t unspare = B_FALSE; 7803 uint64_t unspare_guid = 0; 7804 char *vdpath; 7805 7806 ASSERT(spa_writeable(spa)); 7807 7808 txg = spa_vdev_detach_enter(spa, guid); 7809 7810 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7811 7812 /* 7813 * Besides being called directly from the userland through the 7814 * ioctl interface, spa_vdev_detach() can be potentially called 7815 * at the end of spa_vdev_resilver_done(). 7816 * 7817 * In the regular case, when we have a checkpoint this shouldn't 7818 * happen as we never empty the DTLs of a vdev during the scrub 7819 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 7820 * should never get here when we have a checkpoint. 7821 * 7822 * That said, even in a case when we checkpoint the pool exactly 7823 * as spa_vdev_resilver_done() calls this function everything 7824 * should be fine as the resilver will return right away. 
7825 */ 7826 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7827 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7828 error = (spa_has_checkpoint(spa)) ? 7829 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7830 return (spa_vdev_exit(spa, NULL, txg, error)); 7831 } 7832 7833 if (vd == NULL) 7834 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7835 7836 if (!vd->vdev_ops->vdev_op_leaf) 7837 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7838 7839 pvd = vd->vdev_parent; 7840 7841 /* 7842 * If the parent/child relationship is not as expected, don't do it. 7843 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7844 * vdev that's replacing B with C. The user's intent in replacing 7845 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7846 * the replace by detaching C, the expected behavior is to end up 7847 * M(A,B). But suppose that right after deciding to detach C, 7848 * the replacement of B completes. We would have M(A,C), and then 7849 * ask to detach C, which would leave us with just A -- not what 7850 * the user wanted. To prevent this, we make sure that the 7851 * parent/child relationship hasn't changed -- in this example, 7852 * that C's parent is still the replacing vdev R. 7853 */ 7854 if (pvd->vdev_guid != pguid && pguid != 0) 7855 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7856 7857 /* 7858 * Only 'replacing' or 'spare' vdevs can be replaced. 7859 */ 7860 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7861 pvd->vdev_ops != &vdev_spare_ops) 7862 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7863 7864 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7865 spa_version(spa) >= SPA_VERSION_SPARES); 7866 7867 /* 7868 * Only mirror, replacing, and spare vdevs support detach. 7869 */ 7870 if (pvd->vdev_ops != &vdev_replacing_ops && 7871 pvd->vdev_ops != &vdev_mirror_ops && 7872 pvd->vdev_ops != &vdev_spare_ops) 7873 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7874 7875 /* 7876 * If this device has the only valid copy of some data, 7877 * we cannot safely detach it. 7878 */ 7879 if (vdev_dtl_required(vd)) 7880 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7881 7882 ASSERT(pvd->vdev_children >= 2); 7883 7884 /* 7885 * If we are detaching the second disk from a replacing vdev, then 7886 * check to see if we changed the original vdev's path to have "/old" 7887 * at the end in spa_vdev_attach(). If so, undo that change now. 7888 */ 7889 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7890 vd->vdev_path != NULL) { 7891 size_t len = strlen(vd->vdev_path); 7892 7893 for (int c = 0; c < pvd->vdev_children; c++) { 7894 cvd = pvd->vdev_child[c]; 7895 7896 if (cvd == vd || cvd->vdev_path == NULL) 7897 continue; 7898 7899 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7900 strcmp(cvd->vdev_path + len, "/old") == 0) { 7901 spa_strfree(cvd->vdev_path); 7902 cvd->vdev_path = spa_strdup(vd->vdev_path); 7903 break; 7904 } 7905 } 7906 } 7907 7908 /* 7909 * If we are detaching the original disk from a normal spare, then it 7910 * implies that the spare should become a real disk, and be removed 7911 * from the active spare list for the pool. dRAID spares on the 7912 * other hand are coupled to the pool and thus should never be removed 7913 * from the spares list. 
7914 */ 7915 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7916 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7917 7918 if (last_cvd->vdev_isspare && 7919 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7920 unspare = B_TRUE; 7921 } 7922 } 7923 7924 /* 7925 * Erase the disk labels so the disk can be used for other things. 7926 * This must be done after all other error cases are handled, 7927 * but before we disembowel vd (so we can still do I/O to it). 7928 * But if we can't do it, don't treat the error as fatal -- 7929 * it may be that the unwritability of the disk is the reason 7930 * it's being detached! 7931 */ 7932 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7933 7934 /* 7935 * Remove vd from its parent and compact the parent's children. 7936 */ 7937 vdev_remove_child(pvd, vd); 7938 vdev_compact_children(pvd); 7939 7940 /* 7941 * Remember one of the remaining children so we can get tvd below. 7942 */ 7943 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7944 7945 /* 7946 * If we need to remove the remaining child from the list of hot spares, 7947 * do it now, marking the vdev as no longer a spare in the process. 7948 * We must do this before vdev_remove_parent(), because that can 7949 * change the GUID if it creates a new toplevel GUID. For a similar 7950 * reason, we must remove the spare now, in the same txg as the detach; 7951 * otherwise someone could attach a new sibling, change the GUID, and 7952 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7953 */ 7954 if (unspare) { 7955 ASSERT(cvd->vdev_isspare); 7956 spa_spare_remove(cvd); 7957 unspare_guid = cvd->vdev_guid; 7958 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7959 cvd->vdev_unspare = B_TRUE; 7960 } 7961 7962 /* 7963 * If the parent mirror/replacing vdev only has one child, 7964 * the parent is no longer needed. Remove it from the tree. 7965 */ 7966 if (pvd->vdev_children == 1) { 7967 if (pvd->vdev_ops == &vdev_spare_ops) 7968 cvd->vdev_unspare = B_FALSE; 7969 vdev_remove_parent(cvd); 7970 } 7971 7972 /* 7973 * We don't set tvd until now because the parent we just removed 7974 * may have been the previous top-level vdev. 7975 */ 7976 tvd = cvd->vdev_top; 7977 ASSERT(tvd->vdev_parent == rvd); 7978 7979 /* 7980 * Reevaluate the parent vdev state. 7981 */ 7982 vdev_propagate_state(cvd); 7983 7984 /* 7985 * If the 'autoexpand' property is set on the pool then automatically 7986 * try to expand the size of the pool. For example if the device we 7987 * just detached was smaller than the others, it may be possible to 7988 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7989 * first so that we can obtain the updated sizes of the leaf vdevs. 7990 */ 7991 if (spa->spa_autoexpand) { 7992 vdev_reopen(tvd); 7993 vdev_expand(tvd, txg); 7994 } 7995 7996 vdev_config_dirty(tvd); 7997 7998 /* 7999 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 8000 * vd->vdev_detached is set and free vd's DTL object in syncing context. 8001 * But first make sure we're not on any *other* txg's DTL list, to 8002 * prevent vd from being accessed after it's freed. 8003 */ 8004 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 8005 for (int t = 0; t < TXG_SIZE; t++) 8006 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 8007 vd->vdev_detached = B_TRUE; 8008 vdev_dirty(tvd, VDD_DTL, vd, txg); 8009 8010 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 8011 spa_notify_waiters(spa); 8012 8013 /* hang on to the spa before we release the lock */ 8014 spa_open_ref(spa, FTAG); 8015 8016 error = spa_vdev_exit(spa, vd, txg, 0); 8017 8018 spa_history_log_internal(spa, "detach", NULL, 8019 "vdev=%s", vdpath); 8020 spa_strfree(vdpath); 8021 8022 /* 8023 * If this was the removal of the original device in a hot spare vdev, 8024 * then we want to go through and remove the device from the hot spare 8025 * list of every other pool. 8026 */ 8027 if (unspare) { 8028 spa_t *altspa = NULL; 8029 8030 mutex_enter(&spa_namespace_lock); 8031 while ((altspa = spa_next(altspa)) != NULL) { 8032 if (altspa->spa_state != POOL_STATE_ACTIVE || 8033 altspa == spa) 8034 continue; 8035 8036 spa_open_ref(altspa, FTAG); 8037 mutex_exit(&spa_namespace_lock); 8038 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 8039 mutex_enter(&spa_namespace_lock); 8040 spa_close(altspa, FTAG); 8041 } 8042 mutex_exit(&spa_namespace_lock); 8043 8044 /* search the rest of the vdevs for spares to remove */ 8045 spa_vdev_resilver_done(spa); 8046 } 8047 8048 /* all done with the spa; OK to release */ 8049 mutex_enter(&spa_namespace_lock); 8050 spa_close(spa, FTAG); 8051 mutex_exit(&spa_namespace_lock); 8052 8053 return (error); 8054 } 8055 8056 static int 8057 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8058 list_t *vd_list) 8059 { 8060 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8061 8062 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8063 8064 /* Look up vdev and ensure it's a leaf. */ 8065 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8066 if (vd == NULL || vd->vdev_detached) { 8067 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8068 return (SET_ERROR(ENODEV)); 8069 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8070 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8071 return (SET_ERROR(EINVAL)); 8072 } else if (!vdev_writeable(vd)) { 8073 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8074 return (SET_ERROR(EROFS)); 8075 } 8076 mutex_enter(&vd->vdev_initialize_lock); 8077 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8078 8079 /* 8080 * When we activate an initialize action we check to see 8081 * if the vdev_initialize_thread is NULL. We do this instead 8082 * of using the vdev_initialize_state since there might be 8083 * a previous initialization process which has completed but 8084 * the thread is not exited. 
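 *
 * Added summary of the command checks below (not in the original
 * source):
 *
 *	POOL_INITIALIZE_START:   EBUSY if an initialize thread still
 *	                         exists, or if the top-level vdev is being
 *	                         removed or is undergoing raidz expansion.
 *	POOL_INITIALIZE_CANCEL:  ESRCH unless currently active or
 *	                         suspended.
 *	POOL_INITIALIZE_SUSPEND: ESRCH unless currently active.
 *	POOL_INITIALIZE_UNINIT:  EBUSY while an initialize thread still
 *	                         exists.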
8085 */ 8086 if (cmd_type == POOL_INITIALIZE_START && 8087 (vd->vdev_initialize_thread != NULL || 8088 vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { 8089 mutex_exit(&vd->vdev_initialize_lock); 8090 return (SET_ERROR(EBUSY)); 8091 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 8092 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 8093 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 8094 mutex_exit(&vd->vdev_initialize_lock); 8095 return (SET_ERROR(ESRCH)); 8096 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 8097 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 8098 mutex_exit(&vd->vdev_initialize_lock); 8099 return (SET_ERROR(ESRCH)); 8100 } else if (cmd_type == POOL_INITIALIZE_UNINIT && 8101 vd->vdev_initialize_thread != NULL) { 8102 mutex_exit(&vd->vdev_initialize_lock); 8103 return (SET_ERROR(EBUSY)); 8104 } 8105 8106 switch (cmd_type) { 8107 case POOL_INITIALIZE_START: 8108 vdev_initialize(vd); 8109 break; 8110 case POOL_INITIALIZE_CANCEL: 8111 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 8112 break; 8113 case POOL_INITIALIZE_SUSPEND: 8114 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 8115 break; 8116 case POOL_INITIALIZE_UNINIT: 8117 vdev_uninitialize(vd); 8118 break; 8119 default: 8120 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8121 } 8122 mutex_exit(&vd->vdev_initialize_lock); 8123 8124 return (0); 8125 } 8126 8127 int 8128 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 8129 nvlist_t *vdev_errlist) 8130 { 8131 int total_errors = 0; 8132 list_t vd_list; 8133 8134 list_create(&vd_list, sizeof (vdev_t), 8135 offsetof(vdev_t, vdev_initialize_node)); 8136 8137 /* 8138 * We hold the namespace lock through the whole function 8139 * to prevent any changes to the pool while we're starting or 8140 * stopping initialization. The config and state locks are held so that 8141 * we can properly assess the vdev state before we commit to 8142 * the initializing operation. 8143 */ 8144 mutex_enter(&spa_namespace_lock); 8145 8146 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8147 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8148 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8149 8150 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 8151 &vd_list); 8152 if (error != 0) { 8153 char guid_as_str[MAXNAMELEN]; 8154 8155 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8156 "%llu", (unsigned long long)vdev_guid); 8157 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8158 total_errors++; 8159 } 8160 } 8161 8162 /* Wait for all initialize threads to stop. */ 8163 vdev_initialize_stop_wait(spa, &vd_list); 8164 8165 /* Sync out the initializing state */ 8166 txg_wait_synced(spa->spa_dsl_pool, 0); 8167 mutex_exit(&spa_namespace_lock); 8168 8169 list_destroy(&vd_list); 8170 8171 return (total_errors); 8172 } 8173 8174 static int 8175 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8176 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 8177 { 8178 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8179 8180 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8181 8182 /* Look up vdev and ensure it's a leaf. 
*/ 8183 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8184 if (vd == NULL || vd->vdev_detached) { 8185 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8186 return (SET_ERROR(ENODEV)); 8187 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8188 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8189 return (SET_ERROR(EINVAL)); 8190 } else if (!vdev_writeable(vd)) { 8191 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8192 return (SET_ERROR(EROFS)); 8193 } else if (!vd->vdev_has_trim) { 8194 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8195 return (SET_ERROR(EOPNOTSUPP)); 8196 } else if (secure && !vd->vdev_has_securetrim) { 8197 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8198 return (SET_ERROR(EOPNOTSUPP)); 8199 } 8200 mutex_enter(&vd->vdev_trim_lock); 8201 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8202 8203 /* 8204 * When we activate a TRIM action we check to see if the 8205 * vdev_trim_thread is NULL. We do this instead of using the 8206 * vdev_trim_state since there might be a previous TRIM process 8207 * which has completed but whose thread has not yet exited. 8208 */ 8209 if (cmd_type == POOL_TRIM_START && 8210 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || 8211 vd->vdev_top->vdev_rz_expanding)) { 8212 mutex_exit(&vd->vdev_trim_lock); 8213 return (SET_ERROR(EBUSY)); 8214 } else if (cmd_type == POOL_TRIM_CANCEL && 8215 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 8216 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 8217 mutex_exit(&vd->vdev_trim_lock); 8218 return (SET_ERROR(ESRCH)); 8219 } else if (cmd_type == POOL_TRIM_SUSPEND && 8220 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 8221 mutex_exit(&vd->vdev_trim_lock); 8222 return (SET_ERROR(ESRCH)); 8223 } 8224 8225 switch (cmd_type) { 8226 case POOL_TRIM_START: 8227 vdev_trim(vd, rate, partial, secure); 8228 break; 8229 case POOL_TRIM_CANCEL: 8230 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 8231 break; 8232 case POOL_TRIM_SUSPEND: 8233 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 8234 break; 8235 default: 8236 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8237 } 8238 mutex_exit(&vd->vdev_trim_lock); 8239 8240 return (0); 8241 } 8242 8243 /* 8244 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 8245 * TRIM threads for each child vdev. These threads pass over all of the free 8246 * space in the vdev's metaslabs and issue TRIM commands for that space. 8247 */ 8248 int 8249 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 8250 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 8251 { 8252 int total_errors = 0; 8253 list_t vd_list; 8254 8255 list_create(&vd_list, sizeof (vdev_t), 8256 offsetof(vdev_t, vdev_trim_node)); 8257 8258 /* 8259 * We hold the namespace lock through the whole function 8260 * to prevent any changes to the pool while we're starting or 8261 * stopping TRIM. The config and state locks are held so that 8262 * we can properly assess the vdev state before we commit to 8263 * the TRIM operation.
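 *
 * Illustrative caller sketch (not part of the original source; in
 * practice this is reached from the pool TRIM ioctl).  The key string is
 * arbitrary -- only the uint64 values are consumed below -- and a rate
 * of 0 is assumed here to mean "no throttling":
 *
 *	nvlist_t *nv = fnvlist_alloc();
 *	nvlist_t *errlist = fnvlist_alloc();
 *	fnvlist_add_uint64(nv, "vdev", vdev_guid);
 *	int failed = spa_vdev_trim(spa, nv, POOL_TRIM_START, 0,
 *	    B_FALSE, B_FALSE, errlist);
 *	fnvlist_free(nv);
 *	fnvlist_free(errlist);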
8264 */ 8265 mutex_enter(&spa_namespace_lock); 8266 8267 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8268 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8269 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8270 8271 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 8272 rate, partial, secure, &vd_list); 8273 if (error != 0) { 8274 char guid_as_str[MAXNAMELEN]; 8275 8276 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8277 "%llu", (unsigned long long)vdev_guid); 8278 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8279 total_errors++; 8280 } 8281 } 8282 8283 /* Wait for all TRIM threads to stop. */ 8284 vdev_trim_stop_wait(spa, &vd_list); 8285 8286 /* Sync out the TRIM state */ 8287 txg_wait_synced(spa->spa_dsl_pool, 0); 8288 mutex_exit(&spa_namespace_lock); 8289 8290 list_destroy(&vd_list); 8291 8292 return (total_errors); 8293 } 8294 8295 /* 8296 * Split a set of devices from their mirrors, and create a new pool from them. 8297 */ 8298 int 8299 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 8300 nvlist_t *props, boolean_t exp) 8301 { 8302 int error = 0; 8303 uint64_t txg, *glist; 8304 spa_t *newspa; 8305 uint_t c, children, lastlog; 8306 nvlist_t **child, *nvl, *tmp; 8307 dmu_tx_t *tx; 8308 const char *altroot = NULL; 8309 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 8310 boolean_t activate_slog; 8311 8312 ASSERT(spa_writeable(spa)); 8313 8314 txg = spa_vdev_enter(spa); 8315 8316 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8317 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 8318 error = (spa_has_checkpoint(spa)) ? 8319 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 8320 return (spa_vdev_exit(spa, NULL, txg, error)); 8321 } 8322 8323 /* clear the log and flush everything up to now */ 8324 activate_slog = spa_passivate_log(spa); 8325 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8326 error = spa_reset_logs(spa); 8327 txg = spa_vdev_config_enter(spa); 8328 8329 if (activate_slog) 8330 spa_activate_log(spa); 8331 8332 if (error != 0) 8333 return (spa_vdev_exit(spa, NULL, txg, error)); 8334 8335 /* check new spa name before going any further */ 8336 if (spa_lookup(newname) != NULL) 8337 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 8338 8339 /* 8340 * scan through all the children to ensure they're all mirrors 8341 */ 8342 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 8343 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 8344 &children) != 0) 8345 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8346 8347 /* first, check to ensure we've got the right child count */ 8348 rvd = spa->spa_root_vdev; 8349 lastlog = 0; 8350 for (c = 0; c < rvd->vdev_children; c++) { 8351 vdev_t *vd = rvd->vdev_child[c]; 8352 8353 /* don't count the holes & logs as children */ 8354 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 8355 !vdev_is_concrete(vd))) { 8356 if (lastlog == 0) 8357 lastlog = c; 8358 continue; 8359 } 8360 8361 lastlog = 0; 8362 } 8363 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 8364 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8365 8366 /* next, ensure no spare or cache devices are part of the split */ 8367 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 8368 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 8369 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8370 8371 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 8372 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 8373 8374 /* then, loop over each vdev and validate it */ 8375 for (c = 0; c < children; c++) { 8376 uint64_t is_hole = 0; 8377 8378 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 8379 &is_hole); 8380 8381 if (is_hole != 0) { 8382 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 8383 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 8384 continue; 8385 } else { 8386 error = SET_ERROR(EINVAL); 8387 break; 8388 } 8389 } 8390 8391 /* deal with indirect vdevs */ 8392 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 8393 &vdev_indirect_ops) 8394 continue; 8395 8396 /* which disk is going to be split? */ 8397 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 8398 &glist[c]) != 0) { 8399 error = SET_ERROR(EINVAL); 8400 break; 8401 } 8402 8403 /* look it up in the spa */ 8404 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 8405 if (vml[c] == NULL) { 8406 error = SET_ERROR(ENODEV); 8407 break; 8408 } 8409 8410 /* make sure there's nothing stopping the split */ 8411 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 8412 vml[c]->vdev_islog || 8413 !vdev_is_concrete(vml[c]) || 8414 vml[c]->vdev_isspare || 8415 vml[c]->vdev_isl2cache || 8416 !vdev_writeable(vml[c]) || 8417 vml[c]->vdev_children != 0 || 8418 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 8419 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 8420 error = SET_ERROR(EINVAL); 8421 break; 8422 } 8423 8424 if (vdev_dtl_required(vml[c]) || 8425 vdev_resilver_needed(vml[c], NULL, NULL)) { 8426 error = SET_ERROR(EBUSY); 8427 break; 8428 } 8429 8430 /* we need certain info from the top level */ 8431 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 8432 vml[c]->vdev_top->vdev_ms_array); 8433 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 8434 vml[c]->vdev_top->vdev_ms_shift); 8435 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 8436 vml[c]->vdev_top->vdev_asize); 8437 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 8438 vml[c]->vdev_top->vdev_ashift); 8439 8440 /* transfer per-vdev ZAPs */ 8441 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 8442 VERIFY0(nvlist_add_uint64(child[c], 8443 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 8444 8445 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 8446 VERIFY0(nvlist_add_uint64(child[c], 8447 ZPOOL_CONFIG_VDEV_TOP_ZAP, 8448 vml[c]->vdev_parent->vdev_top_zap)); 8449 } 8450 8451 if (error != 0) { 8452 kmem_free(vml, children * sizeof (vdev_t *)); 8453 kmem_free(glist, children * sizeof (uint64_t)); 8454 return (spa_vdev_exit(spa, NULL, txg, error)); 8455 } 8456 8457 /* stop writers from using the disks */ 8458 for (c = 0; c < children; c++) { 8459 if (vml[c] != NULL) 8460 vml[c]->vdev_offline = B_TRUE; 8461 } 8462 vdev_reopen(spa->spa_root_vdev); 8463 8464 /* 8465 * Temporarily record the splitting vdevs in the spa config. This 8466 * will disappear once the config is regenerated. 
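 *
 * Added illustration (not in the original source) of what gets recorded:
 *
 *	spa_config
 *	    ZPOOL_CONFIG_SPLIT              (nvlist)
 *	        ZPOOL_CONFIG_SPLIT_LIST     uint64 array of the guids of
 *	                                    the children being split off
 *
 * The new pool's spa_config_splitting later gets the complementary
 * ZPOOL_CONFIG_SPLIT_GUID entry (set further below) that points back at
 * this pool's guid.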
8467 */ 8468 nvl = fnvlist_alloc(); 8469 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 8470 kmem_free(glist, children * sizeof (uint64_t)); 8471 8472 mutex_enter(&spa->spa_props_lock); 8473 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 8474 mutex_exit(&spa->spa_props_lock); 8475 spa->spa_config_splitting = nvl; 8476 vdev_config_dirty(spa->spa_root_vdev); 8477 8478 /* configure and create the new pool */ 8479 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 8480 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 8481 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 8482 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 8483 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 8484 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 8485 spa_generate_guid(NULL)); 8486 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 8487 (void) nvlist_lookup_string(props, 8488 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 8489 8490 /* add the new pool to the namespace */ 8491 newspa = spa_add(newname, config, altroot); 8492 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 8493 newspa->spa_config_txg = spa->spa_config_txg; 8494 spa_set_log_state(newspa, SPA_LOG_CLEAR); 8495 8496 /* release the spa config lock, retaining the namespace lock */ 8497 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8498 8499 if (zio_injection_enabled) 8500 zio_handle_panic_injection(spa, FTAG, 1); 8501 8502 spa_activate(newspa, spa_mode_global); 8503 spa_async_suspend(newspa); 8504 8505 /* 8506 * Temporarily stop the initializing and TRIM activity. We set the 8507 * state to ACTIVE so that we know to resume initializing or TRIM 8508 * once the split has completed. 8509 */ 8510 list_t vd_initialize_list; 8511 list_create(&vd_initialize_list, sizeof (vdev_t), 8512 offsetof(vdev_t, vdev_initialize_node)); 8513 8514 list_t vd_trim_list; 8515 list_create(&vd_trim_list, sizeof (vdev_t), 8516 offsetof(vdev_t, vdev_trim_node)); 8517 8518 for (c = 0; c < children; c++) { 8519 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8520 mutex_enter(&vml[c]->vdev_initialize_lock); 8521 vdev_initialize_stop(vml[c], 8522 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 8523 mutex_exit(&vml[c]->vdev_initialize_lock); 8524 8525 mutex_enter(&vml[c]->vdev_trim_lock); 8526 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 8527 mutex_exit(&vml[c]->vdev_trim_lock); 8528 } 8529 } 8530 8531 vdev_initialize_stop_wait(spa, &vd_initialize_list); 8532 vdev_trim_stop_wait(spa, &vd_trim_list); 8533 8534 list_destroy(&vd_initialize_list); 8535 list_destroy(&vd_trim_list); 8536 8537 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 8538 newspa->spa_is_splitting = B_TRUE; 8539 8540 /* create the new pool from the disks of the original pool */ 8541 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 8542 if (error) 8543 goto out; 8544 8545 /* if that worked, generate a real config for the new pool */ 8546 if (newspa->spa_root_vdev != NULL) { 8547 newspa->spa_config_splitting = fnvlist_alloc(); 8548 fnvlist_add_uint64(newspa->spa_config_splitting, 8549 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 8550 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 8551 B_TRUE)); 8552 } 8553 8554 /* set the props */ 8555 if (props != NULL) { 8556 spa_configfile_set(newspa, props, B_FALSE); 8557 error = spa_prop_set(newspa, props); 8558 if (error) 8559 goto out; 8560 } 8561 8562 /* flush everything */ 8563 txg = 
spa_vdev_config_enter(newspa); 8564 vdev_config_dirty(newspa->spa_root_vdev); 8565 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 8566 8567 if (zio_injection_enabled) 8568 zio_handle_panic_injection(spa, FTAG, 2); 8569 8570 spa_async_resume(newspa); 8571 8572 /* finally, update the original pool's config */ 8573 txg = spa_vdev_config_enter(spa); 8574 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 8575 error = dmu_tx_assign(tx, TXG_WAIT); 8576 if (error != 0) 8577 dmu_tx_abort(tx); 8578 for (c = 0; c < children; c++) { 8579 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8580 vdev_t *tvd = vml[c]->vdev_top; 8581 8582 /* 8583 * Need to be sure the detachable VDEV is not 8584 * on any *other* txg's DTL list to prevent it 8585 * from being accessed after it's freed. 8586 */ 8587 for (int t = 0; t < TXG_SIZE; t++) { 8588 (void) txg_list_remove_this( 8589 &tvd->vdev_dtl_list, vml[c], t); 8590 } 8591 8592 vdev_split(vml[c]); 8593 if (error == 0) 8594 spa_history_log_internal(spa, "detach", tx, 8595 "vdev=%s", vml[c]->vdev_path); 8596 8597 vdev_free(vml[c]); 8598 } 8599 } 8600 spa->spa_avz_action = AVZ_ACTION_REBUILD; 8601 vdev_config_dirty(spa->spa_root_vdev); 8602 spa->spa_config_splitting = NULL; 8603 nvlist_free(nvl); 8604 if (error == 0) 8605 dmu_tx_commit(tx); 8606 (void) spa_vdev_exit(spa, NULL, txg, 0); 8607 8608 if (zio_injection_enabled) 8609 zio_handle_panic_injection(spa, FTAG, 3); 8610 8611 /* split is complete; log a history record */ 8612 spa_history_log_internal(newspa, "split", NULL, 8613 "from pool %s", spa_name(spa)); 8614 8615 newspa->spa_is_splitting = B_FALSE; 8616 kmem_free(vml, children * sizeof (vdev_t *)); 8617 8618 /* if we're not going to mount the filesystems in userland, export */ 8619 if (exp) 8620 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 8621 B_FALSE, B_FALSE); 8622 8623 return (error); 8624 8625 out: 8626 spa_unload(newspa); 8627 spa_deactivate(newspa); 8628 spa_remove(newspa); 8629 8630 txg = spa_vdev_config_enter(spa); 8631 8632 /* re-online all offlined disks */ 8633 for (c = 0; c < children; c++) { 8634 if (vml[c] != NULL) 8635 vml[c]->vdev_offline = B_FALSE; 8636 } 8637 8638 /* restart initializing or trimming disks as necessary */ 8639 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 8640 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 8641 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 8642 8643 vdev_reopen(spa->spa_root_vdev); 8644 8645 nvlist_free(spa->spa_config_splitting); 8646 spa->spa_config_splitting = NULL; 8647 (void) spa_vdev_exit(spa, NULL, txg, error); 8648 8649 kmem_free(vml, children * sizeof (vdev_t *)); 8650 return (error); 8651 } 8652 8653 /* 8654 * Find any device that's done replacing, or a vdev marked 'unspare' that's 8655 * currently spared, so we can detach it. 8656 */ 8657 static vdev_t * 8658 spa_vdev_resilver_done_hunt(vdev_t *vd) 8659 { 8660 vdev_t *newvd, *oldvd; 8661 8662 for (int c = 0; c < vd->vdev_children; c++) { 8663 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 8664 if (oldvd != NULL) 8665 return (oldvd); 8666 } 8667 8668 /* 8669 * Check for a completed replacement. We always consider the first 8670 * vdev in the list to be the oldest vdev, and the last one to be 8671 * the newest (see spa_vdev_attach() for how that works). In 8672 * the case where the newest vdev is faulted, we will not automatically 8673 * remove it after a resilver completes. This is OK as it will require 8674 * user intervention to determine which disk the admin wishes to keep. 
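 *
 * Added worked example (not in the original source): for a replacing
 * vdev R(B, C) created by "replace B with C", child[0] is B (the disk
 * being replaced) and child[vdev_children - 1] is C (the new disk).
 * Once C's DTL_MISSING and DTL_OUTAGE are empty and B is no longer
 * required, B is returned so the caller can detach it.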
8675 */ 8676 if (vd->vdev_ops == &vdev_replacing_ops) { 8677 ASSERT(vd->vdev_children > 1); 8678 8679 newvd = vd->vdev_child[vd->vdev_children - 1]; 8680 oldvd = vd->vdev_child[0]; 8681 8682 if (vdev_dtl_empty(newvd, DTL_MISSING) && 8683 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8684 !vdev_dtl_required(oldvd)) 8685 return (oldvd); 8686 } 8687 8688 /* 8689 * Check for a completed resilver with the 'unspare' flag set. 8690 * Also potentially update faulted state. 8691 */ 8692 if (vd->vdev_ops == &vdev_spare_ops) { 8693 vdev_t *first = vd->vdev_child[0]; 8694 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8695 8696 if (last->vdev_unspare) { 8697 oldvd = first; 8698 newvd = last; 8699 } else if (first->vdev_unspare) { 8700 oldvd = last; 8701 newvd = first; 8702 } else { 8703 oldvd = NULL; 8704 } 8705 8706 if (oldvd != NULL && 8707 vdev_dtl_empty(newvd, DTL_MISSING) && 8708 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8709 !vdev_dtl_required(oldvd)) 8710 return (oldvd); 8711 8712 vdev_propagate_state(vd); 8713 8714 /* 8715 * If there are more than two spares attached to a disk, 8716 * and those spares are not required, then we want to 8717 * attempt to free them up now so that they can be used 8718 * by other pools. Once we're back down to a single 8719 * disk+spare, we stop removing them. 8720 */ 8721 if (vd->vdev_children > 2) { 8722 newvd = vd->vdev_child[1]; 8723 8724 if (newvd->vdev_isspare && last->vdev_isspare && 8725 vdev_dtl_empty(last, DTL_MISSING) && 8726 vdev_dtl_empty(last, DTL_OUTAGE) && 8727 !vdev_dtl_required(newvd)) 8728 return (newvd); 8729 } 8730 } 8731 8732 return (NULL); 8733 } 8734 8735 static void 8736 spa_vdev_resilver_done(spa_t *spa) 8737 { 8738 vdev_t *vd, *pvd, *ppvd; 8739 uint64_t guid, sguid, pguid, ppguid; 8740 8741 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8742 8743 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8744 pvd = vd->vdev_parent; 8745 ppvd = pvd->vdev_parent; 8746 guid = vd->vdev_guid; 8747 pguid = pvd->vdev_guid; 8748 ppguid = ppvd->vdev_guid; 8749 sguid = 0; 8750 /* 8751 * If we have just finished replacing a hot spared device, then 8752 * we need to detach the parent's first child (the original hot 8753 * spare) as well. 8754 */ 8755 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8756 ppvd->vdev_children == 2) { 8757 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8758 sguid = ppvd->vdev_child[1]->vdev_guid; 8759 } 8760 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8761 8762 spa_config_exit(spa, SCL_ALL, FTAG); 8763 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8764 return; 8765 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8766 return; 8767 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8768 } 8769 8770 spa_config_exit(spa, SCL_ALL, FTAG); 8771 8772 /* 8773 * If a detach was not performed above replace waiters will not have 8774 * been notified. In which case we must do so now. 8775 */ 8776 spa_notify_waiters(spa); 8777 } 8778 8779 /* 8780 * Update the stored path or FRU for this vdev. 
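 *
 * Illustrative usage (not part of the original source; the guid, path
 * and FRU strings are hypothetical).  Callers normally go through the
 * thin wrappers defined below:
 *
 *	error = spa_vdev_setpath(spa, vdev_guid,
 *	    "/dev/disk/by-id/ata-EXAMPLE-part1");
 *	error = spa_vdev_setfru(spa, vdev_guid, "Slot 4");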
8781 */ 8782 static int 8783 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8784 boolean_t ispath) 8785 { 8786 vdev_t *vd; 8787 boolean_t sync = B_FALSE; 8788 8789 ASSERT(spa_writeable(spa)); 8790 8791 spa_vdev_state_enter(spa, SCL_ALL); 8792 8793 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8794 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8795 8796 if (!vd->vdev_ops->vdev_op_leaf) 8797 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8798 8799 if (ispath) { 8800 if (strcmp(value, vd->vdev_path) != 0) { 8801 spa_strfree(vd->vdev_path); 8802 vd->vdev_path = spa_strdup(value); 8803 sync = B_TRUE; 8804 } 8805 } else { 8806 if (vd->vdev_fru == NULL) { 8807 vd->vdev_fru = spa_strdup(value); 8808 sync = B_TRUE; 8809 } else if (strcmp(value, vd->vdev_fru) != 0) { 8810 spa_strfree(vd->vdev_fru); 8811 vd->vdev_fru = spa_strdup(value); 8812 sync = B_TRUE; 8813 } 8814 } 8815 8816 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8817 } 8818 8819 int 8820 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8821 { 8822 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8823 } 8824 8825 int 8826 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8827 { 8828 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8829 } 8830 8831 /* 8832 * ========================================================================== 8833 * SPA Scanning 8834 * ========================================================================== 8835 */ 8836 int 8837 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8838 { 8839 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8840 8841 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8842 return (SET_ERROR(EBUSY)); 8843 8844 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8845 } 8846 8847 int 8848 spa_scan_stop(spa_t *spa) 8849 { 8850 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8851 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8852 return (SET_ERROR(EBUSY)); 8853 8854 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8855 } 8856 8857 int 8858 spa_scan(spa_t *spa, pool_scan_func_t func) 8859 { 8860 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8861 8862 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8863 return (SET_ERROR(ENOTSUP)); 8864 8865 if (func == POOL_SCAN_RESILVER && 8866 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8867 return (SET_ERROR(ENOTSUP)); 8868 8869 /* 8870 * If a resilver was requested, but there is no DTL on a 8871 * writeable leaf device, we have nothing to do. 
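 *
 * (Added note: "DTL" is the vdev's dirty time log, the per-vdev record
 * of txgs during which writes may have been missed.  If no writeable
 * leaf has such a record, there is nothing to repair, so we only queue
 * SPA_ASYNC_RESILVER_DONE below to notify any waiters.)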
8872 */ 8873 if (func == POOL_SCAN_RESILVER && 8874 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8875 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8876 return (0); 8877 } 8878 8879 if (func == POOL_SCAN_ERRORSCRUB && 8880 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) 8881 return (SET_ERROR(ENOTSUP)); 8882 8883 return (dsl_scan(spa->spa_dsl_pool, func)); 8884 } 8885 8886 /* 8887 * ========================================================================== 8888 * SPA async task processing 8889 * ========================================================================== 8890 */ 8891 8892 static void 8893 spa_async_remove(spa_t *spa, vdev_t *vd) 8894 { 8895 if (vd->vdev_remove_wanted) { 8896 vd->vdev_remove_wanted = B_FALSE; 8897 vd->vdev_delayed_close = B_FALSE; 8898 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8899 8900 /* 8901 * We want to clear the stats, but we don't want to do a full 8902 * vdev_clear() as that will cause us to throw away 8903 * degraded/faulted state as well as attempt to reopen the 8904 * device, all of which is a waste. 8905 */ 8906 vd->vdev_stat.vs_read_errors = 0; 8907 vd->vdev_stat.vs_write_errors = 0; 8908 vd->vdev_stat.vs_checksum_errors = 0; 8909 8910 vdev_state_dirty(vd->vdev_top); 8911 8912 /* Tell userspace that the vdev is gone. */ 8913 zfs_post_remove(spa, vd); 8914 } 8915 8916 for (int c = 0; c < vd->vdev_children; c++) 8917 spa_async_remove(spa, vd->vdev_child[c]); 8918 } 8919 8920 static void 8921 spa_async_fault_vdev(spa_t *spa, vdev_t *vd) 8922 { 8923 if (vd->vdev_fault_wanted) { 8924 vd->vdev_fault_wanted = B_FALSE; 8925 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 8926 VDEV_AUX_ERR_EXCEEDED); 8927 } 8928 8929 for (int c = 0; c < vd->vdev_children; c++) 8930 spa_async_fault_vdev(spa, vd->vdev_child[c]); 8931 } 8932 8933 static void 8934 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8935 { 8936 if (!spa->spa_autoexpand) 8937 return; 8938 8939 for (int c = 0; c < vd->vdev_children; c++) { 8940 vdev_t *cvd = vd->vdev_child[c]; 8941 spa_async_autoexpand(spa, cvd); 8942 } 8943 8944 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8945 return; 8946 8947 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8948 } 8949 8950 static __attribute__((noreturn)) void 8951 spa_async_thread(void *arg) 8952 { 8953 spa_t *spa = (spa_t *)arg; 8954 dsl_pool_t *dp = spa->spa_dsl_pool; 8955 int tasks; 8956 8957 ASSERT(spa->spa_sync_on); 8958 8959 mutex_enter(&spa->spa_async_lock); 8960 tasks = spa->spa_async_tasks; 8961 spa->spa_async_tasks = 0; 8962 mutex_exit(&spa->spa_async_lock); 8963 8964 /* 8965 * See if the config needs to be updated. 
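 *
 * Added context (not in the original source): each block below consumes
 * one SPA_ASYNC_* bit.  Producers queue work roughly like this:
 *
 *	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 *
 * and spa_async_dispatch(), called from the sync path among others,
 * spawns this thread when tasks are pending, async work is not
 * suspended, and no async thread is already running.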
8966 */ 8967 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8968 uint64_t old_space, new_space; 8969 8970 mutex_enter(&spa_namespace_lock); 8971 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8972 old_space += metaslab_class_get_space(spa_special_class(spa)); 8973 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8974 old_space += metaslab_class_get_space( 8975 spa_embedded_log_class(spa)); 8976 8977 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8978 8979 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8980 new_space += metaslab_class_get_space(spa_special_class(spa)); 8981 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8982 new_space += metaslab_class_get_space( 8983 spa_embedded_log_class(spa)); 8984 mutex_exit(&spa_namespace_lock); 8985 8986 /* 8987 * If the pool grew as a result of the config update, 8988 * then log an internal history event. 8989 */ 8990 if (new_space != old_space) { 8991 spa_history_log_internal(spa, "vdev online", NULL, 8992 "pool '%s' size: %llu(+%llu)", 8993 spa_name(spa), (u_longlong_t)new_space, 8994 (u_longlong_t)(new_space - old_space)); 8995 } 8996 } 8997 8998 /* 8999 * See if any devices need to be marked REMOVED. 9000 */ 9001 if (tasks & SPA_ASYNC_REMOVE) { 9002 spa_vdev_state_enter(spa, SCL_NONE); 9003 spa_async_remove(spa, spa->spa_root_vdev); 9004 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 9005 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 9006 for (int i = 0; i < spa->spa_spares.sav_count; i++) 9007 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 9008 (void) spa_vdev_state_exit(spa, NULL, 0); 9009 } 9010 9011 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 9012 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9013 spa_async_autoexpand(spa, spa->spa_root_vdev); 9014 spa_config_exit(spa, SCL_CONFIG, FTAG); 9015 } 9016 9017 /* 9018 * See if any devices need to be marked faulted. 9019 */ 9020 if (tasks & SPA_ASYNC_FAULT_VDEV) { 9021 spa_vdev_state_enter(spa, SCL_NONE); 9022 spa_async_fault_vdev(spa, spa->spa_root_vdev); 9023 (void) spa_vdev_state_exit(spa, NULL, 0); 9024 } 9025 9026 /* 9027 * If any devices are done replacing, detach them. 9028 */ 9029 if (tasks & SPA_ASYNC_RESILVER_DONE || 9030 tasks & SPA_ASYNC_REBUILD_DONE || 9031 tasks & SPA_ASYNC_DETACH_SPARE) { 9032 spa_vdev_resilver_done(spa); 9033 } 9034 9035 /* 9036 * Kick off a resilver. 9037 */ 9038 if (tasks & SPA_ASYNC_RESILVER && 9039 !vdev_rebuild_active(spa->spa_root_vdev) && 9040 (!dsl_scan_resilvering(dp) || 9041 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 9042 dsl_scan_restart_resilver(dp, 0); 9043 9044 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 9045 mutex_enter(&spa_namespace_lock); 9046 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9047 vdev_initialize_restart(spa->spa_root_vdev); 9048 spa_config_exit(spa, SCL_CONFIG, FTAG); 9049 mutex_exit(&spa_namespace_lock); 9050 } 9051 9052 if (tasks & SPA_ASYNC_TRIM_RESTART) { 9053 mutex_enter(&spa_namespace_lock); 9054 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9055 vdev_trim_restart(spa->spa_root_vdev); 9056 spa_config_exit(spa, SCL_CONFIG, FTAG); 9057 mutex_exit(&spa_namespace_lock); 9058 } 9059 9060 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 9061 mutex_enter(&spa_namespace_lock); 9062 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9063 vdev_autotrim_restart(spa); 9064 spa_config_exit(spa, SCL_CONFIG, FTAG); 9065 mutex_exit(&spa_namespace_lock); 9066 } 9067 9068 /* 9069 * Kick off L2 cache whole device TRIM. 
9070 */ 9071 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 9072 mutex_enter(&spa_namespace_lock); 9073 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9074 vdev_trim_l2arc(spa); 9075 spa_config_exit(spa, SCL_CONFIG, FTAG); 9076 mutex_exit(&spa_namespace_lock); 9077 } 9078 9079 /* 9080 * Kick off L2 cache rebuilding. 9081 */ 9082 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 9083 mutex_enter(&spa_namespace_lock); 9084 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 9085 l2arc_spa_rebuild_start(spa); 9086 spa_config_exit(spa, SCL_L2ARC, FTAG); 9087 mutex_exit(&spa_namespace_lock); 9088 } 9089 9090 /* 9091 * Let the world know that we're done. 9092 */ 9093 mutex_enter(&spa->spa_async_lock); 9094 spa->spa_async_thread = NULL; 9095 cv_broadcast(&spa->spa_async_cv); 9096 mutex_exit(&spa->spa_async_lock); 9097 thread_exit(); 9098 } 9099 9100 void 9101 spa_async_suspend(spa_t *spa) 9102 { 9103 mutex_enter(&spa->spa_async_lock); 9104 spa->spa_async_suspended++; 9105 while (spa->spa_async_thread != NULL) 9106 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 9107 mutex_exit(&spa->spa_async_lock); 9108 9109 spa_vdev_remove_suspend(spa); 9110 9111 zthr_t *condense_thread = spa->spa_condense_zthr; 9112 if (condense_thread != NULL) 9113 zthr_cancel(condense_thread); 9114 9115 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9116 if (raidz_expand_thread != NULL) 9117 zthr_cancel(raidz_expand_thread); 9118 9119 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9120 if (discard_thread != NULL) 9121 zthr_cancel(discard_thread); 9122 9123 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9124 if (ll_delete_thread != NULL) 9125 zthr_cancel(ll_delete_thread); 9126 9127 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9128 if (ll_condense_thread != NULL) 9129 zthr_cancel(ll_condense_thread); 9130 } 9131 9132 void 9133 spa_async_resume(spa_t *spa) 9134 { 9135 mutex_enter(&spa->spa_async_lock); 9136 ASSERT(spa->spa_async_suspended != 0); 9137 spa->spa_async_suspended--; 9138 mutex_exit(&spa->spa_async_lock); 9139 spa_restart_removal(spa); 9140 9141 zthr_t *condense_thread = spa->spa_condense_zthr; 9142 if (condense_thread != NULL) 9143 zthr_resume(condense_thread); 9144 9145 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9146 if (raidz_expand_thread != NULL) 9147 zthr_resume(raidz_expand_thread); 9148 9149 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9150 if (discard_thread != NULL) 9151 zthr_resume(discard_thread); 9152 9153 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9154 if (ll_delete_thread != NULL) 9155 zthr_resume(ll_delete_thread); 9156 9157 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9158 if (ll_condense_thread != NULL) 9159 zthr_resume(ll_condense_thread); 9160 } 9161 9162 static boolean_t 9163 spa_async_tasks_pending(spa_t *spa) 9164 { 9165 uint_t non_config_tasks; 9166 uint_t config_task; 9167 boolean_t config_task_suspended; 9168 9169 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 9170 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 9171 if (spa->spa_ccw_fail_time == 0) { 9172 config_task_suspended = B_FALSE; 9173 } else { 9174 config_task_suspended = 9175 (gethrtime() - spa->spa_ccw_fail_time) < 9176 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 9177 } 9178 9179 return (non_config_tasks || (config_task && !config_task_suspended)); 9180 } 9181 9182 static void 9183 spa_async_dispatch(spa_t *spa) 9184 { 9185 mutex_enter(&spa->spa_async_lock); 9186 if 
(spa_async_tasks_pending(spa) && 9187 !spa->spa_async_suspended && 9188 spa->spa_async_thread == NULL) 9189 spa->spa_async_thread = thread_create(NULL, 0, 9190 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 9191 mutex_exit(&spa->spa_async_lock); 9192 } 9193 9194 void 9195 spa_async_request(spa_t *spa, int task) 9196 { 9197 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 9198 mutex_enter(&spa->spa_async_lock); 9199 spa->spa_async_tasks |= task; 9200 mutex_exit(&spa->spa_async_lock); 9201 } 9202 9203 int 9204 spa_async_tasks(spa_t *spa) 9205 { 9206 return (spa->spa_async_tasks); 9207 } 9208 9209 /* 9210 * ========================================================================== 9211 * SPA syncing routines 9212 * ========================================================================== 9213 */ 9214 9215 9216 static int 9217 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9218 dmu_tx_t *tx) 9219 { 9220 bpobj_t *bpo = arg; 9221 bpobj_enqueue(bpo, bp, bp_freed, tx); 9222 return (0); 9223 } 9224 9225 int 9226 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9227 { 9228 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 9229 } 9230 9231 int 9232 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9233 { 9234 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 9235 } 9236 9237 static int 9238 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9239 { 9240 zio_t *pio = arg; 9241 9242 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 9243 pio->io_flags)); 9244 return (0); 9245 } 9246 9247 static int 9248 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9249 dmu_tx_t *tx) 9250 { 9251 ASSERT(!bp_freed); 9252 return (spa_free_sync_cb(arg, bp, tx)); 9253 } 9254 9255 /* 9256 * Note: this simple function is not inlined to make it easier to dtrace the 9257 * amount of time spent syncing frees. 9258 */ 9259 static void 9260 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 9261 { 9262 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9263 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 9264 VERIFY(zio_wait(zio) == 0); 9265 } 9266 9267 /* 9268 * Note: this simple function is not inlined to make it easier to dtrace the 9269 * amount of time spent syncing deferred frees. 9270 */ 9271 static void 9272 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 9273 { 9274 if (spa_sync_pass(spa) != 1) 9275 return; 9276 9277 /* 9278 * Note: 9279 * If the log space map feature is active, we stop deferring 9280 * frees to the next TXG and therefore running this function 9281 * would be considered a no-op as spa_deferred_bpobj should 9282 * not have any entries. 9283 * 9284 * That said we run this function anyway (instead of returning 9285 * immediately) for the edge-case scenario where we just 9286 * activated the log space map feature in this TXG but we have 9287 * deferred frees from the previous TXG. 9288 */ 9289 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9290 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 9291 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 9292 VERIFY0(zio_wait(zio)); 9293 } 9294 9295 static void 9296 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 9297 { 9298 char *packed = NULL; 9299 size_t bufsize; 9300 size_t nvsize = 0; 9301 dmu_buf_t *db; 9302 9303 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 9304 9305 /* 9306 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 9307 * information. 
This avoids the dmu_buf_will_dirty() path and 9308 * saves us a pre-read to get data we don't actually care about. 9309 */ 9310 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 9311 packed = vmem_alloc(bufsize, KM_SLEEP); 9312 9313 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 9314 KM_SLEEP) == 0); 9315 memset(packed + nvsize, 0, bufsize - nvsize); 9316 9317 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 9318 9319 vmem_free(packed, bufsize); 9320 9321 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 9322 dmu_buf_will_dirty(db, tx); 9323 *(uint64_t *)db->db_data = nvsize; 9324 dmu_buf_rele(db, FTAG); 9325 } 9326 9327 static void 9328 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 9329 const char *config, const char *entry) 9330 { 9331 nvlist_t *nvroot; 9332 nvlist_t **list; 9333 int i; 9334 9335 if (!sav->sav_sync) 9336 return; 9337 9338 /* 9339 * Update the MOS nvlist describing the list of available devices. 9340 * spa_validate_aux() will have already made sure this nvlist is 9341 * valid and the vdevs are labeled appropriately. 9342 */ 9343 if (sav->sav_object == 0) { 9344 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 9345 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 9346 sizeof (uint64_t), tx); 9347 VERIFY(zap_update(spa->spa_meta_objset, 9348 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 9349 &sav->sav_object, tx) == 0); 9350 } 9351 9352 nvroot = fnvlist_alloc(); 9353 if (sav->sav_count == 0) { 9354 fnvlist_add_nvlist_array(nvroot, config, 9355 (const nvlist_t * const *)NULL, 0); 9356 } else { 9357 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 9358 for (i = 0; i < sav->sav_count; i++) 9359 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 9360 B_FALSE, VDEV_CONFIG_L2CACHE); 9361 fnvlist_add_nvlist_array(nvroot, config, 9362 (const nvlist_t * const *)list, sav->sav_count); 9363 for (i = 0; i < sav->sav_count; i++) 9364 nvlist_free(list[i]); 9365 kmem_free(list, sav->sav_count * sizeof (void *)); 9366 } 9367 9368 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 9369 nvlist_free(nvroot); 9370 9371 sav->sav_sync = B_FALSE; 9372 } 9373 9374 /* 9375 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 9376 * The all-vdev ZAP must be empty. 9377 */ 9378 static void 9379 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 9380 { 9381 spa_t *spa = vd->vdev_spa; 9382 9383 if (vd->vdev_root_zap != 0 && 9384 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 9385 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9386 vd->vdev_root_zap, tx)); 9387 } 9388 if (vd->vdev_top_zap != 0) { 9389 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9390 vd->vdev_top_zap, tx)); 9391 } 9392 if (vd->vdev_leaf_zap != 0) { 9393 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9394 vd->vdev_leaf_zap, tx)); 9395 } 9396 for (uint64_t i = 0; i < vd->vdev_children; i++) { 9397 spa_avz_build(vd->vdev_child[i], avz, tx); 9398 } 9399 } 9400 9401 static void 9402 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 9403 { 9404 nvlist_t *config; 9405 9406 /* 9407 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 9408 * its config may not be dirty but we still need to build per-vdev ZAPs. 9409 * Similarly, if the pool is being assembled (e.g. after a split), we 9410 * need to rebuild the AVZ although the config may not be dirty. 
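 *
 * Added summary of the spa_avz_action cases handled below (not in the
 * original source):
 *
 *	AVZ_ACTION_REBUILD:	build a fresh all-vdev ZAP from the vdev
 *				tree, destroy ZAPs listed only in the old
 *				AVZ, then swap the new AVZ in.
 *	AVZ_ACTION_DESTROY:	destroy every listed ZAP and the AVZ
 *				itself; an empty AVZ is then recreated.
 *	AVZ_ACTION_NONE or
 *	AVZ_ACTION_INITIALIZE:	leave the existing AVZ alone (creating an
 *				empty one if none exists yet).
 *
 * In every case, ZAPs are then created for any vdevs that do not have
 * them, and the updated config is written out.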
9411 */ 9412 if (list_is_empty(&spa->spa_config_dirty_list) && 9413 spa->spa_avz_action == AVZ_ACTION_NONE) 9414 return; 9415 9416 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9417 9418 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 9419 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 9420 spa->spa_all_vdev_zaps != 0); 9421 9422 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 9423 /* Make and build the new AVZ */ 9424 uint64_t new_avz = zap_create(spa->spa_meta_objset, 9425 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 9426 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 9427 9428 /* Diff old AVZ with new one */ 9429 zap_cursor_t zc; 9430 zap_attribute_t za; 9431 9432 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9433 spa->spa_all_vdev_zaps); 9434 zap_cursor_retrieve(&zc, &za) == 0; 9435 zap_cursor_advance(&zc)) { 9436 uint64_t vdzap = za.za_first_integer; 9437 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 9438 vdzap) == ENOENT) { 9439 /* 9440 * ZAP is listed in old AVZ but not in new one; 9441 * destroy it 9442 */ 9443 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 9444 tx)); 9445 } 9446 } 9447 9448 zap_cursor_fini(&zc); 9449 9450 /* Destroy the old AVZ */ 9451 VERIFY0(zap_destroy(spa->spa_meta_objset, 9452 spa->spa_all_vdev_zaps, tx)); 9453 9454 /* Replace the old AVZ in the dir obj with the new one */ 9455 VERIFY0(zap_update(spa->spa_meta_objset, 9456 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 9457 sizeof (new_avz), 1, &new_avz, tx)); 9458 9459 spa->spa_all_vdev_zaps = new_avz; 9460 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 9461 zap_cursor_t zc; 9462 zap_attribute_t za; 9463 9464 /* Walk through the AVZ and destroy all listed ZAPs */ 9465 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9466 spa->spa_all_vdev_zaps); 9467 zap_cursor_retrieve(&zc, &za) == 0; 9468 zap_cursor_advance(&zc)) { 9469 uint64_t zap = za.za_first_integer; 9470 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 9471 } 9472 9473 zap_cursor_fini(&zc); 9474 9475 /* Destroy and unlink the AVZ itself */ 9476 VERIFY0(zap_destroy(spa->spa_meta_objset, 9477 spa->spa_all_vdev_zaps, tx)); 9478 VERIFY0(zap_remove(spa->spa_meta_objset, 9479 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 9480 spa->spa_all_vdev_zaps = 0; 9481 } 9482 9483 if (spa->spa_all_vdev_zaps == 0) { 9484 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 9485 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 9486 DMU_POOL_VDEV_ZAP_MAP, tx); 9487 } 9488 spa->spa_avz_action = AVZ_ACTION_NONE; 9489 9490 /* Create ZAPs for vdevs that don't have them. */ 9491 vdev_construct_zaps(spa->spa_root_vdev, tx); 9492 9493 config = spa_config_generate(spa, spa->spa_root_vdev, 9494 dmu_tx_get_txg(tx), B_FALSE); 9495 9496 /* 9497 * If we're upgrading the spa version then make sure that 9498 * the config object gets updated with the correct version. 9499 */ 9500 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 9501 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 9502 spa->spa_uberblock.ub_version); 9503 9504 spa_config_exit(spa, SCL_STATE, FTAG); 9505 9506 nvlist_free(spa->spa_config_syncing); 9507 spa->spa_config_syncing = config; 9508 9509 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 9510 } 9511 9512 static void 9513 spa_sync_version(void *arg, dmu_tx_t *tx) 9514 { 9515 uint64_t *versionp = arg; 9516 uint64_t version = *versionp; 9517 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9518 9519 /* 9520 * Setting the version is special cased when first creating the pool. 
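 * In practice this sync task only runs for version changes on an existing pool (e.g. 'zpool upgrade'); the assertion below verifies that we are not in the pool-creation txg.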
9521 */ 9522 ASSERT(tx->tx_txg != TXG_INITIAL); 9523 9524 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 9525 ASSERT(version >= spa_version(spa)); 9526 9527 spa->spa_uberblock.ub_version = version; 9528 vdev_config_dirty(spa->spa_root_vdev); 9529 spa_history_log_internal(spa, "set", tx, "version=%lld", 9530 (longlong_t)version); 9531 } 9532 9533 /* 9534 * Set zpool properties. 9535 */ 9536 static void 9537 spa_sync_props(void *arg, dmu_tx_t *tx) 9538 { 9539 nvlist_t *nvp = arg; 9540 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9541 objset_t *mos = spa->spa_meta_objset; 9542 nvpair_t *elem = NULL; 9543 9544 mutex_enter(&spa->spa_props_lock); 9545 9546 while ((elem = nvlist_next_nvpair(nvp, elem))) { 9547 uint64_t intval; 9548 const char *strval, *fname; 9549 zpool_prop_t prop; 9550 const char *propname; 9551 const char *elemname = nvpair_name(elem); 9552 zprop_type_t proptype; 9553 spa_feature_t fid; 9554 9555 switch (prop = zpool_name_to_prop(elemname)) { 9556 case ZPOOL_PROP_VERSION: 9557 intval = fnvpair_value_uint64(elem); 9558 /* 9559 * The version is synced separately before other 9560 * properties and should be correct by now. 9561 */ 9562 ASSERT3U(spa_version(spa), >=, intval); 9563 break; 9564 9565 case ZPOOL_PROP_ALTROOT: 9566 /* 9567 * 'altroot' is a non-persistent property. It should 9568 * have been set temporarily at creation or import time. 9569 */ 9570 ASSERT(spa->spa_root != NULL); 9571 break; 9572 9573 case ZPOOL_PROP_READONLY: 9574 case ZPOOL_PROP_CACHEFILE: 9575 /* 9576 * 'readonly' and 'cachefile' are also non-persistent 9577 * properties. 9578 */ 9579 break; 9580 case ZPOOL_PROP_COMMENT: 9581 strval = fnvpair_value_string(elem); 9582 if (spa->spa_comment != NULL) 9583 spa_strfree(spa->spa_comment); 9584 spa->spa_comment = spa_strdup(strval); 9585 /* 9586 * We need to dirty the configuration on all the vdevs 9587 * so that their labels get updated. We also need to 9588 * update the cache file to keep it in sync with the 9589 * MOS version. It's unnecessary to do this for pool 9590 * creation since the vdev's configuration has already 9591 * been dirtied. 9592 */ 9593 if (tx->tx_txg != TXG_INITIAL) { 9594 vdev_config_dirty(spa->spa_root_vdev); 9595 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9596 } 9597 spa_history_log_internal(spa, "set", tx, 9598 "%s=%s", elemname, strval); 9599 break; 9600 case ZPOOL_PROP_COMPATIBILITY: 9601 strval = fnvpair_value_string(elem); 9602 if (spa->spa_compatibility != NULL) 9603 spa_strfree(spa->spa_compatibility); 9604 spa->spa_compatibility = spa_strdup(strval); 9605 /* 9606 * Dirty the configuration on vdevs as above. 9607 */ 9608 if (tx->tx_txg != TXG_INITIAL) { 9609 vdev_config_dirty(spa->spa_root_vdev); 9610 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9611 } 9612 9613 spa_history_log_internal(spa, "set", tx, 9614 "%s=%s", nvpair_name(elem), strval); 9615 break; 9616 9617 case ZPOOL_PROP_INVAL: 9618 if (zpool_prop_feature(elemname)) { 9619 fname = strchr(elemname, '@') + 1; 9620 VERIFY0(zfeature_lookup_name(fname, &fid)); 9621 9622 spa_feature_enable(spa, fid, tx); 9623 spa_history_log_internal(spa, "set", tx, 9624 "%s=enabled", elemname); 9625 break; 9626 } else if (!zfs_prop_user(elemname)) { 9627 ASSERT(zpool_prop_feature(elemname)); 9628 break; 9629 } 9630 zfs_fallthrough; 9631 default: 9632 /* 9633 * Set pool property values in the poolprops mos object. 
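 * String-valued properties are stored as string ZAP entries and numeric properties as single uint64 entries; index-type values are sanity-checked via zpool_prop_index_to_string() before being written.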
9634 */ 9635 if (spa->spa_pool_props_object == 0) { 9636 spa->spa_pool_props_object = 9637 zap_create_link(mos, DMU_OT_POOL_PROPS, 9638 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 9639 tx); 9640 } 9641 9642 /* normalize the property name */ 9643 if (prop == ZPOOL_PROP_INVAL) { 9644 propname = elemname; 9645 proptype = PROP_TYPE_STRING; 9646 } else { 9647 propname = zpool_prop_to_name(prop); 9648 proptype = zpool_prop_get_type(prop); 9649 } 9650 9651 if (nvpair_type(elem) == DATA_TYPE_STRING) { 9652 ASSERT(proptype == PROP_TYPE_STRING); 9653 strval = fnvpair_value_string(elem); 9654 VERIFY0(zap_update(mos, 9655 spa->spa_pool_props_object, propname, 9656 1, strlen(strval) + 1, strval, tx)); 9657 spa_history_log_internal(spa, "set", tx, 9658 "%s=%s", elemname, strval); 9659 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 9660 intval = fnvpair_value_uint64(elem); 9661 9662 if (proptype == PROP_TYPE_INDEX) { 9663 const char *unused; 9664 VERIFY0(zpool_prop_index_to_string( 9665 prop, intval, &unused)); 9666 } 9667 VERIFY0(zap_update(mos, 9668 spa->spa_pool_props_object, propname, 9669 8, 1, &intval, tx)); 9670 spa_history_log_internal(spa, "set", tx, 9671 "%s=%lld", elemname, 9672 (longlong_t)intval); 9673 9674 switch (prop) { 9675 case ZPOOL_PROP_DELEGATION: 9676 spa->spa_delegation = intval; 9677 break; 9678 case ZPOOL_PROP_BOOTFS: 9679 spa->spa_bootfs = intval; 9680 break; 9681 case ZPOOL_PROP_FAILUREMODE: 9682 spa->spa_failmode = intval; 9683 break; 9684 case ZPOOL_PROP_AUTOTRIM: 9685 spa->spa_autotrim = intval; 9686 spa_async_request(spa, 9687 SPA_ASYNC_AUTOTRIM_RESTART); 9688 break; 9689 case ZPOOL_PROP_AUTOEXPAND: 9690 spa->spa_autoexpand = intval; 9691 if (tx->tx_txg != TXG_INITIAL) 9692 spa_async_request(spa, 9693 SPA_ASYNC_AUTOEXPAND); 9694 break; 9695 case ZPOOL_PROP_MULTIHOST: 9696 spa->spa_multihost = intval; 9697 break; 9698 case ZPOOL_PROP_DEDUP_TABLE_QUOTA: 9699 spa->spa_dedup_table_quota = intval; 9700 break; 9701 default: 9702 break; 9703 } 9704 } else { 9705 ASSERT(0); /* not allowed */ 9706 } 9707 } 9708 9709 } 9710 9711 mutex_exit(&spa->spa_props_lock); 9712 } 9713 9714 /* 9715 * Perform one-time upgrade on-disk changes. spa_version() does not 9716 * reflect the new version this txg, so there must be no changes this 9717 * txg to anything that the upgrade code depends on after it executes. 9718 * Therefore this must be called after dsl_pool_sync() does the sync 9719 * tasks. 
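 * (spa_sync_iterate_to_convergence() calls spa_sync_upgrades() after dsl_pool_sync(), and spa_sync_upgrades() itself bails out on every pass other than the first.)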
9720 */ 9721 static void 9722 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9723 { 9724 if (spa_sync_pass(spa) != 1) 9725 return; 9726 9727 dsl_pool_t *dp = spa->spa_dsl_pool; 9728 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9729 9730 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9731 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9732 dsl_pool_create_origin(dp, tx); 9733 9734 /* Keeping the origin open increases spa_minref */ 9735 spa->spa_minref += 3; 9736 } 9737 9738 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9739 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9740 dsl_pool_upgrade_clones(dp, tx); 9741 } 9742 9743 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9744 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9745 dsl_pool_upgrade_dir_clones(dp, tx); 9746 9747 /* Keeping the freedir open increases spa_minref */ 9748 spa->spa_minref += 3; 9749 } 9750 9751 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9752 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9753 spa_feature_create_zap_objects(spa, tx); 9754 } 9755 9756 /* 9757 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9758 * when the ability to use lz4 compression for metadata was added. 9759 * Old pools that have this feature enabled must be upgraded to have 9760 * this feature active. 9761 */ 9762 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9763 boolean_t lz4_en = spa_feature_is_enabled(spa, 9764 SPA_FEATURE_LZ4_COMPRESS); 9765 boolean_t lz4_ac = spa_feature_is_active(spa, 9766 SPA_FEATURE_LZ4_COMPRESS); 9767 9768 if (lz4_en && !lz4_ac) 9769 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9770 } 9771 9772 /* 9773 * If we haven't written the salt, do so now. Note that the 9774 * feature may not be activated yet, but that's fine since 9775 * the presence of this ZAP entry is backwards compatible. 9776 */ 9777 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9778 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9779 VERIFY0(zap_add(spa->spa_meta_objset, 9780 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9781 sizeof (spa->spa_cksum_salt.zcs_bytes), 9782 spa->spa_cksum_salt.zcs_bytes, tx)); 9783 } 9784 9785 rrw_exit(&dp->dp_config_rwlock, FTAG); 9786 } 9787 9788 static void 9789 vdev_indirect_state_sync_verify(vdev_t *vd) 9790 { 9791 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9792 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9793 9794 if (vd->vdev_ops == &vdev_indirect_ops) { 9795 ASSERT(vim != NULL); 9796 ASSERT(vib != NULL); 9797 } 9798 9799 uint64_t obsolete_sm_object = 0; 9800 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9801 if (obsolete_sm_object != 0) { 9802 ASSERT(vd->vdev_obsolete_sm != NULL); 9803 ASSERT(vd->vdev_removing || 9804 vd->vdev_ops == &vdev_indirect_ops); 9805 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9806 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9807 ASSERT3U(obsolete_sm_object, ==, 9808 space_map_object(vd->vdev_obsolete_sm)); 9809 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9810 space_map_allocated(vd->vdev_obsolete_sm)); 9811 } 9812 ASSERT(vd->vdev_obsolete_segments != NULL); 9813 9814 /* 9815 * Since frees / remaps to an indirect vdev can only 9816 * happen in syncing context, the obsolete segments 9817 * tree must be empty when we start syncing.
9818 */ 9819 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9820 } 9821 9822 /* 9823 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9824 * async write queue depth in case it changed. The max queue depth will 9825 * not change in the middle of syncing out this txg. 9826 */ 9827 static void 9828 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9829 { 9830 ASSERT(spa_writeable(spa)); 9831 9832 vdev_t *rvd = spa->spa_root_vdev; 9833 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9834 zfs_vdev_queue_depth_pct / 100; 9835 metaslab_class_t *normal = spa_normal_class(spa); 9836 metaslab_class_t *special = spa_special_class(spa); 9837 metaslab_class_t *dedup = spa_dedup_class(spa); 9838 9839 uint64_t slots_per_allocator = 0; 9840 for (int c = 0; c < rvd->vdev_children; c++) { 9841 vdev_t *tvd = rvd->vdev_child[c]; 9842 9843 metaslab_group_t *mg = tvd->vdev_mg; 9844 if (mg == NULL || !metaslab_group_initialized(mg)) 9845 continue; 9846 9847 metaslab_class_t *mc = mg->mg_class; 9848 if (mc != normal && mc != special && mc != dedup) 9849 continue; 9850 9851 /* 9852 * It is safe to do a lock-free check here because only async 9853 * allocations look at mg_max_alloc_queue_depth, and async 9854 * allocations all happen from spa_sync(). 9855 */ 9856 for (int i = 0; i < mg->mg_allocators; i++) { 9857 ASSERT0(zfs_refcount_count( 9858 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9859 } 9860 mg->mg_max_alloc_queue_depth = max_queue_depth; 9861 9862 for (int i = 0; i < mg->mg_allocators; i++) { 9863 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9864 zfs_vdev_def_queue_depth; 9865 } 9866 slots_per_allocator += zfs_vdev_def_queue_depth; 9867 } 9868 9869 for (int i = 0; i < spa->spa_alloc_count; i++) { 9870 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9871 mca_alloc_slots)); 9872 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9873 mca_alloc_slots)); 9874 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
9875 mca_alloc_slots)); 9876 normal->mc_allocator[i].mca_alloc_max_slots = 9877 slots_per_allocator; 9878 special->mc_allocator[i].mca_alloc_max_slots = 9879 slots_per_allocator; 9880 dedup->mc_allocator[i].mca_alloc_max_slots = 9881 slots_per_allocator; 9882 } 9883 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9884 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9885 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9886 } 9887 9888 static void 9889 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9890 { 9891 ASSERT(spa_writeable(spa)); 9892 9893 vdev_t *rvd = spa->spa_root_vdev; 9894 for (int c = 0; c < rvd->vdev_children; c++) { 9895 vdev_t *vd = rvd->vdev_child[c]; 9896 vdev_indirect_state_sync_verify(vd); 9897 9898 if (vdev_indirect_should_condense(vd)) { 9899 spa_condense_indirect_start_sync(vd, tx); 9900 break; 9901 } 9902 } 9903 } 9904 9905 static void 9906 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9907 { 9908 objset_t *mos = spa->spa_meta_objset; 9909 dsl_pool_t *dp = spa->spa_dsl_pool; 9910 uint64_t txg = tx->tx_txg; 9911 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9912 9913 do { 9914 int pass = ++spa->spa_sync_pass; 9915 9916 spa_sync_config_object(spa, tx); 9917 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9918 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9919 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9920 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9921 spa_errlog_sync(spa, txg); 9922 dsl_pool_sync(dp, txg); 9923 9924 if (pass < zfs_sync_pass_deferred_free || 9925 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9926 /* 9927 * If the log space map feature is active we don't 9928 * care about deferred frees and the deferred bpobj 9929 * as the log space map should effectively have the 9930 * same results (i.e. appending only to one object). 9931 */ 9932 spa_sync_frees(spa, free_bpl, tx); 9933 } else { 9934 /* 9935 * We can not defer frees in pass 1, because 9936 * we sync the deferred frees later in pass 1. 9937 */ 9938 ASSERT3U(pass, >, 1); 9939 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9940 &spa->spa_deferred_bpobj, tx); 9941 } 9942 9943 brt_sync(spa, txg); 9944 ddt_sync(spa, txg); 9945 dsl_scan_sync(dp, tx); 9946 dsl_errorscrub_sync(dp, tx); 9947 svr_sync(spa, tx); 9948 spa_sync_upgrades(spa, tx); 9949 9950 spa_flush_metaslabs(spa, tx); 9951 9952 vdev_t *vd = NULL; 9953 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9954 != NULL) 9955 vdev_sync(vd, txg); 9956 9957 if (pass == 1) { 9958 /* 9959 * dsl_pool_sync() -> dp_sync_tasks may have dirtied 9960 * the config. If that happens, this txg should not 9961 * be a no-op. So we must sync the config to the MOS 9962 * before checking for no-op. 9963 * 9964 * Note that when the config is dirty, it will 9965 * be written to the MOS (i.e. the MOS will be 9966 * dirtied) every time we call spa_sync_config_object() 9967 * in this txg. Therefore we can't call this after 9968 * dsl_pool_sync() every pass, because it would 9969 * prevent us from converging, since we'd dirty 9970 * the MOS every pass. 9971 * 9972 * Sync tasks can only be processed in pass 1, so 9973 * there's no need to do this in later passes. 9974 */ 9975 spa_sync_config_object(spa, tx); 9976 } 9977 9978 /* 9979 * Note: We need to check if the MOS is dirty because we could 9980 * have marked the MOS dirty without updating the uberblock 9981 * (e.g. if we have sync tasks but no dirty user data). 
We need 9982 * to check the uberblock's rootbp because it is updated if we 9983 * have synced out dirty data (though in this case the MOS will 9984 * most likely also be dirty due to second order effects, we 9985 * don't want to rely on that here). 9986 */ 9987 if (pass == 1 && 9988 BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && 9989 !dmu_objset_is_dirty(mos, txg)) { 9990 /* 9991 * Nothing changed on the first pass, therefore this 9992 * TXG is a no-op. Avoid syncing deferred frees, so 9993 * that we can keep this TXG as a no-op. 9994 */ 9995 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9996 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9997 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9998 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9999 break; 10000 } 10001 10002 spa_sync_deferred_frees(spa, tx); 10003 } while (dmu_objset_is_dirty(mos, txg)); 10004 } 10005 10006 /* 10007 * Rewrite the vdev configuration (which includes the uberblock) to 10008 * commit the transaction group. 10009 * 10010 * If there are no dirty vdevs, we sync the uberblock to a few random 10011 * top-level vdevs that are known to be visible in the config cache 10012 * (see spa_vdev_add() for a complete description). If there *are* dirty 10013 * vdevs, sync the uberblock to all vdevs. 10014 */ 10015 static void 10016 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 10017 { 10018 vdev_t *rvd = spa->spa_root_vdev; 10019 uint64_t txg = tx->tx_txg; 10020 10021 for (;;) { 10022 int error = 0; 10023 10024 /* 10025 * We hold SCL_STATE to prevent vdev open/close/etc. 10026 * while we're attempting to write the vdev labels. 10027 */ 10028 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 10029 10030 if (list_is_empty(&spa->spa_config_dirty_list)) { 10031 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 10032 int svdcount = 0; 10033 int children = rvd->vdev_children; 10034 int c0 = random_in_range(children); 10035 10036 for (int c = 0; c < children; c++) { 10037 vdev_t *vd = 10038 rvd->vdev_child[(c0 + c) % children]; 10039 10040 /* Stop when revisiting the first vdev */ 10041 if (c > 0 && svd[0] == vd) 10042 break; 10043 10044 if (vd->vdev_ms_array == 0 || 10045 vd->vdev_islog || 10046 !vdev_is_concrete(vd)) 10047 continue; 10048 10049 svd[svdcount++] = vd; 10050 if (svdcount == SPA_SYNC_MIN_VDEVS) 10051 break; 10052 } 10053 error = vdev_config_sync(svd, svdcount, txg); 10054 } else { 10055 error = vdev_config_sync(rvd->vdev_child, 10056 rvd->vdev_children, txg); 10057 } 10058 10059 if (error == 0) 10060 spa->spa_last_synced_guid = rvd->vdev_guid; 10061 10062 spa_config_exit(spa, SCL_STATE, FTAG); 10063 10064 if (error == 0) 10065 break; 10066 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 10067 zio_resume_wait(spa); 10068 } 10069 } 10070 10071 /* 10072 * Sync the specified transaction group. New blocks may be dirtied as 10073 * part of the process, so we iterate until it converges. 10074 */ 10075 void 10076 spa_sync(spa_t *spa, uint64_t txg) 10077 { 10078 vdev_t *vd = NULL; 10079 10080 VERIFY(spa_writeable(spa)); 10081 10082 /* 10083 * Wait for i/os issued in open context that need to complete 10084 * before this txg syncs. 10085 */ 10086 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 10087 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 10088 ZIO_FLAG_CANFAIL); 10089 10090 /* 10091 * Now that there can be no more cloning in this transaction group, 10092 * but we are still before issuing frees, we can process pending BRT 10093 * updates. 
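 * (brt_pending_apply() folds the clone operations recorded in open context into the BRT, so that any frees issued later in this txg see up-to-date block reference counts.)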
10094 */ 10095 brt_pending_apply(spa, txg); 10096 10097 /* 10098 * Lock out configuration changes. 10099 */ 10100 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 10101 10102 spa->spa_syncing_txg = txg; 10103 spa->spa_sync_pass = 0; 10104 10105 for (int i = 0; i < spa->spa_alloc_count; i++) { 10106 mutex_enter(&spa->spa_allocs[i].spaa_lock); 10107 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 10108 mutex_exit(&spa->spa_allocs[i].spaa_lock); 10109 } 10110 10111 /* 10112 * If there are any pending vdev state changes, convert them 10113 * into config changes that go out with this transaction group. 10114 */ 10115 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 10116 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10117 /* Avoid holding the write lock unless actually necessary */ 10118 if (vd->vdev_aux == NULL) { 10119 vdev_state_clean(vd); 10120 vdev_config_dirty(vd); 10121 continue; 10122 } 10123 /* 10124 * We need the write lock here because, for aux vdevs, 10125 * calling vdev_config_dirty() modifies sav_config. 10126 * This is ugly and will become unnecessary when we 10127 * eliminate the aux vdev wart by integrating all vdevs 10128 * into the root vdev tree. 10129 */ 10130 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10131 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 10132 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10133 vdev_state_clean(vd); 10134 vdev_config_dirty(vd); 10135 } 10136 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10137 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10138 } 10139 spa_config_exit(spa, SCL_STATE, FTAG); 10140 10141 dsl_pool_t *dp = spa->spa_dsl_pool; 10142 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 10143 10144 spa->spa_sync_starttime = gethrtime(); 10145 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10146 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 10147 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 10148 NSEC_TO_TICK(spa->spa_deadman_synctime)); 10149 10150 /* 10151 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 10152 * set spa_deflate if we have no raid-z vdevs. 10153 */ 10154 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 10155 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 10156 vdev_t *rvd = spa->spa_root_vdev; 10157 10158 int i; 10159 for (i = 0; i < rvd->vdev_children; i++) { 10160 vd = rvd->vdev_child[i]; 10161 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 10162 break; 10163 } 10164 if (i == rvd->vdev_children) { 10165 spa->spa_deflate = TRUE; 10166 VERIFY0(zap_add(spa->spa_meta_objset, 10167 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 10168 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 10169 } 10170 } 10171 10172 spa_sync_adjust_vdev_max_queue_depth(spa); 10173 10174 spa_sync_condense_indirect(spa, tx); 10175 10176 spa_sync_iterate_to_convergence(spa, tx); 10177 10178 #ifdef ZFS_DEBUG 10179 if (!list_is_empty(&spa->spa_config_dirty_list)) { 10180 /* 10181 * Make sure that the number of ZAPs for all the vdevs matches 10182 * the number of ZAPs in the per-vdev ZAP list. This only gets 10183 * called if the config is dirty; otherwise there may be 10184 * outstanding AVZ operations that weren't completed in 10185 * spa_sync_config_object. 
10186 */ 10187 uint64_t all_vdev_zap_entry_count; 10188 ASSERT0(zap_count(spa->spa_meta_objset, 10189 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 10190 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 10191 all_vdev_zap_entry_count); 10192 } 10193 #endif 10194 10195 if (spa->spa_vdev_removal != NULL) { 10196 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 10197 } 10198 10199 spa_sync_rewrite_vdev_config(spa, tx); 10200 dmu_tx_commit(tx); 10201 10202 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10203 spa->spa_deadman_tqid = 0; 10204 10205 /* 10206 * Clear the dirty config list. 10207 */ 10208 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 10209 vdev_config_clean(vd); 10210 10211 /* 10212 * Now that the new config has synced transactionally, 10213 * let it become visible to the config cache. 10214 */ 10215 if (spa->spa_config_syncing != NULL) { 10216 spa_config_set(spa, spa->spa_config_syncing); 10217 spa->spa_config_txg = txg; 10218 spa->spa_config_syncing = NULL; 10219 } 10220 10221 dsl_pool_sync_done(dp, txg); 10222 10223 for (int i = 0; i < spa->spa_alloc_count; i++) { 10224 mutex_enter(&spa->spa_allocs[i].spaa_lock); 10225 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 10226 mutex_exit(&spa->spa_allocs[i].spaa_lock); 10227 } 10228 10229 /* 10230 * Update usable space statistics. 10231 */ 10232 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 10233 != NULL) 10234 vdev_sync_done(vd, txg); 10235 10236 metaslab_class_evict_old(spa->spa_normal_class, txg); 10237 metaslab_class_evict_old(spa->spa_log_class, txg); 10238 /* spa_embedded_log_class has only one metaslab per vdev. */ 10239 metaslab_class_evict_old(spa->spa_special_class, txg); 10240 metaslab_class_evict_old(spa->spa_dedup_class, txg); 10241 10242 spa_sync_close_syncing_log_sm(spa); 10243 10244 spa_update_dspace(spa); 10245 10246 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 10247 vdev_autotrim_kick(spa); 10248 10249 /* 10250 * It had better be the case that we didn't dirty anything 10251 * since vdev_config_sync(). 10252 */ 10253 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10254 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10255 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 10256 10257 while (zfs_pause_spa_sync) 10258 delay(1); 10259 10260 spa->spa_sync_pass = 0; 10261 10262 /* 10263 * Update the last synced uberblock here. We want to do this at 10264 * the end of spa_sync() so that consumers of spa_last_synced_txg() 10265 * will be guaranteed that all the processing associated with 10266 * that txg has been completed. 10267 */ 10268 spa->spa_ubsync = spa->spa_uberblock; 10269 spa_config_exit(spa, SCL_CONFIG, FTAG); 10270 10271 spa_handle_ignored_writes(spa); 10272 10273 /* 10274 * If any async tasks have been requested, kick them off. 10275 */ 10276 spa_async_dispatch(spa); 10277 } 10278 10279 /* 10280 * Sync all pools. We don't want to hold the namespace lock across these 10281 * operations, so we take a reference on the spa_t and drop the lock during the 10282 * sync. 
10283 */ 10284 void 10285 spa_sync_allpools(void) 10286 { 10287 spa_t *spa = NULL; 10288 mutex_enter(&spa_namespace_lock); 10289 while ((spa = spa_next(spa)) != NULL) { 10290 if (spa_state(spa) != POOL_STATE_ACTIVE || 10291 !spa_writeable(spa) || spa_suspended(spa)) 10292 continue; 10293 spa_open_ref(spa, FTAG); 10294 mutex_exit(&spa_namespace_lock); 10295 txg_wait_synced(spa_get_dsl(spa), 0); 10296 mutex_enter(&spa_namespace_lock); 10297 spa_close(spa, FTAG); 10298 } 10299 mutex_exit(&spa_namespace_lock); 10300 } 10301 10302 taskq_t * 10303 spa_sync_tq_create(spa_t *spa, const char *name) 10304 { 10305 kthread_t **kthreads; 10306 10307 ASSERT(spa->spa_sync_tq == NULL); 10308 ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); 10309 10310 /* 10311 * - do not allow more allocators than cpus. 10312 * - there may be more cpus than allocators. 10313 * - do not allow more sync taskq threads than allocators or cpus. 10314 */ 10315 int nthreads = spa->spa_alloc_count; 10316 spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * 10317 nthreads, KM_SLEEP); 10318 10319 spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, 10320 nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); 10321 VERIFY(spa->spa_sync_tq != NULL); 10322 VERIFY(kthreads != NULL); 10323 10324 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10325 for (int i = 0; i < nthreads; i++, ti++) { 10326 ti->sti_thread = kthreads[i]; 10327 ti->sti_allocator = i; 10328 } 10329 10330 kmem_free(kthreads, sizeof (*kthreads) * nthreads); 10331 return (spa->spa_sync_tq); 10332 } 10333 10334 void 10335 spa_sync_tq_destroy(spa_t *spa) 10336 { 10337 ASSERT(spa->spa_sync_tq != NULL); 10338 10339 taskq_wait(spa->spa_sync_tq); 10340 taskq_destroy(spa->spa_sync_tq); 10341 kmem_free(spa->spa_syncthreads, 10342 sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); 10343 spa->spa_sync_tq = NULL; 10344 } 10345 10346 uint_t 10347 spa_acq_allocator(spa_t *spa) 10348 { 10349 int i; 10350 10351 if (spa->spa_alloc_count == 1) 10352 return (0); 10353 10354 mutex_enter(&spa->spa_allocs_use->sau_lock); 10355 uint_t r = spa->spa_allocs_use->sau_rotor; 10356 do { 10357 if (++r == spa->spa_alloc_count) 10358 r = 0; 10359 } while (spa->spa_allocs_use->sau_inuse[r]); 10360 spa->spa_allocs_use->sau_inuse[r] = B_TRUE; 10361 spa->spa_allocs_use->sau_rotor = r; 10362 mutex_exit(&spa->spa_allocs_use->sau_lock); 10363 10364 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10365 for (i = 0; i < spa->spa_alloc_count; i++, ti++) { 10366 if (ti->sti_thread == curthread) { 10367 ti->sti_allocator = r; 10368 break; 10369 } 10370 } 10371 ASSERT3S(i, <, spa->spa_alloc_count); 10372 return (r); 10373 } 10374 10375 void 10376 spa_rel_allocator(spa_t *spa, uint_t allocator) 10377 { 10378 if (spa->spa_alloc_count > 1) 10379 spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; 10380 } 10381 10382 void 10383 spa_select_allocator(zio_t *zio) 10384 { 10385 zbookmark_phys_t *bm = &zio->io_bookmark; 10386 spa_t *spa = zio->io_spa; 10387 10388 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 10389 10390 /* 10391 * A gang block (for example) may have inherited its parent's 10392 * allocator, in which case there is nothing further to do here. 10393 */ 10394 if (ZIO_HAS_ALLOCATOR(zio)) 10395 return; 10396 10397 ASSERT(spa != NULL); 10398 ASSERT(bm != NULL); 10399 10400 /* 10401 * First try to use an allocator assigned to the syncthread, and set 10402 * the corresponding write issue taskq for the allocator. 10403 * Note, we must have an open pool to do this. 
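 * If the current thread is not one of this pool's sync threads, we fall back to the bookmark-based hash below.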
10404 */ 10405 if (spa->spa_sync_tq != NULL) { 10406 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10407 for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { 10408 if (ti->sti_thread == curthread) { 10409 zio->io_allocator = ti->sti_allocator; 10410 return; 10411 } 10412 } 10413 } 10414 10415 /* 10416 * We want to try to use as many allocators as possible to help improve 10417 * performance, but we also want logically adjacent IOs to be physically 10418 * adjacent to improve sequential read performance. We chunk each object 10419 * into 2^20 block regions, and then hash based on the objset, object, 10420 * level, and region to accomplish both of these goals. 10421 */ 10422 uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, 10423 bm->zb_blkid >> 20); 10424 10425 zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; 10426 } 10427 10428 /* 10429 * ========================================================================== 10430 * Miscellaneous routines 10431 * ========================================================================== 10432 */ 10433 10434 /* 10435 * Remove all pools in the system. 10436 */ 10437 void 10438 spa_evict_all(void) 10439 { 10440 spa_t *spa; 10441 10442 /* 10443 * Remove all cached state. All pools should be closed now, 10444 * so every spa in the AVL tree should be unreferenced. 10445 */ 10446 mutex_enter(&spa_namespace_lock); 10447 while ((spa = spa_next(NULL)) != NULL) { 10448 /* 10449 * Stop async tasks. The async thread may need to detach 10450 * a device that's been replaced, which requires grabbing 10451 * spa_namespace_lock, so we must drop it here. 10452 */ 10453 spa_open_ref(spa, FTAG); 10454 mutex_exit(&spa_namespace_lock); 10455 spa_async_suspend(spa); 10456 mutex_enter(&spa_namespace_lock); 10457 spa_close(spa, FTAG); 10458 10459 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 10460 spa_unload(spa); 10461 spa_deactivate(spa); 10462 } 10463 spa_remove(spa); 10464 } 10465 mutex_exit(&spa_namespace_lock); 10466 } 10467 10468 vdev_t * 10469 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 10470 { 10471 vdev_t *vd; 10472 int i; 10473 10474 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 10475 return (vd); 10476 10477 if (aux) { 10478 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 10479 vd = spa->spa_l2cache.sav_vdevs[i]; 10480 if (vd->vdev_guid == guid) 10481 return (vd); 10482 } 10483 10484 for (i = 0; i < spa->spa_spares.sav_count; i++) { 10485 vd = spa->spa_spares.sav_vdevs[i]; 10486 if (vd->vdev_guid == guid) 10487 return (vd); 10488 } 10489 } 10490 10491 return (NULL); 10492 } 10493 10494 void 10495 spa_upgrade(spa_t *spa, uint64_t version) 10496 { 10497 ASSERT(spa_writeable(spa)); 10498 10499 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 10500 10501 /* 10502 * This should only be called for a non-faulted pool, and since a 10503 * future version would result in an unopenable pool, this shouldn't be 10504 * possible. 
10505 */ 10506 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 10507 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 10508 10509 spa->spa_uberblock.ub_version = version; 10510 vdev_config_dirty(spa->spa_root_vdev); 10511 10512 spa_config_exit(spa, SCL_ALL, FTAG); 10513 10514 txg_wait_synced(spa_get_dsl(spa), 0); 10515 } 10516 10517 static boolean_t 10518 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 10519 { 10520 (void) spa; 10521 int i; 10522 uint64_t vdev_guid; 10523 10524 for (i = 0; i < sav->sav_count; i++) 10525 if (sav->sav_vdevs[i]->vdev_guid == guid) 10526 return (B_TRUE); 10527 10528 for (i = 0; i < sav->sav_npending; i++) { 10529 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 10530 &vdev_guid) == 0 && vdev_guid == guid) 10531 return (B_TRUE); 10532 } 10533 10534 return (B_FALSE); 10535 } 10536 10537 boolean_t 10538 spa_has_l2cache(spa_t *spa, uint64_t guid) 10539 { 10540 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 10541 } 10542 10543 boolean_t 10544 spa_has_spare(spa_t *spa, uint64_t guid) 10545 { 10546 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 10547 } 10548 10549 /* 10550 * Check if a pool has an active shared spare device. 10551 * Note: reference count of an active spare is 2, as a spare and as a replace 10552 */ 10553 static boolean_t 10554 spa_has_active_shared_spare(spa_t *spa) 10555 { 10556 int i, refcnt; 10557 uint64_t pool; 10558 spa_aux_vdev_t *sav = &spa->spa_spares; 10559 10560 for (i = 0; i < sav->sav_count; i++) { 10561 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 10562 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 10563 refcnt > 2) 10564 return (B_TRUE); 10565 } 10566 10567 return (B_FALSE); 10568 } 10569 10570 uint64_t 10571 spa_total_metaslabs(spa_t *spa) 10572 { 10573 vdev_t *rvd = spa->spa_root_vdev; 10574 10575 uint64_t m = 0; 10576 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 10577 vdev_t *vd = rvd->vdev_child[c]; 10578 if (!vdev_is_concrete(vd)) 10579 continue; 10580 m += vd->vdev_ms_count; 10581 } 10582 return (m); 10583 } 10584 10585 /* 10586 * Notify any waiting threads that some activity has switched from being in- 10587 * progress to not-in-progress so that the thread can wake up and determine 10588 * whether it is finished waiting. 10589 */ 10590 void 10591 spa_notify_waiters(spa_t *spa) 10592 { 10593 /* 10594 * Acquiring spa_activities_lock here prevents the cv_broadcast from 10595 * happening between the waiting thread's check and cv_wait. 10596 */ 10597 mutex_enter(&spa->spa_activities_lock); 10598 cv_broadcast(&spa->spa_activities_cv); 10599 mutex_exit(&spa->spa_activities_lock); 10600 } 10601 10602 /* 10603 * Notify any waiting threads that the pool is exporting, and then block until 10604 * they are finished using the spa_t. 10605 */ 10606 void 10607 spa_wake_waiters(spa_t *spa) 10608 { 10609 mutex_enter(&spa->spa_activities_lock); 10610 spa->spa_waiters_cancel = B_TRUE; 10611 cv_broadcast(&spa->spa_activities_cv); 10612 while (spa->spa_waiters != 0) 10613 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 10614 spa->spa_waiters_cancel = B_FALSE; 10615 mutex_exit(&spa->spa_activities_lock); 10616 } 10617 10618 /* Whether the vdev or any of its descendants are being initialized/trimmed. 
*/ 10619 static boolean_t 10620 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 10621 { 10622 spa_t *spa = vd->vdev_spa; 10623 10624 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 10625 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10626 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 10627 activity == ZPOOL_WAIT_TRIM); 10628 10629 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 10630 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 10631 10632 mutex_exit(&spa->spa_activities_lock); 10633 mutex_enter(lock); 10634 mutex_enter(&spa->spa_activities_lock); 10635 10636 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 10637 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 10638 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 10639 mutex_exit(lock); 10640 10641 if (in_progress) 10642 return (B_TRUE); 10643 10644 for (int i = 0; i < vd->vdev_children; i++) { 10645 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 10646 activity)) 10647 return (B_TRUE); 10648 } 10649 10650 return (B_FALSE); 10651 } 10652 10653 /* 10654 * If use_guid is true, this checks whether the vdev specified by guid is 10655 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 10656 * is being initialized/trimmed. The caller must hold the config lock and 10657 * spa_activities_lock. 10658 */ 10659 static int 10660 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 10661 zpool_wait_activity_t activity, boolean_t *in_progress) 10662 { 10663 mutex_exit(&spa->spa_activities_lock); 10664 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10665 mutex_enter(&spa->spa_activities_lock); 10666 10667 vdev_t *vd; 10668 if (use_guid) { 10669 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 10670 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 10671 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10672 return (EINVAL); 10673 } 10674 } else { 10675 vd = spa->spa_root_vdev; 10676 } 10677 10678 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 10679 10680 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10681 return (0); 10682 } 10683 10684 /* 10685 * Locking for waiting threads 10686 * --------------------------- 10687 * 10688 * Waiting threads need a way to check whether a given activity is in progress, 10689 * and then, if it is, wait for it to complete. Each activity will have some 10690 * in-memory representation of the relevant on-disk state which can be used to 10691 * determine whether or not the activity is in progress. The in-memory state and 10692 * the locking used to protect it will be different for each activity, and may 10693 * not be suitable for use with a cvar (e.g., some state is protected by the 10694 * config lock). To allow waiting threads to wait without any races, another 10695 * lock, spa_activities_lock, is used. 10696 * 10697 * When the state is checked, both the activity-specific lock (if there is one) 10698 * and spa_activities_lock are held. In some cases, the activity-specific lock 10699 * is acquired explicitly (e.g. the config lock). In others, the locking is 10700 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 10701 * thread releases the activity-specific lock and, if the activity is in 10702 * progress, then cv_waits using spa_activities_lock. 
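 *
 * Schematically, the waiter side described above looks like this (a
 * simplified sketch of what spa_wait_common() and spa_activity_in_progress()
 * below actually do):
 *
 *	mutex_enter(&spa->spa_activities_lock);
 *	for (;;) {
 *		boolean_t busy = <check the activity, briefly swapping in its
 *		    activity-specific lock if it has one>;
 *		if (!busy)
 *			break;
 *		cv_wait_sig(&spa->spa_activities_cv,
 *		    &spa->spa_activities_lock);
 *	}
 *	mutex_exit(&spa->spa_activities_lock);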
10703 * 10704 * The waiting thread is woken when another thread, one completing some 10705 * activity, updates the state of the activity and then calls 10706 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 10707 * needs to hold its activity-specific lock when updating the state, and this 10708 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 10709 * 10710 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 10711 * and because it is held when the waiting thread checks the state of the 10712 * activity, it can never be the case that the completing thread both updates 10713 * the activity state and cv_broadcasts in between the waiting thread's check 10714 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 10715 * 10716 * In order to prevent deadlock, when the waiting thread does its check, in some 10717 * cases it will temporarily drop spa_activities_lock in order to acquire the 10718 * activity-specific lock. The order in which spa_activities_lock and the 10719 * activity specific lock are acquired in the waiting thread is determined by 10720 * the order in which they are acquired in the completing thread; if the 10721 * completing thread calls spa_notify_waiters with the activity-specific lock 10722 * held, then the waiting thread must also acquire the activity-specific lock 10723 * first. 10724 */ 10725 10726 static int 10727 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 10728 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 10729 { 10730 int error = 0; 10731 10732 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10733 10734 switch (activity) { 10735 case ZPOOL_WAIT_CKPT_DISCARD: 10736 *in_progress = 10737 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 10738 zap_contains(spa_meta_objset(spa), 10739 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 10740 ENOENT); 10741 break; 10742 case ZPOOL_WAIT_FREE: 10743 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 10744 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 10745 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 10746 spa_livelist_delete_check(spa)); 10747 break; 10748 case ZPOOL_WAIT_INITIALIZE: 10749 case ZPOOL_WAIT_TRIM: 10750 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 10751 activity, in_progress); 10752 break; 10753 case ZPOOL_WAIT_REPLACE: 10754 mutex_exit(&spa->spa_activities_lock); 10755 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10756 mutex_enter(&spa->spa_activities_lock); 10757 10758 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 10759 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10760 break; 10761 case ZPOOL_WAIT_REMOVE: 10762 *in_progress = (spa->spa_removing_phys.sr_state == 10763 DSS_SCANNING); 10764 break; 10765 case ZPOOL_WAIT_RESILVER: 10766 *in_progress = vdev_rebuild_active(spa->spa_root_vdev); 10767 if (*in_progress) 10768 break; 10769 zfs_fallthrough; 10770 case ZPOOL_WAIT_SCRUB: 10771 { 10772 boolean_t scanning, paused, is_scrub; 10773 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 10774 10775 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 10776 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 10777 paused = dsl_scan_is_paused_scrub(scn); 10778 *in_progress = (scanning && !paused && 10779 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 10780 break; 10781 } 10782 case ZPOOL_WAIT_RAIDZ_EXPAND: 10783 { 10784 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 10785 *in_progress = (vre 
!= NULL && vre->vre_state == DSS_SCANNING); 10786 break; 10787 } 10788 default: 10789 panic("unrecognized value for activity %d", activity); 10790 } 10791 10792 return (error); 10793 } 10794 10795 static int 10796 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 10797 boolean_t use_tag, uint64_t tag, boolean_t *waited) 10798 { 10799 /* 10800 * The tag is used to distinguish between instances of an activity. 10801 * 'initialize' and 'trim' are the only activities that we use this for. 10802 * The other activities can only have a single instance in progress in a 10803 * pool at one time, making the tag unnecessary. 10804 * 10805 * There can be multiple devices being replaced at once, but since they 10806 * all finish once resilvering finishes, we don't bother keeping track 10807 * of them individually, we just wait for them all to finish. 10808 */ 10809 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 10810 activity != ZPOOL_WAIT_TRIM) 10811 return (EINVAL); 10812 10813 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 10814 return (EINVAL); 10815 10816 spa_t *spa; 10817 int error = spa_open(pool, &spa, FTAG); 10818 if (error != 0) 10819 return (error); 10820 10821 /* 10822 * Increment the spa's waiter count so that we can call spa_close and 10823 * still ensure that the spa_t doesn't get freed before this thread is 10824 * finished with it when the pool is exported. We want to call spa_close 10825 * before we start waiting because otherwise the additional ref would 10826 * prevent the pool from being exported or destroyed throughout the 10827 * potentially long wait. 10828 */ 10829 mutex_enter(&spa->spa_activities_lock); 10830 spa->spa_waiters++; 10831 spa_close(spa, FTAG); 10832 10833 *waited = B_FALSE; 10834 for (;;) { 10835 boolean_t in_progress; 10836 error = spa_activity_in_progress(spa, activity, use_tag, tag, 10837 &in_progress); 10838 10839 if (error || !in_progress || spa->spa_waiters_cancel) 10840 break; 10841 10842 *waited = B_TRUE; 10843 10844 if (cv_wait_sig(&spa->spa_activities_cv, 10845 &spa->spa_activities_lock) == 0) { 10846 error = EINTR; 10847 break; 10848 } 10849 } 10850 10851 spa->spa_waiters--; 10852 cv_signal(&spa->spa_waiters_cv); 10853 mutex_exit(&spa->spa_activities_lock); 10854 10855 return (error); 10856 } 10857 10858 /* 10859 * Wait for a particular instance of the specified activity to complete, where 10860 * the instance is identified by 'tag' 10861 */ 10862 int 10863 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 10864 boolean_t *waited) 10865 { 10866 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 10867 } 10868 10869 /* 10870 * Wait for all instances of the specified activity to complete 10871 */ 10872 int 10873 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 10874 { 10875 10876 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 10877 } 10878 10879 sysevent_t * 10880 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10881 { 10882 sysevent_t *ev = NULL; 10883 #ifdef _KERNEL 10884 nvlist_t *resource; 10885 10886 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 10887 if (resource) { 10888 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 10889 ev->resource = resource; 10890 } 10891 #else 10892 (void) spa, (void) vd, (void) hist_nvl, (void) name; 10893 #endif 10894 return (ev); 10895 } 10896 10897 void 10898 spa_event_post(sysevent_t *ev) 10899 { 10900 #ifdef _KERNEL 10901 if (ev) { 10902
zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 10903 kmem_free(ev, sizeof (*ev)); 10904 } 10905 #else 10906 (void) ev; 10907 #endif 10908 } 10909 10910 /* 10911 * Post a zevent corresponding to the given sysevent. The 'name' must be one 10912 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 10913 * filled in from the spa and (optionally) the vdev. This doesn't do anything 10914 * in the userland libzpool, as we don't want consumers to misinterpret ztest 10915 * or zdb as real changes. 10916 */ 10917 void 10918 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10919 { 10920 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 10921 } 10922 10923 /* state manipulation functions */ 10924 EXPORT_SYMBOL(spa_open); 10925 EXPORT_SYMBOL(spa_open_rewind); 10926 EXPORT_SYMBOL(spa_get_stats); 10927 EXPORT_SYMBOL(spa_create); 10928 EXPORT_SYMBOL(spa_import); 10929 EXPORT_SYMBOL(spa_tryimport); 10930 EXPORT_SYMBOL(spa_destroy); 10931 EXPORT_SYMBOL(spa_export); 10932 EXPORT_SYMBOL(spa_reset); 10933 EXPORT_SYMBOL(spa_async_request); 10934 EXPORT_SYMBOL(spa_async_suspend); 10935 EXPORT_SYMBOL(spa_async_resume); 10936 EXPORT_SYMBOL(spa_inject_addref); 10937 EXPORT_SYMBOL(spa_inject_delref); 10938 EXPORT_SYMBOL(spa_scan_stat_init); 10939 EXPORT_SYMBOL(spa_scan_get_stats); 10940 10941 /* device manipulation */ 10942 EXPORT_SYMBOL(spa_vdev_add); 10943 EXPORT_SYMBOL(spa_vdev_attach); 10944 EXPORT_SYMBOL(spa_vdev_detach); 10945 EXPORT_SYMBOL(spa_vdev_setpath); 10946 EXPORT_SYMBOL(spa_vdev_setfru); 10947 EXPORT_SYMBOL(spa_vdev_split_mirror); 10948 10949 /* spare state (which is global across all pools) */ 10950 EXPORT_SYMBOL(spa_spare_add); 10951 EXPORT_SYMBOL(spa_spare_remove); 10952 EXPORT_SYMBOL(spa_spare_exists); 10953 EXPORT_SYMBOL(spa_spare_activate); 10954 10955 /* L2ARC state (which is global across all pools) */ 10956 EXPORT_SYMBOL(spa_l2cache_add); 10957 EXPORT_SYMBOL(spa_l2cache_remove); 10958 EXPORT_SYMBOL(spa_l2cache_exists); 10959 EXPORT_SYMBOL(spa_l2cache_activate); 10960 EXPORT_SYMBOL(spa_l2cache_drop); 10961 10962 /* scanning */ 10963 EXPORT_SYMBOL(spa_scan); 10964 EXPORT_SYMBOL(spa_scan_stop); 10965 10966 /* spa syncing */ 10967 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 10968 EXPORT_SYMBOL(spa_sync_allpools); 10969 10970 /* properties */ 10971 EXPORT_SYMBOL(spa_prop_set); 10972 EXPORT_SYMBOL(spa_prop_get); 10973 EXPORT_SYMBOL(spa_prop_clear_bootfs); 10974 10975 /* asynchronous event notification */ 10976 EXPORT_SYMBOL(spa_event_notify); 10977 10978 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, 10979 "Percentage of CPUs to run a metaslab preload taskq"); 10980 10981 /* BEGIN CSTYLED */ 10982 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, 10983 "log2 fraction of arc that can be used by inflight I/Os when " 10984 "verifying pool during import"); 10985 /* END CSTYLED */ 10986 10987 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 10988 "Set to traverse metadata on pool import"); 10989 10990 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 10991 "Set to traverse data on pool import"); 10992 10993 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 10994 "Print vdev tree to zfs_dbgmsg during pool import"); 10995 10996 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, 10997 "Percentage of CPUs to run an IO worker thread"); 10998 10999 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, 11000 "Number of threads per IO
worker taskqueue"); 11001 11002 /* BEGIN CSTYLED */ 11003 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, 11004 "Allow importing pool with up to this number of missing top-level " 11005 "vdevs (in read-only mode)"); 11006 /* END CSTYLED */ 11007 11008 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 11009 ZMOD_RW, "Set the livelist condense zthr to pause"); 11010 11011 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 11012 ZMOD_RW, "Set the livelist condense synctask to pause"); 11013 11014 /* BEGIN CSTYLED */ 11015 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, 11016 INT, ZMOD_RW, 11017 "Whether livelist condensing was canceled in the synctask"); 11018 11019 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 11020 INT, ZMOD_RW, 11021 "Whether livelist condensing was canceled in the zthr function"); 11022 11023 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 11024 ZMOD_RW, 11025 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 11026 "was being condensed"); 11027 11028 #ifdef _KERNEL 11029 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, 11030 spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, 11031 "Configure IO queues for read IO"); 11032 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, 11033 spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, 11034 "Configure IO queues for write IO"); 11035 #endif 11036 /* END CSTYLED */ 11037 11038 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, 11039 "Number of CPUs per write issue taskq"); 11040